# Setup

In [1]:
import datetime, json, re, math, sys
import numpy as np
import numpy #lol
import matplotlib.pyplot as plt
from scipy.stats import linregress as lm
import scipy
import random
from collections import Counter, defaultdict
from __future__ import division
% matplotlib inline
#import the files
sys.path.append('../API')
import main

## Mu analysis

In [2]:
def preprocess(text):
    """Count how often each space-separated token occurs in `text`.

    text -- the comment text. Text strings are encoded to UTF-8 first; byte
            strings are used as they are (previously a non-ASCII byte string
            triggered an implicit ASCII decode error in `.encode`).

    Returns a Counter mapping each UTF-8 encoded token to its count. The text
    is split on single spaces only, so consecutive spaces produce empty-string
    tokens and other whitespace (tabs, newlines) stays inside tokens.
    """
    if not isinstance(text, bytes):
        text = text.encode("utf8")
    # Splitting on a literal single space matches the old re.split(" ", text).
    return Counter(text.split(b" "))

def order(counts):
    """Return the words of `counts` in a random, frequency-weighted order.

    counts -- mapping of word -> number of occurrences.

    Every word appears exactly once in the result (sampling without
    replacement); at each draw a word is picked with probability
    proportional to its count. Uses numpy's global random state.

    Returns a numpy array with len(counts) entries.
    """
    # Materialise keys/values explicitly so this also works when the mapping
    # hands back views rather than lists.
    words = list(counts.keys())
    weights = np.array(list(counts.values()), dtype=float)
    probabilities = weights / weights.sum()
    return np.random.choice(words, size=len(words), replace=False, p=probabilities)

def innovationrate(counts, reps = 2, termmax = 100):
    """Compute a per-position rate from word counts, averaged over random orderings.

    counts  -- mapping of word -> number of occurrences (indexed by word and
               summed through .values()).
    reps    -- how many random orderings, each drawn with order(counts), are
               averaged.
    termmax -- upper bound on the number of terms evaluated in the inner sum
               at each position.

    Returns (alphas, ns): alphas = 1 / (1 + Mn) as a numpy array, and ns as a
    numpy array of 1..N where N is the number of distinct words. Mn[0] is
    never written, so alphas[0] is always 1.

    NOTE(review): alphas is presumably the "innovation rate" (chance that the
    next token is a previously unseen word) -- confirm against the model the
    formulas below were derived from.
    """
    
    N = len(counts)    # number of distinct words
    FN = sum(counts.values())  # total number of tokens

    ns = range(1,N+1)
    Mn = [0 for n in ns]  # per-position accumulator, averaged over reps
    
    for rep in range(reps):
        n = 0
        Fn = 0    # running token total of the words seen so far in this ordering
        Msum = 0  # running total of termsum within this ordering
        for n, word in zip(ns, order(counts)):

            f = counts[word]
            Fn += f

            # the last position is skipped; nothing is accumulated for n == N
            if n == N:
                break 

            ## compute In and Jn: the running (Fn) and overall (FN) token totals,
            ## each reduced by the same offset (n - 1 + int(Msum))
            In = Fn - (n - 1 + int(Msum))
            Jn = FN - (n - 1 + int(Msum))

            ## compute the average
            # m runs over 1..min(In, termmax); the range is empty when In < 1
            ms = np.array(range(1, min([In,termmax])+1))

            # cumulative product of (In - m)/(Jn - m), built in log10 space
            logfacts = np.log10(In - ms) - np.log10(Jn - ms)
            prods = 10**np.cumsum(logfacts)
            terms = ms*prods*(Jn - In)/(Jn - ms)
            termsum = sum(terms)
            # stored at index n (not n - 1), so Mn[0] stays 0
            Mn[n] += termsum/reps

            Msum += termsum

    return 1/(1 + np.array(Mn)), np.array(ns)

def decayExponent(text):
    """Fit the log-log slope of the innovation rate for `text`.

    The text is tokenised with preprocess(), the rates are computed with
    innovationrate() (repetitions = round(0.5 + 5000 / distinct words), at
    most 1000 terms per position), and a linear regression of log10(rate) on
    log10(position) is fitted over the last two thirds of the positions.

    Returns (-slope, total number of tokens).
    """
    word_counts = preprocess(text)
    n_reps = int(round(0.5 + (5000. / len(word_counts))))
    max_terms = 1000
    rates, positions = innovationrate(word_counts, n_reps, max_terms)

    # Only the tail (from one third of the way in) enters the fit.
    tail_start = int(len(positions)*1/3.)
    log_position = np.log10(positions)[tail_start:]
    log_rate = np.log10(rates)[tail_start:]

    slope = lm(log_position, log_rate)[0]
    total_tokens = sum(word_counts.values())
    return -slope, total_tokens

### The actual function

In [3]:
def getMu(text):
    """Return (mu, number of words) for `text`.

    Blank or whitespace-only text yields (0.0, 0) without running the fit;
    anything else is delegated to decayExponent().
    """
    if not text.strip():
        # nothing to analyse
        return 0.0, 0
    exponent, word_total = decayExponent(text)
    return exponent, word_total

## Number of links

In [4]:
def getNumLinks(text):
    """Count the http:// and https:// links found in `text`.

    A link is "http://" or "https://" followed by one or more
    non-whitespace characters.
    """
    url_pattern = r'(https?://[^\s]+)'
    return sum(1 for _ in re.finditer(url_pattern, text))

## Deviation from thread mean

In [5]:
def thread_deviation(comment_dict):
    """Return how far a comment's response time is from its thread's mean.

    comment_dict -- comment record with an 'id' whose part before the first
                    '_' is used as the thread id, and a numeric 'response'.

    The thread's response times come from main.threadResponse(thread_id).
    Returns comment_dict['response'] minus their mean, or None when the value
    cannot be computed (missing keys, lookup failure, ...). This stays
    best-effort on purpose, but only ordinary exceptions are swallowed:
    KeyboardInterrupt and SystemExit now propagate.
    """
    try:
        thread_id = comment_dict['id'].split('_')[0]
        thread_times = main.threadResponse(thread_id)
        return comment_dict['response'] - np.mean(thread_times)
    except Exception:
        return None



In [6]:
# Load the comment records. Later cells index COMMENTS by comment ID and read
# the 'message', 'response' and 'bot' fields of each record.
with open('../../data/COMMENTS.json', 'r') as f2:
    COMMENTS = json.load(f2)

In [7]:
# Load the precomputed per-comment statistics, keyed by comment ID. classify()
# unpacks each entry as (mu1, mu2, num_links, response, deviation, length).
with open('TRAIN_STATS.json', 'r') as f:
    ALL_STATS = json.load(f)

# Cross-Validation

### Classification function

In [8]:
def classify(commID, thresholds = [2086.68, 0, 0.2, 0.5, 0, 0.25, 100]):
    """Return True when comment `commID` is classified as a bot comment.

    thresholds is indexed as:
        0 -- response time
        1 -- deviation from the thread's mean response time
        2 -- mu lower bound
        3 -- mu upper bound
        4 -- mu word count (not consulted by this function)
        5 -- number of links
        6 -- comment length

    The comment's statistics are taken from ALL_STATS when available and are
    otherwise computed from the COMMENTS record. A comment is flagged when at
    least one content test fires (mu outside [thresholds[2], thresholds[3]],
    too many links, or too long) and neither timing test fires (response time
    or thread deviation above its threshold).
    """
    if commID in ALL_STATS:
        mu1, mu2, num_links, response, deviation, length = ALL_STATS[commID]
    else:
        comment = COMMENTS[commID]
        message = comment['message']
        mu1, mu2 = getMu(message)
        num_links = getNumLinks(message)
        response = comment['response']
        deviation = thread_deviation(comment)
        length = len(message)

    content_flag = (mu1 < thresholds[2] or mu1 > thresholds[3]
                    or num_links > thresholds[5] or length > thresholds[6])
    if not content_flag:
        return False

    # A slow response (in absolute terms or relative to the thread) vetoes
    # the bot label.
    timing_veto = response > thresholds[0] or deviation > thresholds[1]
    return not timing_veto

In [9]:
def split_data(list_commIDs):
    """Split the comment IDs into 10 random folds of near-equal size.

    list_commIDs -- any iterable of comment IDs. It is copied before
                    shuffling, so the caller's sequence is left untouched
                    and dict key views are accepted as well as lists.

    The global `random` generator is seeded with 0 first, so the same input
    order always gives the same folds. Fold sizes differ by at most one.

    Returns a list of 10 lists.
    """
    shuffled = list(list_commIDs)
    random.seed(0)
    random.shuffle(shuffled)
    # Element i of the shuffled list goes to fold i % 10.
    return [shuffled[fold::10] for fold in range(10)]

Create the cross-validation function itself. We want to apply it to each of the 7 threshold parameters in turn,
so one of its arguments is the index of the parameter being tuned:
* 0 = response time
* 1 = deviation from thread mean response time
* 2 = mu lower bound
* 3 = mu upper bound
* 4 = mu word count
* 5 = number links
* 6 = comment length

In [15]:
# Load the per-user statistics, keyed by user ID. optimize_classify() reads
# positions 2-6 of each entry.
with open('ALL_USER_STATS.json', 'r') as f:
    all_users = json.load(f)
    
#get the bots info
# annotation maps bucket -> {user: label}
with open('../../data/annotation.json', 'r') as f2:
    annotation = json.load(f2)
    
# Every user whose label is anything other than '1' is collected as a bot.
# NOTE(review): presumably '1' is the "human" label -- confirm against the annotation scheme.
BOTS = [user for bucket in annotation for user in annotation[bucket] if annotation[bucket][user] != '1']

IOError: [Errno 2] No such file or directory: 'ALL_USER_STATS.json'

In [10]:
def crossValidateParam(parameter, guess_range, n = 100):
    """Perform 10-fold cross validation for a single threshold.

    parameter   -- index into the thresholds list of the value being tuned.
    guess_range -- (low, high) pair; the candidates are the n evenly spaced
                   values low, low + step, ..., high - step.
    n           -- number of candidate values to scan.

    The comment IDs in ALL_STATS are split with split_data(). For each fold,
    the candidate with the highest F1 on the other nine folds is selected
    (the first one wins ties) and then scored on the held-out fold.

    Returns a list with one (best parameter value, held-out F1) tuple per fold.
    """
    data_split = split_data(ALL_STATS.keys())
    result = [(None, None) for _ in range(10)] #list of tuples (parameter, F1 score) for each fold

    # The step is the same for every candidate; float() keeps this a true
    # division even without `from __future__ import division`.
    step_size = (guess_range[1] - guess_range[0]) / float(n) if n else 0.0

    for i in range(len(data_split)):
        test = data_split[i]
        # flatten every fold except the held-out one, leaving data_split intact
        train = [comm
                 for j, other_fold in enumerate(data_split) if j != i
                 for comm in other_fold]

        best_param = 0
        max_F1 = -1000   #stores the F1 score associated with best_param
        for scan_num in range(n):
            param_value = guess_range[0] + step_size * scan_num
            param_F1 = cross_helper(train, parameter, param_value)
            if param_F1 > max_F1:
                best_param = param_value
                max_F1 = param_F1

        #now that we have the best param for the fold, apply it to the test data
        result[i] = (best_param, cross_helper(test, parameter, best_param))
    return result
        
def cross_helper(comments, parameter, param_value):
    """Return the F1 score on `comments` when only one threshold is active.

    comments    -- list of comment IDs to classify.
    parameter   -- index of the threshold under test.
    param_value -- value to try for that threshold.

    Every other threshold is set so that it cannot influence classify():
    for the timing parameters (0 and 1) the content test is forced to fire,
    and for the remaining parameters the timing tests and the other content
    tests are disabled with infinities.
    """
    inf = float('Inf')
    if parameter in (0, 1):
        # mu upper bound of -inf: just need one to set off the "OR" switch
        thresholds = [inf, inf, 0, -inf, 0, 0, 0]
    else:
        thresholds = [inf, inf, -inf, inf, 0, inf, inf]
    thresholds[parameter] = param_value
    return create_matrix(comments, thresholds)

def create_matrix(comments, thresholds):
    """Classify every comment in `comments` and return the resulting F1 score.

    Each prediction from classify(comment, thresholds) is compared with the
    comment's 'bot' flag in COMMENTS and tallied in a confusion matrix with
    the keys "tp", "fn", "fp" and "tn", which is then scored with eval_F1().
    """
    tally = Counter()
    for comm_id in comments:
        predicted_bot = classify(comm_id, thresholds)
        if COMMENTS[comm_id]['bot']:
            # really a bot: hit or miss
            outcome = "tp" if predicted_bot else "fn"
        else:
            # really a human: false alarm or correct rejection
            outcome = "fp" if predicted_bot else "tn"
        tally[outcome] += 1
    return eval_F1(tally)

def eval_precision(confusion_matrix):
    """Return the precision, tp / (tp + fp), of the Counter `confusion_matrix`.

    Returns 0.0 when there are no positive predictions (tp + fp == 0). Other
    errors, such as passing something that is not a mapping, are no longer
    silently turned into 0.0.
    """
    true_pos = confusion_matrix["tp"]
    predicted_pos = true_pos + confusion_matrix["fp"]
    if not predicted_pos:
        return 0.0
    # float() keeps this a true division regardless of __future__ imports.
    return true_pos / float(predicted_pos)

def eval_recall(confusion_matrix):
    """Return the recall, tp / (tp + fn), of the Counter `confusion_matrix`.

    Returns 0.0 when there are no actual positives (tp + fn == 0). Other
    errors, such as passing something that is not a mapping, are no longer
    silently turned into 0.0.
    """
    true_pos = confusion_matrix["tp"]
    actual_pos = true_pos + confusion_matrix["fn"]
    if not actual_pos:
        return 0.0
    # float() keeps this a true division regardless of __future__ imports.
    return true_pos / float(actual_pos)

def eval_F1(confusion_matrix):
    """Return the F1 score of the Counter `confusion_matrix`.

    F1 is the harmonic mean of eval_precision() and eval_recall(), i.e.
    2 * precision * recall / (precision + recall). Returns 0.0 when both are
    zero; no other error is suppressed.
    """
    precision = eval_precision(confusion_matrix)
    recall = eval_recall(confusion_matrix)
    denominator = precision + recall
    if not denominator:
        return 0.0
    return 2 * precision * recall / denominator

def total_eval(confusion_matrix):
    """Return (precision, recall, F1) for the Counter `confusion_matrix`."""
    precision = eval_precision(confusion_matrix)
    recall = eval_recall(confusion_matrix)
    f1 = eval_F1(confusion_matrix)
    return precision, recall, f1
            

In [29]:
# Scan threshold 1 (deviation from the thread's mean response time) from 30000
# towards 50000 with the default 100 steps; prints one
# (best value, held-out F1) pair per fold.
print crossValidateParam(1, (30000, 50000))

[(30800.0, 0.6318577374195989), (44600.0, 0.6210299886119195), (30800.0, 0.6150707637383654), (30800.0, 0.6339499182081288), (30800.0, 0.6307692307692309), (30800.0, 0.6348088531187123), (44600.0, 0.6309045226130654), (30800.0, 0.6251266464032422), (41800.0, 0.6317244846656611), (30800.0, 0.6303198186854697)]


In [30]:
def run_cross():
    """Cross-validate the selected thresholds, then score them on all of ALL_STATS.

    Thresholds 0, 1, 5 and 6 are each tuned with crossValidateParam() over a
    fixed search interval; the per-fold best values are averaged and written
    into the thresholds list. The resulting thresholds are then applied to
    every comment in ALL_STATS.

    Prints the final thresholds and returns (precision, recall, F1).
    """
    parameters = [0, 1, 5, 6]  #only want to look at these
    intervals = [(30000, 50000), (60000, 80000), (0, .05), (0, 100)]
    thresholds = [2000, 0, -float('Inf'), float('Inf'), 0, 0, 0]
    for param, interval in zip(parameters, intervals):
        fold_results = crossValidateParam(param, interval)
        best_values = [best_value for best_value, _ in fold_results]
        # Average over however many folds came back; float() keeps this a
        # true division without `from __future__ import division`.
        thresholds[param] = sum(best_values) / float(len(best_values))

    #now run this on the full set
    confusion = Counter()
    for comm in ALL_STATS.keys():
        predicted_bot = classify(comm, thresholds)
        if COMMENTS[comm]['bot']:
            confusion["tp" if predicted_bot else "fn"] += 1
        else:
            confusion["fp" if predicted_bot else "tn"] += 1

    #display the optimal parameters
    print(thresholds)
    return total_eval(confusion)

In [31]:
# Prints the tuned thresholds (from inside run_cross) followed by the
# (precision, recall, F1) tuple it returns.
print run_cross()

[37400.0, 70000.0, -inf, inf, 0, 0.0, 3.0]
(0.48439683278993945, 0.9548902195608783, 0.6427429431285352)


* Without thread response deviation:
  * (precision, recall, F1) = (0.5024390243902439, 0.8223552894211577, 0.6237698713096139)
  * thresholds = [4925.0, 0, -inf, inf, 0, 0.0, 3.0]

* With thread response deviation:
  * (precision, recall, F1) = (0.48874488403819916, 0.9152894211576846, 0.6372252021900446)
  * thresholds = [4925.0, 4900.0, -inf, inf, 0, 0.0, 3.0]
 

## Scipy Optimization

In [18]:
def optimizeCross(thresholds = [3235.0, 1010.0, 0.004200000000000001, 162.3, 30.0]):
    """Tune the five user-level thresholds with Nelder-Mead over 10 folds.

    thresholds -- starting point for the optimiser, indexed the way
                  optimize_classify() reads it.

    The users in all_users are split with split_data(). For each fold,
    log_create_matrix is minimised on the other nine folds and the resulting
    thresholds are scored with optimize_matrix() on the held-out fold. The
    per-fold thresholds are then averaged and applied to every user.

    Prints the averaged thresholds and the list of held-out F1 scores, and
    returns (precision, recall, F1) of the averaged thresholds on all users.
    """
    # `import scipy` alone does not guarantee that the optimize submodule is loaded.
    from scipy.optimize import minimize

    data_split = split_data(all_users.keys())
    results = []   #thresholds obtained for each fold
    fold_F1 = []   #held-out F1 for each fold
    for i in range(len(data_split)):
        test = data_split[i]
        # flatten every fold except the held-out one, leaving data_split intact
        train = [user
                 for j, other_fold in enumerate(data_split) if j != i
                 for user in other_fold]
        answer = minimize(log_create_matrix,
                          thresholds,
                          args = (train,),
                          method = 'Nelder-Mead')
        results.append(answer.x)
        fold_F1.append(optimize_matrix(answer.x, test))
    final_params = [np.mean(param) for param in zip(*results)] #take the means across the folds

    bot_ids = set(BOTS)  # constant-time membership instead of scanning the list per user
    confusion = Counter()
    for user in all_users.keys():
        predicted_bot = optimize_classify(user, final_params)
        if user in bot_ids:
            confusion["tp" if predicted_bot else "fn"] += 1
        else:
            confusion["fp" if predicted_bot else "tn"] += 1
    print(final_params)
    print(fold_F1)
    return total_eval(confusion)
  

def log_create_matrix(thresholds, users):
    """Objective function for scipy.optimize.minimize: the negative log of F1.

    thresholds -- the vector being optimised; it has to be the first argument
                  because that is what scipy passes in.
    users      -- the user IDs to score, forwarded to optimize_matrix().

    Returns -log(F1). When F1 is 0 the result is +inf, so a candidate that
    classifies nothing correctly is treated as the worst possible point
    instead of aborting the optimisation with a math domain error.
    """
    f1 = optimize_matrix(thresholds, users)
    if f1 <= 0:
        return float('inf')
    return -math.log(f1)

def optimize_matrix(thresholds, users):
    """Return the F1 score of optimize_classify() over `users`.

    User-level counterpart of create_matrix: each prediction from
    optimize_classify(user, thresholds) is compared with membership in BOTS
    and tallied under "tp", "fn", "fp" or "tn" before scoring with eval_F1().
    No mu thresholds are involved.
    """
    tally = Counter()
    for user_id in users:
        predicted_bot = optimize_classify(user_id, thresholds)
        if user_id in BOTS:
            # really a bot: hit or miss
            outcome = "tp" if predicted_bot else "fn"
        else:
            # really a human: false alarm or correct rejection
            outcome = "fp" if predicted_bot else "tn"
        tally[outcome] += 1
    return eval_F1(tally)

def optimize_classify(userID, thresholds):
    """Return True when user `userID` is classified as a bot, ignoring mu.

    thresholds is indexed as:
        0 -- response time (delta_t)
        1 -- thread average
        2 -- number of links
        3 -- comment length (c_bar)
        4 -- maximum daily comments (d_max)

    A user is flagged when at least one activity test fires (links, daily
    maximum or comment length above its threshold) and neither timing test
    fires (response time or thread average above its threshold).
    """
    if userID in all_users:
        user_stats = all_users[userID]
        response_time = user_stats[4]
        link_count = user_stats[2]
        mean_length = user_stats[6]
        daily_max = user_stats[3]
        thread_mean = user_stats[5]
    else:
        # NOTE(review): avg_response, comment_length, maxDailyComments and
        # all_comments are not defined in the visible part of this notebook,
        # and getNumLinks / thread_deviation are called here with arguments
        # that differ from their visible definitions -- confirm this branch
        # is still reachable and correct.
        response_time = avg_response(userID)
        link_count = getNumLinks(userID, all_comments)
        mean_length = comment_length(userID)
        daily_max = maxDailyComments(userID)
        thread_mean = thread_deviation(userID)

    activity_flag = (link_count > thresholds[2] or daily_max > thresholds[4]
                     or mean_length > thresholds[3])
    if not activity_flag:
        return False

    # A slow response vetoes the bot label.
    timing_veto = response_time > thresholds[0] or thread_mean > thresholds[1]
    return not timing_veto
    
    
    

In [19]:
# Run the optimisation from the default starting thresholds. The function
# prints the averaged thresholds and the per-fold F1 scores; the cell's value
# is the (precision, recall, F1) tuple it returns.
optimizeCross()

[3176.8488895278083, 989.54603895500804, 0.0041883628274073624, 174.42818973379582, 30.156292945920008]
[0.5000000000000001, 0.3783783783783784, 0.5909090909090909, 0.7037037037037038, 0.5714285714285714, 0.5641025641025641, 0.42553191489361697, 0.625, 0.6341463414634146, 0.48979591836734687]


(0.42955326460481097, 0.8169934640522876, 0.563063063063063)

### Results of optimization:
(precision, recall, F1)
* threshold from just parameter scan: (0.41946308724832215, 0.8169934640522876, 0.5543237250554325)
* after scipy.optimize: (0.42955326460481097, 0.8169934640522876, 0.563063063063063)

Looks like the optimization just increased precision a bit. Recall was exactly the same