### Setup

In [1]:
import datetime, json, re, math, sys, scipy
import numpy as np
import random as ra
import matplotlib.pyplot as plt
from scipy.stats import linregress as lm
import random
from collections import Counter, defaultdict
from __future__ import division
% matplotlib inline
#import the files
sys.path.append('../API')
import main

### Necessary Functions

In [35]:
def preprocess(text):
    """Count how often each space-delimited token occurs in _text_.

    The text is UTF-8 encoded before splitting, so the keys of the returned
    Counter are byte strings. Only the single space character separates
    tokens: consecutive spaces produce empty-string tokens, and tabs or
    newlines stay inside the tokens they touch.

    Returns a collections.Counter mapping token -> number of occurrences.
    """
    # bytes.split(b" ") yields exactly the tokens re.split(" ", ...) did, but
    # does not fail on Python 3 where a str pattern cannot be applied to bytes.
    return Counter(text.encode("utf8").split(b" "))

def order(counts):
    """Return the distinct words of _counts_ in a random order.

    Words are drawn without replacement, each draw weighted by the word's
    share of the total count, so frequent words tend to come first. The
    result is a numpy array containing every key of _counts_ exactly once.
    Uses numpy's global random state.
    """
    # Materialise keys/values as lists: numpy cannot consume dict views.
    words = list(counts.keys())
    freqs = np.array(list(counts.values()), dtype=float)
    probs = freqs / freqs.sum()
    return np.random.choice(words, size=len(words), replace=False, p=probs)

def innovationrate(counts, reps = 2, termmax = 100):
    """Estimate a per-position rate array from word counts by repeated
    random orderings of the vocabulary.

    counts  -- mapping word -> number of occurrences
    reps    -- number of random orderings (from order()) to average over
    termmax -- cap on the number of terms summed at each position

    Returns (alphas, ns): alphas = 1/(1 + Mn) as a numpy array and
    ns = 1..N as a numpy array, where N is the number of distinct words.
    NOTE(review): Mn looks like an expected gap between first appearances of
    new words -- confirm against the method this was derived from.
    """
    
    N = len(counts)    # number of distinct words
    FN = sum(counts.values())  # total number of word occurrences

    ns = range(1,N+1)
    Mn = [0 for n in ns]  # per-position accumulator, averaged over reps below
    
    for rep in range(reps):
        n = 0
        Fn = 0    # running sum of the counts of the words drawn so far
        Msum = 0  # running sum of termsum within this ordering
        for n, word in zip(ns, order(counts)):

            f = counts[word]
            Fn += f

            # the last word of the ordering contributes no term
            if n == N:
                break 

            ## compute In and Jn
            In = Fn - (n - 1 + int(Msum))
            Jn = FN - (n - 1 + int(Msum))

            ## compute the average
            # m runs over 1..min(In, termmax); the array is empty when In <= 0
            ms = np.array(range(1, min([In,termmax])+1))

            # running product of (In - m)/(Jn - m), accumulated in log space;
            # when m reaches In the log is -inf and the product becomes 0
            logfacts = np.log10(In - ms) - np.log10(Jn - ms)
            prods = 10**np.cumsum(logfacts)
            terms = ms*prods*(Jn - In)/(Jn - ms)
            termsum = sum(terms)
            # n takes the values 1..N-1 here, so Mn[0] is never written and stays 0
            Mn[n] += termsum/reps

            Msum += termsum

    return 1/(1 + np.array(Mn)), np.array(ns)

def decayExponent(text):
    """Fit a power-law decay exponent to the innovation rate of _text_.

    The innovation rate is computed from the token counts of _text_, using
    more random orderings for smaller vocabularies (about 5000 / vocabulary
    size). A straight line is fitted in log10-log10 space over the last two
    thirds of the positions.

    Returns (negated slope of that fit, total number of tokens).
    """
    word_counts = preprocess(text)
    n_reps = int(round(0.5 + (5000. / len(word_counts))))
    rates, positions = innovationrate(word_counts, n_reps, 1000)

    # skip the first third of the positions when fitting
    tail_start = int(len(positions)*1/3.)
    log_positions = np.log10(positions)[tail_start:]
    log_rates = np.log10(rates)[tail_start:]

    slope, intercept, r_value, p_value, std_err = lm(log_positions, log_rates)

    total_tokens = sum(word_counts.values())
    return -slope, total_tokens

def getMu(userID, all_comments):
    """Return (decay exponent, token count) for all comments of _userID_.

    The user's comments are concatenated, each followed by a space, and
    passed to decayExponent. If the concatenation is empty or whitespace
    only, (0.0, 0) is returned instead.
    """
    total_text = "".join(comment + " " for comment in all_comments[userID])
    if not total_text.strip():
        # no usable text for this user
        return 0.0, 0
    mu, numwords = decayExponent(total_text)
    return mu, numwords
    
def getNumLinks(userID, all_comments):
    """Return the mean number of http(s) links per comment for _userID_.

    A link is any run of non-whitespace characters starting with
    "http://" or "https://".
    """
    link_counts = [
        len(re.findall(r'(https?://[^\s]+)', comment))
        for comment in all_comments[userID]
    ]
    return np.mean(link_counts)

def maxDailyComments(UID):
    """Return the largest number of comments _UID_ made under one date key.

    The date key is the first 10 characters of each comment's 'time' field
    (presumably a YYYY-MM-DD prefix -- verify against the data).
    """
    per_day = Counter(
        comment['time'][0:10] for comment in main.COMMENTS_BY_USER[UID]
    )
    return max(per_day.values())

def avg_response(userID, choice = "All"):
    """Return the mean of main.userResponse(userID, choice), or None when
    that call returns nothing."""
    comment_times = main.userResponse(userID, choice)
    return np.mean(comment_times) if comment_times else None
    
def thread_deviation(userID, choice = "All"):
    """Return how far the response times of _userID_ sit from the mean
    response time of the threads they were posted in.

    For every thread the user commented in, the user's response times minus
    the thread's mean response time are averaged; the result is the mean of
    those per-thread averages. Returns None if the user has no response
    times.
    """
    threads = main.userThreads(userID, choice)
    comment_times = main.userResponse(userID, choice)
    if not comment_times:
        return None

    # group the user's response times by the thread they belong to
    times_by_thread = defaultdict(list)
    for thread, comment_time in zip(threads, comment_times):
        times_by_thread[thread].append(comment_time)

    per_thread_avgs = []
    for threadID, own_times in times_by_thread.items():
        thread_mean = np.mean(main.threadResponse(threadID, choice))
        offsets = [response - thread_mean for response in own_times]
        per_thread_avgs.append(np.mean(offsets))
    return np.mean(per_thread_avgs)
    
def comment_length(userID, choice = "All"):
    """Return the mean len() of the comments main.userText gives for
    _userID_."""
    return np.mean([len(text) for text in main.userText(userID, choice)])

def split_users(list_users):
    """Shuffle the user IDs deterministically and split them into a blind
    test set plus 10 cross-validation folds.

    list_users -- any iterable of user IDs (a list, dict keys, ...); it is
                  copied, so the caller's object is never reordered.

    Returns (data_split, blind_test): blind_test is the first 100 shuffled
    users, and data_split is a list of 10 lists over which the remaining
    users are dealt round-robin (fold sizes differ by at most one).
    The module-level random generator is re-seeded with 0 on every call so
    the split is reproducible for a given input order.
    """
    users = list(list_users)  # work on a copy; also accepts dict views
    random.seed(0)
    random.shuffle(users)
    blind_test = users[0:100]
    remaining = users[100:]
    # remaining[k::10] is exactly the set of users whose index i has i % 10 == k
    data_split = [remaining[k::10] for k in range(10)]
    return data_split, blind_test


def eval_precision(confusion_matrix):
    """given the Counter _confusion_matrix_ (keys "tp", "fp", "fn", "tn"),
    calculates and returns the precision tp / (tp + fp).

    Returns 0.0 when nothing was classified positive (tp + fp == 0).
    Missing keys count as 0."""
    true_pos = confusion_matrix.get("tp", 0)
    predicted_pos = true_pos + confusion_matrix.get("fp", 0)
    if not predicted_pos:
        # undefined precision: report 0.0 rather than dividing by zero
        return 0.0
    return true_pos / float(predicted_pos)

def eval_recall(confusion_matrix):
    """given the Counter _confusion_matrix_ (keys "tp", "fp", "fn", "tn"),
    calculates and returns the recall tp / (tp + fn).

    Returns 0.0 when there are no actual positives (tp + fn == 0).
    Missing keys count as 0."""
    true_pos = confusion_matrix.get("tp", 0)
    actual_pos = true_pos + confusion_matrix.get("fn", 0)
    if not actual_pos:
        # undefined recall: report 0.0 rather than dividing by zero
        return 0.0
    return true_pos / float(actual_pos)

def eval_F1(confusion_matrix):
    """calculates and returns the F1 score (harmonic mean of precision and
    recall) for the Counter _confusion_matrix_.

    Returns 0.0 when precision and recall are both 0."""
    precision = eval_precision(confusion_matrix)
    recall = eval_recall(confusion_matrix)
    if precision + recall == 0:
        # harmonic mean is undefined here; report 0.0 instead of dividing by zero
        return 0.0
    return 2 * precision * recall / (precision + recall)

### Data Load

In [3]:
# Map every user ID in main.COMMENTS_BY_USER to what main.userText returns for it
all_comments = dict(
    (user, main.userText(user)) for user in main.COMMENTS_BY_USER
)

In [4]:
# Per-user statistics; optimize_classify reads positions 0 and 2-6 of each entry
with open('ALL_USER_STATS.json', 'r') as f:
    all_users = json.load(f)

# Annotations: {bucket: {user: label}}
with open('../../data/annotation.json', 'r') as f2:
    annotation = json.load(f2)

# BOTS: every annotated user whose label is not '1'
# buckets: user -> group 0..3, from the numeric bucket key
#          (<= 25 -> 0, <= 50 -> 1, <= 75 -> 2, otherwise 3)
BOTS = []
buckets = {}
for bucket in annotation:
    for user in annotation[bucket]:
        if annotation[bucket][user] != '1':
            BOTS.append(user)
        if int(bucket) <= 25:
            buckets[user] = 0
        elif int(bucket) <= 50:
            buckets[user] = 1
        elif int(bucket) <= 75:
            buckets[user] = 2
        else:
            buckets[user] = 3

### Optimization

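`means` and `thresholds` each hold six entries per activity bucket (24 in total), in the order used by `optimize_classify`: [$\mu$, $\ell$ (avg number of links), $d_\text{max}$ (max comments per day), $\overline{\delta_t}$ (avg response time), $\overline{C}$ (avg comment length), deviation from thread mean]. The entry for feature $i$ of bucket $b$ sits at index $6b + i$.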

In [5]:
def optimize_classify(userID, thresholds, means, indicators = [0,1,2,4], breakers = [3, 5]):
    """Decide whether _userID_ is classified as a bot.

    The user's six features are ordered
    [mu, l, d_max, delta_t, c_bar, thread_avg] (indices 0..5). Feature i of
    a user in bucket b is "out of band" when it lies more than
    thresholds[6*b+i] away from means[6*b+i].

    userID     -- user to classify; must be a key of the global `buckets`
    thresholds -- flat sequence of band half-widths, six per bucket
    means      -- flat sequence of band centres, six per bucket
    indicators -- feature indices that can flag the user as a bot
    breakers   -- feature indices that can revoke that flag

    Returns True when at least one indicator is out of band and no breaker
    is out of band, otherwise False.
    """
    b = buckets[userID]

    if userID in all_users:
        # precomputed statistics; position 1 of the entry is not used here
        stats = all_users[userID]
        mu, l, d_max = stats[0], stats[2], stats[3]
        delta_t, thread_avg, c_bar = stats[4], stats[5], stats[6]
    else:
        delta_t = avg_response(userID)
        thread_avg = thread_deviation(userID)
        mu, _ = getMu(userID, all_comments)
        # was `all_users(userID, all_comments)`: all_users is a dict, so that
        # call raised TypeError; getNumLinks is the helper with this signature
        l = getNumLinks(userID, all_comments)
        c_bar = comment_length(userID)
        d_max = maxDailyComments(userID)

    uvals = [mu, l, d_max, delta_t, c_bar, thread_avg]

    def out_of_band(i):
        center = means[6*b+i]
        width = thresholds[6*b+i]
        return uvals[i] < center - width or uvals[i] > center + width

    if not any(out_of_band(i) for i in indicators):
        return False
    return not any(out_of_band(i) for i in breakers)

In [6]:
def log_create_matrix(thresholds, users, means):
    """Return -log(F1) of the classifier on _users_, as a loss to minimise.

    When the logarithm raises ValueError (e.g. an F1 of 0), positive
    infinity is returned instead.
    """
    try:
        loss = -math.log(optimize_matrix(thresholds, users, means))
    except ValueError:
        loss = float("Inf")
    return loss

def optimize_matrix(thresholds, users, means, indicators = [0,1,2,4], breakers = [3, 5]):
    """Classify every user in _users_ and return the F1 score of the result.

    Membership in the global BOTS list is the ground truth; the prediction
    comes from optimize_classify with the given thresholds and means.
    """
    # (is a bot, classified as bot) -> confusion-matrix cell
    cell_for = {
        (True, True): "tp",    # classify says bot, is bot
        (True, False): "fn",   # classify says human, is bot
        (False, True): "fp",   # classify says bot, but human
        (False, False): "tn",  # classify says human, is human
    }
    confusion = Counter()
    for user in users:
        classification = optimize_classify(user, thresholds, means, indicators = indicators, breakers = breakers)
        confusion[cell_for[(user in BOTS, bool(classification))]] += 1
    return eval_F1(confusion)

In [36]:
# def optimize_cross(thresholds = [0.0, 5000.0, -1000.0, 1000.0, 0.0, 1.0, 0.0, 150.0, 0.0, 10.0]):
def optimize_cross(
    indicators = [0,1,2,4], breakers = [3, 5], 
    numseeds = 10, checkperseed = 10, gridnum = 1000
):
    """Tune the per-bucket thresholds with 10-fold cross-validation, then
    score the fold-averaged parameters on the blind test set.

    indicators, breakers -- feature indices (0..5) passed through to
                            optimize_classify; only these are tuned
    numseeds     -- number of passes over the parameters, one per RNG seed
    checkperseed -- candidate values tried per parameter in each pass
    gridnum      -- grid resolution: candidates are t/gridnum of the
                    parameter's upper bound, t in 1..gridnum

    Returns (precision, recall, F1, final_means, final_params, allmeans,
    results): the first three are measured on the blind test set,
    final_means/final_params are element-wise means over the folds, and
    allmeans/results hold the per-fold means and tuned thresholds.

    NOTE(review): all_users.keys() is handed to split_users (which shuffles
    it) and range(24) is shuffled in place, so this relies on both being
    lists, as in Python 2.
    """
    data_split, blind_test = split_users(all_users.keys())
    results = [[None] * 10 for _ in range(10)]
    allmeans = [[None] * 10 for _ in range(10)]
    fold_F1 = [None] * 10

    for i, fold in enumerate(data_split):
        print("working on fold "+str(i+1)+" of 10")
        # fold i is held out as test; the other nine folds form the training set
        copy = list(data_split)
        test = copy.pop(i)
        train = [user for fold in copy for user in fold]

        # Each list receives six entries per bucket, in the feature order
        # mu, l, d_max, delta_t, c_bar, thread_avg (24 entries in total).
        # mu, l, d_max and c_bar are summarised over non-bot training users;
        # delta_t and thread_avg are summarised over bot training users.
        means = []
        thresholds = []
        bounds = []
        for b in [0,1,2,3]:
        
            means.extend([
                np.mean([all_users[user][0] for user in train if user not in BOTS and buckets[user] == b]), # mu
                np.mean([all_users[user][2] for user in train if user not in BOTS and buckets[user] == b]), # l
                np.mean([all_users[user][3] for user in train if user not in BOTS and buckets[user] == b]), # d_max
                np.mean([all_users[user][4] for user in train if user in BOTS and buckets[user] == b]), # delta_t
                np.mean([all_users[user][6] for user in train if user not in BOTS and buckets[user] == b]), # c_bar
                np.mean([all_users[user][5] for user in train if user in BOTS and buckets[user] == b]) # thread_avg
            ])

            # starting thresholds: one standard deviation of the same samples
            thresholds.extend([
                np.std([all_users[user][0] for user in train if user not in BOTS and buckets[user] == b]), # mu
                np.std([all_users[user][2] for user in train if user not in BOTS and buckets[user] == b]), # l
                np.std([all_users[user][3] for user in train if user not in BOTS and buckets[user] == b]), # d_max
                np.std([all_users[user][4] for user in train if user in BOTS and buckets[user] == b]), # delta_t
                np.std([all_users[user][6] for user in train if user not in BOTS and buckets[user] == b]), # c_bar
                np.std([all_users[user][5] for user in train if user in BOTS and buckets[user] == b]) # thread_avg
            ])   
            
            # search range per parameter; only the upper bound (sample maximum) is read below
            bounds.extend([
                (0,max([all_users[user][0] for user in train if user not in BOTS and buckets[user] == b])),
                (0,max([all_users[user][2] for user in train if user not in BOTS and buckets[user] == b])),
                (0,max([all_users[user][3] for user in train if user not in BOTS and buckets[user] == b])),
                (0,max([all_users[user][4] for user in train if user in BOTS and buckets[user] == b])),
                (0,max([all_users[user][6] for user in train if user not in BOTS and buckets[user] == b])),
                (0,max([all_users[user][5] for user in train if user in BOTS and buckets[user] == b]))
            ])

        
        # Coordinate-wise search: in each pass the 24 parameters are visited in
        # a seed-dependent random order and tuned one at a time on the training set.
        for seed in range(numseeds):
#             print("working on seed "+str(seed + 1)+" of "+str(numseeds))
            ra.seed(seed)
            param_order = range(24)
            ra.shuffle(param_order)
            for p_i in param_order:
                # skip parameters whose feature is neither an indicator nor a breaker
                if p_i % 6 in indicators or p_i % 6 in breakers:                
                    best_F1 = 0
                    best_thresh = 0
                    # re-seeding here makes every parameter of this pass see the same grid sample
                    ra.seed(seed)
                    for t_i in ra.sample(range(1, gridnum + 1), checkperseed):
                        thresholds[p_i] = t_i*bounds[p_i][1]/gridnum
                        current_F1 = optimize_matrix(thresholds, train, means, indicators = indicators, breakers = breakers)
                        if current_F1 > best_F1:
                            best_F1 = current_F1
                            best_thresh = thresholds[p_i]
                    # stays 0 when no candidate produced a training F1 above 0
                    thresholds[p_i] = best_thresh
        results[i] = thresholds
    
        allmeans[i] = means
        ## This is the step where we run the fold's test
        ## using the best determined thresholds
        ## right now it only returns the confusion matrix's F1
        ## but we need a list of the of the FN users from test
        # fold_F1 is stored here but is not part of the return value
        fold_F1[i] = optimize_matrix(results[i], test, allmeans[i], indicators = indicators, breakers = breakers)
        
    # average each of the 24 means / thresholds across the ten folds
    final_means = [np.mean(param) for param in zip(*allmeans)]
    final_params = [np.mean(param) for param in zip(*results)]
    
    # score the averaged parameters on the blind test users
    confusion = Counter()
    for user in blind_test:
        classification = optimize_classify(user, final_params, final_means, indicators = indicators, breakers = breakers)
        if user in BOTS:  #the user is a bot
            if classification:
                confusion["tp"] += 1 #classify says bot, is bot
            else:
                confusion["fn"] += 1  #classify says human, is bot
            
        else: #the user is a human
            if classification:
                confusion["fp"] += 1  #classify says bot, but human
            else:
                confusion["tn"] += 1 #classify says human, is human
    ## results and allmeans hold the in-fold parameter thresholds and means
    ## used for the best classifier
    return(
        eval_precision(confusion), eval_recall(confusion), eval_F1(confusion),
        final_means, final_params, allmeans, results
    )

In [37]:
# Single run with all four indicators and both breakers: 2 seed passes,
# 10 candidate thresholds per parameter, grid resolution 1000.
precision, recall, F1, final_means, final_params, allmeans, results =\
optimize_cross(
    indicators = [0,1,2,4], breakers = [3,5], 
    numseeds = 2, checkperseed = 10, gridnum = 1000
)

working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10


In [38]:
# Blind-test precision, recall and F1 from the run above
# (the stored output shows them printed as one tuple).
print(precision, recall, F1)

(0.7857142857142857, 0.6875, 0.7333333333333334)


In [28]:
def crossval_models(
    numseeds = 2, checkperseed = 2, gridnum = 10,
    IndicatorsToCheck = [[0], [1], [2], [4], [0,1,2,4]],
    BreakersToCheck = [[], [3], [5], [3,5]]
):
    """Run optimize_cross for every combination of indicator set and breaker
    set, save all results as JSON and report the best model.

    numseeds, checkperseed, gridnum -- search settings passed to optimize_cross
    IndicatorsToCheck -- list of indicator index lists to try
    BreakersToCheck   -- list of breaker index lists to try

    The results are written to
    "allresults_crossval-numseeds_<n>-checkperseed_<c>-gridnum_<g>.json" in
    the working directory. The file holds the settings, one entry per
    combination under "crossvals" (indicators, breakers, P, R, F1,
    final_means, final_params, allmeans, results), and the index and F1 of
    the best combination ("best_ix", "best_F1"). Returns None.
    """
    allresults = {
        "crossvals": [],
        "numseeds": numseeds,
        "checkperseed": checkperseed,
        "gridnum": gridnum,
        "best_ix": 0,
        "best_F1": 0
    }

    for indicators in IndicatorsToCheck:
        for breakers in BreakersToCheck:

            print(
                "Working on indicators: " + str(indicators) +
                " using breakers: " + str(breakers)
            )

            P, R, F1, final_means, final_params, allmeans, results = optimize_cross(
                indicators = indicators,
                breakers = breakers,
                numseeds = numseeds,
                checkperseed = checkperseed,
                gridnum = gridnum
            )

            allresults["crossvals"].append({
                "indicators": indicators,
                "breakers": breakers,
                "P": P,
                "R": R,
                "F1": F1,
                "final_means": final_means,
                "final_params": final_params,
                "allmeans": allmeans,
                "results": results
            })
            # strictly greater: ties keep the earliest combination
            if F1 > allresults["best_F1"]:
                allresults["best_F1"] = F1
                allresults["best_ix"] = len(allresults["crossvals"]) - 1

    # Save first, so the (expensive) results survive any later failure.
    outfilename = "allresults_crossval"+\
                  "-numseeds_"+str(allresults["numseeds"])+\
                  "-checkperseed_"+str(allresults["checkperseed"])+\
                  "-gridnum_"+str(allresults["gridnum"])+\
                  ".json"

    with open(outfilename, "w") as f:
        f.write(json.dumps(allresults))

    if allresults["crossvals"]:
        # The per-model entries live under "crossvals"; indexing allresults
        # itself with the integer best_ix raised KeyError.
        best = allresults["crossvals"][allresults["best_ix"]]
        print(
            "The best model used the indicators:\n"+str(best["indicators"])+
            "\n\nand breakers:\n"+str(best["breakers"])+
            "\n\nand achieved (P,R,F1) = ("+
            ",".join(map(str, [best["P"], best["R"], best["F1"]]))+")"
        )

In [39]:
# Quick, coarse sweep over all indicator/breaker combinations; results go to
# allresults_crossval-numseeds_2-checkperseed_2-gridnum_10.json
crossval_models(numseeds = 2, checkperseed = 2, gridnum = 10)

Working on indicators: [0] using breakers: []
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [3]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [5]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [3, 5]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold

In [40]:
# List every model of the quick sweep as (F1, P, R, indicators, breakers), best F1 first
with open("allresults_crossval-numseeds_2-checkperseed_2-gridnum_10.json", "r") as f:
    run_data = json.load(f)
for x in sorted(
    [(run["F1"], run["P"], run["R"], run["indicators"], run["breakers"]) for run in run_data["crossvals"]],
    reverse=True
):
    print(x)

(0.7857142857142857, 0.9166666666666666, 0.6875, [0, 1, 2, 4], [3])
(0.7586206896551724, 0.8461538461538461, 0.6875, [0, 1, 2, 4], [])
(0.7407407407407406, 0.9090909090909091, 0.625, [0, 1, 2, 4], [5])
(0.7407407407407406, 0.9090909090909091, 0.625, [0, 1, 2, 4], [3, 5])
(0.5454545454545454, 1.0, 0.375, [0], [3])
(0.5217391304347825, 0.8571428571428571, 0.375, [0], [])
(0.47619047619047616, 1.0, 0.3125, [0], [5])
(0.47619047619047616, 1.0, 0.3125, [0], [3, 5])
(0.45454545454545453, 0.8333333333333334, 0.3125, [4], [3])
(0.43478260869565216, 0.7142857142857143, 0.3125, [4], [])
(0.38095238095238093, 0.8, 0.25, [4], [5])
(0.38095238095238093, 0.8, 0.25, [4], [3, 5])
(0.38095238095238093, 0.8, 0.25, [1], [5])
(0.38095238095238093, 0.8, 0.25, [1], [3, 5])
(0.38095238095238093, 0.8, 0.25, [1], [3])
(0.38095238095238093, 0.8, 0.25, [1], [])
(0.3333333333333333, 0.5, 0.25, [2], [5])
(0.3333333333333333, 0.5, 0.25, [2], [3, 5])
(0.3333333333333333, 0.5, 0.25, [2], [3])
(0.32, 0.444444444444444

In [31]:
# Full sweep (10 seed passes, 100 candidates per parameter, grid of 1000); results go to
# allresults_crossval-numseeds_10-checkperseed_100-gridnum_1000.json.
# NOTE(review): the identical call appears in a later cell as well; each run overwrites that file.
crossval_models(numseeds = 10, checkperseed = 100, gridnum = 1000)

Working on indicators: [0] using breakers: []
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [3]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [5]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [3, 5]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold

In [34]:
# List every model of the full sweep as (F1, P, R, indicators, breakers), best F1 first
with open("allresults_crossval-numseeds_10-checkperseed_100-gridnum_1000.json", "r") as f:
    run_data = json.load(f)
for x in sorted(
    [(run["F1"], run["P"], run["R"], run["indicators"], run["breakers"]) for run in run_data["crossvals"]],
    reverse=True
):
    print(x)

(0.7543252595155711, 0.8014705882352942, 0.7124183006535948, [0, 1, 2, 4], [3])
(0.7534246575342465, 0.7913669064748201, 0.7189542483660131, [0, 1, 2, 4], [5])
(0.7534246575342465, 0.7913669064748201, 0.7189542483660131, [0, 1, 2, 4], [3, 5])
(0.7183098591549295, 0.7786259541984732, 0.6666666666666666, [0, 1, 2, 4], [])
(0.6720647773279352, 0.8829787234042553, 0.5424836601307189, [0], [5])
(0.6693227091633467, 0.8571428571428571, 0.5490196078431373, [0], [])
(0.6558704453441295, 0.8617021276595744, 0.5294117647058824, [0], [3, 5])
(0.6539923954372623, 0.7818181818181819, 0.5620915032679739, [0], [3])
(0.6065162907268171, 0.491869918699187, 0.7908496732026143, [1], [3, 5])
(0.6057441253263707, 0.5043478260869565, 0.7581699346405228, [4], [3, 5])
(0.6039603960396039, 0.4860557768924303, 0.7973856209150327, [1], [3])
(0.6009615384615385, 0.4752851711026616, 0.8169934640522876, [1], [])
(0.599483204134367, 0.49572649572649574, 0.7581699346405228, [4], [3])
(0.5990099009900991, 0.4820717131

In [41]:
# Full sweep (10 seed passes, 100 candidates per parameter, grid of 1000).
# NOTE(review): same call as in an earlier cell; re-running overwrites
# allresults_crossval-numseeds_10-checkperseed_100-gridnum_1000.json
crossval_models(numseeds = 10, checkperseed = 100, gridnum = 1000)

Working on indicators: [0] using breakers: []
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [3]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [5]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold 4 of 10
working on fold 5 of 10
working on fold 6 of 10
working on fold 7 of 10
working on fold 8 of 10
working on fold 9 of 10
working on fold 10 of 10
Working on indicators: [0] using breakers: [3, 5]
working on fold 1 of 10
working on fold 2 of 10
working on fold 3 of 10
working on fold

In [47]:
# Print the full-sweep results as LaTeX table rows, best F1 first.
# Each row is: indicators & breakers & P & R & F1, with scores as percentages.
# NOTE(review): the header row has six column titles but each data row has five cells.
with open("allresults_crossval-numseeds_10-checkperseed_100-gridnum_1000.json", "r") as f:
    run_data = json.load(f)
print(
    "\\textbf{I} & $\\overline{\\delta_t}$ & $\\overline{\\delta_t - \\overline{\\delta_t(\\text{thread})}}$ & " + 
    "\\textbf{P} & \\textbf{R} & \\textbf{F}$_1$ \\\\ \\hline"
)
for x in sorted([(run["F1"], run["P"], run["R"], run["indicators"], run["breakers"]) for run in run_data["crossvals"]], reverse=True):
    cells = [
        str(x[-2]),
        str(x[-1]),
        str(round(x[1]*100, 2)),
        str(round(x[2]*100, 2)),
        str(round(x[0]*100, 2)),
    ]
    print(" & ".join(cells) + " \\\\ \\hline")


\textbf{I} & $\overline{\delta_t}$ & $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$ & \textbf{P} & \textbf{R} & \textbf{F}$_1$ \\ \hline
[0, 1, 2, 4] & [3] & 92.31 & 75.0 & 82.76 \\ \hline
[0, 1, 2, 4] & [] & 85.71 & 75.0 & 80.0 \\ \hline
[0, 1, 2, 4] & [5] & 90.91 & 62.5 & 74.07 \\ \hline
[0, 1, 2, 4] & [3, 5] & 90.91 & 62.5 & 74.07 \\ \hline
[0] & [3] & 90.0 & 56.25 & 69.23 \\ \hline
[1] & [3] & 53.85 & 87.5 & 66.67 \\ \hline
[1] & [] & 51.72 & 93.75 & 66.67 \\ \hline
[0] & [] & 81.82 & 56.25 & 66.67 \\ \hline
[2] & [3] & 54.17 & 81.25 & 65.0 \\ \hline
[0] & [5] & 88.89 & 50.0 & 64.0 \\ \hline
[0] & [3, 5] & 88.89 & 50.0 & 64.0 \\ \hline
[1] & [3, 5] & 52.0 & 81.25 & 63.41 \\ \hline
[2] & [] & 50.0 & 81.25 & 61.9 \\ \hline
[1] & [5] & 50.0 & 81.25 & 61.9 \\ \hline
[4] & [3] & 52.17 & 75.0 & 61.54 \\ \hline
[2] & [3, 5] & 52.17 & 75.0 & 61.54 \\ \hline
[2] & [5] & 50.0 & 75.0 & 60.0 \\ \hline
[4] & [] & 48.0 & 75.0 & 58.54 \\ \hline
[4] & [3, 5] & 50.0 & 68.75 & 57.89 \\ \

#### Doing medium-length (10-seed) cross-validations:
- All indicators, all breakers: (0.7482993197278912, 0.7189542483660131, 0.7333333333333333)
- All indicators, $\overline{\delta_t}$: (0.7417218543046358, 0.7320261437908496, 0.7368421052631579)
- All indicators, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: (0.7449664429530202, 0.7254901960784313, 0.7350993377483444)
- All indicators, no breakers: (0.7290322580645161, 0.738562091503268, 0.7337662337662337)

#### $\mu$
- Just $\mu$, all breakers: (0.865979381443299, 0.5490196078431373, 0.672)
- Just $\mu$, $\overline{\delta_t}$: (0.8269230769230769, 0.5620915032679739, 0.669260700389105)
- Just $\mu$, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: 
- Just $\mu$, no breakers: 

#### Doing short-length (1-seed) cross-validations:
- All indicators, all breakers: (0.6992481203007519, 0.6078431372549019, 0.6503496503496503)
- All indicators, $\overline{\delta_t}$: (0.6162162162162163, 0.7450980392156863, 0.6745562130177516)
- All indicators, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: (0.6879432624113475, 0.6339869281045751, 0.6598639455782312)
- All indicators, no breakers: (0.6629213483146067, 0.7712418300653595, 0.7129909365558912)
#### $\mu$
- Just $\mu$, all breakers: (0.7478260869565218, 0.5620915032679739, 0.6417910447761194)
- Just $\mu$, $\overline{\delta_t}$: (0.7457627118644068, 0.5751633986928104, 0.6494464944649446)
- Just $\mu$, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: (0.7478991596638656, 0.5816993464052288, 0.6544117647058824)
- Just $\mu$, no breakers: (0.7301587301587301, 0.6013071895424836, 0.6594982078853047)
#### $\ell$
- Just $\ell$, all breakers: (0.6935483870967742, 0.28104575163398693, 0.4)
- Just $\ell$, $\overline{\delta_t}$: (0.696969696969697, 0.3006535947712418, 0.42009132420091316)
- Just $\ell$, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: (0.7076923076923077, 0.3006535947712418, 0.42201834862385323)
- Just $\ell$, no breakers: (0.696969696969697, 0.3006535947712418, 0.42009132420091316)
#### $d_\text{max}$
- Just $d_\text{max}$, all breakers: (0.48936170212765956, 0.6013071895424836, 0.5395894428152492)
- Just $d_\text{max}$, $\overline{\delta_t}$: (0.49206349206349204, 0.6078431372549019, 0.543859649122807)
- Just $d_\text{max}$, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: (0.46938775510204084, 0.6013071895424836, 0.5272206303724928)
- Just $d_\text{max}$, no breakers: (0.4567307692307692, 0.6209150326797386, 0.5263157894736842)
#### $\overline{C}$
- Just $\overline{C}$, all breakers: (0.55, 0.5751633986928104, 0.5623003194888179)
- Just $\overline{C}$, $\overline{\delta_t}$: (0.5398773006134969, 0.5751633986928104, 0.5569620253164557)
- Just $\overline{C}$, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: (0.546583850931677, 0.5751633986928104, 0.5605095541401274)
- Just $\overline{C}$, no breakers: (0.5393939393939394, 0.5816993464052288, 0.559748427672956)
#### $\overline{C}$ and $\mu$
- $\overline{C}$ and $\mu$, all breakers: (0.6458333333333334, 0.6078431372549019, 0.6262626262626263)
- $\overline{C}$ and $\mu$, $\overline{\delta_t}$: (0.6506849315068494, 0.6209150326797386, 0.6354515050167224)
- $\overline{C}$ and $\mu$, $\overline{\delta_t - \overline{\delta_t(\text{thread})}}$: (0.5987654320987654, 0.6339869281045751, 0.6158730158730159)
- $\overline{C}$ and $\mu$, no breakers: (0.7230769230769231, 0.6143790849673203, 0.6643109540636042)

# Finding the False Negatives

In [None]:
# import the params and means for the full case
# NOTE(review): the layout of FN.json is not visible in this notebook -- confirm
# that it holds the thresholds and means expected by optimize_classify.
with open('FN.json', 'r') as f:
    FN = json.load(f)
    
# create a customized version of optimize_matrix which returns the user IDs
# of the users which were false negatives
# NOTE(review): this redefinition replaces the F1-returning optimize_matrix
# defined earlier; once this cell has run, optimize_cross and
# log_create_matrix receive a list instead of a score.

def optimize_matrix(thresholds, users, means, indicators = [0,1,2,4], breakers = [3, 5]):
    """Return the IDs of the users in _users_ that are in BOTS but were
    classified as human by optimize_classify (the false negatives)."""
    return [
        user for user in users
        if not optimize_classify(user, thresholds, means, indicators = indicators, breakers = breakers)
        and user in BOTS
    ]

# now go through the 10 folds of cross validation and find the FNs for each, keep track of all of them
# split_users re-seeds with 0, so for the same key order this reproduces the
# folds and blind test set used by optimize_cross
data_split, blind_test = split_users(all_users.keys())
total_FN = []  # accumulates the false-negative user IDs across folds
for i, fold in enumerate(data_split):
    
    
