In [26]:
import datetime, json, re, math, sys
import numpy as np
import numpy #lol
import matplotlib.pyplot as plt
from scipy.stats import linregress as lm
import random as ra
from collections import Counter, defaultdict
from __future__ import division
% matplotlib inline
#import the files
sys.path.append('../API')
import main

# Functions
## Mu analysis

In [47]:
def preprocess(text):
    counts = Counter()
    text = text.encode("utf8")
    words = []
    for word in re.split(" ", text):
        if word not in counts:
            words.append(word)
        counts[word] += 1
        
    return counts

def order(counts):
    words = counts.keys()
    ps = np.array(counts.values())
    ps = ps/float(sum(ps))
    N = len(words)
    return np.random.choice(words, size=N, replace=False, p=ps)

def innovationrate(counts, reps = 2, termmax = 100):
    
    N = len(counts)    
    FN = sum(counts.values())

    ns = range(1,N+1)
    Mn = [0 for n in ns]
    
    for rep in range(reps):
        n = 0
        Fn = 0
        Msum = 0
        for n, word in zip(ns, order(counts)):

            f = counts[word]
            Fn += f

            if n == N:
                break 

            ## compute In and Jn
            In = Fn - (n - 1 + int(Msum))
            Jn = FN - (n - 1 + int(Msum))

            ## compute the average
            ms = np.array(range(1, min([In,termmax])+1))

            logfacts = np.log10(In - ms) - np.log10(Jn - ms)
            prods = 10**np.cumsum(logfacts)
            terms = ms*prods*(Jn - In)/(Jn - ms)
            termsum = sum(terms)
            Mn[n] += termsum/reps

            Msum += termsum

    return 1/(1 + np.array(Mn)), np.array(ns)

def decayExponent(text):
    counts = preprocess(text)
    reps = int(round(0.5 + (5000. / len(counts))))
    termmax = 1000
    alphas, ns = innovationrate(counts, reps, termmax)
    
    ix = range(int(len(ns)*1/3.),len(ns))

    x = np.log10(ns)[ix]
    y = np.log10(alphas)[ix]

    mu, b, r, p, err = lm(x,y)

    return -mu, sum(counts.values())

In [28]:
all_comments = {user : main.userText(user) for user in main.COMMENTS_BY_USER}

### The actual function

In [40]:
def getMu(userID, all_comments):
    comments = all_comments[userID]
    total_text = ""
    for comment in comments:
        total_text = total_text + comment + " "
    if total_text.strip(): #in case there's no text
        mu, numwords = decayExponent(total_text)
        return mu, numwords
    else:
        return 0.0, 0

## Number of links

In [30]:
def getNumLinks(userID, all_comments):
    numlinks = []
    comments = all_comments[userID]
    for comment in comments:
        links = re.findall(r'(https?://[^\s]+)', comment)
        numlink = len(links)
        numlinks.append(numlink)
    avg_numlinks = np.mean(numlinks)
    return avg_numlinks

## User's max daily comments

In [31]:
def maxDailyComments(UID):
    days = Counter()
    for comment in main.COMMENTS_BY_USER[UID]:
        date = comment['time'][0:10]
        days[date] += 1
    return(max(days.values()))

## Average response time

In [37]:
def avg_response(userID, choice = "All"):
    """get the avg response time for _userID"""
    comment_times = main.userResponse(userID, choice)
    if not comment_times:
        return None
    else:
        return np.mean(comment_times)

## Deviation from thread mean

In [33]:
def thread_deviation(userID, choice = "All"):
    """get the average deviation from the thread's mean response time for
    comments made by _userID_"""
    deviations = []
    thread_dict = defaultdict(list)
    threads = main.userThreads(userID, choice)
    comment_times = main.userResponse(userID, choice)
    if comment_times:
        for thread, comment_time in zip(threads, comment_times):
            #need to organize this by thread
            thread_dict[thread].append(comment_time)
        for threadID in thread_dict:
            thread_deviations = []
            thread_times = main.threadResponse(threadID, choice)
            thread_mean = np.mean(thread_times)
            for response in thread_dict[threadID]:
                thread_deviations.append(response - thread_mean)
            avg_thread_deviation = np.mean(thread_deviations)
            #now add to the total deviations
            deviations.append(avg_thread_deviation)
        return np.mean(deviations)
    else:
        return None

## Average comment length

In [34]:
def comment_length(userID, choice = "All"):
    """get the avg length of comments made by _userID_"""
    comments = main.userText(userID, choice)
    lengths = [len(comment) for comment in comments]
    return np.mean(lengths)

# Computation
First we need to get the annotation data

In [35]:
with open('../../data/annotation.json', 'r') as f:
    annotations = json.load(f)
all_users = defaultdict(list)
humans = defaultdict(list)
bots = defaultdict(list)
for bucket in annotations:
    for user in annotations[bucket]:
        all_users[user] = [None] * 7
        if annotations[bucket][user] == '1':
            humans[user] = [None] * 7
        else:
            bots[user] = [None] * 7




## Run the calculation for all users

In [49]:
for user in all_users:
    print user
    all_users[user][0], all_users[user][1] = getMu(user, all_comments)   #both mu values
    all_users[user][2] = getNumLinks(user, all_comments)  #number of links
    all_users[user][3] = maxDailyComments(user)  #max daily comments
    all_users[user][4] = avg_response(user)  #avg response time
    all_users[user][5] = thread_deviation(user)  #mean thread deviation time
    all_users[user][6] = comment_length(user)  #avg comment length
    
print all_users
    

10214241900140457




10155063013063393
1976120809324649
1694030580639235
298828177189543
1474767735924477
10211749463165804
10111566062624601
2037997719757388
10209473535622129
1525053780848283
1682404455133938
10155593996585820
833740173447633
1588813281128993
1632390513440184
10154869062373506
10211760764806426
1240642332712475
10155507561751660
10155249897035831
476041099415737
10213069906053277
1722545331106130
1940184736202893
1216482815127895
1883975365261766
10209865096210650
1715908838437925
474748719561129
10158981607175177
10212483746156029
1564237523621519
1838161149845229
10212166029018556
1768548753158519
746277492211014
732301000288669
1946082565670809
10209678980947844
697693950419073
1620552851351027
318097631976204
10213972997024908
801132993391943
440501436349195
1343766255701560
1322778547834690
10208807343129619
460878510935901
1511437058914916
10209896909435659
10213665528382390
349215358842885
10213139930004073
356732514745013
10211706882925567
1753760787997527
1617465461596820
446610

659079767635750
1763391403952431
1462741520487888
1946523688927079
1890507444523736
1437844916307366
1471516892925599
10209539888476621
482635802077891
1599610760072216
10203437159167623
10213816306861185
10211985051654482
10211396200998785
1122429757857386
1534479159956376
10209804407928720
1424280487667005
538456036545358
10209132522611813
1435376453220055
1344631935650504
789936964517505
10156542139744768
2151757805050969
10212540622177866
10214339879350747
10158930694875024
1927330794175265
10210637434955607
1724715087568747
10208047741226091
10212176803391863
1768505209833197
338468403275477
1380745748627688
10211202752958837
10159130206525424
1691168910923594
1552491964782343
1366418416807642
1874164452904999
1834276033256249
10214098215838204
1404182803006031
10210008561512107
1805029299807710
1581053585302946
10211258326588442
1909799132678395
10159180520335370
1342149895834451
10203564304306154
1497597676970333
1487154911307573
1234787206668129
10212164079890951
36459764394336

1924660727804852
617551565108821
1786297458328793
10155376690264564
1722048414761973
10210049104959484
10211735794969185
1859151817433275
10155573532329594
914545228693577
1320672528048519
868106496677915
1553872691313629
10213606760910246
841858419314840
748000875372024
1623503844340611
332648647175128
10203899679810088
1794763203874811
10203405125124277
10214055509405927
10211355727623727
866787970139559
1440056772740200
10209849565623593
10213973776368212
1908311099493410
1720475771582529
1810010299015065
10105164893947954
10159024477945553
1488491177877509
1480659441972261
10210278326734887
10211893554067482
1645470408811212
10203643755332232
1610699108953942
10154545748491780
502287813449993
1500001276759423
1267810446662594
1976564759245694
10155332244609003
defaultdict(<type 'list'>, {u'10214241900140457': [0.023262084421484544, 28, 0.0, 1, 58.333333333333336, -1552.217651596676, 44.333333333333336], u'10155063013063393': [0.65795451802366167, 20717, 0.010443864229765013, 62, 28

In [50]:
mu = getMu("10155611491949456", all_comments)
test = [mu[0],
        mu[1],
        getNumLinks("10155611491949456", all_comments),
        maxDailyComments("10155611491949456"),
        avg_response("10155611491949456"),
        thread_deviation("10155611491949456"),
        comment_length("10155611491949456")]
print test



[0.78722160673392594, 7150, 0.14437367303609341, 141, 1013.0042462845011, -1989.4409374373254, 95.479830148619953]


In [52]:
with open('ALL_USER_STATS.json', 'w') as f:
    json.dump(all_users, f, indent = 4, sort_keys = True)