In [1]:
import datetime, json, re, math, sys
import numpy as np
import numpy #lol
import matplotlib.pyplot as plt
from scipy.stats import linregress as lm
import random as ra
from collections import Counter, defaultdict
from __future__ import division
% matplotlib inline
#import the files
sys.path.append('../API')
import main

# Functions
## Mu analysis

In [2]:
def preprocess(text):
    """Count how often each space-separated token occurs in ``text``.

    The text is handled as UTF-8 bytes and split on single space characters
    only: consecutive spaces produce empty tokens (which are counted), and
    other whitespace such as tabs or newlines stays inside the tokens.

    Parameters
    ----------
    text : text or byte string
        The comment body. Text is encoded to UTF-8; a byte string is used
        as it is.

    Returns
    -------
    collections.Counter
        Maps every UTF-8 encoded token to its number of occurrences.
    """
    # Only encode real text. Calling .encode() on a byte string makes
    # Python 2 decode it as ASCII first, which raises UnicodeDecodeError as
    # soon as the comment contains a non-ASCII character.
    if not isinstance(text, bytes):
        text = text.encode("utf8")

    # A plain split on one space gives the same tokens as re.split(" ", ...).
    return Counter(text.split(b" "))

def order(counts):
    """Return all words of ``counts`` in a random, frequency-weighted order.

    Words are drawn without replacement from the global numpy random state,
    each with probability proportional to its count, until every word has
    been drawn once.

    Parameters
    ----------
    counts : mapping
        Maps each word to its number of occurrences.

    Returns
    -------
    numpy.ndarray
        A permutation of the words in ``counts``; empty when ``counts`` is
        empty.
    """
    # Materialise keys and values as lists so numpy sees real sequences even
    # when the mapping returns view objects instead of lists.
    words = list(counts.keys())
    if not words:
        # Nothing to draw; np.random.choice may reject an empty population.
        return np.array([])

    freqs = np.array(list(counts.values()), dtype=float)
    probs = freqs / freqs.sum()
    return np.random.choice(words, size=len(words), replace=False, p=probs)

def innovationrate(counts, reps = 2, termmax = 100):
    """Compute a rate value for every vocabulary position 1..N, averaged
    over ``reps`` random orderings of the words.

    Parameters
    ----------
    counts : mapping of word -> number of occurrences. Only ``len()``,
        ``.values()`` and item lookup are used.
    reps : number of random orderings drawn with ``order(counts)``; each
        ordering contributes ``termsum/reps`` to the accumulated ``Mn``.
    termmax : upper bound on the number of series terms summed per step.

    Returns
    -------
    (alphas, ns) : ``alphas`` is ``1/(1 + Mn)`` as a numpy array and ``ns``
        is the array ``1..N`` with ``N = len(counts)``. ``Mn[0]`` is never
        written, so ``alphas[0]`` is always 1.

    NOTE(review): ``Mn[n]`` looks like the expected number of repeated
    tokens seen before the next new word appears -- confirm against the
    model this was derived from.
    """
    
    N = len(counts)    
    FN = sum(counts.values())  # total number of tokens

    ns = range(1,N+1)
    Mn = [0 for n in ns]  # per-position accumulator, averaged over reps
    
    for rep in range(reps):
        n = 0
        Fn = 0    # cumulative frequency of the words drawn so far in this ordering
        Msum = 0  # running total of termsum within this ordering
        for n, word in zip(ns, order(counts)):

            f = counts[word]
            Fn += f

            # The last position is skipped: Mn has no slot with index N.
            if n == N:
                break 

            # In / Jn: cumulative and total frequency, each reduced by the
            # n - 1 earlier positions and the truncated running sum Msum.
            In = Fn - (n - 1 + int(Msum))
            Jn = FN - (n - 1 + int(Msum))

            # Series index m = 1 .. min(In, termmax); empty when In < 1,
            # in which case termsum below is 0.
            ms = np.array(range(1, min([In,termmax])+1))

            # Running product of (In - m)/(Jn - m), built in log space.
            # When In <= termmax the last m equals In, so log10(0) = -inf
            # makes that product 0 (numpy emits a divide-by-zero warning).
            logfacts = np.log10(In - ms) - np.log10(Jn - ms)
            prods = 10**np.cumsum(logfacts)
            terms = ms*prods*(Jn - In)/(Jn - ms)
            termsum = sum(terms)
            # Stored at index n, i.e. alongside ns[n] == n + 1.
            Mn[n] += termsum/reps

            Msum += termsum

    return 1/(1 + np.array(Mn)), np.array(ns)

def decayExponent(text):
    """Fit the decay exponent of the innovation rate of ``text``.

    The text is tokenised with ``preprocess``, the innovation rate is
    computed with ``innovationrate`` and a straight line is fitted to
    log10(rate) against log10(position) over the last two thirds of the
    positions.

    Parameters
    ----------
    text : the comment body.

    Returns
    -------
    (mu, numwords) : ``mu`` is the negated slope of the fitted line and
        ``numwords`` is the total number of tokens. ``mu`` is nan when the
        text has only one distinct token, because a slope cannot be fitted
        through a single point.
    """
    counts = preprocess(text)
    numwords = sum(counts.values())

    # The smaller the vocabulary, the more random orderings are averaged
    # (about 5000 / vocabulary size, and at least one).
    reps = int(round(0.5 + (5000. / len(counts))))
    termmax = 1000
    alphas, ns = innovationrate(counts, reps, termmax)

    # Only the tail is fitted: the first third of the positions is skipped.
    start = int(len(ns)*1/3.)
    x = np.log10(ns)[start:]
    y = np.log10(alphas)[start:]

    # With fewer than two points the regression is degenerate; depending on
    # the SciPy version linregress then yields nan or raises, so return nan
    # explicitly instead.
    if len(x) < 2:
        return float('nan'), numwords

    mu, b, r, p, err = lm(x, y)

    return -mu, numwords

### The actual function

In [3]:
def getMu(text):
    """Return ``(mu, numwords)`` for a comment body.

    Text that is empty or consists only of whitespace yields ``(0.0, 0)``;
    anything else is passed on to ``decayExponent``.
    """
    if not text.strip():
        # No usable text, so there is nothing to fit.
        return 0.0, 0

    exponent, total = decayExponent(text)
    return exponent, total

## Number of links

In [4]:
def getNumLinks(text):
    """Count the ``http://`` / ``https://`` links contained in ``text``.

    A link extends up to the next whitespace character, so two URLs that
    are written without a separator count as a single link.
    """
    url_pattern = re.compile(r'(https?://[^\s]+)')
    return sum(1 for _ in url_pattern.finditer(text))

## Deviation from thread mean

In [5]:
def thread_deviation(comment_dict):
    """Return how far a comment's response time is from its thread's mean.

    The thread id is the part of ``comment_dict['id']`` before the first
    underscore; the thread's response times are obtained from
    ``main.threadResponse``.

    Parameters
    ----------
    comment_dict : dict
        Needs the keys ``'id'`` and ``'response'``.

    Returns
    -------
    ``comment_dict['response']`` minus the mean of the thread's response
    times, or ``None`` when the value cannot be computed (missing key,
    failing lookup, ...).
    """
    try:
        thread_id = comment_dict['id'].split('_')[0]
        thread_times = main.threadResponse(thread_id)
        return comment_dict['response'] - np.mean(thread_times)
    except Exception:
        # Best effort: any ordinary failure yields None. KeyboardInterrupt
        # and SystemExit are deliberately not caught, so a long-running
        # loop over all comments can still be interrupted.
        return None

# Computation
First, we need to load the comment data.

In [8]:
# Feature tables keyed by comment id: one for every comment, plus one each
# for the comments labelled as human and as bot.
all_comms, humans, bots = (defaultdict(list) for _ in range(3))

# Load the labelled training comments.
with open('../../data/TRAIN.json', 'r') as f:
    TRAIN = json.load(f)

# Reserve an empty six-slot feature row per comment, both in the overall
# table and in the table that matches the comment's bot label.
for comment in TRAIN:
    comment_id = comment['id']
    all_comms[comment_id] = [None] * 6
    (bots if comment['bot'] else humans)[comment_id] = [None] * 6

## Run the calculation for all comments

In [None]:
# Fill the six feature slots of every training comment.
i = 0
for comm in TRAIN:
    comm_id, text = comm['id'], comm['message']

    # Progress marker: show the id of every 1000th comment.
    if not i % 1000:
        print(comm_id)
    i += 1

    row = all_comms[comm_id]
    row[0], row[1] = getMu(text)      # mu and number of words
    row[2] = getNumLinks(text)        # number of links
    row[3] = comm['response']         # response time
    row[4] = thread_deviation(comm)   # deviation from the thread's mean response time
    row[5] = len(text)                # comment length

print(all_comms)
    

1252678871491831_1252713878154997




10153844429351680_10153844658751680
1285887041453126_1285916618116835
1290121891029641_1810465029197886
1038834739548085_1040062552758637
1286584094716754_1286587904716373
10153860289606680_10153860448211680
1464677133543260_1464876773523296
1250043525088699_1027187967393997
10153850782621680_10153850955311680
1282229875152176_1282243208484176
1290530250988805_1290649274310236
1291465524228611_1291471324228031
1248672898559095_1789126087978830
1282551825119981_1282657765109387
1290571237651373_1290589834316180
1291673040874526_1291676560874174
1291208607587636_1291230947585402
1290509944324169_1290632867645210
10153849865081680_10153852545926680
1247104542049264_1247156718710713
10153859749961680_10153859758791680
1291455630896267_1291488454226318


In [50]:
# Collect all features for one hard-coded id into a list and print it.
# NOTE(review): this cell does not match the definitions visible above:
# getMu and getNumLinks are defined there with a single `text` parameter but
# are called here with two arguments, thread_deviation is defined to take a
# comment dict but receives a string, and all_comments, maxDailyComments,
# avg_response and comment_length are not defined in the visible part of the
# notebook. Presumably left over from an earlier per-user version (the
# execution count 50 is also out of sequence) -- confirm before relying on it.
mu = getMu("10155611491949456", all_comments)
test = [mu[0],
        mu[1],
        getNumLinks("10155611491949456", all_comments),
        maxDailyComments("10155611491949456"),
        avg_response("10155611491949456"),
        thread_deviation("10155611491949456"),
        comment_length("10155611491949456")]
print test



[0.78722160673392594, 7150, 0.14437367303609341, 141, 1013.0042462845011, -1989.4409374373254, 95.479830148619953]


In [52]:
# Write the collected statistics to ALL_USER_STATS.json as indented JSON with
# sorted keys.
# NOTE(review): `all_users` is not defined in the visible part of the notebook
# (the cells above build `all_comms`) -- confirm which object should be saved.
with open('ALL_USER_STATS.json', 'w') as f:
    json.dump(all_users, f, indent = 4, sort_keys = True)