In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
HOME = '/srv/home/christinedk/wp_internship/'
DATA_DIR = HOME + 'data/'
sys.path.append(HOME + 'collaboration/')

In [4]:
import pandas as pd
import json
import numpy as np
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler

from math import log2
from utils import entropy, load_all

In [12]:
# Page history

In [13]:
features_pos = load_all(HOME+'features/activity_{}.json')
features_neg = load_all(HOME+'negative_features/activity_{}.json')

In [14]:
len(features_pos)

19011

In [15]:
len(features_neg)

15457

In [16]:
article_features_neg = pd.DataFrame([d['article'] for d in features_neg])
article_features_neg['label'] = 0
article_features_pos = pd.DataFrame([d['article'] for d in features_pos])
article_features_pos['label'] = 1

In [17]:
article_data = pd.concat([article_features_neg, article_features_pos])
labels = article_data['label']
feat = article_data.drop('label',axis=1)

In [18]:
len(article_data)

34468

In [24]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
8,frac_recent_revisions,0.350963,0.350963
11,concentration_ratio,0.145774,0.145774
12,contribution_frac_entropy,-0.107417,0.107417
13,recent_edit_size,0.103975,0.103975
0,edit_size,0.102426,0.102426
14,recent_response_time,0.097329,0.097329
9,top_contributor_frac,0.090308,0.090308
1,time_to_respond,0.079483,0.079483
10,frac_anon_revisions,-0.057234,0.057234
6,num_revisions,-0.046866,0.046866


In [25]:
# linear model

In [82]:
hypers = {'solver':'lbfgs','max_iter':10000,'C':10000}

In [83]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  import sys


In [84]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.7557425925016372

In [85]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[2793, 1070],
       [1760, 2994]])

In [86]:
f1_score(y_pred=y_pred, y_true=y_test)

0.6790655477432525

In [31]:
# user-article

In [32]:
def aggregate_features(article_users):
    if len(article_users) == 0:
        return {}
    else:
        article_users = pd.DataFrame(article_users).drop('event_user_id',axis=1)
        means = {'mean_'+key:val for key, val in article_users.mean().items()}
        std = {'std_'+key:val for key, val in article_users.std().items()}
        max_ = {'max_'+key:val for key, val in article_users.max().items()}
        ent = {'frac_page_edits_ent': entropy(article_users.frac_page_edits)}
        features = {**means, **std, **max_, **ent}

        return features

In [33]:
user_article_features_pos = pd.DataFrame([aggregate_features(d['user_article']) 
                                          for d in features_pos])
user_article_features_neg = pd.DataFrame([aggregate_features(d['user_article']) 
                                          for d in features_neg])

In [34]:
user_article_features_pos['label'] = 1
user_article_features_neg['label'] = 0

user_article_data = pd.concat([user_article_features_neg,user_article_features_pos])
labels = user_article_data['label']
feat = user_article_data.drop('label',axis=1)

In [35]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
5,mean_frac_page_edits,0.237757,0.237757
24,frac_page_edits_ent,-0.127706,0.127706
21,max_frac_page_edits,0.100884,0.100884
0,mean_edit_size,0.090631,0.090631
13,std_frac_page_edits,0.089781,0.089781
2,mean_time_to_respond,0.078438,0.078438
3,mean_time_responded_to,0.072992,0.072992
10,std_time_to_respond,0.059014,0.059014
12,std_num_edits,-0.052719,0.052719
20,max_num_edits,-0.050877,0.050877


In [36]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.6536386359081394

In [38]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[2314, 1493],
       [1922, 2888]])

In [39]:
f1_score(y_pred=y_pred, y_true=y_test)

0.6284408660646285

In [None]:
# talk vol

In [40]:
talk_pos = load_all(HOME+'features/talk_{}.json')
talk_neg = load_all(HOME+'negative_features/talk_{}.json')

In [41]:
talk_vol_neg = pd.DataFrame([d['talk_volume'] for d in talk_neg])
talk_vol_neg['label'] = 0
talk_vol_pos = pd.DataFrame([d['talk_volume'] for d in talk_pos])
talk_vol_pos['label'] = 1

In [42]:
talk_vol = pd.concat([talk_vol_neg,talk_vol_pos])
labels = talk_vol['label']
feat = talk_vol.drop('label',axis=1)

In [43]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
5,mean_response_time,0.068302,0.068302
1,frac_recent_revisions,0.035473,0.035473
2,num_editors,-0.026659,0.026659
6,page_talk_ratio,-0.024916,0.024916
0,num_revisions,-0.020181,0.020181
4,mean_edit_size,-0.015382,0.015382
3,top_contributor_frac,0.013786,0.013786


In [44]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  import sys


In [45]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.5676982862110361

In [46]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[ 854, 3003],
       [ 723, 4037]])

In [47]:
f1_score(y_pred=y_pred, y_true=y_test)

0.6842372881355934

In [None]:
# talk lang

In [48]:
def aggregate_language(talk_feat):
    if len(talk_feat) == 0:
        return {}
    else:
        all_utts = pd.DataFrame(list(np.concatenate(list(talk_feat.values()))))
        means = {'mean_'+key:val for key, val in all_utts.mean().items()}
        std = {'std_'+key:val for key, val in all_utts.std().items()}
        max_ = {'max_'+key:val for key, val in all_utts.max().items()}
        features = {**means, **std, **max_}

        return features

In [49]:
talk_lang_neg = pd.DataFrame([aggregate_language(d['talk_language']) for d in talk_neg])
talk_lang_neg['label'] = 0

In [50]:
len(talk_lang_neg)

15457

In [51]:
talk_lang_pos = pd.DataFrame([aggregate_language(d['talk_language']) for d in talk_pos])
talk_lang_pos['label'] = 1

In [52]:
len(talk_lang_pos)

19011

In [53]:
talk_lang = pd.concat([talk_lang_neg,talk_lang_pos])
labels = talk_lang['label']
feat = talk_lang.drop('label',axis=1)

In [54]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
44,max_politeness_markers_==Please==,-0.029585,0.029585
45,max_politeness_markers_==Please_start==,-0.028034,0.028034
55,max_politeness_markers_==1st_person_start==,-0.026753,0.026753
51,max_politeness_markers_==Gratitude==,-0.026207,0.026207
58,max_politeness_markers_==Indirect_(greeting)==,-0.025383,0.025383
56,max_politeness_markers_==2nd_person==,-0.024626,0.024626
54,max_politeness_markers_==1st_person==,-0.024332,0.024332
65,max_reply_depth,-0.023883,0.023883
46,max_politeness_markers_==HASHEDGE==,-0.023032,0.023032
48,max_politeness_markers_==Hedges==,-0.022984,0.022984


In [55]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [56]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.5096086515920547

In [57]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[  82, 3815],
       [  96, 4624]])

In [58]:
f1_score(y_pred=y_pred, y_true=y_test)

0.702788965726879

In [None]:
# combined

In [59]:
labels = article_data['label']

feat = pd.concat([ds.reset_index(drop=True).drop('label',axis=1) 
                  for ds in [article_data,talk_vol,talk_lang,user_article_data]],axis=1) #editor_data,collab_data

In [60]:
len(feat)

34468

In [61]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
8,frac_recent_revisions,0.066059,0.066059
14,recent_response_time,0.060187,0.060187
1,time_to_respond,0.056839,0.056839
90,mean_time_to_respond,0.055441,0.055441
2,article_age_years,0.053727,0.053727
20,mean_response_time,0.050521,0.050521
91,mean_time_responded_to,0.050519,0.050519
98,std_time_to_respond,0.041386,0.041386
106,max_time_to_respond,0.04111,0.04111
16,frac_recent_revisions,-0.040411,0.040411


In [62]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)#,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [63]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.7529580078880209

In [64]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[2761, 1099],
       [1763, 2994]])

In [65]:
f1_score(y_pred=y_pred, y_true=y_test)

0.6766101694915254

In [178]:
# Thoughts:

# article:
# comparing to same pages means some features are less relevant; eg. num_editors, mean edit_size, top_contributor_frac, etc
# recency features should be more important 
#     - calculate all features for recent revisions only as well?


# user-article
# empty for some negatives? sample page': 189559,'date': '2003-02-28 16:09:01'
# revert features are incorrect, need to re-export

In [None]:
# Editor