In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
HOME = '/srv/home/christinedk/wp_internship/'
DATA_DIR = HOME + 'data/'
sys.path.append(HOME + 'collaboration/')

In [4]:
import pandas as pd
import json
import numpy as np
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler

from math import log2
from utils import entropy

In [5]:
# Page history

In [6]:
template='autobiography'

In [7]:
with open(HOME+'features/activity_{}.json'.format(template),'rb') as f:
    features_pos = json.load(f)
with open(HOME+'negative_features/activity_{}.json'.format(template),'rb') as f:
    features_neg = json.load(f)

In [8]:
len(features_pos)

4224

In [9]:
len(features_neg)

3376

In [10]:
article_features_neg = pd.DataFrame([d['article'] for d in features_neg])
article_features_neg['label'] = 0
article_features_pos = pd.DataFrame([d['article'] for d in features_pos])
article_features_pos['label'] = 1

In [11]:
article_data = pd.concat([article_features_neg, article_features_pos])
labels = article_data['label']
feat = article_data.drop('label',axis=1)

In [12]:
len(article_data)

7600

In [13]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
8,frac_recent_revisions,0.48269,0.48269
11,concentration_ratio,0.172245,0.172245
12,contribution_frac_entropy,-0.165734,0.165734
13,recent_edit_size,0.144688,0.144688
9,top_contributor_frac,0.143939,0.143939
0,edit_size,0.129611,0.129611
10,frac_anon_revisions,-0.077423,0.077423
1,time_to_respond,0.069562,0.069562
6,num_revisions,-0.065043,0.065043
4,frac_minor_edits,-0.061321,0.061321


In [14]:
# linear model

In [15]:
hypers = {'solver':'lbfgs','max_iter':10000,'C':10000}

In [16]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  import sys


In [17]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.8418778077268644

In [18]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[665, 175],
       [300, 760]])

In [19]:
f1_score(y_pred=y_pred, y_true=y_test)

0.7619047619047619

In [20]:
# user-article

In [21]:
def aggregate_features(article_users):
    if len(article_users) == 0:
        return {}
    else:
        article_users = pd.DataFrame(article_users).drop('event_user_id',axis=1)
        means = {'mean_'+key:val for key, val in article_users.mean().items()}
        std = {'std_'+key:val for key, val in article_users.std().items()}
        max_ = {'max_'+key:val for key, val in article_users.max().items()}
        ent = {'frac_page_edits_ent': entropy(article_users.frac_page_edits)}
        features = {**means, **std, **max_, **ent}

        return features

In [22]:
user_article_features_pos = pd.DataFrame([aggregate_features(d['user_article']) 
                                          for d in features_pos])
user_article_features_neg = pd.DataFrame([aggregate_features(d['user_article']) 
                                          for d in features_neg])

In [23]:
user_article_features_pos['label'] = 1
user_article_features_neg['label'] = 0

user_article_data = pd.concat([user_article_features_neg,user_article_features_pos])
labels = user_article_data['label']
feat = user_article_data.drop('label',axis=1)

In [24]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
5,mean_frac_page_edits,0.32086,0.32086
24,frac_page_edits_ent,-0.195256,0.195256
21,max_frac_page_edits,0.160401,0.160401
0,mean_edit_size,0.12747,0.12747
13,std_frac_page_edits,0.1231,0.1231
8,std_edit_size,0.081161,0.081161
15,std_revision_is_identity_revert,-0.064561,0.064561
2,mean_time_to_respond,0.062493,0.062493
3,mean_time_responded_to,0.062403,0.062403
20,max_num_edits,-0.053866,0.053866


In [25]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.7240801569877362

In [27]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[571, 278],
       [364, 687]])

In [28]:
f1_score(y_pred=y_pred, y_true=y_test)

0.681547619047619

In [29]:
# talk vol

In [30]:
with open(HOME+'features/talk_{}.json'.format(template),'rb') as f:
    talk_pos = json.load(f)
with open(HOME+'negative_features/talk_{}.json'.format(template),'rb') as f:
    talk_neg = json.load(f)

In [31]:
talk_vol_neg = pd.DataFrame([d['talk_volume'] for d in talk_neg])
talk_vol_neg['label'] = 0
talk_vol_pos = pd.DataFrame([d['talk_volume'] for d in talk_pos])
talk_vol_pos['label'] = 1

In [32]:
talk_vol = pd.concat([talk_vol_neg,talk_vol_pos])
labels = talk_vol['label']
feat = talk_vol.drop('label',axis=1)

In [33]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
5,mean_response_time,0.081944,0.081944
1,frac_recent_revisions,0.039484,0.039484
6,page_talk_ratio,-0.028584,0.028584
0,num_revisions,-0.025806,0.025806
2,num_editors,-0.014642,0.014642
4,mean_edit_size,-0.003368,0.003368
3,top_contributor_frac,0.003081,0.003081


In [34]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  import sys


In [35]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.5950850403975404

In [36]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[215, 649],
       [150, 886]])

In [37]:
f1_score(y_pred=y_pred, y_true=y_test)

0.6892259821081291

In [38]:
# talk lang

In [39]:
with open(HOME+'features/talk_{}.json'.format(template),'rb') as f:
    talk_pos = json.load(f)
with open(HOME+'negative_features/talk_{}.json'.format(template),'rb') as f:
    talk_neg = json.load(f)

In [40]:
def aggregate_language(talk_feat):
    if len(talk_feat) == 0:
        return {}
    else:
        all_utts = pd.DataFrame(list(np.concatenate(list(talk_feat.values()))))
        means = {'mean_'+key:val for key, val in all_utts.mean().items()}
        std = {'std_'+key:val for key, val in all_utts.std().items()}
        max_ = {'max_'+key:val for key, val in all_utts.max().items()}
        features = {**means, **std, **max_}

        return features

In [41]:
talk_lang_neg = pd.DataFrame([aggregate_language(d['talk_language']) for d in talk_neg])
talk_lang_neg['label'] = 0

In [42]:
len(talk_lang_neg)

3376

In [43]:
talk_lang_pos = pd.DataFrame([aggregate_language(d['talk_language']) for d in talk_pos])
talk_lang_pos['label'] = 1

In [44]:
len(talk_lang_pos)

4224

In [45]:
talk_lang = pd.concat([talk_lang_neg,talk_lang_pos])
labels = talk_lang['label']
feat = talk_lang.drop('label',axis=1)

In [46]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
29,std_politeness_markers_==Gratitude==,-0.031645,0.031645
45,max_politeness_markers_==Please_start==,-0.03131,0.03131
14,mean_politeness_markers_==Indirect_(greeting)==,-0.028187,0.028187
51,max_politeness_markers_==Gratitude==,-0.028184,0.028184
44,max_politeness_markers_==Please==,-0.027673,0.027673
12,mean_politeness_markers_==2nd_person==,-0.027588,0.027588
58,max_politeness_markers_==Indirect_(greeting)==,-0.026661,0.026661
1,mean_politeness_markers_==Please_start==,-0.026454,0.026454
23,std_politeness_markers_==Please_start==,-0.025945,0.025945
55,max_politeness_markers_==1st_person_start==,-0.025695,0.025695


In [47]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)
#C=0.1,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [48]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.4714312745604876

In [49]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[ 61, 768],
       [ 94, 977]])

In [50]:
f1_score(y_pred=y_pred, y_true=y_test)

0.6938920454545454

In [170]:
# combined

In [51]:
labels = article_data['label']

feat = pd.concat([ds.reset_index(drop=True).drop('label',axis=1) 
                  for ds in [article_data,talk_vol,talk_lang,user_article_data]],axis=1) #editor_data,collab_data

In [52]:
len(feat)

7600

In [53]:
correlations = feat.corrwith(labels).reset_index().rename(columns={'index':'feature',0:'score'})
correlations['abs'] = correlations.score.abs()
correlations.sort_values(by='abs', ascending=False)[:10]

Unnamed: 0,feature,score,abs
8,frac_recent_revisions,0.096268,0.096268
2,article_age_years,0.082205,0.082205
1,time_to_respond,0.076882,0.076882
90,mean_time_to_respond,0.072649,0.072649
91,mean_time_responded_to,0.066997,0.066997
20,mean_response_time,0.062515,0.062515
17,num_editors,0.058598,0.058598
106,max_time_to_respond,0.058216,0.058216
18,top_contributor_frac,-0.056674,0.056674
98,std_time_to_respond,0.056469,0.056469


In [54]:
scaler = StandardScaler()
model = LogisticRegression(**hypers)#,class_weight={0:1,1:5}

X_train, X_test, y_train, y_test = train_test_split(feat.fillna(0),labels)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [55]:
model.fit(X=X_train,y=y_train)
predictions=model.predict_proba(X_test)
y_pred = np.argmax(predictions,axis=1)
roc_auc_score(y_score=predictions[:,1],y_true=y_test)

0.8198512969588552

In [56]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[672, 188],
       [330, 710]])

In [57]:
f1_score(y_pred=y_pred, y_true=y_test)

0.7327141382868938

In [178]:
# Thoughts:

# article:
# comparing to same pages means some features are less relevant; eg. num_editors, mean edit_size, top_contributor_frac, etc
# recency features should be more important 
#     - calculate all features for recent revisions only as well?


# user-article
# empty for some negatives? sample page': 189559,'date': '2003-02-28 16:09:01'
# revert features are incorrect, need to re-export

In [None]:
# Editor