In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

In [28]:
comments = pd.read_csv('final_dataset_cleaned_fourplus.csv')

In [29]:
comments.drop('Unnamed: 0',axis=1,inplace=True)

I thought it might be useful to create a binary variable for whether a comment was highly upvoted.  This will allow us to use classification methods to predict comment success.  I thought 10 was an appropriate threshold in this case because this is a liminal number of upvotes between buried and salient in subreddits of this size.  It's about right as indicative of comment success. A 20 upvote comment is definitely well liked, while 5 is meh or may be controversial.  This target will also serve to normalize against the hugely disproportionate pull of a few comments that get massive upvotes on an analysis like this.  It reduces things to good comment / bad comment, which scale more accurately reflects the reality of the plus/minus quality of comments than the upvote score likely does.

In [31]:
elite_threshold = 10

comments['elitecomment'] = comments['score'].map(lambda x: 1 if x>=elite_threshold else 0)

In [6]:
comments.head()

Unnamed: 0,body,created_utc,id,parent_id,score,subreddit,word length,elitecomment
0,I was 23 but I went with my ~29 year old cowor...,1545243578,ec4lbi4,t3_a7oy9v,1,AskMen,23,0
1,"Portland, OR. The city itself is now unafforda...",1545243546,ec4la0n,t3_a7mkui,1,AskMen,36,0
2,"nope. ""the cats goodbye"" watch how a cat says ...",1545243536,ec4l9lm,t3_a7fe60,1,AskMen,28,0
3,Drunk as fuck me during an unintended one nigh...,1545243524,ec4l90i,t3_a79zu9,1,AskMen,16,0
4,There was this one time when I went over one o...,1545243449,ec4l5g6,t3_a7kmvc,1,AskMen,192,0


In [7]:
comments.groupby('subreddit')['elitecomment'].mean()

subreddit
AskMen      0.148941
AskWomen    0.157396
Name: elitecomment, dtype: float64

About 15% of comments meet the 10 upvote criterion, and it looks like these are pretty evenly distributed between the two subreddits.  Now to see what n-grams are predictive of upvotes.

### N-Grams and Upvotes by Subreddit

In [50]:
men = comments[comments['subreddit']=='AskMen'].copy()
women = comments[comments['subreddit']=='AskWomen'].copy()

In [51]:
Xm = men['body']
ym = men['elitecomment']
Xw = women['body']
yw = women['elitecomment']

In [52]:
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm,ym,shuffle=True,stratify=ym)
Xw_train, Xw_test, yw_train, yw_test = train_test_split(Xw,yw,shuffle=True,stratify=yw)

In [15]:
#Gridsearch for AskMen words predictive of elite comment status

pipemen = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', LogisticRegression())
     ])

params = {
    'vect__ngram_range':[(1,3)],
    'vect__min_df':[2,5],
    'model__penalty':['l2','l1'],
    'model__C':[0.1, 1, 10],
}

gs_men = GridSearchCV(pipemen, params, cv=3, verbose=2, n_jobs=-1)

In [12]:
gs_men.fit(Xm_train, ym_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  2.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 3)], 'vect__min_df': [2, 5], 'model__penalty': ['l2', 'l1'], 'model__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [34]:
gs_men.best_estimator_.steps

[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=2,
          ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('model',
  LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [13]:
gs_men.score(Xm_train, ym_train)

0.8510574277854195

In [14]:
gs_men.score(Xm_test, ym_test)

0.851063829787234

Below is a coefficient matrix.  Note that the coefficients are significantly smaller than those seen in the Subreddit Classification LogReg (top predictors had coefficients of 8-10 there).  This makes sense when you consider that these coefficients correspond directly to increase in the likelihood of the result, and a comment containing a word like 'wife' should effect the likelihood of classification into AskMen a lot more powerfully than a comment containing a word like 'she' affecting whether a comment has more than 10 upvotes.

In [16]:
coefsm = pd.DataFrame(gs_men.best_estimator_.steps[1][1].coef_)

In [17]:
coefsm.columns=gs_men.best_estimator_.steps[0][1].get_feature_names()

In [21]:
coefsm.T.sort_values(0,ascending=False)

Unnamed: 0,0
she,0.779843
her,0.695038
we,0.423000
me,0.418301
they,0.394249
to,0.387257
this,0.351974
that,0.317279
who,0.302320
woman,0.298675


### AskWomen Upvotes GridSearch

In [24]:
Gridsearch for AskWomen words predictive of elite comment status
pipewomen = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', LogisticRegression())
     ])

params = {
    'vect__ngram_range':[(1,3)],
    'vect__min_df':[2,5],
    'model__penalty':['l2','l1'],
    'model__C':[0.1, 1, 10],
}

gs_women = GridSearchCV(pipewomen, params, cv=3, verbose=2, n_jobs=-1)

In [26]:
gs_women.fit(Xw_train, yw_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  3.7min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 3)], 'vect__min_df': [2, 5], 'model__penalty': ['l2', 'l1'], 'model__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [35]:
gs_women.best_estimator_.steps

[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=2,
          ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('model',
  LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [27]:
gs_women.score(Xw_train, yw_train)

0.8426112803199882

In [28]:
gs_women.score(Xw_test, yw_test)

0.8425803610744166

In [29]:
coefsw = pd.DataFrame(gs_women.best_estimator_.steps[1][1].coef_)
coefsw.columns=gs_women.best_estimator_.steps[0][1].get_feature_names()
coefsw.T.sort_values(0,ascending=False)

Unnamed: 0,0
she,1.055887
he,0.927168
was,0.883008
women,0.881722
her,0.862972
that,0.818250
you,0.705215
men,0.694496
the,0.686267
to,0.630533


In [30]:
coefsm.T.sort_values(0,ascending=False).to_csv('coefsm_upvotes.csv')
coefsw.T.sort_values(0,ascending=False).to_csv('coefsw_upvotes.csv')

Here I tabulated the top associated coefficients for predicting upvoted comments in each subreddit and then calculated the difference to get a sense of which coefficients were particularly different in their direction of influence on upvotes in one sub versus the other.

In [15]:
coefsw.rename(columns={'Unnamed: 0':'gram','0':'wcoef'},inplace=True)

In [16]:
coefsm.rename(columns={'Unnamed: 0':'gram','0':'mcoef'},inplace=True)

In [18]:
coefsmerged = coefsw.merge(coefsm,how='inner',on='gram')

In [19]:
coefsmerged['diff'] = coefsmerged['wcoef'] - coefsmerged['mcoef']

In [58]:
coefsmerged.sort_values('diff').to_csv('coefsmerged.csv')

In [24]:
coefsmerged.sort_values('diff').head(10)

Unnamed: 0,gram,wcoef,mcoef,diff
107067,my,-0.517453,-0.043388,-0.474065
93529,we,-0.010835,0.423,-0.433836
107060,really,-0.232504,0.145319,-0.377823
107061,year,-0.236511,0.084623,-0.321133
107056,work,-0.20314,0.112019,-0.31516
106997,never,-0.121444,0.183579,-0.305023
97833,actually,-0.013538,0.272541,-0.286079
107057,in my,-0.215462,0.066717,-0.28218
106908,best,-0.085222,0.196334,-0.281556
106996,partner,-0.12095,0.157021,-0.277971


In [25]:
coefsmerged.sort_values('diff',ascending=False).head(10)

Unnamed: 0,gram,wcoef,mcoef,diff
1,he,0.927168,-0.020592,0.94776
3,women,0.881722,0.143953,0.737769
2,was,0.883008,0.226954,0.656054
19,him,0.394799,-0.218672,0.613471
6,you,0.705215,0.104216,0.600999
5,that,0.81825,0.317279,0.500971
8,the,0.686267,0.250836,0.435431
44,down,0.251346,-0.182996,0.434342
7,men,0.694496,0.279073,0.415424
17,how,0.431155,0.038642,0.392513


In [22]:
coefsm.tail(10)

Unnamed: 0,gram,mcoef
191886,hair,-0.179463
191887,maybe,-0.181377
191888,down,-0.182996
191889,take,-0.190694
191890,amp,-0.199952
191891,get,-0.202898
191892,if,-0.208593
191893,him,-0.218672
191894,what,-0.239289
191895,much,-0.319288


In [None]:
pd.concat(coefsmerged.sort_values(mcoef))

These are remarkable numbers.  The train and test score are almost a perfect match, and 0.85 is substantially stronger than the numbers seen in my subreddit classification models.  After thinking about what might be allowing this predictive power on what seems like a harder classification problem on the face of it, I realized (post presentation!) that the baseline accuracy could be too high.

In [39]:
comments['elitecomment'].value_counts()

0    57015
1    10339
Name: elitecomment, dtype: int64

In [36]:
57015/(10339+57015)

0.8464976096445646

In [40]:
comments.groupby('subreddit')['elitecomment'].value_counts()

subreddit  elitecomment
AskMen     0               26399
           1                4620
AskWomen   0               30616
           1                5719
Name: elitecomment, dtype: int64

In [41]:
26399/(26399+4620)

0.8510590283374706

In [42]:
30616/(5719+30616)

0.842603550295858

Alas.  The numbers correspond almost exactly to the observed accuracy values from the GridSearch.  Shouldve guessed from the stopwords topping the lists. Great example of how it's easy to run with the wrong data narrative if you're not careful.  If I have some more time, I'm going to re-run the gridsearch with a higher threshold to generate more equal classes and hopefully get more useful numbers.  Undersampling or Bootstrapping while maintaining the 10+ threshold would probably be ideal, but for the sake of time, I'm going to lower the threshold a bit.

In [47]:
elite_threshold = 3

comments['elitecomment'] = comments['score'].map(lambda x: 1 if x>=elite_threshold else 0)

In [48]:
comments.groupby('subreddit')['elitecomment'].value_counts()

subreddit  elitecomment
AskMen     0               19603
           1               11416
AskWomen   0               20898
           1               15437
Name: elitecomment, dtype: int64

In [49]:
pipemen3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', LogisticRegression())
     ])

params = {
    'vect__ngram_range':[(1,3)],
    'vect__min_df':[2,5],
    'model__penalty':['l2','l1'],
    'model__C':[0.1, 1, 10],
}

gs_men3 = GridSearchCV(pipemen3, params, cv=3, verbose=2, n_jobs=-1)

In [53]:
gs_men3.fit(Xm_train, ym_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  2.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 3)], 'vect__min_df': [2, 5], 'model__penalty': ['l2', 'l1'], 'model__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [54]:
gs_men3.score(Xm_train, ym_train)

0.6547025447042641

In [55]:
gs_men3.score(Xm_test, ym_test)

0.6288845905867182

In [56]:
19603/(19603+11406)

0.6321713051049695

Not a lot of evidence that this leads to anything meaningful.  I think the correct approach might be to use balanced classes and leave a gap, so maybe 2 and fewer upvotes against 100 or more upvotes.  Perhaps there'd be more traction there.  Worth trying at a later point.  Getting on plane now.