In [1]:
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

import numpy as np
import pandas as pd

In [4]:
# Load the labeled comments frame from disk.
# NOTE: pickle can execute arbitrary code while loading; only unpickle files from a trusted source.
# The context manager closes the file handle as soon as the frame has been read.
with open('../Data/labeledhate_5cats.p', 'rb') as labeled_file:
    df = pickle.load(labeled_file)

In [3]:
# Preview the first rows to check the columns of the loaded frame.
df.head()

Unnamed: 0,subreddit,id,name,body,label
0,CoonTown,cqug92k,t1_cqug92k,&gt;maybe jews\n\nnot maybe,RaceHate
1,CoonTown,cqug9f5,t1_cqug9f5,juh-juh-juh-juh-juh-juh-just cant even,RaceHate
2,CoonTown,cqug9wy,t1_cqug9wy,I like the idea...have an upvote!,RaceHate
3,CoonTown,cquga8b,t1_cquga8b,Never underestimate the stupidity of niggers. ...,RaceHate
4,CoonTown,cquga92,t1_cquga92,Someone has deeper internal issues they have w...,RaceHate


## Test-train split

In [5]:
# Model input: the raw comment text column.
X = df['body']

In [6]:
# Target: the label column of the frame.
y = df['label']

In [7]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Peek at the first training texts (the index shows the rows were shuffled by the split).
X_train.head()

848185            He's like Poppins from It's always Sunny.
917229    As a Londoner thats currently in Cambridge, I ...
406707    Ah yes I always forget about aviation. Well th...
667836                                            [deleted]
979603    Is the Nepal earthquake still a thing? I thoug...
Name: body, dtype: object

In [8]:
# Number of held-out test samples.
X_test.shape

(473426,)

## Start with sk-learn models

### Tfidf Vectorizer

In [6]:
# TF-IDF features with English stop words removed; undecodable bytes are ignored.
# The vectorizer is fitted on the training text only.
tfidf_v = TfidfVectorizer(
    decode_error='ignore',
    stop_words='english',
)
tfidf_fit = tfidf_v.fit_transform(X_train)
# Test text is transformed with the already-fitted vectorizer (no refit).
tfidf_fit_test = tfidf_v.transform(X_test)

In [7]:
# Length of the learned IDF vector, i.e. one weight per vocabulary term.
tfidf_v.idf_.shape

(289667,)

### Count Vectorizer (TF)

In [8]:
# Raw term counts (unigrams) with English stop words removed; undecodable bytes are ignored.
countv = CountVectorizer(
    stop_words='english',
    decode_error='ignore',
)
# Training matrix: fits the vocabulary on the training text and counts terms.
countv_fit = countv.fit_transform(X_train)
# Test matrix: counted against the vocabulary fitted above.
countv_fit_test = countv.transform(X_test)

In [10]:
# Size of the unigram vocabulary learned from the training text.
len(countv.vocabulary_)

289667

In [11]:
# Same count features, but over unigrams and bigrams (ngram_range=(1, 2)).
countv2 = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    decode_error='ignore',
)
# Training matrix: fits the 1-2 gram vocabulary on the training text.
countv2_fit = countv2.fit_transform(X_train)
# Test matrix: counted against the vocabulary fitted above.
countv2_fit_test = countv2.transform(X_test)

In [12]:
# Size of the unigram + bigram vocabulary learned from the training text.
len(countv2.vocabulary_)

8295421

In [17]:
# Inspect the term -> column-index mapping of the 1-2 gram vectorizer.
# NOTE(review): this displays the entire mapping, which is very large for this
# vocabulary; consider showing only a small sample before sharing the notebook.
countv2.vocabulary_

{u'tripolitan': 7578374,
 u'terrorists guardian': 7331194,
 u'near birth': 4973934,
 u'louder claiming': 4499130,
 u'10x market': 24119,
 u'supposed slimming': 7168258,
 u'woopdiduuu': 8132981,
 u'bloqueos': 949159,
 u'hospitals connecticut': 3595827,
 u'individualism ability': 3795152,
 u'administrators focus': 319167,
 u'unethical makeup': 7698094,
 u'vs kgb': 7899327,
 u'want spineless': 7932236,
 u'boil high': 968080,
 u'threats crime': 7426370,
 u'conservatives believe': 1624518,
 u'wipe fdic': 8094679,
 u'poor surrounding': 5607389,
 u'substances homeopathy': 7109191,
 u'requirement calculated': 6207135,
 u'unknown ask': 7721578,
 u'af won': 342652,
 u'consider does': 1626799,
 u'58 waist': 163339,
 u'map guide': 4604533,
 u'pathetic talking': 5371508,
 u'reddit bribing': 6074279,
 u'vet real': 7838771,
 u'best stabbing': 865822,
 u'multiple underage': 4913250,
 u'vet read': 7838770,
 u'pressure sa': 5715206,
 u'amendment proves': 443840,
 u'assertion young': 627012,
 u'wisdom th

### Hashing Vectorizer

In [11]:
# Hashed term features with English stop words removed; undecodable bytes are ignored.
# non_negative=True is requested so the features can be fed to MultinomialNB below.
hashvect = HashingVectorizer(
    non_negative=True,
    decode_error='ignore',
    stop_words='english',
)
# Training matrix
hashvectfit = hashvect.fit_transform(X_train)
# Test matrix
hashvectfit_test = hashvect.transform(X_test)

In [12]:
# Look at the training labels that pair with X_train.
y_train

848185        NotHate
917229        NotHate
406707       SizeHate
667836        NotHate
979603        NotHate
528009       SizeHate
1315021       NotHate
870102        NotHate
1169077       NotHate
702346        NotHate
1389627       NotHate
873782        NotHate
587043        NotHate
271605       SizeHate
1373693       NotHate
612479        NotHate
1453210       NotHate
359292       SizeHate
1524312       NotHate
944535        NotHate
826690        NotHate
399093       SizeHate
1560370       NotHate
1478906       NotHate
1295130       NotHate
666487        NotHate
1395165       NotHate
519938       SizeHate
821413        NotHate
53162        RaceHate
              ...    
1470485       NotHate
1396025       NotHate
184779     GenderHate
1262752       NotHate
1284372       NotHate
103355     GenderHate
791743        NotHate
1247617       NotHate
327069       SizeHate
1370455       NotHate
787201        NotHate
1113396       NotHate
329365       SizeHate
41090        RaceHate
278167    

## Predict 
### MultinomialNB

### Using hashing vect

In [9]:
# Multinomial naive Bayes classifier with default settings.
# NOTE: estimator.fit() returns the estimator itself, so refitting this object
# replaces whatever it learned from a previous fit.
MNBvect = MultinomialNB()

In [14]:
# Fit a dedicated naive Bayes model on the hashed features.
# A fresh estimator is used because fit() returns the estimator itself: fitting a
# shared instance would make every model_* name point at one object that gets
# overwritten by each later fit.
model_hash = MultinomialNB().fit(hashvectfit, y_train)

In [15]:
# Hard class predictions and per-class probabilities for the hashed test features.
hashpreds = model_hash.predict(hashvectfit_test)
hashpreds_prob = model_hash.predict_proba(hashvectfit_test)

In [16]:
# Mean accuracy on the held-out set using the hashed features.
model_hash.score(hashvectfit_test, y_test)

0.67721460164840963

In [17]:
# Class labels learned by the model; the predict_proba columns follow this order.
model_hash.classes_

array(['GenderHate', 'NotHate', 'RaceHate', 'ReligionHate', 'SizeHate'], 
      dtype='|S12')

In [18]:
# First ten predicted labels.
hashpreds[0:10]

array(['NotHate', 'NotHate', 'NotHate', 'NotHate', 'NotHate', 'NotHate',
       'SizeHate', 'NotHate', 'NotHate', 'NotHate'], 
      dtype='|S12')

In [19]:
# Per-class probabilities for the same first ten test samples.
hashpreds_prob[0:10]

array([[  2.99288617e-03,   8.73152541e-01,   1.13488876e-04,
          2.97276559e-07,   1.23740787e-01],
       [  7.70988275e-03,   9.81835115e-01,   2.71478245e-04,
          4.51266264e-07,   1.01830730e-02],
       [  5.61382022e-02,   8.73970366e-01,   5.10866115e-03,
          1.12612142e-05,   6.47715094e-02],
       [  1.14680387e-01,   8.35478158e-01,   1.57775820e-03,
          8.09835261e-06,   4.82555987e-02],
       [  2.02965241e-04,   9.99673574e-01,   3.19018698e-07,
          1.95802033e-13,   1.23141612e-04],
       [  3.86844256e-08,   9.99999960e-01,   4.51064086e-12,
          9.77546172e-19,   8.87175252e-10],
       [  1.24465809e-02,   4.87122405e-01,   1.15918392e-03,
          6.44777754e-06,   4.99265383e-01],
       [  9.48970008e-02,   8.84338802e-01,   4.59753974e-03,
          1.86240059e-04,   1.59804174e-02],
       [  1.87629380e-01,   6.22900454e-01,   3.83027501e-03,
          3.59847949e-05,   1.85603906e-01],
       [  2.23498749e-02,   9.4062938

### Using count vectorizer

In [10]:
# Fit a dedicated naive Bayes model on the unigram count features.
# A fresh estimator is used because fit() returns the estimator itself: refitting a
# shared instance would silently overwrite the models fitted in other cells.
model_cv = MultinomialNB().fit(countv_fit, y_train)

In [11]:
# Hard class predictions and per-class probabilities for the count-vectorized test set.
cvpreds = model_cv.predict(countv_fit_test)
cvpreds_prob = model_cv.predict_proba(countv_fit_test)

In [12]:
# Mean accuracy on the held-out set using the unigram count features.
model_cv.score(countv_fit_test, y_test)

0.75784811142607289

In [14]:
# Per-class precision / recall / F1; print() is needed because the report is a formatted string.
print(classification_report(y_test, cvpreds))

             precision    recall  f1-score   support

 GenderHate       0.62      0.53      0.57     56937
    NotHate       0.82      0.85      0.84    304440
   RaceHate       0.70      0.33      0.45     15963
ReligionHate       0.06      0.00      0.01      1717
   SizeHate       0.63      0.68      0.65     94369

avg / total       0.75      0.76      0.75    473426



In [16]:
# Fit a dedicated naive Bayes model on the unigram + bigram count features and
# report its mean accuracy on the held-out set.
# A fresh estimator is used because fit() returns the estimator itself: refitting a
# shared instance would silently overwrite the models fitted in other cells.
model_cv2 = MultinomialNB().fit(countv2_fit, y_train)
model_cv2.score(countv2_fit_test, y_test)

0.75456776771871426

### Using tf-idf (though MultinomialNB may not work as well with fractional counts)

In [23]:
# Fit a dedicated naive Bayes model on the TF-IDF features.
# A fresh estimator is used because fit() returns the estimator itself: refitting a
# shared instance would silently overwrite the models fitted in other cells.
model_tfidf = MultinomialNB().fit(tfidf_fit, y_train)

In [24]:
# Hard class predictions and per-class probabilities for the TF-IDF test features.
tfidfpred = model_tfidf.predict(tfidf_fit_test)
tfidfpred_prob = model_tfidf.predict_proba(tfidf_fit_test)

In [25]:
# Mean accuracy on the held-out set using the TF-IDF features.
model_tfidf.score(tfidf_fit_test, y_test)

0.73415275037703887

In [None]:
# Dimensions of the TF-IDF training matrix (documents x vocabulary terms).
tfidf_fit.shape

## Try count vectorizer with random forest classifier

In [26]:
# Random forest with default hyper-parameters, trained on all available cores (n_jobs=-1).
# The forest is randomized, so it is seeded (same seed as the train/test split)
# to make reruns of the notebook give the same model.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Fit the forest on the unigram count features.
# fit() returns the fitted estimator, which the following cells need for .score()
# and .feature_importances_; fit_transform() would instead hand back a transformed
# feature matrix, on which those calls fail.
modelcv_rfc = rfc.fit(countv_fit, y_train)

In [None]:
# Mean accuracy of the random forest on the held-out count features.
# NOTE(review): requires modelcv_rfc to be a fitted estimator.
modelcv_rfc.score(countv_fit_test, y_test)

In [None]:
# Per-feature importances from the fitted forest (one value per vocabulary column).
# NOTE(review): requires modelcv_rfc to be a fitted estimator.
modelcv_rfc.feature_importances_

In [None]:
### To create ROC curves:
# Use one column of prediction probabilities together with a matching vector of 1s & 0s (1 = sample belongs to that class) --> build up a ROC curve for each class.
# Ming thinks it's fine to rely on the accuracy score in this case, since it's multiclass.

In [None]:
# One-vs-rest ROC for the count-vectorizer naive Bayes model.
# For each class, the true labels are binarized (1 = sample belongs to the class)
# and scored with that class's probability column; predict_proba columns follow
# the order of model_cv.classes_.
roc_curves = {}  # class label -> (fpr, tpr, thresholds)
roc_aucs = {}    # class label -> area under that class's ROC curve
for class_col, class_name in enumerate(model_cv.classes_):
    is_class = (y_test == class_name).astype(int)
    class_scores = cvpreds_prob[:, class_col]
    roc_curves[class_name] = roc_curve(is_class, class_scores)
    roc_aucs[class_name] = roc_auc_score(is_class, class_scores)
roc_aucs