In [1]:
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

import numpy as np
import pandas as pd

In [2]:
# Load the labelled comments DataFrame from disk.
# The context manager closes the file handle as soon as unpickling finishes
# (the bare open() call previously left it to the garbage collector).
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('../Data/labeledhate_5cats.p', 'rb') as labeled_file:
    df = pickle.load(labeled_file)

In [4]:
# Preview the first rows to check the columns (subreddit, id, name, body, label).
df.head()

Unnamed: 0,subreddit,id,name,body,label
0,CoonTown,cqug92k,t1_cqug92k,&gt;maybe jews\n\nnot maybe,RaceHate
1,CoonTown,cqug9f5,t1_cqug9f5,juh-juh-juh-juh-juh-juh-just cant even,RaceHate
2,CoonTown,cqug9wy,t1_cqug9wy,I like the idea...have an upvote!,RaceHate
3,CoonTown,cquga8b,t1_cquga8b,Never underestimate the stupidity of niggers. ...,RaceHate
4,CoonTown,cquga92,t1_cquga92,Someone has deeper internal issues they have w...,RaceHate


## Test-train split

In [3]:
# Features: the raw comment text.
X = df['body']

In [4]:
# Target: the hate-category label attached to each comment.
y = df['label']

In [5]:
# Hold out 30% of the comments for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Spot-check a few training comments (index values are the original row labels from df).
X_train.head()

848185            He's like Poppins from It's always Sunny.
917229    As a Londoner thats currently in Cambridge, I ...
406707    Ah yes I always forget about aviation. Well th...
667836                                            [deleted]
979603    Is the Nepal earthquake still a thing? I thoug...
Name: body, dtype: object

In [39]:
# Number of held-out comments in the test split.
X_test.shape

(473426,)

## Start with sk-learn models

### Tfidf Vectorizer

In [77]:
# TF-IDF features: English stop words removed, undecodable bytes skipped.
tfidf_v = TfidfVectorizer(
    decode_error='ignore',
    stop_words='english',
)
# Learn the vocabulary and IDF weights from the training comments only.
tfidf_fit = tfidf_v.fit_transform(X_train)
# Map the test comments onto the already-fitted vocabulary (no re-fitting).
tfidf_fit_test = tfidf_v.transform(X_test)

### Count Vectorizer (TF)

In [6]:
# Raw term-count features: English stop words removed, undecodable bytes skipped.
countv = CountVectorizer(
    stop_words='english',
    decode_error='ignore',
)
# Training matrix: the vocabulary is learned from the training comments only.
countv_fit = countv.fit_transform(X_train)
# Test matrix: reuse the fitted vocabulary.
countv_fit_test = countv.transform(X_test)

### Hashing Vectorizer

In [37]:
# Hashed term features. HashingVectorizer is stateless (nothing is learned from the
# data), so transform() on its own yields the same matrix fit_transform() would.
hashvect = HashingVectorizer(
    decode_error='ignore',
    stop_words='english',
    non_negative=True,  # keep feature values >= 0 for MultinomialNB
)
# Training matrix
hashvectfit = hashvect.transform(X_train)
# Test matrix
hashvectfit_test = hashvect.transform(X_test)

In [69]:
# Inspect the training labels (pandas abbreviates the display of a long Series).
y_train

848185        NotHate
917229        NotHate
406707       SizeHate
667836        NotHate
979603        NotHate
528009       SizeHate
1315021       NotHate
870102        NotHate
1169077       NotHate
702346        NotHate
1389627       NotHate
873782        NotHate
587043        NotHate
271605       SizeHate
1373693       NotHate
612479        NotHate
1453210       NotHate
359292       SizeHate
1524312       NotHate
944535        NotHate
826690        NotHate
399093       SizeHate
1560370       NotHate
1478906       NotHate
1295130       NotHate
666487        NotHate
1395165       NotHate
519938       SizeHate
821413        NotHate
53162        RaceHate
              ...    
1470485       NotHate
1396025       NotHate
184779     GenderHate
1262752       NotHate
1284372       NotHate
103355     GenderHate
791743        NotHate
1247617       NotHate
327069       SizeHate
1370455       NotHate
787201        NotHate
1113396       NotHate
329365       SizeHate
41090        RaceHate
278167    

## Predict 
### MultinomialNB

### Using hashing vect

In [8]:
# Multinomial naive Bayes classifier with scikit-learn's default hyperparameters.
MNBvect = MultinomialNB()

In [40]:
# Fit a dedicated estimator on the hashed features. fit() returns the estimator
# itself, so fitting a shared instance would leave this name pointing at whatever
# data that instance was fitted on most recently.
model_hash = MultinomialNB().fit(hashvectfit, y_train)

In [63]:
# Hard class predictions for the hashed test matrix.
hashpreds = model_hash.predict(hashvectfit_test)
# Per-class probabilities for the same rows (one column per class).
hashpreds_prob = model_hash.predict_proba(hashvectfit_test)

In [72]:
# Accuracy of the hashed-feature model on the held-out test split.
model_hash.score(hashvectfit_test, y_test)

0.67711954983460987

In [71]:
# Class labels known to the fitted model; the predict_proba columns follow this order.
model_hash.classes_

array(['GenderHate', 'NotHate', 'RaceHate', 'ReligionHate', 'SexOrHate',
       'SizeHate'], 
      dtype='|S12')

In [66]:
# First ten predicted labels.
hashpreds[0:10]

array(['NotHate', 'NotHate', 'NotHate', 'NotHate', 'NotHate', 'NotHate',
       'SizeHate', 'NotHate', 'NotHate', 'NotHate'], 
      dtype='|S12')

In [67]:
# Class probabilities behind the first ten predictions (rows = comments, columns = classes).
hashpreds_prob[0:10]

array([[  2.77617153e-03,   8.73342290e-01,   1.13513539e-04,
          2.97341162e-07,   5.05560194e-08,   1.23767677e-01],
       [  7.48087278e-03,   9.82061704e-01,   2.71540897e-04,
          4.51370407e-07,   8.11350104e-09,   1.01854231e-02],
       [  5.45005943e-02,   8.75485219e-01,   5.11751599e-03,
          1.12807333e-05,   1.61231716e-06,   6.48837779e-02],
       [  1.13457222e-01,   8.36631682e-01,   1.57993658e-03,
          8.10953381e-06,   8.25386048e-07,   4.83222241e-02],
       [  1.91755279e-04,   9.99684783e-01,   3.19022274e-07,
          1.95804228e-13,   7.15720426e-16,   1.23142993e-04],
       [  3.47313144e-08,   9.99999964e-01,   4.51064088e-12,
          9.77546176e-19,   2.63860503e-22,   8.87175255e-10],
       [  1.17650019e-02,   4.87457864e-01,   1.15998220e-03,
          6.45221784e-06,   1.49488584e-06,   4.99609205e-01],
       [  9.30854478e-02,   8.86010091e-01,   4.60622851e-03,
          1.86592029e-04,   1.01022427e-04,   1.60106184e-02],


### Using count vectorizer

In [9]:
# Fit a dedicated estimator on the count features. fit() returns the estimator
# itself, so fitting a shared instance would make this name an alias of every
# other model fitted from that instance.
model_cv = MultinomialNB().fit(countv_fit, y_train)

In [10]:
# Hard class predictions for the count-vectorised test matrix.
cvpreds = model_cv.predict(countv_fit_test)
# Per-class probabilities for the same rows (one column per class).
cvpreds_prob = model_cv.predict_proba(countv_fit_test)

In [11]:
# Accuracy of the count-feature model on the held-out test split.
model_cv.score(countv_fit_test, y_test)

0.75784811142607289

### Using tf-idf (though MultinomialNB may not work as well with fractional counts)

In [80]:
# Fit a dedicated estimator on the TF-IDF features. fit() returns the estimator
# itself, so fitting a shared instance would make this name an alias of every
# other model fitted from that instance.
model_tfidf = MultinomialNB().fit(tfidf_fit, y_train)

In [81]:
# Hard class predictions for the TF-IDF test matrix.
tfidfpred = model_tfidf.predict(tfidf_fit_test)
# Per-class probabilities for the same rows (one column per class).
tfidfpred_prob = model_tfidf.predict_proba(tfidf_fit_test)

In [82]:
# Accuracy of the TF-IDF model on the held-out test split.
model_tfidf.score(tfidf_fit_test, y_test)

0.73381901289747498

## Try count vectorizer with random forest classifier

In [7]:
# Random forest using all CPU cores. The seed makes the bootstrap samples and
# feature sub-sampling repeatable, matching the seeded train/test split.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Train the forest on the count features. fit() returns the fitted estimator, which
# is what score() and feature_importances_ need; fit_transform() would instead
# return a transformed feature matrix.
modelcv_rfc = rfc.fit(countv_fit, y_train)

In [None]:
# Test-set accuracy; requires modelcv_rfc to be a fitted estimator exposing score().
modelcv_rfc.score(countv_fit_test, y_test)

In [None]:
# Per-feature importance weights; requires modelcv_rfc to be a fitted forest.
modelcv_rfc.feature_importances_

In [None]:
### To create ROC curve:
# For each class, pair its column of predicted probabilities with a matching vector of 1s & 0s (1 = the sample truly belongs to that class) --> build up one ROC curve per class.
# Ming thinks it's fine to rely on the accuracy score in this case, since it's multiclass.

In [None]:
# One-vs-rest ROC analysis. roc_curve / roc_auc_score need binary ground truth, so
# each class in turn is treated as the positive class: its predict_proba column is
# scored against a 1/0 indicator of true membership in that class.
# Uses the count-vectoriser model's test-set probabilities.
class_roc = {}   # class name -> (fpr, tpr, thresholds)
class_auc = {}   # class name -> area under that class's ROC curve
for class_col, class_name in enumerate(model_cv.classes_):
    # 1 where the test comment truly belongs to class_name, else 0
    y_true = (y_test == class_name).astype(int)
    # predicted probability of class_name (columns follow classes_ order)
    y_scores = cvpreds_prob[:, class_col]
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    class_roc[class_name] = (fpr, tpr, thresholds)
    class_auc[class_name] = roc_auc_score(y_true, y_scores)

# Show the AUC of every class as the cell output.
pd.Series(class_auc)