In [6]:
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.pipeline import Pipeline
from scipy import interp
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook

## Function Definitions

In [7]:
def createmulticlassROC(classes, y_test, y_score):
    '''
    Function to create & plot ROC curve & associated areas
    Adapted from http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

    Inputs: n_classes: the number of classes
            y_test: the test labels
            y_score: the predicted probabilities for each class.   
                (e.g. y_score = classifier.fit(countv_fit_X_train, y_train).predict_proba(countv_fit_X_test) )
    '''
    
    # Compute ROC curve and ROC area for each class
    n_classes = len(classes)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Plot all ROC curves
    plt.figure(figsize = (12,8))

    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                       ''.format(classes[i], roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Predicting Hate Speech')
    plt.legend(loc="lower right")
    plt.show()

In [8]:
# Stemming / Tokenizing
# via http://stackoverflow.com/questions/26126442/combining-text-stemming-and-removal-of-punctuation-in-nltk-and-scikit-learn

stemmer = SnowballStemmer("english")

def stem_tokens(tokens, stemmer):
    stemmed=[]
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


In [9]:
def classify_this(modelchoice, classes, tfidf_X_train, tfidf_X_test, y_train, y_test):
    '''
    Automate running model & seeing output results.
    modelchoice: MultinomialNB(), LinearSVC()
    classes: the list of hateful classes, used in plotting ROC
    tfidf_X_train/test: the tfidf of the train or test set
    y_train/y_test: from test/train split --> BE SURE Y & TFIDFs USED MATCH THEIR TEST/TRAIN SPLITS!!! (e.g. test_size=0.3, random_state=42)
    '''
    classifier =  OneVsRestClassifier(modelchoice, n_jobs=-1)
    y_score = classifier.fit(tfidf_X_train, y_train).predict_proba(tfidf_X_test)
    y_preds = classifier.fit(tfidf_X_train, y_train).predict(tfidf_X_test)
    createmulticlassROC(classes, y_test, y_score)
    print(roc_auc_score(y_test, y_score))
    print(classification_report(y_test, y_preds))

## Load Comments & Labels

In [10]:
X_stripped = pickle.load(open('../Data/X_stripped.p', 'rb')) #Comments stripped of punctuation
y = pickle.load(open('../Data/y.p', 'rb'))
# # X = pickle.load(open('../Data/X.p', 'rb'))

In [None]:
# df = pickle.load(open('../Data/labeledhate_5cats.p', 'rb'))
# X = df.body
# X_stripped = X.apply(lambda x: ''.join([l for l in x if l not in punctuation]))
# y = df.label

In [12]:
# Binarize the output for sklearn
y = label_binarize(y.astype(int), classes=['NotHate', 'SizeHate', 'GenderHate', 'RaceHate', 'ReligionHate'],sparse_output=False)
# class labels
classes=['NotHate', 'SizeHate', 'GenderHate', 'RaceHate', 'ReligionHate']

  mask |= (ar1 == a)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_stripped, y, test_size=0.3, random_state=42)

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ..., 
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

## Load or create TFIDF matrix

In [14]:
# tfidf_X_train = pickle.load(open('tfidf1000_fittrans_X_train.p', 'rb'))
# tfidf_X_test = pickle.load(open('tfidf1000_fittrans_X_train.p', 'rb'))


In [49]:
# tfidfv = TfidfVectorizer(decode_error = 'ignore', stop_words = 'english', max_features=8000, tokenizer=tokenize)
# tfidfv_X_train = tfidfv.fit_transform(X_train)
# tfidfv_X_test = tfidfv.transform(X_test)


In [55]:
y_test.shape

(473426, 5)

(1104659, 5)

## Run Models

In [56]:
classify_this(LinearSVC(), classes, tfidf_X_train, tfidf_X_test, y_train, y_test)

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [61]:
# classify_this(MultinomialNB(), classes, tfidf_X_train, tfidf_X_test, y_train, y_test)
classifier =  OneVsRestClassifier(MultinomialNB(), n_jobs=-1)


In [71]:
print(tfidfv_X_train.shape)
print(y_train.shape)
print(tfidfv_X_test.shape)

(1104659, 8000)
(1104659, 5)
(473426, 8000)


In [70]:
y_train.shape

(1104659, 5)

In [64]:
fitted_clf = classifier.fit(tfidfv_X_train, y_train)

In [72]:
y_score = fitted_clf.predict(tfidf_X_test)

ValueError: dimension mismatch

In [52]:
y_preds = classifier.fit(tfidf_X_train, y_train).predict(tfidf_X_test)

In [None]:
y_score.shape

In [48]:
y_preds.shape

(1104659, 5)

In [53]:
createmulticlassROC(classes, y_test, y_score)
print(roc_auc_score(y_test, y_score))
print(classification_report(y_test, y_preds))

ValueError: Found arrays with inconsistent numbers of samples: [ 473426 1104659]

In [17]:
y_test.shape

(473426, 5)

In [26]:
y_score.shape

(1104659, 5)