In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns

import nltk
from nltk.tokenize import (sent_tokenize, TreebankWordTokenizer, WhitespaceTokenizer)
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from nltk.corpus import stopwords

from collections import Counter
from operator import itemgetter

from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import re

# from sqlalchemy import create_engine

from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix, precision_score, 
                             recall_score, f1_score, roc_curve, roc_auc_score, average_precision_score, 
                             precision_recall_curve, auc)

from sklearn.utils import resample
from sklearn.cross_validation import train_test_split 
from sklearn.model_selection import KFold  
from sklearn.preprocessing import StandardScaler 

from sklearn.base import TransformerMixin

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

from sklearn.utils import shuffle

from xgboost import XGBClassifier

import datetime
import warnings
import os

warnings.filterwarnings("ignore")

In [None]:
# helper functions
def sentence_tokenizer(text):
    sentences = sent_tokenize(text)
    return sentences

def whitespace_tokenizer(sentences):
    listy = []
    tokenizer = WhitespaceTokenizer()
    for i in list(range(0,len(sentences))):
        tokenized = tokenizer.tokenize(sentences[i])
        listy.append(tokenized)
    return listy

def polarity(sentences):
    listy = []
    for i in list(range(0,len(sentences))):
        pol = TextBlob(sentences[i]).polarity
        listy.append(pol)
    return np.min(listy), np.max(listy), np.mean(listy),listy

def stemmer(text):
    stemmer = PorterStemmer()
    listy = []
    for word in TextBlob(text).words:
        listy.append(stemmer.stem(word))
    return listy

def token_clean(text):
    text = text.replace('\n',' ')
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    text = text.lower().split()
    return text

In [None]:
df = pd.read_pickle('../data/toxictrain.pkl')
print(df.shape)
df.head()

In [None]:
df.sum(axis=0,numeric_only=True)

### Upsampling

In [None]:
df.columns

In [None]:
df_Cl = df[df.rating == 0]
df_To = df[df.toxic == 1]
df_ST = df[df.severe_toxic == 1]
df_Ob = df[df.obscene == 1]
df_Th = df[df.threat == 1]
df_In = df[df.insult == 1]
df_IH = df[df.identity_hate == 1]
print(df_Cl.shape, df_To.shape,df_ST.shape,df_Ob.shape,df_Th.shape,df_In.shape,df_IH.shape)

df_STu = resample(df_ST, replace=True, n_samples=30000)
df_Obu = resample(df_Ob, replace=True, n_samples=20000)
df_Thu = resample(df_Th, replace=True, n_samples=20000)
df_Inu = resample(df_In, replace=True, n_samples=20000)
df_IHu = resample(df_IH, replace=True, n_samples=30000)

df = pd.concat([df_Cl, df_STu, df_Obu, df_Thu, df_Inu, df_IHu])
print(df.shape)

In [None]:
df.sum(axis=0,numeric_only=True)

In [None]:
# df_t.sum(axis=0,numeric_only=True)

In [None]:
# df1 = shuffle(df)[-100000:]
# # df1 = df[['comment_text_s','toxic']]
# X = df1['comment_text_s']
# y = df1['toxic']
# print(X.shape, y.shape)

In [None]:
# count_vect = CountVectorizer()
# X_cv = count_vect.fit_transform(df1.comment_text_s)
# X_cv.shape

In [None]:
# X = X.toarray()

In [None]:
# X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, y, test_size=0.3,random_state=42, stratify=y)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

# k_range = list(range(1, 101))
# print(X_train.shape, y_train.shape)
# print(X_test.shape,y_test.shape)

In [None]:
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [None]:
# clf = MultinomialNB().fit(X_train_tfidf, y)
# text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', MultinomialNB())])
# text_clf.fit(X_train, y_train)
# y_pred = text_clf.predict(X_test)

# print(np.mean(y_pred == y_test))
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred)) 

In [None]:
# text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
#                                            alpha=1e-3, random_state=42,
#                                            max_iter=5, tol=None))])
# text_clf.fit(X_train,y_train) 
# y_pred = text_clf.predict(X_test)
# print(np.mean(y_pred == y_test))
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred)) 

In [None]:
# pipe = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('pipe', RandomForestClassifier(max_features='auto', n_estimators=1000))])
# pipe.fit(X_train,y_train) 
# y_pred = pipe.predict(X_test)
# print(np.mean(y_pred == y_test))
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred))

In [None]:
# pipe = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('pipe', GradientBoostingClassifier(n_estimators=500))])
# pipe.fit(X_train,y_train) 
# y_pred = pipe.predict(X_test)
# print(np.mean(y_pred == y_test))
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred))

In [None]:
def test_model(dataframe, model):
    categories = ['clean','toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
    cat_dict = {}
    
    
    for cat in categories:
        df = dataframe[dataframe[cat]==1]
        X = df['comment_text_s']
        y = pd.cut(df.rating, bins=7, labels=list(range(7)))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)
        X_train, X_test = np.array(X_train), np.array(X_test)

        pipe = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('to_dense', DenseTransformer()),
                             ('pipe', model)])
        pipe.fit(X_train,y_train.ravel()) # added .ravel() due to feature names mismatch
        y_pred = pipe.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision_1 = precision_score(y_test, y_pred ,pos_label=1, average=None)
        precision_0 = precision_score(y_test, y_pred ,pos_label=0, average=None)
        recall_1 = recall_score(y_test, y_pred, pos_label=1, average=None)
        recall_0 = recall_score(y_test, y_pred, pos_label=0, average=None)
        f1_1 = f1_score(y_test, y_pred, pos_label=1, average=None)
        f1_0 = f1_score(y_test, y_pred, pos_label=0, average=None)
#         auc = roc_auc_score(y_test, y_pred)        
        
        mn = np.mean(y_pred == y_test)
        cm = confusion_matrix(y_test,y_pred)
        cr = classification_report(y_test,y_pred)
        cat_dict[cat] = {"accuracy": accuracy, "precision_1": precision_1, 
                         "precision_0": precision_0, "recall_1": recall_1, 
                         "recall_0": recall_0, "f1_1": f1_1, "f1_0": f1_0, 
                         "mean":mn,"confusion_matrix":cm,"class_report":cr} # "auc": auc,
    
    cat_dict = pd.DataFrame(cat_dict)
#     cat_dict = cat_dict.copy()
    cat_dict['average'] = cat_dict.mean(numeric_only=True, axis=1)
    return cat_dict

In [None]:
df_tm = shuffle(df)[-5000:]
xgb = test_model(df_tm, XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
                                       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
                                       min_child_weight=11, missing=-999, n_estimators=500, nthread=4,
                                       objective='reg:linear', reg_alpha=0, reg_lambda=1,
                                       scale_pos_weight=1, seed=0, silent=1, subsample=0.8))
xgb

In [None]:
# xgb = xgb.copy()
# xgb['average'] = xgb.mean(numeric_only=True, axis=1)
# xgb

In [None]:
df_rf = shuffle(df)[-5000:]
rf = test_model(df_rf, RandomForestClassifier(max_features='auto',n_estimators=1000))
rf

In [None]:
df_gb = shuffle(df)[-5000:]
gb = test_model(df_gb, GradientBoostingClassifier(n_estimators=500))
gb

In [None]:
# df_nbm = shuffle(df)[-5000:]
# nbm = test_model(df_nbm, MultinomialNB(alpha=0))
# nbm

In [24]:
# xgb

In [25]:
# df_XGB = shuffle(df)[-5000:]

# categories = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
# cat_XGB = {}
# X = df_XGB['comment_text_s']

# for cat in categories:
#     y = df_XGB[cat]
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)
#     X_train, X_test = np.array(X_train), np.array(X_test)

#     pipe = Pipeline([('vect', CountVectorizer()),
#                          ('tfidf', TfidfTransformer()),
#                          ('to_dense', DenseTransformer()),
#                          ('pipe', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
#                                    gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
#                                    min_child_weight=11, missing=-999, n_estimators=500, nthread=4,
#                                    objective='reg:linear', reg_alpha=0, reg_lambda=1,
#                                    scale_pos_weight=1, seed=0, silent=1, subsample=0.8))])
#     pipe.fit(X_train,y_train.ravel()) # added .ravel() due to feature names mismatch
#     y_pred = pipe.predict(X_test) 
#     mn = np.mean(y_pred == y_test)
#     cm = confusion_matrix(y_test,y_pred)
#     cr = classification_report(y_test,y_pred)
#     cat_XGB[cat] = mn, cm, cr
# cat_XGB

In [26]:
# cat_XGB_df = pd.DataFrame(cat_XGB).T
# cat_XGB_df

In [27]:
# pipe = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('pipe', SVC(C=10,gamma=0.001,probability=True))])
# pipe.fit(X_train,y_train) 
# y_pred = pipe.predict(X_test)
# print(np.mean(y_pred == y_test))
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred))

In [None]:
# from sklearn.preprocessing import MultiLabelBinarizer
# Define models to test
model_list = [['RandomForest', RandomForestClassifier(max_features='auto',n_estimators=1000)], 
                ['GradientBoost', GradientBoostingClassifier(n_estimators=500)],
                ['XGBoost', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
                       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
                       min_child_weight=11, missing=-999, n_estimators=500, nthread=4,
                       objective='reg:linear', reg_alpha=0, reg_lambda=1,
                       scale_pos_weight=1, seed=0, silent=1, subsample=0.8)]] 

# Calculate metrics for each model
roc = {}
results_dict = {}
for model in model_list:
    
    df_A = shuffle(df)[-10000:]
    X = df_A['comment_text_s']
    y = pd.cut(df_A.rating, bins=7, labels=list(range(7)))
#     mlb = MultiLabelBinarizer()
#     y = mlb.fit_transform(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)
    
    model_name = model[0]
    model = model[1]
    pipe = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
                     ('to_dense', DenseTransformer()), ('pipe', model)])

    accuracy = []
    precision_1 = []
    precision_0 = []
    recall_1 = []
    recall_0 = []
    f1_1 = []
    f1_0 = []
#     auc = []
        
    # Perform K-Fold CV and calculate metrics for each fold
    kf = KFold(5, random_state=42, shuffle=True) 
    for train_idx, test_idx in kf.split(X, y=y):
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        accuracy.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred ,pos_label=1, average=None))
        precision_0.append(precision_score(y_test, y_pred ,pos_label=0, average=None))
        recall_1.append(recall_score(y_test, y_pred, pos_label=1, average=None))
        recall_0.append(recall_score(y_test, y_pred, pos_label=0, average=None))
        f1_1.append(f1_score(y_test, y_pred, pos_label=1, average=None))
        f1_0.append(f1_score(y_test, y_pred, pos_label=0, average=None))
#         auc.append(roc_auc_score(y_test, y_pred, average='macro'))
        
    # Calculate mean metric across K-folds
    mean_accuracy = np.mean(accuracy)
    mean_precision_1 = np.mean(precision_1)
    mean_precision_0 = np.mean(precision_0)
    mean_recall_1 = np.mean(recall_1)
    mean_recall_0 = np.mean(recall_0)
    mean_f1_1 = np.mean(f1_1)
    mean_f1_0 = np.mean(f1_0)
#     mean_auc = np.mean(auc)
    
    # Capture TPR and FPR from last fold for plotting
    y_score = pipe.predict_proba(X_test)[:,1]
#     roc[model_name] = roc_curve(y_test, y_score), mean_auc
    results_dict[model_name] = {"accuracy": mean_accuracy, "precision_s": mean_precision_1, 
                                "precision_f": mean_precision_0, "recall_s": mean_recall_1, 
                                "recall_f": mean_recall_0, "f1_s": mean_f1_1, "f1_f": mean_f1_0} # "auc": mean_auc
    
    # Print formatted results
    print(model)
    print('\t==============================')
    print('\tAccuracy:', mean_accuracy)
    print('\tAUC:', mean_auc)
    print('\n')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred)) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
	Accuracy: 0.801266666667
	AUC: 0.934563523665


[[1585    0    1   34    2    0    0]
 [  27    6    0    1    0    0    0]
 [  86    0   71   10   24    0    0]
 [ 141    0    6  100   91    1    0]
 [  87    0    3   24  430    7    0]
 [  10    0    0    3   32  186    0]
 [   5    0    0    0    1    0   26]]
             precision    recall  f1-score   support

          0       0.82      0.98      0.89      1622
          1       1.00      0.18      0.30        34
          2       0.88      0.37      0.52       191
          3       0.58      0.29      0.39      

In [None]:
# Plot the ROC curve from the last K-Fold split
fig, ax = plt.subplots(figsize=(10, 10))

# Plot 50-50 Line
ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')

# Plot Classifier ROC Curves
for key, value in roc.items():
    label = '{}, AUC: {}%'.format(key, round(100*value[1],1))
    ax.plot(roc[key][0][0], roc[key][0][1], label=label)
    
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC Curve - All Models',fontweight='bold',fontsize=15)
ax.legend(loc='best')
plt.savefig('../charts/toxic_roc_1.png')

In [None]:
rd = pd.DataFrame(results_dict).T
rd = rd.apply(lambda x: round(100*x,1).astype(str) + "%")
rd = rd.sort_values(['auc'],ascending=[False])
rd