In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns

import nltk
from nltk.tokenize import (sent_tokenize, TreebankWordTokenizer, WhitespaceTokenizer)
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from nltk.corpus import stopwords

from collections import Counter
from operator import itemgetter

from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import re

# from sqlalchemy import create_engine

from sklearn.naive_bayes import GaussianNB, BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix, precision_score, 
                             recall_score, f1_score, roc_curve, roc_auc_score, average_precision_score, 
                             precision_recall_curve, auc)

from sklearn.utils import resample
from sklearn.cross_validation import train_test_split 
from sklearn.model_selection import KFold  
from sklearn.preprocessing import StandardScaler 

from sklearn.base import TransformerMixin

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

from sklearn.utils import shuffle

from xgboost import XGBClassifier

import datetime
import warnings
import os

warnings.filterwarnings("ignore")



In [2]:
# helper functions
def sentence_tokenizer(text):
    """Return the list of sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    return sent_tokenize(text)

def whitespace_tokenizer(sentences):
    """Tokenize each sentence on whitespace.

    ``sentences`` is read by integer position ``0 .. len(sentences) - 1``;
    the result is one token list per sentence, in the same order.
    """
    splitter = WhitespaceTokenizer()
    return [splitter.tokenize(sentences[pos]) for pos in range(len(sentences))]

def polarity(sentences):
    """Score every sentence with TextBlob and summarise the scores.

    ``sentences`` is read by integer position ``0 .. len(sentences) - 1``.

    Returns a 4-tuple ``(min, max, mean, scores)`` where ``scores`` is the
    list of per-sentence polarity values in input order.
    """
    scores = [TextBlob(sentences[pos]).polarity for pos in range(len(sentences))]
    return np.min(scores), np.max(scores), np.mean(scores), scores

def stemmer(text):
    """Return the Porter stem of every word TextBlob extracts from ``text``."""
    porter = PorterStemmer()
    return [porter.stem(word) for word in TextBlob(text).words]

def token_clean(text):
    """Normalise ``text`` and split it into lowercase tokens.

    Newlines become spaces, every character other than ASCII letters,
    digits and spaces is removed, and the remainder is lowercased and
    split on whitespace.
    """
    flattened = text.replace('\n', ' ')
    alnum_only = re.sub('[^A-Za-z0-9 ]+', '', flattened)
    return alnum_only.lower().split()

In [3]:
# Load the pickled DataFrame from the data directory (path is relative to this notebook).
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
df = pd.read_pickle('../data/toxictrain.pkl')
print(df.shape)
# df.head()

(159571, 24)


In [4]:
# Column-wise totals over the numeric columns only.
# Presumably the label columns are 0/1 flags, so their totals are row counts -- TODO confirm.
df.sum(axis=0,numeric_only=True)

toxic                 1.529400e+04
severe_toxic          1.595000e+03
obscene               8.449000e+03
threat                4.780000e+02
insult                7.877000e+03
identity_hate         1.405000e+03
rating                3.509800e+04
clean                 1.433460e+05
polarity_comment      2.065111e+03
polarity_comment_s    1.902005e+03
word_count            1.055252e+07
char_count            6.288266e+07
char_count_s          5.053248e+07
polarity_min         -1.595756e+04
polarity_max          3.604874e+04
polarity_mean         7.919310e+03
dtype: float64

### Upsampling

In [5]:
# List the columns available in the loaded frame.
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'category', 'rating', 'clean',
       'comment_text_s', 'comment_text_f', 'token_clean', 'sent_token',
       'polarity_sentence', 'polarity_comment', 'polarity_comment_s',
       'word_count', 'char_count', 'char_count_s', 'polarity_min',
       'polarity_max', 'polarity_mean'],
      dtype='object')

In [6]:
# Split the frame into the rows with rating == 0 and one subset per label.
# A row may appear in several label subsets (the subsets are not disjoint).
df_Cl = df[df.rating == 0]
df_To = df[df.toxic == 1]
df_ST = df[df.severe_toxic == 1]
df_Ob = df[df.obscene == 1]
df_Th = df[df.threat == 1]
df_In = df[df.insult == 1]
df_IH = df[df.identity_hate == 1]
print(df_Cl.shape, df_To.shape,df_ST.shape,df_Ob.shape,df_Th.shape,df_In.shape,df_IH.shape)

# Upsample the label subsets with replacement. The seed is fixed so the
# resampled frame -- and every metric computed from it -- is reproducible
# after a kernel restart.
UPSAMPLE_SEED = 42
df_STu = resample(df_ST, replace=True, n_samples=20000, random_state=UPSAMPLE_SEED)
df_Obu = resample(df_Ob, replace=True, n_samples=20000, random_state=UPSAMPLE_SEED)
df_Thu = resample(df_Th, replace=True, n_samples=30000, random_state=UPSAMPLE_SEED)
df_Inu = resample(df_In, replace=True, n_samples=20000, random_state=UPSAMPLE_SEED)
df_IHu = resample(df_IH, replace=True, n_samples=30000, random_state=UPSAMPLE_SEED)

# NOTE(review): df_To is only printed above; it is not part of the concatenation,
# so rows whose only label is `toxic` are dropped here -- confirm this is intended.
# NOTE(review): this rebinds `df`; re-running the cell resamples the already
# upsampled frame, so reload the pickle before running it again.
# NOTE(review): sampling with replacement duplicates rows, and any later
# train/test split of `df` can place copies of a row on both sides.
df = pd.concat([df_Cl, df_STu, df_Obu, df_Thu, df_Inu, df_IHu])
print(df.shape)

(143346, 24) (15294, 24) (1595, 24) (8449, 24) (478, 24) (7877, 24) (1405, 24)
(263346, 24)


In [7]:
# Numeric column totals of the current `df`, kept in `dfs` and printed.
dfs = df.sum(axis=0,numeric_only=True)
print(dfs)

toxic                 1.132030e+05
severe_toxic          4.078700e+04
obscene               9.551200e+04
threat                3.492200e+04
insult                9.566400e+04
identity_hate         4.550400e+04
rating                4.255920e+05
clean                 1.433460e+05
polarity_comment      2.745763e+03
polarity_comment_s    2.569387e+03
word_count            1.639040e+07
char_count            9.702554e+07
char_count_s          7.235674e+07
polarity_min         -4.683376e+04
polarity_max          3.811849e+04
polarity_mean        -5.702754e+03
dtype: float64


In [8]:
# df1 = shuffle(df)[-50000:]

In [9]:
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array.

    Placed between the TF-IDF step and estimators that are fed dense input.
    The result is a plain ``numpy.ndarray``: ``todense()`` would return a
    ``numpy.matrix``, which current scikit-learn estimators reject.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ndarray (input that is already dense is passed through ``np.asarray``)."""
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and densify ``X`` in a single call."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [10]:
# Candidate classifiers as [short_name, estimator] pairs.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is removed
#   in newer scikit-learn releases.
# - random_state is fixed on the tree ensembles so repeated runs are comparable.
# - The XGBoost objective is the binary-classification one; 'reg:linear' is a
#   regression objective and does not yield class probabilities.
# NOTE(review): `silent`, `nthread` and `seed` are the older XGBoost spellings
# (newer releases use verbosity / n_jobs / random_state) -- confirm against the
# installed version.
models = [['rfc', RandomForestClassifier(max_features='sqrt', n_estimators=1000, random_state=42)],
          ['gbc', GradientBoostingClassifier(n_estimators=500, random_state=42)],
          ['xgbc', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
                                 gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
                                 min_child_weight=11, missing=-999, n_estimators=500, nthread=4,
                                 objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                                 scale_pos_weight=1, seed=0, silent=1, subsample=0.8)]]

# Label columns; one binary classifier is trained per entry.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

In [None]:
def all_models_results(data, models, categories):
    """Cross-validate every (model, category) pair with 5-fold CV.

    For each pair a CountVectorizer -> TF-IDF -> dense -> estimator pipeline
    is fitted on the training part of each fold and scored on that fold's
    held-out part, so the reported means are genuine cross-validation means.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the text column ``'comment_text_s'`` and one column per
        entry of ``categories``.
    models : sequence of [name, estimator]
        Estimators must implement ``predict_proba``.
    categories : sequence of str
        Names of the binary label columns to model.

    Returns
    -------
    roc : dict
        ``(model_name, category) -> (roc_curve(...) of the last fold, mean AUC)``.
    results_dict : dict
        ``(model_name, category) -> {"accuracy", "precision_s", "precision_f",
        "recall_s", "recall_f", "f1_s", "f1_f", "auc"}`` fold means, where the
        ``_s`` keys refer to label 1 and the ``_f`` keys to label 0.

    Notes
    -----
    NOTE(review): if ``data`` contains duplicated rows (e.g. from sampling
    with replacement), copies can fall on both sides of a fold and inflate
    the scores.
    """
    metric_names = ("accuracy", "precision_1", "precision_0", "recall_1",
                    "recall_0", "f1_1", "f1_0", "auc")
    roc = {}
    results_dict = {}

    X = data['comment_text_s']
    # Fixed seed: every model and category is evaluated on the same folds.
    kf = KFold(5, random_state=42, shuffle=True)

    for model in models:
        model_name = model[0]
        model_model = model[1]

        for cat in categories:
            y = data[cat]

            pipe = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                             ('to_dense', DenseTransformer()), ('pipe', model_model)])

            fold_scores = {name: [] for name in metric_names}
            last_curve = None

            # Fit and score on each fold's own split (positional indexing,
            # so a non-unique DataFrame index is not a problem).
            for train_idx, test_idx in kf.split(X, y=y):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                pipe.fit(X_train, np.asarray(y_train))
                y_pred = pipe.predict(X_test)
                # AUC and the ROC curve need continuous scores, not hard labels.
                y_score = pipe.predict_proba(X_test)[:, 1]

                fold_scores["accuracy"].append(accuracy_score(y_test, y_pred))
                fold_scores["precision_1"].append(precision_score(y_test, y_pred, pos_label=1))
                fold_scores["precision_0"].append(precision_score(y_test, y_pred, pos_label=0))
                fold_scores["recall_1"].append(recall_score(y_test, y_pred, pos_label=1))
                fold_scores["recall_0"].append(recall_score(y_test, y_pred, pos_label=0))
                fold_scores["f1_1"].append(f1_score(y_test, y_pred, pos_label=1))
                fold_scores["f1_0"].append(f1_score(y_test, y_pred, pos_label=0))
                fold_scores["auc"].append(roc_auc_score(y_test, y_score))

                # Keep the most recent fold's curve for plotting.
                last_curve = roc_curve(y_test, y_score)

            # Mean of each metric across the folds.
            means = {name: np.mean(values) for name, values in fold_scores.items()}

            roc[(model_name, cat)] = last_curve, means["auc"]
            results_dict[(model_name, cat)] = {"accuracy": means["accuracy"],
                                               "precision_s": means["precision_1"],
                                               "precision_f": means["precision_0"],
                                               "recall_s": means["recall_1"],
                                               "recall_f": means["recall_0"],
                                               "f1_s": means["f1_1"],
                                               "f1_f": means["f1_0"],
                                               "auc": means["auc"]}
    return roc, results_dict

In [None]:
# Evaluate on a 10,000-row random subsample of `df`; the shuffle is seeded so
# the same rows are selected on every run.
df_amr = shuffle(df, random_state=42)[-10000:]
roc, results_dict = all_models_results(df_amr, models, categories)

In [None]:
# Draw one ROC curve per entry of `roc` and save the figure to disk
fig, ax = plt.subplots(figsize=(10, 10))

# Diagonal reference line
ax.plot([0,1],[0,1], ls='--', color='k', label='50-50')

# Each entry holds (roc_curve output, mean AUC); the curve's first two items are FPR and TPR
for key, (curve, mean_auc) in roc.items():
    fpr, tpr = curve[0], curve[1]
    ax.plot(fpr, tpr, label='{}, AUC: {}%'.format(key, round(100*mean_auc,1)))

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC Curve - All Models',fontweight='bold',fontsize=15)
ax.legend(loc='best')
plt.savefig('../charts/toxic_roc_1.png')

In [None]:
# One row per (model, category), one column per metric.
rd = pd.DataFrame(results_dict).T
# Sort while the values are still numeric: once they are formatted as strings
# the ordering is lexicographic (e.g. "100.0%" would sort below "99.1%").
rd = rd.sort_values(['auc'],ascending=[False])
# Format every metric as a percentage string for display.
rd = rd.apply(lambda x: round(100*x,1).astype(str) + "%")
rd

In [None]:
def test_model(dataframe, model):
    """Fit ``model`` once per label column on a stratified 70/30 split and score it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the text column ``'comment_text_s'`` and the six label
        columns listed in ``categories`` below.
    model : estimator
        Classifier used as the last step of a
        CountVectorizer -> TF-IDF -> dense pipeline. It is refitted for each
        label column.

    Returns
    -------
    pandas.DataFrame
        One column per label; rows are ``accuracy``, ``precision_1``,
        ``precision_0``, ``recall_1``, ``recall_0``, ``f1_1``, ``f1_0``,
        ``mean``, ``auc``, ``confusion_matrix`` and ``class_report``.
    """
    categories = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
    cat_dict = {}

    X = dataframe['comment_text_s']

    for cat in categories:
        y = dataframe[cat]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

        pipe = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('to_dense', DenseTransformer()),
                         ('pipe', model)])
        # Pass the labels as a plain 1-D array (what the original .ravel() produced).
        pipe.fit(X_train, np.asarray(y_train))
        y_pred = pipe.predict(X_test)

        # AUC needs continuous scores; fall back to hard labels only when the
        # estimator exposes neither probabilities nor a decision function.
        if hasattr(model, "predict_proba"):
            y_score = pipe.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = pipe.decision_function(X_test)
        else:
            y_score = y_pred

        accuracy = accuracy_score(y_test, y_pred)
        precision_1 = precision_score(y_test, y_pred ,pos_label=1)
        precision_0 = precision_score(y_test, y_pred ,pos_label=0)
        recall_1 = recall_score(y_test, y_pred, pos_label=1)
        recall_0 = recall_score(y_test, y_pred, pos_label=0)
        f1_1 = f1_score(y_test, y_pred, pos_label=1)
        f1_0 = f1_score(y_test, y_pred, pos_label=0)
        auc_value = roc_auc_score(y_test, y_score)

        # Fraction of matching predictions (kept under the "mean" key for callers).
        mn = np.mean(y_pred == np.asarray(y_test))
        conf_mat = confusion_matrix(y_test,y_pred)
        report = classification_report(y_test,y_pred)
        cat_dict[cat] = {"accuracy": accuracy, "precision_1": precision_1,
                         "precision_0": precision_0, "recall_1": recall_1,
                         "recall_0": recall_0, "f1_1": f1_1, "f1_0": f1_0,
                         "mean":mn,"auc": auc_value,"confusion_matrix":conf_mat,"class_report":report}

    cat_dict = pd.DataFrame(cat_dict)
    return cat_dict

In [None]:
# 20,000-row random subsample; the shuffle is seeded so the run is repeatable.
df_tm = shuffle(df, random_state=42)[-20000:]
xgb = test_model(df_tm, XGBClassifier())
xgb

In [None]:
# 20,000-row random subsample; the shuffle is seeded so the run is repeatable.
# max_features='sqrt' is what 'auto' meant for classifiers ('auto' is removed in newer scikit-learn).
df_rf = shuffle(df, random_state=42)[-20000:]
rf = test_model(df_rf, RandomForestClassifier(max_features='sqrt',n_estimators=1000))
rf

In [None]:
# 20,000-row random subsample; the shuffle is seeded so the run is repeatable.
df_gb = shuffle(df, random_state=42)[-20000:]
gb = test_model(df_gb, GradientBoostingClassifier(n_estimators=500))
gb