## Toxic: GridSearchCV (Supervised)

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.base import TransformerMixin
from sklearn.utils import resample, shuffle
from sklearn.cross_validation import train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix, precision_score, 
                             recall_score, f1_score, roc_curve, roc_auc_score, average_precision_score, 
                             precision_recall_curve, auc)


In [31]:
df = pd.read_pickle('../data/toxictrain.pkl')
print(df.shape)
df.head()

(159571, 19)


Unnamed: 0_level_0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,category,rating,comment_text_s,comment_text_f,token_clean,sent_token,polarity_comment,polarity_comment_s,word_count,char_count,char_count_s
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3a4c7758fad18de3,3a4c7758fad18de3,", I hope your retarded kids get anal raped and...",1,1,1,1,1,1,ToSTObThInIH,6,", I hope your retarded kids get anal raped and...",", hope retarded kids get anal raped murdered h...","[i, hope, your, retarded, kids, get, anal, rap...","[, I hope your retarded kids get anal raped an...",0.0,0.0,94,494,494
24d2b50726b67167,24d2b50726b67167,I am going to murder ZimZalaBim ST47 for being...,1,1,1,1,1,1,ToSTObThInIH,6,I am going to murder ZimZalaBim ST47 for being...,am going to murder ZimZalaBim ST47 being evil ...,"[i, am, going, to, murder, zimzalabim, st47, f...",[I am going to murder ZimZalaBim ST47 for bein...,0.0,0.0,12,68,68
c586b7a2fd575b13,c586b7a2fd575b13,"Shut up you asswipe, we don't care. I'll decap...",1,1,1,1,1,1,ToSTObThInIH,6,"Shut up you asswipe, we don't care. I'll decap...","Shut up asswipe , do n't care . 'll decapitate...","[shut, up, you, asswipe, we, dont, care, ill, ...","[Shut up you asswipe, we don't care., I'll dec...",-0.25,-0.25,24,3890,1000
77d84b1321c22d9a,77d84b1321c22d9a,"LGBT \n\nyou little fuck , are you a fag , tha...",1,1,1,1,1,1,ToSTObThInIH,6,"LGBT \n\nyou little fuck , are you a fag , tha...","LGBT little fuck , are fag , piece shit page i...","[lgbt, you, little, fuck, are, you, a, fag, th...","[LGBT \n\nyou little fuck , are you a fag , th...",0.0,0.0,56,280,280
1368c10281978876,1368c10281978876,"You're a stupid cunt \n\nFuck you dumb arse, y...",1,1,1,1,1,1,ToSTObThInIH,6,"You're a stupid cunt \n\nFuck you dumb arse, y...","'re stupid cunt Fuck dumb arse , mum has hairy...","[youre, a, stupid, cunt, fuck, you, dumb, arse...","[You're a stupid cunt \n\nFuck you dumb arse, ...",0.0,0.0,59,278,278


In [32]:
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [33]:
# Upsampling
df_majority = df[df.rating == 0]
df_minority = df[df.rating > 0]
print(df_majority.shape, df_minority.shape)

df_minority_upsampled = resample(df_minority, replace=True, n_samples=127000)

df = pd.concat([df_majority,df_minority_upsampled])
print(df.shape)

(143346, 19) (16225, 19)
(270346, 19)


In [34]:
df_1 = shuffle(df)[-5000:]
# X = df_1['comment_text_s']
# y = df_1['toxic']
# print(X.shape, y.shape)

In [35]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

# k_range = list(range(1, 101))
# print(X_train.shape, y_train.shape)
# print(X_test.shape,y_test.shape)

In [38]:
X = df_1['comment_text_s']
y = df_1['toxic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

model = RandomForestClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('to_dense', DenseTransformer()),
                     ('grid', grid)])
pipe.fit(X_train,y_train) 
grid_predictions = pipe.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.1s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.1s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.1s
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=1000 ............................
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.5min finished


{'max_features': 'auto', 'n_estimators': 1000} RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) [[755  73]
 [205 467]]              precision    recall  f1-score   support

          0       0.79      0.91      0.84       828
          1       0.86      0.69      0.77       672

avg / total       0.82      0.81      0.81      1500



In [8]:
categories = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
gs_cat = {}

for cat in categories:
    X = df_1['comment_text_s']
    y = df_1[cat]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

    model = RandomForestClassifier()
    param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
    grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
    pipe = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('to_dense', DenseTransformer()),
                         ('grid', grid)])
    pipe.fit(X_train,y_train) 
    grid_predictions = pipe.predict(X_test)
    best_params = grid.best_params_
    best_estimator = grid.best_estimator_
    best_cm = confusion_matrix(y_test,grid_predictions)
    best_cr = classification_report(y_test,grid_predictions)
    gs_cat[cat] = best_params, best_estimator, best_cm, best_cr
gs_cat

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.1s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.3s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.2s
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=1000 ............................
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  5.6min finished


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.3s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.4s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.3s
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=1000 ............................
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  5.9min finished


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.4s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.2s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.3s
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=1000 ............................
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.2min finished


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   0.8s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   0.8s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   0.8s
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=1000 ............................
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  3.8min finished


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.2s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.3s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.3s
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=1000 ............................
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.3min finished


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.6s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.6s
[CV] max_features=auto, n_estimators=100 .............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.7s
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=100 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=500 .............................
[CV] max_features=auto, n_estimators=1000 ............................
[CV] max_feature

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  7.0min finished


{'identity_hate': ({'max_features': 'auto', 'n_estimators': 500},
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False),
  array([[1421,   12],
         [  56,   11]]),
  '             precision    recall  f1-score   support\n\n          0       0.96      0.99      0.98      1433\n          1       0.48      0.16      0.24        67\n\navg / total       0.94      0.95      0.94      1500\n'),
 'insult': ({'max_features': 'sqrt', 'n_estimators': 1000},
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            