## Toxic: GridSearchCV (Supervised)

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.base import TransformerMixin
from sklearn.utils import resample, shuffle
from sklearn.cross_validation import train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import (accuracy_score, classification_report,confusion_matrix, precision_score, 
                             recall_score, f1_score, roc_curve, roc_auc_score, average_precision_score, 
                             precision_recall_curve, auc)

from xgboost import XGBClassifier





### Assumptions

In [2]:
df = pd.read_pickle('../data/toxictrain.pkl')
print(df.shape)
# df.head()

categories = ['clean','toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
models = [['rfc',RandomForestClassifier(),[{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]],
          ['xgbc',XGBClassifier(),[{'n_estimators':[50,100,150,200],'max_depth':[2,3,4,5,6,7,8,9],'min_child_weight':[2,3,4,5],
               'colsample_bytree':[0.2,0.6,0.8],'colsample_bylevel':[0.2,0.6,0.8]}]],
          ['gbc',GradientBoostingClassifier(),[{'n_estimators': [10, 100, 500, 1000]}]]]

(159571, 24)


### Helper Functions

In [3]:
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

### Upsampling

In [4]:
# Upsampling
df_Cl = df[df.rating == 0]
df_To = df[df.toxic == 1]
df_ST = df[df.severe_toxic == 1]
df_Ob = df[df.obscene == 1]
df_Th = df[df.threat == 1]
df_In = df[df.insult == 1]
df_IH = df[df.identity_hate == 1]
print(df_Cl.shape, df_To.shape,df_ST.shape,df_Ob.shape,df_Th.shape,df_In.shape,df_IH.shape)

df_STu = resample(df_ST, replace=True, n_samples=20000)
df_Obu = resample(df_Ob, replace=True, n_samples=20000)
df_Thu = resample(df_Th, replace=True, n_samples=30000)
df_Inu = resample(df_In, replace=True, n_samples=20000)
df_IHu = resample(df_IH, replace=True, n_samples=30000)
print(df_STu.shape,df_Obu.shape,df_Thu.shape,df_Inu.shape,df_IHu.shape)


df = pd.concat([df_Cl, df_STu, df_Obu, df_Thu, df_Inu, df_IHu])

print(df.shape)

(143346, 24) (15294, 24) (1595, 24) (8449, 24) (478, 24) (7877, 24) (1405, 24)
(20000, 24) (20000, 24) (30000, 24) (20000, 24) (30000, 24)
(263346, 24)


In [5]:
df_1 = shuffle(df)[-5000:]

In [None]:
def gridsearch_models(data, models, categories):
    model_dict = {}
    for model in models:
        print(model)
        model_name = model[0]
        model_model = model[1]
        param_grid = model[2]
        for item in categories:    
            X = data['comment_text_s']
            y = data[item]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

            grid = GridSearchCV(model_model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
            pipe = Pipeline([('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer()),
                                 ('to_dense', DenseTransformer()),
                                 ('grid', grid)])
            pipe.fit(X_train,y_train) 
            grid_predictions = pipe.predict(X_test)
            best_params = grid.best_params_
            best_estimator = grid.best_estimator_
            best_cm = confusion_matrix(y_test,grid_predictions)
            best_cr = classification_report(y_test,grid_predictions)
            model_dict[(model_name,item)] = {"best_params":best_params, "best_estimator":best_estimator, "best_cm":best_cm, "best_cr":best_cr} 
            print(model_dict[(model_name,item)])
    return model_dict

In [None]:
gm = gridsearch_models(df_1, models, categories)

['rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), [{'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [10, 100, 500, 1000]}]]
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.2s
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ............... max_features=auto, n_estimators=10, total=   1.2s
[CV] max_features=auto, n_estimators=100 .............................

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.3min finished


{'best_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'best_cm': array([[558, 110],
       [ 30, 802]]), 'best_params': {'max_features': 'auto', 'n_estimators': 500}, 'best_cr': '             precision    recall  f1-score   support\n\n          0       0.95      0.84      0.89       668\n          1       0.88      0.96      0.92       832\n\navg / total       0.91      0.91      0.91      1500\n'}
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] ..

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.2min finished


{'best_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'best_cm': array([[846,  25],
       [ 98, 531]]), 'best_params': {'max_features': 'auto', 'n_estimators': 1000}, 'best_cr': '             precision    recall  f1-score   support\n\n          0       0.90      0.97      0.93       871\n          1       0.96      0.84      0.90       629\n\navg / total       0.92      0.92      0.92      1500\n'}
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] 

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  7.3min finished


{'best_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'best_cm': array([[1261,   20],
       [ 102,  117]]), 'best_params': {'max_features': 'sqrt', 'n_estimators': 1000}, 'best_cr': '             precision    recall  f1-score   support\n\n          0       0.93      0.98      0.95      1281\n          1       0.85      0.53      0.66       219\n\navg / total       0.91      0.92      0.91      1500\n'}
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.5min finished


{'best_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'best_cm': array([[968,   9],
       [104, 419]]), 'best_params': {'max_features': 'auto', 'n_estimators': 1000}, 'best_cr': '             precision    recall  f1-score   support\n\n          0       0.90      0.99      0.94       977\n          1       0.98      0.80      0.88       523\n\navg / total       0.93      0.92      0.92      1500\n'}
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] 

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  7.9min finished


{'best_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'best_cm': array([[1309,    2],
       [  60,  129]]), 'best_params': {'max_features': 'sqrt', 'n_estimators': 500}, 'best_cr': '             precision    recall  f1-score   support\n\n          0       0.96      1.00      0.98      1311\n          1       0.98      0.68      0.81       189\n\navg / total       0.96      0.96      0.96      1500\n'}
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.3min finished


{'best_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'best_cm': array([[940,  29],
       [144, 387]]), 'best_params': {'max_features': 'sqrt', 'n_estimators': 1000}, 'best_cr': '             precision    recall  f1-score   support\n\n          0       0.87      0.97      0.92       969\n          1       0.93      0.73      0.82       531\n\navg / total       0.89      0.88      0.88      1500\n'}
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_features=auto, n_estimators=10 ..............................
[CV] max_features=auto, n_estimators=10 ..............................
[CV] 

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  8.0min finished


{'best_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'best_cm': array([[1233,    6],
       [ 108,  153]]), 'best_params': {'max_features': 'auto', 'n_estimators': 500}, 'best_cr': '             precision    recall  f1-score   support\n\n          0       0.92      1.00      0.96      1239\n          1       0.96      0.59      0.73       261\n\navg / total       0.93      0.92      0.92      1500\n'}
['xgbc', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 12.1min


[CV] min_child_weight=4, colsample_bytree=0.2, n_estimators=150, colsample_bylevel=0.2, max_depth=2 
[CV]  min_child_weight=3, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=2, total= 5.0min
[CV] min_child_weight=4, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=2 
[CV]  min_child_weight=4, colsample_bytree=0.2, n_estimators=100, colsample_bylevel=0.2, max_depth=2, total= 2.7min
[CV] min_child_weight=4, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=2 
[CV]  min_child_weight=4, colsample_bytree=0.2, n_estimators=100, colsample_bylevel=0.2, max_depth=2, total= 2.8min
[CV] min_child_weight=4, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=2 
[CV]  min_child_weight=3, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=2, total= 5.1min
[CV] min_child_weight=5, colsample_bytree=0.2, n_estimators=50, colsample_bylevel=0.2, max_depth=2 
[CV]  min_child_weight=4, colsam

[CV] min_child_weight=3, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=3 
[CV]  min_child_weight=3, colsample_bytree=0.2, n_estimators=100, colsample_bylevel=0.2, max_depth=3, total= 3.1min
[CV] min_child_weight=3, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=3 
[CV]  min_child_weight=2, colsample_bytree=0.2, n_estimators=200, colsample_bylevel=0.2, max_depth=3, total= 6.1min
[CV] min_child_weight=4, colsample_bytree=0.2, n_estimators=50, colsample_bylevel=0.2, max_depth=3 
[CV]  min_child_weight=3, colsample_bytree=0.2, n_estimators=100, colsample_bylevel=0.2, max_depth=3, total= 2.9min
[CV] min_child_weight=4, colsample_bytree=0.2, n_estimators=50, colsample_bylevel=0.2, max_depth=3 
[CV]  min_child_weight=4, colsample_bytree=0.2, n_estimators=50, colsample_bylevel=0.2, max_depth=3, total= 1.6min
[CV] min_child_weight=4, colsample_bytree=0.2, n_estimators=50, colsample_bylevel=0.2, max_depth=3 
[CV]  min_child_weight=3, colsample

In [None]:
# def gridsearch_model_classes(data, model, categories,param_grid):
#     model_dict = {}
#     for item in categories:    
#         X = data['comment_text_s']
#         y = data[item]
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

#         model = model
#         param_grid = param_grid
#         grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
#         pipe = Pipeline([('vect', CountVectorizer()),
#                              ('tfidf', TfidfTransformer()),
#                              ('to_dense', DenseTransformer()),
#                              ('grid', grid)])
#         pipe.fit(X_train,y_train) 
#         grid_predictions = pipe.predict(X_test)
#         best_params = grid.best_params_
#         best_estimator = grid.best_estimator_
#         best_cm = confusion_matrix(y_test,grid_predictions)
#         best_cr = classification_report(y_test,grid_predictions)
#         model_dict[item] = {"best_params":best_params, "best_estimator":best_estimator, 
#                             "best_cm":best_cm, "best_cr":best_cr}
#     return model_dict

In [None]:
# model = RandomForestClassifier()
# param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
# gridsearch_model_classes(df_1, model,categories, param_grid)

In [None]:
# model = XGBClassifier()
# param_grid = [{'n_estimators':[50,100,150,200],'max_depth':[2,3,4,5,6,7,8,9],'min_child_weight':[2,3,4,5],
#                'colsample_bytree':[0.2,0.6,0.8],'colsample_bylevel':[0.2,0.6,0.8]}]
# gridsearch_model_classes(df_1, model,categories, param_grid)

In [None]:
# model = GradientBoostingClassifier()
# param_grid = [{'n_estimators': [10, 100, 500, 1000]}]
# gridsearch_model_classes(df_1, model,categories, param_grid)

In [None]:
X = df_1['comment_text_s']
y = df_1['toxic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

model = RandomForestClassifier()
param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('to_dense', DenseTransformer()),
                     ('grid', grid)])
pipe.fit(X_train,y_train) 
grid_predictions = pipe.predict(X_test)
best_params = grid.best_params_
best_estimator = grid.best_estimator_
best_cm = confusion_matrix(y_test,grid_predictions)
best_cr = classification_report(y_test,grid_predictions)
print(best_params, best_estimator, best_cm, best_cr)

In [None]:
categories = ['clean','toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
gs_cat = {}

for cat in categories:
    X = df_1['comment_text_s']
    y = df_1[cat]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42, stratify=y)

    model = RandomForestClassifier()
    param_grid = [{'n_estimators': [10, 100, 500, 1000],'max_features': ['auto', 'sqrt', 'log2']}]
    grid = GridSearchCV(model,param_grid,refit=True,verbose=2, scoring='roc_auc', n_jobs=-1)
    pipe = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('to_dense', DenseTransformer()),
                         ('grid', grid)])
    pipe.fit(X_train,y_train) 
    grid_predictions = pipe.predict(X_test)
    best_params = grid.best_params_
    best_estimator = grid.best_estimator_
    best_cm = confusion_matrix(y_test,grid_predictions)
    best_cr = classification_report(y_test,grid_predictions)
    gs_cat[cat] = best_params, best_estimator, best_cm, best_cr
gs_cat