In [1]:
!pip3 install imbalanced-learn



In [16]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.utils import class_weight

import re


toxic = pd.read_csv('train_cleaned.csv')

max_features = 1000

toxic.dropna(axis=0, inplace=True)
x_train, x_test, y_train, y_test = train_test_split(toxic.loc[:,'comment_text_clean'], toxic.iloc[:,2:8], test_size = .2, random_state = 43)



In [17]:
#TF-IDF Vectors as features

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=max_features)
tfidf_vect.fit(x_train)
x_train_tfidf =  tfidf_vect.transform(x_train)
x_test_tfidf =  tfidf_vect.transform(x_test)

x_train_tfidf_os_all = [] #os = oversample
y_train_tfidf_os_all = []


for i in range(6):
   sm_tfidf = RandomOverSampler(random_state=40)
   x_train_tfidf_os, y_train_tfidf_os = sm_tfidf.fit_resample(x_train_tfidf, y_train.iloc[:,i])
   x_train_tfidf_os_all.append(x_train_tfidf_os)
   y_train_tfidf_os_all.append(y_train_tfidf_os)

In [18]:
#########
#Start building model here

class toxicmodel:
   def __init__(self, x_train, y_train, x_test, y_test, x_train_tfidf, n=6):
       self.n = n
       self.x_train = x_train
       self.y_train = y_train
       self.x_test = x_test
       self.y_test = y_test
       self.x_train_tfidf = x_train_tfidf

       self.best_params = []
       self.best_estimator = []

       self.y_predict_train = []
       self.y_predict_test = []
       self.y_predict_proba_train = []
       self.y_predict_proba_test = []

       self.acc_score_train = []
       self.acc_score_test = []

       self.roc_auc_score_train = []
       self.roc_auc_score_test = []

       self.f1_score_train = []
       self.f1_score_test = []

       self.confusion_matrix_train = []
       self.confusion_matrix_test = []

       self.classification_report_train = []
       self.classification_report_test = []


   def trainmodel(self, model_name, hyper_param_grid):
       for i in range(self.n):
           grid_search_model = GridSearchCV(model_name, hyper_param_grid, scoring='roc_auc', cv=5, refit=True, n_jobs=-1,
                                            # n_iter=10,
                                            verbose=5)
           grid_search_model.fit(self.x_train[i], self.y_train[i])
           self.best_params.append(grid_search_model.best_params_)
           self.best_estimator.append(grid_search_model.best_estimator_)

   def predictmodel(self):
       for i in range(self.n):
           y_predict_train = self.best_estimator[i].predict(self.x_train_tfidf)
           y_predict_test = self.best_estimator[i].predict(self.x_test)

           y_predict_proba_train = self.best_estimator[i].predict_proba(self.x_train_tfidf)[:, 1]
           y_predict_proba_test = self.best_estimator[i].predict_proba(self.x_test)[:, 1]

           self.y_predict_train.append(y_predict_train)
           self.y_predict_test.append(y_predict_test)

           self.y_predict_proba_train.append(y_predict_proba_train)
           self.y_predict_proba_test.append(y_predict_proba_test)

           # self.roc_auc_score_train.append(roc_auc_score(self.y_train[i], y_predict_proba_train))
           self.roc_auc_score_test.append(roc_auc_score(self.y_test.iloc[:, i], y_predict_proba_test))

           # self.acc_score_train.append(accuracy_score(self.y_train[i], y_predict_train))
           # self.acc_score_test.append(accuracy_score(self.y_test.iloc[:,i], y_predict_test))

           # self.f1_score_train.append(f1_score(self.y_train[i], y_predict_train))
           # self.f1_score_test.append(f1_score(self.y_test.iloc[:,i], y_predict_test))

           # self.confusion_matrix_train.append(confusion_matrix(self.y_train[i], y_predict_train))
           # self.confusion_matrix_test.append(confusion_matrix(self.y_test.iloc[:,i], y_predict_test))

           # self.classification_report_train.append(classification_report(self.y_train[i], y_predict_train))
           # self.classification_report_test.append(classification_report(self.y_test.iloc[:,i], y_predict_test))


rf = toxicmodel(x_train_tfidf_os_all, y_train_tfidf_os_all, x_test_tfidf, y_test, x_train_tfidf, n=6)
rf.trainmodel(RandomForestClassifier(), {'random_state': [64],
                                         'n_estimators': [400,600],
                                         'max_depth': [15]
                                         })
rf.predictmodel()

train_f1 = []
cutoff = []
for i in range(6):
    best_f1 = 0
    best_cutoff = 0
    for j in np.arange(0, 1, 0.001):
        if f1_score(y_train.iloc[:, i], rf.y_predict_proba_train[i] > j) > best_cutoff:
            best_f1 = f1_score(y_train.iloc[:, i], rf.y_predict_proba_train[i] > j)
            best_cutoff = j
    train_f1.append(best_f1)
    cutoff.append(best_cutoff)

test_f1 = [f1_score(y_test.iloc[:, i], rf.y_predict_proba_test[i] > cutoff[i]) for i in range(6)]
print(train_f1)
print(test_f1)
print(rf.roc_auc_score_test)
print(rf.best_estimator)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  5.4min remaining:  8.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  6.5min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.6min finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  4.8min remaining:  7.2min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  5.7min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.6min finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  5.4min remaining:  8.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  6.3min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.3min finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  3.9min remaining:  5.9min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  4.6min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.4min finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  5.0min remaining:  7.6min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  6.0min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.0min finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  4.7min remaining:  7.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  5.5min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.5min finished
  'precision', 'predicted', average, warn_for)


[0.6370630907891844, 0.5678496868475991, 0.678496067523499, 0.4520884520884521, 0.6562291169451074, 0.32875783765463484]
[0.5783908045977012, 0.4494086727989488, 0.6071282261368292, 0.19230769230769232, 0.5842235004108463, 0.24128312412831243]
[0.9031222119395219, 0.9669662921348315, 0.953550408555818, 0.9058527534153896, 0.9421733114143158, 0.9209279282469266]
[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=None,
            oob_score=False, random_state=64, verbose=0, warm_start=False), RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_s