In [6]:
#importing libraries
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix, log_loss,roc_auc_score
from sklearn.multiclass import OneVsRestClassifier

# for lightGBM
from lightgbm import LGBMClassifier

# for naive bayes
from sklearn.naive_bayes import GaussianNB

# for evaluation metrics
%run -i helper_functions.py

# for hyperparameter tuning
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 

# for model export
import joblib

# for multi-label classification
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from sklearn.multiclass import OneVsRestClassifier

In [7]:
#loading the train dataset
selected_train = pd.read_csv('Data/selected_train.csv')
selected_train.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,...,105,114,132,135,139,143,156,157,170,198
0,0,0,0,0,0,0,1,0.009393,0.181132,0.0,...,-0.09132,-0.017755,0.003997,-0.199211,-0.109984,0.089084,0.012651,-0.019501,-0.091946,-0.113697
1,0,0,0,0,0,0,1,0.000723,0.160714,0.008929,...,-0.118445,0.039541,0.017522,0.001397,9.4e-05,0.021135,0.039308,-0.067295,-0.079366,-0.044959
2,0,0,0,0,0,0,1,0.007225,0.188841,0.0,...,-0.131555,-0.060714,0.026459,-0.029582,-0.146134,0.109907,0.036539,-0.053628,0.02753,-0.21123
3,0,0,0,0,0,0,1,0.007948,0.175719,0.0,...,-0.145382,-0.004285,-0.000706,-0.134984,-0.214832,0.17756,0.007675,-0.087473,0.106848,-0.015987
4,0,0,0,0,0,0,1,0.003613,0.208955,0.0,...,0.007073,-0.204627,0.162032,0.013798,-0.221076,0.163578,-0.06831,-0.031184,-0.01728,0.038566


In [8]:
#loading the test dataset
selected_test = pd.read_csv('Data/selected_test.csv')
selected_test.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,num_unique_words_scaled,...,114,132,135,139,143,156,157,170,198,none
0,0,0,0,0,0,0,0.00289,0.166667,0.0,0.032911,...,0.040738,-0.010083,-0.158238,-0.204194,0.116565,0.098296,-0.20616,0.046779,-0.036748,1
1,0,0,0,0,0,0,0.005058,0.1875,0.0,0.01519,...,-0.013467,0.00355,-0.117174,-0.075475,0.043945,-0.148617,-0.35074,0.113815,-0.070907,1
2,0,0,0,0,0,0,0.000723,0.149554,0.002232,0.118987,...,-0.12508,0.045087,-0.119656,-0.274707,0.102546,0.100782,-0.107205,-0.05078,-0.098343,1
3,0,0,0,0,0,0,0.007225,0.169661,0.0,0.070886,...,-0.023304,-0.005002,-0.281848,-0.157011,0.184323,0.067625,-0.066242,0.023737,-0.111269,1
4,0,0,0,0,0,0,0.003613,0.167665,0.0,0.060759,...,0.055657,0.024936,-0.181419,-0.165707,0.194511,0.143557,-0.059017,0.136676,-0.00453,1


In [9]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
features = selected_train.columns[7:].tolist()
len(features)

41

## Gradient Boosting (LGBMClassifier)

### Baseline Gradient Boosting using BinaryRelevance

In [57]:
classifier_gb = BinaryRelevance(LGBMClassifier(random_state=0))
# train
classifier_gb.fit(selected_train[features], selected_train[labels])
# predict
predictions_gb = classifier_gb.predict(selected_train[features])

In [58]:
# predict probability
predictions_proba_gb = classifier_gb.predict_proba(selected_train[features])

In [59]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_gb)

Accuracy score:  0.9324626655219307
Precision score:  0.8585256861860837
Recall score:  0.7201834862385321
F1 score:  0.7820014889765815


In [60]:
# evaluation on test data
predictions_gb_test = classifier_gb.predict(selected_test[features])
predictions_proba_gb_test = classifier_gb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_gb_test, predictions_proba_gb_test)

Accuracy score:  0.8789896526931132
Precision score:  0.5829492803206758
Recall score:  0.6643674989653745
F1 score:  0.6174134411558834
Confusion matrix for label toxic:
[[54806  3082]
 [ 1502  4588]]
Confusion matrix for label severe_toxic:
[[63254   357]
 [  220   147]]
Confusion matrix for label obscene:
[[58600  1687]
 [ 1071  2620]]
Confusion matrix for label threat:
[[63381   386]
 [  156    55]]
Confusion matrix for label insult:
[[59351  1200]
 [ 1367  2060]]
Confusion matrix for label identity_hate:
[[62992   274]
 [  550   162]]
Logarithmic Loss:  0.3109336144201333
ROC AUC score:  0.9583558250719668


### Baseline Gradient Boosting using ClassifierChain

In [61]:
classifier_chain_gb = ClassifierChain(LGBMClassifier(random_state=0))
# train
classifier_chain_gb.fit(selected_train[features], selected_train[labels])
# predict
predictions_chain_gb = classifier_chain_gb.predict(selected_train[features])

In [62]:
# predict probability
predictions_proba_chain_gb = classifier_chain_gb.predict_proba(selected_train[features])

In [63]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_chain_gb, predictions_proba_chain_gb)

Accuracy score:  0.9344868428473846
Precision score:  0.8403138050122246
Recall score:  0.7235170095162118
F1 score:  0.775799218354555
Confusion matrix for label toxic:
[[142777   1500]
 [  4464  10830]]
Confusion matrix for label severe_toxic:
[[157818    158]
 [   458   1137]]
Confusion matrix for label obscene:
[[149876   1246]
 [  1717   6732]]
Confusion matrix for label threat:
[[158951    142]
 [   198    280]]
Confusion matrix for label insult:
[[150017   1677]
 [  2165   5712]]
Confusion matrix for label identity_hate:
[[157990    176]
 [   702    703]]
Logarithmic Loss:  0.27562349981689854
ROC AUC score:  0.98159005455481


In [64]:
# evaluation on test data
predictions_chain_gb_test = classifier_chain_gb.predict(selected_test[features])
predictions_proba_chain_gb_test = classifier_chain_gb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_chain_gb_test, predictions_proba_chain_gb_test)

Accuracy score:  0.8797868017130889
Precision score:  0.5570072116149407
Recall score:  0.6842323079045386
F1 score:  0.612636197875275
Confusion matrix for label toxic:
[[54806  3082]
 [ 1502  4588]]
Confusion matrix for label severe_toxic:
[[63268   343]
 [  209   158]]
Confusion matrix for label obscene:
[[58263  2024]
 [  965  2726]]
Confusion matrix for label threat:
[[63426   341]
 [  150    61]]
Confusion matrix for label insult:
[[58812  1739]
 [ 1222  2205]]
Confusion matrix for label identity_hate:
[[62931   335]
 [  530   182]]
Logarithmic Loss:  0.33110831807467267
ROC AUC score:  0.9556995692510866


### Baseline Gradient Boosting using LabelPowerset

In [65]:
classifier_powerset_gb = LabelPowerset(LGBMClassifier(random_state=0))
# train
classifier_powerset_gb.fit(selected_train[features], selected_train[labels])
# predict
predictions_powerset_gb = classifier_powerset_gb.predict(selected_train[features])

In [66]:
# predict probability
predictions_proba_powerset_gb = classifier_powerset_gb.predict_proba(selected_train[features])

In [67]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_powerset_gb, predictions_proba_powerset_gb)

Accuracy score:  0.7406358298187014
Precision score:  0.09740660064706258
Recall score:  0.17445438486523449
F1 score:  0.12454536249158364
Confusion matrix for label toxic:
[[122517  21760]
 [ 12063   3231]]
Confusion matrix for label severe_toxic:
[[153105   4871]
 [  1563     32]]
Confusion matrix for label obscene:
[[136712  14410]
 [  6960   1489]]
Confusion matrix for label threat:
[[151307   7786]
 [   474      4]]
Confusion matrix for label insult:
[[136507  15187]
 [  6551   1326]]
Confusion matrix for label identity_hate:
[[149986   8180]
 [  1364     41]]
Logarithmic Loss:  0.8053150175420156
ROC AUC score:  0.5289704923713838


In [68]:
# evaluation on test data
predictions_powerset_gb_test = classifier_powerset_gb.predict(selected_test[features])
predictions_proba_powerset_gb_test = classifier_powerset_gb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_powerset_gb_test, predictions_proba_powerset_gb_test)

Accuracy score:  0.7441307949607677
Precision score:  0.10160540877173552
Recall score:  0.1799558559801352
F1 score:  0.12966069946135153
Confusion matrix for label toxic:
[[48915  8973]
 [ 4753  1337]]
Confusion matrix for label severe_toxic:
[[61842  1769]
 [  362     5]]
Confusion matrix for label obscene:
[[54521  5766]
 [ 3037   654]]
Confusion matrix for label threat:
[[60954  2813]
 [  210     1]]
Confusion matrix for label insult:
[[54379  6172]
 [ 2828   599]]
Confusion matrix for label identity_hate:
[[60239  3027]
 [  699    13]]
Logarithmic Loss:  0.8015398931821524
ROC AUC score:  0.5309322488614058


### Baseline Gradient Boosting using OneVsRest

In [69]:
# Using pipeline for applying lgbm and one vs rest classifier
LGBM_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LGBMClassifier(random_state=0), n_jobs=-1)),
            ])
for category in labels:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LGBM_pipeline.fit(selected_train[features], selected_train[category])
    
    # calculating test accuracy (on test dataset)
    prediction = LGBM_pipeline.predict(selected_test[features])
    print('Test accuracy is {}'.format(accuracy_score(selected_test[category], prediction)))
    print('Test precision is {}'.format(precision_score(selected_test[category], prediction)))
    print('Test f1-score is {}'.format(f1_score(selected_test[category], prediction)))
    print('Test recall is {}'.format(recall_score(selected_test[category], prediction)))
    print(confusion_matrix(selected_test[category], prediction))
    print('Test logarithmic loss is {}'.format(log_loss(selected_test[category], prediction)))
    print('Test ROC-AUC score is {}'.format(roc_auc_score(selected_test[category], prediction)))
    print("\n")

**Processing toxic comments...**
Test accuracy is 0.9283503704398387
Test precision is 0.598174706649283
Test f1-score is 0.666860465116279
Test recall is 0.7533661740558292
[[54806  3082]
 [ 1502  4588]]
Test logarithmic loss is 2.5825144133250966
Test ROC-AUC score is 0.8500627166575443


**Processing severe_toxic comments...**
Test accuracy is 0.9909812748132171
Test precision is 0.2916666666666667
Test f1-score is 0.3375430539609644
Test recall is 0.40054495912806537
[[63254   357]
 [  220   147]]
Test logarithmic loss is 0.32506780464410595
Test ROC-AUC score is 0.6974663611253978


**Processing obscene comments...**
Test accuracy is 0.9568914314295539
Test precision is 0.6083120501509172
Test f1-score is 0.655163790947737
Test recall is 0.7098347331346518
[[58600  1687]
 [ 1071  2620]]
Test logarithmic loss is 1.5537903036541485
Test ROC-AUC score is 0.840925958801141


**Processing threat comments...**
Test accuracy is 0.9915283378661415
Test precision is 0.12471655328798185
Tes

### Baseline Gradient Boosting using OneVsRest (is_unbalance)

In [70]:
# Using pipeline for applying lgbm and one vs rest classifier
LGBM_pipeline_unbalance = Pipeline([
                ('clf', OneVsRestClassifier(LGBMClassifier(random_state=0, is_unbalance=True), n_jobs=-1)),
            ])
for category in labels:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LGBM_pipeline_unbalance.fit(selected_train[features], selected_train[category])
    
    # calculating test accuracy (on test dataset)
    prediction = LGBM_pipeline_unbalance.predict(selected_test[features])
    print('Test accuracy is {}'.format(accuracy_score(selected_test[category], prediction)))
    print('Test precision is {}'.format(precision_score(selected_test[category], prediction)))
    print('Test f1-score is {}'.format(f1_score(selected_test[category], prediction)))
    print('Test recall is {}'.format(recall_score(selected_test[category], prediction)))
    print(confusion_matrix(selected_test[category], prediction))
    print('Test logarithmic loss is {}'.format(log_loss(selected_test[category], prediction)))
    print('Test ROC-AUC score is {}'.format(roc_auc_score(selected_test[category], prediction)))
    print("\n")

**Processing toxic comments...**
Test accuracy is 0.8549032479914971
Test precision is 0.38830196599734135
Test f1-score is 0.5445714566059952
Test recall is 0.9113300492610837
[[49145  8743]
 [  540  5550]]
Test logarithmic loss is 5.229817037281167
Test ROC-AUC score is 0.8801485099815645


**Processing severe_toxic comments...**
Test accuracy is 0.9716933946043953
Test precision is 0.14468503937007873
Test f1-score is 0.24510212588578573
Test recall is 0.8010899182561307
[[61873  1738]
 [   73   294]]
Test logarithmic loss is 1.020273473501691
Test ROC-AUC score is 0.8868837998945995


**Processing obscene comments...**
Test accuracy is 0.8974022320172559
Test precision is 0.3504424778761062
Test f1-score is 0.5063176895306859
Test recall is 0.9119479815768085
[[54048  6239]
 [  325  3366]]
Test logarithmic loss is 3.6979983876670883
Test ROC-AUC score is 0.9042298336732716


**Processing threat comments...**
Test accuracy is 0.9138141236049893
Test precision is 0.030957013974880595

Performance seems to deteriorate when using is_unbalance = True

## Naive Bayes (GaussianNB)

### Baseline Naive Bayes using BinaryRelevance

In [71]:
import warnings
warnings.filterwarnings("ignore")

classifier_nb = BinaryRelevance(GaussianNB())
# train
classifier_nb.fit(selected_train[features], selected_train[labels])
# predict
predictions_nb = classifier_nb.predict(selected_train[features])

In [72]:
# predict probability
predictions_proba_nb = classifier_nb.predict_proba(selected_train[features])

In [73]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_nb, predictions_proba_nb)

Accuracy score:  0.8792637759993984
Precision score:  0.5580383909328479
Recall score:  0.6486124565502308
F1 score:  0.577292473981047
Confusion matrix for label toxic:
[[140351   3926]
 [  5871   9423]]
Confusion matrix for label severe_toxic:
[[152083   5893]
 [   292   1303]]
Confusion matrix for label obscene:
[[146146   4976]
 [  2527   5922]]
Confusion matrix for label threat:
[[151647   7446]
 [   181    297]]
Confusion matrix for label insult:
[[146219   5475]
 [  2896   4981]]
Confusion matrix for label identity_hate:
[[150542   7624]
 [   566    839]]
Logarithmic Loss:  0.7660158339188156
ROC AUC score:  0.9312601005621751


In [74]:
# evaluation on test data
predictions_nb_test = classifier_nb.predict(selected_test[features])
predictions_proba_nb_test = classifier_nb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_nb_test, predictions_proba_nb_test)

Accuracy score:  0.8374284910437964
Precision score:  0.39914770987296333
Recall score:  0.7291350531107739
F1 score:  0.5031102162203966
Confusion matrix for label toxic:
[[53390  4498]
 [ 1571  4519]]
Confusion matrix for label severe_toxic:
[[59159  4452]
 [   68   299]]
Confusion matrix for label obscene:
[[55923  4364]
 [  942  2749]]
Confusion matrix for label threat:
[[59377  4390]
 [   53   158]]
Confusion matrix for label insult:
[[56254  4297]
 [ 1066  2361]]
Confusion matrix for label identity_hate:
[[58234  5032]
 [  227   485]]
Logarithmic Loss:  0.70881977326765
ROC AUC score:  0.9267265814152312


In [75]:
classifier_nb.get_params()

{'classifier': GaussianNB(),
 'classifier__priors': None,
 'classifier__var_smoothing': 1e-09,
 'require_dense': [True, True]}

### Baseline  Naive Bayes using ClassifierChain

In [76]:
import warnings
warnings.filterwarnings("ignore")

classifier_chain_nb = ClassifierChain(GaussianNB())
# train
classifier_chain_nb.fit(selected_train[features], selected_train[labels])
# predict
predictions_chain_nb = classifier_chain_nb.predict(selected_train[features])

In [77]:
# predict probability
predictions_proba_chain_nb = classifier_chain_nb.predict_proba(selected_train[features])

In [78]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_chain_nb, predictions_proba_chain_nb)

Accuracy score:  0.8788125661931052
Precision score:  0.5565655674617817
Recall score:  0.67795885805459
F1 score:  0.5823247163434749
Confusion matrix for label toxic:
[[140351   3926]
 [  5871   9423]]
Confusion matrix for label severe_toxic:
[[149947   8029]
 [   182   1413]]
Confusion matrix for label obscene:
[[145923   5199]
 [  2302   6147]]
Confusion matrix for label threat:
[[149029  10064]
 [   143    335]]
Confusion matrix for label insult:
[[145785   5909]
 [  2440   5437]]
Confusion matrix for label identity_hate:
[[148149  10017]
 [   365   1040]]
Logarithmic Loss:  0.6688595094005957
ROC AUC score:  0.931072059247107


In [79]:
# evaluation on test data
predictions_chain_nb_test = classifier_chain_nb.predict(selected_test[features])
predictions_proba_chain_nb_test = classifier_chain_nb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_chain_nb_test, predictions_proba_chain_nb_test)

Accuracy score:  0.8373503391790929
Precision score:  0.394723171539833
Recall score:  0.7618981928541868
F1 score:  0.5036299318614065
Confusion matrix for label toxic:
[[53390  4498]
 [ 1571  4519]]
Confusion matrix for label severe_toxic:
[[57894  5717]
 [   44   323]]
Confusion matrix for label obscene:
[[55651  4636]
 [  842  2849]]
Confusion matrix for label threat:
[[57428  6339]
 [   34   177]]
Confusion matrix for label insult:
[[55645  4906]
 [  848  2579]]
Confusion matrix for label identity_hate:
[[56704  6562]
 [  113   599]]
Logarithmic Loss:  0.6631063220237972
ROC AUC score:  0.9251122828319055


### Baseline  Naive Bayes using LabelPowerSet

In [80]:
import warnings
warnings.filterwarnings("ignore")

classifier_power_nb = LabelPowerset(GaussianNB())
# train
classifier_power_nb.fit(selected_train[features], selected_train[labels])
# predict
predictions_power_nb = classifier_power_nb.predict(selected_train[features])

In [81]:
# predict probability
predictions_proba_power_nb = classifier_power_nb.predict_proba(selected_train[features])

In [82]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_power_nb, predictions_proba_power_nb)

Accuracy score:  0.8656021457533011
Precision score:  0.5987943558266151
Recall score:  0.5372670807453416
F1 score:  0.553592035258963
Confusion matrix for label toxic:
[[140607   3670]
 [  8174   7120]]
Confusion matrix for label severe_toxic:
[[156415   1561]
 [   857    738]]
Confusion matrix for label obscene:
[[147546   3576]
 [  2595   5854]]
Confusion matrix for label threat:
[[154431   4662]
 [   189    289]]
Confusion matrix for label insult:
[[149165   2529]
 [  3482   4395]]
Confusion matrix for label identity_hate:
[[153987   4179]
 [   944    461]]
Logarithmic Loss:  0.7372777504710718
ROC AUC score:  0.9422719716163704


In [83]:
# evaluation on test data
predictions_power_nb_test = classifier_power_nb.predict(selected_test[features])
predictions_proba_power_nb_test = classifier_power_nb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_power_nb_test, predictions_proba_power_nb_test)

Accuracy score:  0.8111069430116603
Precision score:  0.43116617623536224
Recall score:  0.5779417850738033
F1 score:  0.4836571435450641
Confusion matrix for label toxic:
[[54235  3653]
 [ 2620  3470]]
Confusion matrix for label severe_toxic:
[[62171  1440]
 [  200   167]]
Confusion matrix for label obscene:
[[56730  3557]
 [ 1189  2502]]
Confusion matrix for label threat:
[[61769  1998]
 [   87   124]]
Confusion matrix for label insult:
[[58579  1972]
 [ 1594  1833]]
Confusion matrix for label identity_hate:
[[60119  3147]
 [  429   283]]
Logarithmic Loss:  0.6426103750949465
ROC AUC score:  0.9285302105506413


### Baseline  Naive Bayes using OneVsRest

In [84]:
# Using pipeline for applying naive bayes and one vs rest classifier
NaiveBayes_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(GaussianNB(), n_jobs=-1)),
            ])
for category in labels:
    print('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    NaiveBayes_pipeline.fit(selected_train[features], selected_train[category])
    
    # calculating test accuracy (on test dataset)
    prediction = NaiveBayes_pipeline.predict(selected_test[features])
    print('Test accuracy is {}'.format(accuracy_score(selected_test[category], prediction)))
    print('Test precision is {}'.format(precision_score(selected_test[category], prediction)))
    print('Test f1-score is {}'.format(f1_score(selected_test[category], prediction)))
    print('Test recall is {}'.format(recall_score(selected_test[category], prediction)))
    print(confusion_matrix(selected_test[category], prediction))
    print('Test logarithmic loss is {}'.format(log_loss(selected_test[category], prediction)))
    print('Test ROC-AUC score is {}'.format(roc_auc_score(selected_test[category], prediction)))
    print("\n")

**Processing toxic comments...**
Test accuracy is 0.9051392666229017
Test precision is 0.5011644671176666
Test f1-score is 0.598265704640233
Test recall is 0.7420361247947455
[[53390  4498]
 [ 1571  4519]]
Test logarithmic loss is 3.4191273940815905
Test ROC-AUC score is 0.8321671779308166


**Processing severe_toxic comments...**
Test accuracy is 0.9293507143080434
Test precision is 0.06293411913281415
Test f1-score is 0.11684251660805002
Test recall is 0.8147138964577657
[[59159  4452]
 [   68   299]]
Test logarithmic loss is 2.546458365669597
Test ROC-AUC score is 0.8723630006412014


**Processing obscene comments...**
Test accuracy is 0.9170652411766544
Test precision is 0.38647546745395756
Test f1-score is 0.5088855979266939
Test recall is 0.7447846112164725
[[55923  4364]
 [  942  2749]]
Test logarithmic loss is 2.989271700938692
Test ROC-AUC score is 0.836198764712189


**Processing threat comments...**
Test accuracy is 0.9305542530244771
Test precision is 0.034740545294635
Test

## Oversampled Data

In [10]:
oversampled_train = pd.read_csv('Data/train_oversampled.csv')

In [11]:
oversampled_train.head()

Unnamed: 0,toxic,obscene,insult,none,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,num_unique_words_scaled,verbs_vs_length,num_uppercase_scaled,...,139,143,156,157,170,198,clean_text,threat,identity_hate,severe_toxic
0,0,0,0,1,0.009393,0.181132,0.0,0.106329,0.041509,0.003426,...,-0.109984,0.089084,0.012651,-0.019501,-0.091946,-0.113697,explanation why the edits made under my userna...,0,0,0
1,0,0,0,1,0.000723,0.160714,0.008929,0.043038,0.026786,0.001612,...,9.4e-05,0.021135,0.039308,-0.067295,-0.079366,-0.044959,d'aww! he matches this background colour i am ...,0,0,0
2,0,0,0,1,0.007225,0.188841,0.0,0.091139,0.038627,0.000806,...,-0.146134,0.109907,0.036539,-0.053628,0.02753,-0.21123,"hey man, i am really not trying to edit war. i...",0,0,0
3,0,0,0,1,0.007948,0.175719,0.0,0.141772,0.036741,0.002217,...,-0.214832,0.17756,0.007675,-0.087473,0.106848,-0.015987,""" more i cannot make any real suggestions on i...",0,0,0
4,0,0,0,1,0.003613,0.208955,0.0,0.027848,0.059701,0.000403,...,-0.221076,0.163578,-0.06831,-0.031184,-0.01728,0.038566,"you, sir, are my hero. any chance you remember...",0,0,0


In [12]:
features = selected_train.columns[7:]
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Baseline Gradient Boosting using Oversampled Data

In [88]:
over_classifier_gb = BinaryRelevance(LGBMClassifier(random_state=0))
# train
over_classifier_gb.fit(oversampled_train[features], oversampled_train[labels])
# predict
over_predictions_gb = over_classifier_gb.predict(oversampled_train[features])

In [89]:
# predict probability
over_predictions_proba_gb = over_classifier_gb.predict_proba(oversampled_train[features])

In [90]:
# model evaluation
get_evaluation_score(oversampled_train[labels], over_predictions_gb, over_predictions_proba_gb)

Accuracy score:  0.9578733094938494
Precision score:  0.9919831149778372
Recall score:  0.9885839384132907
F1 score:  0.9902750946771441
Confusion matrix for label toxic:
[[147310   2315]
 [  4777 267587]]
Confusion matrix for label severe_toxic:
[[280313   1013]
 [  1468 139195]]
Confusion matrix for label obscene:
[[195917   1631]
 [  2507 221934]]
Confusion matrix for label threat:
[[170088   1213]
 [     0 250688]]
Confusion matrix for label insult:
[[192575   2388]
 [  2966 224060]]
Confusion matrix for label identity_hate:
[[245938   1734]
 [  3003 171314]]
Logarithmic Loss:  4.899558893409421
ROC AUC score:  0.9992298287770203


In [91]:
# evaluation on test data
over_predictions_gb_test = over_classifier_gb.predict(selected_test[features])
over_predictions_proba_gb_test = over_classifier_gb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], over_predictions_gb_test, over_predictions_proba_gb_test)

Accuracy score:  0.8685485635687268
Precision score:  0.5403090661382476
Recall score:  0.7023037660366948
F1 score:  0.6079441038356306
Confusion matrix for label toxic:
[[54290  3598]
 [ 1436  4654]]
Confusion matrix for label severe_toxic:
[[63007   604]
 [  161   206]]
Confusion matrix for label obscene:
[[58444  1843]
 [ 1064  2627]]
Confusion matrix for label threat:
[[63196   571]
 [   82   129]]
Confusion matrix for label insult:
[[58778  1773]
 [ 1214  2213]]
Confusion matrix for label identity_hate:
[[62328   938]
 [  359   353]]
Logarithmic Loss:  0.3095174706208548
ROC AUC score:  0.9556650844067958


Same as XGBoost: 
Oversampled Data - Improved performance on training set, but no improvement for testing set performance.

### Baseline Naive Bayes using Oversampled Data

In [92]:
import warnings
warnings.filterwarnings("ignore")

over_classifier_nb = BinaryRelevance(GaussianNB())
# train
over_classifier_nb.fit(oversampled_train[features], oversampled_train[labels])
# predict
over_predictions_nb = over_classifier_nb.predict(oversampled_train[features])

In [93]:
# predict probability
over_predictions_proba_nb = over_classifier_nb.predict_proba(oversampled_train[features])

In [94]:
# model evaluation
get_evaluation_score(oversampled_train[labels], over_predictions_nb, over_predictions_proba_nb)

Accuracy score:  0.5251582387218624
Precision score:  0.8769532316152626
Recall score:  0.7752468206644596
F1 score:  0.8222164811156505
Confusion matrix for label toxic:
[[145825   3800]
 [ 31174 241190]]
Confusion matrix for label severe_toxic:
[[243264  38062]
 [ 41921  98742]]
Confusion matrix for label obscene:
[[179992  17556]
 [ 45563 178878]]
Confusion matrix for label threat:
[[155518  15783]
 [ 66285 184403]]
Confusion matrix for label insult:
[[176488  18475]
 [ 45375 181651]]
Confusion matrix for label identity_hate:
[[195410  52262]
 [ 59501 114816]]
Logarithmic Loss:  8.847738013874869
ROC AUC score:  0.9260561377095832


In [95]:
# evaluation on test data
over_predictions_nb_test = over_classifier_nb.predict(selected_test[features])
over_predictions_proba_nb_test = over_classifier_nb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], over_predictions_nb_test, over_predictions_proba_nb_test)

Accuracy score:  0.8530588639844947
Precision score:  0.4968336364510059
Recall score:  0.6409159884121948
F1 score:  0.5472951002536629
Confusion matrix for label toxic:
[[54362  3526]
 [ 1736  4354]]
Confusion matrix for label severe_toxic:
[[61457  2154]
 [  115   252]]
Confusion matrix for label obscene:
[[58460  1827]
 [ 1551  2140]]
Confusion matrix for label threat:
[[59676  4091]
 [   55   156]]
Confusion matrix for label insult:
[[58530  2021]
 [ 1443  1984]]
Confusion matrix for label identity_hate:
[[60870  2396]
 [  306   406]]
Logarithmic Loss:  1.0061376129750363
ROC AUC score:  0.9288408883514321


Oversampled Data - Improved performance on test set, but worse score for training set performance.

# -- Best Models (For Ensemble Selection) --

Selection is based on prediction quality of test data.

## Light GBM (Binary Relevance)

In [96]:
classifier_gb = BinaryRelevance(LGBMClassifier(random_state=0))
# train
classifier_gb.fit(selected_train[features], selected_train[labels])
# predict
predictions_gb = classifier_gb.predict(selected_train[features])

In [97]:
# evaluation on test data
predictions_gb_test = classifier_gb.predict(selected_test[features])
predictions_proba_gb_test = classifier_gb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_gb_test, predictions_proba_gb_test)

Accuracy score:  0.8789896526931132
Precision score:  0.5829492803206758
Recall score:  0.6643674989653745
F1 score:  0.6174134411558834
Confusion matrix for label toxic:
[[54806  3082]
 [ 1502  4588]]
Confusion matrix for label severe_toxic:
[[63254   357]
 [  220   147]]
Confusion matrix for label obscene:
[[58600  1687]
 [ 1071  2620]]
Confusion matrix for label threat:
[[63381   386]
 [  156    55]]
Confusion matrix for label insult:
[[59351  1200]
 [ 1367  2060]]
Confusion matrix for label identity_hate:
[[62992   274]
 [  550   162]]
Logarithmic Loss:  0.3109336144201333
ROC AUC score:  0.9583558250719668


#### Export Model

In [104]:
classifier_gb

In [107]:
joblib.dump(classifier_gb, 'lgbm_binary.sav')

['lgbm_binary.sav']

## Light GBM (Classifier Chain)

In [98]:
classifier_chain_gb = ClassifierChain(LGBMClassifier(random_state=0))
# train
classifier_chain_gb.fit(selected_train[features], selected_train[labels])
# predict
predictions_chain_gb = classifier_chain_gb.predict(selected_train[features])

In [99]:
# evaluation on test data
predictions_chain_gb_test = classifier_chain_gb.predict(selected_test[features])
predictions_proba_chain_gb_test = classifier_chain_gb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_chain_gb_test, predictions_proba_chain_gb_test)

Accuracy score:  0.8797868017130889
Precision score:  0.5570072116149407
Recall score:  0.6842323079045386
F1 score:  0.612636197875275
Confusion matrix for label toxic:
[[54806  3082]
 [ 1502  4588]]
Confusion matrix for label severe_toxic:
[[63268   343]
 [  209   158]]
Confusion matrix for label obscene:
[[58263  2024]
 [  965  2726]]
Confusion matrix for label threat:
[[63426   341]
 [  150    61]]
Confusion matrix for label insult:
[[58812  1739]
 [ 1222  2205]]
Confusion matrix for label identity_hate:
[[62931   335]
 [  530   182]]
Logarithmic Loss:  0.33110831807467267
ROC AUC score:  0.9556995692510866


#### Export Model

In [108]:
classifier_chain_gb

In [109]:
joblib.dump(classifier_chain_gb, 'lgbm_chain.sav')

['lgbm_chain.sav']

## Naive Bayes (Binary Relevance using Oversampled Data)

In [14]:
import warnings
warnings.filterwarnings("ignore")

over_classifier_nb = BinaryRelevance(GaussianNB())
# train
over_classifier_nb.fit(oversampled_train[features], oversampled_train[labels])
# predict
over_predictions_nb = over_classifier_nb.predict(oversampled_train[features])

In [15]:
# evaluation on test data
over_predictions_nb_test = over_classifier_nb.predict(selected_test[features])
over_predictions_proba_nb_test = over_classifier_nb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], over_predictions_nb_test, over_predictions_proba_nb_test)

Accuracy score:  0.8530588639844947
Precision score:  0.4968336364510059
Recall score:  0.6409159884121948
F1 score:  0.5472951002536629
Confusion matrix for label toxic:
[[54362  3526]
 [ 1736  4354]]
Confusion matrix for label severe_toxic:
[[61457  2154]
 [  115   252]]
Confusion matrix for label obscene:
[[58460  1827]
 [ 1551  2140]]
Confusion matrix for label threat:
[[59676  4091]
 [   55   156]]
Confusion matrix for label insult:
[[58530  2021]
 [ 1443  1984]]
Confusion matrix for label identity_hate:
[[60870  2396]
 [  306   406]]
Logarithmic Loss:  1.0061376129750363
ROC AUC score:  0.9288408883514321


In [17]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_test[label], over_predictions_nb_test[:,i].toarray(), over_predictions_proba_nb_test[:,i].toarray())
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.9177529775860452
Precision score:  0.5525380710659898
Recall score:  0.7149425287356321
F1 score:  0.6233357193987115
Confusion matrix:
[[54362  3526]
 [ 1736  4354]]
Logarithmic Loss:  2.1116280670827967
ROC AUC score:  0.9287029250640612


fitting for label severe_toxic
Accuracy score:  0.9645346837975554
Precision score:  0.10473815461346633
Recall score:  0.6866485013623979
F1 score:  0.18175261449693472
Confusion matrix:
[[61457  2154]
 [  115   252]]
Logarithmic Loss:  0.6988973199720871
ROC AUC score:  0.948290330057476


fitting for label obscene
Accuracy score:  0.9472006002063209
Precision score:  0.5394504663473658
Recall score:  0.5797886751557844
F1 score:  0.5588926612692608
Confusion matrix:
[[58460  1827]
 [ 1551  2140]]
Logarithmic Loss:  0.8155270784029294
ROC AUC score:  0.9279784437770856


fitting for label threat
Accuracy score:  0.9351964737878645
Precision score:  0.03673181068989875
Recall score:  0.7393364928909952
F1

#### Export Model

In [111]:
over_classifier_nb

In [110]:
joblib.dump(over_classifier_nb, 'nb_binary.sav')

['nb_binary.sav']

## Naive Bayes (Classifier Chain using Oversampled Data)

In [21]:
import warnings
warnings.filterwarnings("ignore")

over_classifier_chain_nb = ClassifierChain(GaussianNB())
# train
over_classifier_chain_nb.fit(oversampled_train[features], oversampled_train[labels])
# predict
over_predictions_chain_nb = over_classifier_chain_nb.predict(oversampled_train[features])

In [22]:
# evaluation on test data
over_predictions_chain_nb_test = over_classifier_chain_nb.predict(selected_test[features])
over_predictions_proba_chain_nb_test = over_classifier_chain_nb.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], over_predictions_chain_nb_test, over_predictions_proba_chain_nb_test)

Accuracy score:  0.8548407264997343
Precision score:  0.4925368479626326
Recall score:  0.6545040695268313
F1 score:  0.5458599646848109
Confusion matrix for label toxic:
[[54362  3526]
 [ 1736  4354]]
Confusion matrix for label severe_toxic:
[[60013  3598]
 [   65   302]]
Confusion matrix for label obscene:
[[58385  1902]
 [ 1500  2191]]
Confusion matrix for label threat:
[[59473  4294]
 [   52   159]]
Confusion matrix for label insult:
[[58459  2092]
 [ 1426  2001]]
Confusion matrix for label identity_hate:
[[59878  3388]
 [  230   482]]
Logarithmic Loss:  1.1197576556581255
ROC AUC score:  0.9362249662469333


In [23]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_test[label], over_predictions_chain_nb_test[:,i].toarray(), over_predictions_proba_chain_nb_test[:,i].toarray())
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.9177529775860452
Precision score:  0.5525380710659898
Recall score:  0.7149425287356321
F1 score:  0.6233357193987115
Confusion matrix:
[[54362  3526]
 [ 1736  4354]]
Logarithmic Loss:  2.1116280670827967
ROC AUC score:  0.9287029250640612


fitting for label severe_toxic
Accuracy score:  0.9427459439182219
Precision score:  0.07743589743589743
Recall score:  0.8228882833787466
F1 score:  0.14155144129364894
Confusion matrix:
[[60013  3598]
 [   65   302]]
Logarithmic Loss:  1.036284804393332
ROC AUC score:  0.9524692338741303


fitting for label obscene
Accuracy score:  0.9468254712557441
Precision score:  0.535304177864647
Recall score:  0.5936060688160391
F1 score:  0.56294964028777
Confusion matrix:
[[58385  1902]
 [ 1500  2191]]
Logarithmic Loss:  1.1957921970680645
ROC AUC score:  0.9441230893226226


fitting for label threat
Accuracy score:  0.9320703991997249
Precision score:  0.03570626543902987
Recall score:  0.7535545023696683
F1 sc

#### Export Model

In [112]:
over_classifier_chain_nb

In [113]:
joblib.dump(over_classifier_chain_nb, 'nb_chain.sav')

['nb_chain.sav']

Naive Bayes Classifier has almost no hyperparameters to tune, so it usually generalizes well. One thing to note is that due to the feature independence assumption, the class probabilities output by Naive Bayes can be pretty inaccurate.

# Hyperparameter Tuning (For LGBM)

* num_leaves (int, optional (default=31)) – Maximum tree leaves for base learners.

* max_depth (int, optional (default=-1)) – Maximum tree depth for base learners, <=0 means no limit.

* learning_rate (float, optional (default=0.1)) – Boosting learning rate. You can use callbacks parameter of fit method to shrink/adapt learning rate in training using reset_parameter callback. Note, that this will ignore the learning_rate argument in training.

* n_estimators (int, optional (default=100)) – Number of boosted trees to fit.

* subsample_for_bin (int, optional (default=200000)) – Number of samples for constructing bins.
scale_pos_weight [default=1] A value greater than 0 should be used in case of high-class imbalance as it helps in faster convergence.

* scale_pos_weight [default=1] A value greater than 0 should be used in case of high-class imbalance as it helps in faster convergence.

References: 
* https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
* https://lightgbm.readthedocs.io/en/latest/Parameters.html
* https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

### LGBM (Binary Relevance)

In [122]:
BinaryRelevance(classifier=LGBMClassifier(random_state=0)).get_params().keys()

dict_keys(['classifier', 'classifier__boosting_type', 'classifier__class_weight', 'classifier__colsample_bytree', 'classifier__importance_type', 'classifier__learning_rate', 'classifier__max_depth', 'classifier__min_child_samples', 'classifier__min_child_weight', 'classifier__min_split_gain', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__num_leaves', 'classifier__objective', 'classifier__random_state', 'classifier__reg_alpha', 'classifier__reg_lambda', 'classifier__silent', 'classifier__subsample', 'classifier__subsample_for_bin', 'classifier__subsample_freq', 'require_dense'])

### Tune scale_pos_weight

In [140]:
param_test = {
 'classifier__scale_pos_weight':range(1,18,4)
}

lgbm_tuned = BinaryRelevance(LGBMClassifier(random_state=0))
gsearch = GridSearchCV(estimator=lgbm_tuned, param_grid=param_test, scoring='f1_weighted', cv=5, verbose=3)
gsearch.fit(selected_train[features], selected_train[labels])

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ....classifier__scale_pos_weight=1;, score=0.690 total time=   7.8s
[CV 2/5] END ....classifier__scale_pos_weight=1;, score=0.688 total time=   7.9s
[CV 3/5] END ....classifier__scale_pos_weight=1;, score=0.694 total time=   8.1s
[CV 4/5] END ....classifier__scale_pos_weight=1;, score=0.695 total time=   8.0s
[CV 5/5] END ....classifier__scale_pos_weight=1;, score=0.680 total time=   7.9s
[CV 1/5] END ....classifier__scale_pos_weight=5;, score=0.678 total time=   8.2s
[CV 2/5] END ....classifier__scale_pos_weight=5;, score=0.676 total time=   8.4s
[CV 3/5] END ....classifier__scale_pos_weight=5;, score=0.683 total time=   8.3s
[CV 4/5] END ....classifier__scale_pos_weight=5;, score=0.676 total time=   8.2s
[CV 5/5] END ....classifier__scale_pos_weight=5;, score=0.671 total time=   8.2s
[CV 1/5] END ....classifier__scale_pos_weight=9;, score=0.641 total time=   8.5s
[CV 2/5] END ....classifier__scale_pos_weight=9;,

In [130]:
gsearch.best_params_, gsearch.best_score_

({'classifier__scale_pos_weight': 1}, 0.6894672664194945)

No adjustment on default scale_pos_weight needed, does not improve model performance.

### Tune n_classifiers

In [123]:
param_test = {'classifier__n_estimators':range(20,81,10)}

lgbm_tuned = BinaryRelevance(LGBMClassifier(random_state=0))
gsearch = GridSearchCV(estimator=lgbm_tuned, param_grid=param_test, scoring='f1_weighted', cv=5, verbose=3)

gsearch.fit(selected_train[features],selected_train[labels])

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END .......classifier__n_estimators=20;, score=0.660 total time=   2.7s
[CV 2/5] END .......classifier__n_estimators=20;, score=0.656 total time=   2.6s
[CV 3/5] END .......classifier__n_estimators=20;, score=0.667 total time=   2.7s
[CV 4/5] END .......classifier__n_estimators=20;, score=0.670 total time=   2.6s
[CV 5/5] END .......classifier__n_estimators=20;, score=0.658 total time=   2.6s
[CV 1/5] END .......classifier__n_estimators=30;, score=0.675 total time=   3.3s
[CV 2/5] END .......classifier__n_estimators=30;, score=0.669 total time=   3.3s
[CV 3/5] END .......classifier__n_estimators=30;, score=0.678 total time=   3.4s
[CV 4/5] END .......classifier__n_estimators=30;, score=0.685 total time=   3.4s
[CV 5/5] END .......classifier__n_estimators=30;, score=0.673 total time=   3.3s
[CV 1/5] END .......classifier__n_estimators=40;, score=0.680 total time=   4.0s
[CV 2/5] END .......classifier__n_estimators=40;,

In [126]:
gsearch.best_params_, gsearch.best_score_

({'classifier__n_estimators': 80}, 0.6883177239570974)

n_estimators = 80 will be used for the subsequent tuning process.

### Tune max_depth & min_samples_split

In [137]:
param_test = {'classifier__max_depth':range(4,10,2), 'classifier__min_split_gain':np.arange(0,0.4,0.1)}
lgbm_tuned = BinaryRelevance(LGBMClassifier(random_state=0, n_estimators=80))

gsearch = GridSearchCV(estimator=lgbm_tuned, param_grid=param_test, scoring='f1_weighted', cv=5, verbose=3)

gsearch.fit(selected_train[features],selected_train[labels])

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END classifier__max_depth=4, classifier__min_split_gain=0.0;, score=0.682 total time=   4.0s
[CV 2/5] END classifier__max_depth=4, classifier__min_split_gain=0.0;, score=0.685 total time=   4.1s
[CV 3/5] END classifier__max_depth=4, classifier__min_split_gain=0.0;, score=0.689 total time=   4.0s
[CV 4/5] END classifier__max_depth=4, classifier__min_split_gain=0.0;, score=0.689 total time=   4.1s
[CV 5/5] END classifier__max_depth=4, classifier__min_split_gain=0.0;, score=0.677 total time=   4.1s
[CV 1/5] END classifier__max_depth=4, classifier__min_split_gain=0.1;, score=0.681 total time=   4.0s
[CV 2/5] END classifier__max_depth=4, classifier__min_split_gain=0.1;, score=0.683 total time=   4.0s
[CV 3/5] END classifier__max_depth=4, classifier__min_split_gain=0.1;, score=0.689 total time=   4.0s
[CV 4/5] END classifier__max_depth=4, classifier__min_split_gain=0.1;, score=0.689 total time=   4.0s
[CV 5/5] END classifi

In [138]:
gsearch.best_params_, gsearch.best_score_

({'classifier__max_depth': 6, 'classifier__min_split_gain': 0.2},
 0.6900204063607662)

max_depth = 6 & min_split_gain = 0.2 will be used for the subsequent tuning process.

In [None]:
### Tune min_child_samples

In [141]:
param_test = {'classifier__min_child_samples': range(20,71,10)}
lgbm_tuned = BinaryRelevance(LGBMClassifier(random_state=0, n_estimators=80, max_depth=6, min_split_gain=0.2))

gsearch = GridSearchCV(estimator=lgbm_tuned, param_grid=param_test, scoring='f1_weighted', cv=5, verbose=3)

gsearch.fit(selected_train[features],selected_train[labels])

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ..classifier__min_child_samples=20;, score=0.685 total time=   6.1s
[CV 2/5] END ..classifier__min_child_samples=20;, score=0.690 total time=   6.0s
[CV 3/5] END ..classifier__min_child_samples=20;, score=0.695 total time=   6.0s
[CV 4/5] END ..classifier__min_child_samples=20;, score=0.696 total time=   5.9s
[CV 5/5] END ..classifier__min_child_samples=20;, score=0.684 total time=   6.0s
[CV 1/5] END ..classifier__min_child_samples=30;, score=0.689 total time=   6.0s
[CV 2/5] END ..classifier__min_child_samples=30;, score=0.687 total time=   5.9s
[CV 3/5] END ..classifier__min_child_samples=30;, score=0.693 total time=   5.9s
[CV 4/5] END ..classifier__min_child_samples=30;, score=0.695 total time=   6.0s
[CV 5/5] END ..classifier__min_child_samples=30;, score=0.684 total time=   6.0s
[CV 1/5] END ..classifier__min_child_samples=40;, score=0.684 total time=   5.8s
[CV 2/5] END ..classifier__min_child_samples=40;,

In [143]:
gsearch.best_params_, gsearch.best_score_

({'classifier__min_child_samples': 20}, 0.6900204063607662)

No adjustment on default min_child_samples needed, does not improve model performance.

### Tune subsample

In [147]:
param_test = {
 'classifier__subsample':[0.6,0.7,0.75,0.8,0.85,0.9]
}

lgbm_tuned = BinaryRelevance(LGBMClassifier(random_state=0, n_estimators=80, max_depth=6, min_split_gain=0.2))
gsearch = GridSearchCV(estimator=lgbm_tuned, param_grid=param_test, scoring='f1_weighted', cv=5, verbose=3)
gsearch.fit(selected_train[features], selected_train[labels])

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END .........classifier__subsample=0.6;, score=0.685 total time=   6.0s
[CV 2/5] END .........classifier__subsample=0.6;, score=0.690 total time=   6.1s
[CV 3/5] END .........classifier__subsample=0.6;, score=0.695 total time=   6.3s
[CV 4/5] END .........classifier__subsample=0.6;, score=0.696 total time=   6.2s
[CV 5/5] END .........classifier__subsample=0.6;, score=0.684 total time=   6.2s
[CV 1/5] END .........classifier__subsample=0.7;, score=0.685 total time=   5.9s
[CV 2/5] END .........classifier__subsample=0.7;, score=0.690 total time=   6.0s
[CV 3/5] END .........classifier__subsample=0.7;, score=0.695 total time=   5.8s
[CV 4/5] END .........classifier__subsample=0.7;, score=0.696 total time=   6.2s
[CV 5/5] END .........classifier__subsample=0.7;, score=0.684 total time=   6.2s
[CV 1/5] END ........classifier__subsample=0.75;, score=0.685 total time=   6.3s
[CV 2/5] END ........classifier__subsample=0.75;,

In [145]:
gsearch.best_params_, gsearch.best_score_

({'classifier__subsample': 0.6}, 0.6900204063607662)

subsample = 0.6 will be used for the subsequent tuning process.

### Tune learning_rate

In [148]:
param_test = {
 'classifier__learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5]
}

lgbm_tuned = BinaryRelevance(LGBMClassifier(random_state=0, n_estimators=80, max_depth=6, min_split_gain=0.2, subsample=0.6))
gsearch = GridSearchCV(estimator=lgbm_tuned, param_grid=param_test, scoring='f1_weighted', cv=5, verbose=3)
gsearch.fit(selected_train[features], selected_train[labels])

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .....classifier__learning_rate=0.1;, score=0.685 total time=   5.9s
[CV 2/5] END .....classifier__learning_rate=0.1;, score=0.690 total time=   5.9s
[CV 3/5] END .....classifier__learning_rate=0.1;, score=0.695 total time=   6.0s
[CV 4/5] END .....classifier__learning_rate=0.1;, score=0.696 total time=   6.0s
[CV 5/5] END .....classifier__learning_rate=0.1;, score=0.684 total time=   5.9s
[CV 1/5] END .....classifier__learning_rate=0.2;, score=0.682 total time=   5.5s
[CV 2/5] END .....classifier__learning_rate=0.2;, score=0.683 total time=   5.5s
[CV 3/5] END .....classifier__learning_rate=0.2;, score=0.687 total time=   5.6s
[CV 4/5] END .....classifier__learning_rate=0.2;, score=0.685 total time=   5.7s
[CV 5/5] END .....classifier__learning_rate=0.2;, score=0.675 total time=   5.6s
[CV 1/5] END .....classifier__learning_rate=0.3;, score=0.671 total time=   5.1s
[CV 2/5] END .....classifier__learning_rate=0.3;,

In [149]:
gsearch.best_params_, gsearch.best_score_

({'classifier__learning_rate': 0.1}, 0.6900204063607662)

No adjustment on default learning_rate needed, does not improve model performance.

## Tuned Light GBM (Binary Relevance)

In [151]:
classifier_tuned = BinaryRelevance(LGBMClassifier(random_state=0, n_estimators=80, max_depth=6, min_split_gain=0.2, subsample=0.6))
# train
classifier_tuned.fit(selected_train[features], selected_train[labels])
# predict
predictions_tuned = classifier_tuned.predict(selected_train[features])

In [152]:
# predict probability
predictions_proba_tuned = classifier_tuned.predict_proba(selected_train[features])

In [153]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_tuned, predictions_proba_tuned)

Accuracy score:  0.92831404202518
Precision score:  0.8469211827544635
Recall score:  0.6922331756795259
F1 score:  0.7593649476566278
Confusion matrix for label toxic:
[[142791   1486]
 [  4696  10598]]
Confusion matrix for label severe_toxic:
[[157707    269]
 [   691    904]]
Confusion matrix for label obscene:
[[150017   1105]
 [  1976   6473]]
Confusion matrix for label threat:
[[159069     24]
 [    84    394]]
Confusion matrix for label insult:
[[150324   1370]
 [  2468   5409]]
Confusion matrix for label identity_hate:
[[158040    126]
 [   887    518]]
Logarithmic Loss:  0.25086807424923746
ROC AUC score:  0.982274361564088


In [154]:
# evaluation on test data
predictions_tuned_test = classifier_tuned.predict(selected_test[features])
predictions_proba_tuned_test = classifier_tuned.predict_proba(selected_test[features])
get_evaluation_score(selected_test[labels], predictions_tuned_test, predictions_proba_tuned_test)

Accuracy score:  0.8824127043671262
Precision score:  0.5898061325308035
Recall score:  0.6630569733756381
F1 score:  0.6210033648505818
Confusion matrix for label toxic:
[[54900  2988]
 [ 1544  4546]]
Confusion matrix for label severe_toxic:
[[63257   354]
 [  208   159]]
Confusion matrix for label obscene:
[[58659  1628]
 [ 1061  2630]]
Confusion matrix for label threat:
[[63565   202]
 [  157    54]]
Confusion matrix for label insult:
[[59355  1196]
 [ 1359  2068]]
Confusion matrix for label identity_hate:
[[63006   260]
 [  556   156]]
Logarithmic Loss:  0.3029266280750555
ROC AUC score:  0.958930392026325


In [155]:
classifier_tuned

In [156]:
joblib.dump(classifier_tuned, 'lgbm_tuned.sav')

['lgbm_tuned.sav']

Tuned model performed better than baseline model.

## Tuned Light GBM on Each Label

In [157]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [170]:
def predictions(model):
    predictions = model.predict(selected_train[features])
    predictions_proba = model.predict_proba(selected_train[features])
    print('Model Performance on training set:')
    get_evaluation_score_single_class(selected_train[label], predictions, predictions_proba[:, 1])
    
    predictions_test = model.predict(selected_test[features])
    predictions_proba_test = model.predict_proba(selected_test[features])
    print('\nModel Performance on test set:')
    get_evaluation_score_single_class(selected_test[label], predictions_test, predictions_proba_test[:, 1])

### Toxic Label

In [171]:
param_test_each_label = {
 'max_depth':range(4,10,2),
 'min_child_samples':range(20,71,10)
}

label = labels[0]
LGBM_tuned_toxic = LGBMClassifier(random_state=0, n_estimators=80, subsample=0.6)
gsearch_toxic = GridSearchCV(estimator=LGBM_tuned_toxic, param_grid=param_test_each_label, scoring='f1', cv=5, verbose=3)
gsearch_toxic.fit(selected_train[features], selected_train[label])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .max_depth=4, min_child_samples=20;, score=0.737 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=20;, score=0.746 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=20;, score=0.741 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=20;, score=0.738 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=20;, score=0.736 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=30;, score=0.737 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=30;, score=0.747 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=30;, score=0.741 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=30;, score=0.739 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=30;, score=0.735 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=40;, score=0.740 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=40;

In [172]:
gsearch_toxic.best_params_, gsearch_toxic.best_score_

({'max_depth': 8, 'min_child_samples': 40}, 0.7478995266817143)

In [173]:
predictions(gsearch_toxic)

Model Performance on training set:
Accuracy score:  0.9616785004794104
Precision score:  0.8766516208453016
Recall score:  0.6984438341833399
F1 score:  0.7774664289093489
Confusion matrix:
[[142774   1503]
 [  4612  10682]]
Logarithmic Loss:  0.10499893360215283
ROC AUC score:  0.9739857953396589

Model Performance on test set:
Accuracy score:  0.928491043796305
Precision score:  0.5996579397447704
Recall score:  0.7484400656814449
F1 score:  0.6658388722518443
Confusion matrix:
[[54845  3043]
 [ 1532  4558]]
Logarithmic Loss:  0.17421235234625868
ROC AUC score:  0.9525780361443105


In [174]:
gsearch_toxic

In [182]:
joblib.dump(gsearch_toxic, 'lgbm_toxic.sav')

['lgbm_toxic.sav']

### Severe Toxic Label

In [176]:
label = labels[1]
LGBM_tuned_severe_toxic = LGBMClassifier(random_state=0, n_estimators=80, subsample=0.6)
gsearch_severe_toxic = GridSearchCV(estimator=LGBM_tuned_severe_toxic, param_grid=param_test_each_label, scoring='f1', cv=5, verbose=3)
gsearch_severe_toxic.fit(selected_train[features], selected_train[label])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .max_depth=4, min_child_samples=20;, score=0.421 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=20;, score=0.377 total time=   0.6s
[CV 3/5] END .max_depth=4, min_child_samples=20;, score=0.348 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=20;, score=0.370 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=20;, score=0.347 total time=   0.6s
[CV 1/5] END .max_depth=4, min_child_samples=30;, score=0.421 total time=   0.6s
[CV 2/5] END .max_depth=4, min_child_samples=30;, score=0.388 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=30;, score=0.344 total time=   0.6s
[CV 4/5] END .max_depth=4, min_child_samples=30;, score=0.372 total time=   0.6s
[CV 5/5] END .max_depth=4, min_child_samples=30;, score=0.351 total time=   0.6s
[CV 1/5] END .max_depth=4, min_child_samples=40;, score=0.416 total time=   0.6s
[CV 2/5] END .max_depth=4, min_child_samples=40;

In [177]:
gsearch_severe_toxic.best_params_, gsearch_severe_toxic.best_score_

({'max_depth': 4, 'min_child_samples': 50}, 0.37754315383769904)

In [179]:
predictions(gsearch_severe_toxic)

Model Performance on training set:
Accuracy score:  0.9917967550494764
Precision score:  0.6388349514563106
Recall score:  0.41253918495297803
F1 score:  0.5013333333333332
Confusion matrix:
[[157604    372]
 [   937    658]]
Logarithmic Loss:  0.019306911541052105
ROC AUC score:  0.9930876111811278

Model Performance on test set:
Accuracy score:  0.9917159023414298
Precision score:  0.3373253493013972
Recall score:  0.4604904632152589
F1 score:  0.38940092165898615
Confusion matrix:
[[63279   332]
 [  198   169]]
Logarithmic Loss:  0.02075891428358873
ROC AUC score:  0.9827884591619267


In [180]:
gsearch_severe_toxic

In [183]:
joblib.dump(gsearch_severe_toxic, 'lgbm_severe_toxic.sav')

['lgbm_severe_toxic.sav']

### Obscene Label

In [184]:
label = labels[2]
LGBM_tuned_obscene = LGBMClassifier(random_state=0, n_estimators=80, subsample=0.6)
gsearch_obscene = GridSearchCV(estimator=LGBM_tuned_severe_toxic, param_grid=param_test_each_label, scoring='f1', cv=5, verbose=3)
gsearch_obscene.fit(selected_train[features], selected_train[label])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .max_depth=4, min_child_samples=20;, score=0.748 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=20;, score=0.761 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=20;, score=0.762 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=20;, score=0.772 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=20;, score=0.756 total time=   0.6s
[CV 1/5] END .max_depth=4, min_child_samples=30;, score=0.751 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=30;, score=0.761 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=30;, score=0.765 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=30;, score=0.772 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=30;, score=0.754 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=40;, score=0.747 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=40;

In [185]:
gsearch_obscene.best_params_, gsearch_obscene.best_score_

({'max_depth': 6, 'min_child_samples': 40}, 0.7636583840792126)

In [186]:
predictions(gsearch_obscene)

Model Performance on training set:
Accuracy score:  0.9807671820067556
Precision score:  0.8547876549723028
Recall score:  0.767073026393656
F1 score:  0.8085584180649991
Confusion matrix:
[[150021   1101]
 [  1968   6481]]
Logarithmic Loss:  0.05145552473958767
ROC AUC score:  0.9897485093741291

Model Performance on test set:
Accuracy score:  0.957563537466004
Precision score:  0.6139654367118169
Recall score:  0.7122730967217556
F1 score:  0.6594757305907438
Confusion matrix:
[[58634  1653]
 [ 1062  2629]]
Logarithmic Loss:  0.10232141433006615
ROC AUC score:  0.9658726145559758


In [187]:
gsearch_obscene

In [188]:
joblib.dump(gsearch_obscene, 'lgbm_obscene.sav')

['lgbm_obscene.sav']

### Threat Label

In [189]:
label = labels[3]
LGBM_tuned_threat = LGBMClassifier(random_state=0, n_estimators=80, subsample=0.6)
gsearch_threat = GridSearchCV(estimator=LGBM_tuned_threat, param_grid=param_test_each_label, scoring='f1', cv=5, verbose=3)
gsearch_threat.fit(selected_train[features], selected_train[label])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .max_depth=4, min_child_samples=20;, score=0.266 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=20;, score=0.258 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=20;, score=0.239 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=20;, score=0.117 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=20;, score=0.221 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=30;, score=0.252 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=30;, score=0.224 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=30;, score=0.237 total time=   0.6s
[CV 4/5] END .max_depth=4, min_child_samples=30;, score=0.147 total time=   0.6s
[CV 5/5] END .max_depth=4, min_child_samples=30;, score=0.217 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=40;, score=0.278 total time=   0.6s
[CV 2/5] END .max_depth=4, min_child_samples=40;

In [190]:
gsearch_threat.best_params_, gsearch_threat.best_score_

({'max_depth': 8, 'min_child_samples': 60}, 0.2684985491598624)

In [191]:
predictions(gsearch_threat)

Model Performance on training set:
Accuracy score:  0.9995801242080328
Precision score:  0.9812646370023419
Recall score:  0.8765690376569037
F1 score:  0.9259668508287293
Confusion matrix:
[[159085      8]
 [    59    419]]
Logarithmic Loss:  0.002081896824715256
ROC AUC score:  0.9999628582287348

Model Performance on test set:
Accuracy score:  0.9958266904248335
Precision score:  0.325
Recall score:  0.24644549763033174
F1 score:  0.28032345013477084
Confusion matrix:
[[63659   108]
 [  159    52]]
Logarithmic Loss:  0.01811735016793471
ROC AUC score:  0.9615455393476711


In [192]:
gsearch_threat

In [193]:
joblib.dump(gsearch_threat, 'lgbm_threat.sav')

['lgbm_threat.sav']

### Insult Label

In [194]:
label = labels[4]
LGBM_tuned_insult = LGBMClassifier(random_state=0, n_estimators=80, subsample=0.6)
gsearch_insult = GridSearchCV(estimator=LGBM_tuned_insult, param_grid=param_test_each_label, scoring='f1', cv=5, verbose=3)
gsearch_insult.fit(selected_train[features], selected_train[label])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .max_depth=4, min_child_samples=20;, score=0.678 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=20;, score=0.669 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=20;, score=0.684 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=20;, score=0.697 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=20;, score=0.673 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=30;, score=0.673 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=30;, score=0.667 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=30;, score=0.683 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=30;, score=0.696 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=30;, score=0.673 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=40;, score=0.678 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=40;

In [195]:
gsearch_insult.best_params_, gsearch_insult.best_score_

({'max_depth': 8, 'min_child_samples': 50}, 0.6847354712161283)

In [196]:
predictions(gsearch_insult)

Model Performance on training set:
Accuracy score:  0.9762801511552851
Precision score:  0.8005287896592245
Recall score:  0.6918877745334518
F1 score:  0.742254000680967
Confusion matrix:
[[150336   1358]
 [  2427   5450]]
Logarithmic Loss:  0.06063440716111601
ROC AUC score:  0.985531557830589

Model Performance on test set:
Accuracy score:  0.960048766763575
Precision score:  0.6354587869362364
Recall score:  0.5961482346075284
F1 score:  0.6151761517615175
Confusion matrix:
[[59379  1172]
 [ 1384  2043]]
Logarithmic Loss:  0.09807094763477005
ROC AUC score:  0.9627195714222041


In [197]:
gsearch_insult

In [198]:
joblib.dump(gsearch_insult, 'lgbm_insult.sav')

['lgbm_insult.sav']

### Identity Hate Label

In [199]:
label = labels[5]
LGBM_tuned_hate = LGBMClassifier(random_state=0, n_estimators=80, subsample=0.6)
gsearch_hate = GridSearchCV(estimator=LGBM_tuned_hate, param_grid=param_test_each_label, scoring='f1', cv=5, verbose=3)
gsearch_hate.fit(selected_train[features], selected_train[label])

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .max_depth=4, min_child_samples=20;, score=0.198 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=20;, score=0.169 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=20;, score=0.198 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=20;, score=0.163 total time=   0.7s
[CV 5/5] END .max_depth=4, min_child_samples=20;, score=0.144 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=30;, score=0.182 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=30;, score=0.134 total time=   0.7s
[CV 3/5] END .max_depth=4, min_child_samples=30;, score=0.180 total time=   0.7s
[CV 4/5] END .max_depth=4, min_child_samples=30;, score=0.181 total time=   0.6s
[CV 5/5] END .max_depth=4, min_child_samples=30;, score=0.139 total time=   0.7s
[CV 1/5] END .max_depth=4, min_child_samples=40;, score=0.203 total time=   0.7s
[CV 2/5] END .max_depth=4, min_child_samples=40;

In [200]:
gsearch_hate.best_params_, gsearch_hate.best_score_

({'max_depth': 8, 'min_child_samples': 30}, 0.24064688260100464)

In [201]:
predictions(gsearch_hate)

Model Performance on training set:
Accuracy score:  0.9940653376866724
Precision score:  0.8347953216374269
Recall score:  0.40640569395017795
F1 score:  0.546673049305888
Confusion matrix:
[[158053    113]
 [   834    571]]
Logarithmic Loss:  0.014040202796486696
ROC AUC score:  0.9969117247553283

Model Performance on test set:
Accuracy score:  0.9878395698521367
Precision score:  0.4175
Recall score:  0.2345505617977528
F1 score:  0.30035971223021585
Confusion matrix:
[[63033   233]
 [  545   167]]
Logarithmic Loss:  0.03950818531908332
ROC AUC score:  0.9583303215565313


In [202]:
gsearch_hate

In [203]:
joblib.dump(gsearch_hate, 'lgbm_hate.sav')

['lgbm_hate.sav']

## -- end --