## Read Data

In [33]:
import pandas as pd
import numpy as np

# for reg model training
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

# for random forest
from sklearn.ensemble import RandomForestClassifier

# for multi label classification
from sklearn.datasets import make_multilabel_classification
from skmultilearn.problem_transform import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.problem_transform import BinaryRelevance

# for evaluation metrics
%run -i helper_functions.py

# for model export
import joblib

import warnings
warnings.filterwarnings("ignore")

In [2]:
selected_train = pd.read_csv('Data/selected_train.csv')
selected_test = pd.read_csv('Data/selected_test.csv')

In [3]:
selected_train.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,...,105,114,132,135,139,143,156,157,170,198
0,0,0,0,0,0,0,1,0.009393,0.181132,0.0,...,-0.09132,-0.017755,0.003997,-0.199211,-0.109984,0.089084,0.012651,-0.019501,-0.091946,-0.113697
1,0,0,0,0,0,0,1,0.000723,0.160714,0.008929,...,-0.118445,0.039541,0.017522,0.001397,9.4e-05,0.021135,0.039308,-0.067295,-0.079366,-0.044959
2,0,0,0,0,0,0,1,0.007225,0.188841,0.0,...,-0.131555,-0.060714,0.026459,-0.029582,-0.146134,0.109907,0.036539,-0.053628,0.02753,-0.21123
3,0,0,0,0,0,0,1,0.007948,0.175719,0.0,...,-0.145382,-0.004285,-0.000706,-0.134984,-0.214832,0.17756,0.007675,-0.087473,0.106848,-0.015987
4,0,0,0,0,0,0,1,0.003613,0.208955,0.0,...,0.007073,-0.204627,0.162032,0.013798,-0.221076,0.163578,-0.06831,-0.031184,-0.01728,0.038566


In [5]:
selected_train.shape

(159571, 48)

In [6]:
selected_test.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,num_unique_words_scaled,...,114,132,135,139,143,156,157,170,198,none
0,0,0,0,0,0,0,0.00289,0.166667,0.0,0.032911,...,0.040738,-0.010083,-0.158238,-0.204194,0.116565,0.098296,-0.20616,0.046779,-0.036748,1
1,0,0,0,0,0,0,0.005058,0.1875,0.0,0.01519,...,-0.013467,0.00355,-0.117174,-0.075475,0.043945,-0.148617,-0.35074,0.113815,-0.070907,1
2,0,0,0,0,0,0,0.000723,0.149554,0.002232,0.118987,...,-0.12508,0.045087,-0.119656,-0.274707,0.102546,0.100782,-0.107205,-0.05078,-0.098343,1
3,0,0,0,0,0,0,0.007225,0.169661,0.0,0.070886,...,-0.023304,-0.005002,-0.281848,-0.157011,0.184323,0.067625,-0.066242,0.023737,-0.111269,1
4,0,0,0,0,0,0,0.003613,0.167665,0.0,0.060759,...,0.055657,0.024936,-0.181419,-0.165707,0.194511,0.143557,-0.059017,0.136676,-0.00453,1


In [7]:
selected_test.shape

(63978, 48)

In [8]:
selected_test.columns

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
       'min_length_scaled', 'num_words_vs_length',
       'exclamation_marks_vs_length', 'num_unique_words_scaled',
       'verbs_vs_length', 'num_uppercase_scaled', 'uppercase_vs_length',
       'sentiment', 'bad_toxic_vs_length', 'bad_severe_toxic_vs_length',
       'bad_obscene_vs_length', 'bad_threat_vs_length', 'bad_insult_vs_length',
       'bad_identity_hate_vs_length', '29', '34', '46', '47', '53', '54', '65',
       '72', '82', '86', '87', '93', '95', '96', '98', '100', '103', '105',
       '114', '132', '135', '139', '143', '156', '157', '170', '198', 'none'],
      dtype='object')

In [5]:
features = selected_test.columns[7:-1]
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

## Model Building

### Logistic Regression Using OneVsRest

In [19]:
# with reweight
classifier_log_ovr = MultiOutputClassifier(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_ovr.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_ovr = classifier_log_ovr.predict(selected_train[features])

In [31]:
# predict probability
predictions_proba_log_ovr = classifier_log_ovr.predict_proba(selected_train[features])

In [191]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_log_ovr)

Accuracy score:  0.9071573155523247
Precision score:  0.7835665408904706
Recall score:  0.4539005071514046
F1 score:  0.5705580253670303


In [201]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_train[label], predictions_log_ovr[:,i], np.array(predictions_proba_log_ovr[i][:,1]).T)
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.9446954647147665
Precision score:  0.8416605049117989
Recall score:  0.5209886229894076
F1 score:  0.643592746658051
Confusion matrix:
[[142778   1499]
 [  7326   7968]]
Logarithmic Loss:  0.15238669789000686
ROC AUC score:  0.941303466965538


fitting for label severe_toxic
Accuracy score:  0.990236321136046
Precision score:  0.5300813008130081
Recall score:  0.2043887147335423
F1 score:  0.2950226244343891
Confusion matrix:
[[157687    289]
 [  1269    326]]
Logarithmic Loss:  0.029947837509383204
ROC AUC score:  0.9689319896693167


fitting for label obscene
Accuracy score:  0.9676883644271203
Precision score:  0.8245614035087719
Recall score:  0.49508817611551664
F1 score:  0.61869545925159
Confusion matrix:
[[150232    890]
 [  4266   4183]]
Logarithmic Loss:  0.09489150128336714
ROC AUC score:  0.9569661176088267


fitting for label threat
Accuracy score:  0.996973134216117
Precision score:  0.41379310344827586
Recall score:  0.025104602

In [77]:
# on test data
predictions_log_ovr_test = classifier_log_ovr.predict(selected_test[features])
# model evaluation overall score
get_overall_evaluation_score(selected_test[labels], predictions_log_ovr_test)

Accuracy score:  0.7322360811528963
Precision score:  0.30084295436985375
Recall score:  0.8835011725755276
F1 score:  0.4407659323140065


In [230]:
# using cross validation
# with reweight
classifier_log_ovr_cv = MultiOutputClassifier(LogisticRegressionCV(max_iter=10000, class_weight='balanced',cv = 5))
# train
classifier_log_ovr_cv.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_ovr_cv = classifier_log_ovr.predict(selected_train[features])

In [231]:
# predict probability
predictions_proba_log_ovr_cv = classifier_log_ovr_cv.predict_proba(selected_train[features])

In [232]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_log_ovr_cv)

Accuracy score:  0.9071573155523247
Precision score:  0.7835665408904706
Recall score:  0.4539005071514046
F1 score:  0.5705580253670303


The F1 score does not have obvious improve, hence wouldn't use cross validation for logistic regression for saving CPU

### Export Model

In [34]:
joblib.dump(classifier_log_ovr, 'logReg_ovr.sav')

['logReg_ovr.sav']

### Logistic Regression Using Binary Relevance

In [36]:
classifier_log_br = BinaryRelevance(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_br.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_br = classifier_log_br.predict(selected_train[features])

In [37]:
# predict probability
predictions_proba_log_br = classifier_log_br.predict_proba(selected_train[features])

In [38]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_log_br)

Accuracy score:  0.7823790037036805
Precision score:  0.3823137948352328
Recall score:  0.8699071172146561
F1 score:  0.5202510172396128


In [227]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_train[label], predictions_log_br[:,i].toarray(), predictions_proba_log_br[:,i].toarray())
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.8878743631361589
Precision score:  0.4545263600084016
Recall score:  0.8489603766182817
F1 score:  0.5920656634746921
Confusion matrix:
[[128695  15582]
 [  2310  12984]]
Logarithmic Loss:  0.2976558490655549
ROC AUC score:  0.9450675196913704


fitting for label severe_toxic
Accuracy score:  0.9482738091507855
Precision score:  0.15543826968850252
Recall score:  0.941692789968652
F1 score:  0.26683247468466864
Confusion matrix:
[[149815   8161]
 [    93   1502]]
Logarithmic Loss:  0.20756119824661315
ROC AUC score:  0.9819963287943583


fitting for label obscene
Accuracy score:  0.9228180559124152
Precision score:  0.39690749133564385
Recall score:  0.8810510119540774
F1 score:  0.547272459932363
Confusion matrix:
[[139811  11311]
 [  1005   7444]]
Logarithmic Loss:  0.2478629532743535
ROC AUC score:  0.9652468745132916


fitting for label threat
Accuracy score:  0.9027517531381015
Precision score:  0.02757884156301043
Recall score:  0.918410

### Export Model

In [39]:
joblib.dump(classifier_log_br, 'logReg_br.sav')

['logReg_br.sav']

### Logistic Regression Using ClassifierChain

In [40]:
classifier_log_chain = ClassifierChain(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_chain.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_chain = classifier_log_chain.predict(selected_train[features])

In [125]:
# predict probability
predictions_proba_log_chain = classifier_log_chain.predict_proba(selected_train[features])

In [234]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_log_chain)

Accuracy score:  0.7989922980992787
Precision score:  0.3265042142598098
Recall score:  0.8962903869166334
F1 score:  0.4593230547737794


In [235]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_train[label], predictions_log_chain[:,i].toarray(), predictions_proba_log_chain[:,i].toarray())
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.8878743631361589
Precision score:  0.4545263600084016
Recall score:  0.8489603766182817
F1 score:  0.5920656634746921
Confusion matrix:
[[128695  15582]
 [  2310  12984]]
Logarithmic Loss:  0.2976558490655549
ROC AUC score:  0.9450675196913704


fitting for label severe_toxic
Accuracy score:  0.8495403300098389
Precision score:  0.06133552528573665
Recall score:  0.9824451410658307
F1 score:  0.11546255019710422
Confusion matrix:
[[133995  23981]
 [    28   1567]]
Logarithmic Loss:  0.2573653047966825
ROC AUC score:  0.9804797379642446


fitting for label obscene
Accuracy score:  0.8663353616885273
Precision score:  0.27425687044307345
Recall score:  0.9260267487276601
F1 score:  0.4231819779863158
Confusion matrix:
[[130418  20704]
 [   625   7824]]
Logarithmic Loss:  0.540121157439039
ROC AUC score:  0.9608222001382551


fitting for label threat
Accuracy score:  0.8502798127479304
Precision score:  0.018151883103519242
Recall score:  0.92259

### Export Model

In [41]:
joblib.dump(classifier_log_chain, 'logReg_chain.sav')

['logReg_chain.sav']

### Logistic Regression Using LabelPowerSet

In [43]:
classifier_log_lps = LabelPowerset(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_lps.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_lps = classifier_log_lps.predict(selected_train[features])

In [44]:
# predict probability
predictions_proba_log_lps = classifier_log_lps.predict_proba(selected_train[features])

In [45]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_log_lps)

Accuracy score:  0.5402924090216894
Precision score:  0.2638933873790623
Recall score:  0.6392102114080574
F1 score:  0.35461867971907374


In [46]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_train[label], predictions_log_lps[:,i].toarray(), predictions_proba_log_lps[:,i].toarray())
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.8733792481089923
Precision score:  0.39722931402502826
Recall score:  0.6205701582319864
F1 score:  0.48439533518768985
Confusion matrix:
[[129875  14402]
 [  5803   9491]]
Logarithmic Loss:  0.3694693161758654
ROC AUC score:  0.9004981145332336


fitting for label severe_toxic
Accuracy score:  0.9586641683012578
Precision score:  0.1504263945197819
Recall score:  0.6746081504702194
F1 score:  0.24599908550525834
Confusion matrix:
[[151899   6077]
 [   519   1076]]
Logarithmic Loss:  0.08433989644240165
ROC AUC score:  0.967695061175913


fitting for label obscene
Accuracy score:  0.8141015598072332
Precision score:  0.17512480475330006
Recall score:  0.6767664812403835
F1 score:  0.2782481751824818
Confusion matrix:
[[124189  26933]
 [  2731   5718]]
Logarithmic Loss:  0.39331131867579655
ROC AUC score:  0.8822640557964808


fitting for label threat
Accuracy score:  0.8439879426712874
Precision score:  0.014900464894504709
Recall score:  0.78

### Export Model

In [47]:
joblib.dump(classifier_log_lps, 'logReg_lps.sav')

['logReg_lps.sav']

### Random Forest Using Power Set Labelling

In [70]:
classifier_rf = RandomForestClassifier(random_state = 0,class_weight='balanced',n_estimators=100,max_depth = 15)
classifier_rf.fit(selected_train[features], selected_train[labels])
predictions_rf = classifier_rf.predict(selected_train[features])

In [71]:
# predict probability
predictions_proba_rf = classifier_rf.predict_proba(selected_train[features])

In [72]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_rf)

Accuracy score:  0.4602653364333118
Precision score:  0.19489052186078495
Recall score:  0.9955268106444811
F1 score:  0.31103799430324197


In [73]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_train[label], predictions_rf[:,i], np.array(predictions_proba_rf[i][:,1]).T)
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.5071284882591448
Precision score:  0.16245044968245173
Recall score:  0.9967961292009938
F1 score:  0.27937107148747453
Confusion matrix:
[[65678 78599]
 [   49 15245]]
Logarithmic Loss:  0.737827788443954
ROC AUC score:  0.9745928658246024


fitting for label severe_toxic
Accuracy score:  0.9936015942746489
Precision score:  0.6106399383191982
Recall score:  0.993103448275862
F1 score:  0.7562664120315111
Confusion matrix:
[[156966   1010]
 [    11   1584]]
Logarithmic Loss:  0.07374605008083888
ROC AUC score:  0.9991690734182391


fitting for label obscene
Accuracy score:  0.7191406959911262
Precision score:  0.15836245443955962
Recall score:  0.9976328559592851
F1 score:  0.2733360356708553
Confusion matrix:
[[106325  44797]
 [    20   8429]]
Logarithmic Loss:  0.5564503422124562
ROC AUC score:  0.9944977583378385


fitting for label threat
Accuracy score:  0.9975371464739834
Precision score:  0.5530586766541823
Recall score:  0.92677824267

In [74]:
# on test data
predictions_rf_test = classifier_rf.predict(selected_test[features])
predictions_proba_rf_test = classifier_rf.predict_proba(selected_test[features])

In [75]:
# model evaluation on test data
get_overall_evaluation_score(selected_test[labels], predictions_rf_test)

Accuracy score:  0.4317734221138516
Precision score:  0.15660378304421346
Recall score:  0.8870188991585046
F1 score:  0.25636103500394214


### GridSearch Cross Validation

In [21]:
# using grid search cv
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20],
    'max_depth': [2, 5, 7, 9]
}
rf_cv = RandomForestClassifier(random_state = 0,class_weight='balanced')
grid_clf = GridSearchCV(rf_cv, param_grid, cv=5)
grid_clf.fit(selected_train[features], selected_train[labels])

In [26]:
grid_clf.best_params_

{'max_depth': 9, 'n_estimators': 5}

In [27]:
classifier_rf_tuned = RandomForestClassifier(random_state = 0,class_weight='balanced',n_estimators=5,max_depth = 9)
classifier_rf_tuned.fit(selected_train[features], selected_train[labels])
predictions_rf_tuned = classifier_rf_tuned.predict(selected_train[features])

In [28]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_rf_tuned)

Accuracy score:  0.007971373244511847
Precision score:  0.06985598010561689
Recall score:  0.9923927289304234
F1 score:  0.1295739288108892


In [29]:
# on test data
predictions_rf_tuned_test = classifier_rf_tuned.predict(selected_test[features])
predictions_proba_rf_tuned_test = classifier_rf_tuned.predict_proba(selected_test[features])

In [30]:
# model evaluation on test data
get_overall_evaluation_score(selected_test[labels], predictions_rf_tuned_test)

Accuracy score:  0.005970802463346776
Precision score:  0.06926813812086417
Recall score:  0.9793764657194096
F1 score:  0.12856055597128338


As the model after tuning performed even worse, we would not apply parameter tuning on Random Forest as well.

### Export Model

In [76]:
joblib.dump(classifier_rf, 'rf_lps.sav')

['rf_lps.sav']

### Random Forest Using ClassificationChain

In [48]:
classifier_rf_chain = ClassifierChain(RandomForestClassifier(random_state = 0,max_depth = 10))
# train
classifier_rf_chain.fit(selected_train[features], selected_train[labels])
# predict
predictions_chain = classifier_rf_chain.predict(selected_train[features])

In [50]:
# predict probability
predictions_proba_chain = classifier_rf_chain.predict_proba(selected_train[features])

In [51]:
# model evaluation overall score
get_overall_evaluation_score(selected_train[labels], predictions_chain)

Accuracy score:  0.9322245270130538
Precision score:  0.9020199772874217
Recall score:  0.6421163599065474
F1 score:  0.7324311795329826


In [52]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_train[label], predictions_chain[:,i].toarray(), predictions_proba_chain[:,i].toarray())
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.9615155636049156
Precision score:  0.9554184495969749
Recall score:  0.6277625212501634
F1 score:  0.7576845677307343
Confusion matrix:
[[143829    448]
 [  5693   9601]]
Logarithmic Loss:  0.1163360526290165
ROC AUC score:  0.964860772678608


fitting for label severe_toxic
Accuracy score:  0.9941092053067286
Precision score:  0.9685264663805436
Recall score:  0.4244514106583072
F1 score:  0.5902353966870096
Confusion matrix:
[[157954     22]
 [   918    677]]
Logarithmic Loss:  0.015965643592664773
ROC AUC score:  0.9947034869627434


fitting for label obscene
Accuracy score:  0.9822085466657475
Precision score:  0.8754014989293362
Recall score:  0.7741744585158007
F1 score:  0.8216820551472896
Confusion matrix:
[[150191    931]
 [  1908   6541]]
Logarithmic Loss:  0.058234173859614345
ROC AUC score:  0.9833681232487671


fitting for label threat
Accuracy score:  0.9974494112338708
Precision score:  1.0
Recall score:  0.14853556485355648
F1 

In [55]:
# on test data
predictions_chain_test = classifier_rf_chain.predict(selected_test[features])
# predict probability
predictions_proba_chain_test = classifier_rf_chain.predict_proba(selected_test[features])

In [56]:
get_overall_evaluation_score(selected_test[labels], predictions_chain_test)

Accuracy score:  0.8951045671949732
Precision score:  0.5967302928948657
Recall score:  0.6288453579804111
F1 score:  0.5943281006205795


### Export Model

In [58]:
joblib.dump(classifier_rf_chain, 'rf_chain.sav')

['rf_chain.sav']

### Random Forest Using BinaryRelevance

In [57]:
classifier_rf_br = BinaryRelevance(RandomForestClassifier(random_state = 0,class_weight='balanced',max_depth = 10))
# train
classifier_rf_br.fit(selected_train[features], selected_train[labels])
# predict
predictions = classifier_rf_br.predict(selected_train[features])

In [59]:
# predict probability
predictions_proba = classifier_rf_br.predict_proba(selected_train[features])

In [60]:
# model evaluation
get_overall_evaluation_score(selected_train[labels], predictions)

Accuracy score:  0.886432998477167
Precision score:  0.6001627923746145
Recall score:  0.9106216878454613
F1 score:  0.7106884726700586


In [61]:
# model evaluation on each label
i = 0
for label in labels:
    print(f'fitting for label {label}')
    get_evaluation_score_single_class(selected_train[label], predictions[:,i].toarray(), predictions_proba[:,i].toarray())
    i += 1
    print("\n")

fitting for label toxic
Accuracy score:  0.94515920812679
Precision score:  0.6614200424335126
Recall score:  0.8764875114423957
F1 score:  0.7539158066421079
Confusion matrix:
[[137415   6862]
 [  1889  13405]]
Logarithmic Loss:  0.20778452308065776
ROC AUC score:  0.9750634177911363


fitting for label severe_toxic
Accuracy score:  0.9745881143816859
Precision score:  0.28043555872902537
Recall score:  0.9849529780564263
F1 score:  0.4365707933861331
Confusion matrix:
[[153945   4031]
 [    24   1571]]
Logarithmic Loss:  0.07067698904905502
ROC AUC score:  0.9927983922163963


fitting for label obscene
Accuracy score:  0.9723633993645462
Precision score:  0.6725920861464832
Recall score:  0.9314711800213042
F1 score:  0.7811414392059554
Confusion matrix:
[[147291   3831]
 [   579   7870]]
Logarithmic Loss:  0.11739173656586525
ROC AUC score:  0.9904634006742283


fitting for label threat
Accuracy score:  0.9826534896691754
Precision score:  0.1472581638940234
Recall score:  1.0
F1 sc

### Export Model

In [62]:
joblib.dump(classifier_rf_br, 'rf_br.sav')

['rf_br.sav']

### Random Forest Using OneVsRest

In [63]:
for label in labels:
    print(f'fitting for label {label}')
    classifier_rf_ovr = RandomForestClassifier(random_state = 0,class_weight='balanced',max_depth = 10)
    classifier_rf_ovr.fit(selected_train[features], selected_train[label])
    predictions_rf_ovr = classifier_rf_ovr.predict(selected_train[features])
    predictions_proba_rf_ovr = classifier_rf_ovr.predict_proba(selected_train[features])
    print('Model Performance on training set:')
    get_evaluation_score_single_class(selected_train[label], predictions_rf_ovr, predictions_proba_rf_ovr[:, 1])

#     output model
    fileName = 'rf_' + str(label) + '.sav'
    joblib.dump(classifier_rf_ovr, fileName)
    
    predictions_rf_ovr_test = classifier_rf_ovr.predict(selected_test[features])
    predictions_proba_rf_ovr_test = classifier_rf_ovr.predict_proba(selected_test[features])
    print('Model Performance on test set:')
    get_evaluation_score_single_class(selected_test[label], predictions_rf_ovr_test, predictions_proba_rf_ovr_test[:, 1])

fitting for label toxic
Model Performance on training set:
Accuracy score:  0.94515920812679
Precision score:  0.6614200424335126
Recall score:  0.8764875114423957
F1 score:  0.7539158066421079
Confusion matrix:
[[137415   6862]
 [  1889  13405]]
Logarithmic Loss:  0.20778452308065776
ROC AUC score:  0.9750634177911363
Model Performance on test set:
Accuracy score:  0.8851323892588077
Precision score:  0.44677432992305743
Recall score:  0.8676518883415435
F1 score:  0.5898308868672211
Confusion matrix:
[[51345  6543]
 [  806  5284]]
Logarithmic Loss:  0.31447545034122104
ROC AUC score:  0.9485148321065717
fitting for label severe_toxic
Model Performance on training set:
Accuracy score:  0.9745881143816859
Precision score:  0.28043555872902537
Recall score:  0.9849529780564263
F1 score:  0.4365707933861331
Confusion matrix:
[[153945   4031]
 [    24   1571]]
Logarithmic Loss:  0.07067698904905502
ROC AUC score:  0.9927983922163963
Model Performance on test set:
Accuracy score:  0.963034

## Oversampled Data

In [8]:
oversampled_train = pd.read_csv('Data/train_oversampled.csv')

In [9]:
print(oversampled_train.shape)
oversampled_train.head()

(421989, 49)


Unnamed: 0,toxic,obscene,insult,none,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,num_unique_words_scaled,verbs_vs_length,num_uppercase_scaled,...,139,143,156,157,170,198,clean_text,threat,identity_hate,severe_toxic
0,0,0,0,1,0.009393,0.181132,0.0,0.106329,0.041509,0.003426,...,-0.109984,0.089084,0.012651,-0.019501,-0.091946,-0.113697,explanation why the edits made under my userna...,0,0,0
1,0,0,0,1,0.000723,0.160714,0.008929,0.043038,0.026786,0.001612,...,9.4e-05,0.021135,0.039308,-0.067295,-0.079366,-0.044959,d'aww! he matches this background colour i am ...,0,0,0
2,0,0,0,1,0.007225,0.188841,0.0,0.091139,0.038627,0.000806,...,-0.146134,0.109907,0.036539,-0.053628,0.02753,-0.21123,"hey man, i am really not trying to edit war. i...",0,0,0
3,0,0,0,1,0.007948,0.175719,0.0,0.141772,0.036741,0.002217,...,-0.214832,0.17756,0.007675,-0.087473,0.106848,-0.015987,""" more i cannot make any real suggestions on i...",0,0,0
4,0,0,0,1,0.003613,0.208955,0.0,0.027848,0.059701,0.000403,...,-0.221076,0.163578,-0.06831,-0.031184,-0.01728,0.038566,"you, sir, are my hero. any chance you remember...",0,0,0


In [10]:
# logistic regression
classifier_over = MultiOutputClassifier(LogisticRegression(max_iter=10000))
# train
classifier_over.fit(oversampled_train[features], oversampled_train[labels])
# predict
predictions_over = classifier_over.predict(oversampled_train[features])

In [11]:
# predict probability
predictions_proba_log_ovr_over = classifier_over.predict_proba(oversampled_train[features])

In [16]:
# model evaluation overall score
get_overall_evaluation_score(oversampled_train[labels], predictions_over)

Accuracy score:  0.6146108073907139
Precision score:  0.9004191406353675
Recall score:  0.91221551936062
F1 score:  0.9061222418446149


In [18]:
# on test data
predictions_over_test = classifier_over.predict(selected_test[features])
# model evaluation overall score
get_overall_evaluation_score(selected_test[labels], predictions_over_test)

Accuracy score:  0.7821282315796055
Precision score:  0.3927793290075892
Recall score:  0.706028417712788
F1 score:  0.4965676475667567


The F1 score for test data using oversampled_train does not improve a lot comparing with using unsampled data (0.4407). In case of overfitting issue, we would not use oversampled_train in building logistic regression model.

In [78]:
# random forest
classifier_rf = RandomForestClassifier(random_state = 0,class_weight='balanced',n_estimators=100,max_depth = 15)
classifier_rf.fit(oversampled_train[features], oversampled_train[labels])
predictions_rf = classifier_rf.predict(oversampled_train[features])
# model evaluation overall score
get_overall_evaluation_score(oversampled_train[labels], predictions_rf)

Accuracy score:  0.9780563000457357
Precision score:  0.9983590890341083
Recall score:  0.9910507879416735
F1 score:  0.9946728284828051


The F1 score for test data using oversampled_train is obviously overfitted, hence we would not use oversampled_train in building random forest model.

# ---END---