## Read Data

In [131]:
import pandas as pd
import numpy as np

# for reg model training
from sklearn.linear_model import LogisticRegression

# for random forest

# for multi label classification
from sklearn.datasets import make_multilabel_classification
from skmultilearn.problem_transform import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.problem_transform import BinaryRelevance

# for evaluation metrics
%run -i helper_functions.py

import warnings
warnings.filterwarnings("ignore")

In [4]:
selected_train = pd.read_csv('Data/selected_train.csv')
selected_test = pd.read_csv('Data/selected_test.csv')

In [5]:
selected_train.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,...,105,114,132,135,139,143,156,157,170,198
0,0,0,0,0,0,0,1,0.009393,0.181132,0.0,...,-0.09132,-0.017755,0.003997,-0.199211,-0.109984,0.089084,0.012651,-0.019501,-0.091946,-0.113697
1,0,0,0,0,0,0,1,0.000723,0.160714,0.008929,...,-0.118445,0.039541,0.017522,0.001397,9.4e-05,0.021135,0.039308,-0.067295,-0.079366,-0.044959
2,0,0,0,0,0,0,1,0.007225,0.188841,0.0,...,-0.131555,-0.060714,0.026459,-0.029582,-0.146134,0.109907,0.036539,-0.053628,0.02753,-0.21123
3,0,0,0,0,0,0,1,0.007948,0.175719,0.0,...,-0.145382,-0.004285,-0.000706,-0.134984,-0.214832,0.17756,0.007675,-0.087473,0.106848,-0.015987
4,0,0,0,0,0,0,1,0.003613,0.208955,0.0,...,0.007073,-0.204627,0.162032,0.013798,-0.221076,0.163578,-0.06831,-0.031184,-0.01728,0.038566


In [7]:
selected_train.shape

(159571, 48)

In [8]:
selected_test.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,num_unique_words_scaled,...,114,132,135,139,143,156,157,170,198,none
0,0,0,0,0,0,0,0.00289,0.166667,0.0,0.032911,...,0.040738,-0.010083,-0.158238,-0.204194,0.116565,0.098296,-0.20616,0.046779,-0.036748,1
1,0,0,0,0,0,0,0.005058,0.1875,0.0,0.01519,...,-0.013467,0.00355,-0.117174,-0.075475,0.043945,-0.148617,-0.35074,0.113815,-0.070907,1
2,0,0,0,0,0,0,0.000723,0.149554,0.002232,0.118987,...,-0.12508,0.045087,-0.119656,-0.274707,0.102546,0.100782,-0.107205,-0.05078,-0.098343,1
3,0,0,0,0,0,0,0.007225,0.169661,0.0,0.070886,...,-0.023304,-0.005002,-0.281848,-0.157011,0.184323,0.067625,-0.066242,0.023737,-0.111269,1
4,0,0,0,0,0,0,0.003613,0.167665,0.0,0.060759,...,0.055657,0.024936,-0.181419,-0.165707,0.194511,0.143557,-0.059017,0.136676,-0.00453,1


In [10]:
selected_test.shape

(63978, 48)

In [11]:
selected_test.columns

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
       'min_length_scaled', 'num_words_vs_length',
       'exclamation_marks_vs_length', 'num_unique_words_scaled',
       'verbs_vs_length', 'num_uppercase_scaled', 'uppercase_vs_length',
       'sentiment', 'bad_toxic_vs_length', 'bad_severe_toxic_vs_length',
       'bad_obscene_vs_length', 'bad_threat_vs_length', 'bad_insult_vs_length',
       'bad_identity_hate_vs_length', '29', '34', '46', '47', '53', '54', '65',
       '72', '82', '86', '87', '93', '95', '96', '98', '100', '103', '105',
       '114', '132', '135', '139', '143', '156', '157', '170', '198', 'none'],
      dtype='object')

In [42]:
features = selected_test.columns[7:-1]
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Logistic Regression Using OneVsRest

In [117]:
# with reweight
classifier_log_ovr = MultiOutputClassifier(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_ovr.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_ovr = classifier_log_ovr.predict(selected_train[features])

In [118]:
# predict probability
predictions_proba_log_ovr = classifier_log_ovr.predict_proba(selected_train[features])

In [120]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_log_ovr, predictions_proba_log_ovr)

Accuracy score:  0.7823790037036805
Precision score:  0.3823137948352328
Recall score:  0.8699071172146561
F1 score:  0.5202510172396128
Confusion matrix for label toxic:
[[128695  15582]
 [  2310  12984]]
Confusion matrix for label severe_toxic:
[[149815   8161]
 [    93   1502]]
Confusion matrix for label obscene:
[[139811  11311]
 [  1005   7444]]
Confusion matrix for label threat:
[[143614  15479]
 [    39    439]]
Confusion matrix for label insult:
[[138740  12954]
 [   945   6932]]
Confusion matrix for label identity_hate:
[[143161  15005]
 [   174   1231]]


### Logistic Regression Using Binary Relevance

In [111]:
classifier_log_br = BinaryRelevance(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_br.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_br = classifier_log_br.predict(selected_train[features])

In [112]:
# predict probability
predictions_proba_log_br = classifier_log_br.predict_proba(selected_train[features])

In [113]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_log_br, predictions_proba_log_br)

Accuracy score:  0.7823790037036805
Precision score:  0.3823137948352328
Recall score:  0.8699071172146561
F1 score:  0.5202510172396128
Confusion matrix for label toxic:
[[128695  15582]
 [  2310  12984]]
Confusion matrix for label severe_toxic:
[[149815   8161]
 [    93   1502]]
Confusion matrix for label obscene:
[[139811  11311]
 [  1005   7444]]
Confusion matrix for label threat:
[[143614  15479]
 [    39    439]]
Confusion matrix for label insult:
[[138740  12954]
 [   945   6932]]
Confusion matrix for label identity_hate:
[[143161  15005]
 [   174   1231]]
Logarithmic Loss:  0.35616097812647285
ROC AUC score:  0.9556984676033963


### Logistic Regression Using ClassifierChain

In [124]:
classifier_log_chain = ClassifierChain(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_chain.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_chain = classifier_log_chain.predict(selected_train[features])

In [125]:
# predict probability
predictions_proba_log_chain = classifier_log_chain.predict_proba(selected_train[features])

In [128]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_log_chain, predictions_proba_log_chain)

Accuracy score:  0.7989922980992787
Precision score:  0.3265042142598098
Recall score:  0.8962903869166334
F1 score:  0.4593230547737794
Confusion matrix for label toxic:
[[128695  15582]
 [  2310  12984]]
Confusion matrix for label severe_toxic:
[[133995  23981]
 [    28   1567]]
Confusion matrix for label obscene:
[[130418  20704]
 [   625   7824]]
Confusion matrix for label threat:
[[135239  23854]
 [    37    441]]
Confusion matrix for label insult:
[[130454  21240]
 [   553   7324]]
Confusion matrix for label identity_hate:
[[130863  27303]
 [    87   1318]]
Logarithmic Loss:  0.37333162077812154
ROC AUC score:  0.9512013648677577


### Logistic Regression Using LabelPowerSet

In [132]:
classifier_log_lps = LabelPowerset(LogisticRegression(max_iter=10000, class_weight='balanced'))
# train
classifier_log_lps.fit(selected_train[features], selected_train[labels])
# predict
predictions_log_lps = classifier_log_lps.predict(selected_train[features])

In [133]:
# predict probability
predictions_proba_log_lps = classifier_log_lps.predict_proba(selected_train[features])

In [134]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_log_lps, predictions_proba_log_lps)

Accuracy score:  0.5402924090216894
Precision score:  0.2638933873790623
Recall score:  0.6392102114080574
F1 score:  0.35461867971907374
Confusion matrix for label toxic:
[[129875  14402]
 [  5803   9491]]
Confusion matrix for label severe_toxic:
[[151899   6077]
 [   519   1076]]
Confusion matrix for label obscene:
[[124189  26933]
 [  2731   5718]]
Confusion matrix for label threat:
[[134301  24792]
 [   103    375]]
Confusion matrix for label insult:
[[129345  22349]
 [  2973   4904]]
Confusion matrix for label identity_hate:
[[130450  27716]
 [   534    871]]
Logarithmic Loss:  0.33656150777593447
ROC AUC score:  0.8944370583915846


### Random Forest Using Power Set Labelling

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 0)
rf.fit(selected_train[features], selected_train[labels])
predictions_rf = rf.predict(selected_train[features])

In [None]:
# predict probability
predictions_proba_rf = rf.predict_proba(selected_train[features])

In [None]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_rf, predictions_proba_rf)

### Random Forest Using ClassificationChain

In [None]:
classifier_chain = ClassifierChain(RandomForestRegressor(n_estimators = 1000, random_state = 0))
# train
classifier_chain.fit(selected_train[features], selected_train[labels])
# predict
predictions_chain = classifier_chain.predict(selected_train[features])

In [None]:
# predict probability
predictions_proba_chain = classifier_chain.predict_proba(selected_train[features])

In [None]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions_chain, predictions_proba_chain)

### Random Forest Using BinaryRelevance

In [None]:
classifier_br = BinaryRelevance(RandomForestRegressor(n_estimators = 1000, random_state = 0))
# train
classifier_br.fit(selected_train[features], selected_train[labels])
# predict
predictions = classifier_br.predict(selected_train[features])

In [None]:
# predict probability
predictions_proba = classifier_br.predict_proba(selected_train[features])

In [None]:
# model evaluation
get_evaluation_score(selected_train[labels], predictions, predictions_proba)