In [1]:
import pandas as pd

# for evaluation metrics
%run -i helper_functions.py
from sklearn.model_selection import cross_validate

# for reg model training
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

# for random forest
from sklearn.ensemble import RandomForestClassifier

# for multi-label classification
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

# for cross validation
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

## Read Data

In [2]:
selected_train = pd.read_csv('Data/selected_train.csv')

In [4]:
print(selected_train.shape)
selected_train.head()

(159571, 48)


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,...,105,114,132,135,139,143,156,157,170,198
0,0,0,0,0,0,0,1,0.009393,0.181132,0.0,...,-0.09132,-0.017755,0.003997,-0.199211,-0.109984,0.089084,0.012651,-0.019501,-0.091946,-0.113697
1,0,0,0,0,0,0,1,0.000723,0.160714,0.008929,...,-0.118445,0.039541,0.017522,0.001397,9.4e-05,0.021135,0.039308,-0.067295,-0.079366,-0.044959
2,0,0,0,0,0,0,1,0.007225,0.188841,0.0,...,-0.131555,-0.060714,0.026459,-0.029582,-0.146134,0.109907,0.036539,-0.053628,0.02753,-0.21123
3,0,0,0,0,0,0,1,0.007948,0.175719,0.0,...,-0.145382,-0.004285,-0.000706,-0.134984,-0.214832,0.17756,0.007675,-0.087473,0.106848,-0.015987
4,0,0,0,0,0,0,1,0.003613,0.208955,0.0,...,0.007073,-0.204627,0.162032,0.013798,-0.221076,0.163578,-0.06831,-0.031184,-0.01728,0.038566


In [3]:
features = selected_train.columns[7:]
labels = selected_train.columns[:6]

## Logistic Regression

### Baseline Logistic Regression using BinaryRelevance

In [8]:
def model_evaluation(model, train):
    features = train.columns[7:]
    labels = train.columns[:6]
    kf = KFold(n_splits=5)
    validation_scores = pd.DataFrame({'accuracy':[], 'precision_weighted':[], 'recall_weighted':[], 
                                      'f1_weighted':[], 'log_loss':[], 'roc_auc_weighted':[]})
    for train_index, test_index in kf.split(train):
        train_split = train.iloc[train_index]
        test_split = train.iloc[test_index]
        print('Starting fitting...')
        model.fit(train_split[features], train_split[labels])
        predictions = model.predict(test_split[features])
        predictions_proba = model.predict_proba(test_split[features])
        scores = [accuracy_score(test_split[labels], predictions), precision_score(test_split[labels], predictions, average="weighted"),
                  recall_score(test_split[labels], predictions, average="weighted"), f1_score(test_split[labels], predictions, average="weighted"),
                  log_loss(test_split[labels], predictions_proba.toarray()), roc_auc_score(test_split[labels], predictions_proba.toarray(), average="weighted")]
        validation_scores.loc[len(validation_scores)] = scores
        print(f'Evaluation Scores:\n{scores}\n')
    return validation_scores   

In [10]:
classifier_lr = BinaryRelevance(LogisticRegression(max_iter=10000, class_weight='balanced'))
scores_lr = model_evaluation(classifier_lr, selected_train)

Starting fitting...
Evaluation Scores:
[0.78110606297979, 0.38312777734872755, 0.8747718657868876, 0.5218320762939799, 0.361826465747517, 0.9571759578912593]

Starting fitting...
Evaluation Scores:
[0.7805665225292975, 0.37672819921185674, 0.8648764462219682, 0.5136746726124278, 0.3557133101110622, 0.9534705308273822]

Starting fitting...
Evaluation Scores:
[0.7806918593720624, 0.383764096430795, 0.8643101482326112, 0.521333200334735, 0.3558406591614544, 0.9546048020672011]

Starting fitting...
Evaluation Scores:
[0.7794071567337219, 0.373570824471023, 0.8741740879057742, 0.5129656601058877, 0.35327175102635244, 0.9553114183732587]

Starting fitting...
Evaluation Scores:
[0.7829792567525224, 0.3746450991097544, 0.8624928530588908, 0.5114597782873533, 0.3557374858350244, 0.9522242006403099]



In [11]:
scores_lr

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.781106,0.383128,0.874772,0.521832,0.361826,0.957176
1,0.780567,0.376728,0.864876,0.513675,0.355713,0.953471
2,0.780692,0.383764,0.86431,0.521333,0.355841,0.954605
3,0.779407,0.373571,0.874174,0.512966,0.353272,0.955311
4,0.782979,0.374645,0.862493,0.51146,0.355737,0.952224


### Baseline Logistic Regression using ClassifierChain

In [13]:
classifier_lr = ClassifierChain(LogisticRegression(max_iter=10000, class_weight='balanced'))
scores_chain_lr = model_evaluation(classifier_lr, selected_train)

Starting fitting...
Evaluation Scores:
[0.7965533448221839, 0.32913163287629466, 0.9035518742103047, 0.4624119007545911, 0.38025901482669816, 0.9529966351078759]

Starting fitting...
Evaluation Scores:
[0.7982390173591527, 0.3228231367627005, 0.8917297528924439, 0.45476589797928757, 0.37155151172656614, 0.9485669223020883]

Starting fitting...
Evaluation Scores:
[0.7985836936767563, 0.3279473500558061, 0.8955245153933865, 0.4617693961356759, 0.373577472498418, 0.9499014956228403]

Starting fitting...
Evaluation Scores:
[0.7961709594535313, 0.32096897444254835, 0.8984487216317151, 0.45410466697766716, 0.3698241329536564, 0.9509843360831298]

Starting fitting...
Evaluation Scores:
[0.7989910384157423, 0.32127690549165744, 0.8867924528301887, 0.45253792016593214, 0.3713735945140179, 0.9478930281527612]



In [14]:
scores_chain_lr

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.796553,0.329132,0.903552,0.462412,0.380259,0.952997
1,0.798239,0.322823,0.89173,0.454766,0.371552,0.948567
2,0.798584,0.327947,0.895525,0.461769,0.373577,0.949901
3,0.796171,0.320969,0.898449,0.454105,0.369824,0.950984
4,0.798991,0.321277,0.886792,0.452538,0.371374,0.947893


### Baseline Logistic Regression using LabelPowerset

In [19]:
classifier_lp_lr = LabelPowerset(LogisticRegression(max_iter=10000, class_weight='balanced'))
scores_lp_lr = model_evaluation(classifier_lp_lr, selected_train)

Starting fitting...
Evaluation Scores:
[0.5382422058593138, 0.25818891399335364, 0.6330197950301839, 0.3490259105059477, 0.3408416223056499, 0.892602816433005]

Starting fitting...
Evaluation Scores:
[0.5446512502350066, 0.2589128275722432, 0.6219111555492073, 0.3459259378004993, 0.33735543145612995, 0.8907091069898391]

Starting fitting...
Evaluation Scores:
[0.541204487058971, 0.2615379281404357, 0.6309863169897377, 0.3567977382478096, 0.3378157721747543, 0.8937576895326305]

Starting fitting...
Evaluation Scores:
[0.5379143949363916, 0.26413097243016126, 0.6460787130135018, 0.35868319646429675, 0.3332087095145717, 0.9005038900695024]

Starting fitting...
Evaluation Scores:
[0.542551858118694, 0.26059661391629707, 0.6439393939393939, 0.3545889313730019, 0.3381675067914982, 0.8913922638248576]



In [20]:
scores_lp_lr

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.538242,0.258189,0.63302,0.349026,0.340842,0.892603
1,0.544651,0.258913,0.621911,0.345926,0.337355,0.890709
2,0.541204,0.261538,0.630986,0.356798,0.337816,0.893758
3,0.537914,0.264131,0.646079,0.358683,0.333209,0.900504
4,0.542552,0.260597,0.643939,0.354589,0.338168,0.891392


## Store the Result

In [21]:
scores_collection = pd.DataFrame({'model name': [], 'accuracy':[], 'precision_weighted':[], 'recall_weighted':[], 
                                      'f1_weighted':[], 'log_loss':[], 'roc_auc_weighted':[]})
scores_collection.loc[len(scores_collection)] = ['LogReg with BinaryRelevance']+np.mean(scores_lr).values.tolist()
scores_collection.loc[len(scores_collection)] = ['LogReg with ClassifierChain']+np.mean(scores_chain_lr).values.tolist()
scores_collection.loc[len(scores_collection)] = ['LogReg with LabelPowerset']+np.mean(scores_lp_lr).values.tolist()

In [22]:
scores_collection

Unnamed: 0,model name,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,LogReg with BinaryRelevance,0.78095,0.378367,0.868125,0.516253,0.356478,0.954557
1,LogReg with ClassifierChain,0.797708,0.32443,0.895209,0.457118,0.373317,0.950068
2,LogReg with LabelPowerset,0.540913,0.260673,0.635187,0.353004,0.337478,0.893793


In [23]:
scores_collection.to_csv('LogReg Evaluation.csv', index=False)

## Random Forest

### Random Forest using BinaryRelevance

In [4]:
classifier = BinaryRelevance(RandomForestClassifier(random_state = 0,class_weight='balanced',max_depth = 10))
scores = model_evaluation(classifier, selected_train)

Starting fitting...
Evaluation Scores:
[0.869277769074103, 0.5671761647253778, 0.8146848238101924, 0.659065139225299, 0.32635482618324596, 0.9677592487835692]

Starting fitting...
Evaluation Scores:
[0.8698376887886194, 0.5617088164428913, 0.8108841594057992, 0.6535801429779559, 0.3219546255629364, 0.964836280133045]

Starting fitting...
Evaluation Scores:
[0.871937080904932, 0.5735855230717503, 0.8164196123147093, 0.6640727580648728, 0.32185276179031946, 0.9669644120297468]

Starting fitting...
Evaluation Scores:
[0.8722190888011531, 0.5656196607943321, 0.8190175237000862, 0.6594682346610224, 0.3184594262156382, 0.9687733015263874]

Starting fitting...
Evaluation Scores:
[0.8708717177414301, 0.5605786248811306, 0.8073184676958262, 0.6516270735926296, 0.32023215085978685, 0.9654686767229733]



In [25]:
scores

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.869278,0.567176,0.814685,0.659065,0.326355,0.967759
1,0.869838,0.561709,0.810884,0.65358,0.321955,0.964836
2,0.871937,0.573586,0.81642,0.664073,0.321853,0.966964
3,0.872219,0.56562,0.819018,0.659468,0.318459,0.968773
4,0.870872,0.560579,0.807318,0.651627,0.320232,0.965469


### Random Forest using ClassifierChain

In [5]:
classifier_chain = BinaryRelevance(RandomForestClassifier(random_state = 0,max_depth = 10))
scores_chain = model_evaluation(classifier_chain, selected_train)

Starting fitting...
Evaluation Scores:
[0.9145542848190507, 0.8420678168387787, 0.5556647479994384, 0.6502144113913056, 0.287318435757757, 0.9661389078692632]

Starting fitting...
Evaluation Scores:
[0.9167136679827035, 0.8216200622232417, 0.5560634195114984, 0.6496863347763459, 0.28304055901232483, 0.9628577839138188]

Starting fitting...
Evaluation Scores:
[0.9176850285141317, 0.8273921755210838, 0.5592930444697833, 0.6557264504309591, 0.2841520803143075, 0.9635852225014436]

Starting fitting...
Evaluation Scores:
[0.9176850285141317, 0.8151208206968353, 0.5706693478885377, 0.6597637709696572, 0.2796187291943415, 0.9664187653068163]

Starting fitting...
Evaluation Scores:
[0.9158989785047315, 0.8195934361553373, 0.5491709548313322, 0.6424273808359757, 0.2829371841524248, 0.9630414410266241]



In [27]:
scores_chain

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.914554,0.842068,0.555665,0.650214,0.287318,0.966139
1,0.916714,0.82162,0.556063,0.649686,0.283041,0.962858
2,0.917685,0.827392,0.559293,0.655726,0.284152,0.963585
3,0.917685,0.815121,0.570669,0.659764,0.279619,0.966419
4,0.915899,0.819593,0.549171,0.642427,0.282937,0.963041


### Random Forest using LabelPowerset

In [6]:
classifier_psl_rf = LabelPowerset(RandomForestClassifier(random_state = 0,class_weight='balanced',n_estimators=100,max_depth = 15))
scores_psl = model_evaluation(classifier_psl_rf, selected_train)

Starting fitting...
Evaluation Scores:
[0.8813097289675701, 0.6338161010866284, 0.61490944826618, 0.6218737064792403, 0.318996754908307, 0.9505622548900228]

Starting fitting...
Evaluation Scores:
[0.8861941467694429, 0.6411535905753317, 0.605913440937009, 0.6188866643817417, 0.3160445673317832, 0.9451897881313568]

Starting fitting...
Evaluation Scores:
[0.8886382152033591, 0.6519728945274685, 0.6111744583808438, 0.6286122115892729, 0.31590203777830683, 0.9501505714950528]

Starting fitting...
Evaluation Scores:
[0.8813373441123018, 0.6313084469342092, 0.6139040505601838, 0.6183593150218584, 0.3120387394069897, 0.9508776177718575]

Starting fitting...
Evaluation Scores:
[0.8887948862568152, 0.6468655400086205, 0.6082046883933676, 0.6240304852507212, 0.31277174475132896, 0.94869648230854]



In [30]:
scores_psl

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.88131,0.633816,0.614909,0.621874,0.318997,0.950562
1,0.886194,0.641154,0.605913,0.618887,0.316045,0.94519
2,0.888638,0.651973,0.611174,0.628612,0.315902,0.950151
3,0.881337,0.631308,0.613904,0.618359,0.312039,0.950878
4,0.888795,0.646866,0.608205,0.62403,0.312772,0.948696


## Store the Result

In [9]:
scores_collection = pd.DataFrame({'model name': [], 'accuracy':[], 'precision_weighted':[], 'recall_weighted':[], 
                                      'f1_weighted':[], 'log_loss':[], 'roc_auc_weighted':[]})
scores_collection.loc[len(scores_collection)] = ['Random Forest with BinaryRelevance']+np.mean(scores).values.tolist()
scores_collection.loc[len(scores_collection)] = ['Random Forest with ClassifierChain']+np.mean(scores_chain).values.tolist()
scores_collection.loc[len(scores_collection)] = ['Random Forest with LabelPowerset']+np.mean(scores_psl).values.tolist()

In [10]:
scores_collection

Unnamed: 0,model name,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,Random Forest with BinaryRelevance,0.870829,0.565734,0.813665,0.657563,0.321771,0.96676
1,Random Forest with ClassifierChain,0.916507,0.825159,0.558172,0.651564,0.283413,0.964408
2,Random Forest with LabelPowerset,0.885255,0.641023,0.610821,0.622352,0.315151,0.949095


In [11]:
scores_collection.to_csv('RandomForest Evaluation.csv', index=False)