In [59]:
import pandas as pd

# for adaboost
from sklearn.ensemble import AdaBoostClassifier

# for xgboost
from xgboost import XGBClassifier

# for evaluation metrics
%run -i helper_functions.py
from sklearn.model_selection import cross_validate

# for multi-label classification
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

import warnings
# NOTE(review): "ignore" with no category/module filter silences every warning
# for the rest of the session, including deprecation and convergence warnings
# raised by the libraries imported above. Consider narrowing the filter.
warnings.filterwarnings("ignore")

## Read Data

In [2]:
# Load the pre-selected training set. A forward slash is used as the separator:
# the previous 'Data\selected_train.csv' contained the invalid escape sequence
# '\s' (a SyntaxWarning on recent Python versions) and only resolved to a
# directory + file on Windows. 'Data/...' works on Windows, Linux and macOS.
selected_train = pd.read_csv('Data/selected_train.csv')

In [3]:
# Sanity-check the load: print (rows, columns), then preview the first rows.
print(selected_train.shape)
selected_train.head()

(159571, 48)


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none,min_length_scaled,num_words_vs_length,exclamation_marks_vs_length,...,105,114,132,135,139,143,156,157,170,198
0,0,0,0,0,0,0,1,0.009393,0.181132,0.0,...,-0.09132,-0.017755,0.003997,-0.199211,-0.109984,0.089084,0.012651,-0.019501,-0.091946,-0.113697
1,0,0,0,0,0,0,1,0.000723,0.160714,0.008929,...,-0.118445,0.039541,0.017522,0.001397,9.4e-05,0.021135,0.039308,-0.067295,-0.079366,-0.044959
2,0,0,0,0,0,0,1,0.007225,0.188841,0.0,...,-0.131555,-0.060714,0.026459,-0.029582,-0.146134,0.109907,0.036539,-0.053628,0.02753,-0.21123
3,0,0,0,0,0,0,1,0.007948,0.175719,0.0,...,-0.145382,-0.004285,-0.000706,-0.134984,-0.214832,0.17756,0.007675,-0.087473,0.106848,-0.015987
4,0,0,0,0,0,0,1,0.003613,0.208955,0.0,...,0.007073,-0.204627,0.162032,0.013798,-0.221076,0.163578,-0.06831,-0.031184,-0.01728,0.038566


In [8]:
# Column names from position 7 onward. model_evaluation recomputes this same
# slice from its own `train` argument, and no later cell visible here reads
# this global.
features = selected_train.columns[7:]

In [9]:
# Names of the first six columns; going by the head() preview these appear to be
# toxic ... identity_hate, with the `none` column at position 6 left out.
# model_evaluation recomputes this same slice from its own `train` argument.
labels = selected_train.columns[:6]

## AdaBoost

### Baseline AdaBoost using BinaryRelevance

In [27]:
from sklearn.model_selection import KFold

In [65]:
def model_evaluation(model, train, n_splits=5):
    """Cross-validate a multi-label classifier and collect per-fold metrics.

    The first six columns of ``train`` are used as the label matrix and the
    columns from position 7 onward as the feature matrix (column 6 is used for
    neither). For every fold the model is re-fitted on the training part and
    scored on the held-out part.

    The metric functions (``accuracy_score``, ``precision_score``,
    ``recall_score``, ``f1_score``, ``log_loss``, ``roc_auc_score``) and
    ``KFold`` are expected to already be available in the notebook namespace.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place, so after the call it holds the fit from the last fold.
    train : pandas.DataFrame
        Data laid out as described above.
    n_splits : int, default 5
        Number of folds passed to ``KFold``. No shuffle argument is given, so
        KFold's default splitting is used.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``accuracy``, ``precision_weighted``,
        ``recall_weighted``, ``f1_weighted``, ``log_loss`` and
        ``roc_auc_weighted``.
    """
    features = train.columns[7:]
    labels = train.columns[:6]
    metric_names = ['accuracy', 'precision_weighted', 'recall_weighted',
                    'f1_weighted', 'log_loss', 'roc_auc_weighted']

    kf = KFold(n_splits=n_splits)
    fold_scores = []
    for train_index, test_index in kf.split(train):
        train_split = train.iloc[train_index]
        test_split = train.iloc[test_index]
        y_true = test_split[labels]

        print('Starting fitting...')
        model.fit(train_split[features], train_split[labels])
        predictions = model.predict(test_split[features])
        predictions_proba = model.predict_proba(test_split[features])
        # Densify once, and only when the estimator actually returned a sparse
        # matrix; estimators that return a dense array are used as they are.
        if hasattr(predictions_proba, 'toarray'):
            predictions_proba = predictions_proba.toarray()

        scores = [accuracy_score(y_true, predictions),
                  precision_score(y_true, predictions, average="weighted"),
                  recall_score(y_true, predictions, average="weighted"),
                  f1_score(y_true, predictions, average="weighted"),
                  log_loss(y_true, predictions_proba),
                  roc_auc_score(y_true, predictions_proba, average="weighted")]
        fold_scores.append(scores)
        print(f'Evaluation Scores:\n{scores}\n')

    # Build the result frame in one go instead of growing it row by row.
    return pd.DataFrame(fold_scores, columns=metric_names, dtype=float)

In [66]:
# AdaBoost wrapped in the BinaryRelevance problem transformation,
# scored with the cross-validation helper defined above.
classifier_ab = BinaryRelevance(
    AdaBoostClassifier(random_state=0)
)
scores_ab = model_evaluation(model=classifier_ab, train=selected_train)

Starting fitting...
Evaluation Scores:
[0.9086009713300955, 0.7662075024631361, 0.5955355889372456, 0.668614831071146, 0.39369639966892334, 0.9644349748310285]

Starting fitting...
Evaluation Scores:
[0.9115748574293413, 0.7595417714523536, 0.5947721754035138, 0.6661630929909684, 0.38696189906549155, 0.9625646922464547]

Starting fitting...
Evaluation Scores:
[0.9124208811180047, 0.7719679475126819, 0.5987742303306728, 0.6733192558041735, 0.3877950753399316, 0.9646503826264773]

Starting fitting...
Evaluation Scores:
[0.9121388732217836, 0.7637796278111846, 0.6064349324906636, 0.6747315236135966, 0.3847270115759182, 0.9653717914010013]

Starting fitting...
Evaluation Scores:
[0.9113241837438115, 0.7665321644718834, 0.5914808461978274, 0.666324875262268, 0.38644634243444553, 0.9615207163393116]



In [67]:
scores_ab

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.908601,0.766208,0.595536,0.668615,0.393696,0.964435
1,0.911575,0.759542,0.594772,0.666163,0.386962,0.962565
2,0.912421,0.771968,0.598774,0.673319,0.387795,0.96465
3,0.912139,0.76378,0.606435,0.674732,0.384727,0.965372
4,0.911324,0.766532,0.591481,0.666325,0.386446,0.961521


### Baseline AdaBoost using ClassifierChain

In [68]:
# AdaBoost wrapped in the ClassifierChain problem transformation,
# scored with the cross-validation helper defined above.
classifier_chain_ab = ClassifierChain(
    AdaBoostClassifier(random_state=0)
)
scores_chain_ab = model_evaluation(model=classifier_chain_ab, train=selected_train)

Starting fitting...
Evaluation Scores:
[0.9104809650634498, 0.7384635533740987, 0.6317562824652534, 0.6745848669974185, 0.38961003115342546, 0.962411850840173]

Starting fitting...
Evaluation Scores:
[0.9135802469135802, 0.7443306616884929, 0.630624196543351, 0.6777138181688405, 0.38273790835086247, 0.959051054485923]

Starting fitting...
Evaluation Scores:
[0.9140502600739487, 0.7423848777398544, 0.6328392246294184, 0.6789176970840937, 0.38458337339085846, 0.9624953997128569]

Starting fitting...
Evaluation Scores:
[0.9142695995487874, 0.7382283902076263, 0.6430623384085034, 0.6809014825232593, 0.38158460910319736, 0.9630270816800737]

Starting fitting...
Evaluation Scores:
[0.9120135363790186, 0.7332005622547857, 0.6179245283018868, 0.664510165034994, 0.3831426565262774, 0.9589690169502456]



In [69]:
scores_chain_ab

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.910481,0.738464,0.631756,0.674585,0.38961,0.962412
1,0.91358,0.744331,0.630624,0.677714,0.382738,0.959051
2,0.91405,0.742385,0.632839,0.678918,0.384583,0.962495
3,0.91427,0.738228,0.643062,0.680901,0.381585,0.963027
4,0.912014,0.733201,0.617925,0.66451,0.383143,0.958969


### Baseline AdaBoost using LabelPowerset

In [None]:
# AdaBoost wrapped in the LabelPowerset problem transformation,
# scored with the cross-validation helper defined above.
classifier_powerset_ab = LabelPowerset(
    AdaBoostClassifier(random_state=0)
)
scores_powerset_ab = model_evaluation(model=classifier_powerset_ab, train=selected_train)

## XGBoost

### Baseline XGBoost using BinaryRelevance

In [None]:
# XGBoost wrapped in the BinaryRelevance problem transformation,
# scored with the cross-validation helper defined above.
classifier = BinaryRelevance(
    XGBClassifier(random_state=0)
)
scores = model_evaluation(model=classifier, train=selected_train)

### Baseline XGBoost using ClassifierChain

In [None]:
# XGBoost wrapped in the ClassifierChain problem transformation,
# scored with the cross-validation helper defined above.
classifier_chain = ClassifierChain(
    XGBClassifier(random_state=0)
)
scores_chain = model_evaluation(model=classifier_chain, train=selected_train)

### Baseline XGBoost using LabelPowerset

In [None]:
# XGBoost wrapped in the LabelPowerset problem transformation,
# scored with the cross-validation helper defined above.
classifier_powerset = LabelPowerset(
    XGBClassifier(random_state=0)
)
scores_powerset = model_evaluation(model=classifier_powerset, train=selected_train)

### Tuned XGBoost using BinaryRelevance

In [None]:
# XGBoost with hand-set hyperparameters (one per line so each value is easy to
# spot and change), wrapped in the BinaryRelevance problem transformation.
classifier_tuned = BinaryRelevance(
    XGBClassifier(
        random_state=0,
        scale_pos_weight=5,
        max_depth=8,
        min_child_weight=3,
        gamma=0.4,
        colsample_bytree=1.0,
        subsample=1.0,
    )
)
scores_tuned = model_evaluation(model=classifier_tuned, train=selected_train)