In [30]:
import pandas as pd
import numpy as np
#Import saved models
import joblib

# for KF validation
from sklearn.model_selection import KFold



# for multi-label classification
from skmultilearn.problem_transform import BinaryRelevance

#for LightGBM
from lightgbm import LGBMClassifier

# for random forest
from sklearn.ensemble import RandomForestClassifier

# for adaboost
from sklearn.ensemble import AdaBoostClassifier

# for xgboost
from xgboost import XGBClassifier

#for evaluation
# for evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

# for evaluation metrics
%run -i helper_functions.py

import warnings
warnings.filterwarnings("ignore")

In [None]:
model name		                    f1_weighted	log_loss	roc_auc_weighted
Baseline CNN                        0.725218    0.275093    0.981135
Random Forest with BinaryRelevance	0.657563	0.321771	0.966760
Tuned LightGBM with BinaryRelevance	0.690020	0.296583	0.969507
Tuned XGBoost with BinaryRelevance	0.694385	0.307418	0.966565
Adaboost with BinaryRelevance		0.669831	0.387925	0.963709

In [26]:
#Load train data
selected_train = pd.read_csv('Data\selected_train.csv')
X_train =  selected_train[selected_train.columns[7:]]
y_train = selected_train[selected_train.columns[:6]]

#loading the test dataset
selected_test = pd.read_csv('Data/selected_test.csv')
X_test = selected_test[selected_train.columns[7:]]

In [5]:
tuned_xgb_br = BinaryRelevance(XGBClassifier(random_state=0, scale_pos_weight=5, max_depth=8, min_child_weight=3, gamma=0.4, colsample_bytree=1.0, subsample=1.0))
tuned_lgbm_br = BinaryRelevance(LGBMClassifier(random_state=0, n_estimators=80, max_depth=6, min_split_gain=0.2, subsample=0.6))
ada_br = BinaryRelevance(AdaBoostClassifier(random_state=0))
rf_br = BinaryRelevance(RandomForestClassifier(random_state = 0,class_weight='balanced',max_depth = 10))

In [6]:
def weighted_average(models, X_train, y_train, X_test, weights):
    models_proba = []
    for model in models:
        model.fit(X_train, y_train)
        predictions_proba = model.predict_proba(X_test)
        models_proba.append(predictions_proba)

    ##### Weigthed Average
    weighted_average = (weights[0]*models_proba[0]+
                        weights[1]*models_proba[1]+
                        weights[2]*models_proba[2])

    weighted_average_proba_df = pd.DataFrame.sparse.from_spmatrix(weighted_average, columns=selected_train.columns[:6])

    #### Get label
    predictions = weighted_average_proba_df
    for label in y_train.columns:
        predictions[label] = np.where(predictions[label] >= 0.5, 1, 0)
    
    return {'predictions': predictions, 'predict_proba': weighted_average_proba_df}

In [7]:
#Overwrite model_evaluation function from helper.py

def model_evaluation(models, train, weights):
    features = train.columns[7:]
    labels = train.columns[:6]
    kf = KFold(n_splits=5)
    validation_scores = pd.DataFrame({'accuracy':[], 'precision_weighted':[], 'recall_weighted':[], 
                                      'f1_weighted':[], 'log_loss':[], 'roc_auc_weighted':[]})
    for train_index, test_index in kf.split(train):
        train_split = train.iloc[train_index]
        test_split = train.iloc[test_index]
        print('Starting fitting...')
        output = weighted_average(models, train_split[features], train_split[labels], test_split[features], weights)
        predictions_proba = output['predict_proba']
        predictions = output['predictions']

        
        scores = [accuracy_score(test_split[labels], predictions), precision_score(test_split[labels], predictions, average="weighted"),
                  recall_score(test_split[labels], predictions, average="weighted"), 
                  f1_score(test_split[labels], predictions, average="weighted"),
                  log_loss(test_split[labels], predictions_proba), 
                  roc_auc_score(test_split[labels], predictions_proba, average="weighted")]
        validation_scores.loc[len(validation_scores)] = scores
        print(f'Evaluation Scores:\n{scores}\n')
    return validation_scores   

# Ensemble 1 (LGBM, XGBoost, and AdaBoost)

In [158]:
models_1 = [tuned_xgb_br, tuned_lgbm_br, ada_br]
ensemble_1 = model_evaluation(models_1, selected_train, [1/3, 1/3, 1/3])

Starting fitting...




Evaluation Scores:
[0.9119222935923547, 0.754946406353802, 0.6651691702934157, 0.7021977828639927, 1.4057302468285604, 0.8255746828331806]

Starting fitting...




Evaluation Scores:
[0.9134549100708154, 0.7502204823653467, 0.66490501356949, 0.7018691335681302, 1.3289470452669154, 0.8255971886659689]

Starting fitting...




Evaluation Scores:
[0.9148022811305383, 0.757293095682153, 0.6647662485746865, 0.7052760821303976, 1.3712369795827541, 0.8256343876074247]

Starting fitting...




Evaluation Scores:
[0.9133922416494329, 0.747938829398676, 0.6738006320022982, 0.7029314833343828, 1.3302525631726532, 0.8294960570341356]

Starting fitting...




Evaluation Scores:
[0.9134549100708154, 0.7523341611602621, 0.6568038879359634, 0.6960963316245494, 1.406448845288263, 0.8215171570131606]



In [168]:
ensemble_1

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.911922,0.754946,0.665169,0.702198,1.40573,0.825575
1,0.913455,0.75022,0.664905,0.701869,1.328947,0.825597
2,0.914802,0.757293,0.664766,0.705276,1.371237,0.825634
3,0.913392,0.747939,0.673801,0.702931,1.330253,0.829496
4,0.913455,0.752334,0.656804,0.696096,1.406449,0.821517


In [169]:
#Different weight - more weight on models with higher F1 and lower log loss score
ensemble_3 = model_evaluation(models_1, selected_train, [3/8, 3/8, 1/4])

Starting fitting...
Evaluation Scores:
[0.9119222935923547, 0.7548049521047272, 0.6654499508634003, 0.7023500447420293, 1.4067145375433694, 0.8257111497132608]

Starting fitting...
Evaluation Scores:
[0.9135175784921978, 0.7502455869048043, 0.66490501356949, 0.7018781402906501, 1.3289761457636213, 0.8255971886659689]

Starting fitting...
Evaluation Scores:
[0.9146769442877734, 0.7571111342619925, 0.6653363740022805, 0.7054990718595194, 1.3690261996771658, 0.8258933690712769]

Starting fitting...
Evaluation Scores:
[0.9135175784921978, 0.7483990289562756, 0.6743751795461075, 0.7034950748416028, 1.3269237742807614, 0.8297833308060402]

Starting fitting...
Evaluation Scores:
[0.9134549100708154, 0.7514657382906181, 0.657661520869068, 0.6963729172506641, 1.404309289832919, 0.8219223825705774]



In [170]:
ensemble_3

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.911922,0.754805,0.66545,0.70235,1.406715,0.825711
1,0.913518,0.750246,0.664905,0.701878,1.328976,0.825597
2,0.914677,0.757111,0.665336,0.705499,1.369026,0.825893
3,0.913518,0.748399,0.674375,0.703495,1.326924,0.829783
4,0.913455,0.751466,0.657662,0.696373,1.404309,0.821922


# Ensemble 1 (LGBM, XGBoost, and RandomForest)

In [166]:
models_2 = [tuned_xgb_br, tuned_lgbm_br, rf_br]
ensemble_2_score = model_evaluation(models_2, selected_train, [1/3, 1/3, 1/3])

Starting fitting...
Evaluation Scores:
[0.9088829703900987, 0.7171212386129368, 0.7063035237961534, 0.710014204513782, 1.19710448628984, 0.8444041618780586]

Starting fitting...
Evaluation Scores:
[0.9087234442564391, 0.7116427769060432, 0.7026139122982431, 0.7055123418600046, 1.1745058481850823, 0.8425881809276611]

Starting fitting...
Evaluation Scores:
[0.9105094942658395, 0.7239011433602807, 0.7092360319270239, 0.715950358446648, 1.1864606764488084, 0.846191897954615]

Starting fitting...
Evaluation Scores:
[0.910039481105471, 0.7133286019863341, 0.7150244182706119, 0.7120469817612871, 1.1371533756642025, 0.848465472953013]

Starting fitting...
Evaluation Scores:
[0.9099768126840885, 0.715856514948613, 0.7012578616352201, 0.7065125100938601, 1.2092403059317378, 0.8419053972410111]



In [167]:
ensemble_2_score

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.908883,0.717121,0.706304,0.710014,1.197104,0.844404
1,0.908723,0.711643,0.702614,0.705512,1.174506,0.842588
2,0.910509,0.723901,0.709236,0.71595,1.186461,0.846192
3,0.910039,0.713329,0.715024,0.712047,1.137153,0.848465
4,0.909977,0.715857,0.701258,0.706513,1.20924,0.841905


In [21]:
models_2 = [tuned_xgb_br, tuned_lgbm_br, rf_br]
ensemble_4_score = model_evaluation(models_2, selected_train, [3/8, 3/8, 1/4])

Starting fitting...
Evaluation Scores:
[0.9105749647501175, 0.7314768597251409, 0.6945107398568019, 0.7102751027921048, 1.264944571182985, 0.8391750636619557]

Starting fitting...
Evaluation Scores:
[0.9104781600551483, 0.722356029386905, 0.6906156263390945, 0.7045121751153328, 1.2223511036683345, 0.8371951699265464]

Starting fitting...
Evaluation Scores:
[0.9118255311148712, 0.7355584749098699, 0.6954104903078677, 0.7140208847251024, 1.2422051805621837, 0.8398707493019653]

Starting fitting...
Evaluation Scores:
[0.9111048442689729, 0.7225871718444269, 0.7032461936225223, 0.7099529325522982, 1.2011140143614047, 0.8430543195020472]

Starting fitting...
Evaluation Scores:
[0.9109168390048255, 0.7271791476540261, 0.6878216123499142, 0.7043033239411101, 1.268764095733869, 0.8357408781202781]



In [None]:
ensemble_4_score

### Store the Result

In [23]:
scores_collection = pd.DataFrame({'model name': [], 'accuracy':[], 'precision_weighted':[], 'recall_weighted':[], 
                                      'f1_weighted':[], 'log_loss':[], 'roc_auc_weighted':[]})
scores_collection.loc[len(scores_collection)] = ['Ensemble 1 (XGBoost, LGBM, AdaBoost) with equal weight']+np.mean(ensemble_1).values.tolist()
scores_collection.loc[len(scores_collection)] = ['Ensemble 2 (XGBoost, LGBM, AdaBoost) with different weight']+np.mean(ensemble_3).values.tolist()
scores_collection.loc[len(scores_collection)] = ['Ensemble 3 (XGBoost, LGBM, RandomForest) with equal weight']+np.mean(ensemble_2_score).values.tolist()
scores_collection.loc[len(scores_collection)] = ['Ensemble 4 (XGBoost, LGBM, RandomForest) with different weight']+np.mean(ensemble_4_score).values.tolist()

In [24]:
scores_collection

Unnamed: 0,model name,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,"Ensemble 1 (XGBoost, LGBM, AdaBoost) with equa...",0.913405,0.752546,0.665089,0.701674,1.368523,0.825564
1,"Ensemble 2 (XGBoost, LGBM, AdaBoost) with diff...",0.913418,0.752405,0.665546,0.701919,1.36719,0.82578
2,"Ensemble 3 (XGBoost, LGBM, RandomForest) with ...",0.909626,0.71637,0.706887,0.710007,1.180893,0.844709
3,"Ensemble 4 (XGBoost, LGBM, RandomForest) with ...",0.91098,0.727832,0.694321,0.708613,1.239876,0.839007


In [25]:
#Save scores dataframe
scores_collection.to_csv('Ensemble Evaluation.csv', index=False)

## Evaluate on test dataset

Evalute the best performing ensemble model, Ensemble 3, based on its validation score (F1 and log loss) on test dataset.

In [28]:
#Test the best performing ensemble model on test dataset
test_output_prediction = weighted_average(models_2, X_train, y_train, X_test, [1/3,1/3,1/3])

In [41]:
get_evaluation_score(selected_test[selected_test.columns[:6]], scipy.sparse.csr_matrix(test_output_prediction['predictions'].values), test_output_prediction['predict_proba'].to_numpy())

Accuracy score:  0.867704523429929
Precision score:  0.5284269189758418
Recall score:  0.7473444613050075
F1 score:  0.6160023199223558
Confusion matrix for label toxic:
[[53652  4236]
 [ 1151  4939]]
Confusion matrix for label severe_toxic:
[[63129   482]
 [  153   214]]
Confusion matrix for label obscene:
[[57846  2441]
 [  803  2888]]
Confusion matrix for label threat:
[[63673    94]
 [  141    70]]
Confusion matrix for label insult:
[[58423  2128]
 [  957  2470]]
Confusion matrix for label identity_hate:
[[62986   280]
 [  458   254]]
Logarithmic Loss:  1.233934206156114
ROC AUC score:  0.8487802053438396
