In [21]:
import pandas as pd
import numpy as np

#Import saved models
import joblib

# for KF validation
from sklearn.model_selection import KFold

from copy import deepcopy

# for multi-label classification
from skmultilearn.problem_transform import BinaryRelevance

#for LightGBM
from lightgbm import LGBMClassifier

# for random forest
from sklearn.ensemble import RandomForestClassifier

# for adaboost
from sklearn.ensemble import AdaBoostClassifier

# for xgboost
from xgboost import XGBClassifier

#for evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
import scipy

# for evaluation metrics
%run -i helper_functions.py

import warnings
warnings.filterwarnings("ignore")

In [2]:
#Load train data
# Use a forward slash, exactly like the test file below: a backslash in this
# literal is an invalid escape sequence ('\s') and only resolves as a path
# separator on Windows.
selected_train = pd.read_csv('Data/selected_train.csv')
# The first six columns are the labels; features start at column index 7.
# NOTE(review): column index 6 is used neither as a label nor as a feature - confirm this is intended.
X_train =  selected_train[selected_train.columns[7:]]
y_train = selected_train[selected_train.columns[:6]]

#loading the test dataset
selected_test = pd.read_csv('Data/selected_test.csv')
# Select the test features by the *training* column names so that both
# feature matrices share the same columns in the same order.
X_test = selected_test[selected_train.columns[7:]]

In [3]:
# Each base classifier is wrapped in BinaryRelevance, i.e. one independent
# binary classifier is trained per label column.

# XGBoost with its hyper-parameters set explicitly.
tuned_xgb_br = BinaryRelevance(
    XGBClassifier(
        random_state=0,
        scale_pos_weight=5,
        max_depth=8,
        min_child_weight=3,
        gamma=0.4,
        colsample_bytree=1.0,
        subsample=1.0,
    )
)

# LightGBM with its hyper-parameters set explicitly.
tuned_lgbm_br = BinaryRelevance(
    LGBMClassifier(
        random_state=0,
        n_estimators=80,
        max_depth=6,
        min_split_gain=0.2,
        subsample=0.6,
    )
)

# AdaBoost with library defaults (only the seed is fixed).
ada_br = BinaryRelevance(
    AdaBoostClassifier(random_state=0)
)

# Random forest with balanced class weights and a depth limit of 10.
rf_br = BinaryRelevance(
    RandomForestClassifier(
        random_state=0,
        class_weight='balanced',
        max_depth=10,
    )
)

In [4]:
def weighted_average(models, X_train, y_train, X_test, weights):
    """Fit every model and blend their predicted probabilities with fixed weights.

    Parameters
    ----------
    models : sequence of estimators
        Each estimator must provide ``fit(X, y)`` and ``predict_proba(X)``.
        ``predict_proba`` has to return a scipy sparse matrix with one column
        per label, because the blended result is converted with
        ``pd.DataFrame.sparse.from_spmatrix``. The estimators are fitted in
        place.
    X_train : DataFrame
        Training features.
    y_train : DataFrame
        Training labels, one column per label. Its column names are used to
        name the output columns.
    X_test : DataFrame
        Features to predict on.
    weights : sequence of float
        One weight per model, in the same order as ``models``. The weights are
        applied as given; they are not normalised here.

    Returns
    -------
    dict
        ``'predict_proba'``: sparse DataFrame holding the weighted sum of the
        models' probabilities; ``'predictions'``: DataFrame of 0/1 labels
        obtained by thresholding those probabilities at 0.5.

    Raises
    ------
    ValueError
        If ``models`` is empty or ``weights`` does not have one entry per model.
    """
    if len(models) == 0:
        raise ValueError("weighted_average needs at least one model")
    if len(models) != len(weights):
        raise ValueError(
            f"expected one weight per model, got {len(models)} models "
            f"and {len(weights)} weights"
        )

    models_proba = []
    for model in models:
        model.fit(X_train, y_train)
        models_proba.append(model.predict_proba(X_test))

    ##### Weighted average
    # Accumulate left to right so any number of models is supported (the
    # summation order matches the previous hard-coded 3- and 4-model formulas).
    blended_proba = weights[0] * models_proba[0]
    for weight, proba in zip(weights[1:], models_proba[1:]):
        blended_proba = blended_proba + weight * proba

    # Name the columns after the labels that were actually passed in rather
    # than relying on a notebook-level global.
    weighted_average_proba_df = pd.DataFrame.sparse.from_spmatrix(blended_proba, columns=y_train.columns)

    #### Get label
    # Work on a copy so the probability frame that is returned stays untouched.
    predictions = deepcopy(weighted_average_proba_df)
    for label in y_train.columns:
        predictions[label] = np.where(predictions[label] >= 0.5, 1, 0)

    return {'predictions': predictions, 'predict_proba': weighted_average_proba_df}

In [5]:
#Override the model_evaluation function from the helper script
#(helper_functions.py is %run in the imports cell)

def model_evaluation(models, train, weights):
    """Cross-validate a weighted-average ensemble with 5 folds.

    The first six columns of ``train`` are taken as labels and every column
    from index 7 onwards as a feature. For each fold produced by
    ``KFold(n_splits=5)`` (no shuffling is requested) the ensemble is fitted
    on the training part through ``weighted_average`` and scored on the
    held-out part.

    Parameters
    ----------
    models : list of estimators handed to ``weighted_average``.
    train : DataFrame containing both label and feature columns.
    weights : list of float, one weight per model.

    Returns
    -------
    DataFrame
        One row per fold with the columns accuracy, precision_weighted,
        recall_weighted, f1_weighted, log_loss and roc_auc_weighted.
    """
    feature_cols = train.columns[7:]
    label_cols = train.columns[:6]

    metric_names = ['accuracy', 'precision_weighted', 'recall_weighted',
                    'f1_weighted', 'log_loss', 'roc_auc_weighted']
    validation_scores = pd.DataFrame({metric: [] for metric in metric_names})

    splitter = KFold(n_splits=5)
    for fit_idx, holdout_idx in splitter.split(train):
        fit_part = train.iloc[fit_idx]
        holdout_part = train.iloc[holdout_idx]

        print('Starting fitting...')
        blended = weighted_average(
            models,
            fit_part[feature_cols],
            fit_part[label_cols],
            holdout_part[feature_cols],
            weights,
        )
        hard_labels = blended['predictions']
        proba = blended['predict_proba'].to_numpy()
        y_true = holdout_part[label_cols]

        # Keep the metric order in sync with ``metric_names`` above.
        scores = [
            accuracy_score(y_true, hard_labels),
            precision_score(y_true, hard_labels, average="weighted"),
            recall_score(y_true, hard_labels, average="weighted"),
            f1_score(y_true, hard_labels, average="weighted"),
            log_loss(y_true, proba),
            roc_auc_score(y_true, proba, average="weighted"),
        ]
        # Append this fold's scores as the next row.
        validation_scores.loc[len(validation_scores)] = scores
        print(f'Evaluation Scores:\n{scores}\n')

    return validation_scores

# Ensemble 1 (LGBM, XGBoost, and AdaBoost)

In [14]:
# Model set 1: XGBoost, LightGBM and AdaBoost.
models_1 = [tuned_xgb_br, tuned_lgbm_br, ada_br]
# Equal weights; weights are positional, i.e. they follow the order of models_1.
ensemble_1 = model_evaluation(models_1, selected_train, [1/3, 1/3, 1/3])

Starting fitting...
Evaluation Scores:
[0.9119222935923547, 0.754946406353802, 0.6651691702934157, 0.7021977828639927, 0.3265233787322905, 0.9711349505881992]

Starting fitting...
Evaluation Scores:
[0.9134549100708154, 0.7502204823653467, 0.66490501356949, 0.7018691335681302, 0.320638626313969, 0.9700226352306222]

Starting fitting...
Evaluation Scores:
[0.9148022811305383, 0.757293095682153, 0.6647662485746865, 0.7052760821303976, 0.3209236961756583, 0.9713181243284671]

Starting fitting...
Evaluation Scores:
[0.9133922416494329, 0.747938829398676, 0.6738006320022982, 0.7029314833343828, 0.317474653433656, 0.9720461880206995]

Starting fitting...
Evaluation Scores:
[0.9134549100708154, 0.7523341611602621, 0.6568038879359634, 0.6960963316245494, 0.3207363541372396, 0.9693666189226791]



In [15]:
ensemble_1

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.911922,0.754946,0.665169,0.702198,0.326523,0.971135
1,0.913455,0.75022,0.664905,0.701869,0.320639,0.970023
2,0.914802,0.757293,0.664766,0.705276,0.320924,0.971318
3,0.913392,0.747939,0.673801,0.702931,0.317475,0.972046
4,0.913455,0.752334,0.656804,0.696096,0.320736,0.969367


In [30]:
#Different weight - more weight on models with higher F1 and lower log loss score
# Same model set as ensemble_1 (models_1), weighted 3/8 XGBoost, 3/8 LightGBM, 1/4 AdaBoost.
# NOTE: this result is stored in `ensemble_3` but is labelled 'Ensemble 2' in scores_collection.
ensemble_3 = model_evaluation(models_1, selected_train, [3/8, 3/8, 1/4])

Starting fitting...
Evaluation Scores:
[0.9119222935923547, 0.7548049521047272, 0.6654499508634003, 0.7023500447420293, 0.31861028335383523, 0.9712033187989894]

Starting fitting...
Evaluation Scores:
[0.9135175784921978, 0.7502455869048043, 0.66490501356949, 0.7018781402906501, 0.3128229530501169, 0.97008897169846]

Starting fitting...
Evaluation Scores:
[0.9146769442877734, 0.7571111342619925, 0.6653363740022805, 0.7054990718595194, 0.3129854487602042, 0.971356208770678]

Starting fitting...
Evaluation Scores:
[0.9135175784921978, 0.7483990289562756, 0.6743751795461075, 0.7034950748416028, 0.30948365307716585, 0.9720408311716409]

Starting fitting...
Evaluation Scores:
[0.9134549100708154, 0.7514657382906181, 0.657661520869068, 0.6963729172506641, 0.3129896262168092, 0.9695012080551284]



In [31]:
ensemble_3

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.911922,0.754805,0.66545,0.70235,0.31861,0.971203
1,0.913518,0.750246,0.664905,0.701878,0.312823,0.970089
2,0.914677,0.757111,0.665336,0.705499,0.312985,0.971356
3,0.913518,0.748399,0.674375,0.703495,0.309484,0.972041
4,0.913455,0.751466,0.657662,0.696373,0.31299,0.969501


# Ensemble 2 (LGBM, XGBoost, and RandomForest)

In [6]:
# Model set 2: XGBoost, LightGBM and RandomForest.
models_2 = [tuned_xgb_br, tuned_lgbm_br, rf_br]
# Equal weights, positional with respect to models_2.
# NOTE: this result is stored in `ensemble_2_score` but is labelled 'Ensemble 3' in scores_collection.
ensemble_2_score = model_evaluation(models_2, selected_train, [1/3, 1/3, 1/3])

Starting fitting...
Evaluation Scores:
[0.9088829703900987, 0.7171212386129368, 0.7063035237961534, 0.710014204513782, 0.2968581204455867, 0.9710995001927276]

Starting fitting...
Evaluation Scores:
[0.9087234442564391, 0.7116427769060432, 0.7026139122982431, 0.7055123418600046, 0.29202617320845065, 0.9687695650423149]

Starting fitting...
Evaluation Scores:
[0.9105094942658395, 0.7239011433602807, 0.7092360319270239, 0.715950358446648, 0.292200369000546, 0.97033636633036]

Starting fitting...
Evaluation Scores:
[0.910039481105471, 0.7133286019863341, 0.7150244182706119, 0.7120469817612871, 0.28852250053663386, 0.9720840635978917]

Starting fitting...
Evaluation Scores:
[0.9099768126840885, 0.715856514948613, 0.7012578616352201, 0.7065125100938601, 0.29123433699977064, 0.9690518247520075]



In [7]:
ensemble_2_score

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.908883,0.717121,0.706304,0.710014,0.296858,0.9711
1,0.908723,0.711643,0.702614,0.705512,0.292026,0.96877
2,0.910509,0.723901,0.709236,0.71595,0.2922,0.970336
3,0.910039,0.713329,0.715024,0.712047,0.288523,0.972084
4,0.909977,0.715857,0.701258,0.706513,0.291234,0.969052


In [8]:
# Same model list as in the previous evaluation cell (re-assigned here with identical contents).
models_2 = [tuned_xgb_br, tuned_lgbm_br, rf_br]
# Unequal weights: 3/8 XGBoost, 3/8 LightGBM, 1/4 RandomForest.
# This result is labelled 'Ensemble 4' in scores_collection.
ensemble_4_score = model_evaluation(models_2, selected_train, [3/8, 3/8, 1/4])

Starting fitting...
Evaluation Scores:
[0.9105749647501175, 0.7314768597251409, 0.6945107398568019, 0.7102751027921048, 0.2933830875118954, 0.9713726146996211]

Starting fitting...
Evaluation Scores:
[0.9104781600551483, 0.722356029386905, 0.6906156263390945, 0.7045121751153328, 0.2886264545875376, 0.9691461791913214]

Starting fitting...
Evaluation Scores:
[0.9118255311148712, 0.7355584749098699, 0.6954104903078677, 0.7140208847251024, 0.28866930001546415, 0.9706354397158797]

Starting fitting...
Evaluation Scores:
[0.9111048442689729, 0.7225871718444269, 0.7032461936225223, 0.7099529325522982, 0.28489485187067687, 0.9723262526458304]

Starting fitting...
Evaluation Scores:
[0.9109168390048255, 0.7271791476540261, 0.6878216123499142, 0.7043033239411101, 0.2878407145999973, 0.9693852232675221]



In [9]:
ensemble_4_score

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,0.910575,0.731477,0.694511,0.710275,0.293383,0.971373
1,0.910478,0.722356,0.690616,0.704512,0.288626,0.969146
2,0.911826,0.735558,0.69541,0.714021,0.288669,0.970635
3,0.911105,0.722587,0.703246,0.709953,0.284895,0.972326
4,0.910917,0.727179,0.687822,0.704303,0.287841,0.969385


# Ensemble 3 (All 4 Models)

In [35]:
# Model set 3: all four models (XGBoost, LightGBM, AdaBoost, RandomForest).
models_3 = [tuned_xgb_br, tuned_lgbm_br, ada_br, rf_br]
# Weights follow the order of models_3: 3/8 XGBoost, 3/8 LightGBM, 1/8 AdaBoost, 1/8 RandomForest.
# This result is labelled 'Ensemble 5' in scores_collection.
ensemble_5_score = model_evaluation(models_3, selected_train, [3/8, 3/8, 1/8, 1/8])

Starting fitting...
Evaluation Scores:
[0.9112642957856807, 0.741714598446878, 0.6782254667976976, 0.7052987780064434, 0.3083293513627697, 0.9717370435230349]

Starting fitting...
Evaluation Scores:
[0.9122955442752397, 0.7370709461469593, 0.6786173403799457, 0.7043676635127404, 0.302830951328866, 0.9698231220121846]

Starting fitting...
Evaluation Scores:
[0.9131729021745942, 0.7459379078525786, 0.682012542759407, 0.7110272924723191, 0.3029387674050004, 0.9713199369600063]

Starting fitting...
Evaluation Scores:
[0.9119822021683274, 0.7334937863021092, 0.6890261419132433, 0.7064202387101678, 0.29941894851677797, 0.9727592563568129]

Starting fitting...
Evaluation Scores:
[0.9124208811180047, 0.7394652169295365, 0.6730989136649514, 0.7010650079569917, 0.302880899132374, 0.969898641659977]



# Store the Result

In [16]:
# Summarise every ensemble by the mean of its per-fold validation scores.
# Run this cell only after all ensemble evaluation cells above have been
# (re)executed, otherwise stale fold scores end up in the summary.
ensemble_results = [
    ('Ensemble 1 (XGBoost, LGBM, AdaBoost) with equal weight', ensemble_1),
    ('Ensemble 2 (XGBoost, LGBM, AdaBoost) with different weight', ensemble_3),
    ('Ensemble 3 (XGBoost, LGBM, RandomForest) with equal weight', ensemble_2_score),
    ('Ensemble 4 (XGBoost, LGBM, RandomForest) with different weight', ensemble_4_score),
    ('Ensemble 5 (XGBoost, LGBM, RandomForest, AdaBoost) with different weight', ensemble_5_score),
]

# DataFrame.mean() averages each metric column over the folds. It replaces
# np.mean(frame), which forwards axis=None and, on pandas 2.x, collapses the
# whole frame into a single scalar (so `.values` would fail).
scores_collection = pd.DataFrame(
    [[model_name] + fold_scores.mean().values.tolist()
     for model_name, fold_scores in ensemble_results],
    columns=['model name', 'accuracy', 'precision_weighted', 'recall_weighted',
             'f1_weighted', 'log_loss', 'roc_auc_weighted'],
)

In [17]:
scores_collection

Unnamed: 0,model name,accuracy,precision_weighted,recall_weighted,f1_weighted,log_loss,roc_auc_weighted
0,"Ensemble 1 (XGBoost, LGBM, AdaBoost) with equa...",0.91365,0.754148,0.663738,0.70232,0.314079,0.970701
1,"Ensemble 2 (XGBoost, LGBM, AdaBoost) with diff...",0.91365,0.754148,0.663738,0.70232,0.314079,0.970701
2,"Ensemble 3 (XGBoost, LGBM, RandomForest) with ...",0.909626,0.71637,0.706887,0.710007,0.292168,0.970268
3,"Ensemble 4 (XGBoost, LGBM, RandomForest) with ...",0.91098,0.727832,0.694321,0.708613,0.288683,0.970573
4,"Ensemble 5 (XGBoost, LGBM, RandomForest, AdaBo...",0.912139,0.738342,0.683382,0.706707,0.302587,0.97168


In [18]:
#Save scores dataframe
# Written to the current working directory; index=False leaves the row index out of the CSV.
scores_collection.to_csv('Ensemble Evaluation.csv', index=False)

## Evaluate on test dataset

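Evaluate the best-performing ensemble model on the test dataset. The model chosen is "Ensemble 3" from the score table above (XGBoost, LGBM, and RandomForest with equal weights), selected based on its validation scores (F1 and log loss).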

In [19]:
#Test the best performing ensemble model on test dataset
# Refit models_2 (XGBoost, LightGBM, RandomForest) on the full training set with
# equal weights, then predict on X_test. The result is a dict with the keys
# 'predictions' (0/1 labels) and 'predict_proba' (blended probabilities).
test_output_prediction = weighted_average(models_2, X_train, y_train, X_test, [1/3,1/3,1/3])

In [22]:
# get_evaluation_score is not defined in this notebook - presumably it comes from
# helper_functions.py, which is %run in the imports cell (verify).
# Arguments: the true labels (first six columns of the test set), the 0/1
# predictions converted to a CSR sparse matrix, and the blended probabilities
# as a dense array.
get_evaluation_score(selected_test[selected_test.columns[:6]], scipy.sparse.csr_matrix(test_output_prediction['predictions'].values), test_output_prediction['predict_proba'].to_numpy())

Accuracy score:  0.867704523429929
Precision score:  0.5284269189758418
Recall score:  0.7473444613050075
F1 score:  0.6160023199223558
Confusion matrix for label toxic:
[[53652  4236]
 [ 1151  4939]]
Confusion matrix for label severe_toxic:
[[63129   482]
 [  153   214]]
Confusion matrix for label obscene:
[[57846  2441]
 [  803  2888]]
Confusion matrix for label threat:
[[63673    94]
 [  141    70]]
Confusion matrix for label insult:
[[58423  2128]
 [  957  2470]]
Confusion matrix for label identity_hate:
[[62986   280]
 [  458   254]]
Logarithmic Loss:  0.30335997676023324
ROC AUC score:  0.9602262715453892
