# Importing Packages

In [2]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import pickle

%reload_ext autoreload
%autoreload 2
from utils import *

import smote_variants as sv
import imbalanced_databases as imbd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV,RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report, plot_confusion_matrix, auc, mean_squared_error, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier
from imblearn.metrics import geometric_mean_score

from imblearn.under_sampling import CondensedNearestNeighbour, NearMiss, OneSidedSelection, NeighbourhoodCleaningRule, RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SVMSMOTE
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier

from xgboost.sklearn import XGBClassifier

# Importing Training and Validation Datasets

In [3]:
pickle_in = open("../data/pickles/training_model.pickle","rb")
train = pickle.load(pickle_in)
pickle_in = open("../data/pickles/validate_model.pickle","rb")
validate = pickle.load(pickle_in)

In [4]:
X_train = train.drop(["default"], axis=1)
y_tr = train["default"]
X_validate = validate.drop(["default"], axis=1)
y_val = validate["default"]

In [5]:
X_train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379


In [6]:
X_validate.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1074.16,0,71.61,0,0,25,0,317.38,0.602052,0.704532
1,5370.78,0,151.64,0,0,26,0,4895.86,0.293715,0.088427
2,2506.36,0,111.43,0,0,32,0,2510.73,0.005217,-0.001744
3,4654.68,0,64.74,0,0,49,0,740.38,0.883482,0.840939
4,1790.26,0,53.71,1,1,36,0,3373.85,0.188227,-0.884559


# Standardize Datasets

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_ss = scaler.transform(X_train)
X_val_ss = scaler.transform(X_validate)

In [8]:
scaled = MinMaxScaler()
scaled.fit(X_train)
X_tr = scaled.transform(X_train)
X_val = scaled.transform(X_validate)

# Importing Model

In [9]:
pickle_in = open("../data/pickles/best_model.pickle","rb")
rfcb = pickle.load(pickle_in)

In [10]:
rfcb

RandomForestClassifier(max_depth=8, n_estimators=400)

# Dummy Classifier

In [11]:
dc = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred_dc_tr = dc.predict(X_tr)
y_pred_dc_val = dc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_dc_tr, y_pred_dc_val, dc)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


# Ensemble Methods

## Bagging Classifier

Instead of using a single tree, we will check if an ensemble of decsion tree can actually alleviate the issue induced by the class imbalancing. First, we will use a bagging classifier and its counter part which internally uses a random under-sampling to balanced each boostrap sample.

Balancing each bootstrap sample allows to increase significantly the balanced accuracy and the geometric mean.

In [12]:
bc = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bc_tr = bc.predict(X_tr_ss)
y_pred_bc_val = bc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bc_tr, y_pred_bc_val, bc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bc_val))

Training Accuracy:  0.996904761904762
Validation Accuracy:  0.807
Training F1 Score:  0.9929964443486693
Validation F1 Score:  0.44326923076923075
Training AUC Score:  0.9998954304300327
Validation AUC Score:  0.740359286457933
Training Recall Score:  0.9896907216494846
Validation Recall Score:  0.3511043412033511
Training Precision Score:  0.9963243243243243
Validation Precision Score:  0.6010430247718384
Training Average Precision Score:  0.9996190235446103
Validation Average Precision Score:  0.48114820511254575

Training Balanced Accuracy:  0.9943252922980659
Training Geometric Mean:  0.9943144913248209
Validation Balanced Accuracy:  0.6429086886302653
Validation Geometric Mean:  0.5728715429649479


## Balanced Bagging Classifier

A Bagging classifier with additional balancing.

This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a given sampler.

This classifier can serves as a basis to implement various methods such as Exactly Balanced Bagging [6], Roughly Balanced Bagging [7], Over-Bagging [6], or SMOTE-Bagging [8].

In [13]:
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bbc_tr = bbc.predict(X_tr_ss)
y_pred_bbc_val = bbc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc_tr, y_pred_bbc_val, bbc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bbc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bbc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bbc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bbc_val))

Training Accuracy:  0.9383333333333334
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8778186621379375
Validation F1 Score:  0.5111799105607154
Training AUC Score:  0.9955504481714446
Validation AUC Score:  0.7528188434539899
Training Recall Score:  0.9991408934707904
Validation Recall Score:  0.5658796648895659
Training Precision Score:  0.7827696449604576
Validation Precision Score:  0.46612296110414053
Training Average Precision Score:  0.9819759873453777
Validation Average Precision Score:  0.499848233823849

Training Balanced Accuracy:  0.9600758309742596
Training Geometric Mean:  0.9592807316490086
Validation Balanced Accuracy:  0.6921568155896517
Validation Geometric Mean:  0.680540328399629


## Balanced Bagging Classifier with Gradient Boosting Classifier

In [14]:
bbc3 = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2).fit(X_tr_ss, y_tr)
y_pred_bbc3_tr = bbc3.predict(X_tr_ss)
y_pred_bbc3_val = bbc3.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc3_tr, y_pred_bbc3_val, bbc3)

Training Accuracy:  0.793
Validation Accuracy:  0.7533333333333333
Training F1 Score:  0.6030499497762761
Validation F1 Score:  0.5271565495207667
Training AUC Score:  0.8560039766792316
Validation AUC Score:  0.7749874838134551
Training Recall Score:  0.709192439862543
Validation Recall Score:  0.6283320639756284
Training Precision Score:  0.5245432883240667
Validation Precision Score:  0.45404512933406715
Training Average Precision Score:  0.680188981227117
Validation Average Precision Score:  0.5372388529575939


## Balanced Random Forest Classifier

Random forest is another popular ensemble method and it is usually outperforming bagging. Here, we used a vanilla random forest and its balanced counterpart in which each bootstrap sample is balanced.

Similarly to the previous experiment, the balanced classifier outperform the classifier which learn from imbalanced bootstrap samples. In addition, random forest outsperforms the bagging classifier.

In [15]:
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_tr)
y_pred_brf_tr = brf.predict(X_tr)
y_pred_brf_val = brf.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_brf_tr, y_pred_brf_val, brf)

Training Accuracy:  0.888
Validation Accuracy:  0.724
Training F1 Score:  0.7983539094650206
Validation F1 Score:  0.5083135391923991
Training AUC Score:  0.9928255616361626
Validation AUC Score:  0.7643157793647772
Training Recall Score:  1.0
Validation Recall Score:  0.6519421172886519
Training Precision Score:  0.6643835616438356
Validation Precision Score:  0.41654501216545015
Training Average Precision Score:  0.973183893912578
Validation Average Precision Score:  0.5111466083509113


## RUSBoostClassifier

Random under-sampling integrated in the learning of AdaBoost.

During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm

In [16]:
rbc = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', random_state=42).fit(X_tr, y_tr)
y_pred_rbc_tr = rbc.predict(X_tr)
y_pred_rbc_val = rbc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_rbc_tr, y_pred_rbc_val, rbc)

Training Accuracy:  0.7674761904761904
Validation Accuracy:  0.7566666666666667
Training F1 Score:  0.5456406439006235
Validation F1 Score:  0.5197368421052632
Training AUC Score:  0.7936512282426961
Validation AUC Score:  0.7667346817069983
Training Recall Score:  0.6297250859106529
Validation Recall Score:  0.6016755521706016
Training Precision Score:  0.48136594976194386
Validation Precision Score:  0.45744064852345107
Training Average Precision Score:  0.5609306330020786
Validation Average Precision Score:  0.5244608728332589


## Easy Ensemble Classifier

Bag of balanced boosted learners also known as EasyEnsemble.

This algorithm is known as EasyEnsemble [1]. The classifier is an ensemble of AdaBoost learners trained on different balanced boostrap samples. The balancing is achieved by random under-sampling.

In [17]:
eec = EasyEnsembleClassifier(random_state=42).fit(X_tr, y_tr) 
y_pred_eec_tr = eec.predict(X_tr)
y_pred_eec_val = eec.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_eec_tr, y_pred_eec_val, eec)

Training Accuracy:  0.7592380952380953
Validation Accuracy:  0.7516666666666667
Training F1 Score:  0.5427744619280159
Validation F1 Score:  0.5278833967046895
Training AUC Score:  0.790370227396205
Validation AUC Score:  0.7749849651391096
Training Recall Score:  0.6445446735395189
Validation Recall Score:  0.6344249809596344
Training Precision Score:  0.46875976257419555
Validation Precision Score:  0.45198046663049374
Training Average Precision Score:  0.5504213877594832
Validation Average Precision Score:  0.5262955379856071


# Undersampling Methods

TomekLinks

In [18]:
tl = TomekLinks()
sampling(X_tr, y_tr, X_val, y_val, tl, rfcb)

Training Count:  Counter({0: 14764, 1: 4656})
Validation Count:  Counter({0: 4264, 1: 1313})
Training Accuracy:  0.8414521112255406
Validation Accuracy:  0.8221265913573605
Training F1 Score:  0.5748999033549634
Validation F1 Score:  0.512291052114061
Training AUC Score:  0.8410875276748216
Validation AUC Score:  0.7956881252420235
Training Recall Score:  0.44716494845360827
Validation Recall Score:  0.3968012185833968
Training Precision Score:  0.8047931967529958
Validation Precision Score:  0.7226074895977809
Training Average Precision Score:  0.7079040437507254
Validation Average Precision Score:  0.6005496054547611


In [19]:
X_tr_tl, y_tr_tl = tl.fit_resample(X_tr, y_tr)
X_val_tl, y_val_tl = tl.fit_resample(X_val, y_val)
tlm = rfcb.fit(X_tr_tl, y_tr_tl)
y_pred_tl_tr = tlm.predict(X_tr_tl)
y_pred_tl_val = tlm.predict(X_val_tl)

In [20]:
enn = EditedNearestNeighbours()
sampling(X_tr, y_tr, X_val, y_val, enn, rfcb)

Training Count:  Counter({0: 9976, 1: 4656})
Validation Count:  Counter({0: 2831, 1: 1313})
Training Accuracy:  0.8467741935483871
Validation Accuracy:  0.8221525096525096
Training F1 Score:  0.7060828526481384
Validation F1 Score:  0.6592695330559408
Training AUC Score:  0.8899022279760085
Validation AUC Score:  0.8499417422654147
Training Recall Score:  0.5783934707903781
Validation Recall Score:  0.543031226199543
Training Precision Score:  0.9061238223418573
Validation Precision Score:  0.8388235294117647
Training Average Precision Score:  0.8493028729475788
Validation Average Precision Score:  0.7953422050446671


In [21]:
X_tr_enn, y_tr_enn = enn.fit_resample(X_tr, y_tr)
X_val_enn, y_val_enn = enn.fit_resample(X_val, y_val)
ennm = rfcb.fit(X_tr_enn, y_tr_enn)
y_pred_enn_tr = ennm.predict(X_tr_enn)
y_pred_enn_val = ennm.predict(X_val_enn)

# Oversampling Methods

In [22]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
sampling(X_tr, y_tr, X_val, y_val, sm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7600954478707783
Validation Accuracy:  0.7209302325581395
Training F1 Score:  0.7422260206429557
Validation F1 Score:  0.6938202247191011
Training AUC Score:  0.8424750938821697
Validation AUC Score:  0.8001881284519292
Training Recall Score:  0.6907733724914341
Validation Recall Score:  0.6323874546618306
Training Precision Score:  0.8019605057536582
Validation Precision Score:  0.7684729064039408
Training Average Precision Score:  0.8496452993954169
Validation Average Precision Score:  0.808761409308747


In [23]:
X_tr_sm, y_tr_sm = sm.fit_resample(X_tr, y_tr)
X_val_sm, y_val_sm = sm.fit_resample(X_val, y_val)
smm = rfcb.fit(X_tr_sm, y_tr_sm)
y_pred_sm_tr = smm.predict(X_tr_sm)
y_pred_sm_val = smm.predict(X_val_sm)

In [24]:
svmsm = SVMSMOTE()
sampling(X_tr, y_tr, X_val, y_val, svmsm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.8128059226627509
Validation Accuracy:  0.7801365478984424
Training F1 Score:  0.799777494191944
Validation F1 Score:  0.7600977767431033
Training AUC Score:  0.8958704487302966
Validation AUC Score:  0.8581061134964274
Training Recall Score:  0.7477361722956437
Validation Recall Score:  0.6966076381480691
Training Precision Score:  0.8596046986002673
Validation Precision Score:  0.8363217213114754
Training Average Precision Score:  0.9060622836348282
Validation Average Precision Score:  0.8705240938339492


In [25]:
X_tr_svm, y_tr_svm = svmsm.fit_resample(X_tr, y_tr)
X_val_svm, y_val_svm = svmsm.fit_resample(X_val, y_val)
svmsmote = rfcb.fit(X_tr_svm, y_tr_svm)
y_pred_svmsm_tr = svmsmote.predict(X_tr_svm)
y_pred_svmsm_val = svmsmote.predict(X_val_svm)

In [26]:
adsn = ADASYN()
sampling(X_tr, y_tr, X_val, y_val, adsn, rfcb)

Training Count:  Counter({0: 16344, 1: 15798})
Validation Count:  Counter({0: 4687, 1: 4539})
Training Accuracy:  0.733308443780723
Validation Accuracy:  0.6947756340776068
Training F1 Score:  0.7128692972466003
Validation F1 Score:  0.6658756525866161
Training AUC Score:  0.8099216594763416
Validation AUC Score:  0.7643251881507884
Training Recall Score:  0.6735662742119256
Validation Recall Score:  0.6181978409341264
Training Precision Score:  0.7570432555492317
Validation Precision Score:  0.7215222422216508
Training Average Precision Score:  0.8049168447325794
Validation Average Precision Score:  0.7627422987392249


In [27]:
X_tr_adsn, y_tr_adsn = adsn.fit_resample(X_tr, y_tr)
X_val_adsn, y_val_adsn = adsn.fit_resample(X_val, y_val)
adsnm = rfcb.fit(X_tr_adsn, y_tr_adsn)
y_pred_adsnm_tr = adsnm.predict(X_tr_adsn)
y_pred_adsnm_val = adsnm.predict(X_val_adsn)

# Combined Methods

In [28]:
smtk = SMOTETomek()
sampling(X_tr, y_tr, X_val, y_val, smtk, rfcb)

Training Count:  Counter({0: 15451, 1: 15451})
Validation Count:  Counter({0: 4406, 1: 4406})
Training Accuracy:  0.7729273186201541
Validation Accuracy:  0.7347934634589196
Training F1 Score:  0.7573567550745184
Validation F1 Score:  0.7120147874306838
Training AUC Score:  0.8564985131678327
Validation AUC Score:  0.8156725529438357
Training Recall Score:  0.7087567147757426
Validation Recall Score:  0.6556967771221062
Training Precision Score:  0.8131125631125631
Validation Precision Score:  0.778916149905635
Training Average Precision Score:  0.862713572916824
Validation Average Precision Score:  0.8241649851590853


In [29]:
X_tr_smt, y_tr_smt = smtk.fit_resample(X_tr, y_tr)
X_val_smt, y_val_smt = smtk.fit_resample(X_val, y_val)
smtkm = rfcb.fit(X_tr_smt, y_tr_smt)
y_pred_smtk_tr = smtkm.predict(X_tr_smt)
y_pred_smtk_val = smtkm.predict(X_val_smt)

In [30]:
smenn = SMOTEENN(sampling_strategy="minority", n_jobs= -1)
sampling(X_tr, y_tr, X_val, y_val, smenn, rfcb)

Training Count:  Counter({1: 10908, 0: 8566})
Validation Count:  Counter({1: 3104, 0: 2390})
Training Accuracy:  0.8652048885693745
Validation Accuracy:  0.823443756825628
Training F1 Score:  0.8735853599807368
Validation F1 Score:  0.8328162702516374
Training AUC Score:  0.9471770499876667
Validation AUC Score:  0.9139590432644609
Training Recall Score:  0.8314998166483315
Validation Recall Score:  0.7783505154639175
Training Precision Score:  0.9201582631632342
Validation Precision Score:  0.8954781319495922
Training Average Precision Score:  0.9634523631084815
Validation Average Precision Score:  0.9411334223073468


In [31]:
X_tr_sme, y_tr_sme = smenn.fit_resample(X_tr, y_tr)
X_val_sme, y_val_sme = smenn.fit_resample(X_val, y_val)
smennm = rfcb.fit(X_tr_sme, y_tr_sme)
y_pred_smenn_tr = smennm.predict(X_tr_sme)
y_pred_smenn_val = smennm.predict(X_val_sme)

# SMOTE Variants

In [32]:
pfsm = sv.polynom_fit_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, pfsm, rfcb)

2021-04-15 08:54:08,385:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
2021-04-15 08:54:08,442:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
Training Count:  Counter({1: 18624, 0: 16344})
Validation Count:  Counter({1: 5252, 0: 4687})
Training Accuracy:  0.898535804163807
Validation Accuracy:  0.8921420666062985
Training F1 Score:  0.8992159981820247
Validation F1 Score:  0.891739042617653
Training AUC Score:  0.9569829305141351
Validation AUC Score:  0.9448800306660788
Training Recall Score:  0.8498711340206185
Validation Recall Score:  0.8406321401370906
Training Precision Score:  0.9546441495778046
Validation Precision Score:  0.9494623655913978
Training Average Precision Score:  0.9705040305862509
Validation Average Precision Score:  0.9629846738573772


In [33]:
X_tr_pfs, y_tr_pfs = pfsm.sample(X_tr, y_tr)
X_val_pfs, y_val_pfs = pfsm.sample(X_val, y_val)
pfsmm = rfcb.fit(X_tr_pfs, y_tr_pfs)
y_pred_pfsm_tr = pfsmm.predict(X_tr_pfs)
y_pred_pfsm_val = pfsmm.predict(X_val_pfs)

2021-04-15 08:54:20,196:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
2021-04-15 08:54:20,245:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")


In [34]:
pws = sv.ProWSyn()
sampling2(X_tr, y_tr, X_val, y_val, pws, rfcb)

2021-04-15 08:54:30,337:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
2021-04-15 08:54:31,685:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.8672295643661282
Validation Accuracy:  0.8580115212289311
Training F1 Score:  0.8582533150434385
Validation F1 Score:  0.8477290927811464
Training AUC Score:  0.9343193083739972
Validation AUC Score:  0.9170845516032912
Training Recall Score:  0.8039035731767009
Validation Recall Score:  0.7904843183272883
Training Precision Score:  0.9204847975339778
Validation Precision Score:  0.9139121854958067
Training Average Precision Score:  0.9477157980445392
Validation Average Precision Score:  0.9366948692015955


In [35]:
X_tr_pws, y_tr_pws = pws.sample(X_tr, y_tr)
X_val_pws, y_val_pws = pws.sample(X_val, y_val)
pwsm = rfcb.fit(X_tr_pws, y_tr_pws)
y_pred_pws_tr = pwsm.predict(X_tr_pws)
y_pred_pws_val = pwsm.predict(X_val_pws)

2021-04-15 08:54:46,125:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
2021-04-15 08:54:47,466:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")


In [36]:
smipf = sv.SMOTE_IPF()
sampling2(X_tr, y_tr, X_val, y_val, smipf, rfcb)

2021-04-15 08:54:59,803:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 08:54:59,804:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-15 08:55:01,961:INFO:SMOTE_IPF: Removing 44 elements
2021-04-15 08:55:03,824:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 08:55:05,653:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 08:55:05,660:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 08:55:05,660:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
Training Count:  Counter({0: 16337, 1: 16307})
2021-04-15 08:55:06,167:INFO:SMOTE_IPF: Removing 1 elements
2021-04-15 08:55:06,57

In [37]:
X_tr_smi, y_tr_smi = smipf.sample(X_tr, y_tr)
X_val_smi, y_val_smi = smipf.sample(X_val, y_val)
smipfm = rfcb.fit(X_tr_smi, y_tr_smi)
y_pred_smipf_tr = smipfm.predict(X_tr_smi)
y_pred_smipf_val = smipfm.predict(X_val_smi)

2021-04-15 08:55:20,587:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 08:55:20,588:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-15 08:55:22,745:INFO:SMOTE_IPF: Removing 44 elements
2021-04-15 08:55:24,571:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 08:55:26,407:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 08:55:26,409:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 08:55:26,410:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-15 08:55:26,920:INFO:SMOTE_IPF: Removing 1 elements
2021-04-15 08:55:27,335:INFO:SMOTE_IPF: Removing 0 elements
2021-04-1

In [38]:
smobd = sv.SMOBD()
sampling2(X_tr, y_tr, X_val, y_val, smobd, rfcb)

2021-04-15 08:55:39,167:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
2021-04-15 08:55:51,270:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7809287812041116
Validation Accuracy:  0.7561339876253467
Training F1 Score:  0.7720515677224257
Validation F1 Score:  0.7453775896636221
Training AUC Score:  0.8661376877493652
Validation AUC Score:  0.8348050063253458
Training Recall Score:  0.7419848262359275
Validation Recall Score:  0.7138894815446981
Training Precision Score:  0.8046579523588349
Validation Precision Score:  0.7797716150081566
Training Average Precision Score:  0.8726066884380497
Validation Average Precision Score:  0.8436539856446121


In [39]:
X_tr_smo, y_tr_smo = smobd.sample(X_tr, y_tr)
X_val_smo, y_val_smo = smobd.sample(X_val, y_val)
smobdm = rfcb.fit(X_tr_smo, y_tr_smo)
y_pred_smobd_tr = smobdm.predict(X_tr_smo)
y_pred_smobd_val = smobdm.predict(X_val_smo)

2021-04-15 08:56:06,572:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
2021-04-15 08:56:19,185:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")


In [40]:
gsm = sv.G_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, gsm, rfcb)

2021-04-15 08:56:31,848:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
2021-04-15 08:56:32,912:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7668257464512971
Validation Accuracy:  0.7330915297631747
Training F1 Score:  0.749375246613179
Validation F1 Score:  0.7091374099046733
Training AUC Score:  0.8486447495015992
Validation AUC Score:  0.8062866439769649
Training Recall Score:  0.6971977484092021
Validation Recall Score:  0.6507360785150416
Training Precision Score:  0.8099943133352289
Validation Precision Score:  0.7790549169859514
Training Average Precision Score:  0.8564941379794629
Validation Average Precision Score:  0.816062280058713


In [41]:
X_tr_gsm, y_tr_gsm = gsm.sample(X_tr, y_tr)
X_val_gsm, y_val_gsm = gsm.sample(X_val, y_val)
gsmm = rfcb.fit(X_tr_gsm, y_tr_gsm)
y_pred_gsm_tr = gsmm.predict(X_tr_gsm)
y_pred_gsm_val = gsmm.predict(X_val_gsm)

2021-04-15 08:56:46,781:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
2021-04-15 08:56:47,821:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")


In [42]:
ccr = sv.CCR()
sampling2(X_tr, y_tr, X_val, y_val, ccr, rfcb)

2021-04-15 08:56:59,864:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
2021-04-15 08:57:07,547:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16326})
Validation Count:  Counter({0: 4687, 1: 4677})
Training Accuracy:  0.8862871135598408
Validation Accuracy:  0.8851986330627937
Training F1 Score:  0.8786304681629586
Validation F1 Score:  0.8773251169690746
Training AUC Score:  0.9377693940801974
Validation AUC Score:  0.9349816585381966
Training Recall Score:  0.8236555188043612
Validation Recall Score:  0.8218943767372248
Training Precision Score:  0.9414688790870266
Validation Precision Score:  0.9407733724914341
Training Average Precision Score:  0.9545534935727262
Validation Average Precision Score:  0.9526667419617793


In [43]:
X_tr_ccr, y_tr_ccr = ccr.sample(X_tr, y_tr)
X_val_ccr, y_val_ccr = ccr.sample(X_val, y_val)
ccrm = rfcb.fit(X_tr_ccr, y_tr_ccr)
y_pred_ccr_tr = ccrm.predict(X_tr_ccr)
y_pred_ccr_val = ccrm.predict(X_val_ccr)

2021-04-15 08:57:20,971:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
2021-04-15 08:57:28,582:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")


In [44]:
lvq = sv.LVQ_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, lvq, rfcb)

2021-04-15 08:57:40,422:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
2021-04-15 08:57:46,118:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.8902655408712677
Validation Accuracy:  0.8746532963516108
Training F1 Score:  0.8837766905355927
Validation F1 Score:  0.8656375071469411
Training AUC Score:  0.9436660299941373
Validation AUC Score:  0.9286163641254229
Training Recall Score:  0.8344346549192364
Validation Recall Score:  0.8075528056326008
Training Precision Score:  0.9393208898684482
Validation Precision Score:  0.9327254805322819
Training Average Precision Score:  0.9583840936452743
Validation Average Precision Score:  0.9442636962460135


In [45]:
X_tr_lvq, y_tr_lvq = lvq.sample(X_tr, y_tr)
X_val_lvq, y_val_lvq = lvq.sample(X_val, y_val)
lvqm = rfcb.fit(X_tr_lvq, y_tr_lvq)
y_pred_lvq_tr = lvqm.predict(X_tr_lvq)
y_pred_lvq_val = lvqm.predict(X_val_lvq)

2021-04-15 08:57:58,818:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
2021-04-15 08:58:05,339:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")


In [46]:
ass = sv.Assembled_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, ass, rfcb)

2021-04-15 08:58:17,348:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
2021-04-15 09:02:49,081:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7585046500244738
Validation Accuracy:  0.7246639641561766
Training F1 Score:  0.7410104986876641
Validation F1 Score:  0.7005453068801486
Training AUC Score:  0.8444174051786493
Validation AUC Score:  0.8026541279259817
Training Recall Score:  0.6909569260890847
Validation Recall Score:  0.644122039684233
Training Precision Score:  0.798882286361064
Validation Precision Score:  0.7678026449643948
Training Average Precision Score:  0.8506318145162908
Validation Average Precision Score:  0.8120547726343764


In [47]:
X_tr_ass, y_tr_ass = ass.sample(X_tr, y_tr)
X_val_ass, y_val_ass = ass.sample(X_val, y_val)
assm = rfcb.fit(X_tr_ass, y_tr_ass)
y_pred_ass_tr = assm.predict(X_tr_ass)
y_pred_ass_val = assm.predict(X_val_ass)

2021-04-15 09:03:08,637:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
2021-04-15 09:07:32,687:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")


In [48]:
data = {'Accuracy': ["",
                     accuracy(y_val, y_pred_bc_val),
                     accuracy(y_val, y_pred_bbc_val),
                     accuracy(y_val, y_pred_bbc3_val),
                     accuracy(y_val, y_pred_brf_val),
                     accuracy(y_val, y_pred_rbc_val),
                     accuracy(y_val, y_pred_eec_val),
                     "",
                     accuracy(y_val_tl, y_pred_tl_val),
                     accuracy(y_val_enn, y_pred_enn_val),
                     "",
                     accuracy(y_val_sm, y_pred_sm_val),
                     accuracy(y_val_svm, y_pred_svmsm_val),
                     accuracy(y_val_adsn, y_pred_adsnm_val),
                     "",
                     accuracy(y_val_smt, y_pred_smtk_val),
                     accuracy(y_val_sme, y_pred_smenn_val),
                     "",
                     accuracy(y_val_pfs, y_pred_pfsm_val),
                     accuracy(y_val_pws, y_pred_pws_val),
                     accuracy(y_val_smi, y_pred_smipf_val),
                     accuracy(y_val_smo, y_pred_smobd_val),
                     accuracy(y_val_gsm, y_pred_gsm_val),
                     accuracy(y_val_ccr, y_pred_ccr_val),
                     accuracy(y_val_lvq, y_pred_lvq_val),
                     accuracy(y_val_ass, y_pred_ass_val)],
    'F1 Score': ["",
                f1_score(y_val, y_pred_bc_val),
                f1_score(y_val, y_pred_bbc_val),
                f1_score(y_val, y_pred_bbc3_val),
                f1_score(y_val, y_pred_brf_val),
                f1_score(y_val, y_pred_rbc_val),
                f1_score(y_val, y_pred_eec_val),
                "",
                f1_score(y_val_tl, y_pred_tl_val),
                f1_score(y_val_enn, y_pred_enn_val),
                "",
                f1_score(y_val_sm, y_pred_sm_val),
                f1_score(y_val_svm, y_pred_svmsm_val),
                f1_score(y_val_adsn, y_pred_adsnm_val),
                "",
                f1_score(y_val_smt, y_pred_smtk_val),
                f1_score(y_val_sme, y_pred_smenn_val),
                "",
                f1_score(y_val_pfs, y_pred_pfsm_val),
                f1_score(y_val_pws, y_pred_pws_val),
                f1_score(y_val_smi, y_pred_smipf_val),
                f1_score(y_val_smo, y_pred_smobd_val),
                f1_score(y_val_gsm, y_pred_gsm_val),
                f1_score(y_val_ccr, y_pred_ccr_val),
                f1_score(y_val_lvq, y_pred_lvq_val),
                f1_score(y_val_ass, y_pred_ass_val)],
    'PR AUC Score': ["",
              aps(X_val, y_val, bc),
              aps(X_val, y_val, bbc),
              aps(X_val, y_val, bbc3),
              aps(X_val, y_val, brf),
              aps(X_val, y_val, rbc),
              aps(X_val, y_val, eec),
              "",
              aps(X_val_tl, y_val_tl, tlm),
              aps(X_val_enn, y_val_enn, ennm),
              "",
              aps(X_val_sm, y_val_sm, smm),
              aps(X_val_svm, y_val_svm, svmsmote),
              aps(X_val_adsn, y_val_adsn, adsnm),
              "",
              aps(X_val_smt, y_val_smt, smtkm),
              aps(X_val_sme, y_val_sme, smennm),
              "",
              aps(X_val_pfs, y_val_pfs, pfsmm),
              aps(X_val_pws, y_val_pws, pwsm),
              aps(X_val_smi, y_val_smi, smipfm),
              aps(X_val_smo, y_val_smo, smobdm),
              aps(X_val_gsm, y_val_gsm, gsmm),
              aps(X_val_ccr, y_val_ccr, ccrm),
              aps(X_val_lvq, y_val_lvq, lvqm),
              aps(X_val, y_val_ass, assm)]}
scores = pd.DataFrame(data=data, index = ['ENSEMBLE METHODS',
                                          'BaggingClassifier',
                                          'BalancedBaggingClassifier',
                                          'BBC with GradientBoostingClassifier',
                                          'BalancedRandomForestClassifier',
                                          'RUBoostClassifier',
                                          'EasyEnsembleClassifier',
                                          'UNDERSAMPLING METHODS',
                                          'TomekLinks',
                                          'EditedNearestNeighbours',
                                          'OVERSAMPLING METHODS',
                                          'SMOTE',
                                          'SVMSMOTE',
                                          'ADASYN',
                                          'COMBINED METHODS',
                                          'SMOTETomek',
                                          'SMOTEENN',
                                          'SMOTE-VARIANTS',
                                          'sv.polynom_fit_SMOTE',
                                          'sv.ProWSyn',
                                          'sv.SMOTE_IPF',
                                          'sv.SMOBD',
                                          'sv.G_SMOTE',
                                          'sv.CCR',
                                          'sv.LQV_SMOTE',
                                          'sv.Assembled_SMOTE'])

ValueError: Found input variables with inconsistent numbers of samples: [9374, 6000]

In [None]:
scores