# Importing Packages

In [58]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import pickle

%reload_ext autoreload
%autoreload 2
from utils import *

import smote_variants as sv
import imbalanced_databases as imbd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV,RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report, plot_confusion_matrix, auc, mean_squared_error, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier

from imblearn.under_sampling import CondensedNearestNeighbour, NearMiss, OneSidedSelection, NeighbourhoodCleaningRule, RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SVMSMOTE
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier

from xgboost.sklearn import XGBClassifier

2021-02-22 17:22:27,517:DEBUG:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


# Importing Training and Validation Datasets

In [3]:
pickle_in = open("data/training_model.pickle","rb")
train = pickle.load(pickle_in)
pickle_in = open("data/validate_model.pickle","rb")
validate = pickle.load(pickle_in)

In [4]:
X_train = train.drop(["default"], axis=1)
y_tr = train["default"]
X_validate = validate.drop(["default"], axis=1)
y_val = validate["default"]

In [5]:
X_train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379


In [6]:
X_validate.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1074.16,0,71.61,0,0,25,0,317.38,0.602052,0.704532
1,5370.78,0,151.64,0,0,26,0,4895.86,0.293715,0.088427
2,2506.36,0,111.43,0,0,32,0,2510.73,0.005217,-0.001744
3,4654.68,0,64.74,0,0,49,0,740.38,0.883482,0.840939
4,1790.26,0,53.71,1,1,36,0,3373.85,0.188227,-0.884559


# Standardize Datasets

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_ss = scaler.transform(X_train)
X_val_ss = scaler.transform(X_validate)

In [8]:
scaled = MinMaxScaler()
scaled.fit(X_train)
X_tr = scaled.transform(X_train)
X_val = scaled.transform(X_validate)

# Importing Model

In [9]:
pickle_in = open("data/best_model.pickle","rb")
rfcb = pickle.load(pickle_in)

In [10]:
rfcb

RandomForestClassifier(max_depth=8, n_estimators=400)

# Dummy Classifier

In [11]:
dc = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred_dc_tr = dc.predict(X_tr)
y_pred_dc_val = dc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_dc_tr, y_pred_dc_val, dc)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


# Ensemble Methods

## Bagging Classifier

Instead of using a single tree, we will check if an ensemble of decsion tree can actually alleviate the issue induced by the class imbalancing. First, we will use a bagging classifier and its counter part which internally uses a random under-sampling to balanced each boostrap sample.

Balancing each bootstrap sample allows to increase significantly the balanced accuracy and the geometric mean.

In [61]:
bc = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bc_tr = bc.predict(X_tr_ss)
y_pred_bc_val = bc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bc_tr, y_pred_bc_val, bc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bc_val))

Training Accuracy:  0.996904761904762
Validation Accuracy:  0.807
Training F1 Score:  0.9929964443486693
Validation F1 Score:  0.44326923076923075
Training AUC Score:  0.9998954304300327
Validation AUC Score:  0.740359286457933
Training Recall Score:  0.9896907216494846
Validation Recall Score:  0.3511043412033511
Training Precision Score:  0.9963243243243243
Validation Precision Score:  0.6010430247718384
Training Average Precision Score:  0.9996190235446103
Validation Average Precision Score:  0.48114820511254575

Training Balanced Accuracy:  0.9943252922980659


NameError: name 'geometric_mean_score' is not defined

## Balanced Bagging Classifier

A Bagging classifier with additional balancing.

This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a given sampler.

This classifier can serves as a basis to implement various methods such as Exactly Balanced Bagging [6], Roughly Balanced Bagging [7], Over-Bagging [6], or SMOTE-Bagging [8].

In [63]:
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bbc_tr = bbc.predict(X_tr_ss)
y_pred_bbc_val = bbc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc_tr, y_pred_bbc_val, bbc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bbc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bbc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bbc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bbc_val))

Training Accuracy:  0.9383333333333334
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8778186621379375
Validation F1 Score:  0.5111799105607154
Training AUC Score:  0.9955504481714446
Validation AUC Score:  0.7528188434539899
Training Recall Score:  0.9991408934707904
Validation Recall Score:  0.5658796648895659
Training Precision Score:  0.7827696449604576
Validation Precision Score:  0.46612296110414053
Training Average Precision Score:  0.9819759873453777
Validation Average Precision Score:  0.499848233823849

Training Balanced Accuracy:  0.9600758309742596


NameError: name 'geometric_mean_score' is not defined

## Balanced Bagging Classifier with Gradient Boosting Classifier

In [70]:
bbc3 = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2).fit(X_tr_ss, y_tr)
y_pred_bbc3_tr = bbc3.predict(X_tr_ss)
y_pred_bbc3_val = bbc3.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc3_tr, y_pred_bbc3_val, bbc3)

Training Accuracy:  0.793
Validation Accuracy:  0.7533333333333333
Training F1 Score:  0.6030499497762761
Validation F1 Score:  0.5271565495207667
Training AUC Score:  0.8560039766792316
Validation AUC Score:  0.7749874838134551
Training Recall Score:  0.709192439862543
Validation Recall Score:  0.6283320639756284
Training Precision Score:  0.5245432883240667
Validation Precision Score:  0.45404512933406715
Training Average Precision Score:  0.680188981227117
Validation Average Precision Score:  0.5372388529575939


## Balanced Random Forest Classifier

Random forest is another popular ensemble method and it is usually outperforming bagging. Here, we used a vanilla random forest and its balanced counterpart in which each bootstrap sample is balanced.

Similarly to the previous experiment, the balanced classifier outperform the classifier which learn from imbalanced bootstrap samples. In addition, random forest outsperforms the bagging classifier.

In [66]:
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_tr)
y_pred_brf_tr = brf.predict(X_tr)
y_pred_brf_val = brf.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_brf_tr, y_pred_brf_val, brf)

Training Accuracy:  0.888
Validation Accuracy:  0.724
Training F1 Score:  0.7983539094650206
Validation F1 Score:  0.5083135391923991
Training AUC Score:  0.9928255616361626
Validation AUC Score:  0.7643157793647772
Training Recall Score:  1.0
Validation Recall Score:  0.6519421172886519
Training Precision Score:  0.6643835616438356
Validation Precision Score:  0.41654501216545015
Training Average Precision Score:  0.973183893912578
Validation Average Precision Score:  0.5111466083509113


## RUSBoostClassifier

Random under-sampling integrated in the learning of AdaBoost.

During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm

In [67]:
rbc = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', random_state=42).fit(X_tr, y_tr)
y_pred_rbc_tr = rbc.predict(X_tr)
y_pred_rbc_val = rbc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_rbc_tr, y_pred_rbc_val, rbc)

Training Accuracy:  0.7674761904761904
Validation Accuracy:  0.7566666666666667
Training F1 Score:  0.5456406439006235
Validation F1 Score:  0.5197368421052632
Training AUC Score:  0.7936512282426961
Validation AUC Score:  0.7667346817069983
Training Recall Score:  0.6297250859106529
Validation Recall Score:  0.6016755521706016
Training Precision Score:  0.48136594976194386
Validation Precision Score:  0.45744064852345107
Training Average Precision Score:  0.5609306330020786
Validation Average Precision Score:  0.5244608728332589


## Easy Ensemble Classifier

Bag of balanced boosted learners also known as EasyEnsemble.

This algorithm is known as EasyEnsemble [1]. The classifier is an ensemble of AdaBoost learners trained on different balanced boostrap samples. The balancing is achieved by random under-sampling.

In [68]:
eec = EasyEnsembleClassifier(random_state=42).fit(X_tr, y_tr) 
y_pred_eec_tr = eec.predict(X_tr)
y_pred_eec_val = eec.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_eec_tr, y_pred_eec_val, eec)

Training Accuracy:  0.7592380952380953
Validation Accuracy:  0.7516666666666667
Training F1 Score:  0.5427744619280159
Validation F1 Score:  0.5278833967046895
Training AUC Score:  0.790370227396205
Validation AUC Score:  0.7749849651391096
Training Recall Score:  0.6445446735395189
Validation Recall Score:  0.6344249809596344
Training Precision Score:  0.46875976257419555
Validation Precision Score:  0.45198046663049374
Training Average Precision Score:  0.5504213877594832
Validation Average Precision Score:  0.5262955379856071


# Undersampling Methods

In [13]:
tl = TomekLinks()
sampling(X_tr, y_tr, X_val, y_val, tl, rfcb)

Training Count:  Counter({0: 14764, 1: 4656})
Validation Count:  Counter({0: 4264, 1: 1313})
Training Accuracy:  0.8411431513903193
Validation Accuracy:  0.8215886677425139
Training F1 Score:  0.5731285457312855
Validation F1 Score:  0.508641975308642
Training AUC Score:  0.8414825383281149
Validation AUC Score:  0.7956925906185653
Training Recall Score:  0.44480240549828176
Validation Recall Score:  0.39223153084539225
Training Precision Score:  0.8055231427460132
Validation Precision Score:  0.723314606741573
Training Average Precision Score:  0.7080315436385418
Validation Average Precision Score:  0.5998872931666258


In [90]:
X_tr_tl, y_tr_tl = tl.fit_resample(X_tr, y_tr)
X_val_tl, y_val_tl = tl.fit_resample(X_val, y_val)
tlm = rfcb.fit(X_tr_tl, y_tr_tl)
y_pred_tl_tr = tlm.predict(X_tr_tl)
y_pred_tl_val = tlm.predict(X_val_tl)

In [85]:
enn = EditedNearestNeighbours()
sampling(X_tr, y_tr, X_val, y_val, enn, rfcb)

Training Count:  Counter({0: 9976, 1: 4656})
Validation Count:  Counter({0: 2831, 1: 1313})
Training Accuracy:  0.8465008201202843
Validation Accuracy:  0.8231177606177607
Training F1 Score:  0.7056356487549148
Validation F1 Score:  0.6614318706697461
Training AUC Score:  0.8905078804250476
Validation AUC Score:  0.8499670307763868
Training Recall Score:  0.5781786941580757
Validation Recall Score:  0.5453160700685453
Training Precision Score:  0.9051782111634162
Validation Precision Score:  0.8403755868544601
Training Average Precision Score:  0.8497072990808405
Validation Average Precision Score:  0.7954775226193064


In [91]:
X_tr_enn, y_tr_enn = enn.fit_resample(X_tr, y_tr)
X_val_enn, y_val_enn = enn.fit_resample(X_val, y_val)
ennm = rfcb.fit(X_tr_enn, y_tr_enn)
y_pred_enn_tr = ennm.predict(X_tr_enn)
y_pred_enn_val = ennm.predict(X_val_enn)

# Oversampling Methods

In [17]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
sampling(X_tr, y_tr, X_val, y_val, sm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7590553108174254
Validation Accuracy:  0.7178365692340516
Training F1 Score:  0.7409380961778831
Validation F1 Score:  0.6902447593395011
Training AUC Score:  0.8421473257507639
Validation AUC Score:  0.7994559715556773
Training Recall Score:  0.6891213901125796
Validation Recall Score:  0.6287604011094516
Training Precision Score:  0.8011808223075829
Validation Precision Score:  0.7650571131879543
Training Average Precision Score:  0.8490450052592535
Validation Average Precision Score:  0.8074301196639206


In [92]:
X_tr_sm, y_tr_sm = sm.fit_resample(X_tr, y_tr)
X_val_sm, y_val_sm = sm.fit_resample(X_val, y_val)
smm = rfcb.fit(X_tr_sm, y_tr_sm)
y_pred_sm_tr = smm.predict(X_tr_sm)
y_pred_sm_val = smm.predict(X_val_sm)

In [21]:
svmsm = SVMSMOTE()
sampling(X_tr, y_tr, X_val, y_val, svmsm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7755445423396965
Validation Accuracy:  0.741625773415831
Training F1 Score:  0.7640911867785601
Validation F1 Score:  0.7242714025500911
Training AUC Score:  0.8600198428207393
Validation AUC Score:  0.8220743119220535
Training Recall Score:  0.7269946157611356
Validation Recall Score:  0.6786857264774909
Training Precision Score:  0.8051772040387613
Validation Precision Score:  0.7764217720283134
Training Average Precision Score:  0.8658254608027284
Validation Average Precision Score:  0.8272026665023634


In [93]:
X_tr_svm, y_tr_svm = svmsm.fit_resample(X_tr, y_tr)
X_val_svm, y_val_svm = svmsm.fit_resample(X_val, y_val)
svmsmote = rfcb.fit(X_tr_svm, y_tr_svm)
y_pred_svmsm_tr = svmsmote.predict(X_tr_svm)
y_pred_svmsm_val = svmsmote.predict(X_val_svm)

In [23]:
adsn = ADASYN()
sampling(X_tr, y_tr, X_val, y_val, adsn, rfcb)

Training Count:  Counter({0: 16344, 1: 15798})
Validation Count:  Counter({0: 4687, 1: 4539})
Training Accuracy:  0.7345218094704747
Validation Accuracy:  0.6905484500325167
Training F1 Score:  0.7159170356560243
Validation F1 Score:  0.6624896559877055
Training AUC Score:  0.8095536808720126
Validation AUC Score:  0.7637675903025309
Training Recall Score:  0.6805924800607672
Validation Recall Score:  0.6173165895571712
Training Precision Score:  0.7551092071072407
Validation Precision Score:  0.7147959183673469
Training Average Precision Score:  0.8052866455446316
Validation Average Precision Score:  0.7622146521595576


In [96]:
X_tr_adsn, y_tr_adsn = adsn.fit_resample(X_tr, y_tr)
X_val_adsn, y_val_adsn = adsn.fit_resample(X_val, y_val)
adsnm = rfcb.fit(X_tr_adsn, y_tr_adsn)
y_pred_adsnm_tr = adsnm.predict(X_tr_adsn)
y_pred_adsnm_val = adsnm.predict(X_val_adsn)

# Combined Methods

In [25]:
smtk = SMOTETomek()
sampling(X_tr, y_tr, X_val, y_val, smtk, rfcb)

Training Count:  Counter({0: 15443, 1: 15443})
Validation Count:  Counter({0: 4401, 1: 4401})
Training Accuracy:  0.7691510716829631
Validation Accuracy:  0.7392638036809815
Training F1 Score:  0.7515852553829001
Validation F1 Score:  0.7169113112125324
Training AUC Score:  0.8541369821284749
Validation AUC Score:  0.8198021911629946
Training Recall Score:  0.6984394223920223
Validation Recall Score:  0.6603044762553965
Training Precision Score:  0.8134851798778189
Validation Precision Score:  0.784133837021047
Training Average Precision Score:  0.8612806550341908
Validation Average Precision Score:  0.8263032373452869


In [97]:
X_tr_smt, y_tr_smt = smtk.fit_resample(X_tr, y_tr)
X_val_smt, y_val_smt = smtk.fit_resample(X_val, y_val)
smtkm = rfcb.fit(X_tr_smt, y_tr_smt)
y_pred_smtk_tr = smtkm.predict(X_tr_smt)
y_pred_smtk_val = smtkm.predict(X_val_smt)

In [27]:
smenn = SMOTEENN(sampling_strategy="minority", n_jobs= -1)
sampling(X_tr, y_tr, X_val, y_val, smenn, rfcb)

Training Count:  Counter({1: 10847, 0: 8527})
Validation Count:  Counter({1: 3103, 0: 2383})
Training Accuracy:  0.8661608341075668
Validation Accuracy:  0.8288370397375137
Training F1 Score:  0.8734072157398818
Validation F1 Score:  0.8379078197824961
Training AUC Score:  0.9464115791001093
Validation AUC Score:  0.9229596417528879
Training Recall Score:  0.8246519775053011
Validation Recall Score:  0.7821463100225589
Training Precision Score:  0.9282897467828974
Validation Precision Score:  0.9022304832713754
Training Average Precision Score:  0.963094372266268
Validation Average Precision Score:  0.9459880860416121


In [98]:
X_tr_sme, y_tr_sme = smenn.fit_resample(X_tr, y_tr)
X_val_sme, y_val_sme = smenn.fit_resample(X_val, y_val)
smennm = rfcb.fit(X_tr_sme, y_tr_sme)
y_pred_smenn_tr = smennm.predict(X_tr_sme)
y_pred_smenn_val = smennm.predict(X_val_sme)

# SMOTE Variants

In [29]:
pfsm = sv.polynom_fit_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, pfsm, rfcb)

2021-02-22 16:48:05,509:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
2021-02-22 16:48:05,575:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
Training Count:  Counter({1: 18624, 0: 16344})
Validation Count:  Counter({1: 5252, 0: 4687})
Training Accuracy:  0.898535804163807
Validation Accuracy:  0.8916389978871114
Training F1 Score:  0.899221723569846
Validation F1 Score:  0.8913109294580686
Training AUC Score:  0.9571116893943026
Validation AUC Score:  0.9449073704698595
Training Recall Score:  0.8499248281786942
Validation Recall Score:  0.8408225437928408
Training Precision Score:  0.9545893137136654
Validation Precision Score:  0.9482499463173717
Training Average Precision Score:  0.9705770157564235
Validation Average Precision Score:  0.962974770258373


In [99]:
X_tr_pfs, y_tr_pfs = pfsm.sample(X_tr, y_tr)
X_val_pfs, y_val_pfs = pfsm.sample(X_val, y_val)
pfsmm = rfcb.fit(X_tr_pfs, y_tr_pfs)
y_pred_pfsm_tr = pfsmm.predict(X_tr_pfs)
y_pred_pfsm_val = pfsmm.predict(X_val_pfs)

2021-02-22 17:52:01,561:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
2021-02-22 17:52:01,614:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")


In [32]:
pws = sv.ProWSyn()
sampling2(X_tr, y_tr, X_val, y_val, pws, rfcb)

2021-02-22 16:49:24,004:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
2021-02-22 16:49:25,550:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.870013460597161
Validation Accuracy:  0.8603584382334115
Training F1 Score:  0.8612933764241179
Validation F1 Score:  0.850074447371435
Training AUC Score:  0.934292377296711
Validation AUC Score:  0.9168457493726434
Training Recall Score:  0.80714635340186
Validation Recall Score:  0.7917644548751867
Training Precision Score:  0.923227657638743
Validation Precision Score:  0.9176557863501483
Training Average Precision Score:  0.9477262027185454
Validation Average Precision Score:  0.936260964903121


In [100]:
X_tr_pws, y_tr_pws = pws.sample(X_tr, y_tr)
X_val_pws, y_val_pws = pws.sample(X_val, y_val)
pwsm = rfcb.fit(X_tr_pws, y_tr_pws)
y_pred_pws_tr = pwsm.predict(X_tr_pws)
y_pred_pws_val = pwsm.predict(X_val_pws)

2021-02-22 17:52:31,910:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
2021-02-22 17:52:33,568:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")


In [36]:
smipf = sv.SMOTE_IPF()
sampling2(X_tr, y_tr, X_val, y_val, smipf, rfcb)

2021-02-22 17:01:12,214:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-02-22 17:01:12,215:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 17:01:14,398:INFO:SMOTE_IPF: Removing 44 elements
2021-02-22 17:01:16,418:INFO:SMOTE_IPF: Removing 0 elements
2021-02-22 17:01:18,642:INFO:SMOTE_IPF: Removing 0 elements
2021-02-22 17:01:18,654:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-02-22 17:01:18,655:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
Training Count:  Counter({0: 16335, 1: 16309})
2021-02-22 17:01:19,355:INFO:SMOTE_IPF: Removing 1 elements
2021-02-22 17:01:19,93

In [101]:
X_tr_smi, y_tr_smi = smipf.sample(X_tr, y_tr)
X_val_smi, y_val_smi = smipf.sample(X_val, y_val)
smipfm = rfcb.fit(X_tr_smi, y_tr_smi)
y_pred_smipf_tr = smipfm.predict(X_tr_smi)
y_pred_smipf_val = smipfm.predict(X_val_smi)

2021-02-22 17:53:06,949:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-02-22 17:53:06,950:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 17:53:09,441:INFO:SMOTE_IPF: Removing 44 elements
2021-02-22 17:53:11,606:INFO:SMOTE_IPF: Removing 0 elements
2021-02-22 17:53:13,666:INFO:SMOTE_IPF: Removing 0 elements
2021-02-22 17:53:13,669:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-02-22 17:53:13,671:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 17:53:14,249:INFO:SMOTE_IPF: Removing 1 elements
2021-02-22 17:53:14,732:INFO:SMOTE_IPF: Removing 0 elements
2021-02-2

In [39]:
smobd = sv.SMOBD()
sampling2(X_tr, y_tr, X_val, y_val, smobd, rfcb)

2021-02-22 17:02:26,559:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
2021-02-22 17:02:43,453:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7822136563876652
Validation Accuracy:  0.7500533390228291
Training F1 Score:  0.7729693529355488
Validation F1 Score:  0.7365935919055648
Training AUC Score:  0.8662056780504038
Validation AUC Score:  0.8305436428829629
Training Recall Score:  0.7414953499755262
Validation Recall Score:  0.6989545551525496
Training Precision Score:  0.8072337307666689
Validation Precision Score:  0.7785171102661597
Training Average Precision Score:  0.8731892060278357
Validation Average Precision Score:  0.8404365719961142


In [102]:
X_tr_smo, y_tr_smo = smobd.sample(X_tr, y_tr)
X_val_smo, y_val_smo = smobd.sample(X_val, y_val)
smobdm = rfcb.fit(X_tr_smo, y_tr_smo)
y_pred_smobd_tr = smobdm.predict(X_tr_smo)
y_pred_smobd_val = smobdm.predict(X_val_smo)

2021-02-22 17:53:37,721:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
2021-02-22 17:53:54,284:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")


In [41]:
gsm = sv.G_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, gsm, rfcb)

2021-02-22 17:03:33,931:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
2021-02-22 17:03:35,182:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7654185022026432
Validation Accuracy:  0.735331768721997
Training F1 Score:  0.74726433750824
Validation F1 Score:  0.7121475809258614
Training AUC Score:  0.8487669351328954
Validation AUC Score:  0.8103146449268933
Training Recall Score:  0.693587860988742
Validation Recall Score:  0.6547898442500534
Training Precision Score:  0.8099456987710775
Validation Precision Score:  0.7805188199389623
Training Average Precision Score:  0.8571132784187702
Validation Average Precision Score:  0.8196666342277832


In [103]:
X_tr_gsm, y_tr_gsm = gsm.sample(X_tr, y_tr)
X_val_gsm, y_val_gsm = gsm.sample(X_val, y_val)
gsmm = rfcb.fit(X_tr_gsm, y_tr_gsm)
y_pred_gsm_tr = gsmm.predict(X_tr_gsm)
y_pred_gsm_val = gsmm.predict(X_val_gsm)

2021-02-22 17:54:10,086:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
2021-02-22 17:54:11,493:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")


In [43]:
ccr = sv.CCR()
sampling2(X_tr, y_tr, X_val, y_val, ccr, rfcb)

2021-02-22 17:04:07,490:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
2021-02-22 17:04:17,401:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16326})
Validation Count:  Counter({0: 4687, 1: 4677})
Training Accuracy:  0.8868380777471686
Validation Accuracy:  0.8828492097394276
Training F1 Score:  0.8788623480454799
Validation F1 Score:  0.874269340974212
Training AUC Score:  0.9377075049848568
Validation AUC Score:  0.9342001739967507
Training Recall Score:  0.8214504471395321
Validation Recall Score:  0.8154800085524909
Training Precision Score:  0.944902416684281
Validation Precision Score:  0.942193675889328
Training Average Precision Score:  0.9545213893134362
Validation Average Precision Score:  0.9520791123396067


In [None]:
X_tr_ccr, y_tr_ccr = ccr.sample(X_tr, y_tr)
X_val_ccr, y_val_ccr = ccr.sample(X_val, y_val)
ccrm = rfcb.fit(X_tr_ccr, y_tr_ccr)
y_pred_ccr_tr = crrm.predict(X_tr_ccr)
y_pred_ccr_val = crrm.predict(X_val_ccr)

In [47]:
lvq = sv.LVQ_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, lvq, rfcb)

2021-02-22 17:05:42,380:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
2021-02-22 17:05:49,152:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.8906938325991189
Validation Accuracy:  0.867932579475144
Training F1 Score:  0.8842002916869226
Validation F1 Score:  0.8573732718894008
Training AUC Score:  0.9424935454510932
Validation AUC Score:  0.9271898326149313
Training Recall Score:  0.834618208516887
Validation Recall Score:  0.7938980157883507
Training Precision Score:  0.9400454827372339
Validation Precision Score:  0.9318807913849236
Training Average Precision Score:  0.9577370656480544
Validation Average Precision Score:  0.9435478421124348


In [106]:
X_tr_lvq, y_tr_lvq = lvq.sample(X_tr, y_tr)
X_val_lvq, y_val_lvq = lvq.sample(X_val, y_val)
lvqm = rfcb.fit(X_tr_lvq, y_tr_lvq)
y_pred_lvq_tr = lvqm.predict(X_tr_lvq)
y_pred_lvq_val = lvqm.predict(X_val_lvq)

2021-02-22 17:55:19,053:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
2021-02-22 17:55:25,933:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")


In [49]:
ass = sv.Assembled_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, ass, rfcb)

2021-02-22 17:06:24,301:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
2021-02-22 17:11:18,054:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7615026921194322
Validation Accuracy:  0.7249839982931513
Training F1 Score:  0.7439737274220033
Validation F1 Score:  0.6995337995337996
Training AUC Score:  0.8441466756014652
Validation AUC Score:  0.8001493447118393
Training Recall Score:  0.6930372001957905
Validation Recall Score:  0.6402816300405376
Training Precision Score:  0.8029916347653481
Validation Precision Score:  0.7708707937323401
Training Average Precision Score:  0.8515566336565037
Validation Average Precision Score:  0.8102468817277988


In [107]:
X_tr_ass, y_tr_ass = ass.sample(X_tr, y_tr)
X_val_ass, y_val_ass = ass.sample(X_val, y_val)
assm = rfcb.fit(X_tr_ass, y_tr_ass)
y_pred_ass_tr = assm.predict(X_tr_ass)
y_pred_ass_val = assm.predict(X_val_ass)

2021-02-22 17:55:45,946:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")


KeyboardInterrupt: 

In [120]:
data = {'Accuracy': ["",
                     accuracy(y_val, y_pred_bc_val),
                     accuracy(y_val, y_pred_bbc_val),
                     accuracy(y_val, y_pred_bbc3_val),
                     accuracy(y_val, y_pred_brf_val),
                     accuracy(y_val, y_pred_rbc_val),
                     accuracy(y_val, y_pred_eec_val),
                     "",
                     accuracy(y_val_tl, y_pred_tl_val),
                     accuracy(y_val_enn, y_pred_enn_val),
                     "",
                     accuracy(y_val_sm, y_pred_sm_val),
                     accuracy(y_val_svm, y_pred_svmsm_val),
                     accuracy(y_val_adsn, y_pred_adsnm_val),
                     "",
                     accuracy(y_val_smt, y_pred_smtk_val),
                     accuracy(y_val_sme, y_pred_smenn_val),
                     "",
                     accuracy(y_val_pfs, y_pred_pfsm_val),
                     accuracy(y_val_pws, y_pred_pws_val),
                     accuracy(y_val_smi, y_pred_smipf_val),
                     accuracy(y_val_smo, y_pred_smobd_val),
                     accuracy(y_val_gsm, y_pred_gsm_val),
                     accuracy(y_val_ccr, y_pred_ccr_val),
                     accuracy(y_val_lvq, y_pred_lvq_val)],
                    #  accuracy(y_val_ass, y_pred_ass_val)],
    'F1 Score': ["",
                f1_score(y_val, y_pred_bc_val),
                f1_score(y_val, y_pred_bbc_val),
                f1_score(y_val, y_pred_bbc3_val),
                f1_score(y_val, y_pred_brf_val),
                f1_score(y_val, y_pred_rbc_val),
                f1_score(y_val, y_pred_eec_val),
                "",
                f1_score(y_val_tl, y_pred_tl_val),
                f1_score(y_val_enn, y_pred_enn_val),
                "",
                f1_score(y_val_sm, y_pred_sm_val),
                f1_score(y_val_svm, y_pred_svmsm_val),
                f1_score(y_val_adsn, y_pred_adsnm_val),
                "",
                f1_score(y_val_smt, y_pred_smtk_val),
                f1_score(y_val_sme, y_pred_smenn_val),
                "",
                f1_score(y_val_pfs, y_pred_pfsm_val),
                f1_score(y_val_pws, y_pred_pws_val),
                f1_score(y_val_smi, y_pred_smipf_val),
                f1_score(y_val_smo, y_pred_smobd_val),
                f1_score(y_val_gsm, y_pred_gsm_val),
                f1_score(y_val_ccr, y_pred_ccr_val),
                f1_score(y_val_lvq, y_pred_lvq_val)],
                # f1_score(y_val_ass, y_pred_ass_val)],
    'PR AUC Score': ["",
              aps(X_val, y_val, bc),
              aps(X_val, y_val, bbc),
              aps(X_val, y_val, bbc3),
              aps(X_val, y_val, brf),
              aps(X_val, y_val, rbc),
              aps(X_val, y_val, eec),
              "",
              aps(X_val_tl, y_val_tl, tlm),
              aps(X_val_enn, y_val_enn, ennm),
              "",
              aps(X_val_sm, y_val_sm, smm),
              aps(X_val_svm, y_val_svm, svmsmote),
              aps(X_val_adsn, y_val_adsn, adsnm),
              "",
              aps(X_val_smt, y_val_smt, smtkm),
              aps(X_val_sme, y_val_sme, smennm),
              "",
              aps(X_val_pfs, y_val_pfs, pfsmm),
              aps(X_val_pws, y_val_pws, pwsm),
              aps(X_val_smi, y_val_smi, smipfm),
              aps(X_val_smo, y_val_smo, smobdm),
              aps(X_val_gsm, y_val_gsm, gsmm),
              aps(X_val_ccr, y_val_ccr, ccrm),
              aps(X_val_lvq, y_val_lvq, lvqm)]}
            #   aps(X_val, y_val_ass, assm)]}
scores = pd.DataFrame(data=data, index = [{'ENSEMBLE METHODS':
                                          'BaggingClassifier',
                                          'BalancedBaggingClassifier',
                                          'BBC with GradientBoostingClassifier',
                                          'BalancedRandomForestClassifier',
                                          'RUBoostClassifier',
                                          'EasyEnsembleClassifier',
                                          'UNDERSAMPLING METHODS',
                                          'TomekLinks',
                                          'EditedNearestNeighbours',
                                          'OVERSAMPLING METHODS',
                                          'SMOTE',
                                          'SVMSMOTE',
                                          'ADASYN',
                                          'COMBINED METHODS',
                                          'SMOTETomek',
                                          'SMOTEENN',
                                          'SMOTE-VARIANTS',
                                          'sv.polynom_fit_SMOTE',
                                          'sv.ProWSyn',
                                          'sv.SMOTE_IPF',
                                          'sv.SMOBD'
                                          'sv.G_SMOTE',
                                          'sv.CCR',
                                          'sv.LQV_SMOTE',
                                          'sv.Assembled_SMOTE'])

In [121]:
scores

Unnamed: 0,Accuracy,F1 Score,PR AUC Score
ENSEMBLE METHODS,,,
BaggingClassifier,0.807,0.443269,0.204693
BalancedBaggingClassifier,0.763167,0.51118,0.307574
BBC with GradientBoostingClassifier,0.753333,0.527157,0.43927
BalancedRandomForestClassifier,0.724,0.508314,0.511147
RUBoostClassifier,0.756667,0.519737,0.524461
EasyEnsembleClassifier,0.751667,0.527883,0.526296
UNDERSAMPLING METHODS,,,
TomekLinks,0.82123,0.508625,0.591199
EditedNearestNeighbours,0.821429,0.658041,0.785686
