# Importing Packages

In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import pickle

%reload_ext autoreload
%autoreload 2
from utils import *

import smote_variants as sv
import imbalanced_databases as imbd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV,RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report, plot_confusion_matrix, auc, mean_squared_error, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier
from imblearn.metrics import geometric_mean_score

from imblearn.under_sampling import CondensedNearestNeighbour, NearMiss, OneSidedSelection, NeighbourhoodCleaningRule, RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SVMSMOTE
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier

from xgboost.sklearn import XGBClassifier

# Importing Training and Validation Datasets

In [2]:
pickle_in = open("../data/pickles/training_model.pickle","rb")
train = pickle.load(pickle_in)
pickle_in = open("../data/pickles/validate_model.pickle","rb")
validate = pickle.load(pickle_in)

In [3]:
X_train = train.drop(["default"], axis=1)
y_tr = train["default"]
X_validate = validate.drop(["default"], axis=1)
y_val = validate["default"]

In [4]:
X_train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379


In [5]:
X_validate.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1074.16,0,71.61,0,0,25,0,317.38,0.602052,0.704532
1,5370.78,0,151.64,0,0,26,0,4895.86,0.293715,0.088427
2,2506.36,0,111.43,0,0,32,0,2510.73,0.005217,-0.001744
3,4654.68,0,64.74,0,0,49,0,740.38,0.883482,0.840939
4,1790.26,0,53.71,1,1,36,0,3373.85,0.188227,-0.884559


# Standardize Datasets

In [6]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_ss = scaler.transform(X_train)
X_val_ss = scaler.transform(X_validate)

In [7]:
scaled = MinMaxScaler()
scaled.fit(X_train)
X_tr = scaled.transform(X_train)
X_val = scaled.transform(X_validate)

# Importing Model

In [8]:
pickle_in = open("../data/pickles/best_model.pickle","rb")
rfcb = pickle.load(pickle_in)

In [9]:
rfcb

RandomForestClassifier(max_depth=8, n_estimators=400)

# Dummy Classifier

In [10]:
#baseline counts
counter = Counter(y_tr)
print("Baseline: ", counter)
counter2 = Counter(y_val)
print("Validation: ", counter2)

Baseline:  Counter({0: 16344, 1: 4656})
Validation:  Counter({0: 4687, 1: 1313})


In [11]:
dc = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred_dc_tr = dc.predict(X_tr)
y_pred_dc_val = dc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_dc_tr, y_pred_dc_val, dc)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


# Ensemble Methods

## Bagging Classifier

Instead of using a single tree, we will check if an ensemble of decsion tree can actually alleviate the issue induced by the class imbalancing. First, we will use a bagging classifier and its counter part which internally uses a random under-sampling to balanced each boostrap sample.

Balancing each bootstrap sample allows to increase significantly the balanced accuracy and the geometric mean.

In [47]:
bc = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bc_tr = bc.predict(X_tr_ss)
y_pred_bc_val = bc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bc_tr, y_pred_bc_val, bc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bc_val))

Training Accuracy:  0.996904761904762
Validation Accuracy:  0.807
Training F1 Score:  0.9929964443486693
Validation F1 Score:  0.44326923076923075
Training AUC Score:  0.9998954304300327
Validation AUC Score:  0.740359286457933
Training Recall Score:  0.9896907216494846
Validation Recall Score:  0.3511043412033511
Training Precision Score:  0.9963243243243243
Validation Precision Score:  0.6010430247718384
Training Average Precision Score:  0.9996190235446103
Validation Average Precision Score:  0.48114820511254575

Training Balanced Accuracy:  0.9943252922980659
Training Geometric Mean:  0.9943144913248209
Validation Balanced Accuracy:  0.6429086886302653
Validation Geometric Mean:  0.5728715429649479


## Balanced Bagging Classifier

A Bagging classifier with additional balancing.

This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a given sampler.

This classifier can serves as a basis to implement various methods such as Exactly Balanced Bagging [6], Roughly Balanced Bagging [7], Over-Bagging [6], or SMOTE-Bagging [8].

In [48]:
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bbc_tr = bbc.predict(X_tr_ss)
y_pred_bbc_val = bbc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc_tr, y_pred_bbc_val, bbc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bbc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bbc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bbc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bbc_val))

Training Accuracy:  0.9383333333333334
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8778186621379375
Validation F1 Score:  0.5111799105607154
Training AUC Score:  0.9955504481714446
Validation AUC Score:  0.7528188434539899
Training Recall Score:  0.9991408934707904
Validation Recall Score:  0.5658796648895659
Training Precision Score:  0.7827696449604576
Validation Precision Score:  0.46612296110414053
Training Average Precision Score:  0.9819759873453777
Validation Average Precision Score:  0.499848233823849

Training Balanced Accuracy:  0.9600758309742596
Training Geometric Mean:  0.9592807316490086
Validation Balanced Accuracy:  0.6921568155896517
Validation Geometric Mean:  0.680540328399629


## Balanced Bagging Classifier with Gradient Boosting Classifier

In [49]:
bbc3 = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2).fit(X_tr_ss, y_tr)
y_pred_bbc3_tr = bbc3.predict(X_tr_ss)
y_pred_bbc3_val = bbc3.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc3_tr, y_pred_bbc3_val, bbc3)

Training Accuracy:  0.793
Validation Accuracy:  0.7533333333333333
Training F1 Score:  0.6030499497762761
Validation F1 Score:  0.5271565495207667
Training AUC Score:  0.8560039766792316
Validation AUC Score:  0.7749874838134551
Training Recall Score:  0.709192439862543
Validation Recall Score:  0.6283320639756284
Training Precision Score:  0.5245432883240667
Validation Precision Score:  0.45404512933406715
Training Average Precision Score:  0.680188981227117
Validation Average Precision Score:  0.5372388529575939


## Balanced Random Forest Classifier

Random forest is another popular ensemble method and it is usually outperforming bagging. Here, we used a vanilla random forest and its balanced counterpart in which each bootstrap sample is balanced.

Similarly to the previous experiment, the balanced classifier outperform the classifier which learn from imbalanced bootstrap samples. In addition, random forest outsperforms the bagging classifier.

In [50]:
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_tr)
y_pred_brf_tr = brf.predict(X_tr)
y_pred_brf_val = brf.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_brf_tr, y_pred_brf_val, brf)

Training Accuracy:  0.888
Validation Accuracy:  0.724
Training F1 Score:  0.7983539094650206
Validation F1 Score:  0.5083135391923991
Training AUC Score:  0.9928255616361626
Validation AUC Score:  0.7643157793647772
Training Recall Score:  1.0
Validation Recall Score:  0.6519421172886519
Training Precision Score:  0.6643835616438356
Validation Precision Score:  0.41654501216545015
Training Average Precision Score:  0.973183893912578
Validation Average Precision Score:  0.5111466083509113


## RUSBoostClassifier

Random under-sampling integrated in the learning of AdaBoost.

During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm

In [51]:
rbc = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', random_state=42).fit(X_tr, y_tr)
y_pred_rbc_tr = rbc.predict(X_tr)
y_pred_rbc_val = rbc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_rbc_tr, y_pred_rbc_val, rbc)

Training Accuracy:  0.7674761904761904
Validation Accuracy:  0.7566666666666667
Training F1 Score:  0.5456406439006235
Validation F1 Score:  0.5197368421052632
Training AUC Score:  0.7936512282426961
Validation AUC Score:  0.7667346817069983
Training Recall Score:  0.6297250859106529
Validation Recall Score:  0.6016755521706016
Training Precision Score:  0.48136594976194386
Validation Precision Score:  0.45744064852345107
Training Average Precision Score:  0.5609306330020786
Validation Average Precision Score:  0.5244608728332589


## Easy Ensemble Classifier

Bag of balanced boosted learners also known as EasyEnsemble.

This algorithm is known as EasyEnsemble [1]. The classifier is an ensemble of AdaBoost learners trained on different balanced boostrap samples. The balancing is achieved by random under-sampling.

In [52]:
eec = EasyEnsembleClassifier(random_state=42).fit(X_tr, y_tr) 
y_pred_eec_tr = eec.predict(X_tr)
y_pred_eec_val = eec.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_eec_tr, y_pred_eec_val, eec)

Training Accuracy:  0.7592380952380953
Validation Accuracy:  0.7516666666666667
Training F1 Score:  0.5427744619280159
Validation F1 Score:  0.5278833967046895
Training AUC Score:  0.790370227396205
Validation AUC Score:  0.7749849651391096
Training Recall Score:  0.6445446735395189
Validation Recall Score:  0.6344249809596344
Training Precision Score:  0.46875976257419555
Validation Precision Score:  0.45198046663049374
Training Average Precision Score:  0.5504213877594832
Validation Average Precision Score:  0.5262955379856071


# Undersampling Methods

## Tomek Links

Tomek links are instances of a minority-majority class pair where each is its own nearest neighbor aka instances of opposing class very close together.  These pairs ultimately fall into one of two categories:  boundary or noise instances.  Only these two cases will have nearest neighbors from opposite classes.  By identify such pairs, Tomek links removes the majority instance with the goal of clarifying the boundary between the two classes by making the minority class region more distinct.

In [12]:
tl = TomekLinks()
sampling(X_tr, y_tr, X_val, y_val, tl, rfcb)

Training Count:  Counter({0: 14764, 1: 4656})
Training Accuracy:  0.84129763130793
Validation Accuracy:  0.8203333333333334
Training F1 Score:  0.5739563173901023
Validation F1 Score:  0.4919886899151743
Training AUC Score:  0.8417570098879879
Validation AUC Score:  0.7796665307665821
Training Recall Score:  0.44587628865979384
Validation Recall Score:  0.39756283320639757
Training Precision Score:  0.8052754072924748
Validation Precision Score:  0.6452410383189122
Training Average Precision Score:  0.70876857570943
Validation Average Precision Score:  0.5418005782123946


In [13]:
X_tr_tl, y_tr_tl = tl.fit_resample(X_tr, y_tr)
tlm = rfcb.fit(X_tr_tl, y_tr_tl)
y_pred_tl_tr = tlm.predict(X_tr_tl)
y_pred_tl_val = tlm.predict(X_val)

## Edited Nearest Neighbors Rule

This is a rule for finding ambiguous and noisy examples in the dataset by using k = 3 nearest neighbors to edit the preclassified samples and then k = 1 rule to make decisions.



In [14]:
enn = EditedNearestNeighbours()
sampling(X_tr, y_tr, X_val, y_val, enn, rfcb)

Training Count:  Counter({0: 9976, 1: 4656})
Training Accuracy:  0.847457627118644
Validation Accuracy:  0.7861666666666667
Training F1 Score:  0.7074705111402361
Validation F1 Score:  0.5263935031376892
Training AUC Score:  0.890159751100235
Validation AUC Score:  0.7783997188184459
Training Recall Score:  0.5796821305841925
Validation Recall Score:  0.543031226199543
Training Precision Score:  0.9075319435104237
Validation Precision Score:  0.5107449856733525
Training Average Precision Score:  0.8494880697247924
Validation Average Precision Score:  0.5324626580629498


In [15]:
X_tr_enn, y_tr_enn = enn.fit_resample(X_tr, y_tr)
ennm = rfcb.fit(X_tr_enn, y_tr_enn)
y_pred_enn_tr = ennm.predict(X_tr_enn)
y_pred_enn_val = ennm.predict(X_val)

In [16]:
oss = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
sampling(X_tr, y_tr, X_val, y_val, oss, rfcb)

Training Count:  Counter({0: 13439, 1: 4656})
Training Accuracy:  0.8308925117435756
Validation Accuracy:  0.8185
Training F1 Score:  0.579092159559835
Validation F1 Score:  0.48992974238875875
Training AUC Score:  0.8358804636912264
Validation AUC Score:  0.7793365031797858
Training Recall Score:  0.4521048109965636
Validation Recall Score:  0.3983244478293983
Training Precision Score:  0.8052792654934966
Validation Precision Score:  0.6362530413625304
Training Average Precision Score:  0.7167220651629658
Validation Average Precision Score:  0.5429037752368918


In [17]:
X_tr_oss, y_tr_oss = oss.fit_resample(X_tr, y_tr)
ossm = rfcb.fit(X_tr_oss, y_tr_oss)
y_pred_oss_tr = ossm.predict(X_tr_oss)
y_pred_oss_val = ossm.predict(X_val)

In [18]:
ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
sampling(X_tr, y_tr, X_val, y_val, ncr, rfcb)

Training Count:  Counter({0: 10206, 1: 4656})
Training Accuracy:  0.846117615394967
Validation Accuracy:  0.7911666666666667
Training F1 Score:  0.7005368600235694
Validation F1 Score:  0.5326370757180157
Training AUC Score:  0.8880858755512726
Validation AUC Score:  0.7786070625903574
Training Recall Score:  0.5745274914089347
Validation Recall Score:  0.5437928408225438
Training Precision Score:  0.897349882589735
Validation Precision Score:  0.5219298245614035
Training Average Precision Score:  0.8419235879154527
Validation Average Precision Score:  0.5281597053047045


In [19]:
X_tr_ncr, y_tr_ncr = ncr.fit_resample(X_tr, y_tr)
ncrm = rfcb.fit(X_tr_ncr, y_tr_ncr)
y_pred_ncr_tr = ncrm.predict(X_tr_ncr)
y_pred_ncr_val = ncrm.predict(X_val)

# Oversampling Methods

## SMOTE

In [20]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
sampling(X_tr, y_tr, X_val, y_val, sm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7598813020068527
Validation Accuracy:  0.7636666666666667
Training F1 Score:  0.7416987527561129
Validation F1 Score:  0.524798927613941
Training AUC Score:  0.8428007150893575
Validation AUC Score:  0.7736144000574583
Training Recall Score:  0.6894884973078805
Validation Recall Score:  0.5963442498095963
Training Precision Score:  0.8024638609983622
Validation Precision Score:  0.46858168761220825
Training Average Precision Score:  0.849865392961803
Validation Average Precision Score:  0.5357606777880041


In [21]:
X_tr_sm, y_tr_sm = sm.fit_resample(X_tr, y_tr)
smm = rfcb.fit(X_tr_sm, y_tr_sm)
y_pred_sm_tr = smm.predict(X_tr_sm)
y_pred_sm_val = smm.predict(X_val)

## SVMSMOTE

In [22]:
svmsm = SVMSMOTE()
sampling(X_tr, y_tr, X_val, y_val, svmsm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7865883504650024
Validation Accuracy:  0.777
Training F1 Score:  0.7722494286647079
Validation F1 Score:  0.5324947589098532
Training AUC Score:  0.877603191472667
Validation AUC Score:  0.7752159519508433
Training Recall Score:  0.7236294664708761
Validation Recall Score:  0.5803503427265804
Training Precision Score:  0.8278734425311494
Validation Precision Score:  0.4919302775984506
Training Average Precision Score:  0.8864177983868675
Validation Average Precision Score:  0.5382684997594274


In [23]:
X_tr_svm, y_tr_svm = svmsm.fit_resample(X_tr, y_tr)
svmsmote = rfcb.fit(X_tr_svm, y_tr_svm)
y_pred_svmsm_tr = svmsmote.predict(X_tr_svm)
y_pred_svmsm_val = svmsmote.predict(X_val)

## ADASYN

In [24]:
adsn = ADASYN()
sampling(X_tr, y_tr, X_val, y_val, adsn, rfcb)

Training Count:  Counter({0: 16344, 1: 15798})
Training Accuracy:  0.7357662871009893
Validation Accuracy:  0.7346666666666667
Training F1 Score:  0.7162473689485817
Validation F1 Score:  0.5086419753086419
Training AUC Score:  0.8099940329008108
Validation AUC Score:  0.7700356888029976
Training Recall Score:  0.6785036080516521
Validation Recall Score:  0.6275704493526276
Training Precision Score:  0.7584376990023349
Validation Precision Score:  0.4276076803321225
Training Average Precision Score:  0.8050639820385264
Validation Average Precision Score:  0.5344189933451883


In [25]:
X_tr_adsn, y_tr_adsn = adsn.fit_resample(X_tr, y_tr)
adsnm = rfcb.fit(X_tr_adsn, y_tr_adsn)
y_pred_adsnm_tr = adsnm.predict(X_tr_adsn)
y_pred_adsnm_val = adsnm.predict(X_val)

# Combined Methods

## SMOTE Tomek

In [26]:
smtk = SMOTETomek()
sampling(X_tr, y_tr, X_val, y_val, smtk, rfcb)

Training Count:  Counter({0: 15434, 1: 15434})
Training Accuracy:  0.7724180380977064
Validation Accuracy:  0.763
Training F1 Score:  0.7557456277598136
Validation F1 Score:  0.5237776289350301
Training AUC Score:  0.8551836359594372
Validation AUC Score:  0.7718141978810312
Training Recall Score:  0.7041596475314241
Validation Recall Score:  0.5955826351865956
Training Precision Score:  0.8154873564943348
Validation Precision Score:  0.4674237895995218
Training Average Precision Score:  0.861548443411833
Validation Average Precision Score:  0.5344147248352176


In [27]:
X_tr_smt, y_tr_smt = smtk.fit_resample(X_tr, y_tr)
smtkm = rfcb.fit(X_tr_smt, y_tr_smt)
y_pred_smtk_tr = smtkm.predict(X_tr_smt)
y_pred_smtk_val = smtkm.predict(X_val)

## SMOTE ENN

In [28]:
smenn = SMOTEENN(sampling_strategy="minority", n_jobs= -1)
sampling(X_tr, y_tr, X_val, y_val, smenn, rfcb)

Training Count:  Counter({1: 10841, 0: 8601})
Training Accuracy:  0.8672976031272502
Validation Accuracy:  0.7315
Training F1 Score:  0.8749272833042466
Validation F1 Score:  0.5175202156334232
Training AUC Score:  0.9476800840071958
Validation AUC Score:  0.7734414240032266
Training Recall Score:  0.8323955354672078
Validation Recall Score:  0.6580350342726581
Training Precision Score:  0.922039440073567
Validation Precision Score:  0.42645607107601186
Training Average Precision Score:  0.9636258846533216
Validation Average Precision Score:  0.5238793379222708


In [29]:
X_tr_sme, y_tr_sme = smenn.fit_resample(X_tr, y_tr)
smennm = rfcb.fit(X_tr_sme, y_tr_sme)
y_pred_smenn_tr = smennm.predict(X_tr_sme)
y_pred_smenn_val = smennm.predict(X_val)

# SMOTE Variants

In [30]:
pfsm = sv.polynom_fit_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, pfsm, rfcb)

2021-04-16 17:19:22,537:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
Training Count:  Counter({1: 18624, 0: 16344})
Training Accuracy:  0.8984500114390299
Validation Accuracy:  0.8211666666666667
Training F1 Score:  0.8991680154470851
Validation F1 Score:  0.47168882323978334
Training AUC Score:  0.9572546356350702
Validation AUC Score:  0.7792807673539506
Training Recall Score:  0.8501396048109966
Validation Recall Score:  0.3648134044173648
Training Precision Score:  0.954197553185078
Validation Precision Score:  0.6671309192200557
Training Average Precision Score:  0.970652150241486
Validation Average Precision Score:  0.5459309732931904


In [31]:
X_tr_pfs, y_tr_pfs = pfsm.sample(X_tr, y_tr)
pfsmm = rfcb.fit(X_tr_pfs, y_tr_pfs)
y_pred_pfsm_tr = pfsmm.predict(X_tr_pfs)
y_pred_pfsm_val = pfsmm.predict(X_val)

2021-04-16 17:19:35,712:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")


In [32]:
pws = sv.ProWSyn()
sampling2(X_tr, y_tr, X_val, y_val, pws, rfcb)

2021-04-16 17:19:46,913:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.8688509544787077
Validation Accuracy:  0.8128333333333333
Training F1 Score:  0.8602353861702473
Validation F1 Score:  0.48884842967683206
Training AUC Score:  0.9341265231893869
Validation AUC Score:  0.7709209622115974
Training Recall Score:  0.8072075379344101
Validation Recall Score:  0.408987052551409
Training Precision Score:  0.920720217740247
Validation Precision Score:  0.6074660633484162
Training Average Precision Score:  0.9476394043011918
Validation Average Precision Score:  0.5385448336584753


In [33]:
X_tr_pws, y_tr_pws = pws.sample(X_tr, y_tr)
pwsm = rfcb.fit(X_tr_pws, y_tr_pws)
y_pred_pws_tr = pwsm.predict(X_tr_pws)
y_pred_pws_val = pwsm.predict(X_val)

2021-04-16 17:20:03,190:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")


In [34]:
smipf = sv.SMOTE_IPF()
sampling2(X_tr, y_tr, X_val, y_val, smipf, rfcb)

2021-04-16 17:20:17,227:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-16 17:20:17,228:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-16 17:20:19,687:INFO:SMOTE_IPF: Removing 44 elements
2021-04-16 17:20:21,682:INFO:SMOTE_IPF: Removing 0 elements
2021-04-16 17:20:23,670:INFO:SMOTE_IPF: Removing 0 elements
Training Count:  Counter({0: 16335, 1: 16309})
Training Accuracy:  0.7594657517461095
Validation Accuracy:  0.761
Training F1 Score:  0.7412509062149871
Validation F1 Score:  0.5207219251336899
Training AUC Score:  0.8441428857590598
Validation AUC Score:  0.7727691004481452
Training Recall Score:  0.6896192286467595
Validation Recall Score:  0.5932977913175933
Training Precision Score:  0.8012395811070742
Validation Precision Score:  0.4639666468135795
Training Average

In [35]:
X_tr_smi, y_tr_smi = smipf.sample(X_tr, y_tr)
smipfm = rfcb.fit(X_tr_smi, y_tr_smi)
y_pred_smipf_tr = smipfm.predict(X_tr_smi)
y_pred_smipf_val = smipfm.predict(X_val)

2021-04-16 17:20:38,414:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-16 17:20:38,416:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-16 17:20:41,233:INFO:SMOTE_IPF: Removing 44 elements
2021-04-16 17:20:43,485:INFO:SMOTE_IPF: Removing 0 elements
2021-04-16 17:20:45,568:INFO:SMOTE_IPF: Removing 0 elements


In [36]:
smobd = sv.SMOBD()
sampling2(X_tr, y_tr, X_val, y_val, smobd, rfcb)

2021-04-16 17:20:57,661:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7812347038668624
Validation Accuracy:  0.756
Training F1 Score:  0.771963391689786
Validation F1 Score:  0.5187376725838264
Training AUC Score:  0.8652088931433553
Validation AUC Score:  0.7746990549771361
Training Recall Score:  0.7405775819872736
Validation Recall Score:  0.6009139375476009
Training Precision Score:  0.8061272061272061
Validation Precision Score:  0.45633314054366686
Training Average Precision Score:  0.8722611390691086
Validation Average Precision Score:  0.5386363108200776


In [37]:
X_tr_smo, y_tr_smo = smobd.sample(X_tr, y_tr)
smobdm = rfcb.fit(X_tr_smo, y_tr_smo)
y_pred_smobd_tr = smobdm.predict(X_tr_smo)
y_pred_smobd_val = smobdm.predict(X_val)

2021-04-16 17:21:24,981:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")


In [38]:
gsm = sv.G_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, gsm, rfcb)

2021-04-16 17:21:50,903:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.768569505628977
Validation Accuracy:  0.7713333333333333
Training F1 Score:  0.7497270652066034
Validation F1 Score:  0.5265700483091788
Training AUC Score:  0.8492055141279669
Validation AUC Score:  0.7721363444545535
Training Recall Score:  0.6932819383259912
Validation Recall Score:  0.5811119573495811
Training Precision Score:  0.8161780594972268
Validation Precision Score:  0.48138801261829656
Training Average Precision Score:  0.8588620049562558
Validation Average Precision Score:  0.5377825125993618


In [39]:
X_tr_gsm, y_tr_gsm = gsm.sample(X_tr, y_tr)
gsmm = rfcb.fit(X_tr_gsm, y_tr_gsm)
y_pred_gsm_tr = gsmm.predict(X_tr_gsm)
y_pred_gsm_val = gsmm.predict(X_val)

2021-04-16 17:22:06,493:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")


In [40]:
ccr = sv.CCR()
sampling2(X_tr, y_tr, X_val, y_val, ccr, rfcb)

2021-04-16 17:22:20,632:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16326})
Training Accuracy:  0.8866850321395776
Validation Accuracy:  0.8205
Training F1 Score:  0.878900883218842
Validation F1 Score:  0.4691966485953672
Training AUC Score:  0.9379536391237782
Validation AUC Score:  0.771606447871322
Training Recall Score:  0.822859242925395
Validation Recall Score:  0.3625285605483625
Training Precision Score:  0.943133951137321
Validation Precision Score:  0.664804469273743
Training Average Precision Score:  0.9545171097518231
Validation Average Precision Score:  0.5377121590344558


In [41]:
X_tr_ccr, y_tr_ccr = ccr.sample(X_tr, y_tr)
ccrm = rfcb.fit(X_tr_ccr, y_tr_ccr)
y_pred_ccr_tr = ccrm.predict(X_tr_ccr)
y_pred_ccr_val = ccrm.predict(X_val)

2021-04-16 17:22:43,778:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")


In [42]:
lvq = sv.LVQ_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, lvq, rfcb)

2021-04-16 17:23:04,766:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.8913362701908958
Validation Accuracy:  0.8178333333333333
Training F1 Score:  0.8847875445994162
Validation F1 Score:  0.47325301204819276
Training AUC Score:  0.943388539571029
Validation AUC Score:  0.7656511642531537
Training Recall Score:  0.8344958394517866
Validation Recall Score:  0.37395277989337394
Training Precision Score:  0.941529752864835
Validation Precision Score:  0.6443569553805775
Training Average Precision Score:  0.9583065728006597
Validation Average Precision Score:  0.5241524796023469


In [43]:
X_tr_lvq, y_tr_lvq = lvq.sample(X_tr, y_tr)
lvqm = rfcb.fit(X_tr_lvq, y_tr_lvq)
y_pred_lvq_tr = lvqm.predict(X_tr_lvq)
y_pred_lvq_val = lvqm.predict(X_val)

2021-04-16 17:23:22,041:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")


In [44]:
ass = sv.Assembled_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, ass, rfcb)

2021-04-16 17:30:20,062:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7599730788056779
Validation Accuracy:  0.7666666666666667
Training F1 Score:  0.7401642601669095
Validation F1 Score:  0.525101763907734
Training AUC Score:  0.8438034990305112
Validation AUC Score:  0.7722402600831878
Training Recall Score:  0.6837371512481645
Validation Recall Score:  0.5894897182025894
Training Precision Score:  0.8067427086341322
Validation Precision Score:  0.47339449541284406
Training Average Precision Score:  0.8509803965962606
Validation Average Precision Score:  0.5359082571907731


In [45]:
X_tr_ass, y_tr_ass = ass.sample(X_tr, y_tr)
assm = rfcb.fit(X_tr_ass, y_tr_ass)
y_pred_ass_tr = assm.predict(X_tr_ass)
y_pred_ass_val = assm.predict(X_val)

2021-04-16 17:35:17,729:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")


In [56]:
data = {'Accuracy': ["",
                     accuracy(y_val, y_pred_bc_val),
                     accuracy(y_val, y_pred_bbc_val),
                     accuracy(y_val, y_pred_bbc3_val),
                     accuracy(y_val, y_pred_brf_val),
                     accuracy(y_val, y_pred_rbc_val),
                     accuracy(y_val, y_pred_eec_val),
                     "",
                     accuracy(y_val, y_pred_tl_val),
                     accuracy(y_val, y_pred_enn_val),
                     accuracy(y_val, y_pred_oss_val),
                     accuracy(y_val, y_pred_ncr_val),
                     "",
                     accuracy(y_val, y_pred_sm_val),
                     accuracy(y_val, y_pred_svmsm_val),
                     accuracy(y_val, y_pred_adsnm_val),
                     "",
                     accuracy(y_val, y_pred_smtk_val),
                     accuracy(y_val, y_pred_smenn_val),
                     "",
                     accuracy(y_val, y_pred_pfsm_val),
                     accuracy(y_val, y_pred_pws_val),
                     accuracy(y_val, y_pred_smipf_val),
                     accuracy(y_val, y_pred_smobd_val),
                     accuracy(y_val, y_pred_gsm_val),
                     accuracy(y_val, y_pred_ccr_val),
                     accuracy(y_val, y_pred_lvq_val),
                     accuracy(y_val, y_pred_ass_val)],
    'F1 Score': ["",
                f1_score(y_val, y_pred_bc_val),
                f1_score(y_val, y_pred_bbc_val),
                f1_score(y_val, y_pred_bbc3_val),
                f1_score(y_val, y_pred_brf_val),
                f1_score(y_val, y_pred_rbc_val),
                f1_score(y_val, y_pred_eec_val),
                "",
                f1_score(y_val, y_pred_tl_val),
                f1_score(y_val, y_pred_enn_val),
                f1_score(y_val, y_pred_oss_val),
                f1_score(y_val, y_pred_ncr_val),
                "",
                f1_score(y_val, y_pred_sm_val),
                f1_score(y_val, y_pred_svmsm_val),
                f1_score(y_val, y_pred_adsnm_val),
                "",
                f1_score(y_val, y_pred_smtk_val),
                f1_score(y_val, y_pred_smenn_val),
                "",
                f1_score(y_val, y_pred_pfsm_val),
                f1_score(y_val, y_pred_pws_val),
                f1_score(y_val, y_pred_smipf_val),
                f1_score(y_val, y_pred_smobd_val),
                f1_score(y_val, y_pred_gsm_val),
                f1_score(y_val, y_pred_ccr_val),
                f1_score(y_val, y_pred_lvq_val),
                f1_score(y_val, y_pred_ass_val)],
    'PR AUC Score': ["",
              aps(X_val, y_val, bc),
              aps(X_val, y_val, bbc),
              aps(X_val, y_val, bbc3),
              aps(X_val, y_val, brf),
              aps(X_val, y_val, rbc),
              aps(X_val, y_val, eec),
              "",
              aps(X_val, y_val, tlm),
              aps(X_val, y_val, ennm),
              "",
              "",
              "",
              aps(X_val, y_val, smm),
              aps(X_val, y_val, svmsmote),
              aps(X_val, y_val, adsnm),
              "",
              aps(X_val, y_val, smtkm),
              aps(X_val, y_val, smennm),
              "",
              aps(X_val, y_val, pfsmm),
              aps(X_val, y_val, pwsm),
              aps(X_val, y_val, smipfm),
              aps(X_val, y_val, smobdm),
              aps(X_val, y_val, gsmm),
              aps(X_val, y_val, ccrm),
              aps(X_val, y_val, lvqm),
              aps(X_val, y_val, assm)]}
scores = pd.DataFrame(data=data, index = ['ENSEMBLE METHODS',
                                          'BaggingClassifier',
                                          'BalancedBaggingClassifier',
                                          'BBC with GradientBoostingClassifier',
                                          'BalancedRandomForestClassifier',
                                          'RUBoostClassifier',
                                          'EasyEnsembleClassifier',
                                          'UNDERSAMPLING METHODS',
                                          'TomekLinks',
                                          'EditedNearestNeighbours',
                                          'OneSidedSelection',
                                          'NeighbourhoodCleaningRule',
                                          'OVERSAMPLING METHODS',
                                          'SMOTE',
                                          'SVMSMOTE',
                                          'ADASYN',
                                          'COMBINED METHODS',
                                          'SMOTETomek',
                                          'SMOTEENN',
                                          'SMOTE-VARIANTS',
                                          'sv.polynom_fit_SMOTE',
                                          'sv.ProWSyn',
                                          'sv.SMOTE_IPF',
                                          'sv.SMOBD',
                                          'sv.G_SMOTE',
                                          'sv.CCR',
                                          'sv.LQV_SMOTE',
                                          'sv.Assembled_SMOTE'])

In [57]:
scores

Unnamed: 0,Accuracy,F1 Score,PR AUC Score
ENSEMBLE METHODS,,,
BaggingClassifier,0.807,0.443269,0.204693
BalancedBaggingClassifier,0.763167,0.51118,0.307574
BBC with GradientBoostingClassifier,0.753333,0.527157,0.43927
BalancedRandomForestClassifier,0.724,0.508314,0.511147
RUBoostClassifier,0.756667,0.519737,0.524461
EasyEnsembleClassifier,0.751667,0.527883,0.526296
UNDERSAMPLING METHODS,,,
TomekLinks,0.820667,0.491012,0.533732
EditedNearestNeighbours,0.787333,0.528804,0.533732
