# Importing Packages

In [35]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import pickle

%reload_ext autoreload
%autoreload 2
from utils import *

import smote_variants as sv
import imbalanced_databases as imbd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV,RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report, plot_confusion_matrix, auc, mean_squared_error, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier
from imblearn.metrics import geometric_mean_score

from imblearn.under_sampling import CondensedNearestNeighbour, NearMiss, OneSidedSelection, NeighbourhoodCleaningRule, RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SVMSMOTE
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier

from xgboost.sklearn import XGBClassifier

2021-04-15 04:50:11,536:DEBUG:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


# Importing Training and Validation Datasets

In [3]:
pickle_in = open("../data/pickles/training_model.pickle","rb")
train = pickle.load(pickle_in)
pickle_in = open("../data/pickles/validate_model.pickle","rb")
validate = pickle.load(pickle_in)

In [4]:
X_train = train.drop(["default"], axis=1)
y_tr = train["default"]
X_validate = validate.drop(["default"], axis=1)
y_val = validate["default"]

In [5]:
X_train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379


In [6]:
X_validate.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1074.16,0,71.61,0,0,25,0,317.38,0.602052,0.704532
1,5370.78,0,151.64,0,0,26,0,4895.86,0.293715,0.088427
2,2506.36,0,111.43,0,0,32,0,2510.73,0.005217,-0.001744
3,4654.68,0,64.74,0,0,49,0,740.38,0.883482,0.840939
4,1790.26,0,53.71,1,1,36,0,3373.85,0.188227,-0.884559


# Standardize Datasets

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_ss = scaler.transform(X_train)
X_val_ss = scaler.transform(X_validate)

In [8]:
scaled = MinMaxScaler()
scaled.fit(X_train)
X_tr = scaled.transform(X_train)
X_val = scaled.transform(X_validate)

# Importing Model

In [10]:
pickle_in = open("../data/pickles/best_model.pickle","rb")
rfcb = pickle.load(pickle_in)

In [11]:
rfcb

RandomForestClassifier(max_depth=8, n_estimators=400)

# Dummy Classifier

In [12]:
dc = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred_dc_tr = dc.predict(X_tr)
y_pred_dc_val = dc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_dc_tr, y_pred_dc_val, dc)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


# Ensemble Methods

## Bagging Classifier

Instead of using a single tree, we will check if an ensemble of decsion tree can actually alleviate the issue induced by the class imbalancing. First, we will use a bagging classifier and its counter part which internally uses a random under-sampling to balanced each boostrap sample.

Balancing each bootstrap sample allows to increase significantly the balanced accuracy and the geometric mean.

In [14]:
bc = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bc_tr = bc.predict(X_tr_ss)
y_pred_bc_val = bc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bc_tr, y_pred_bc_val, bc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bc_val))

Training Accuracy:  0.996904761904762
Validation Accuracy:  0.807
Training F1 Score:  0.9929964443486693
Validation F1 Score:  0.44326923076923075
Training AUC Score:  0.9998954304300327
Validation AUC Score:  0.740359286457933
Training Recall Score:  0.9896907216494846
Validation Recall Score:  0.3511043412033511
Training Precision Score:  0.9963243243243243
Validation Precision Score:  0.6010430247718384
Training Average Precision Score:  0.9996190235446103
Validation Average Precision Score:  0.48114820511254575

Training Balanced Accuracy:  0.9943252922980659
Training Geometric Mean:  0.9943144913248209
Validation Balanced Accuracy:  0.6429086886302653
Validation Geometric Mean:  0.5728715429649479


## Balanced Bagging Classifier

A Bagging classifier with additional balancing.

This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a given sampler.

This classifier can serves as a basis to implement various methods such as Exactly Balanced Bagging [6], Roughly Balanced Bagging [7], Over-Bagging [6], or SMOTE-Bagging [8].

In [15]:
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bbc_tr = bbc.predict(X_tr_ss)
y_pred_bbc_val = bbc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc_tr, y_pred_bbc_val, bbc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bbc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bbc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bbc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bbc_val))

Training Accuracy:  0.9383333333333334
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8778186621379375
Validation F1 Score:  0.5111799105607154
Training AUC Score:  0.9955504481714446
Validation AUC Score:  0.7528188434539899
Training Recall Score:  0.9991408934707904
Validation Recall Score:  0.5658796648895659
Training Precision Score:  0.7827696449604576
Validation Precision Score:  0.46612296110414053
Training Average Precision Score:  0.9819759873453777
Validation Average Precision Score:  0.499848233823849

Training Balanced Accuracy:  0.9600758309742596
Training Geometric Mean:  0.9592807316490086
Validation Balanced Accuracy:  0.6921568155896517
Validation Geometric Mean:  0.680540328399629


## Balanced Bagging Classifier with Gradient Boosting Classifier

In [16]:
bbc3 = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2).fit(X_tr_ss, y_tr)
y_pred_bbc3_tr = bbc3.predict(X_tr_ss)
y_pred_bbc3_val = bbc3.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc3_tr, y_pred_bbc3_val, bbc3)

Training Accuracy:  0.793
Validation Accuracy:  0.7533333333333333
Training F1 Score:  0.6030499497762761
Validation F1 Score:  0.5271565495207667
Training AUC Score:  0.8560039766792316
Validation AUC Score:  0.7749874838134551
Training Recall Score:  0.709192439862543
Validation Recall Score:  0.6283320639756284
Training Precision Score:  0.5245432883240667
Validation Precision Score:  0.45404512933406715
Training Average Precision Score:  0.680188981227117
Validation Average Precision Score:  0.5372388529575939


## Balanced Random Forest Classifier

Random forest is another popular ensemble method and it is usually outperforming bagging. Here, we used a vanilla random forest and its balanced counterpart in which each bootstrap sample is balanced.

Similarly to the previous experiment, the balanced classifier outperform the classifier which learn from imbalanced bootstrap samples. In addition, random forest outsperforms the bagging classifier.

In [17]:
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_tr)
y_pred_brf_tr = brf.predict(X_tr)
y_pred_brf_val = brf.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_brf_tr, y_pred_brf_val, brf)

Training Accuracy:  0.888
Validation Accuracy:  0.724
Training F1 Score:  0.7983539094650206
Validation F1 Score:  0.5083135391923991
Training AUC Score:  0.9928255616361626
Validation AUC Score:  0.7643157793647772
Training Recall Score:  1.0
Validation Recall Score:  0.6519421172886519
Training Precision Score:  0.6643835616438356
Validation Precision Score:  0.41654501216545015
Training Average Precision Score:  0.973183893912578
Validation Average Precision Score:  0.5111466083509113


## RUSBoostClassifier

Random under-sampling integrated in the learning of AdaBoost.

During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm

In [18]:
rbc = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', random_state=42).fit(X_tr, y_tr)
y_pred_rbc_tr = rbc.predict(X_tr)
y_pred_rbc_val = rbc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_rbc_tr, y_pred_rbc_val, rbc)

Training Accuracy:  0.7674761904761904
Validation Accuracy:  0.7566666666666667
Training F1 Score:  0.5456406439006235
Validation F1 Score:  0.5197368421052632
Training AUC Score:  0.7936512282426961
Validation AUC Score:  0.7667346817069983
Training Recall Score:  0.6297250859106529
Validation Recall Score:  0.6016755521706016
Training Precision Score:  0.48136594976194386
Validation Precision Score:  0.45744064852345107
Training Average Precision Score:  0.5609306330020786
Validation Average Precision Score:  0.5244608728332589


## Easy Ensemble Classifier

Bag of balanced boosted learners also known as EasyEnsemble.

This algorithm is known as EasyEnsemble [1]. The classifier is an ensemble of AdaBoost learners trained on different balanced boostrap samples. The balancing is achieved by random under-sampling.

In [19]:
eec = EasyEnsembleClassifier(random_state=42).fit(X_tr, y_tr) 
y_pred_eec_tr = eec.predict(X_tr)
y_pred_eec_val = eec.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_eec_tr, y_pred_eec_val, eec)

Training Accuracy:  0.7592380952380953
Validation Accuracy:  0.7516666666666667
Training F1 Score:  0.5427744619280159
Validation F1 Score:  0.5278833967046895
Training AUC Score:  0.790370227396205
Validation AUC Score:  0.7749849651391096
Training Recall Score:  0.6445446735395189
Validation Recall Score:  0.6344249809596344
Training Precision Score:  0.46875976257419555
Validation Precision Score:  0.45198046663049374
Training Average Precision Score:  0.5504213877594832
Validation Average Precision Score:  0.5262955379856071


# Undersampling Methods

In [20]:
tl = TomekLinks()
sampling(X_tr, y_tr, X_val, y_val, tl, rfcb)

Training Count:  Counter({0: 14764, 1: 4656})
Validation Count:  Counter({0: 4264, 1: 1313})
Training Accuracy:  0.8411946446961895
Validation Accuracy:  0.8215886677425139
Training F1 Score:  0.5737976782752903
Validation F1 Score:  0.5105755041810133
Training AUC Score:  0.8411808705535243
Validation AUC Score:  0.7958815653538222
Training Recall Score:  0.44587628865979384
Validation Recall Score:  0.3952779893373953
Training Precision Score:  0.8046511627906977
Validation Precision Score:  0.7208333333333333
Training Average Precision Score:  0.7081118927611287
Validation Average Precision Score:  0.6009639712486988


In [21]:
X_tr_tl, y_tr_tl = tl.fit_resample(X_tr, y_tr)
X_val_tl, y_val_tl = tl.fit_resample(X_val, y_val)
tlm = rfcb.fit(X_tr_tl, y_tr_tl)
y_pred_tl_tr = tlm.predict(X_tr_tl)
y_pred_tl_val = tlm.predict(X_val_tl)

In [22]:
enn = EditedNearestNeighbours()
sampling(X_tr, y_tr, X_val, y_val, enn, rfcb)

Training Count:  Counter({0: 9976, 1: 4656})
Validation Count:  Counter({0: 2831, 1: 1313})
Training Accuracy:  0.8468425369054128
Validation Accuracy:  0.8221525096525096
Training F1 Score:  0.7059441018239077
Validation F1 Score:  0.6595842956120092
Training AUC Score:  0.8901606015089135
Validation AUC Score:  0.8502209919929581
Training Recall Score:  0.5777491408934707
Validation Recall Score:  0.5437928408225438
Training Precision Score:  0.9072512647554806
Validation Precision Score:  0.8380281690140845
Training Average Precision Score:  0.8494178656765456
Validation Average Precision Score:  0.7955570343428421


In [23]:
X_tr_enn, y_tr_enn = enn.fit_resample(X_tr, y_tr)
X_val_enn, y_val_enn = enn.fit_resample(X_val, y_val)
ennm = rfcb.fit(X_tr_enn, y_tr_enn)
y_pred_enn_tr = ennm.predict(X_tr_enn)
y_pred_enn_val = ennm.predict(X_val_enn)

# Oversampling Methods

In [24]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
sampling(X_tr, y_tr, X_val, y_val, sm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7616556534508077
Validation Accuracy:  0.7211435886494559
Training F1 Score:  0.744515494343335
Validation F1 Score:  0.6946975005839756
Training AUC Score:  0.8427581584467958
Validation AUC Score:  0.8002354701064991
Training Recall Score:  0.6945668135095447
Validation Recall Score:  0.6345210155749946
Training Precision Score:  0.8022047911808353
Validation Precision Score:  0.767483870967742
Training Average Precision Score:  0.8500294403513298
Validation Average Precision Score:  0.8094971426948072


In [25]:
X_tr_sm, y_tr_sm = sm.fit_resample(X_tr, y_tr)
X_val_sm, y_val_sm = sm.fit_resample(X_val, y_val)
smm = rfcb.fit(X_tr_sm, y_tr_sm)
y_pred_sm_tr = smm.predict(X_tr_sm)
y_pred_sm_val = smm.predict(X_val_sm)

In [26]:
svmsm = SVMSMOTE()
sampling(X_tr, y_tr, X_val, y_val, svmsm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.8132648066568772
Validation Accuracy:  0.759974397269042
Training F1 Score:  0.800222556784709
Validation F1 Score:  0.7318875119161107
Training AUC Score:  0.8962785365348626
Validation AUC Score:  0.841742129188183
Training Recall Score:  0.7479809104258444
Validation Recall Score:  0.6552165564326862
Training Precision Score:  0.8603096410978185
Validation Precision Score:  0.8288798920377868
Training Average Precision Score:  0.906123108211528
Validation Average Precision Score:  0.8520197556517825


In [27]:
X_tr_svm, y_tr_svm = svmsm.fit_resample(X_tr, y_tr)
X_val_svm, y_val_svm = svmsm.fit_resample(X_val, y_val)
svmsmote = rfcb.fit(X_tr_svm, y_tr_svm)
y_pred_svmsm_tr = svmsmote.predict(X_tr_svm)
y_pred_svmsm_val = svmsmote.predict(X_val_svm)

In [28]:
adsn = ADASYN()
sampling(X_tr, y_tr, X_val, y_val, adsn, rfcb)

Training Count:  Counter({0: 16344, 1: 15798})
Validation Count:  Counter({0: 4687, 1: 4539})
Training Accuracy:  0.7390019289403273
Validation Accuracy:  0.6877303273357902
Training F1 Score:  0.7195158647898625
Validation F1 Score:  0.6564102564102564
Training AUC Score:  0.8111968349091816
Validation AUC Score:  0.7627437254906662
Training Recall Score:  0.6810988732750981
Validation Recall Score:  0.6063009473452302
Training Precision Score:  0.7625256891786549
Validation Precision Score:  0.7155486219448778
Training Average Precision Score:  0.8070264802094758
Validation Average Precision Score:  0.76201424416308


In [29]:
X_tr_adsn, y_tr_adsn = adsn.fit_resample(X_tr, y_tr)
X_val_adsn, y_val_adsn = adsn.fit_resample(X_val, y_val)
adsnm = rfcb.fit(X_tr_adsn, y_tr_adsn)
y_pred_adsnm_tr = adsnm.predict(X_tr_adsn)
y_pred_adsnm_val = adsnm.predict(X_val_adsn)

# Combined Methods

In [30]:
smtk = SMOTETomek()
sampling(X_tr, y_tr, X_val, y_val, smtk, rfcb)

Training Count:  Counter({0: 15416, 1: 15416})
Validation Count:  Counter({0: 4425, 1: 4425})
Training Accuracy:  0.769330565646082
Validation Accuracy:  0.7282485875706215
Training F1 Score:  0.7497008516928274
Validation F1 Score:  0.6991117227574126
Training AUC Score:  0.8556610692184828
Validation AUC Score:  0.8094956366305979
Training Recall Score:  0.690905552672548
Validation Recall Score:  0.631412429378531
Training Precision Score:  0.8194337590398523
Validation Precision Score:  0.7830717488789237
Training Average Precision Score:  0.8622962975466505
Validation Average Precision Score:  0.8193511817589939


In [31]:
X_tr_smt, y_tr_smt = smtk.fit_resample(X_tr, y_tr)
X_val_smt, y_val_smt = smtk.fit_resample(X_val, y_val)
smtkm = rfcb.fit(X_tr_smt, y_tr_smt)
y_pred_smtk_tr = smtkm.predict(X_tr_smt)
y_pred_smtk_val = smtkm.predict(X_val_smt)

In [32]:
smenn = SMOTEENN(sampling_strategy="minority", n_jobs= -1)
sampling(X_tr, y_tr, X_val, y_val, smenn, rfcb)

Training Count:  Counter({1: 10884, 0: 8603})
Validation Count:  Counter({1: 3053, 0: 2384})
Training Accuracy:  0.865038230615282
Validation Accuracy:  0.8212249402243884
Training F1 Score:  0.8722929008449064
Validation F1 Score:  0.8294138294138294
Training AUC Score:  0.9491856051940892
Validation AUC Score:  0.911278267388002
Training Recall Score:  0.8252480705622933
Validation Recall Score:  0.7739927939731411
Training Precision Score:  0.9250257466529351
Validation Precision Score:  0.8933837429111531
Training Average Precision Score:  0.9644389147693944
Validation Average Precision Score:  0.9393352909334618


In [33]:
X_tr_sme, y_tr_sme = smenn.fit_resample(X_tr, y_tr)
X_val_sme, y_val_sme = smenn.fit_resample(X_val, y_val)
smennm = rfcb.fit(X_tr_sme, y_tr_sme)
y_pred_smenn_tr = smennm.predict(X_tr_sme)
y_pred_smenn_val = smennm.predict(X_val_sme)

# SMOTE Variants

In [36]:
pfsm = sv.polynom_fit_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, pfsm, rfcb)

2021-04-15 04:50:37,634:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
2021-04-15 04:50:37,691:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
Training Count:  Counter({1: 18624, 0: 16344})
Validation Count:  Counter({1: 5252, 0: 4687})
Training Accuracy:  0.8982212308396248
Validation Accuracy:  0.8924439078378107
Training F1 Score:  0.8988719347597535
Validation F1 Score:  0.8920964974260625
Training AUC Score:  0.9571087885168197
Validation AUC Score:  0.9446958830724123
Training Recall Score:  0.849280498281787
Validation Recall Score:  0.8413937547600914
Training Precision Score:  0.9546140382642284
Validation Precision Score:  0.9493018259935553
Training Average Precision Score:  0.9705543176761762
Validation Average Precision Score:  0.9628815414588587


In [37]:
X_tr_pfs, y_tr_pfs = pfsm.sample(X_tr, y_tr)
X_val_pfs, y_val_pfs = pfsm.sample(X_val, y_val)
pfsmm = rfcb.fit(X_tr_pfs, y_tr_pfs)
y_pred_pfsm_tr = pfsmm.predict(X_tr_pfs)
y_pred_pfsm_val = pfsmm.predict(X_val_pfs)

2021-04-15 04:51:00,025:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
2021-04-15 04:51:00,074:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")


In [38]:
pws = sv.ProWSyn()
sampling2(X_tr, y_tr, X_val, y_val, pws, rfcb)

2021-04-15 04:51:10,367:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
2021-04-15 04:51:11,706:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.8693404307391092
Validation Accuracy:  0.8566247066353745
Training F1 Score:  0.8608114714029657
Validation F1 Score:  0.846012832263978
Training AUC Score:  0.9352446851215748
Validation AUC Score:  0.914909475700735
Training Recall Score:  0.8080641213901126
Validation Recall Score:  0.787710689140175
Training Precision Score:  0.9209260163168538
Validation Precision Score:  0.9136352388022767
Training Average Precision Score:  0.9483072953418669
Validation Average Precision Score:  0.934783825939638


In [39]:
X_tr_pws, y_tr_pws = pws.sample(X_tr, y_tr)
X_val_pws, y_val_pws = pws.sample(X_val, y_val)
pwsm = rfcb.fit(X_tr_pws, y_tr_pws)
y_pred_pws_tr = pwsm.predict(X_tr_pws)
y_pred_pws_val = pwsm.predict(X_val_pws)

2021-04-15 04:51:26,828:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
2021-04-15 04:51:28,202:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")


In [40]:
smipf = sv.SMOTE_IPF()
sampling2(X_tr, y_tr, X_val, y_val, smipf, rfcb)

2021-04-15 04:51:40,675:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 04:51:40,676:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-15 04:51:42,968:INFO:SMOTE_IPF: Removing 44 elements
2021-04-15 04:51:44,773:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 04:51:46,626:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 04:51:46,633:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 04:51:46,634:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
Training Count:  Counter({0: 16335, 1: 16309})
2021-04-15 04:51:47,122:INFO:SMOTE_IPF: Removing 1 elements
2021-04-15 04:51:47,55

In [41]:
X_tr_smi, y_tr_smi = smipf.sample(X_tr, y_tr)
X_val_smi, y_val_smi = smipf.sample(X_val, y_val)
smipfm = rfcb.fit(X_tr_smi, y_tr_smi)
y_pred_smipf_tr = smipfm.predict(X_tr_smi)
y_pred_smipf_val = smipfm.predict(X_val_smi)

2021-04-15 04:52:02,070:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 04:52:02,072:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-15 04:52:04,225:INFO:SMOTE_IPF: Removing 44 elements
2021-04-15 04:52:06,023:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 04:52:07,905:INFO:SMOTE_IPF: Removing 0 elements
2021-04-15 04:52:07,908:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-04-15 04:52:07,909:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-04-15 04:52:08,405:INFO:SMOTE_IPF: Removing 1 elements
2021-04-15 04:52:08,804:INFO:SMOTE_IPF: Removing 0 elements
2021-04-1

In [42]:
smobd = sv.SMOBD()
sampling2(X_tr, y_tr, X_val, y_val, smobd, rfcb)

2021-04-15 04:52:21,104:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
2021-04-15 04:52:34,833:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.78392682329907
Validation Accuracy:  0.7496266268401963
Training F1 Score:  0.7750135380498837
Validation F1 Score:  0.7360845608905882
Training AUC Score:  0.8657833104857171
Validation AUC Score:  0.8265956447771753
Training Recall Score:  0.744309838472834
Validation Recall Score:  0.6983144868786004
Training Precision Score:  0.8083593594258754
Validation Precision Score:  0.7781740370898717
Training Average Precision Score:  0.8729223145739723
Validation Average Precision Score:  0.8340950315703439


In [43]:
X_tr_smo, y_tr_smo = smobd.sample(X_tr, y_tr)
X_val_smo, y_val_smo = smobd.sample(X_val, y_val)
smobdm = rfcb.fit(X_tr_smo, y_tr_smo)
y_pred_smobd_tr = smobdm.predict(X_tr_smo)
y_pred_smobd_val = smobdm.predict(X_val_smo)

2021-04-15 04:52:51,395:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
2021-04-15 04:53:05,569:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")


In [44]:
gsm = sv.G_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, gsm, rfcb)

2021-04-15 04:53:19,715:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
2021-04-15 04:53:20,874:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7640724424865394
Validation Accuracy:  0.7311713249413271
Training F1 Score:  0.7460651959170234
Validation F1 Score:  0.7064989517819706
Training AUC Score:  0.8488149685847524
Validation AUC Score:  0.8096987026884461
Training Recall Score:  0.6931595692608908
Validation Recall Score:  0.6471090249626626
Training Precision Score:  0.8077142449736204
Validation Precision Score:  0.7778917671197743
Training Average Precision Score:  0.8565923433612492
Validation Average Precision Score:  0.8203206768560537


In [45]:
X_tr_gsm, y_tr_gsm = gsm.sample(X_tr, y_tr)
X_val_gsm, y_val_gsm = gsm.sample(X_val, y_val)
gsmm = rfcb.fit(X_tr_gsm, y_tr_gsm)
y_pred_gsm_tr = gsmm.predict(X_tr_gsm)
y_pred_gsm_val = gsmm.predict(X_val_gsm)

2021-04-15 04:53:36,243:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
2021-04-15 04:53:37,422:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")


In [46]:
ccr = sv.CCR()
sampling2(X_tr, y_tr, X_val, y_val, ccr, rfcb)

2021-04-15 04:53:50,007:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
2021-04-15 04:53:58,502:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16326})
Validation Count:  Counter({0: 4687, 1: 4677})
Training Accuracy:  0.8865319865319865
Validation Accuracy:  0.8831695856471593
Training F1 Score:  0.8785028350430993
Validation F1 Score:  0.8746849942726231
Training AUC Score:  0.9372585448325896
Validation AUC Score:  0.9345089860686273
Training Recall Score:  0.8208991792233248
Validation Recall Score:  0.8163352576437888
Training Precision Score:  0.9448008459640466
Validation Precision Score:  0.9420182580804343
Training Average Precision Score:  0.9542408861222424
Validation Average Precision Score:  0.952332835924645


In [47]:
X_tr_ccr, y_tr_ccr = ccr.sample(X_tr, y_tr)
X_val_ccr, y_val_ccr = ccr.sample(X_val, y_val)
ccrm = rfcb.fit(X_tr_ccr, y_tr_ccr)
y_pred_ccr_tr = crrm.predict(X_tr_ccr)
y_pred_ccr_val = crrm.predict(X_val_ccr)

2021-04-15 04:54:12,754:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
2021-04-15 04:54:20,496:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")


NameError: name 'crrm' is not defined

In [None]:
lvq = sv.LVQ_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, lvq, rfcb)

In [None]:
X_tr_lvq, y_tr_lvq = lvq.sample(X_tr, y_tr)
X_val_lvq, y_val_lvq = lvq.sample(X_val, y_val)
lvqm = rfcb.fit(X_tr_lvq, y_tr_lvq)
y_pred_lvq_tr = lvqm.predict(X_tr_lvq)
y_pred_lvq_val = lvqm.predict(X_val_lvq)

In [None]:
ass = sv.Assembled_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, ass, rfcb)

In [None]:
X_tr_ass, y_tr_ass = ass.sample(X_tr, y_tr)
X_val_ass, y_val_ass = ass.sample(X_val, y_val)
assm = rfcb.fit(X_tr_ass, y_tr_ass)
y_pred_ass_tr = assm.predict(X_tr_ass)
y_pred_ass_val = assm.predict(X_val_ass)

In [120]:
data = {'Accuracy': ["",
                     accuracy(y_val, y_pred_bc_val),
                     accuracy(y_val, y_pred_bbc_val),
                     accuracy(y_val, y_pred_bbc3_val),
                     accuracy(y_val, y_pred_brf_val),
                     accuracy(y_val, y_pred_rbc_val),
                     accuracy(y_val, y_pred_eec_val),
                     "",
                     accuracy(y_val_tl, y_pred_tl_val),
                     accuracy(y_val_enn, y_pred_enn_val),
                     "",
                     accuracy(y_val_sm, y_pred_sm_val),
                     accuracy(y_val_svm, y_pred_svmsm_val),
                     accuracy(y_val_adsn, y_pred_adsnm_val),
                     "",
                     accuracy(y_val_smt, y_pred_smtk_val),
                     accuracy(y_val_sme, y_pred_smenn_val),
                     "",
                     accuracy(y_val_pfs, y_pred_pfsm_val),
                     accuracy(y_val_pws, y_pred_pws_val),
                     accuracy(y_val_smi, y_pred_smipf_val),
                     accuracy(y_val_smo, y_pred_smobd_val),
                     accuracy(y_val_gsm, y_pred_gsm_val),
                     accuracy(y_val_ccr, y_pred_ccr_val),
                     accuracy(y_val_lvq, y_pred_lvq_val),
                     accuracy(y_val_ass, y_pred_ass_val)],
    'F1 Score': ["",
                f1_score(y_val, y_pred_bc_val),
                f1_score(y_val, y_pred_bbc_val),
                f1_score(y_val, y_pred_bbc3_val),
                f1_score(y_val, y_pred_brf_val),
                f1_score(y_val, y_pred_rbc_val),
                f1_score(y_val, y_pred_eec_val),
                "",
                f1_score(y_val_tl, y_pred_tl_val),
                f1_score(y_val_enn, y_pred_enn_val),
                "",
                f1_score(y_val_sm, y_pred_sm_val),
                f1_score(y_val_svm, y_pred_svmsm_val),
                f1_score(y_val_adsn, y_pred_adsnm_val),
                "",
                f1_score(y_val_smt, y_pred_smtk_val),
                f1_score(y_val_sme, y_pred_smenn_val),
                "",
                f1_score(y_val_pfs, y_pred_pfsm_val),
                f1_score(y_val_pws, y_pred_pws_val),
                f1_score(y_val_smi, y_pred_smipf_val),
                f1_score(y_val_smo, y_pred_smobd_val),
                f1_score(y_val_gsm, y_pred_gsm_val),
                f1_score(y_val_ccr, y_pred_ccr_val),
                f1_score(y_val_lvq, y_pred_lvq_val),
                f1_score(y_val_ass, y_pred_ass_val)],
    'PR AUC Score': ["",
              aps(X_val, y_val, bc),
              aps(X_val, y_val, bbc),
              aps(X_val, y_val, bbc3),
              aps(X_val, y_val, brf),
              aps(X_val, y_val, rbc),
              aps(X_val, y_val, eec),
              "",
              aps(X_val_tl, y_val_tl, tlm),
              aps(X_val_enn, y_val_enn, ennm),
              "",
              aps(X_val_sm, y_val_sm, smm),
              aps(X_val_svm, y_val_svm, svmsmote),
              aps(X_val_adsn, y_val_adsn, adsnm),
              "",
              aps(X_val_smt, y_val_smt, smtkm),
              aps(X_val_sme, y_val_sme, smennm),
              "",
              aps(X_val_pfs, y_val_pfs, pfsmm),
              aps(X_val_pws, y_val_pws, pwsm),
              aps(X_val_smi, y_val_smi, smipfm),
              aps(X_val_smo, y_val_smo, smobdm),
              aps(X_val_gsm, y_val_gsm, gsmm),
              aps(X_val_ccr, y_val_ccr, ccrm),
              aps(X_val_lvq, y_val_lvq, lvqm),
              aps(X_val, y_val_ass, assm)]}
scores = pd.DataFrame(data=data, index = [{'ENSEMBLE METHODS':
                                          'BaggingClassifier',
                                          'BalancedBaggingClassifier',
                                          'BBC with GradientBoostingClassifier',
                                          'BalancedRandomForestClassifier',
                                          'RUBoostClassifier',
                                          'EasyEnsembleClassifier',
                                          'UNDERSAMPLING METHODS',
                                          'TomekLinks',
                                          'EditedNearestNeighbours',
                                          'OVERSAMPLING METHODS',
                                          'SMOTE',
                                          'SVMSMOTE',
                                          'ADASYN',
                                          'COMBINED METHODS',
                                          'SMOTETomek',
                                          'SMOTEENN',
                                          'SMOTE-VARIANTS',
                                          'sv.polynom_fit_SMOTE',
                                          'sv.ProWSyn',
                                          'sv.SMOTE_IPF',
                                          'sv.SMOBD',
                                          'sv.G_SMOTE',
                                          'sv.CCR',
                                          'sv.LQV_SMOTE',
                                          'sv.Assembled_SMOTE'])

In [121]:
scores

Unnamed: 0,Accuracy,F1 Score,PR AUC Score
ENSEMBLE METHODS,,,
BaggingClassifier,0.807,0.443269,0.204693
BalancedBaggingClassifier,0.763167,0.51118,0.307574
BBC with GradientBoostingClassifier,0.753333,0.527157,0.43927
BalancedRandomForestClassifier,0.724,0.508314,0.511147
RUBoostClassifier,0.756667,0.519737,0.524461
EasyEnsembleClassifier,0.751667,0.527883,0.526296
UNDERSAMPLING METHODS,,,
TomekLinks,0.82123,0.508625,0.591199
EditedNearestNeighbours,0.821429,0.658041,0.785686
