# Importing Packages

In [22]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import pickle

%reload_ext autoreload
%autoreload 2
from utils import *

import smote_variants as sv
import imbalanced_databases as imbd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV,RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report, plot_confusion_matrix, auc, mean_squared_error, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier

from imblearn.under_sampling import CondensedNearestNeighbour, NearMiss, OneSidedSelection, NeighbourhoodCleaningRule, RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier

from xgboost.sklearn import XGBClassifier

2021-02-22 04:17:43,557:DEBUG:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


# Importing Training and Validation Datasets

In [34]:
pickle_in = open("data/training_model.pickle","rb")
train = pickle.load(pickle_in)
pickle_in = open("data/validate_model.pickle","rb")
validate = pickle.load(pickle_in)

In [4]:
X_train = train.drop(["default"], axis=1)
y_tr = train["default"]
X_validate = validate.drop(["default"], axis=1)
y_val = validate["default"]

In [35]:
X_train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379


In [36]:
X_validate.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1074.16,0,71.61,0,0,25,0,317.38,0.602052,0.704532
1,5370.78,0,151.64,0,0,26,0,4895.86,0.293715,0.088427
2,2506.36,0,111.43,0,0,32,0,2510.73,0.005217,-0.001744
3,4654.68,0,64.74,0,0,49,0,740.38,0.883482,0.840939
4,1790.26,0,53.71,1,1,36,0,3373.85,0.188227,-0.884559


# Standardize Datasets

In [37]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_ss = scaler.transform(X_train)
X_val_ss = scaler.transform(X_validate)

In [58]:
scaled = MinMaxScaler()
scaled.fit(X_train)
X_tr = scaled.transform(X_train)
X_val = scaled.transform(X_validate)

# Importing Model

In [52]:
pickle_in = open("data/best_model.pickle","rb")
rfcb = pickle.load(pickle_in)

In [51]:
rfcb

RandomForestClassifier(max_depth=8, n_estimators=400)

# Dummy Classifier

In [6]:
dc = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred_dc_tr = dc.predict(X_tr)
y_pred_dc_val = dc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_dc_tr, y_pred_dc_val, dc)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


In [39]:
dc2 = DummyClassifier(strategy='most_frequent').fit(X_tr_mm, y_tr)
y_pred_dc_tr2 = dc2.predict(X_tr_mm)
y_pred_dc_val2 = dc2.predict(X_val_mm)
get_metric(X_tr_mm, y_tr, X_val_mm, y_val, y_pred_dc_tr2, y_pred_dc_val2, dc2)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


# Ensemble Methods

## Bagging Classifier

Instead of using a single tree, we will check if an ensemble of decsion tree can actually alleviate the issue induced by the class imbalancing. First, we will use a bagging classifier and its counter part which internally uses a random under-sampling to balanced each boostrap sample.

Balancing each bootstrap sample allows to increase significantly the balanced accuracy and the geometric mean.

In [41]:
bc = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bc_tr = bc.predict(X_tr_ss)
y_pred_bc_val = bc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bc_tr, y_pred_bc_val, bc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bc_val))

Training Accuracy:  0.996904761904762
Validation Accuracy:  0.807
Training F1 Score:  0.9929964443486693
Validation F1 Score:  0.44326923076923075
Training AUC Score:  0.9998954304300327
Validation AUC Score:  0.740359286457933
Training Recall Score:  0.9896907216494846
Validation Recall Score:  0.3511043412033511
Training Precision Score:  0.9963243243243243
Validation Precision Score:  0.6010430247718384
Training Average Precision Score:  0.9996190235446103
Validation Average Precision Score:  0.48114820511254575

Training Balanced Accuracy:  0.9943252922980659
Training Geometric Mean:  0.9943144913248209
Validation Balanced Accuracy:  0.6429086886302653
Validation Geometric Mean:  0.5728715429649479


In [42]:
bc2 = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_mm, y_tr)
y_pred_bc_tr2 = bc2.predict(X_tr_mm)
y_pred_bc_val2 = bc2.predict(X_val_ss)
get_metric(X_tr_mm, y_tr, X_val_mm, y_val, y_pred_bc_tr2, y_pred_bc_val2, bc2)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bc_tr2))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bc_tr2))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bc_val2))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bc_val2))

Training Accuracy:  0.9968571428571429
Validation Accuracy:  0.5756666666666667
Training F1 Score:  0.9928879310344828
Validation F1 Score:  0.24135876042908222
Training AUC Score:  0.9998956603976701
Validation AUC Score:  0.7392445699412303
Training Recall Score:  0.9894759450171822
Validation Recall Score:  0.30845392231530844
Training Precision Score:  0.9963235294117647
Validation Precision Score:  0.19823788546255505
Training Average Precision Score:  0.9996187404466341
Validation Average Precision Score:  0.48062889134622905

Training Balanced Accuracy:  0.9942179039819146
Training Geometric Mean:  0.9942065954435066
Validation Balanced Accuracy:  0.47948832236951683
Validation Geometric Mean:  0.44794674380648275


## Balanced Bagging Classifier

A Bagging classifier with additional balancing.

This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a given sampler.

This classifier can serves as a basis to implement various methods such as Exactly Balanced Bagging [6], Roughly Balanced Bagging [7], Over-Bagging [6], or SMOTE-Bagging [8].

In [44]:
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bbc_tr = bbc.predict(X_tr_ss)
y_pred_bbc_val = bbc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc_tr, y_pred_bbc_val, bbc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bbc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bbc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bbc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bbc_val))

Training Accuracy:  0.9383333333333334
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8778186621379375
Validation F1 Score:  0.5111799105607154
Training AUC Score:  0.9955504481714446
Validation AUC Score:  0.7528188434539899
Training Recall Score:  0.9991408934707904
Validation Recall Score:  0.5658796648895659
Training Precision Score:  0.7827696449604576
Validation Precision Score:  0.46612296110414053
Training Average Precision Score:  0.9819759873453777
Validation Average Precision Score:  0.499848233823849

Training Balanced Accuracy:  0.9600758309742596
Training Geometric Mean:  0.9592807316490086
Validation Balanced Accuracy:  0.6921568155896517
Validation Geometric Mean:  0.680540328399629


In [46]:
bbc2 = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_mm, y_tr)
y_pred_bbc_tr2 = bbc2.predict(X_tr_mm)
y_pred_bbc_val2 = bbc2.predict(X_val_mm)
get_metric(X_tr_mm, y_tr, X_val_mm, y_val, y_pred_bbc_tr2, y_pred_bbc_val2, bbc2)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bbc_tr2))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bbc_tr2))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bbc_val2))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bbc_val2))

Training Accuracy:  0.9377619047619048
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8768027146762183
Validation F1 Score:  0.5098309761986892
Training AUC Score:  0.9954806562787525
Validation AUC Score:  0.7528568673118481
Training Recall Score:  0.998926116838488
Validation Recall Score:  0.5628332063975628
Training Precision Score:  0.7812867461783974
Validation Precision Score:  0.46595208070617905
Training Average Precision Score:  0.9817802664689138
Validation Average Precision Score:  0.49903403558134285

Training Balanced Accuracy:  0.9596319277290825
Training Geometric Mean:  0.9588270977706405
Validation Balanced Accuracy:  0.691060298526283
Validation Geometric Mean:  0.6790597536619647


## Balanced Bagging Classifier with Gradient Boosting Classifier

In [48]:
bbc3 = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2).fit(X_tr_ss, y_tr)
y_pred_bbc_tr3 = bbc3.predict(X_tr_ss)
y_pred_bbc_val3 = bbc3.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc_tr3, y_pred_bbc_val3, bbc3)

Training Accuracy:  0.793
Validation Accuracy:  0.7533333333333333
Training F1 Score:  0.6030499497762761
Validation F1 Score:  0.5271565495207667
Training AUC Score:  0.8560039766792316
Validation AUC Score:  0.7749874838134551
Training Recall Score:  0.709192439862543
Validation Recall Score:  0.6283320639756284
Training Precision Score:  0.5245432883240667
Validation Precision Score:  0.45404512933406715
Training Average Precision Score:  0.680188981227117
Validation Average Precision Score:  0.5372388529575939


In [49]:
bbc4 = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2).fit(X_tr_mm, y_tr)
y_pred_bbc_tr4 = bbc4.predict(X_tr_mm)
y_pred_bbc_val4 = bbc4.predict(X_val_mm)
get_metric(X_tr_mm, y_tr, X_val_mm, y_val, y_pred_bbc_tr4, y_pred_bbc_val4, bbc4)

Training Accuracy:  0.793
Validation Accuracy:  0.7533333333333333
Training F1 Score:  0.6030499497762761
Validation F1 Score:  0.5271565495207667
Training AUC Score:  0.8560039766792316
Validation AUC Score:  0.7749874838134551
Training Recall Score:  0.709192439862543
Validation Recall Score:  0.6283320639756284
Training Precision Score:  0.5245432883240667
Validation Precision Score:  0.45404512933406715
Training Average Precision Score:  0.680188981227117
Validation Average Precision Score:  0.5372388529575939


## Balanced Random Forest Classifier

Random forest is another popular ensemble method and it is usually outperforming bagging. Here, we used a vanilla random forest and its balanced counterpart in which each bootstrap sample is balanced.

Similarly to the previous experiment, the balanced classifier outperform the classifier which learn from imbalanced bootstrap samples. In addition, random forest outsperforms the bagging classifier.

In [54]:
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_tr)
y_pred_brf_tr = brf.predict(X_tr)
y_pred_brf_val = brf.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_brf_tr, y_pred_brf_val, brf)

Training Accuracy:  0.8858095238095238
Validation Accuracy:  0.7216666666666667
Training F1 Score:  0.7952177625960717
Validation F1 Score:  0.5056246299585553
Training AUC Score:  0.9927690618729111
Validation AUC Score:  0.7649476416352144
Training Recall Score:  1.0
Validation Recall Score:  0.6504188880426505
Training Precision Score:  0.6600510348738304
Validation Precision Score:  0.4135593220338983
Training Average Precision Score:  0.9731060253750823
Validation Average Precision Score:  0.5095399440624323


## RUSBoostClassifier

Random under-sampling integrated in the learning of AdaBoost.

During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm

In [55]:
rbc = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', random_state=42).fit(X_tr, y_tr)
y_pred_rbc_tr = rbc.predict(X_tr)
y_pred_rbc_val = rbc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_rbc_tr, y_pred_rbc_val, rbc)

Training Accuracy:  0.7674761904761904
Validation Accuracy:  0.7566666666666667
Training F1 Score:  0.5456406439006235
Validation F1 Score:  0.5197368421052632
Training AUC Score:  0.7936512282426961
Validation AUC Score:  0.7667346817069983
Training Recall Score:  0.6297250859106529
Validation Recall Score:  0.6016755521706016
Training Precision Score:  0.48136594976194386
Validation Precision Score:  0.45744064852345107
Training Average Precision Score:  0.5609306330020786
Validation Average Precision Score:  0.5244608728332589


## Easy Ensemble Classifier

Bag of balanced boosted learners also known as EasyEnsemble.

This algorithm is known as EasyEnsemble [1]. The classifier is an ensemble of AdaBoost learners trained on different balanced boostrap samples. The balancing is achieved by random under-sampling.

In [56]:
eec = EasyEnsembleClassifier(random_state=42).fit(X_tr, y_tr) 
y_pred_eec_tr = eec.predict(X_tr)
y_pred_eec_val = eec.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_eec_tr, y_pred_eec_val, eec)

Training Accuracy:  0.7591428571428571
Validation Accuracy:  0.7513333333333333
Training F1 Score:  0.5426763110307414
Validation F1 Score:  0.5272496831432193
Training AUC Score:  0.7903551875127205
Validation AUC Score:  0.7749366228411914
Training Recall Score:  0.6445446735395189
Validation Recall Score:  0.6336633663366337
Training Precision Score:  0.46861336664584635
Validation Precision Score:  0.45143787303309824
Training Average Precision Score:  0.5503924427172178
Validation Average Precision Score:  0.5262746690329609


# Undersampling Methods

In [12]:
tl = TomekLinks()
sampling(X_tr, y_tr, X_val, y_val, tl, rfcb)

Training Count:  Counter({0: 14844, 1: 4656})
Validation Count:  Counter({0: 4271, 1: 1313})
Training Accuracy:  0.839948717948718
Validation Accuracy:  0.8216332378223495
Training F1 Score:  0.5707605556319626
Validation F1 Score:  0.508390918065153
Training AUC Score:  0.841091632763096
Validation AUC Score:  0.7968306061015121
Training Recall Score:  0.44566151202749144
Validation Recall Score:  0.39223153084539225
Training Precision Score:  0.7934990439770554
Validation Precision Score:  0.7223001402524544
Training Average Precision Score:  0.7049875136912391
Validation Average Precision Score:  0.6050182959215592


In [13]:
enn = EditedNearestNeighbours()
sampling(X_tr, y_tr, X_val, y_val, enn, rfcb)

Training Count:  Counter({0: 9921, 1: 4656})
Validation Count:  Counter({0: 2811, 1: 1313})
Training Accuracy:  0.8493517184605885
Validation Accuracy:  0.8208050436469447
Training F1 Score:  0.7141369435042958
Validation F1 Score:  0.6630186958504332
Training AUC Score:  0.893195115986742
Validation AUC Score:  0.848395881374526
Training Recall Score:  0.5891323024054983
Validation Recall Score:  0.5536938309215537
Training Precision Score:  0.9064771976206213
Validation Precision Score:  0.8261363636363637
Training Average Precision Score:  0.8544751985887381
Validation Average Precision Score:  0.7911518345139913


# Oversampling Methods

In [14]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
sampling(X_tr, y_tr, X_val, y_val, sm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7620533529123837
Validation Accuracy:  0.7224237251973544
Training F1 Score:  0.7464302014735605
Validation F1 Score:  0.6975122064636132
Training AUC Score:  0.8463959109595244
Validation AUC Score:  0.8030891021377533
Training Recall Score:  0.7004405286343612
Validation Recall Score:  0.6400682739492213
Training Precision Score:  0.7988834612700628
Validation Precision Score:  0.7662835249042146
Training Average Precision Score:  0.8526563021371741
Validation Average Precision Score:  0.810826989470193


In [18]:

svmsm = SVMSMOTE()
sampling(X_tr, y_tr, X_val, y_val, svmsm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.781754772393539
Validation Accuracy:  0.7906976744186046
Training F1 Score:  0.7695736434108527
Validation F1 Score:  0.7846322722283205
Training AUC Score:  0.8678097486426797
Validation AUC Score:  0.8629364644496721
Training Recall Score:  0.7288913362701909
Validation Recall Score:  0.7625346703648389
Training Precision Score:  0.8150656814449918
Validation Precision Score:  0.808048835631924
Training Average Precision Score:  0.8754784149639834
Validation Average Precision Score:  0.8728034854708415


In [15]:
adsn = ADASYN()
sampling(X_tr, y_tr, X_val, y_val, adsn, rfcb)

Training Count:  Counter({1: 16573, 0: 16344})
Validation Count:  Counter({0: 4687, 1: 4536})
Training Accuracy:  0.7403469331956132
Validation Accuracy:  0.6970616935921067
Training F1 Score:  0.736374572036643
Validation F1 Score:  0.6800274851122309
Training AUC Score:  0.8150627820568924
Validation AUC Score:  0.7682545985387178
Training Recall Score:  0.7202679056296386
Validation Recall Score:  0.6545414462081128
Training Precision Score:  0.7532180716809692
Validation Precision Score:  0.7075786463298379
Training Average Precision Score:  0.8167019672323599
Validation Average Precision Score:  0.7678434827387284


# Combined Methods

In [16]:
smtk = SMOTETomek()
sampling(X_tr, y_tr, X_val, y_val, smtk, rfcb)

Training Count:  Counter({0: 15617, 1: 15617})
Validation Count:  Counter({0: 4448, 1: 4448})
Training Accuracy:  0.7696740731254402
Validation Accuracy:  0.7419064748201439
Training F1 Score:  0.7534950657894737
Validation F1 Score:  0.7223028543783261
Training AUC Score:  0.8562297616043882
Validation AUC Score:  0.8201241474221702
Training Recall Score:  0.7040404687199846
Validation Recall Score:  0.6713129496402878
Training Precision Score:  0.8104223483452495
Validation Precision Score:  0.7816753926701571
Training Average Precision Score:  0.8615989775670404
Validation Average Precision Score:  0.8256320993928399


In [17]:
smenn = SMOTEENN(sampling_strategy="minority", n_jobs= -1)
sampling(X_tr, y_tr, X_val, y_val, smenn, rfcb)

Training Count:  Counter({1: 11465, 0: 8660})
Validation Count:  Counter({1: 3249, 0: 2360})
Training Accuracy:  0.8723478260869565
Validation Accuracy:  0.8111962916740952
Training F1 Score:  0.8839918717543463
Validation F1 Score:  0.8285575522098108
Training AUC Score:  0.9484440797325729
Validation AUC Score:  0.905861255875341
Training Recall Score:  0.8537287396423899
Validation Recall Score:  0.7876269621421976
Training Precision Score:  0.9164794007490636
Validation Precision Score:  0.8739754098360656
Training Average Precision Score:  0.9653497494543584
Validation Average Precision Score:  0.9379111317969824


# SMOTE Variants

In [23]:
pfsm = sv.polynom_fit_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, pfsm, rfcb)

2021-02-22 04:17:59,401:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
2021-02-22 04:17:59,468:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
Training Count:  Counter({1: 18624, 0: 16344})
Validation Count:  Counter({1: 5252, 0: 4687})
Training Accuracy:  0.8983356211393274
Validation Accuracy:  0.8917396116309488
Training F1 Score:  0.8990372326829684
Validation F1 Score:  0.8914228052472251
Training AUC Score:  0.9570493303841757
Validation AUC Score:  0.9449129156157973
Training Recall Score:  0.8498711340206185
Validation Recall Score:  0.841012947448591
Training Precision Score:  0.9542412732863086
Validation Precision Score:  0.9482610562473165
Training Average Precision Score:  0.9705263925143566
Validation Average Precision Score:  0.9629791009285424


In [24]:
pws = sv.ProWSyn()
sampling2(X_tr, y_tr, X_val, y_val, pws, rfcb)

2021-02-22 04:19:27,890:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
2021-02-22 04:19:29,799:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.8706253059226627
Validation Accuracy:  0.8590783016855131
Training F1 Score:  0.862045343337139
Validation F1 Score:  0.848561274790783
Training AUC Score:  0.9344885971857152
Validation AUC Score:  0.9165062095635695
Training Recall Score:  0.8084312285854136
Validation Recall Score:  0.7896308939620226
Training Precision Score:  0.9232758018307595
Validation Precision Score:  0.9169970267591675
Training Average Precision Score:  0.9480470972523901
Validation Average Precision Score:  0.9361608164415575


In [25]:
smipf = sv.SMOTE_IPF()
sampling2(X_tr, y_tr, X_val, y_val, smipf, rfcb)

2021-02-22 04:20:17,554:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-02-22 04:20:17,555:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:20:19,963:INFO:SMOTE_IPF: Removing 44 elements
2021-02-22 04:20:22,277:INFO:SMOTE_IPF: Removing 0 elements
2021-02-22 04:20:24,347:INFO:SMOTE_IPF: Removing 0 elements
2021-02-22 04:20:24,410:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-02-22 04:20:24,418:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
Training Count:  Counter({0: 16335, 1: 16309})
2021-02-22 04:20:25,242:INFO:SMOTE_IPF: Removing 1 elements
2021-02-22 04:20:25,66

In [26]:
smobd = sv.SMOBD()
sampling2(X_tr, y_tr, X_val, y_val, smobd, rfcb)

2021-02-22 04:21:06,588:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
2021-02-22 04:21:10,961:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7566997063142438
Validation Accuracy:  0.7087689353531044
Training F1 Score:  0.7546657617916526
Validation F1 Score:  0.6989413321570357
Training AUC Score:  0.8398761288740919
Validation AUC Score:  0.7856511223226872
Training Recall Score:  0.7484092021536956
Validation Recall Score:  0.676125453381694
Training Precision Score:  0.7610278106140733
Validation Precision Score:  0.723350833143118
Training Average Precision Score:  0.8389495634338795
Validation Average Precision Score:  0.7805779833045183


In [27]:
gsm = sv.G_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, gsm, rfcb)

2021-02-22 04:21:47,527:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
2021-02-22 04:21:48,826:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7734030837004405
Validation Accuracy:  0.7331982078088329
Training F1 Score:  0.75986383530556
Validation F1 Score:  0.7113675706866706
Training AUC Score:  0.8523388667300853
Validation AUC Score:  0.803912300677409
Training Recall Score:  0.7170215369554577
Validation Recall Score:  0.6575634734371666
Training Precision Score:  0.8081511619888283
Validation Precision Score:  0.7747611865258924
Training Average Precision Score:  0.8579253816727548
Validation Average Precision Score:  0.8151392116700581


In [28]:
ccr = sv.CCR()
sampling2(X_tr, y_tr, X_val, y_val, ccr, rfcb)

2021-02-22 04:22:15,763:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
2021-02-22 04:22:26,310:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 15925})
Validation Count:  Counter({1: 4772, 0: 4687})
Training Accuracy:  0.8860206390033779
Validation Accuracy:  0.8859287451104768
Training F1 Score:  0.8764611043933896
Validation F1 Score:  0.8793469752879347
Training AUC Score:  0.9385565079979807
Validation AUC Score:  0.9380864274586608
Training Recall Score:  0.8192778649921507
Validation Recall Score:  0.8239731768650461
Training Precision Score:  0.9422257528706579
Validation Precision Score:  0.9426995924238791
Training Average Precision Score:  0.9538726864421355
Validation Average Precision Score:  0.9552071688600438


In [29]:
lvq = sv.LVQ_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, lvq, rfcb)

2021-02-22 04:22:43,598:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
2021-02-22 04:22:50,640:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.889286588350465
Validation Accuracy:  0.8793471303605718
Training F1 Score:  0.8832467658160468
Validation F1 Score:  0.8721022277507634
Training AUC Score:  0.945907340263148
Validation AUC Score:  0.9351727508355461
Training Recall Score:  0.8375550660792952
Validation Recall Score:  0.8227010881160657
Training Precision Score:  0.9342114242817171
Validation Precision Score:  0.9278152069297402
Training Average Precision Score:  0.9595741010147972
Validation Average Precision Score:  0.9511208131357277


In [30]:
ass = sv.Assembled_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, ass, rfcb)

2021-02-22 04:23:07,498:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
2021-02-22 04:28:09,641:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Validation Count:  Counter({0: 4687, 1: 4687})
Training Accuracy:  0.7632464512971121
Validation Accuracy:  0.728504373799872
Training F1 Score:  0.7461707501065958
Validation F1 Score:  0.7037597485740891
Training AUC Score:  0.8479934415751504
Validation AUC Score:  0.8084987510679753
Training Recall Score:  0.6959740577581988
Validation Recall Score:  0.6449754640494986
Training Precision Score:  0.8041710851891127
Validation Precision Score:  0.7743340163934426
Training Average Precision Score:  0.8541575444103704
Validation Average Precision Score:  0.8151928258905241


In [31]:
tomek = sv.SMOTE_TomekLinks()
sampling2(X_tr, y_tr, X_val, y_val, tomek, rfcb)

2021-02-22 04:28:31,111:INFO:SMOTE_TomekLinks: Running sampling via ('SMOTE_TomekLinks', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:31,113:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:31,413:INFO:TomekLinkRemoval: Running noise removal via TomekLinkRemoval
2021-02-22 04:28:32,157:INFO:SMOTE_TomekLinks: Running sampling via ('SMOTE_TomekLinks', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:32,158:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:32,240:INFO:TomekLinkRemoval: Running noise removal via TomekLinkRemoval
Training Count:  Counter({0: 15594, 1: 15594})
Validation Count:  Counter({0: 4470, 1: 4470})
Training Accuracy:  0.7735346928305759
Validation Accuracy:  0.7425055928411634
Training F1 Score:  0.7578427675112284
Validation F1 Score:  0.721172480620155
Training AUC Score:  0.8582767176346949

In [32]:
smenn = sv.SMOTE_ENN()
sampling2(X_tr, y_tr, X_val, y_val, smenn, rfcb)

2021-02-22 04:28:46,497:INFO:SMOTE_ENN: Running sampling via ('SMOTE_ENN', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:46,498:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:46,797:INFO:EditedNearestNeighbors: Running noise removal via EditedNearestNeighbors
2021-02-22 04:28:48,613:INFO:SMOTE_ENN: Running sampling via ('SMOTE_ENN', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:48,613:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-02-22 04:28:48,698:INFO:EditedNearestNeighbors: Running noise removal via EditedNearestNeighbors
Training Count:  Counter({1: 14953, 0: 11604})
Validation Count:  Counter({1: 4257, 0: 3301})
Training Accuracy:  0.8214406747750123
Validation Accuracy:  0.774675840169357
Training F1 Score:  0.8348655801643684
Validation F1 Score:  0.788578522656735
Training AUC Score:  0.9079434820163733
Vali