# Importing Packages

In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import pickle

%reload_ext autoreload
%autoreload 2
from utils import *

import smote_variants as sv
import imbalanced_databases as imbd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV,RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report, plot_confusion_matrix, auc, mean_squared_error, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import resample
from sklearn.dummy import DummyClassifier
from imblearn.metrics import geometric_mean_score

from imblearn.under_sampling import CondensedNearestNeighbour, NearMiss, OneSidedSelection, NeighbourhoodCleaningRule, RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, SVMSMOTE
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier

from xgboost.sklearn import XGBClassifier

# Importing Training and Validation Datasets

In [2]:
pickle_in = open("../data/pickles/training_model.pickle","rb")
train = pickle.load(pickle_in)
pickle_in = open("../data/pickles/validate_model.pickle","rb")
validate = pickle.load(pickle_in)

In [3]:
X_train = train.drop(["default"], axis=1)
y_tr = train["default"]
X_validate = validate.drop(["default"], axis=1)
y_val = validate["default"]

In [4]:
X_train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379


In [5]:
X_validate.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1074.16,0,71.61,0,0,25,0,317.38,0.602052,0.704532
1,5370.78,0,151.64,0,0,26,0,4895.86,0.293715,0.088427
2,2506.36,0,111.43,0,0,32,0,2510.73,0.005217,-0.001744
3,4654.68,0,64.74,0,0,49,0,740.38,0.883482,0.840939
4,1790.26,0,53.71,1,1,36,0,3373.85,0.188227,-0.884559


# Standardize Datasets

In [6]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_ss = scaler.transform(X_train)
X_val_ss = scaler.transform(X_validate)

In [7]:
scaled = MinMaxScaler()
scaled.fit(X_train)
X_tr = scaled.transform(X_train)
X_val = scaled.transform(X_validate)

# Importing Model

In [8]:
pickle_in = open("../data/pickles/best_model.pickle","rb")
rfcb = pickle.load(pickle_in)

In [9]:
rfcb

RandomForestClassifier(max_depth=8, n_estimators=400)

# Dummy Classifier

In [10]:
#baseline counts
counter = Counter(y_tr)
print("Baseline: ", counter)
counter2 = Counter(y_val)
print("Validation: ", counter2)

Baseline:  Counter({0: 16344, 1: 4656})
Validation:  Counter({0: 4687, 1: 1313})


In [11]:
dc = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred_dc_tr = dc.predict(X_tr)
y_pred_dc_val = dc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_dc_tr, y_pred_dc_val, dc)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


# Ensemble Methods

## Bagging Classifier

Instead of using a single tree, we will check if an ensemble of decsion tree can actually alleviate the issue induced by the class imbalancing. First, we will use a bagging classifier and its counter part which internally uses a random under-sampling to balanced each boostrap sample.

Balancing each bootstrap sample allows to increase significantly the balanced accuracy and the geometric mean.

In [63]:
bc = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bc_tr = bc.predict(X_tr_ss)
y_pred_bc_val = bc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bc_tr, y_pred_bc_val, bc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bc_val))

Training Accuracy:  0.996904761904762
Validation Accuracy:  0.807
Training F1 Score:  0.9929964443486693
Validation F1 Score:  0.44326923076923075
Training AUC Score:  0.9998954304300327
Validation AUC Score:  0.740359286457933
Training Recall Score:  0.9896907216494846
Validation Recall Score:  0.3511043412033511
Training Precision Score:  0.9963243243243243
Validation Precision Score:  0.6010430247718384
Training Average Precision Score:  0.9996190235446103
Validation Average Precision Score:  0.48114820511254575

Training Balanced Accuracy:  0.9943252922980659
Training Geometric Mean:  0.9943144913248209
Validation Balanced Accuracy:  0.6429086886302653
Validation Geometric Mean:  0.5728715429649479


## Balanced Bagging Classifier

A Bagging classifier with additional balancing.

This implementation of Bagging is similar to the scikit-learn implementation. It includes an additional step to balance the training set at fit time using a given sampler.

This classifier can serves as a basis to implement various methods such as Exactly Balanced Bagging [6], Roughly Balanced Bagging [7], Over-Bagging [6], or SMOTE-Bagging [8].

In [14]:
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr_ss, y_tr)
y_pred_bbc_tr = bbc.predict(X_tr_ss)
y_pred_bbc_val = bbc.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc_tr, y_pred_bbc_val, bbc)
print("")
print('Training Balanced Accuracy: ', balanced_accuracy_score(y_tr, y_pred_bbc_tr))
print('Training Geometric Mean: ', geometric_mean_score(y_tr, y_pred_bbc_tr))
print('Validation Balanced Accuracy: ', balanced_accuracy_score(y_val, y_pred_bbc_val))
print('Validation Geometric Mean: ', geometric_mean_score(y_val, y_pred_bbc_val))

Training Accuracy:  0.9383333333333334
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8778186621379375
Validation F1 Score:  0.5111799105607154
Training AUC Score:  0.9955504481714446
Validation AUC Score:  0.7528188434539899
Training Recall Score:  0.9991408934707904
Validation Recall Score:  0.5658796648895659
Training Precision Score:  0.7827696449604576
Validation Precision Score:  0.46612296110414053
Training Average Precision Score:  0.9819759873453777
Validation Average Precision Score:  0.499848233823849

Training Balanced Accuracy:  0.9600758309742596
Training Geometric Mean:  0.9592807316490086
Validation Balanced Accuracy:  0.6921568155896517
Validation Geometric Mean:  0.680540328399629


## Balanced Bagging Classifier with Gradient Boosting Classifier

In [15]:
bbc3 = BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2).fit(X_tr_ss, y_tr)
y_pred_bbc3_tr = bbc3.predict(X_tr_ss)
y_pred_bbc3_val = bbc3.predict(X_val_ss)
get_metric(X_tr_ss, y_tr, X_val_ss, y_val, y_pred_bbc3_tr, y_pred_bbc3_val, bbc3)

Training Accuracy:  0.793
Validation Accuracy:  0.7533333333333333
Training F1 Score:  0.6030499497762761
Validation F1 Score:  0.5271565495207667
Training AUC Score:  0.8560039766792316
Validation AUC Score:  0.7749874838134551
Training Recall Score:  0.709192439862543
Validation Recall Score:  0.6283320639756284
Training Precision Score:  0.5245432883240667
Validation Precision Score:  0.45404512933406715
Training Average Precision Score:  0.680188981227117
Validation Average Precision Score:  0.5372388529575939


## Balanced Random Forest Classifier

Random forest is another popular ensemble method and it is usually outperforming bagging. Here, we used a vanilla random forest and its balanced counterpart in which each bootstrap sample is balanced.

Similarly to the previous experiment, the balanced classifier outperform the classifier which learn from imbalanced bootstrap samples. In addition, random forest outsperforms the bagging classifier.

In [16]:
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_tr)
y_pred_brf_tr = brf.predict(X_tr)
y_pred_brf_val = brf.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_brf_tr, y_pred_brf_val, brf)

Training Accuracy:  0.888
Validation Accuracy:  0.724
Training F1 Score:  0.7983539094650206
Validation F1 Score:  0.5083135391923991
Training AUC Score:  0.9928255616361626
Validation AUC Score:  0.7643157793647772
Training Recall Score:  1.0
Validation Recall Score:  0.6519421172886519
Training Precision Score:  0.6643835616438356
Validation Precision Score:  0.41654501216545015
Training Average Precision Score:  0.973183893912578
Validation Average Precision Score:  0.5111466083509113


## RUSBoostClassifier

Random under-sampling integrated in the learning of AdaBoost.

During learning, the problem of class balancing is alleviated by random under-sampling the sample at each iteration of the boosting algorithm

In [19]:
rbc = RUSBoostClassifier(n_estimators=200, algorithm='SAMME.R', random_state=42).fit(X_tr, y_tr)
y_pred_rbc_tr = rbc.predict(X_tr)
y_pred_rbc_val = rbc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_rbc_tr, y_pred_rbc_val, rbc)

Training Accuracy:  0.7674761904761904
Validation Accuracy:  0.7566666666666667
Training F1 Score:  0.5456406439006235
Validation F1 Score:  0.5197368421052632
Training AUC Score:  0.7936512282426961
Validation AUC Score:  0.7667346817069983
Training Recall Score:  0.6297250859106529
Validation Recall Score:  0.6016755521706016
Training Precision Score:  0.48136594976194386
Validation Precision Score:  0.45744064852345107
Training Average Precision Score:  0.5609306330020786
Validation Average Precision Score:  0.5244608728332589


## Easy Ensemble Classifier

Bag of balanced boosted learners also known as EasyEnsemble.

This algorithm is known as EasyEnsemble [1]. The classifier is an ensemble of AdaBoost learners trained on different balanced boostrap samples. The balancing is achieved by random under-sampling.

In [20]:
eec = EasyEnsembleClassifier(random_state=42).fit(X_tr, y_tr) 
y_pred_eec_tr = eec.predict(X_tr)
y_pred_eec_val = eec.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_eec_tr, y_pred_eec_val, eec)

Training Accuracy:  0.7592380952380953
Validation Accuracy:  0.7516666666666667
Training F1 Score:  0.5427744619280159
Validation F1 Score:  0.5278833967046895
Training AUC Score:  0.790370227396205
Validation AUC Score:  0.7749849651391096
Training Recall Score:  0.6445446735395189
Validation Recall Score:  0.6344249809596344
Training Precision Score:  0.46875976257419555
Validation Precision Score:  0.45198046663049374
Training Average Precision Score:  0.5504213877594832
Validation Average Precision Score:  0.5262955379856071


# Undersampling Methods

## Tomek Links

Tomek links are instances of a minority-majority class pair where each is its own nearest neighbor aka instances of opposing class very close together.  These pairs ultimately fall into one of two categories:  boundary or noise instances.  Only these two cases will have nearest neighbors from opposite classes.  By identify such pairs, Tomek links removes the majority instance with the goal of clarifying the boundary between the two classes by making the minority class region more distinct.

In [21]:
tl = TomekLinks()
sampling(X_tr, y_tr, X_val, y_val, tl, rfcb)

Training Count:  Counter({0: 14764, 1: 4656})
Training Accuracy:  0.841091658084449
Validation Accuracy:  0.8198333333333333
Training F1 Score:  0.5739922694643844
Validation F1 Score:  0.4893717524799244
Training AUC Score:  0.8414495304590621
Validation AUC Score:  0.7800662687594521
Training Recall Score:  0.44652061855670105
Validation Recall Score:  0.3945163747143945
Training Precision Score:  0.8033230293663061
Validation Precision Score:  0.6442786069651741
Training Average Precision Score:  0.7083469944852872
Validation Average Precision Score:  0.5425217465417795


In [22]:
X_tr_tl, y_tr_tl = tl.fit_resample(X_tr, y_tr)
tlm = rfcb.fit(X_tr_tl, y_tr_tl)
y_pred_tl_tr = tlm.predict(X_tr_tl)
y_pred_tl_val = tlm.predict(X_val)

## Edited Nearest Neighbors Rule

This is a rule for finding ambiguous and noisy examples in the dataset by using k = 3 nearest neighbors to edit the preclassified samples and then k = 1 rule to make decisions.



In [23]:
enn = EditedNearestNeighbours()
sampling(X_tr, y_tr, X_val, y_val, enn, rfcb)

Training Count:  Counter({0: 9976, 1: 4656})
Training Accuracy:  0.84656916347731
Validation Accuracy:  0.7856666666666666
Training F1 Score:  0.7058052679858472
Validation F1 Score:  0.5261606484893145
Training AUC Score:  0.8902533606428624
Validation AUC Score:  0.7780456419540298
Training Recall Score:  0.5783934707903781
Validation Recall Score:  0.5437928408225438
Training Precision Score:  0.9052100840336135
Validation Precision Score:  0.5096359743040685
Training Average Precision Score:  0.8496757378433736
Validation Average Precision Score:  0.5333312955110157


In [24]:
X_tr_enn, y_tr_enn = enn.fit_resample(X_tr, y_tr)
ennm = rfcb.fit(X_tr_enn, y_tr_enn)
y_pred_enn_tr = ennm.predict(X_tr_enn)
y_pred_enn_val = ennm.predict(X_val)

In [25]:
oss = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
sampling(X_tr, y_tr, X_val, y_val, oss, rfcb)

Training Count:  Counter({0: 13383, 1: 4656})
Training Accuracy:  0.8297023116580742
Validation Accuracy:  0.8195
Training F1 Score:  0.5766262403528114
Validation F1 Score:  0.4893917963224894
Training AUC Score:  0.8343641504339634
Validation AUC Score:  0.7800234512955817
Training Recall Score:  0.4493127147766323
Validation Recall Score:  0.3952779893373953
Training Precision Score:  0.8046153846153846
Validation Precision Score:  0.6423267326732673
Training Average Precision Score:  0.7146237317981492
Validation Average Precision Score:  0.5427343210219899


In [26]:
X_tr_oss, y_tr_oss = oss.fit_resample(X_tr, y_tr)
ossm = rfcb.fit(X_tr_oss, y_tr_oss)
y_pred_oss_tr = ossm.predict(X_tr_oss)
y_pred_oss_val = ossm.predict(X_val)

In [27]:
ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
sampling(X_tr, y_tr, X_val, y_val, ncr, rfcb)

Training Count:  Counter({0: 10206, 1: 4656})
Training Accuracy:  0.8460503296999058
Validation Accuracy:  0.7906666666666666
Training F1 Score:  0.700836820083682
Validation F1 Score:  0.5313432835820896
Training AUC Score:  0.8873870833846811
Validation AUC Score:  0.7787279589589328
Training Recall Score:  0.5756013745704467
Validation Recall Score:  0.5422696115765423
Training Precision Score:  0.8957219251336899
Validation Precision Score:  0.520848573518654
Training Average Precision Score:  0.8414602542438069
Validation Average Precision Score:  0.5288568446973456


In [28]:
X_tr_ncr, y_tr_ncr = ncr.fit_resample(X_tr, y_tr)
ncrm = rfcb.fit(X_tr_ncr, y_tr_ncr)
y_pred_ncr_tr = ncrm.predict(X_tr_ncr)
y_pred_ncr_val = ncrm.predict(X_val)

# Oversampling Methods

## SMOTE

In [29]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
sampling(X_tr, y_tr, X_val, y_val, sm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7596977484092021
Validation Accuracy:  0.7608333333333334
Training F1 Score:  0.7418750616148005
Validation F1 Score:  0.5221445221445221
Training AUC Score:  0.8424826914108536
Validation AUC Score:  0.7734004752332252
Training Recall Score:  0.6906510034263338
Validation Recall Score:  0.597105864432597
Training Precision Score:  0.801306168808121
Validation Precision Score:  0.463905325443787
Training Average Precision Score:  0.8493504655835558
Validation Average Precision Score:  0.5367279167232706


In [30]:
X_tr_sm, y_tr_sm = sm.fit_resample(X_tr, y_tr)
smm = rfcb.fit(X_tr_sm, y_tr_sm)
y_pred_sm_tr = smm.predict(X_tr_sm)
y_pred_sm_val = smm.predict(X_val)

## SVMSMOTE

In [31]:
svmsm = SVMSMOTE()
sampling(X_tr, y_tr, X_val, y_val, svmsm, rfcb)

Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.8231461086637298
Validation Accuracy:  0.797
Training F1 Score:  0.8103906326871987
Validation F1 Score:  0.5245901639344263
Training AUC Score:  0.9040624882452624
Validation AUC Score:  0.7771822241389424
Training Recall Score:  0.7558737151248165
Validation Recall Score:  0.5118050266565118
Training Precision Score:  0.8733828207847296
Validation Precision Score:  0.5380304243394716
Training Average Precision Score:  0.9141460399862354
Validation Average Precision Score:  0.5340016714128782


In [32]:
X_tr_svm, y_tr_svm = svmsm.fit_resample(X_tr, y_tr)
svmsmote = rfcb.fit(X_tr_svm, y_tr_svm)
y_pred_svmsm_tr = svmsmote.predict(X_tr_svm)
y_pred_svmsm_val = svmsmote.predict(X_val)

## ADASYN

In [33]:
adsn = ADASYN()
sampling(X_tr, y_tr, X_val, y_val, adsn, rfcb)

Training Count:  Counter({0: 16344, 1: 15798})
Training Accuracy:  0.7357040632194636
Validation Accuracy:  0.7298333333333333
Training F1 Score:  0.7182701555400789
Validation F1 Score:  0.5068451475509583
Training AUC Score:  0.809981021795791
Validation AUC Score:  0.7700347950798427
Training Recall Score:  0.6854665147487023
Validation Recall Score:  0.6344249809596344
Training Precision Score:  0.7543712991988855
Validation Precision Score:  0.4219858156028369
Training Average Precision Score:  0.8054226409857588
Validation Average Precision Score:  0.5344529268723647


In [34]:
X_tr_adsn, y_tr_adsn = adsn.fit_resample(X_tr, y_tr)
adsnm = rfcb.fit(X_tr_adsn, y_tr_adsn)
y_pred_adsnm_tr = adsnm.predict(X_tr_adsn)
y_pred_adsnm_val = adsnm.predict(X_val)

# Combined Methods

## SMOTE Tomek

In [35]:
smtk = SMOTETomek()
sampling(X_tr, y_tr, X_val, y_val, smtk, rfcb)

Training Count:  Counter({0: 15455, 1: 15455})
Training Accuracy:  0.7701067615658364
Validation Accuracy:  0.7606666666666667
Training F1 Score:  0.7542706964520368
Validation F1 Score:  0.5251322751322751
Training AUC Score:  0.8555206822993797
Validation AUC Score:  0.7739561272928264
Training Recall Score:  0.7056615981882886
Validation Recall Score:  0.6047220106626047
Training Precision Score:  0.8100720493203595
Validation Precision Score:  0.4640561075394506
Training Average Precision Score:  0.8611250344543606
Validation Average Precision Score:  0.53542346377142


In [36]:
X_tr_smt, y_tr_smt = smtk.fit_resample(X_tr, y_tr)
smtkm = rfcb.fit(X_tr_smt, y_tr_smt)
y_pred_smtk_tr = smtkm.predict(X_tr_smt)
y_pred_smtk_val = smtkm.predict(X_val)

## SMOTE ENN

In [37]:
smenn = SMOTEENN(sampling_strategy="minority", n_jobs= -1)
sampling(X_tr, y_tr, X_val, y_val, smenn, rfcb)

Training Count:  Counter({1: 10912, 0: 8584})
Training Accuracy:  0.87443578169881
Validation Accuracy:  0.7351666666666666
Training F1 Score:  0.8819899730042422
Validation F1 Score:  0.5148091603053434
Training AUC Score:  0.9497287821337111
Validation AUC Score:  0.7721236698352673
Training Recall Score:  0.8383431085043989
Validation Recall Score:  0.642041127189642
Training Precision Score:  0.9304312449145646
Validation Precision Score:  0.42966360856269115
Training Average Precision Score:  0.9654422190097608
Validation Average Precision Score:  0.525527894928956


In [38]:
X_tr_sme, y_tr_sme = smenn.fit_resample(X_tr, y_tr)
smennm = rfcb.fit(X_tr_sme, y_tr_sme)
y_pred_smenn_tr = smennm.predict(X_tr_sme)
y_pred_smenn_val = smennm.predict(X_val)

# SMOTE Variants

In [39]:
pfsm = sv.polynom_fit_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, pfsm, rfcb)

2021-06-09 01:45:42,973:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")
Training Count:  Counter({1: 18624, 0: 16344})
Training Accuracy:  0.8984786090139556
Validation Accuracy:  0.822
Training F1 Score:  0.8991992730989834
Validation F1 Score:  0.4738916256157635
Training AUC Score:  0.9570314093347203
Validation AUC Score:  0.7789750327874526
Training Recall Score:  0.8501932989690721
Validation Recall Score:  0.36633663366336633
Training Precision Score:  0.954200313366277
Validation Precision Score:  0.6708507670850767
Training Average Precision Score:  0.9705378081648768
Validation Average Precision Score:  0.5446084552864543


In [40]:
X_tr_pfs, y_tr_pfs = pfsm.sample(X_tr, y_tr)
pfsmm = rfcb.fit(X_tr_pfs, y_tr_pfs)
y_pred_pfsm_tr = pfsmm.predict(X_tr_pfs)
y_pred_pfsm_val = pfsmm.predict(X_val)

2021-06-09 01:45:54,935:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star'}")


In [41]:
pws = sv.ProWSyn()
sampling2(X_tr, y_tr, X_val, y_val, pws, rfcb)

2021-06-09 01:46:05,190:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.8669542339696524
Validation Accuracy:  0.8145
Training F1 Score:  0.8576851336758402
Validation F1 Score:  0.4910836762688614
Training AUC Score:  0.9339168265310986
Validation AUC Score:  0.7720182917505616
Training Recall Score:  0.801823299069995
Validation Recall Score:  0.408987052551409
Training Precision Score:  0.92191347168484
Validation Precision Score:  0.61441647597254
Training Average Precision Score:  0.9472562938129672
Validation Average Precision Score:  0.5371462864688781


In [42]:
X_tr_pws, y_tr_pws = pws.sample(X_tr, y_tr)
pwsm = rfcb.fit(X_tr_pws, y_tr_pws)
y_pred_pws_tr = pwsm.predict(X_tr_pws)
y_pred_pws_val = pwsm.predict(X_val)

2021-06-09 01:46:20,962:INFO:ProWSyn: Running sampling via ('ProWSyn', "{'proportion': 1.0, 'n_neighbors': 5, 'L': 5, 'theta': 1.0, 'n_jobs': 1}")


In [43]:
smipf = sv.SMOTE_IPF()
sampling2(X_tr, y_tr, X_val, y_val, smipf, rfcb)

2021-06-09 01:46:34,599:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-06-09 01:46:34,600:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-06-09 01:46:36,704:INFO:SMOTE_IPF: Removing 44 elements
2021-06-09 01:46:38,480:INFO:SMOTE_IPF: Removing 0 elements
2021-06-09 01:46:40,255:INFO:SMOTE_IPF: Removing 0 elements
Training Count:  Counter({0: 16335, 1: 16309})
Training Accuracy:  0.7629579708369072
Validation Accuracy:  0.7588333333333334
Training F1 Score:  0.7465277777777778
Validation F1 Score:  0.5207022192779066
Training AUC Score:  0.8440929828875134
Validation AUC Score:  0.7725767874747462
Training Recall Score:  0.6986939726531363
Validation Recall Score:  0.5986290936785986
Training Precision Score:  0.8013925029889585
Validation Precision Score:  0.46072684642438455
Tr

In [44]:
X_tr_smi, y_tr_smi = smipf.sample(X_tr, y_tr)
smipfm = rfcb.fit(X_tr_smi, y_tr_smi)
y_pred_smipf_tr = smipfm.predict(X_tr_smi)
y_pred_smipf_val = smipfm.predict(X_val)

2021-06-09 01:46:53,762:INFO:SMOTE_IPF: Running sampling via ('SMOTE_IPF', "{'proportion': 1.0, 'n_neighbors': 5, 'n_folds': 9, 'k': 3, 'p': 0.01, 'voting': 'majority', 'n_jobs': 1, 'classifier': DecisionTreeClassifier()}")
2021-06-09 01:46:53,764:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1}")
2021-06-09 01:46:55,903:INFO:SMOTE_IPF: Removing 44 elements
2021-06-09 01:46:57,703:INFO:SMOTE_IPF: Removing 0 elements
2021-06-09 01:46:59,506:INFO:SMOTE_IPF: Removing 0 elements


In [45]:
smobd = sv.SMOBD()
sampling2(X_tr, y_tr, X_val, y_val, smobd, rfcb)

2021-06-09 01:47:11,106:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7779613313754283
Validation Accuracy:  0.7586666666666667
Training F1 Score:  0.7678480040941659
Validation F1 Score:  0.5217965653896961
Training AUC Score:  0.8644331029195115
Validation AUC Score:  0.7736481990422213
Training Recall Score:  0.7343979441997063
Validation Recall Score:  0.6016755521706016
Training Precision Score:  0.804490616621984
Validation Precision Score:  0.4606413994169096
Training Average Precision Score:  0.871535709534764
Validation Average Precision Score:  0.5362933780787963


In [46]:
X_tr_smo, y_tr_smo = smobd.sample(X_tr, y_tr)
smobdm = rfcb.fit(X_tr_smo, y_tr_smo)
y_pred_smobd_tr = smobdm.predict(X_tr_smo)
y_pred_smobd_val = smobdm.predict(X_val)

2021-06-09 01:47:38,051:INFO:SMOBD: Running sampling via ('SMOBD', "{'proportion': 1.0, 'eta1': 0.5, 't': 1.8, 'min_samples': 5, 'max_eps': 1.0, 'n_jobs': 1}")


In [47]:
gsm = sv.G_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, gsm, rfcb)

2021-06-09 01:48:04,158:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7701908957415565
Validation Accuracy:  0.7648333333333334
Training F1 Score:  0.753672612801679
Validation F1 Score:  0.5185943364039577
Training AUC Score:  0.84795639169026
Validation AUC Score:  0.7712737391150614
Training Recall Score:  0.7031326480665687
Validation Recall Score:  0.5788271134805788
Training Precision Score:  0.8120407009609949
Validation Precision Score:  0.46971569839307786
Training Average Precision Score:  0.8559895630644604
Validation Average Precision Score:  0.5348586690613729


In [48]:
X_tr_gsm, y_tr_gsm = gsm.sample(X_tr, y_tr)
gsmm = rfcb.fit(X_tr_gsm, y_tr_gsm)
y_pred_gsm_tr = gsmm.predict(X_tr_gsm)
y_pred_gsm_val = gsmm.predict(X_val)

2021-06-09 01:48:20,724:INFO:G_SMOTE: Running sampling via ('G_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'method': 'linear', 'n_jobs': 1}")


In [49]:
ccr = sv.CCR()
sampling2(X_tr, y_tr, X_val, y_val, ccr, rfcb)

2021-06-09 01:48:33,812:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16326})
Training Accuracy:  0.8862565044383226
Validation Accuracy:  0.821
Training F1 Score:  0.8784190550975004
Validation F1 Score:  0.4704142011834319
Training AUC Score:  0.9376062690558001
Validation AUC Score:  0.7686860855917039
Training Recall Score:  0.8222467230184981
Validation Recall Score:  0.3632901751713633
Training Precision Score:  0.9428290490237393
Validation Precision Score:  0.6671328671328671
Training Average Precision Score:  0.9544477652437313
Validation Average Precision Score:  0.5355516594316295


In [50]:
X_tr_ccr, y_tr_ccr = ccr.sample(X_tr, y_tr)
ccrm = rfcb.fit(X_tr_ccr, y_tr_ccr)
y_pred_ccr_tr = ccrm.predict(X_tr_ccr)
y_pred_ccr_val = ccrm.predict(X_val)

2021-06-09 01:48:54,686:INFO:CCR: Running sampling via ('CCR', "{'proportion': 1.0, 'energy': 1.0, 'scaling': 0.0, 'n_jobs': 1}")


In [51]:
lvq = sv.LVQ_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, lvq, rfcb)

2021-06-09 01:49:13,327:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.8911221243269701
Validation Accuracy:  0.8165
Training F1 Score:  0.8846615030625141
Validation F1 Score:  0.4724484906564447
Training AUC Score:  0.9429207309607991
Validation AUC Score:  0.7649660035836673
Training Recall Score:  0.8351076847772883
Validation Recall Score:  0.37547600913937546
Training Precision Score:  0.940467167367188
Validation Precision Score:  0.6369509043927648
Training Average Precision Score:  0.9580422276154233
Validation Average Precision Score:  0.5262108731024022


In [52]:
X_tr_lvq, y_tr_lvq = lvq.sample(X_tr, y_tr)
lvqm = rfcb.fit(X_tr_lvq, y_tr_lvq)
y_pred_lvq_tr = lvqm.predict(X_tr_lvq)
y_pred_lvq_val = lvqm.predict(X_val)

2021-06-09 01:49:29,963:INFO:LVQ_SMOTE: Running sampling via ('LVQ_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_clusters': 10, 'n_jobs': 1}")


In [53]:
ass = sv.Assembled_SMOTE()
sampling2(X_tr, y_tr, X_val, y_val, ass, rfcb)

2021-06-09 01:49:45,035:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")
Training Count:  Counter({0: 16344, 1: 16344})
Training Accuracy:  0.7604013705335291
Validation Accuracy:  0.76
Training F1 Score:  0.7436669503174707
Validation F1 Score:  0.521594684385382
Training AUC Score:  0.8425735248358289
Validation AUC Score:  0.7723621314224776
Training Recall Score:  0.6951174743024964
Validation Recall Score:  0.5978674790555979
Training Precision Score:  0.7995073891625616
Validation Precision Score:  0.4625810253388332
Training Average Precision Score:  0.8498748197266996
Validation Average Precision Score:  0.5349654339307137


In [58]:
X_tr_ass, y_tr_ass = ass.sample(X_tr, y_tr)
assm = rfcb.fit(X_tr_ass, y_tr_ass)
y_pred_ass_tr = assm.predict(X_tr_ass)
y_pred_ass_val = assm.predict(X_val)

2021-06-09 01:59:44,405:INFO:Assembled_SMOTE: Running sampling via ('Assembled_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'pop': 2, 'thres': 0.3, 'n_jobs': 1}")


In [62]:
data = {'Accuracy': ["",
                     accuracy(y_val, y_pred_bc_val),
                     accuracy(y_val, y_pred_bbc_val),
                     accuracy(y_val, y_pred_bbc3_val),
                     accuracy(y_val, y_pred_brf_val),
                     accuracy(y_val, y_pred_rbc_val),
                     accuracy(y_val, y_pred_eec_val),
                     "",
                     accuracy(y_val, y_pred_tl_val),
                     accuracy(y_val, y_pred_enn_val),
                     accuracy(y_val, y_pred_oss_val),
                     accuracy(y_val, y_pred_ncr_val),
                     "",
                     accuracy(y_val, y_pred_sm_val),
                     accuracy(y_val, y_pred_svmsm_val),
                     accuracy(y_val, y_pred_adsnm_val),
                     "",
                     accuracy(y_val, y_pred_smtk_val),
                     accuracy(y_val, y_pred_smenn_val),
                     "",
                     accuracy(y_val, y_pred_pfsm_val),
                     accuracy(y_val, y_pred_pws_val),
                     accuracy(y_val, y_pred_smipf_val),
                     accuracy(y_val, y_pred_smobd_val),
                     accuracy(y_val, y_pred_gsm_val),
                     accuracy(y_val, y_pred_ccr_val),
                     accuracy(y_val, y_pred_lvq_val),
                     accuracy(y_val, y_pred_ass_val)],
    'F1 Score': ["",
                f1_score(y_val, y_pred_bc_val),
                f1_score(y_val, y_pred_bbc_val),
                f1_score(y_val, y_pred_bbc3_val),
                f1_score(y_val, y_pred_brf_val),
                f1_score(y_val, y_pred_rbc_val),
                f1_score(y_val, y_pred_eec_val),
                "",
                f1_score(y_val, y_pred_tl_val),
                f1_score(y_val, y_pred_enn_val),
                f1_score(y_val, y_pred_oss_val),
                f1_score(y_val, y_pred_ncr_val),
                "",
                f1_score(y_val, y_pred_sm_val),
                f1_score(y_val, y_pred_svmsm_val),
                f1_score(y_val, y_pred_adsnm_val),
                "",
                f1_score(y_val, y_pred_smtk_val),
                f1_score(y_val, y_pred_smenn_val),
                "",
                f1_score(y_val, y_pred_pfsm_val),
                f1_score(y_val, y_pred_pws_val),
                f1_score(y_val, y_pred_smipf_val),
                f1_score(y_val, y_pred_smobd_val),
                f1_score(y_val, y_pred_gsm_val),
                f1_score(y_val, y_pred_ccr_val),
                f1_score(y_val, y_pred_lvq_val),
                f1_score(y_val, y_pred_ass_val)],
    'PR AUC Score': ["",
              aps(X_val, y_val, bc),
              aps(X_val, y_val, bbc),
              aps(X_val, y_val, bbc3),
              aps(X_val, y_val, brf),
              aps(X_val, y_val, rbc),
              aps(X_val, y_val, eec),
              "",
              aps(X_val, y_val, tlm),
              aps(X_val, y_val, ennm),
              "",
              "",
              "",
              aps(X_val, y_val, smm),
              aps(X_val, y_val, svmsmote),
              aps(X_val, y_val, adsnm),
              "",
              aps(X_val, y_val, smtkm),
              aps(X_val, y_val, smennm),
              "",
              aps(X_val, y_val, pfsmm),
              aps(X_val, y_val, pwsm),
              aps(X_val, y_val, smipfm),
              aps(X_val, y_val, smobdm),
              aps(X_val, y_val, gsmm),
              aps(X_val, y_val, ccrm),
              aps(X_val, y_val, lvqm),
              aps(X_val, y_val, assm)],
            'ROC AUC Score': ["",
              auc(y_val, y_bc),
              auc(X_val, y_val, bbc),
              auc(X_val, y_val, bbc3),
              auc(X_val, y_val, brf),
              auc(X_val, y_val, rbc),
              auc(X_val, y_val, eec),
              "",
              auc(X_val, y_val, tlm),
              auc(X_val, y_val, ennm),
              "",
              "",
              "",
              auc(X_val, y_val, smm),
              auc(X_val, y_val, svmsmote),
              auc(X_val, y_val, adsnm),
              "",
              auc(X_val, y_val, smtkm),
              auc(X_val, y_val, smennm),
              "",
              auc(X_val, y_val, pfsmm),
              auc(X_val, y_val, pwsm),
              auc(X_val, y_val, smipfm),
              auc(X_val, y_val, smobdm),
              auc(X_val, y_val, gsmm),
              auc(X_val, y_val, ccrm),
              auc(X_val, y_val, lvqm),
              auc(X_val, y_val, assm)]}
scores = pd.DataFrame(data=data, index = ['ENSEMBLE METHODS',
                                          'BaggingClassifier',
                                          'BalancedBaggingClassifier',
                                          'BBC with GradientBoostingClassifier',
                                          'BalancedRandomForestClassifier',
                                          'RUBoostClassifier',
                                          'EasyEnsembleClassifier',
                                          'UNDERSAMPLING METHODS',
                                          'TomekLinks',
                                          'EditedNearestNeighbours',
                                          'OneSidedSelection',
                                          'NeighbourhoodCleaningRule',
                                          'OVERSAMPLING METHODS',
                                          'SMOTE',
                                          'SVMSMOTE',
                                          'ADASYN',
                                          'COMBINED METHODS',
                                          'SMOTETomek',
                                          'SMOTEENN',
                                          'SMOTE-VARIANTS',
                                          'sv.polynom_fit_SMOTE',
                                          'sv.ProWSyn',
                                          'sv.SMOTE_IPF',
                                          'sv.SMOBD',
                                          'sv.G_SMOTE',
                                          'sv.CCR',
                                          'sv.LQV_SMOTE',
                                          'sv.Assembled_SMOTE'])

TypeError: auc() takes 2 positional arguments but 3 were given

In [60]:
scores

Unnamed: 0,Accuracy,F1 Score,PR AUC Score
ENSEMBLE METHODS,,,
BaggingClassifier,0.807,0.443269,0.204693
BalancedBaggingClassifier,0.763167,0.51118,0.307574
BBC with GradientBoostingClassifier,0.753333,0.527157,0.43927
BalancedRandomForestClassifier,0.724,0.508314,0.511147
RUBoostClassifier,0.756667,0.519737,0.524461
EasyEnsembleClassifier,0.751667,0.527883,0.526296
UNDERSAMPLING METHODS,,,
TomekLinks,0.819667,0.489623,0.535317
EditedNearestNeighbours,0.786667,0.527326,0.535317
