<h1><center><font size="6">Hyperparameter Tuning and Class Imbalance Notebook</font></center></h1>

# Contents

- <a href='#1'>Importing Packages</a>
- <a href='#2'>Preparing Data for Modeling</a>  
- <a href='#3'>Hyperparameter Tuning</a>
    - <a href='#31'>Logistic Regression with GridSearchCV</a>
    - <a href='#32'>Random Forest Classifier with GridSearchCV</a>
    - <a href='#33'>AdaBoost Classifier with GridSearchCV</a>
    - <a href='#34'>Gradient Boosting Classifier with GridSearchCV</a>
    - <a href='#35'>XGBoost Classifier with GridSearchCV</a>
    - <a href='#36'>Evaluation Metrics</a>
- <a href='#4'>Class Imbalance</a>
    - <a href='#41'>Ensemble Methods</a>
    - <a href='#42'>Undersampling/Downsampling Methods for Majority Class</a>
    - <a href='#43'>Oversampling/Upsampling Methods for Minority Class</a>


# <a id='1'>Importing Packages</a>

In [56]:
import numpy as np 
import pandas as pd
import re
from matplotlib import pyplot as plt
from matplotlib import style
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import itertools
from collections import Counter


from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report, plot_confusion_matrix, auc, mean_squared_error, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from imblearn.metrics import geometric_mean_score
from imblearn.under_sampling import TomekLinks

import xgboost as xgb
from xgboost.sklearn import XGBClassifier


%reload_ext autoreload
%autoreload 2

from utils import *

plt.style.use("fivethirtyeight")
sns.set_theme(style="darkgrid", font='serif', context='poster')

import pickle

from imblearn.under_sampling import NeighbourhoodCleaningRule
from matplotlib import pyplot
from numpy import where
from imblearn.under_sampling import NearMiss 
from imblearn.under_sampling import OneSidedSelection
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from sklearn.utils import resample
from sklearn.model_selection import cross_validate

# <a id='2'>Preparing Data for Modeling</a>

In [59]:
pickle_in = open("../data/pickles/training_model.pickle","rb")
train = pickle.load(pickle_in)
pickle_in = open("../data/pickles/validate_model.pickle","rb")
validate = pickle.load(pickle_in)

In [60]:
train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1,default
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844,0
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435,0
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339,1
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758,0
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379,0


In [61]:
X_train = train.drop(["default"], axis=1)
y_tr = train["default"]
X_validate = validate.drop(["default"], axis=1)
y_val = validate["default"]

In [62]:
scaler = StandardScaler()
scaler.fit(X_train)
X_tr = scaler.transform(X_train)
X_val = scaler.transform(X_validate)

# <a id='3'>Hyperparameter Tuning</a>

## <a id='31'>Logistic Regression with GridSearchCV</a>

In [36]:
# logreg = LogisticRegression()
# params = {'C': [0.001, 0.01, 0.1, 1, 10], 
#           'penalty': ['none', 'l1', 'l2', 'elasticnet'],
#           'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']}
# gslog = GridSearchCV(estimator = logreg,
#                      param_grid = params,
#                      scoring = 'average_precision',
#                      cv = 10,
#                      n_jobs = -1).fit(X_tr, y_tr)
# y_pred_gslog_tr = gslog.predict(X_tr)
# y_pred_gslog_val = gslog.predict(X_val)
# print("Best: %f using %s" % (gslog.best_score_, gslog.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gslog_tr, y_pred_gslog_val, gslog)

# Best: 0.522622 using {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}

In [37]:
logb = LogisticRegression(C=1, penalty='l2', solver='newton-cg').fit(X_tr, y_tr)
y_pred_logb_tr = logb.predict(X_tr)
y_pred_logb_val = logb.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_logb_tr, y_pred_logb_val, logb)

Training Accuracy:  0.8087142857142857
Validation Accuracy:  0.8071666666666667
Training F1 Score:  0.40320903283316006
Validation F1 Score:  0.380952380952381
Training AUC Score:  0.7476971040793051
Validation AUC Score:  0.7467126831177808
Training Recall Score:  0.2914518900343643
Validation Recall Score:  0.27113480578827115
Training Precision Score:  0.6539759036144578
Validation Precision Score:  0.6402877697841727
Training Average Precision Score:  0.5205202430336466
Validation Average Precision Score:  0.4986582661881126


## <a id='33'>Random Forest Classifier with GridSearchCV</a>

In [27]:
# rfc = RandomForestClassifier()
# params = {'n_estimators': [100, 200, 400, 600, 1000],
#           'criterion': ['entropy', 'gini'],
#           'max_depth': [5, 8, 15, 25, 30],
#           'min_samples_split': [2, 5, 10, 15, 100],
#           'min_samples_leaf': [1, 2, 5, 10]}
# gsrfc = GridSearchCV(estimator = rfc,
#                      param_grid = params,
#                      scoring = 'average_precision',
#                      cv = 5,
#                      n_jobs = -1).fit(X_tr, y_tr)
# y_pred_gsrfc_tr = gsrfc.predict(X_tr)
# y_pred_gsrfc_val = gsrfc.predict(X_val)
# print("Best: %f using %s" % (gsrfc.best_score_, gsrfc.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsrfc_tr, y_pred_gsrfc_val, gsrfc)

# Best: 0.565196 using {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}

In [53]:
rfcb = RandomForestClassifier(criterion='gini', max_depth=8, min_samples_leaf=1, min_samples_split=2, n_estimators=400).fit(X_tr, y_tr)
y_pred_rfcb_tr = rfcb.predict(X_tr)
y_pred_rfcb_val = rfcb.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_rfcb_tr, y_pred_rfcb_val, rfcb)

Training Accuracy:  0.8333333333333334
Validation Accuracy:  0.82
Training F1 Score:  0.5149667405764967
Validation F1 Score:  0.46375372393247266
Training AUC Score:  0.8263813590913907
Validation AUC Score:  0.7805258862036932
Training Recall Score:  0.39905498281786944
Validation Recall Score:  0.3556740289413557
Training Precision Score:  0.72578125
Validation Precision Score:  0.666191155492154
Training Average Precision Score:  0.6523890600337748
Validation Average Precision Score:  0.5445212676503224


## <a id='34'>AdaBoost Classifier with GridSearchCV</a>

In [28]:
# abc = AdaBoostClassifier()
# params = {'n_estimators': [10, 50, 100, 200],
#           'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5]}
# gsabc = GridSearchCV(estimator = abc,
#                      param_grid = params,
#                      n_jobs = -1,
#                      cv = 5,
#                      scoring = 'average_precision').fit(X_tr, y_tr)
# y_pred_gsabc_tr = gsabc.predict(X_tr)
# y_pred_gsabc_val = gsabc.predict(X_val)
# print("Best: %f using %s" % (gsabc.best_score_, gsabc.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsabc_tr, y_pred_gsabc_val, gsabc)

# Best: 0.545818 using {'learning_rate': 0.1, 'n_estimators': 200}

In [43]:
abcb = AdaBoostClassifier(learning_rate=0.1, n_estimators=200).fit(X_tr, y_tr)
y_pred_abcb_tr = abcb.predict(X_tr)
y_pred_abcb_val = abcb.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_abcb_tr, y_pred_abcb_val, abcb)

Training Accuracy:  0.8192380952380952
Validation Accuracy:  0.8208333333333333
Training F1 Score:  0.44969556393157434
Validation F1 Score:  0.4438696326952923
Training AUC Score:  0.7866775976198166
Validation AUC Score:  0.7772694027703144
Training Recall Score:  0.3331185567010309
Validation Recall Score:  0.32673267326732675
Training Precision Score:  0.691793041926851
Validation Precision Score:  0.6919354838709677
Training Average Precision Score:  0.5528304845005094
Validation Average Precision Score:  0.5244329096074963


## <a id='35'>Gradient Boosting with GridSearchCV</a>

In [29]:
# gbc = GradientBoostingClassifier()
# params = {'n_estimators': [10, 100, 1000],
#           'learning_rate': [0.001, 0.01, 0.1],
#           'max_depth': [3, 7, 9]}
# gsgbc = GridSearchCV(estimator = gbc,
#                      param_grid = params, 
#                      n_jobs = -1, 
#                      cv = 5, 
#                      scoring = 'average_precision').fit(X_tr, y_tr)
# y_pred_gsgbc_tr = gsgbc.predict(X_tr)
# y_pred_gsgbc_val = gsgbc.predict(X_val)
# print("Best: %f using %s" % (gsgbc.best_score_, gsgbc.best_params_))
# print("")
# get_metric(X_tr, y_tr, X_val, y_val, y_pred_gsgbc_tr, y_pred_gsgbc_tr, gsgbc)

# Best: 0.558390 using {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}

In [44]:
gbcb = GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=1000).fit(X_tr, y_tr)
y_pred_gbcb_tr = gbcb.predict(X_tr)
y_pred_gbcb_val = gbcb.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_gbcb_tr, y_pred_gbcb_val, gbcb)

Training Accuracy:  0.8276190476190476
Validation Accuracy:  0.8196666666666667
Training F1 Score:  0.49694274596998333
Validation F1 Score:  0.4622266401590458
Training AUC Score:  0.805111244939135
Validation AUC Score:  0.7812120218438938
Training Recall Score:  0.38402061855670105
Validation Recall Score:  0.3541507996953541
Training Precision Score:  0.7039370078740157
Validation Precision Score:  0.6652360515021459
Training Average Precision Score:  0.6003473510491001
Validation Average Precision Score:  0.5434988921201059


## <a id='36'>XGBoost Classifier with GridSearchCV</a>

In [32]:
# xgb = XGBClassifier()
# params = {'n_estimators': [50, 100, 150, 200], 
#           'max_depth': [3, 5, 7, 10], 
#           'min_child_weight': [2, 3, 4, 5]}
# gsxgb = GridSearchCV(estimator = xgb,
#                      param_grid = params,
#                      scoring = 'average_precision',
#                      cv = 5,
#                      n_jobs = -1).fit(X_tr, y_tr)
# y_pred_gsxgb_tr = gsxgb.predict(X_tr)
# y_pred_gsxgb_val = gsxgb.predict(X_val)
# print("Best: %f using %s" % (gsxgb.best_score_, gsxgb.best_params_))
# print("")
# get_metrics(X_tr, y_tr, X_val, y_val, y_pred_gsxgb_tr, y_pred_gsxgb_val, gsxgb)

# Best: 0.555500 using {'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 50}


In [46]:
xgbb = XGBClassifier(max_depth=3, min_child_weight=1, n_estimators=50).fit(X_tr, y_tr)
y_pred_xgbb_tr = xgbb.predict(X_tr)
y_pred_xgbb_val = xgbb.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_xgbb_tr, y_pred_xgbb_val, xgbb)

Training Accuracy:  0.8276190476190476
Validation Accuracy:  0.8185
Training F1 Score:  0.49833702882483377
Validation F1 Score:  0.4579392732702837
Training AUC Score:  0.810540741434586
Validation AUC Score:  0.7767761488364293
Training Recall Score:  0.3861683848797251
Validation Recall Score:  0.3503427265803503
Training Precision Score:  0.70234375
Validation Precision Score:  0.6609195402298851
Training Average Precision Score:  0.6011613543281619
Validation Average Precision Score:  0.5393954705676095


**Best Hyperparameters for each Model:**

- 

## <a id='37'>Evaluation Metrics</a>

In [47]:
data = {'Accuracy': [accuracy(y_val, y_pred_logb_val), 
                     accuracy(y_val, y_pred_rfcb_val), 
                     accuracy(y_val, y_pred_abcb_val), 
                     accuracy(y_val, y_pred_gbcb_val),
                     accuracy(y_val, y_pred_xgbb_val)],
        'F1 Score': [f1(y_val, y_pred_logb_val), 
                     f1(y_val, y_pred_rfcb_val), 
                     f1(y_val, y_pred_abcb_val), 
                     f1(y_val, y_pred_gbcb_val),
                     f1(y_val, y_pred_xgbb_val)],
        'Recall': [recall(y_val, y_pred_logb_val), 
                   recall(y_val, y_pred_rfcb_val), 
                   recall(y_val, y_pred_abcb_val),
                   recall(y_val, y_pred_gbcb_val),
                   recall(y_val, y_pred_xgbb_val)],
        'Precision': [precision(y_val, y_pred_logb_val), 
                      precision(y_val, y_pred_rfcb_val), 
                      precision(y_val, y_pred_abcb_val),
                      precision(y_val, y_pred_gbcb_val),
                      precision(y_val, y_pred_xgbb_val)],
        'PR AUC': [aps(X_val, y_val, logb),
                   aps(X_val, y_val, rfcb),
                   aps(X_val, y_val, abcb),
                   aps(X_val, y_val, gbcb),
                   aps(X_val, y_val, xgbb)]}
scores3 = pd.DataFrame(data=data, index = ['Logistic Regression with GridSearchCV', 
                                          'Random Forest with GridSearchCV', 
                                          'AdaBoost with GridSearchCV', 
                                          'Gradient Boosting with GridSearchCV',
                                          'XGBoost with GridSearchCV'])

In [51]:
scores3

Unnamed: 0,Accuracy,F1 Score,Recall,Precision,PR AUC
Logistic Regression with GridSearchCV,0.807167,0.380952,0.271135,0.640288,0.498658
Random Forest with GridSearchCV,0.820667,0.464143,0.354912,0.670504,0.545164
AdaBoost with GridSearchCV,0.820833,0.44387,0.326733,0.691935,0.524433
Gradient Boosting with GridSearchCV,0.819667,0.462227,0.354151,0.665236,0.543499
XGBoost with GridSearchCV,0.8185,0.457939,0.350343,0.66092,0.539395


In [49]:
scores3.to_csv("../data/charts/scores3.csv")

## Pickle out best model

In [54]:
rfcb

RandomForestClassifier(max_depth=8, n_estimators=400)

In [55]:
pickle_out = open("../data/best_model.pickle","wb")
pickle.dump(rfcb, pickle_out)
pickle_out.close()

# <a id='4'>Class Imbalance</a>

In [64]:
X_train.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1790.26,0,179.13,0,0,44,0,1631.93,0.344578,0.08844
1,5728.83,-1,173.87,0,0,46,-1,891.69,0.957227,0.84435
2,3580.52,-1,0.0,0,0,47,-1,238.68,0.96865,0.933339
3,6086.88,0,89.26,0,0,29,0,2831.87,0.650602,0.534758
4,5370.78,-2,1171.37,0,0,33,-2,873.4,0.836153,0.837379


In [65]:
X_validate.head()

Unnamed: 0,limit,behind1,paid2,delayed,latemths,age,behind2,billed1,avg_av,avail1
0,1074.16,0,71.61,0,0,25,0,317.38,0.602052,0.704532
1,5370.78,0,151.64,0,0,26,0,4895.86,0.293715,0.088427
2,2506.36,0,111.43,0,0,32,0,2510.73,0.005217,-0.001744
3,4654.68,0,64.74,0,0,49,0,740.38,0.883482,0.840939
4,1790.26,0,53.71,1,1,36,0,3373.85,0.188227,-0.884559


## Dummy Classifier as Baseline

In [87]:
dc = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
y_pred_dc_tr = dc.predict(X_tr)
y_pred_dc_val = dc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_dc_tr, y_pred_dc_val, dc)

Training Accuracy:  0.7782857142857142
Validation Accuracy:  0.7811666666666667
Training F1 Score:  0.0
Validation F1 Score:  0.0
Training AUC Score:  0.5
Validation AUC Score:  0.5
Training Recall Score:  0.0
Validation Recall Score:  0.0
Training Precision Score:  0.0
Validation Precision Score:  0.0
Training Average Precision Score:  0.22171428571428572
Validation Average Precision Score:  0.21883333333333332


## <a id='41'>Ensemble Methods</a>

In [66]:
bc = BaggingClassifier(n_estimators=50, random_state=42).fit(X_tr, y_tr)
y_pred_bc_tr = bc.predict(X_tr)
y_pred_bc_val = bc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_bc_tr, y_pred_bc_val, bc)
print("")
print('Bagging Classifier Performance:')
print('Balanced training accuracy: {:.2f} - Geometric mean {:.2f}'.format(balanced_accuracy_score(y_tr, y_pred_bc_tr), geometric_mean_score(y_tr, y_pred_bc_tr)))
print('Balanced validation accuracy: {:.2f} - Geometric mean {:.2f}'.format(balanced_accuracy_score(y_val, y_pred_bc_val), geometric_mean_score(y_val, y_pred_bc_val)))

Training Accuracy:  0.996904761904762
Validation Accuracy:  0.807
Training F1 Score:  0.9929964443486693
Validation F1 Score:  0.44326923076923075
Training AUC Score:  0.9998954304300327
Validation AUC Score:  0.740359286457933
Training Recall Score:  0.9896907216494846
Validation Recall Score:  0.3511043412033511
Training Precision Score:  0.9963243243243243
Validation Precision Score:  0.6010430247718384
Training Average Precision Score:  0.9996190235446103
Validation Average Precision Score:  0.48114820511254575

Bagging Classifier Performance:
Balanced training accuracy: 0.99 - Geometric mean 0.99
Balanced validation accuracy: 0.64 - Geometric mean 0.57


In [68]:
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42).fit(X_tr, y_tr)
y_pred_bbc_tr = bbc.predict(X_tr)
y_pred_bbc_val = bbc.predict(X_val)
get_metric(X_tr, y_tr, X_val, y_val, y_pred_bbc_tr, y_pred_bbc_val, bbc)
print("")
print('Balanced Bagging Classifier Performance:')
print('Balanced training accuracy: {:.2f} - Geometric mean {:.2f}'.format(balanced_accuracy_score(y_tr, y_pred_bbc_tr), geometric_mean_score(y_val, y_pred_bbc_tr)))
print('Balanced validation accuracy: {:.2f} - Geometric mean {:.2f}'.format(balanced_accuracy_score(y_val, y_pred_bbc_val), geometric_mean_score(y_val, y_pred_bbc_val)))

Training Accuracy:  0.9383333333333334
Validation Accuracy:  0.7631666666666667
Training F1 Score:  0.8778186621379375
Validation F1 Score:  0.5111799105607154
Training AUC Score:  0.9955504481714446
Validation AUC Score:  0.7528188434539899
Training Recall Score:  0.9991408934707904
Validation Recall Score:  0.5658796648895659
Training Precision Score:  0.7827696449604576
Validation Precision Score:  0.46612296110414053
Training Average Precision Score:  0.9819759873453777
Validation Average Precision Score:  0.499848233823849

Balanced Bagging Classifier Performance:
Balanced training accuracy: 0.96 - Geometric mean 0.00
Balanced validation accuracy: 0.69 - Geometric mean 0.68


## <a id='42'>Undersampling/Downsampling Methods for Majority Class</a>

In [69]:
# separate minority and majority classes
majority = train[train.default==0]
minority = train[train.default==1]

#baseline counts
counter = Counter(y_tr)
print("Baseline: ", counter)

Baseline:  Counter({0: 16344, 1: 4656})


In [70]:
downsampled = resample(majority, replace = False, n_samples = len(minority), random_state=42)
dns = pd.concat([downsampled, minority])
print(dns.default.value_counts())

1    4656
0    4656
Name: default, dtype: int64


In [71]:
ns = NearMiss(version=1, n_neighbors=3)
X_tr_nm, y_tr_nm = ns.fit_resample(X_tr, y_tr)
counter_nm = Counter(y_tr_nm)
print("Near Miss: ", counter_nm)

Near Miss:  Counter({0: 4656, 1: 4656})


In [72]:
ncr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5)
X_tr_ncr, y_tr_ncr = ncr.fit_resample(X_tr, y_tr)
counter_ncr = Counter(y_tr_ncr)
print("Neighborhood Cleaning Rule: ", counter_ncr)


Neighborhood Cleaning Rule:  Counter({0: 10215, 1: 4656})


In [73]:
oss = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
X_tr_oss, y_tr_oss = oss.fit_resample(X_tr, y_tr)
counter_oss = Counter(y_tr_oss)
print("One Sided Selection: ", counter_oss)

One Sided Selection:  Counter({0: 13578, 1: 4656})


### TomekLinks

In [77]:
tl = TomekLinks()
X_tr_tk, y_tr_tk = tl.fit_resample(X_tr, y_tr)
counter_tk_tr = Counter(y_tr_tk)
print("Tomek Links: ", counter_tk_tr)

Tomek Links:  Counter({0: 14844, 1: 4656})


In [80]:
X_val_tk, y_val_tk = tl.fit_resample(X_val, y_val)
counter_tk_val = Counter(y_val_tk)
print("Tomek Links: ", counter_tk_val)

Tomek Links:  Counter({0: 4271, 1: 1313})


In [83]:
rfctomek = rfcb.fit(X_tr_tk, y_tr_tk)
y_pred_rfctk_tr = rfctomek.predict(X_tr_tk)
y_pred_rfctk_val = rfctomek.predict(X_val_tk)

In [84]:
get_metric(X_tr_tk, y_tr_tk, X_val_tk, y_val_tk, y_pred_rfctk_tr, y_pred_rfctk_val, rfctomek)

Training Accuracy:  0.8400512820512821
Validation Accuracy:  0.8234240687679083
Training F1 Score:  0.5716247768163715
Validation F1 Score:  0.5152409046214357
Training AUC Score:  0.840326646551397
Validation AUC Score:  0.7966817961265896
Training Recall Score:  0.44695017182130586
Validation Recall Score:  0.3990860624523991
Training Precision Score:  0.7927619047619048
Validation Precision Score:  0.7267683772538142
Training Average Precision Score:  0.7041020024896908
Validation Average Precision Score:  0.6044979671881536


### Edited Nearest Neighbor

In [95]:
from imblearn.under_sampling import EditedNearestNeighbours

enn = EditedNearestNeighbours()
X_tr_enn, y_tr_enn = enn.fit_resample(X_tr, y_tr)
counter_enn_tr = Counter(y_tr_enn)
print("ENN: ", counter_enn_tr)

ENN:  Counter({0: 9921, 1: 4656})


In [97]:
X_val_enn, y_val_enn = enn.fit_resample(X_val, y_val)
counter_enn_val = Counter(y_val_enn)
print("ENN: ", counter_enn_val)

ENN:  Counter({0: 2811, 1: 1313})


In [98]:
rfcenn = rfcb.fit(X_tr_enn, y_tr_enn)
y_pred_rfcenn_tr = rfcenn.predict(X_tr_enn)
y_pred_rfcenn_val = rfcenn.predict(X_val_enn)
get_metric(X_tr_enn, y_tr_enn, X_val_enn, y_val_enn, y_pred_rfcenn_tr, y_pred_rfcenn_val, rfcenn)

Training Accuracy:  0.8494203196816903
Validation Accuracy:  0.8217749757516973
Training F1 Score:  0.713857384956329
Validation F1 Score:  0.6651480637813211
Training AUC Score:  0.8926924442788753
Validation AUC Score:  0.8490285281709354
Training Recall Score:  0.5880584192439863
Validation Recall Score:  0.555978674790556
Training Precision Score:  0.9081260364842454
Validation Precision Score:  0.8276643990929705
Training Average Precision Score:  0.8542295194453019
Validation Average Precision Score:  0.7911057997458948


## Upsampling/Oversampling Methods for Minority Class

In [86]:
# Random Upsampling
upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42) 
ups = pd.concat([majority, upsampled])
y = ups['default']
counter_upsample = Counter(y)
print(counter_upsample)

Counter({0: 16344, 1: 16344})


### SMOTE

In [100]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_tr_sm, y_tr_sm = sm.fit_resample(X_tr, y_tr)
counter_sm_tr = Counter(y_tr_sm)
print("SMOTE: ", counter_sm_tr)

SMOTE:  Counter({0: 16344, 1: 16344})


In [102]:
X_val_sm, y_val_sm = sm.fit_resample(X_val, y_val)
counter_sm_val = Counter(y_val_sm)
print("SMOTE: ", counter_sm_val)

SMOTE:  Counter({0: 4687, 1: 4687})


In [107]:
rfcsm = rfcb.fit(X_tr_sm, y_tr_sm)
y_pred_rfcsm_tr = rfcsm.predict(X_tr_sm)
y_pred_rfcsm_val = rfcsm.predict(X_val_sm)
get_metric(X_tr_sm, y_tr_sm, X_val_sm, y_val_sm, y_pred_rfcsm_tr, y_pred_rfcsm_val, rfcsm)

Training Accuracy:  0.7607684777288302
Validation Accuracy:  0.722210369106038
Training F1 Score:  0.7450775850828009
Validation F1 Score:  0.6985413290113452
Training AUC Score:  0.8460107149450065
Validation AUC Score:  0.8034223373130215
Training Recall Score:  0.6992168379833578
Validation Recall Score:  0.6436953275016002
Training Precision Score:  0.7973765001395479
Validation Precision Score:  0.7636041508478866
Training Average Precision Score:  0.8516893822745881
Validation Average Precision Score:  0.8110946639988013


### ADASYN

In [105]:
from imblearn.over_sampling import ADASYN

adsn = ADASYN()
X_tr_ad, y_tr_ad = adsn.fit_resample(X_tr, y_tr)
counter_ad_tr = Counter(y_tr_ad)
print("SMOTE: ", counter_ad_tr)

SMOTE:  Counter({1: 16573, 0: 16344})


In [106]:
X_val_ad, y_val_ad = adsn.fit_resample(X_val, y_val)
counter_ad_val = Counter(y_val_ad)
print("SMOTE: ", counter_ad_val)

SMOTE:  Counter({0: 4687, 1: 4536})


In [108]:
rfcad = rfcb.fit(X_tr_ad, y_tr_ad)
y_pred_rfcad_tr = rfcad.predict(X_tr_ad)
y_pred_rfcad_val = rfcad.predict(X_val_ad)
get_metric(X_tr_ad, y_tr_ad, X_val_ad, y_val_ad, y_pred_rfcad_tr, y_pred_rfcad_val, rfcad)

Training Accuracy:  0.7387368229182489
Validation Accuracy:  0.692833134554917
Training F1 Score:  0.7327864777529207
Validation F1 Score:  0.672296124927704
Training AUC Score:  0.8175359821019387
Validation AUC Score:  0.7672107011814359
Training Recall Score:  0.711518735292343
Validation Recall Score:  0.640652557319224
Training Precision Score:  0.7553648068669528
Validation Precision Score:  0.7072280360184959
Training Average Precision Score:  0.8187784066680115
Validation Average Precision Score:  0.7655912049884797


## Hybridized Methods

### SMOTETomek

In [110]:
from imblearn.combine import SMOTETomek

smtk = SMOTETomek()
X_tr_smtk, y_tr_smtk = smtk.fit_resample(X_tr, y_tr)
counter_smtk_tr = Counter(y_tr_smtk)
print("SMOTETomek: ", counter_smtk_tr)

SMOTETomek:  Counter({0: 15643, 1: 15643})


In [111]:
X_val_smtk, y_val_smtk = smtk.fit_resample(X_val, y_val)
counter_smtk_val = Counter(y_val_smtk)
print("SMOTETomek: ", counter_smtk_val)

SMOTETomek:  Counter({0: 4471, 1: 4471})


In [113]:
rfcsmtk = rfcb.fit(X_tr_smtk, y_tr_smtk)
y_pred_rfcsmtk_tr = rfcsmtk.predict(X_tr_smtk)
y_pred_rfcsmtk_val = rfcsmtk.predict(X_val_smtk)
get_metric(X_tr_smtk, y_tr_smtk, X_val_smtk, y_val_smtk, y_pred_rfcsmtk_tr, y_pred_rfcsmtk_val, rfcsmtk)

Training Accuracy:  0.7692897781755418
Validation Accuracy:  0.7345112950123015
Training F1 Score:  0.7526218383713756
Validation F1 Score:  0.7106994881793808
Training AUC Score:  0.8555163090488357
Validation AUC Score:  0.8132931372490657
Training Recall Score:  0.701911398069424
Validation Recall Score:  0.652203086557817
Training Precision Score:  0.8112301440709272
Validation Precision Score:  0.7807228915662651
Training Average Precision Score:  0.8613191296119896
Validation Average Precision Score:  0.8208080715161102


### SMOTEENN

from imblearn.combine import SMOTEENN

smenn = SMOTEENN(sampling_strategy="minority", n_jobs= -1)
X_tr_smenn, y_tr_smenn = smenn.fit_resample(X_tr, y_tr)
counter_smenn_tr = Counter(y_tr_smenn)
print("SMOTEENN: ", counter_smenn_tr)

In [115]:
X_val_smenn, y_val_smenn = smenn.fit_resample(X_val, y_val)
counter_smenn_val = Counter(y_val_smenn)
print("SMOTEENN: ", counter_smenn_val)

SMOTEENN:  Counter({1: 3289, 0: 2380})


In [116]:
rfcsmenn = rfcb.fit(X_tr_smenn, y_tr_smenn)
y_pred_rfcsmenn_tr = rfcsmenn.predict(X_tr_smenn)
y_pred_rfcsmenn_val = rfcsmenn.predict(X_val_smenn)
get_metric(X_tr_smenn, y_tr_smenn, X_val_smenn, y_val_smenn, y_pred_rfcsmenn_tr, y_pred_rfcsmenn_val, rfcsmenn)

Training Accuracy:  0.8727996021879662
Validation Accuracy:  0.8110777914976186
Training F1 Score:  0.8854763610315187
Validation F1 Score:  0.8291048348492103
Training AUC Score:  0.9481934939411721
Validation AUC Score:  0.9080211348753549
Training Recall Score:  0.8590913039701156
Validation Recall Score:  0.7899057464274856
Training Precision Score:  0.9135334872979215
Validation Precision Score:  0.8723975822699799
Training Average Precision Score:  0.9647268753117685
Validation Average Precision Score:  0.9393252106055436
