In [36]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
import warnings
warnings.filterwarnings('ignore')

In [38]:
df = pd.read_csv('data/preprocessed.csv')
df.head()

Unnamed: 0,SK_ID_CURR,Tenure_Months,Avg_Monthly_Balance,Avg_Monthly_Utilization,Max_Monthly_Utilization,Pct_Txns_Fully_Paid,Months_Fully_Paid,Pct_Txns_Minimum_Paid,Months_Minimum_Paid,Payment_MinPayment_Ratio_mean,...,AMT_LESS_MAX,DAYS_LATE_AVG,DAYS_LATE_MAX,SK_DPD_DEF,LESS_GRANTED,REJECTED_APPLICATIONS,AMT_CREDIT_SUM_TOTAL,AMT_CREDIT_SUM_DEBT_TOTAL,AMT_CREDIT_SUM_LIMIT_TOTAL,AMT_CREDIT_SUM_OVERDUE_TOTAL
0,100011,74,54482.111149,0.302678,1.05,0.554054,41,0.418919,31,0.50223,...,30833.685,4.807692,32.0,26.0,1,1,435228.3,0.0,0.0,0.0
1,100013,96,18159.919219,0.115301,1.02489,0.770833,74,0.0,0,0.865673,...,23147.82,5.722581,38.0,0.0,1,0,2072280.06,0.0,0.0,0.0
2,100023,8,0.0,0.0,0.0,1.0,8,0.0,0,0.0,...,0.0,15.653846,43.0,0.0,1,0,1645692.345,137038.5,0.0,0.0
3,100028,49,8085.058163,0.035934,0.165937,0.244898,12,0.142857,7,0.0,...,8505.0,3.265487,19.0,0.0,1,0,1520875.08,186304.5,101390.76,0.0
4,100036,12,0.0,0.0,0.0,1.0,12,0.0,0,0.0,...,0.0,18.5,67.0,0.0,0,3,94959.0,8339.355,0.0,0.0


In [39]:
len(df.columns)

48

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline


categoricals = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
cols_to_scale = [
    'Avg_Monthly_Balance','Months_Fully_Paid', 
    'Months_Minimum_Paid', 'Months_Minimum_Paid', 
    'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'OWN_CAR_AGE',
    'Age_Years','Employed_Months', 'AMT_LESS_TOTAL',
    'AMT_LESS_MAX', 'DAYS_LATE_MAX', 'SK_DPD_DEF',
    'AMT_CREDIT_SUM_TOTAL', 'AMT_CREDIT_SUM_DEBT_TOTAL',
    'AMT_CREDIT_SUM_LIMIT_TOTAL', 'AMT_CREDIT_SUM_OVERDUE_TOTAL',
    'Tenure_Months'
]
others = df.columns.difference(categoricals + cols_to_scale)

preproc = ColumnTransformer([
    ('cat', OneHotEncoder(), categoricals),
    ('scaled', MinMaxScaler(), cols_to_scale),
], remainder='passthrough')

In [41]:
X = df.drop(columns='W_DEFAULT')
y = df['W_DEFAULT']

X = preproc.fit_transform(X)

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

knn = KNeighborsClassifier()
lr = LogisticRegression(max_iter=100_000)
svc = SVC(probability=True,)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()

models = [
    knn, lr, svc, dt, rf, gbm
]
model_names = [
    'KNN',
    'Logistic Regression',
    'SVC', 
    'Decision Tree',
    'Random Forest', 
    'Gradient Boosting'
]

In [49]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.metrics import precision_recall_curve, auc, make_scorer

def auc_prc(y_true, y_pred):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

auc_prc_score = make_scorer(auc_prc, needs_proba=True)

In [44]:
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, matthews_corrcoef

for model_name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    print(f'{model_name} results:')

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f'F1 Score: {f1_score(y_test, y_pred)}')
    print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_proba[:,1])}')
    # print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')


KNN results:
F1 Score: 0.0012300123001230013
MCC: -0.01369554739582187
ROC AUC: 0.5127985695596732
Logistic Regression results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.5258266298914015
SVC results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.4757607177885576
Decision Tree results:
F1 Score: 0.15171192443919718
MCC: 0.08275175946719276
ROC AUC: 0.5448883512061188
Random Forest results:
F1 Score: 0.002582311168495804
MCC: 0.019503466914362035
ROC AUC: 0.7318203818846606
Gradient Boosting results:
F1 Score: 0.043451272501551834
MCC: 0.09796272843201512
ROC AUC: 0.7616822193028087


In [50]:
results = {}

for model_name, model in zip(model_names, models):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    results[model_name] = {
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba[:,1]),
        'PRC AUC': auc_prc(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    print(f'Confusion Matrix for {model_name}')
    cm = confusion_matrix(y_test, y_pred)
    display(pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive']))

Confusion Matrix for KNN


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20617,81
Actual Positive,1543,1


Confusion Matrix for Logistic Regression


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20698,0
Actual Positive,1544,0


Confusion Matrix for SVC


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20698,0
Actual Positive,1544,0


Confusion Matrix for Decision Tree


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,19111,1587
Actual Positive,1287,257


Confusion Matrix for Random Forest


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20695,3
Actual Positive,1542,2


Confusion Matrix for Gradient Boosting


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20666,32
Actual Positive,1509,35


In [51]:
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,Precision,Recall,F1,ROC AUC,PRC AUC,MCC
KNN,0.012195,0.000648,0.00123,0.512799,0.041108,-0.013696
Logistic Regression,0.0,0.0,0.0,0.525827,0.534709,0.0
SVC,0.0,0.0,0.0,0.475761,0.534709,0.0
Decision Tree,0.139371,0.166451,0.151712,0.544888,0.181843,0.082752
Random Forest,0.4,0.001295,0.002582,0.73182,0.235312,0.019503
Gradient Boosting,0.522388,0.022668,0.043451,0.761682,0.306451,0.097963


In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

models = {
    'kNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=10_000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier() # XGBoost is much faster than sklearn GBM (order of magnitude)
}

In [53]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

seed = None
ros = RandomOverSampler(random_state=seed)
rus = RandomUnderSampler(random_state=seed)
smote = SMOTE(random_state=seed)
adasyn = ADASYN(random_state=seed)
tomek = TomekLinks()
smotetomek = SMOTETomek(random_state=seed)

methods = {
    'Original': 'passthrough',
    'Random Oversampling': ros,
    'Random Undersampling': rus,
    'SMOTE': smote,
    'ADASYN': adasyn,
    'Tomek': tomek,
    'SMOTETomek': smotetomek,
}

In [54]:
scoring = {
    'auc_prc': auc_prc_score, 
    'precision': make_scorer(precision_score), 
    'recall': make_scorer(recall_score),
    'mcc': make_scorer(matthews_corrcoef),
    'auc_roc': make_scorer(roc_auc_score)
}

In [55]:
from sklearn.model_selection import cross_validate
from imblearn.pipeline import Pipeline

all_results = []
for model_name, model in models.items():
    results_model = {}

    for method, resampler in methods.items():

        pipeline = Pipeline(
            [
                (method, resampler),
                [model_name, model]
            ]
        )

        scores = cross_validate(pipeline, X, y, scoring=scoring)
        scores = {key: np.mean(values) for key, values in scores.items()}

        results_model[method] = {
            'Precision': scores['test_precision'],
            'Recall': scores['test_recall'],
            'AUC PRC': scores['test_auc_prc'],
            'AUC ROC': scores['test_auc_roc'],
            'MCC': scores['test_mcc'],
            'Training Time': scores['fit_time']
        }
    
    results_model = pd.DataFrame.from_dict(results_model, orient='index')
    print(f'Results for {model_name}')
    display(results_model)

    results_model['Model'] = model_name
    all_results.append(results_model.reset_index(names='Sampler'))

all_results = pd.concat(all_results, axis=0).reset_index(drop=True)

Results for kNN


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.049987,0.163367,0.119841,0.489745,-0.013164,0.019326
Random Oversampling,0.066946,0.343653,0.218534,0.495082,-0.006085,0.095744
Random Undersampling,0.070364,0.435952,0.113961,0.495974,-0.004278,0.023648
SMOTE,0.068692,0.323702,0.200499,0.496057,-0.004548,0.214593
ADASYN,0.067178,0.331304,0.203294,0.494328,-0.006713,0.950101
Tomek,0.049984,0.163367,0.125063,0.489739,-0.01317,6.614195
SMOTETomek,0.068084,0.323544,0.201485,0.496123,-0.004718,21.734295


Results for Logistic Regression


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.0,0.0,0.080292,0.5,0.0,0.293089
Random Oversampling,0.078659,0.601923,0.080272,0.532585,0.034365,0.530436
Random Undersampling,0.07731,0.473898,0.078102,0.526095,0.026483,0.057609
SMOTE,0.073113,0.377546,0.073283,0.510329,0.010179,0.39466
ADASYN,0.07432,0.405704,0.075191,0.516125,0.016031,1.103343
Tomek,0.0,0.0,0.082033,0.5,0.0,6.930043
SMOTETomek,0.077896,0.57928,0.078097,0.526296,0.028421,22.784606


Results for Decision Tree


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.12207,0.226989,0.201979,0.52967,0.05791,2.873224
Random Oversampling,0.118128,0.132636,0.156181,0.528362,0.053834,3.34904
Random Undersampling,0.096242,0.571069,0.348886,0.58053,0.083788,0.257632
SMOTE,0.116722,0.204973,0.189077,0.534783,0.059165,4.113169
ADASYN,0.114138,0.195796,0.183524,0.534926,0.057713,4.993103
Tomek,0.119817,0.211312,0.19357,0.532303,0.058928,9.837056
SMOTETomek,0.113418,0.202603,0.186325,0.533979,0.056528,25.414307


Results for Random Forest


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.493499,0.003641,0.181964,0.501657,0.035761,17.670419
Random Oversampling,0.552046,0.015036,0.196951,0.506816,0.066032,26.5965
Random Undersampling,0.144191,0.671577,0.198813,0.683436,0.200288,2.113858
SMOTE,0.510909,0.005065,0.180793,0.502339,0.044661,37.26812
ADASYN,0.520085,0.005065,0.181187,0.502339,0.045622,37.012564
Tomek,0.60677,0.00554,0.180553,0.502637,0.052501,23.228304
SMOTETomek,0.489434,0.007756,0.182397,0.503533,0.053907,50.905072


Results for Gradient Boosting


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.389009,0.141084,0.18057,0.50422,0.067671,43.413203
Random Oversampling,0.162663,0.619396,0.20876,0.646627,0.183272,75.805656
Random Undersampling,0.148126,0.669712,0.207473,0.666249,0.192586,5.091591
SMOTE,0.390908,0.019311,0.180293,0.508034,0.069811,120.527041
ADASYN,0.380501,0.017412,0.180061,0.506867,0.0631,121.52752
Tomek,0.349477,0.173225,0.17688,0.511833,0.079023,48.873028
SMOTETomek,0.440834,0.021052,0.187757,0.509461,0.083287,120.238119


Results for XGBoost


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.328371,0.046382,0.175827,0.519452,0.097978,0.772463
Random Oversampling,0.159003,0.470116,0.177085,0.633507,0.170771,1.438128
Random Undersampling,0.134509,0.65798,0.178079,0.662028,0.177323,0.468032
SMOTE,0.290141,0.155476,0.181243,0.55135,0.12338,2.260463
ADASYN,0.313815,0.088179,0.18275,0.534858,0.12119,2.854098
Tomek,0.344509,0.066014,0.183485,0.527635,0.117774,7.389913
SMOTETomek,0.306145,0.090242,0.177642,0.534982,0.115405,22.980539
