In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/preprocessed.csv')
df.head()

Unnamed: 0,SK_ID_CURR,Tenure_Months,Avg_Monthly_Balance,Avg_Monthly_Utilization,Max_Monthly_Utilization,Pct_Txns_Fully_Paid,Months_Fully_Paid,Pct_Txns_Minimum_Paid,Months_Minimum_Paid,Payment_MinPayment_Ratio_mean,...,AMT_LESS_MAX,DAYS_LATE_AVG,DAYS_LATE_MAX,SK_DPD_DEF,LESS_GRANTED,REJECTED_APPLICATIONS,AMT_CREDIT_SUM_TOTAL,AMT_CREDIT_SUM_DEBT_TOTAL,AMT_CREDIT_SUM_LIMIT_TOTAL,AMT_CREDIT_SUM_OVERDUE_TOTAL
0,100011,74,54482.111149,0.302678,1.05,0.554054,41,0.418919,31,0.50223,...,30833.685,4.807692,32.0,26.0,1,1,435228.3,0.0,0.0,0.0
1,100013,96,18159.919219,0.115301,1.02489,0.770833,74,0.0,0,0.865673,...,23147.82,5.722581,38.0,0.0,1,0,2072280.06,0.0,0.0,0.0
2,100023,8,0.0,0.0,0.0,1.0,8,0.0,0,0.0,...,0.0,15.653846,43.0,0.0,1,0,1645692.345,137038.5,0.0,0.0
3,100028,49,8085.058163,0.035934,0.165937,0.244898,12,0.142857,7,0.0,...,8505.0,3.265487,19.0,0.0,1,0,1520875.08,186304.5,101390.76,0.0
4,100036,12,0.0,0.0,0.0,1.0,12,0.0,0,0.0,...,0.0,18.5,67.0,0.0,0,3,94959.0,8339.355,0.0,0.0


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline


categoricals = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
numericals = [
    'Avg_Monthly_Balance','Months_Fully_Paid', 
    'Months_Minimum_Paid', 'Months_Minimum_Paid', 
    'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'OWN_CAR_AGE',
    'Age_Years','Employed_Months', 'AMT_LESS_TOTAL',
    'AMT_LESS_MAX', 'DAYS_LATE_MAX', 'SK_DPD_DEF',
    'AMT_CREDIT_SUM_TOTAL', 'AMT_CREDIT_SUM_DEBT_TOTAL',
    'AMT_CREDIT_SUM_LIMIT_TOTAL', 'AMT_CREDIT_SUM_OVERDUE_TOTAL',
    'Tenure_Months'
]

preproc = ColumnTransformer([
    ('cat', OneHotEncoder(), categoricals),
    ('num', MinMaxScaler(), numericals)
])

In [5]:
X = df.drop(columns='W_DEFAULT')
y = df['W_DEFAULT']

X = preproc.fit_transform(X)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

lr = LogisticRegression(max_iter=100_000)
svc = SVC(probability=True)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()

models = [
    lr, svc, dt, rf, gbm
]
model_names = [
    'Logistic Regression',
    'SVC', 
    'Decision Tree',
    'Random Forest', 
    'Gradient Boosting'
]

In [16]:
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, matthews_corrcoef

for model_name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    print(f'{model_name} results:')

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f'F1 Score: {f1_score(y_test, y_pred)}')
    print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_proba[:,1])}')
    print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')


Logistic Regression results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.6529267338671503
Confusion Matrix:
[[20690     0]
 [ 1552     0]]
SVC results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.5052049647969785
Confusion Matrix:
[[20690     0]
 [ 1552     0]]
Decision Tree results:
F1 Score: 0.11488862837045721
MCC: 0.04220965972032848
ROC AUC: 0.5229316667746259
Confusion Matrix:
[[19026  1664]
 [ 1356   196]]
Random Forest results:
F1 Score: 0.0038585209003215437
MCC: 0.042406984042292394
ROC AUC: 0.6609995428340798
Confusion Matrix:
[[20690     0]
 [ 1549     3]]
Gradient Boosting results:
F1 Score: 0.002570694087403599
MCC: 0.022647620731586625
ROC AUC: 0.6958391205722172
Confusion Matrix:
[[20688     2]
 [ 1550     2]]


In [None]:
results = {}

for model_name, model in zip(model_names, models):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    results[model_name] = {
        'F!': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba[:,1]),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    print(f'Confusion Matrix for {model_name}')
    cm = confusion_matrix(y_test, y_pred)
    display(pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive']))

Confusion Matrix for Logistic Regression


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20690,0
Actual Positive,1552,0


Confusion Matrix for SVC


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20690,0
Actual Positive,1552,0


Confusion Matrix for Decision Tree


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,19026,1664
Actual Positive,1356,196


Confusion Matrix for Random Forest


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20690,0
Actual Positive,1549,3


Confusion Matrix for Gradient Boosting


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20688,2
Actual Positive,1550,2


In [27]:
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,f1,auc roc,mcc
Logistic Regression,0.0,0.652927,0.0
SVC,0.0,0.505205,0.0
Decision Tree,0.114889,0.522932,0.04221
Random Forest,0.003859,0.661,0.042407
Gradient Boosting,0.002571,0.695839,0.022648


In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

models = {
    'kNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=10_000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier() # XGBoost is much faster than sklearn GBM (order of magnitude)
}

In [29]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

seed = None
ros = RandomOverSampler(random_state=seed)
rus = RandomUnderSampler(random_state=seed)
smote = SMOTE(random_state=seed)
adasyn = ADASYN(random_state=seed)
tomek = TomekLinks()
smotetomek = SMOTETomek(random_state=seed)

methods = {
    'Original': 'passthrough',
    'Random Oversampling': ros,
    'Random Undersampling': rus,
    'SMOTE': smote,
    'ADASYN': adasyn,
    'Tomek': tomek,
    'SMOTETomek': smotetomek,
}

In [30]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_curve, auc, make_scorer
from sklearn.metrics import precision_score, recall_score, roc_auc_score, matthews_corrcoef

def auc_prc(y_true, y_pred):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

auc_prc = make_scorer(auc_prc, needs_proba=True)

scoring = {
    'auc_prc': auc_prc, 
    'precision': make_scorer(precision_score), 
    'recall': make_scorer(recall_score),
    'mcc': make_scorer(matthews_corrcoef),
    'auc_roc': make_scorer(roc_auc_score)
}

In [31]:
from imblearn.pipeline import Pipeline

all_results = []
for model_name, model in models.items():
    results_model = {}

    for method, resampler in methods.items():

        pipeline = Pipeline(
            [
                (method, resampler),
                [model_name, model]
            ]
        )

        scores = cross_validate(pipeline, X, y, scoring=scoring)
        scores = {key: np.mean(values) for key, values in scores.items()}

        results_model[method] = {
            'Precision': scores['test_precision'],
            'Recall': scores['test_recall'],
            'AUC PRC': scores['test_auc_prc'],
            'AUC ROC': scores['test_auc_roc'],
            'MCC': scores['test_mcc'],
            'Training Time': scores['fit_time']
        }
    
    results_model = pd.DataFrame.from_dict(results_model, orient='index')
    print(f'Results for {model_name}')
    display(results_model)

    results_model['Model'] = model_name
    all_results.append(results_model.reset_index(names='Sampler'))

all_results = pd.concat(all_results, axis=0).reset_index(drop=True)

Results for kNN


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.134734,0.012029,0.091659,0.503074,0.019846,0.010739
Random Oversampling,0.092425,0.252451,0.128603,0.531497,0.040908,0.073135
Random Undersampling,0.087343,0.55397,0.126581,0.555713,0.057529,0.019994
SMOTE,0.090296,0.357866,0.135103,0.54113,0.046982,0.17727
ADASYN,0.089924,0.363879,0.133294,0.541202,0.046764,0.696657
Tomek,0.127562,0.019152,0.092292,0.504554,0.022802,5.039264
SMOTETomek,0.091225,0.362929,0.137349,0.543281,0.049379,17.113264


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for Logistic Regression


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.0,0.0,0.122327,0.5,0.0,2.06476
Random Oversampling,0.109973,0.614593,0.123077,0.617162,0.123022,7.270437
Random Undersampling,0.108405,0.602246,0.119698,0.611745,0.117464,0.254335
SMOTE,0.108228,0.611112,0.123555,0.613081,0.118531,7.241597
ADASYN,0.108128,0.611428,0.123561,0.612942,0.118356,8.099399
Tomek,0.0,0.0,0.122532,0.5,0.0,6.978225
SMOTETomek,0.108179,0.612853,0.123704,0.613304,0.118699,24.067346


Results for Decision Tree


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.109985,0.12884,0.150346,0.524582,0.045711,1.34385
Random Oversampling,0.104102,0.113326,0.140198,0.519354,0.037251,1.833878
Random Undersampling,0.087788,0.553968,0.336716,0.55697,0.058852,0.147631
SMOTE,0.098441,0.213358,0.183832,0.53197,0.045519,3.010093
ADASYN,0.097471,0.208291,0.180993,0.530338,0.043486,3.560966
Tomek,0.105838,0.126466,0.14717,0.522378,0.041265,6.410207
SMOTETomek,0.099138,0.210192,0.18271,0.532081,0.046087,20.468546


  _warn_prf(average, modifier, msg_start, len(result))


Results for Random Forest


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.62,0.000949,0.139117,0.500457,0.021291,11.144087
Random Oversampling,0.351241,0.006015,0.142244,0.502584,0.037887,16.76115
Random Undersampling,0.118551,0.614911,0.143977,0.632682,0.141315,1.376197
SMOTE,0.144593,0.131687,0.117744,0.536048,0.075301,27.986938
ADASYN,0.148109,0.13596,0.116713,0.538076,0.079237,27.712781
Tomek,0.683333,0.001108,0.141008,0.500536,0.02438,16.169087
SMOTETomek,0.146492,0.133585,0.118289,0.53704,0.077312,44.332102


Results for Gradient Boosting


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.442338,0.002532,0.159369,0.501139,0.02809,23.836764
Random Oversampling,0.125995,0.619183,0.159004,0.645405,0.156712,43.354074
Random Undersampling,0.121748,0.644349,0.154921,0.644503,0.153268,2.655285
SMOTE,0.130343,0.266697,0.113578,0.565252,0.09515,58.397054
ADASYN,0.128301,0.258146,0.112611,0.562041,0.091054,57.729305
Tomek,0.332121,0.002057,0.159127,0.500902,0.021575,26.517166
SMOTETomek,0.131096,0.264795,0.114405,0.565233,0.095638,74.158435


Results for XGBoost


Unnamed: 0,Precision,Recall,AUC PRC,AUC ROC,MCC,Training Time
Original,0.279416,0.009021,0.140398,0.503579,0.038441,0.498935
Random Oversampling,0.134643,0.420705,0.138149,0.606997,0.132281,1.034992
Random Undersampling,0.111731,0.614594,0.131678,0.620563,0.126939,0.285659
SMOTE,0.149987,0.052864,0.11409,0.515022,0.049317,1.660487
ADASYN,0.151499,0.051124,0.113298,0.514576,0.049004,2.023113
Tomek,0.274217,0.010287,0.139458,0.504103,0.040744,5.580905
SMOTETomek,0.157429,0.053656,0.114933,0.515654,0.052581,18.240879
