In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/preprocessed.csv')
df.head()

Unnamed: 0,SK_ID_CURR,Tenure_Months,Avg_Monthly_Balance,Avg_Monthly_Utilization,Max_Monthly_Utilization,Pct_Txns_Fully_Paid,Months_Fully_Paid,Pct_Txns_Minimum_Paid,Months_Minimum_Paid,Payment_MinPayment_Ratio_mean,...,AMT_LESS_MAX,DAYS_LATE_AVG,DAYS_LATE_MAX,SK_DPD_DEF,LESS_GRANTED,REJECTED_APPLICATIONS,AMT_CREDIT_SUM_TOTAL,AMT_CREDIT_SUM_DEBT_TOTAL,AMT_CREDIT_SUM_LIMIT_TOTAL,AMT_CREDIT_SUM_OVERDUE_TOTAL
0,100011,74,54482.111149,0.302678,1.05,0.554054,41,0.418919,31,0.50223,...,30833.685,4.807692,32.0,26.0,1,1,435228.3,0.0,0.0,0.0
1,100013,96,18159.919219,0.115301,1.02489,0.770833,74,0.0,0,0.865673,...,23147.82,5.722581,38.0,0.0,1,0,2072280.06,0.0,0.0,0.0
2,100023,8,0.0,0.0,0.0,1.0,8,0.0,0,0.0,...,0.0,15.653846,43.0,0.0,1,0,1645692.345,137038.5,0.0,0.0
3,100028,49,8085.058163,0.035934,0.165937,0.244898,12,0.142857,7,0.0,...,8505.0,3.265487,19.0,0.0,1,0,1520875.08,186304.5,101390.76,0.0
4,100036,12,0.0,0.0,0.0,1.0,12,0.0,0,0.0,...,0.0,18.5,67.0,0.0,0,3,94959.0,8339.355,0.0,0.0


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline


categoricals = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
numericals = [
    'Avg_Monthly_Balance','Months_Fully_Paid', 
    'Months_Minimum_Paid', 'Months_Minimum_Paid', 
    'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'OWN_CAR_AGE',
    'Age_Years','Employed_Months', 'AMT_LESS_TOTAL',
    'AMT_LESS_MAX', 'DAYS_LATE_MAX', 'SK_DPD_DEF',
    'AMT_CREDIT_SUM_TOTAL', 'AMT_CREDIT_SUM_DEBT_TOTAL',
    'AMT_CREDIT_SUM_LIMIT_TOTAL', 'AMT_CREDIT_SUM_OVERDUE_TOTAL',
    'Tenure_Months'
]

preproc = ColumnTransformer([
    ('cat', OneHotEncoder(), categoricals),
    ('num', MinMaxScaler(), numericals)
])

In [5]:
X = df.drop(columns='W_DEFAULT')
y = df['W_DEFAULT']

X = preproc.fit_transform(X)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

lr = LogisticRegression(max_iter=100_000)
svc = SVC(probability=True)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gbm = GradientBoostingClassifier()

models = [
    lr, svc, dt, rf, gbm
]
model_names = [
    'Logistic Regression',
    'SVC', 
    'Decision Tree',
    'Random Forest', 
    'Gradient Boosting'
]

In [16]:
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, matthews_corrcoef

for model_name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    print(f'{model_name} results:')

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f'F1 Score: {f1_score(y_test, y_pred)}')
    print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_proba[:,1])}')
    print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')


Logistic Regression results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.6529267338671503
Confusion Matrix:
[[20690     0]
 [ 1552     0]]
SVC results:
F1 Score: 0.0
MCC: 0.0
ROC AUC: 0.5052049647969785
Confusion Matrix:
[[20690     0]
 [ 1552     0]]
Decision Tree results:
F1 Score: 0.11488862837045721
MCC: 0.04220965972032848
ROC AUC: 0.5229316667746259
Confusion Matrix:
[[19026  1664]
 [ 1356   196]]
Random Forest results:
F1 Score: 0.0038585209003215437
MCC: 0.042406984042292394
ROC AUC: 0.6609995428340798
Confusion Matrix:
[[20690     0]
 [ 1549     3]]
Gradient Boosting results:
F1 Score: 0.002570694087403599
MCC: 0.022647620731586625
ROC AUC: 0.6958391205722172
Confusion Matrix:
[[20688     2]
 [ 1550     2]]


In [None]:
results = {}

for model_name, model in zip(model_names, models):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    results[model_name] = {
        'F!': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_proba[:,1]),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    print(f'Confusion Matrix for {model_name}')
    cm = confusion_matrix(y_test, y_pred)
    display(pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive']))

Confusion Matrix for Logistic Regression


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20690,0
Actual Positive,1552,0


Confusion Matrix for SVC


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20690,0
Actual Positive,1552,0


Confusion Matrix for Decision Tree


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,19026,1664
Actual Positive,1356,196


Confusion Matrix for Random Forest


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20690,0
Actual Positive,1549,3


Confusion Matrix for Gradient Boosting


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,20688,2
Actual Positive,1550,2


In [27]:
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,f1,auc roc,mcc
Logistic Regression,0.0,0.652927,0.0
SVC,0.0,0.505205,0.0
Decision Tree,0.114889,0.522932,0.04221
Random Forest,0.003859,0.661,0.042407
Gradient Boosting,0.002571,0.695839,0.022648
