## Modelling

In [1]:
import pickle

with open('credit_default_datasets.pkl', 'rb') as file:
    loaded_datasets = pickle.load(file)

credit_default = loaded_datasets['credit_default']
x_train = loaded_datasets['x_train']
x_test = loaded_datasets['x_test']
y_train = loaded_datasets['y_train']
y_test = loaded_datasets['y_test']

In [2]:
x_train['MARRIAGE_0'] = x_train['MARRIAGE_0'].astype(int)
x_train['MARRIAGE_1'] = x_train['MARRIAGE_1'].astype(int)
x_train['MARRIAGE_2'] = x_train['MARRIAGE_2'].astype(int)
x_train['MARRIAGE_3'] = x_train['MARRIAGE_3'].astype(int)
x_train['SEX_1'] = x_train['SEX_1'].astype(int)
x_train['SEX_2'] = x_train['SEX_2'].astype(int)

x_test['MARRIAGE_0'] = x_test['MARRIAGE_0'].astype(int)
x_test['MARRIAGE_1'] = x_test['MARRIAGE_1'].astype(int)
x_test['MARRIAGE_2'] = x_test['MARRIAGE_2'].astype(int)
x_test['MARRIAGE_3'] = x_test['MARRIAGE_3'].astype(int)
x_test['SEX_1'] = x_test['SEX_1'].astype(int)
x_test['SEX_2'] = x_test['SEX_2'].astype(int)

### Unscaled Data

#### Random Forest


In [3]:
from sklearn.ensemble import RandomForestClassifier
rf_unscaled = RandomForestClassifier(n_estimators = 1000, random_state = 1234)

rf_unscaled.fit(x_train, y_train);
with open('random_forest_unscaled.pkl', 'wb') as file:
    pickle.dump(rf_unscaled, file)

In [4]:
rf_pred_unscaled = rf_unscaled.predict(x_test)
with open('rf_pred_unscaled.pkl', 'wb') as file:
    pickle.dump(rf_pred_unscaled, file)

In [5]:
from sklearn.metrics import confusion_matrix
eval_rf_unscaled = confusion_matrix(rf_pred_unscaled, y_test)

In [57]:
x_train.dtypes

LIMIT_BAL     int64
EDUCATION     int64
AGE           int64
PAY_1         int64
PAY_2         int64
PAY_3         int64
PAY_4         int64
PAY_5         int64
PAY_6         int64
BILL_AMT1     int64
BILL_AMT2     int64
BILL_AMT3     int64
BILL_AMT4     int64
BILL_AMT5     int64
BILL_AMT6     int64
PAY_AMT1      int64
PAY_AMT2      int64
PAY_AMT3      int64
PAY_AMT4      int64
PAY_AMT5      int64
PAY_AMT6      int64
MARRIAGE_0    int64
MARRIAGE_1    int64
MARRIAGE_2    int64
MARRIAGE_3    int64
SEX_1         int64
SEX_2         int64
dtype: object

In [55]:
eval_rf_unscaled

{'accuracy': 0.7706350444687159,
 'recall': 0.4717741935483871,
 'precision': 0.6892488954344624,
 'specificity': 0.9046327683615819}

#### XGBoost

In [6]:
from xgboost import XGBClassifier

xgb_unscaled = XGBClassifier()
xgb_unscaled.fit(x_train, y_train)
with open('xgb_unscaled.pkl', 'wb') as file:
    pickle.dump(xgb_unscaled, file)

In [7]:
xgb_pred_unscaled = xgb_unscaled.predict(x_test)
with open('xgb_pred_unscaled.pkl', 'wb') as file:
    pickle.dump(xgb_pred_unscaled, file)

In [8]:
from sklearn.metrics import confusion_matrix
eval_xgb_unscaled = confusion_matrix(xgb_pred_unscaled, y_test)

#### Neutral Network

In [9]:
import tensorflow as tf

nn_unscaled = tf.keras.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(x_train.shape[1], ))
])

nn_unscaled.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_unscaled.fit(x_train, y_train, epochs=10, batch_size=512, validation_data=(x_test, y_test))
with open('nn_unscaled.pkl', 'wb') as file:
    pickle.dump(nn_unscaled, file)

2024-05-31 15:42:50.684136: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
nn_pred_unscaled = nn_unscaled.predict(x_test)
with open('nn_pred_unscaled.pkl', 'wb') as file:
    pickle.dump(nn_pred_unscaled, file)



In [11]:
from sklearn.metrics import confusion_matrix

threshold = 0.5
nn_pred_binary = (nn_pred_unscaled >= threshold).astype(int)

eval_nn_unscaled = confusion_matrix(nn_pred_binary, y_test)

### Scaled Data

In [36]:
with open('credit_default_datasets.pkl', 'rb') as file:
    loaded_datasets = pickle.load(file)

credit_default = loaded_datasets['credit_default']
x_train = loaded_datasets['x_train']
x_test = loaded_datasets['x_test']
y_train = loaded_datasets['y_train']
y_test = loaded_datasets['y_test']

In [39]:
x_train['MARRIAGE_0'] = x_train['MARRIAGE_0'].astype(int)
x_train['MARRIAGE_1'] = x_train['MARRIAGE_1'].astype(int)
x_train['MARRIAGE_2'] = x_train['MARRIAGE_2'].astype(int)
x_train['MARRIAGE_3'] = x_train['MARRIAGE_3'].astype(int)
x_train['SEX_1'] = x_train['SEX_1'].astype(int)
x_train['SEX_2'] = x_train['SEX_2'].astype(int)

x_test['MARRIAGE_0'] = x_test['MARRIAGE_0'].astype(int)
x_test['MARRIAGE_1'] = x_test['MARRIAGE_1'].astype(int)
x_test['MARRIAGE_2'] = x_test['MARRIAGE_2'].astype(int)
x_test['MARRIAGE_3'] = x_test['MARRIAGE_3'].astype(int)
x_test['SEX_1'] = x_test['SEX_1'].astype(int)
x_test['SEX_2'] = x_test['SEX_2'].astype(int)

In [40]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [41]:
x_train_scaled

array([[0.17721519, 0.66666667, 0.24137931, ..., 0.        , 1.        ,
        0.        ],
       [0.05063291, 0.66666667, 0.29310345, ..., 0.        , 0.        ,
        1.        ],
       [0.05063291, 0.66666667, 0.27586207, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.60759494, 1.        , 0.29310345, ..., 0.        , 0.        ,
        1.        ],
       [0.01265823, 0.33333333, 0.03448276, ..., 0.        , 0.        ,
        1.        ],
       [0.05063291, 0.33333333, 0.62068966, ..., 0.        , 1.        ,
        0.        ]])

In [42]:
x_test_scaled

array([[0.37662338, 0.66666667, 0.35185185, ..., 0.        , 0.        ,
        1.        ],
       [0.16883117, 0.66666667, 0.11111111, ..., 0.        , 0.        ,
        1.        ],
       [0.11688312, 0.33333333, 0.53703704, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.01298701, 0.66666667, 0.38888889, ..., 0.        , 0.        ,
        1.        ],
       [0.42857143, 0.66666667, 0.16666667, ..., 0.        , 1.        ,
        0.        ],
       [0.2987013 , 0.66666667, 0.24074074, ..., 0.        , 0.        ,
        1.        ]])

#### Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier
rf_scaled = RandomForestClassifier(n_estimators = 1000, random_state = 1234)

rf_scaled.fit(x_train_scaled, y_train);
with open('random_forest_scaled.pkl', 'wb') as file:
    pickle.dump(rf_scaled, file)

In [44]:
rf_pred_scaled = rf_scaled.predict(x_test_scaled)
with open('rf_pred_scaled.pkl', 'wb') as file:
    pickle.dump(rf_pred_scaled, file)

In [45]:
from sklearn.metrics import confusion_matrix
eval_rf_scaled = confusion_matrix(rf_pred_scaled, y_test)

#### XGBoost

In [46]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
}

xgb_scaled = XGBClassifier()
grid_search = GridSearchCV(xgb_scaled, param_grid, cv=5, scoring='recall')

xgb_scaled.fit(x_train_scaled, y_train)
with open('xgb_scaled.pkl', 'wb') as file:
    pickle.dump(xgb_scaled, file)

In [47]:
xgb_pred_scaled = xgb_scaled.predict(x_test_scaled)
with open('xgb_pred_scaled.pkl', 'wb') as file:
    pickle.dump(xgb_pred_scaled, file)

In [48]:
from sklearn.metrics import confusion_matrix
eval_xgb_scaled = confusion_matrix(xgb_pred_scaled, y_test)

In [49]:
eval_xgb_scaled

array([[4241, 1684],
       [ 184,  300]])

#### Neural Network

In [58]:
import tensorflow as tf

nn_scaled = tf.keras.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(x_train_scaled.shape[1], ))
   
])

nn_scaled.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_scaled.fit(x_train_scaled, y_train, epochs=100, batch_size=512, validation_data=(x_test_scaled, y_test))
with open('nn_scaled.pkl', 'wb') as file:
    pickle.dump(nn_scaled, file)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [61]:
nn_pred_scaled = nn_scaled.predict(x_test_scaled)
with open('nn_pred_scaled.pkl', 'wb') as file:
    pickle.dump(nn_pred_scaled, file)



In [62]:
from sklearn.metrics import confusion_matrix

threshold = 0.5
nn_pred_binary = (nn_pred_scaled >= threshold).astype(int)

eval_nn_scaled = confusion_matrix(nn_pred_binary, y_test)

In [53]:
eval_rf_unscaled

array([[4003, 1048],
       [ 422,  936]])

In [26]:
eval_rf_scaled

array([[4003, 1047],
       [ 422,  937]])

In [27]:
eval_xgb_unscaled

array([[3937, 1045],
       [ 488,  939]])

In [28]:
eval_xgb_scaled

array([[3937, 1045],
       [ 488,  939]])

In [31]:
eval_nn_unscaled

array([[3311, 1331],
       [1114,  653]])

In [30]:
eval_nn_scaled

array([[4425, 1984],
       [   0,    0]])

## Model Evaluation

In [63]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

def threshold_predictions(predictions, threshold=0.5):
    return (predictions >= threshold).astype(int)

def evaluate_model(y_true, y_pred, positive_class):
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, pos_label=positive_class)
    precision = precision_score(y_true, y_pred, pos_label=positive_class)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    return {
        "accuracy": accuracy,
        "recall": recall,
        "precision": precision,
        "specificity": specificity
    }

rf_pred_unscaled_binary = threshold_predictions(rf_pred_unscaled)
xgb_pred_unscaled_binary = threshold_predictions(xgb_pred_unscaled)
nn_pred_unscaled_binary = threshold_predictions(nn_pred_unscaled)

rf_pred_scaled_binary = threshold_predictions(rf_pred_scaled)
xgb_pred_scaled_binary = threshold_predictions(xgb_pred_scaled)
nn_pred_scaled_binary = threshold_predictions(nn_pred_scaled)


eval_rf_unscaled = evaluate_model(y_test, rf_pred_unscaled_binary, positive_class=1)
eval_xgb_unscaled = evaluate_model(y_test, xgb_pred_unscaled_binary, positive_class=1)
eval_nn_unscaled = evaluate_model(y_test, nn_pred_unscaled_binary, positive_class=1)

eval_rf_scaled = evaluate_model(y_test, rf_pred_scaled_binary, positive_class=1)
eval_xgb_scaled = evaluate_model(y_test, xgb_pred_scaled_binary, positive_class=1)
eval_nn_scaled = evaluate_model(y_test, nn_pred_scaled_binary, positive_class=1)


compare_rf_unscaled = pd.DataFrame({
    "Model": ["RandomForest Unscaled"],
    "Accuracy": [round(eval_rf_unscaled["accuracy"] * 100, 2)],
    "Recall": [round(eval_rf_unscaled["recall"] * 100, 2)],
    "Precision": [round(eval_rf_unscaled["precision"] * 100, 2)],
    "Specificity": [round(eval_rf_unscaled["specificity"] * 100, 2)]
})

compare_xgb_unscaled = pd.DataFrame({
    "Model": ["XGBoost Unscaled"],
    "Accuracy": [round(eval_xgb_unscaled["accuracy"] * 100, 2)],
    "Recall": [round(eval_xgb_unscaled["recall"] * 100, 2)],
    "Precision": [round(eval_xgb_unscaled["precision"] * 100, 2)],
    "Specificity": [round(eval_xgb_unscaled["specificity"] * 100, 2)]
})

compare_nn_unscaled = pd.DataFrame({
    "Model": ["NeuralNetwork Unscaled"],
    "Accuracy": [round(eval_nn_unscaled["accuracy"] * 100, 2)],
    "Recall": [round(eval_nn_unscaled["recall"] * 100, 2)],
    "Precision": [round(eval_nn_unscaled["precision"] * 100, 2)],
    "Specificity": [round(eval_nn_unscaled["specificity"] * 100, 2)]
})

compare_rf_scaled = pd.DataFrame({
    "Model": ["RandomForest Scaled"],
    "Accuracy": [round(eval_rf_scaled["accuracy"] * 100, 2)],
    "Recall": [round(eval_rf_scaled["recall"] * 100, 2)],
    "Precision": [round(eval_rf_scaled["precision"] * 100, 2)],
    "Specificity": [round(eval_rf_scaled["specificity"] * 100, 2)]
})

compare_xgb_scaled = pd.DataFrame({
    "Model": ["XGBoost Scaled"],
    "Accuracy": [round(eval_xgb_scaled["accuracy"] * 100, 2)],
    "Recall": [round(eval_xgb_scaled["recall"] * 100, 2)],
    "Precision": [round(eval_xgb_scaled["precision"] * 100, 2)],
    "Specificity": [round(eval_xgb_scaled["specificity"] * 100, 2)]
})

compare_nn_scaled = pd.DataFrame({
    "Model": ["NeuralNetwork Scaled"],
    "Accuracy": [round(eval_nn_scaled["accuracy"] * 100, 2)],
    "Recall": [round(eval_nn_scaled["recall"] * 100, 2)],
    "Precision": [round(eval_nn_scaled["precision"] * 100, 2)],
    "Specificity": [round(eval_nn_scaled["specificity"] * 100, 2)]
})

pd.concat([compare_rf_unscaled, compare_xgb_unscaled, compare_nn_unscaled, compare_rf_scaled, compare_xgb_scaled, compare_nn_scaled], ignore_index=True)


Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity
0,RandomForest Unscaled,77.06,47.18,68.92,90.46
1,XGBoost Unscaled,76.08,47.33,65.8,88.97
2,NeuralNetwork Unscaled,61.85,32.91,36.96,74.82
3,RandomForest Scaled,75.6,35.94,70.87,93.38
4,XGBoost Scaled,70.85,15.12,61.98,95.84
5,NeuralNetwork Scaled,75.78,43.6,66.64,90.21
