# Enhancing Artificial Neural Network by modifying Gradient Descent with Adaptive Moment Estimation (Adam) on Credit Risk Classification

Import all libraries needed

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
import tensorflow.keras as keras
import seaborn as sns
import optuna

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

In [41]:
# Load the credit-risk dataset from a path relative to the notebook's folder.
df = pd.read_csv('../datasets/credit_risk.csv')
# Bare last expression: shown as the cell output, (n_rows, n_columns).
df.shape

(32581, 12)

Read the dataset. It has 32,581 rows (not counting the header row) and 12 columns. Let's take a look at the first 5 and the last 5 rows.

In [42]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [43]:
df.tail()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.1,N,26
32580,66,42000,RENT,2.0,MEDICAL,B,6475,9.99,0,0.15,N,30


## Data Understanding

In [44]:
df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [46]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [47]:
# Remove rows that exactly duplicate an earlier row, modifying df in place.
df.drop_duplicates(inplace=True)

In [48]:
# Drop every row that still contains a missing value, modifying df in place.
# NOTE(review): once this has run no NaNs remain, so the imputation in the
# "Handle Null Values" section further down has nothing left to fill --
# keep either this dropna or the imputation, not both.
df.dropna(inplace=True)

# Sanity check: number of duplicated rows still present. Kept as the LAST
# expression of the cell so the notebook actually displays the result
# (an expression in the middle of a cell is evaluated and silently discarded).
df.duplicated().sum()

Check whether there are any null values left, and what percentage of each column they make up

In [49]:
df.isnull().sum() * 100 / len(df)

person_age                    0.0
person_income                 0.0
person_home_ownership         0.0
person_emp_length             0.0
loan_intent                   0.0
loan_grade                    0.0
loan_amnt                     0.0
loan_int_rate                 0.0
loan_status                   0.0
loan_percent_income           0.0
cb_person_default_on_file     0.0
cb_person_cred_hist_length    0.0
dtype: float64

Handle Null Values

In [50]:
# Replace missing employment lengths with the column mean.
# NOTE(review): the earlier df.dropna(inplace=True) already removed every row
# containing a NaN (the null-percentage output above is 0.0 for all columns),
# so this appears to be a no-op as the notebook stands -- confirm which
# missing-value strategy is intended.
df['person_emp_length'] = df['person_emp_length'].fillna(df['person_emp_length'].mean())

In [51]:
# Mean interest rate per loan grade (a Series indexed by loan_grade); used as
# the fill value for rows whose loan_int_rate is missing.
mean_loan_int_rate_by_grade = df.groupby('loan_grade')['loan_int_rate'].mean()

In [52]:
def fill_loan_int_rate(row):
    """Return the interest rate to store for one row.

    A row that already has a ``loan_int_rate`` keeps it unchanged; a row whose
    rate is null gets the mean rate of its ``loan_grade``, looked up in the
    module-level ``mean_loan_int_rate_by_grade``.
    """
    current_rate = row['loan_int_rate']
    # Guard clause: nothing to impute when the rate is present.
    if not pd.isnull(current_rate):
        return current_rate
    return mean_loan_int_rate_by_grade[row['loan_grade']]

In [53]:
# Fill missing interest rates with the mean rate of the row's loan grade.
# Vectorised: each grade is mapped to its precomputed mean and that value is
# used only where loan_int_rate is NaN. This yields the same values as calling
# a Python function once per row via df.apply(..., axis=1), without the
# per-row overhead.
df['loan_int_rate'] = df['loan_int_rate'].fillna(
    df['loan_grade'].map(mean_loan_int_rate_by_grade)
)

### Understanding each column

In [54]:
df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

#### Person Home Ownership

The ownership of someone home (kepemilikan rumah)

In [55]:
df['person_home_ownership'].value_counts()

RENT        14498
MORTGAGE    11736
OWN          2174
OTHER          93
Name: person_home_ownership, dtype: int64

Nothing weird, nothing strange here

#### Loan Intent

Loan intent: the purpose of the loan (alasan ambil pinjaman)

In [56]:
df['loan_intent'].value_counts()

EDUCATION            5670
MEDICAL              5269
VENTURE              4969
PERSONAL             4859
DEBTCONSOLIDATION    4547
HOMEIMPROVEMENT      3187
Name: loan_intent, dtype: int64

There is nothing anomalous here either

#### Loan Grade

Loan grade (tingkat pinjaman)

In [57]:
df['loan_grade'].value_counts()

A    9345
B    9094
C    5682
D    3243
E     869
F     209
G      59
Name: loan_grade, dtype: int64

Most loans fall into grades A–C. Judging by the sample rows shown earlier, the interest rate rises from grade A to grade D, so A appears to be the lowest-risk grade and G the highest-risk one.

#### Failed to pay

Whether the person has a previous default (failure to pay) on file (gagal bayar)

In [58]:
df['cb_person_default_on_file'].value_counts()

N    23411
Y     5090
Name: cb_person_default_on_file, dtype: int64

### Binary Value Label Encoder

In [59]:
# Encode the Y/N default flag as integers (LabelEncoder sorts the classes, so N -> 0 and Y -> 1).
df['cb_person_default_on_file'] = LabelEncoder().fit_transform(df['cb_person_default_on_file'])

In [60]:
# One-hot encode the categorical columns: each listed column is replaced by one
# indicator column per category.
# NOTE(review): cb_person_default_on_file was already label-encoded to 0/1 in
# the previous cell, so listing it here splits it into two complementary
# indicator columns -- this looks redundant; confirm it is intended.
df = pd.get_dummies(df, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'])

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28501 entries, 0 to 32580
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      28501 non-null  int64  
 1   person_income                   28501 non-null  int64  
 2   person_emp_length               28501 non-null  float64
 3   loan_amnt                       28501 non-null  int64  
 4   loan_int_rate                   28501 non-null  float64
 5   loan_status                     28501 non-null  int64  
 6   loan_percent_income             28501 non-null  float64
 7   cb_person_cred_hist_length      28501 non-null  int64  
 8   person_home_ownership_MORTGAGE  28501 non-null  uint8  
 9   person_home_ownership_OTHER     28501 non-null  uint8  
 10  person_home_ownership_OWN       28501 non-null  uint8  
 11  person_home_ownership_RENT      28501 non-null  uint8  
 12  loan_intent_DEBTCONSOLIDATION   

### Split Dataset

In [62]:
# Target: the loan_status column. Features: every other column.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

In [63]:
# Oversample the data with SMOTE (seeded for reproducibility) to counter the
# class imbalance in loan_status.
# NOTE(review): SMOTE is never imported in the import cell, so this cell fails
# with the NameError recorded below. It presumably needs
# `from imblearn.over_sampling import SMOTE` (imbalanced-learn) -- confirm.
# NOTE(review): resampling runs before the train/test split, so synthetic
# samples can be derived from rows that later land in the test set.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

NameError: name 'SMOTE' is not defined

### Normalization

In [64]:
# Standardise every feature with StandardScaler and wrap the result back into a
# DataFrame that keeps the original column names.
# NOTE(review): the scaler is fit on all rows before the train/test split in a
# later cell, so test-set statistics influence the scaling -- consider fitting
# on the training split only.
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X_resampled), columns = X.columns)
# Preview the first scaled rows as the cell output.
X_scaled.head()

NameError: name 'X_resampled' is not defined

In [65]:
# One-hot encode the target: LabelEncoder maps the labels to consecutive
# integers and to_categorical expands them to one column per class.
y_scaled = to_categorical(LabelEncoder().fit_transform(y_resampled))
# Display the encoded target array as the cell output.
y_scaled

NameError: name 'y_resampled' is not defined

In [66]:
# Step 1: hold out 30% of the rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.3,
    random_state=42,
)

# Step 2: carve 20% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.2,
    random_state=42,
)

In [67]:
X_train.shape, y_train.shape

((15960, 26), (15960, 2))

In [68]:
X_val.shape, y_val.shape

((3990, 26), (3990, 2))

In [69]:
X_test.shape, y_test.shape

((8551, 26), (8551, 2))

## Create ANN Model

This one would be so satisfying

In [70]:
# Early-stopping callback that watches val_loss with a patience of 10 epochs.
# NOTE(review): early_stop is not passed to any fit() call visible in this
# notebook, so it currently has no effect.
early_stop = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = 10)
# Layer stack: input -> dropout -> 256 ReLU -> dropout -> 128 ReLU -> 2 sigmoid outputs.
# NOTE(review): this list is not used by the model built in the next cell,
# which declares its own layers.
layers = [
    Input((X_train.shape[1], )),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(2, activation='sigmoid'),
]
# NOTE(review): the fit() call further down hard-codes epochs=20 and does not
# read this value.
epochs = 100

### Adam (Adaptive Moment Estimation)

In [71]:
# Feed-forward classifier for the Adam experiment: three ReLU hidden layers
# (43, 29 and 24 units), each followed by dropout (0.2, 0.3, 0.4), and a
# 2-unit output layer. The input width is taken from the training features.
# NOTE(review): the output layer uses two independent sigmoid units, so the two
# class scores need not sum to 1; softmax is the more usual choice for one-hot
# targets -- confirm this is intentional.
model_adam = tf.keras.Sequential([
    Input((X_train.shape[1], )),
    Dense(43, activation='relu'),
    Dropout(0.2),
    Dense(29, activation='relu'),
    Dropout(0.3),
    Dense(24, activation='relu'),
    Dropout(0.4),
    Dense(2, activation='sigmoid'),
])

In [None]:
# Compile with the Adam optimizer (learning rate 0.002) and binary
# cross-entropy loss, tracking accuracy as the metric.
model_adam.compile(optimizer=keras.optimizers.Adam(learning_rate=0.002), loss='binary_crossentropy', metrics=['accuracy'])
print("Adam (Adaptpive Moment Estimation) Gradient Descent")
# Train for 20 epochs with batch size 32, evaluating on the validation split;
# the returned History object (hist_adam) is read by the plotting cells below.
# NOTE(review): no callbacks are passed here, so the early_stop callback
# defined earlier is not applied.
hist_adam = model_adam.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=20)

Adam (Adaptpive Moment Estimation) Gradient Descent
Epoch 1/20


In [None]:
# Per-epoch loss curves recorded during the Adam training run.
loss_adam = hist_adam.history['loss']
val_loss_adam = hist_adam.history['val_loss']

# Draw both curves on one explicit Axes object.
fig, ax = plt.subplots()
ax.plot(loss_adam, label='Adam - Training Loss')
ax.plot(val_loss_adam, label='Adam - Validation Loss')

# Axis labels, title and legend.
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Adam Training and Validation Loss')
ax.legend()
plt.show()

## Accuracies

In [None]:
def calculate_average_accuracy(history, name):
    """Average the per-epoch accuracies stored in a training history.

    Args:
        history: object whose ``history`` attribute is a dict containing the
            ``'accuracy'`` and ``'val_accuracy'`` per-epoch sequences.
        name: label of the model; accepted for call-site symmetry, not used
            in the computation.

    Returns:
        Tuple ``(mean training accuracy, mean validation accuracy)``.
    """
    per_epoch = history.history
    return np.mean(per_epoch['accuracy']), np.mean(per_epoch['val_accuracy'])

# (kernel variable name, display label) for every optimizer run this
# comparison knows how to show.
history_candidates = [
    ('hist_sgd', 'SGD'),
    ('hist_minibatch', 'Mini-batch'),
    ('hist_moment', 'Momentum'),
    ('hist_rmsprop', 'RMSprop'),
    ('hist_adam', 'Adam'),
]

# Keep only the runs that actually exist in this kernel. The visible notebook
# trains just the Adam model, so referencing the other histories directly
# would raise NameError on a fresh Restart & Run All.
models = [
    (globals()[var_name], label)
    for var_name, label in history_candidates
    if var_name in globals()
]

avg_accuracies = []
avg_val_accuracies = []
model_names = []

# Collect and report the epoch-averaged accuracies of each available run.
for history, name in models:
    avg_accuracy, avg_val_accuracy = calculate_average_accuracy(history, name)
    avg_accuracies.append(avg_accuracy)
    avg_val_accuracies.append(avg_val_accuracy)
    model_names.append(name)
    print(f'Average Training Accuracy for {name}: {avg_accuracy:.4f}')
    print(f'Average Validation Accuracy for {name}: {avg_val_accuracy:.4f}\n')

# Plot average accuracies as grouped bars: training on the left, validation on
# the right of each model's tick.
x = np.arange(len(model_names))

plt.figure(figsize=(12, 8))
plt.bar(x - 0.2, avg_accuracies, 0.4, label='Training Accuracy')
plt.bar(x + 0.2, avg_val_accuracies, 0.4, label='Validation Accuracy')

plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Average Training and Validation Accuracy for Different Models')
plt.xticks(x, model_names)
plt.legend()
plt.show()

In [None]:
# Score above which a class output counts as predicted.
threshold = 0.5

# Evaluate the Adam model on the held-out test set. model_adam is the only
# model trained in this notebook; to compare other optimizers (SGD, mini-batch,
# momentum, RMSprop), train them first and repeat these steps with their models.
y_pred_adam = model_adam.predict(X_test)

# Convert the per-class scores to 0/1 indicators.
y_pred_adam_binary = (y_pred_adam > threshold).astype(int)

# One 2x2 confusion matrix per output column of the one-hot encoded target.
cm_adam_multilabel = multilabel_confusion_matrix(y_test, y_pred_adam_binary)

print("Multilabel Confusion Matrix for Adam Model:")
# Index 0 selects the matrix of the first output column.
print(cm_adam_multilabel[0])


def print_classification_report(y_true, y_pred, model_name):
    """Print scikit-learn's classification report under a heading naming the model.

    Args:
        y_true: ground-truth labels, passed straight to ``classification_report``.
        y_pred: predicted labels in the same format as ``y_true``.
        model_name: text inserted into the heading line.
    """
    report = classification_report(y_true, y_pred)
    print(f"Classification Report for {model_name}:")
    print(report)
    print("\n")


# Print the classification report for the trained model.
print_classification_report(y_test, y_pred_adam_binary, "Adam Model")