In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


In [None]:
# Locations of the input data, the saved model and the generated artifacts,
# all relative to the notebook's working directory.
DATA_PATH = "../data/loan_data.csv"
MODEL_PATH = "../models/loan_default_model.keras"
OUTPUT_DIR = "../outputs"

# Create every directory the notebook writes into up front, so that a fresh
# checkout does not fail at save time after the (slow) training has finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)


In [None]:
# Read the loan records from DATA_PATH and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)


In [None]:
# Report how many values are missing in each column of the raw frame.
nulls = df.isnull().sum()
print("Null Values:\n", nulls)

# TARGET is the label. It is numeric, so without this step the median
# imputation below would invent labels for unlabeled rows. Drop those rows
# instead; after this TARGET has no nulls and passes through the remaining
# steps unchanged.
df = df.dropna(subset=['TARGET'])

# Keep only the columns whose fraction of missing values is below the threshold.
threshold = 0.5
df = df.loc[:, df.isnull().mean() < threshold]

# Numeric columns: fill the gaps with the column median.
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Object (string) columns: fill the gaps with the most frequent value.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [None]:
# Share of each TARGET value, expressed as a percentage of all rows.
target_shares = df['TARGET'].value_counts(normalize=True)
default_pct = target_shares.mul(100)
print("Default Distribution:\n", default_pct)


In [None]:
X = df.drop(columns=['TARGET'])
y = df['TARGET']

X = pd.get_dummies(X)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X_scaled, y)


In [None]:
sns.countplot(x=y_bal)
plt.title('Balanced Class Distribution')
plt.savefig(f"{OUTPUT_DIR}/class_distribution.png")
plt.show()


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)


In [None]:
# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(42)

# Two hidden ReLU layers with dropout, ending in a single sigmoid unit that
# outputs one probability per row. The input width is taken from the training
# matrix; an explicit Input layer replaces the deprecated `input_shape=`
# argument on the first Dense layer.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output; 20% of the training
# rows are held back by Keras for per-epoch validation.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)


In [None]:
# Persist the trained network to MODEL_PATH.
model.save(MODEL_PATH)

# Write the architecture summary to a text file, appending a newline to every
# chunk of text that Keras hands to the print callback.
summary_path = f"{OUTPUT_DIR}/model_summary.txt"
with open(summary_path, "w", encoding='utf-8') as summary_file:

    def _write_summary_line(text):
        summary_file.write(text + "\n")

    model.summary(print_fn=_write_summary_line)


In [None]:
# Predicted probabilities for the held-out rows, turned into hard 0/1 labels
# with a 0.5 cut-off.
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Pin the label order so the matrix is always 2x2 with index 1 = class 1,
# which is what the later cm[1, 1] / cm[1, 0] lookups rely on.
# Assumes TARGET is encoded as 0/1 (required by the sigmoid output anyway).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Rows of a scikit-learn confusion matrix are the true labels and columns the
# predicted ones; label the axes so the saved figure can be read on its own.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.title("Confusion Matrix")
plt.savefig(f"{OUTPUT_DIR}/confusion_matrix.png")
plt.show()


In [None]:
# Sensitivity: share of actual class-1 rows (matrix row 1) predicted as class 1.
TP, FN = cm[1, 1], cm[1, 0]
actual_positives = TP + FN
sensitivity = TP / actual_positives

# Threshold-free ranking quality, computed from the predicted probabilities.
auc = roc_auc_score(y_test, y_pred_probs)
roc_points = roc_curve(y_test, y_pred_probs)
fpr, tpr, _ = roc_points


In [None]:
# Draw the ROC curve on the current axes, with the diagonal drawn as a dashed
# reference line, then save and display the figure.
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
roc_ax.plot([0, 1], [0, 1], linestyle="--")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.savefig(f"{OUTPUT_DIR}/roc_curve.png")
plt.show()


In [None]:
# Persist the headline metrics as plain text. The encoding is pinned to UTF-8,
# as for the model summary file, so the output does not depend on the
# platform's default locale.
with open(f"{OUTPUT_DIR}/metrics.txt", "w", encoding="utf-8") as f:
    f.write(f"Sensitivity: {sensitivity:.4f}\n")
    f.write(f"AUC: {auc:.4f}\n")

print("Model training and evaluation complete. Outputs saved.")
