In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the loan records from disk into a DataFrame
df = pd.read_csv(r'../Dataset/original_dataset.csv')

# Preview: head of the frame, rendered through the notebook's rich display
preview_rows = df.head()
print("First 5 rows of the Orginal dataset:")
display(preview_rows)

# Column overview: names, non-null counts and dtypes
print("\nDataset Info:")
df.info()

# Count of null entries in every column
missing_per_column = df.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)


First 5 rows of the Orginal dataset:


Unnamed: 0,age,gender,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,...,loan_term,installment,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,Male,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,...,36,581.88,B5,7,40833.47,24302.07,1,0,1,1
1,72,Female,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,...,60,573.17,F1,5,27968.01,10803.01,1,0,3,1
2,49,Female,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,...,60,76.32,B4,2,15502.25,4505.44,0,0,0,1
3,35,Female,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,...,36,468.07,A5,7,18157.79,5525.63,4,0,5,1
4,63,Other,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,...,60,395.5,D5,1,17467.56,3593.91,2,0,2,1



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   20000 non-null  int64  
 1   gender                20000 non-null  object 
 2   marital_status        20000 non-null  object 
 3   education_level       20000 non-null  object 
 4   annual_income         20000 non-null  float64
 5   monthly_income        20000 non-null  float64
 6   employment_status     20000 non-null  object 
 7   debt_to_income_ratio  20000 non-null  float64
 8   credit_score          20000 non-null  int64  
 9   loan_amount           20000 non-null  float64
 10  loan_purpose          20000 non-null  object 
 11  interest_rate         20000 non-null  float64
 12  loan_term             20000 non-null  int64  
 13  installment           20000 non-null  float64
 14  grade_subgrade        20000 non-null  object 
 15  num_

In [None]:
# Data Preprocessing
# 1. Label-encode the categorical columns (on a copy of `df`).
# 2. Separate features from the `loan_paid_back` target.
# 3. Split 70% / 15% / 15% into train / validation / test.
# 4. Standardise the numeric columns with statistics learned from the
#    training split only, then apply that transform to validation and test.
categorical_cols = ['gender', 'marital_status', 'education_level',
                    'employment_status', 'loan_purpose', 'grade_subgrade']

# Work on a copy so `df` keeps its raw string values. Re-running this cell
# then refits every encoder on the original labels instead of on the integer
# codes produced by a previous run.
df_encoded = df.copy()

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le   # save encoder

# `X` holds the encoded but unscaled features; only the splits below are scaled.
X = df_encoded.drop('loan_paid_back', axis=1)
y = df_encoded['loan_paid_back']

# Save feature columns for API
feature_columns = X.columns
# The target folder may not exist yet on a fresh checkout; create it first so
# the dump below cannot fail with FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(feature_columns, "models/feature_columns.pkl")
print("\nFeature columns saved!")

# Split before scaling so no validation/test information reaches the scaler.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Independent copies, so the column assignments below never write into a
# slice of `X`.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# NOTE(review): the label-encoded columns are integers as well, so whether they
# are selected here depends on the integer dtype LabelEncoder produced
# (presumably the platform default int) -- confirm which columns are meant
# to be standardised.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])   # learn mean/std on train only
X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)





Feature columns saved!
Training set shape: (14000, 21)
Validation set shape: (3000, 21)
Test set shape: (3000, 21)


In [None]:
# model training and evaluation
# Every candidate is fitted on the training split and scored on the validation split.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Metrics computed from the hard class predictions, in reporting order
label_metrics = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
]

for name, model in models.items():
    model.fit(X_train, y_train)

    # Validation predictions: class labels, plus the second predict_proba
    # column, which is what ROC-AUC is computed from
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]

    # Assemble the report line by line, then emit it in one go
    report = [f"\n=== {name} ==="]
    report += [f"{label}: {metric(y_val, y_pred):.4f}" for label, metric in label_metrics]
    report.append(f"ROC-AUC: {roc_auc_score(y_val, y_proba):.4f}")
    print("\n".join(report))
    print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))



=== Logistic Regression ===
Accuracy: 0.8883
Precision: 0.8930
Recall: 0.9765
F1-Score: 0.9329
ROC-AUC: 0.8624
Confusion Matrix:
 [[ 336  279]
 [  56 2329]]

=== Random Forest ===
Accuracy: 0.9017
Precision: 0.8937
Recall: 0.9945
F1-Score: 0.9415
ROC-AUC: 0.8900
Confusion Matrix:
 [[ 333  282]
 [  13 2372]]

=== Gradient Boosting ===
Accuracy: 0.9000
Precision: 0.8930
Recall: 0.9933
F1-Score: 0.9405
ROC-AUC: 0.9058
Confusion Matrix:
 [[ 331  284]
 [  16 2369]]


In [None]:
# saving Models
# Persists the preprocessing objects and the classifiers held in `models`.
# The estimators are the ones already fitted and evaluated by the training
# cell; they are not rebuilt or refitted here, so what is written to disk is
# exactly what the validation metrics above describe.
models_folder = "models"
if not os.path.exists(models_folder):
    # exist_ok guards against the folder appearing between the check and the create
    os.makedirs(models_folder, exist_ok=True)
    print(f"Folder '{models_folder}' created.")
else:
    print(f"Folder '{models_folder}' already exists.")

# Save preprocessing objects
joblib.dump(scaler, os.path.join(models_folder, "scaler.pkl"))
joblib.dump(label_encoders, os.path.join(models_folder, "label_encoders.pkl"))
print("Scaler and Label Encoders saved.")

# One pickle per model; the file name is the display name lower-cased with
# spaces replaced by underscores, e.g. "Random Forest" -> random_forest_loan_model.pkl
for name, model in models.items():
    filename = os.path.join(models_folder, name.replace(" ", "_").lower() + "_loan_model.pkl")
    joblib.dump(model, filename)
    print(f"{name} saved as '{filename}'")


Folder 'models' already exists.
Scaler and Label Encoders saved.

=== Logistic Regression ===
Accuracy: 0.8883
Precision: 0.8930
Recall: 0.9765
F1-Score: 0.9329
ROC-AUC: 0.8624
Confusion Matrix:
 [[ 336  279]
 [  56 2329]]
Logistic Regression saved as 'models\logistic_regression_loan_model.pkl'

=== Random Forest ===
Accuracy: 0.9017
Precision: 0.8937
Recall: 0.9945
F1-Score: 0.9415
ROC-AUC: 0.8900
Confusion Matrix:
 [[ 333  282]
 [  13 2372]]
Random Forest saved as 'models\random_forest_loan_model.pkl'

=== Gradient Boosting ===
Accuracy: 0.9000
Precision: 0.8930
Recall: 0.9933
F1-Score: 0.9405
ROC-AUC: 0.9058
Confusion Matrix:
 [[ 331  284]
 [  16 2369]]
Gradient Boosting saved as 'models\gradient_boosting_loan_model.pkl'


In [None]:
# Example prediction with the Random Forest model
# Loads the saved model together with the saved scaler and feature-column
# list, so the applicant goes through the same standardisation the training
# data went through before the model sees it.
loaded_model = joblib.load("models/random_forest_loan_model.pkl")
loaded_scaler = joblib.load("models/scaler.pkl")
loaded_feature_columns = joblib.load("models/feature_columns.pkl")

# Values are positional and must follow the order of the saved feature
# columns; categorical fields are given as their label-encoded integers.
# NOTE(review): with the column order shown by df.head()/df.info(), the 16th
# value (70000) lands on num_of_open_accounts and the trailing 1 on
# num_of_delinquencies -- a value looks missing or misplaced; confirm.
new_applicant = np.array([[90, 1, 0, 2, 25000, 2083.33, 0, 0.1, 700, 15000,
                           0, 0.05, 36, 500, 3, 70000, 10000, 0, 0, 0, 1]])

# Name the columns so they line up with what the model was fitted on
applicant_features = pd.DataFrame(new_applicant, columns=loaded_feature_columns)

# Standardise exactly the columns the scaler was fitted on, in its fit order
scaled_cols = list(loaded_scaler.feature_names_in_)
applicant_features[scaled_cols] = loaded_scaler.transform(applicant_features[scaled_cols])

prediction_proba = loaded_model.predict_proba(applicant_features)[:, 1]

threshold = 0.95  # only approve if model is ≥95% confident
status = "Paying Back" if prediction_proba[0] >= threshold else "Not Paying Back"

print("Probability of paying back:", prediction_proba[0])
print("Predicted loan status :", status)


Probability of paying back: 0.59
Predicted loan status : Not Paying Back
