In [16]:
# Import necessary libraries

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

In [17]:
df = pd.read_csv('Dataset_with_Synthetic_Personal_Loan.csv')


In [18]:
# DROP UNUSED COLUMNS
# -----------------------
cols_to_drop = ['Loan_ID', 'Loan_Type', 'Property_Area', 'LoanAmount', 'Loan_Amount_Term', 
                'CoapplicantIncome', 'ApplicantIncome', 'Total_Income']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

In [19]:
# Numerical columns
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mean())
df['Credit_Card_Debt'] = df['Credit_Card_Debt'].fillna(df['Credit_Card_Debt'].median())
df['Existing_Personal_Loan'] = df['Existing_Personal_Loan'].fillna(0)

In [20]:
# Categorical columns
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Education', 'Employment_Type']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [21]:
# ENCODING CATEGORICAL VARIABLES
# -----------------------
label_cols = ['Gender', 'Married', 'Education', 'Dependents', 'Self_Employed', 'Employment_Type', 'Loan_Status']
le = LabelEncoder()
for col in label_cols:
    if col in df.columns:
        df[col] = le.fit_transform(df[col])

In [22]:
# INDEPENDENT & DEPENDENT VARIABLES
# -----------------------
X = df.drop(columns=['Loan_Status'], axis=1)
y = df['Loan_Status']

In [23]:
# HANDLE IMBALANCED DATA
# -----------------------
oversample = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X, y)


In [9]:
# SPLIT DATA
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

In [24]:
# HYPERPARAMETER TUNING FUNCTION & MODEL STORAGE
# -----------------------
best_models = {}

def tune_and_evaluate(model, param_grid, model_name):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_models[model_name] = best_model  # Store the best model
    
    y_pred = best_model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"\n🔍 Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"✅ Accuracy of {model_name}: {acc * 100:.2f}%")
    print(f"📊 Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}\n")

In [27]:
# PARAMETER GRIDS
# -----------------------

param_grids = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {
            "C": [0.01, 0.1, 1, 10, 100], 
            "solver": ["liblinear", "lbfgs"]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 10, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 5]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [3, 5, 10, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 5]
        }
    },
    "K-Nearest Neighbors": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 7, 9],
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan"]
        }
    }
}

In [28]:
# RUN TUNING & EVALUATION
# -----------------------
for name, cfg in param_grids.items():
    tune_and_evaluate(cfg["model"], cfg["params"], name)



🔍 Best Parameters for Logistic Regression: {'C': 1, 'solver': 'lbfgs'}
✅ Accuracy of Logistic Regression: 69.67%
📊 Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.90      0.49      0.64       114
           1       0.61      0.94      0.74        97

    accuracy                           0.70       211
   macro avg       0.76      0.71      0.69       211
weighted avg       0.77      0.70      0.68       211



🔍 Best Parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
✅ Accuracy of Decision Tree: 84.36%
📊 Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       114
           1       0.86      0.78      0.82        97

    accuracy                           0.84       211
   macro avg       0.85      0.84      0.84       211
weighted avg       0.85      0.84      0.84       211

In [29]:
# SAVE BEST MODEL TO FILE
# -----------------------
best_model_name = max(best_models, key=lambda k: accuracy_score(y_test, best_models[k].predict(X_test)))
best_model = best_models[best_model_name]

model_path = f"best_model_{best_model_name.replace(' ', '_')}.pkl"
joblib.dump(best_model, model_path)
print(f"\n✅ Best model '{best_model_name}' saved as {model_path}.")


✅ Best model 'Decision Tree' saved as best_model_Decision_Tree.pkl.
