 Setup and Import Dependencies

In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

Load the Data

In [27]:
df = pd.read_csv('Dataset.csv')
print(df.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

Data Cleaning & Imputation

In [28]:
# Drop Loan_ID (not useful)
df.drop(columns='Loan_ID', inplace=True)

In [29]:
# Separate features by type
categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [30]:
print(categorical_cols)

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']


In [31]:
print(numerical_cols)

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [32]:
# Target
target = 'Loan_Status'

In [33]:
# Split X and y
X = df.drop(target, axis=1)
y = df[target].map({'Y': 1, 'N': 0})

In [43]:
# Separate features by type
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

Pipeline with Imputation, Encoding, and Scaling

In [44]:
# Preprocessing for categorical data
# SimpleImputer(strategy='most_frequent'):This fills in missing values in categorical columns using the most common category.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
 #Converts categorical values like yes or no into binary columns(0 and 1),handle_unknown='ignore' prevents errors when the test set contains unseen categories   
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])


In [45]:

# Preprocessing for numerical data
# SimpleImputer(strategy='median'):fills missing values in numerical columns using the median of each column

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
 # StandardScaler(): Standardizes the numerical features by removing the mean and scalling,to unit variance
    ('scaler', StandardScaler())
])

In [46]:
# Full preprocessor
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])

In [47]:
y = df['Loan_Status'].map({'Y':1,'N':0}).astype(int)
x = df.drop(columns='Loan_Status')

Train-Test Split

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=42)

In [49]:
print("x_train columns:",X_train.columns)
print("y_train sample:", y_train.head())

x_train columns: Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')
y_train sample: 473    1
462    1
464    0
478    1
84     1
Name: Loan_Status, dtype: int64


Model + Pipeline (Random Forest)

In [50]:
model_rf = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.7580645161290323
              precision    recall  f1-score   support

           0       0.64      0.47      0.55        19
           1       0.79      0.88      0.84        43

    accuracy                           0.76        62
   macro avg       0.72      0.68      0.69        62
weighted avg       0.75      0.76      0.75        62



Hyperparameter Tuning (Random Forest)

In [67]:
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(model_rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_preds = best_model.predict(X_test)
# Results
print("💡 Best rf Accuracy :", grid_search.best_score_)
print("✅ Test Accuracy (rf):", accuracy_score(y_test, y_pred_rf))
print("📊 rf Classification Report:\n", classification_report(y_test, y_pred_rf))
print("⚙️ Best Params:", grid_search.best_params_)

💡 Best rf Accuracy : 0.807960687960688
✅ Test Accuracy (rf): 0.7580645161290323
📊 rf Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.47      0.55        19
           1       0.79      0.88      0.84        43

    accuracy                           0.76        62
   macro avg       0.72      0.68      0.69        62
weighted avg       0.75      0.76      0.75        62

⚙️ Best Params: {'classifier__max_depth': 5, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}


Try Logistic Regression

In [65]:
model_lr = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])
param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs'],
    'classifier__penalty': ['l2']
}

grid_search_lr = GridSearchCV(
    estimator=model_lr,
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_lr.fit(X_train, y_train)
best_lr = grid_search_lr.best_estimator_

y_pred_lr = best_lr.predict(X_test)

# Results
print("💡 Best lr Accuracy :", grid_search_lr.best_score_)
print("✅ Test Accuracy (lr):", accuracy_score(y_test, y_pred_lr))
print("📊 lr Classification Report:\n", classification_report(y_test, y_pred_lr))
print("⚙️ Best Params:", grid_search_lr.best_params_)

💡 Best lr Accuracy : 0.8115970515970516
✅ Test Accuracy (lr): 0.8225806451612904
📊 lr Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.47      0.62        19
           1       0.81      0.98      0.88        43

    accuracy                           0.82        62
   macro avg       0.85      0.73      0.75        62
weighted avg       0.84      0.82      0.80        62

⚙️ Best Params: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}


Try Support Vector Machine(SVM)

In [69]:
# SVM parameter grid

# SVM pipeline with preprocessing
model_svm = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', SVC())
])
param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']  # Only used for 'rbf' kernel
}

# Grid search for SVM
grid_search_svm = GridSearchCV(
    estimator=model_svm,
    param_grid=param_grid_svm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
# Train
grid_search_svm.fit(X_train, y_train)

# Best model
best_svm = grid_search_svm.best_estimator_
y_pred_svm = best_svm.predict(X_test)

# Results
print("💡 Best SVM Accuracy (CV):", grid_search_svm.best_score_)
print("✅ Test Accuracy (SVM):", accuracy_score(y_test, y_pred_svm))
print("📊 SVM Classification Report:\n", classification_report(y_test, y_pred_svm))
print("⚙️ Best Params:", grid_search_svm.best_params_)

💡 Best SVM Accuracy (CV): 0.807960687960688
✅ Test Accuracy (SVM): 0.8225806451612904
📊 SVM Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.47      0.62        19
           1       0.81      0.98      0.88        43

    accuracy                           0.82        62
   macro avg       0.85      0.73      0.75        62
weighted avg       0.84      0.82      0.80        62

⚙️ Best Params: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
