In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
import joblib


In [2]:
# Ignore warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('Dataset_with_Synthetic_Personal_Loan.csv')


In [4]:
# Checking the dataset
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income,Credit_Card_Debt,Existing_Personal_Loan,Employment_Type
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0,1278.26,1,Unemployed
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0,2910.41,1,Salaried
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0,1138.19,0,Self-Employed
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0,1578.14,0,Salaried
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0,721.25,1,Salaried


In [9]:
# DROP UNUSED COLUMNS
# -----------------------
cols_to_drop = ['Loan_ID', 'Loan_Type', 'Property_Area', 'LoanAmount', 
                'Loan_Amount_Term', 'CoapplicantIncome', 'ApplicantIncome', 
                'Total_Income', 'Self_Employed', 'Credit_History', 'Loan_Status']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')


In [10]:
# HANDLE MISSING VALUES
# -----------------------
df['Credit_Card_Debt'] = df['Credit_Card_Debt'].fillna(df['Credit_Card_Debt'].median())
df['Existing_Personal_Loan'] = df['Existing_Personal_Loan'].fillna(0)

In [11]:
for col in ['Gender', 'Married', 'Dependents', 'Education', 'Employment_Type']:
    df[col] = df[col].fillna(df[col].mode()[0])


In [12]:
# ENCODING CATEGORICAL VARIABLES
# -----------------------
label_cols = ['Gender', 'Married', 'Education', 'Dependents', 'Employment_Type']
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Credit_Card_Debt,Existing_Personal_Loan,Employment_Type
0,1,0,0,0,1278.26,1,2
1,1,1,1,0,2910.41,1,0
2,1,1,0,0,1138.19,0,1
3,1,1,0,1,1578.14,0,0
4,1,0,0,0,721.25,1,0


In [None]:
# INDEPENDENT & DEPENDENT VARIABLES
# -----------------------
X = df.drop(columns=['Existing_Personal_Loan'], axis=1)
y = df['Existing_Personal_Loan']

In [15]:
# HANDLE IMBALANCED DATA
# -----------------------
oversample = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X, y)


In [16]:
# SPLIT DATA
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

In [17]:
# Logistic Regression
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, y_train)
y_pred_model1 = model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_model1)
print("Accuracy of Logistic Regression:", accuracy * 100)


Accuracy of Logistic Regression: 52.07373271889401


In [18]:
# Decision Tree
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(X_train, y_train)
y_pred_model2 = model2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_model2)
print("Accuracy of Decision Tree:", accuracy * 100)

Accuracy of Decision Tree: 76.49769585253456


In [19]:
# Random Forest
model3 = RandomForestClassifier(n_estimators=200, random_state=42)
model3.fit(X_train, y_train)
y_pred_model3 = model3.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_model3)
print("Accuracy of Random Forest:", accuracy * 100)

Accuracy of Random Forest: 78.80184331797236


In [20]:
# K-Nearest Neighbors
model4 = KNeighborsClassifier(n_neighbors=3)
model4.fit(X_train, y_train)
y_pred_model4 = model4.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_model4)
print("Accuracy of K-Nearest Neighbors:", accuracy * 100)

Accuracy of K-Nearest Neighbors: 66.3594470046083


In [21]:
# CLASSIFICATION REPORTS
# -----------------------
def generate_classification_report(model_name, y_test, y_pred):
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {model_name}:\n{report}\n")

generate_classification_report("Logistic Regression", y_test, y_pred_model1)
generate_classification_report("Decision Tree", y_test, y_pred_model2)
generate_classification_report("Random Forest", y_test, y_pred_model3)
generate_classification_report("K-Nearest Neighbors", y_test, y_pred_model4)

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.54      0.46      0.50       112
           1       0.50      0.58      0.54       105

    accuracy                           0.52       217
   macro avg       0.52      0.52      0.52       217
weighted avg       0.52      0.52      0.52       217


Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.82      0.70      0.75       112
           1       0.72      0.84      0.78       105

    accuracy                           0.76       217
   macro avg       0.77      0.77      0.76       217
weighted avg       0.77      0.76      0.76       217


Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.84      0.73      0.78       112
           1       0.75      0.85      0.79       105

    accuracy                           0.79       2

In [22]:
joblib.dump(model3, "Random_Forest.pkl")


['Random_Forest.pkl']

In [23]:
import joblib

# Load the trained Random Forest model
model = joblib.load("Random_Forest.pkl")

# Make predictions
import numpy as np
sample_input = np.array([[1, 1, 2, 1, 0, 5000]])  # Adjust input as per feature order
prediction = model.predict(sample_input)

# Display result
print("Prediction:", "Approved" if prediction[0] == 1 else "Rejected")


Prediction: Rejected


In [25]:
import joblib

# Load the trained model
model = joblib.load("Random_Forest.pkl")

# Get feature names used during training
print("Feature Order Used in Training:", model.feature_names_in_)


Feature Order Used in Training: ['Gender' 'Married' 'Dependents' 'Education' 'Credit_Card_Debt'
 'Employment_Type']
