In [26]:
import os
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler


In [27]:
# Load the raw loan-approval CSV; the cells below read and modify this frame.
raw_dataset_path = '../data/loan_approval_dataset.csv'
data = pd.read_csv(raw_dataset_path)

In [32]:
# Preview the first five rows (5 is head()'s default, spelled out here).
data.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [34]:
# Compute per-column summary statistics (truncated means of the monetary
# columns, modes of the categorical columns) and pickle them to disk.
#
# NOTE(review): this cell is positioned above the cleaning cells but needs
# their results: the columns listed here must exist under exactly these names
# and must already be numeric ('education' / 'self_employed' mapped to 0/1).
# The check below turns an out-of-order run into an explicit error instead of
# a KeyError or a pickle that silently stores raw string labels.
numeric_stat_columns = [
    'income_annum', 'loan_amount', 'residential_assets_value',
    'commercial_assets_value', 'luxury_assets_value',
    'bank_asset_value',
]
categorical_stat_columns = ['no_of_dependents', 'education', 'self_employed']

required_columns = numeric_stat_columns + categorical_stat_columns
missing_columns = [col for col in required_columns if col not in data.columns]
unencoded_columns = [
    col for col in required_columns
    if col in data.columns and not pd.api.types.is_numeric_dtype(data[col])
]
if missing_columns or unencoded_columns:
    raise RuntimeError(
        "Run the column-name stripping and 0/1 encoding cells before this one "
        f"(missing columns: {missing_columns}; non-numeric columns: {unencoded_columns})"
    )

# astype(int) truncates the means to whole numbers.
mean_values = data[numeric_stat_columns].mean().astype(int)
print(mean_values)

# mode() returns one row per tied value; iloc[0] keeps the first row.
most_frequent_values = data[categorical_stat_columns].mode().iloc[0]
print(most_frequent_values)

stats_path = '../data/mean_and_frequent_values.pkl'
with open(stats_path, 'wb') as file:
    pickle.dump({'mean_values': mean_values, 'most_frequent_values': most_frequent_values}, file)

print(f"Saved in '{stats_path}'")

income_annum                 5059123
loan_amount                 15133450
residential_assets_value     7472616
commercial_assets_value      4973155
luxury_assets_value         15126305
bank_asset_value             4976692
dtype: int64
no_of_dependents    4
education           1
self_employed       1
Name: 0, dtype: int64
Saved in '../data/mean_and_frequent_values.pkl'


In [28]:
# Normalise the header: remove surrounding whitespace from every column name.
data.columns = data.columns.map(str.strip)

# Show how many rows fall into each 'education' category.
education_counts = data["education"].value_counts()
education_counts

education
Graduate        2144
Not Graduate    2125
Name: count, dtype: int64

In [29]:
# Encode the target column as integers: 'Approved' -> 1, 'Rejected' -> 0.
# The numeric-dtype guard makes the cell safe to re-run: an unconditional
# `.str.strip()` raises AttributeError once the column holds numbers.
if not pd.api.types.is_numeric_dtype(data['loan_status']):
    status_labels = data['loan_status'].str.strip()
    status_codes = status_labels.map({'Approved': 1, 'Rejected': 0})
    # map() turns labels missing from the dict into NaN without complaint;
    # report them instead (values that were already missing are left as NaN).
    unexpected_labels = status_labels[status_codes.isna() & status_labels.notna()].unique()
    if len(unexpected_labels) > 0:
        raise ValueError(f"Unexpected loan_status labels: {list(unexpected_labels)}")
    data['loan_status'] = status_codes

In [30]:
# Encode 'education' as integers: 'Graduate' -> 1, 'Not Graduate' -> 0.
# The numeric-dtype guard makes the cell safe to re-run: an unconditional
# `.str.strip()` raises AttributeError once the column holds numbers.
if not pd.api.types.is_numeric_dtype(data['education']):
    education_labels = data['education'].str.strip()
    education_codes = education_labels.map({'Graduate': 1, 'Not Graduate': 0})
    # map() turns labels missing from the dict into NaN without complaint;
    # report them instead (values that were already missing are left as NaN).
    unexpected_labels = education_labels[education_codes.isna() & education_labels.notna()].unique()
    if len(unexpected_labels) > 0:
        raise ValueError(f"Unexpected education labels: {list(unexpected_labels)}")
    data['education'] = education_codes

In [31]:
# Encode 'self_employed' as integers: 'Yes' -> 1, 'No' -> 0.
# The numeric-dtype guard makes the cell safe to re-run: an unconditional
# `.str.strip()` raises AttributeError once the column holds numbers.
if not pd.api.types.is_numeric_dtype(data['self_employed']):
    employment_labels = data['self_employed'].str.strip()
    employment_codes = employment_labels.map({'Yes': 1, 'No': 0})
    # map() turns labels missing from the dict into NaN without complaint;
    # report them instead (values that were already missing are left as NaN).
    unexpected_labels = employment_labels[employment_codes.isna() & employment_labels.notna()].unique()
    if len(unexpected_labels) > 0:
        raise ValueError(f"Unexpected self_employed labels: {list(unexpected_labels)}")
    data['self_employed'] = employment_codes

In [8]:
# Spot-check the first five 'education' values after encoding.
data["education"].head(n=5)

0    1
1    0
2    1
3    1
4    0
Name: education, dtype: int64

In [23]:
# Engineer a 'Total assets' feature: the row-wise sum of the four asset columns,
# accumulated left to right in the order listed.
asset_columns = [
    "residential_assets_value",
    "commercial_assets_value",
    "luxury_assets_value",
    "bank_asset_value",
]
running_total = data[asset_columns[0]]
for asset_column in asset_columns[1:]:
    running_total = running_total + data[asset_column]
data["Total assets"] = running_total

In [11]:
# Correlation (Series.corr, default method) between the encoded loan status
# and the engineered 'Total assets' column.
correlation_total = data['loan_status'].corr(data['Total assets'])
# The label names the column that is actually correlated: 'Total assets',
# not the CIBIL score.
print(f"Correlation between loan status and total assets: {correlation_total}")

Correlation between loan status and CIBIL score: -0.01128142674264055


In [12]:
# Min-max scale the monetary columns (and the engineered total) in place.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed further down, so test rows influence the fitted
# min/max. Consider fitting on the training rows only.
scaled_columns = [
    'income_annum',
    'loan_amount',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
    'Total assets',
]
scaler = MinMaxScaler()
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

In [15]:
# Separate the label column from the feature columns.
# NOTE(review): X keeps every other column, including 'loan_id', which looks
# like a row identifier. Confirm it is meant to be a model feature.
target_column = 'loan_status'
X = data.drop(target_column, axis=1)
y = data[target_column]

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
holdout_fraction = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=split_seed,
)

In [18]:
# Persist the fitted scaler object with joblib.
preprocessing_objects_path = "../src/models/scaler.pkl"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (no-op when it is already there).
os.makedirs(os.path.dirname(preprocessing_objects_path), exist_ok=True)
joblib.dump(scaler, preprocessing_objects_path)

print(f"Preprocessing objects saved to {preprocessing_objects_path}")

Preprocessing objects saved to ../src/models/scaler.pkl


In [28]:
# Re-attach the labels to each feature split (positionally, via .values) and
# write both frames to CSV without the index column.
train_df = X_train.assign(loan_status=y_train.values)
test_df = X_test.assign(loan_status=y_test.values)

train_df.to_csv('../data/train.csv', index=False)
test_df.to_csv('../data/test.csv', index=False)

In [27]:
# Number of rows in the training split.
train_df.shape[0]

3415

In [29]:
# Preview the training split. A bare last expression uses the notebook's rich
# table display; print() falls back to plain text that wraps wide frames
# across several column chunks.
train_df.head()

      loan_id  no_of_dependents  education  self_employed  income_annum  \
1675     1676                 5          0              0      0.793814   
1164     1165                 0          0              1      0.969072   
192       193                 1          1              0      0.061856   
910       911                 2          1              1      0.484536   
567       568                 5          1              1      0.288660   

      loan_amount  loan_term  cibil_score  residential_assets_value  \
1675     0.755102          6          568                  0.202055   
1164     0.859694         12          710                  0.818493   
192      0.066327          8          682                  0.078767   
910      0.326531         18          754                  0.284247   
567      0.275510         12          441                  0.294521   

      commercial_assets_value  luxury_assets_value  bank_asset_value  \
1675                 0.716495             0.401028

In [20]:
# Report how many rows the full (unsplit) frame contains.
num_rows = data.shape[0]
print(f"Amount of rows in dataframe: {num_rows}")

Amount of rows in dataframe: 4269
