# Import & Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import joblib

In [2]:
# Load the raw loan-approval CSV and strip surrounding whitespace from every
# header so later cells can address columns by their clean names.
# NOTE(review): the path is absolute and machine-specific.
data = pd.read_csv(
    r"C:\Users\Deepak Verma\OneDrive\Documents\Loan_Default_Prediction\data\raw\loan_approval_dataset.csv"
).rename(columns=str.strip)

In [3]:
# Preview the first five rows to confirm the file parsed as expected.
data.head(n=5)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


# Removing Unwanted Columns

In [4]:
# loan_id looks like a plain row identifier (1, 2, 3, ... in the preview), so it
# is removed before feature engineering.
data = data.drop(columns=["loan_id"])

# Handle Missing Values

In [5]:
# Count the missing entries in each column (isna is the same check as isnull).

print(data.isna().sum(axis=0))

no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64


In [6]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep="first").sum()

0

In [7]:
# Rename only the columns whose names change, using an explicit old -> new
# mapping. Relabelling all columns positionally would silently attach the wrong
# names if the CSV column order ever changed; errors="raise" makes a missing
# source column fail loudly instead.
# The "_in_lakhs" columns are converted to lakhs in the following cell.
# NOTE(review): confirm that loan_term really is expressed in months.
data = data.rename(
    columns={
        "income_annum": "annual_income_in_lakhs",
        "loan_amount": "loan_amount_in_lakhs",
        "loan_term": "loan_term_in_months",
    },
    errors="raise",
)

In [8]:
def lakhs(value):
    """Convert an amount into lakhs (units of 100,000).

    Parameters
    ----------
    value : number, pandas Series or DataFrame
        Amount(s) expressed in base units; anything that supports true
        division by an integer works.

    Returns
    -------
    Same kind of object as ``value``, divided by 100000.
    """
    units_per_lakh = 100000
    return value / units_per_lakh

# Monetary columns that are still in base units and must be converted to lakhs.
amount_col = ["annual_income_in_lakhs", "loan_amount_in_lakhs", "residential_assets_value", "commercial_assets_value",
                 "luxury_assets_value", "bank_asset_value"]

# lakhs() is plain arithmetic, so it is applied to the whole sub-frame in one
# vectorised step instead of calling it once per cell via Series.apply.
# Caution: this cell is not idempotent; re-running it divides the values again.
data[amount_col] = lakhs(data[amount_col])

In [9]:
def loan_income_ratio(value1, value2):
    """Return ``value2`` divided by ``value1``.

    Note the argument order: the denominator comes first. In this notebook
    ``value1`` is the annual income and ``value2`` the loan amount, so the
    result is the loan amount expressed as a multiple of income.

    Parameters
    ----------
    value1 : number or pandas Series
        Denominator.
    value2 : number or pandas Series
        Numerator.
    """
    ratio = value2 / value1
    return ratio

# Loan amount as a multiple of annual income; keyword arguments make the
# (denominator, numerator) order of loan_income_ratio explicit.
data["debt_to_income_ratio"] = loan_income_ratio(
    value1=data["annual_income_in_lakhs"],
    value2=data["loan_amount_in_lakhs"],
)

In [10]:
def cibil_class(value):
    """Bucket a numeric CIBIL score into a named credit band.

    Bands:
        600 or less             -> "Low"
        above 600, below 700    -> "Fair"
        700 up to, not incl. 800 -> "Good"
        800 and above           -> "Excellent"

    Ordered comparisons are used instead of ``value in range(...)``: ``range``
    excludes its stop value, which previously let scores of exactly 600, 699
    and 799 (as well as scores below 300 and non-integer scores) fall through
    to the final ``else`` branch and be labelled "Excellent".

    Parameters
    ----------
    value : int or float
        The credit score to classify.

    Returns
    -------
    str
        One of "Low", "Fair", "Good" or "Excellent".
    """
    if value <= 600:
        return "Low"
    if value < 700:
        return "Fair"
    if value < 800:
        return "Good"
    return "Excellent"

# Replace each numeric score with its band label, one element at a time.
# Caution: this overwrites the numeric column, so the cell must run only once.
data["cibil_score"] = data["cibil_score"].map(cibil_class)

In [11]:
# Total asset value: the four asset columns added left to right. Series.add
# behaves like "+" here, so a missing value in any column still propagates.
data["Asset_Score_in_lakhs"] = (
    data["residential_assets_value"]
    .add(data["commercial_assets_value"])
    .add(data["luxury_assets_value"])
    .add(data["bank_asset_value"])
)

In [12]:
# The individual asset columns are now summarised by Asset_Score_in_lakhs,
# so they are dropped. `columns=` already selects the axis.
data = data.drop(
    columns=[
        "residential_assets_value",
        "commercial_assets_value",
        "luxury_assets_value",
        "bank_asset_value",
    ]
)

In [13]:
# Inspect the first five rows after feature engineering.
data.head(n=5)

Unnamed: 0,no_of_dependents,education,self_employed,annual_income_in_lakhs,loan_amount_in_lakhs,loan_term_in_months,cibil_score,loan_status,debt_to_income_ratio,Asset_Score_in_lakhs
0,2,Graduate,No,96.0,299.0,12,Good,Approved,3.114583,507.0
1,0,Not Graduate,Yes,41.0,122.0,8,Low,Rejected,2.97561,170.0
2,3,Graduate,No,91.0,297.0,20,Low,Rejected,3.263736,577.0
3,3,Graduate,No,82.0,307.0,8,Low,Rejected,3.743902,527.0
4,5,Not Graduate,Yes,98.0,242.0,20,Low,Rejected,2.469388,550.0


# Encode Categorical Variables

In [14]:
# String-valued columns to convert to integer codes. Despite the variable name,
# not all of them are binary: cibil_score holds up to four band labels.
binary_cols = ["education", "self_employed", "cibil_score", "loan_status"]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last, making it impossible to encode new
# inputs consistently or to decode predictions later.
# Each encoder's label order is available as label_encoders[col].classes_.
label_encoders = {}

for col in binary_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# After the loop `le` is the encoder fitted on loan_status, as before.

# Scale Numerical Features

In [15]:
# Separate the predictors from the target; only the predictors are scaled.
x = data.drop(columns="loan_status")

# Fit the min-max scaler on the predictors, then transform them, as two
# explicit steps.
# NOTE(review): the scaler is fitted on the full dataset; confirm this happens
# before any train/test split on purpose.
scaler = MinMaxScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# Save Processed Data

In [16]:
# Wrap the scaled array back into a labelled frame and append the (unscaled)
# encoded target as the last column.
processed_df = pd.DataFrame(data=x_scaled, columns=x.columns).assign(
    loan_status=data["loan_status"]
)

# Persist the model-ready table without the index column.
# NOTE(review): the path is absolute and machine-specific.
processed_df.to_csv(
    r"C:\Users\Deepak Verma\OneDrive\Documents\Loan_Default_Prediction\data\processed\loan_data_processed.csv",
    index=False,
)

In [17]:
# Check the first five rows of the scaled output.
processed_df.head(n=5)

Unnamed: 0,no_of_dependents,education,self_employed,annual_income_in_lakhs,loan_amount_in_lakhs,loan_term_in_months,cibil_score,debt_to_income_ratio,Asset_Score_in_lakhs,loan_status
0,0.4,0.0,0.0,0.969072,0.755102,0.555556,0.666667,0.645833,0.557032,0
1,0.0,1.0,1.0,0.402062,0.303571,0.333333,1.0,0.590244,0.183832,1
2,0.6,0.0,0.0,0.917526,0.75,1.0,1.0,0.705495,0.634551,1
3,0.6,0.0,0.0,0.824742,0.77551,0.333333,1.0,0.897561,0.579181,1
4,1.0,1.0,1.0,0.989691,0.609694,1.0,1.0,0.387755,0.604651,1


# Save the Scaler

In [18]:
# Persist the fitted scaler so the same scaling can be re-applied later.
# Only the scaler is stored here; the categorical encoding and CIBIL binning
# steps are not part of this file despite its "pipeline" name.
joblib.dump(
    value=scaler,
    filename=r"C:\Users\Deepak Verma\OneDrive\Documents\Loan_Default_Prediction\models\preprocessing_pipeline.pkl",
)

['C:\\Users\\Deepak Verma\\OneDrive\\Documents\\Loan_Default_Prediction\\models\\preprocessing_pipeline.pkl']