In [1]:
# Import necessary libraries
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pickle as pk

In [2]:
# Load the dataset and display the first few rows
data = pd.read_csv('dataset/loan_approval_dataset.csv')

In [3]:
data

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [4]:
# Remove unnecessary column 'loan_id' for better data analysis
data.drop(columns=['loan_id'], inplace =True)

In [5]:
data.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [6]:
# Clean column names by stripping whitespace
data.columns= data.columns.str.strip()

In [7]:
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [8]:
# Create a new feature combining all asset columns
data['Assets'] = data.residential_assets_value + data.commercial_assets_value + data.luxury_assets_value + data.bank_asset_value

In [9]:
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,Assets
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected,17000000
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected,57700000
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected,55000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected,7400000
4265,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved,20000000
4266,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected,39000000
4267,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved,28800000


In [10]:
# Drop individual asset columns since they're now combined
data.drop(columns=['residential_assets_value','commercial_assets_value', 'luxury_assets_value', 'bank_asset_value'], inplace =True )

In [11]:
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,Graduate,No,9600000,29900000,12,778,Approved,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,Rejected,17000000
2,3,Graduate,No,9100000,29700000,20,506,Rejected,57700000
3,3,Graduate,No,8200000,30700000,8,467,Rejected,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,Rejected,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,Rejected,7400000
4265,0,Not Graduate,Yes,3300000,11300000,20,559,Approved,20000000
4266,2,Not Graduate,No,6500000,23900000,18,457,Rejected,39000000
4267,1,Not Graduate,No,4100000,12800000,8,780,Approved,28800000


In [12]:
# Check for missing values in the dataset
data.isnull().sum()

no_of_dependents    0
education           0
self_employed       0
income_annum        0
loan_amount         0
loan_term           0
cibil_score         0
loan_status         0
Assets              0
dtype: int64

In [13]:
# Analyze unique values in categorical column 'education'
data.education.unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [14]:
# Define a helper function for data cleaning
def clean_data(st):
    st = st.strip()
    return st

In [15]:
# Apply the cleaning function to the 'education' column
clean_data('Graduate')

'Graduate'

In [16]:
# Replace categorical values with numerical equivalents
data.education = data.education.apply(clean_data)

In [17]:
data.education.unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [18]:
# Process the 'self_employed' column similarly
data['education'] = data['education'].replace(['Graduate', 'Not Graduate'],[1,0])

In [19]:
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,1,No,9600000,29900000,12,778,Approved,50700000
1,0,0,Yes,4100000,12200000,8,417,Rejected,17000000
2,3,1,No,9100000,29700000,20,506,Rejected,57700000
3,3,1,No,8200000,30700000,8,467,Rejected,52700000
4,5,0,Yes,9800000,24200000,20,382,Rejected,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,Yes,1000000,2300000,12,317,Rejected,7400000
4265,0,0,Yes,3300000,11300000,20,559,Approved,20000000
4266,2,0,No,6500000,23900000,18,457,Rejected,39000000
4267,1,0,No,4100000,12800000,8,780,Approved,28800000


In [20]:
data.self_employed.unique()

array([' No', ' Yes'], dtype=object)

In [21]:
data.self_employed = data.self_employed.apply(clean_data)

In [22]:
data.self_employed.unique()

array(['No', 'Yes'], dtype=object)

In [23]:
data['self_employed'] = data['self_employed'].replace(['No', 'Yes'],[1,0])

In [24]:
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,1,1,9600000,29900000,12,778,Approved,50700000
1,0,0,0,4100000,12200000,8,417,Rejected,17000000
2,3,1,1,9100000,29700000,20,506,Rejected,57700000
3,3,1,1,8200000,30700000,8,467,Rejected,52700000
4,5,0,0,9800000,24200000,20,382,Rejected,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,0,1000000,2300000,12,317,Rejected,7400000
4265,0,0,0,3300000,11300000,20,559,Approved,20000000
4266,2,0,1,6500000,23900000,18,457,Rejected,39000000
4267,1,0,1,4100000,12800000,8,780,Approved,28800000


In [25]:
data.loan_status.unique()

array([' Approved', ' Rejected'], dtype=object)

In [26]:
data.loan_status = data.loan_status.apply(clean_data)

In [27]:
# Encode the target column ('loan_status') for binary classification
data['loan_status'] = data['loan_status'].replace(['Approved', 'Rejected'],[1,0])

In [28]:
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,1,1,9600000,29900000,12,778,1,50700000
1,0,0,0,4100000,12200000,8,417,0,17000000
2,3,1,1,9100000,29700000,20,506,0,57700000
3,3,1,1,8200000,30700000,8,467,0,52700000
4,5,0,0,9800000,24200000,20,382,0,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,0,1000000,2300000,12,317,0,7400000
4265,0,0,0,3300000,11300000,20,559,1,20000000
4266,2,0,1,6500000,23900000,18,457,0,39000000
4267,1,0,1,4100000,12800000,8,780,1,28800000


#### Split the dataset into input features and target labels

In [30]:
input_data = data.drop(columns=['loan_status'])
output_data = data['loan_status']

In [31]:
input_data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,Assets
0,2,1,1,9600000,29900000,12,778,50700000
1,0,0,0,4100000,12200000,8,417,17000000
2,3,1,1,9100000,29700000,20,506,57700000
3,3,1,1,8200000,30700000,8,467,52700000
4,5,0,0,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...
4264,5,1,0,1000000,2300000,12,317,7400000
4265,0,0,0,3300000,11300000,20,559,20000000
4266,2,0,1,6500000,23900000,18,457,39000000
4267,1,0,1,4100000,12800000,8,780,28800000


In [32]:
output_data

0       1
1       0
2       0
3       0
4       0
       ..
4264    0
4265    1
4266    0
4267    1
4268    1
Name: loan_status, Length: 4269, dtype: int64

In [33]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2, random_state=42)

In [34]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3415, 8), (854, 8), (3415,), (854,))

#### Scale the features for better model performance

In [36]:
scaler = StandardScaler()

In [37]:
X_train_scaled = scaler.fit_transform(X_train)

In [38]:
X_test_scaled = scaler.fit_transform(X_test)

#### Train a logistic regression model

In [40]:
model = LogisticRegression()

In [41]:
model.fit(X_train_scaled, y_train)

In [42]:
# Check model performance on the test set
model.score(X_test_scaled, y_test)

0.905152224824356

In [43]:
# Make a prediction using sample data
pred_data = pd.DataFrame([['2','1','1'	,'9600000', '29900000', '12', '778', '50700000']], columns=['no_of_dependents', 'education', 'self_employed','income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'Assets'])

In [44]:
pred_scaled_data = scaler.transform(pred_data)

In [45]:
model.predict(pred_scaled_data)

array([1])

#### Save the trained model and scaler for deployment

In [47]:
pk.dump(model, open('model.pkl', 'wb'))

In [48]:
pk.dump(scaler,open('scaler.pkl', 'wb'))