In [1]:
#import pandas for data exploration 
import pandas as pd


In [2]:
#load the dataset file
data = pd.read_csv('loan_approval_dataset.csv')

In [4]:
#have a look at the data (first 5 rows)
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [6]:
# dropping irrelevant column
data.drop(columns = ['loan_id'], inplace=True)


In [7]:
#total features after dropping loan_id
data.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [9]:
# Removes any leading or trailing whitespace characters from column names
data.columns = data.columns.str.strip()

In [10]:
# names of the columns now
data.columns


Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [11]:
data['Assets'] = data.residential_assets_value + data.commercial_assets_value+data.luxury_assets_value + data.bank_asset_value

In [12]:
# Create a new feature 'Assets' representing the total value of all type of assets.

data.drop(columns = ['residential_assets_value','commercial_assets_value','luxury_assets_value','bank_asset_value'], inplace = True)

In [13]:
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,Graduate,No,9600000,29900000,12,778,Approved,50700000
1,0,Not Graduate,Yes,4100000,12200000,8,417,Rejected,17000000
2,3,Graduate,No,9100000,29700000,20,506,Rejected,57700000
3,3,Graduate,No,8200000,30700000,8,467,Rejected,52700000
4,5,Not Graduate,Yes,9800000,24200000,20,382,Rejected,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,Rejected,7400000
4265,0,Not Graduate,Yes,3300000,11300000,20,559,Approved,20000000
4266,2,Not Graduate,No,6500000,23900000,18,457,Rejected,39000000
4267,1,Not Graduate,No,4100000,12800000,8,780,Approved,28800000


In [14]:
data.isnull().sum()

no_of_dependents    0
education           0
self_employed       0
income_annum        0
loan_amount         0
loan_term           0
cibil_score         0
loan_status         0
Assets              0
dtype: int64

In [15]:
# status of values in education column
data.education.unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [16]:
# removing leading or trailing whitespaces
def clean_data(st):
    st = st.strip()
    return st

In [23]:
# applying the clean_date function to the 'education' column
data.education = data.education.apply(clean_data)

In [24]:
# Convert 'education' column to numerical values: Graduate → 1, Not Graduate → 0
data['education'] = data['education'].replace(['Graduate', 'Not Graduate'],[1,0])

In [25]:
# checking the values in the 'education' column now
data.education.unique()

array([1, 0], dtype=int64)

In [26]:
# checking values of  'self_employed' column
data.self_employed.unique()

array(['No', 'Yes'], dtype=object)

In [27]:
# applying clean_data function to the self_employed column
data.self_employed = data.self_employed.apply(clean_data)

In [28]:
# values of 'self_employed' column now
data.self_employed.unique()

array(['No', 'Yes'], dtype=object)

In [29]:
# Convert 'self_employed' column to numerical values: Yes → 1, No → 0
data.self_employed = data.self_employed.replace(['No', 'Yes'],[0,1])

In [31]:
# checking values of 'loan_status' column
data.loan_status.unique()

array([' Approved', ' Rejected'], dtype=object)

In [28]:
# applying clean_data funtion to the loan_status column
data.loan_status = data.loan_status.apply(clean_data)

In [29]:
# values of the loan_status after applying above function
data.loan_status.unique()

array(['Approved', 'Rejected'], dtype=object)

In [30]:
# Convert 'loan_status' column to numerical values: Approved → 1, Rejected → 0
data.loan_status = data.loan_status.replace(['Approved', 'Rejected'],[1,0])

In [31]:
# final data
data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,loan_status,Assets
0,2,1,0,9600000,29900000,12,778,1,50700000
1,0,0,1,4100000,12200000,8,417,0,17000000
2,3,1,0,9100000,29700000,20,506,0,57700000
3,3,1,0,8200000,30700000,8,467,0,52700000
4,5,0,1,9800000,24200000,20,382,0,55000000
...,...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,0,7400000
4265,0,0,1,3300000,11300000,20,559,1,20000000
4266,2,0,0,6500000,23900000,18,457,0,39000000
4267,1,0,0,4100000,12800000,8,780,1,28800000


In [32]:
# Import the train_test_split function from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

In [33]:
# Define the input (features, X) by dropping the 'loan_status' column (the target variable)
# and the output (target, y) as the 'loan_status' column itself.
input_data = data.drop(columns=['loan_status'])
output_data = data['loan_status']

In [34]:
input_data

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,Assets
0,2,1,0,9600000,29900000,12,778,50700000
1,0,0,1,4100000,12200000,8,417,17000000
2,3,1,0,9100000,29700000,20,506,57700000
3,3,1,0,8200000,30700000,8,467,52700000
4,5,0,1,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...
4264,5,1,1,1000000,2300000,12,317,7400000
4265,0,0,1,3300000,11300000,20,559,20000000
4266,2,0,0,6500000,23900000,18,457,39000000
4267,1,0,0,4100000,12800000,8,780,28800000


In [35]:
output_data

0       1
1       0
2       0
3       0
4       0
       ..
4264    0
4265    1
4266    0
4267    1
4268    1
Name: loan_status, Length: 4269, dtype: int64

In [36]:
# Split the data into training and testing sets.
x_train,x_test,y_train,y_test = train_test_split(input_data,output_data, test_size=0.2)

In [37]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3415, 8), (854, 8), (3415,), (854,))

In [38]:
from sklearn.preprocessing import StandardScaler

In [39]:
scaler = StandardScaler()

In [40]:
x_train_scaled = scaler.fit_transform(x_train)

In [41]:
x_test_scaled = scaler.transform(x_test)

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [43]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)


In [44]:
rf_model.fit(x_train_scaled, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [45]:
rf_y_pred = rf_model.predict(x_test_scaled)


In [46]:

print("Random Forest Accuracy:", accuracy_score(y_test, rf_y_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, rf_y_pred))


Random Forest Accuracy: 0.9789227166276346

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       305
           1       0.97      0.99      0.98       549

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.98       854
weighted avg       0.98      0.98      0.98       854


Confusion Matrix:
 [[290  15]
 [  3 546]]


In [47]:

log_y_pred = rf_model.predict(x_test_scaled)


In [48]:
import pickle as pk

In [51]:
pk.dump(rf_model, open('model.pkl','wb'))

In [52]:
pk.dump(scaler,open('scaler.pkl','wb'))