## Importing Modules 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

## Loading Dataset 

In [4]:
data = pd.read_csv('loan_approval_dataset.csv')
data.head(5) 

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


## EDA 

In [5]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   income_annum              4269 non-null   int64 
 5   loan_amount               4269 non-null   int64 
 6   loan_term                 4269 non-null   int64 
 7   cibil_score               4269 non-null   int64 
 8   residential_assets_value  4269 non-null   int64 
 9   commercial_assets_value   4269 non-null   int64 
 10  luxury_assets_value       4269 non-null   int64 
 11  bank_asset_value          4269 non-null   int64 
 12  loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [6]:
data.describe() 

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [7]:
data.isnull().sum() 

loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

## Replacing String Values using Label Encoding 

The List of Columns having string values are - 

1. education 

2. self_employed

3. loan_status 

In [8]:
data['education'].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [9]:
data['self_employed'].unique()

array([' No', ' Yes'], dtype=object)

In [10]:
data['loan_status'].unique()

array([' Approved', ' Rejected'], dtype=object)

In [11]:
data['education'] = data['education'].replace([' Graduate', ' Not Graduate'], [1, 0])
data['self_employed'] = data['self_employed'].replace([' Yes', ' No'], [1, 0])
data['loan_status'] = data['loan_status'].replace([' Approved', ' Rejected'], [1, 0]) 

  data['education'] = data['education'].replace([' Graduate', ' Not Graduate'], [1, 0])
  data['self_employed'] = data['self_employed'].replace([' Yes', ' No'], [1, 0])
  data['loan_status'] = data['loan_status'].replace([' Approved', ' Rejected'], [1, 0])


In [12]:
data['education'].unique()

array([1, 0])

In [13]:
data['self_employed'].unique()

array([0, 1])

In [14]:
data['loan_status'].unique()

array([1, 0])

## Building the Model 

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report 

In [19]:
model = DecisionTreeClassifier()

In [20]:
X = data.drop(['loan_id', 'loan_status'], axis = 1)
y = data['loan_status'] 

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3) 

In [22]:
model.fit(X_train, y_train) 

In [23]:
prediction = model.predict(X_test) 

In [24]:
accuracy_score(y_test, prediction) * 100 

97.65807962529274

In [25]:
print(classification_report(y_test, prediction)) 

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       471
           1       0.98      0.99      0.98       810

    accuracy                           0.98      1281
   macro avg       0.98      0.97      0.97      1281
weighted avg       0.98      0.98      0.98      1281



## Checking Accuracy for Multiple Models 

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 

In [27]:
models = {'DecisionTreeClassifier': DecisionTreeClassifier(),
         'LogisticRegression': LogisticRegression(),
         'RandomForestClassifier': RandomForestClassifier(),
         'XGBClassifier': XGBClassifier()}

for model_name, mod in models.items():
    print(f"\nChecking for {model_name}")
    model = mod
    
    model.fit(X_train, y_train)
    
    prediction = model.predict(X_test) 
    
    print(f"Accuracy Score - {accuracy_score(y_test, prediction) * 100 }\n")
    print(classification_report(y_test, prediction)) 


Checking for DecisionTreeClassifier
Accuracy Score - 97.81420765027322

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       471
           1       0.98      0.99      0.98       810

    accuracy                           0.98      1281
   macro avg       0.98      0.97      0.98      1281
weighted avg       0.98      0.98      0.98      1281


Checking for LogisticRegression
Accuracy Score - 72.6775956284153

              precision    recall  f1-score   support

           0       0.77      0.37      0.50       471
           1       0.72      0.93      0.81       810

    accuracy                           0.73      1281
   macro avg       0.74      0.65      0.66      1281
weighted avg       0.74      0.73      0.70      1281


Checking for RandomForestClassifier


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score - 97.81420765027322

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       471
           1       0.98      0.99      0.98       810

    accuracy                           0.98      1281
   macro avg       0.98      0.97      0.98      1281
weighted avg       0.98      0.98      0.98      1281


Checking for XGBClassifier
Accuracy Score - 97.50195160031225

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       471
           1       0.98      0.98      0.98       810

    accuracy                           0.98      1281
   macro avg       0.97      0.97      0.97      1281
weighted avg       0.97      0.98      0.97      1281



## Saving Model 

Since DecisionTreeClassifier has given the best accuracy, we will use it for building the API 

In [28]:
model = DecisionTreeClassifier()

In [29]:
model.fit(X_train, y_train) 

In [30]:
import pickle
import joblib

In [31]:
with open("final_model.pkl", "wb") as f:
    pickle.dump(model, f)
    
joblib.dump(model, "final_model.joblib") 

['final_model.joblib']

In [32]:
test_data = data = [[4, 1, 1, 10000000, 100000, 2, 800, 100000, 100000, 100000, 100000]]

In [33]:
model.predict(test_data) 



array([1])

In [34]:
loaded_model = joblib.load("final_model.joblib") 

In [35]:
loaded_model.predict(test_data) 



array([1])

In [1]:
import sklearn; print(sklearn.__version__) 

1.3.2


In [68]:
!pip install --upgrade scikit-learn 

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 11.1/11.1 MB 11.4 MB/s eta 0:00:00
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.1
    Uninstalling scikit-learn-1.5.1:
      Successfully uninstalled scikit-learn-1.5.1
Successfully installed scikit-learn-1.6.1


  You can safely remove it manually.

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip uninstall scikit-learn 

^C
