<a href="https://colab.research.google.com/github/geomwangi007/Machine-Learning-Projects/blob/main/Loan_Eligibility_Prediction_using_Machine_Learning_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook kernel, I will be predicting eligibility for loans among customers and investigating the criteria that are absent, aiming to understand why certain customers are unable to secure a loan while others are not.

**Loan Eligibility Prediction **

In [121]:
#Importing the dependencies
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [122]:
#Loading the dataset
df  = pd.read_csv('/content/Loan_Data.csv')

In [123]:
#Loading the first five rows of the dataset
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [124]:
df.shape

(614, 13)

In [125]:
#Checking for missing values
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [126]:
#Dropping the missing values rows
df.dropna()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [128]:
#Checking the distribution of the Target varible
df['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [129]:
#Columns in the dataset
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [130]:
#Encoding the categorical varibles into numerical values
df = pd.get_dummies(df, columns =  ['Gender', 'Married', 'Education',
       'Self_Employed', 'Property_Area'])
df= df.replace({'Y':0 , 'N':1})

In [131]:
df.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,0,5849,0.0,,360.0,1.0,0,0,1,1,0,1,0,1,0,0,0,1
1,LP001003,1,4583,1508.0,128.0,360.0,1.0,1,0,1,0,1,1,0,1,0,1,0,0
2,LP001005,0,3000,0.0,66.0,360.0,1.0,0,0,1,0,1,1,0,0,1,0,0,1
3,LP001006,0,2583,2358.0,120.0,360.0,1.0,0,0,1,0,1,0,1,1,0,0,0,1
4,LP001008,0,6000,0.0,141.0,360.0,1.0,0,0,1,1,0,1,0,1,0,0,0,1


In [132]:
df.isnull().sum()

Loan_ID                     0
Dependents                 15
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Loan_Status                 0
Gender_Female               0
Gender_Male                 0
Married_No                  0
Married_Yes                 0
Education_Graduate          0
Education_Not Graduate      0
Self_Employed_No            0
Self_Employed_Yes           0
Property_Area_Rural         0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [133]:
df = df.dropna()

In [136]:
#Replacing the 3+ with 3
df = df.replace({'3+': '3'})

In [137]:
df.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,LP001003,1,4583,1508.0,128.0,360.0,1.0,1,0,1,0,1,1,0,1,0,1,0,0
2,LP001005,0,3000,0.0,66.0,360.0,1.0,0,0,1,0,1,1,0,0,1,0,0,1
3,LP001006,0,2583,2358.0,120.0,360.0,1.0,0,0,1,0,1,0,1,1,0,0,0,1
4,LP001008,0,6000,0.0,141.0,360.0,1.0,0,0,1,1,0,1,0,1,0,0,0,1
5,LP001011,2,5417,4196.0,267.0,360.0,1.0,0,0,1,0,1,1,0,0,1,0,0,1


In [138]:
df.isnull().sum()

Loan_ID                    0
Dependents                 0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Loan_Status                0
Gender_Female              0
Gender_Male                0
Married_No                 0
Married_Yes                0
Education_Graduate         0
Education_Not Graduate     0
Self_Employed_No           0
Self_Employed_Yes          0
Property_Area_Rural        0
Property_Area_Semiurban    0
Property_Area_Urban        0
dtype: int64

**Substantiating the Features and Targets**
x= features
y = target

In [140]:
x = df.drop(columns = ['Loan_ID','Loan_Status'] , axis = 1)
y = df['Loan_Status']

In [142]:
x

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,1,4583,1508.0,128.0,360.0,1.0,0,1,0,1,1,0,1,0,1,0,0
2,0,3000,0.0,66.0,360.0,1.0,0,1,0,1,1,0,0,1,0,0,1
3,0,2583,2358.0,120.0,360.0,1.0,0,1,0,1,0,1,1,0,0,0,1
4,0,6000,0.0,141.0,360.0,1.0,0,1,1,0,1,0,1,0,0,0,1
5,2,5417,4196.0,267.0,360.0,1.0,0,1,0,1,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,2900,0.0,71.0,360.0,1.0,1,0,1,0,1,0,1,0,1,0,0
610,3,4106,0.0,40.0,180.0,1.0,0,1,0,1,1,0,1,0,1,0,0
611,1,8072,240.0,253.0,360.0,1.0,0,1,0,1,1,0,1,0,0,0,1
612,2,7583,0.0,187.0,360.0,1.0,0,1,0,1,1,0,1,0,0,0,1


In [143]:
df['Dependents'].value_counts()

0    295
2     92
1     85
3     45
Name: Dependents, dtype: int64

In [144]:
y

1      1
2      0
3      0
4      0
5      0
      ..
609    0
610    0
611    0
612    0
613    1
Name: Loan_Status, Length: 517, dtype: int64

In [145]:
df['Dependents'] = df['Dependents'].astype(float)

In [146]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 517 entries, 1 to 613
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Dependents               517 non-null    object 
 1   ApplicantIncome          517 non-null    int64  
 2   CoapplicantIncome        517 non-null    float64
 3   LoanAmount               517 non-null    float64
 4   Loan_Amount_Term         517 non-null    float64
 5   Credit_History           517 non-null    float64
 6   Gender_Female            517 non-null    uint8  
 7   Gender_Male              517 non-null    uint8  
 8   Married_No               517 non-null    uint8  
 9   Married_Yes              517 non-null    uint8  
 10  Education_Graduate       517 non-null    uint8  
 11  Education_Not Graduate   517 non-null    uint8  
 12  Self_Employed_No         517 non-null    uint8  
 13  Self_Employed_Yes        517 non-null    uint8  
 14  Property_Area_Rural      5

**Splitting the training and testing data**

In [147]:
x_train, x_test , y_train ,y_test = train_test_split(x , y , test_size= 0.15 ,random_state= 42 , stratify= y)

In [148]:
models = [LogisticRegression(max_iter=1000) , SVC(kernel = 'linear')]

In [149]:
def model_comparison():

  for model in models:

    classifier = model.fit(x_train , y_train)

    training_data_prediction = model.predict(x_train)

    testing_data_prediction = model.predict(x_test)

    training_data_accuracy = accuracy_score(training_data_prediction , y_train)

    testing_data_accuracy = accuracy_score(testing_data_prediction , y_test)

    print('The training_data_accuracy of ' , model ,'is  =' ,np.round(training_data_accuracy*100 , 2 ))

    print('The testing_data_accuracy of ' , model ,'is  = ' ,np.round(testing_data_accuracy*100, 2 ))

    print('-------------------------------------------------------------------------')

In [150]:
model_comparison()

The training_data_accuracy of  LogisticRegression(max_iter=1000) is  = 78.36
The testing_data_accuracy of  LogisticRegression(max_iter=1000) is  =  87.18
-------------------------------------------------------------------------
The training_data_accuracy of  SVC(kernel='linear') is  = 78.36
The testing_data_accuracy of  SVC(kernel='linear') is  =  83.33
-------------------------------------------------------------------------


In [115]:
from sklearn.model_selection import cross_val_score

***Cross-validation***

In [151]:
def cross_val():

  for model in models:

    cross_val_scores = cross_val_score(model ,x , y , cv = 5 ,scoring = 'accuracy')

    mean_cross_val_scores = np.mean(cross_val_scores)

    print('The cross validation scores for ' , model , 'are ' , cross_val_scores)

    print("The mean cross validation scores for " , model , '=' ,np.round(mean_cross_val_scores),2)

    print('-----------------------------------------------------------------------------')

In [119]:
cross_val()

The cross validation scores for  LogisticRegression(max_iter=1000) are  [0.80769231 0.78846154 0.78640777 0.83495146 0.81553398]
The mean cross validation scores for  LogisticRegression(max_iter=1000) = 0.8066094100074682
The cross validation scores for  SVC(kernel='linear') are  [0.77884615 0.74038462 0.75728155 0.78640777 0.81553398]
The mean cross validation scores for  SVC(kernel='linear') = 0.7756908140403287
