In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
dataset =pd.read_csv('train.csv')
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


<div align="center"><h2> Data cleaning</h2></div>

In [52]:
#size of data
dataset.shape

(614, 13)

In [53]:
#check status of dataset
dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [54]:
#check for missing values
dataset.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [55]:
#drop missing values (columns)
dataset=dataset.dropna()
dataset.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [56]:
#compare intial size to new size
dataset.shape

(480, 13)

In [57]:
#convert all datatype to numeric values (categorical data)
dataset.replace({"Loan_Status":{'N':0,'Y':1}},inplace=True)
dataset.replace({"Gender":{'Male':0,'Female':1}},inplace=True)
dataset.replace({"Married":{'No':0,'Yes':1}},inplace=True)
dataset.replace({"Education":{'Not Graduate':0,'Graduate':1}},inplace=True)
dataset.replace({"Self_Employed":{'No':0,'Yes':1}},inplace=True)
dataset.replace({"Property_Area":{'Rural':0,'Semiurban':1,'Urban':2}},inplace=True)

In [58]:
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,0,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,0,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,0,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,0,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,0,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


In [59]:
dataset['Dependents'].value_counts()

0     274
2      85
1      80
3+     41
Name: Dependents, dtype: int64

In [60]:
#replace all values with value > 3
dataset=dataset.replace(to_replace='3+',value=4)

In [61]:
dataset['Dependents'].value_counts()

0    274
2     85
1     80
4     41
Name: Dependents, dtype: int64

<div align="center"><h2>Data visualization</h2></div>

In [62]:
import seaborn as sns

In [63]:
#sns.countplot(x='Education',hue='Loan_Status',data=dataset)

In [64]:
#sns.countplot(x='Married',hue='Loan_Status',data=dataset)

<div align="center"><h2>Data Training</h2></div>

In [65]:
#split data into x and y
x= dataset.drop(columns=["Loan_ID","Loan_Status"],axis=1)
y= dataset["Loan_Status"]

In [66]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [69]:
#SVM Alogrithm
model=svm.SVC(kernel='linear')

In [70]:
model.fit(x_train,y_train)

SVC(kernel='linear')

<div align="center">
    <h2> EVALUATION OF THE MODEL </h2>
</div>

In [72]:
#determine the accuracy of the trained model
train_res=model.predict(x_train)
accuracy_score(train_res,y_train)

0.7890625

In [73]:
#determine the accuracy of the test data
test_res=model.predict(x_test)
accuracy_score(test_res,y_test)

0.8229166666666666

<div align="center">
    <h2> CREATING THE PREDICTIVE MODEL </h2>
</div>

In [79]:
import numpy as np
input_data=(0,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0)
input_data_numpy_array=np.asarray(input_data)
input_data_reshaped=input_data_numpy_array.reshape(1,-1)
prediction= model.predict(input_data_reshaped)
res = ["You are denied loan access" if each > 0.8 else "Loan granted" for each in prediction][0]
print(res)

You are denied loan access


<div align="center">
    <h2> SAVING & LOADING THE MODEL </h2>
</div>

In [82]:
import pickle
filename='trained_model.sav'
#Save model in file
#wb write binary file
pickle.dump(model,open(filename,'wb'))

In [83]:
#Load model from file
#rb read binary file
loaded_model= pickle.load(open('trained_model.sav','rb'))

In [84]:
#Test saved model
input_data=(0,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0)
input_data_numpy_array=np.asarray(input_data)
input_data_reshaped=input_data_numpy_array.reshape(1,-1)
prediction= loaded_model.predict(input_data_reshaped)
res = ["You are denied loan access" if each > 0.8 else "Loan granted" for each in prediction][0]
print(res)

You are denied loan access
