this is a demonstration on using Support Vector Machine for a loan status prediction

In [1]:
# import dependencies/libraries

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

train_test_split is helpful to spilt our data for testing purposes
sklearn is a machine learning library in python
am using Kaggle's dataset: https://www.kaggle.com/datasets/altruistdelhite04/loan-prediction-problem-dataset?resource=download&select=train_u6lujuX_CVtuZ9i.csv



In [2]:
# read dataset

loan_dataset = pd.read_csv('dataset/train-dataset.csv')

In [3]:
# read the first 5 rows of our dataset
loan_dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


remember, the naming is from 0, 1,... so, the first 5 would be 0 through 4

In [4]:
# check number of rows and columns

loan_dataset.shape

(614, 13)

In [5]:
loan_dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [6]:
# missing values from the dataset

loan_dataset.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

wow, that's a lot! the highest being credit history of 50 values

now, we drop the missing values. this is not the standard way of doing things. we normally substitute these values with mean and mode after Exploratory Data Analysis(EDA).
for now, we will drop the missing values 🤫


In [7]:
loan_dataset = loan_dataset.dropna()

In [8]:
# check if values have been dropped

loan_dataset.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

so, all missing values have been dropped from our dataset

In [9]:
# check the shape

loan_dataset.shape

(480, 13)

we've moved from RC(row, column) of (614,13) to (480,13). that's 134 rows dropped. again, this is not the preferred way of dealing with missing/null values.

let's now convert features to numeric.
for loan status: Y (yes) and N(no) will be 1 and, 0 respectively
then, count the dependents column

In [10]:
loan_dataset['Dependents'].value_counts()

Dependents
0     274
2      85
1      80
3+     41
Name: count, dtype: int64

In [11]:
# replace 3+ value to 4

loan_dataset = loan_dataset.replace(to_replace='3+', value=4)

In [12]:
# check if the replacement has occured

loan_dataset['Dependents'].value_counts()

Dependents
0    274
2     85
1     80
4     41
Name: count, dtype: int64

In [13]:
# now, we convert all catergorical columns to numerical ones like married, gender, etc

loan_dataset.replace({'Married': {'No':0, 'Yes': 1}, 'Gender':{'Male':1, 'Female':0},'Self_Employed':{'No':0, 'Yes':1},'Property_Area':{'Rural': 0, 'Semiurban':1, 'Urban':2},'Education':{'Graduate':1, 'Not Graduate':0}},inplace=True)

In [14]:
loan_dataset. head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,Y
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,Y
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,Y
6,LP001013,1,1,0,0,0,2333,1516.0,95.0,360.0,1.0,2,Y
7,LP001014,1,1,4,1,0,3036,2504.0,158.0,360.0,0.0,1,N
8,LP001018,1,1,2,1,0,4006,1526.0,168.0,360.0,1.0,2,Y
9,LP001020,1,1,1,1,0,12841,10968.0,349.0,360.0,1.0,1,N
10,LP001024,1,1,2,1,0,3200,700.0,70.0,360.0,1.0,2,Y


In [15]:
loan_dataset.shape

(480, 13)

dropping a column so as it's represented by Y(labels) and others by X(features)

In [16]:
features=loan_dataset.drop(columns=['Loan_ID','Loan_Status'],axis=1)
labels=loan_dataset['Loan_Status']


In [17]:
# spilt data to test and train sets
# when using algorithms such as random forest, we tune hyperparameters and thus include validation datasets

X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

In [18]:
# print the results

print(len(labels), len(Y_train),len(X_train), len(X_test), len(X_train))

480 288 288 192 288


In [19]:
# introduce our model

support_vector_machine = svm.SVC(kernel='linear')

we chose the SVC(kernel='linear') as it has more flexibility in terms of choice of penalities and loss functions, and should scale better to large number of samples

In [20]:
support_vector_machine.fit(X_train, Y_train)

let's now train the model

In [21]:
X_train_prediction = support_vector_machine.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [25]:
# print the accuracy_score
print(f'Accuracy on training data: {training_data_accuracy:.5f}')

Accuracy on training data: 0.76389


testing the data

In [26]:
X_test_prediction = support_vector_machine.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [28]:
# print the accuracy_score

print (f'Accuracy on test data: : {test_data_accuracy:.5f}')

Accuracy on test data: : 0.72396


conclusion: 
the accuracy of the model to predict loan status recipients will be 72% on test data and 76% on train data