In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv("loan.csv")
df.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer

In [3]:
#axis is required(key error is thrown)[S1:drop which are not neccesary]
df.drop(['Loan_ID','Gender','Married','Dependents','CoapplicantIncome'],axis=1,inplace=True)
df

Unnamed: 0,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Graduate,No,5849,,360.0,1.0,Urban,Y
1,Graduate,No,4583,128.0,360.0,1.0,Rural,N
2,Graduate,Yes,3000,66.0,360.0,1.0,Urban,Y
3,Not Graduate,No,2583,120.0,360.0,1.0,Urban,Y
4,Graduate,No,6000,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...
609,Graduate,No,2900,71.0,360.0,1.0,Rural,Y
610,Graduate,No,4106,40.0,180.0,1.0,Rural,Y
611,Graduate,No,8072,253.0,360.0,1.0,Urban,Y
612,Graduate,No,7583,187.0,360.0,1.0,Urban,Y


In [4]:
#S2:find the null values
df.isnull().sum()

Education            0
Self_Employed       32
ApplicantIncome      0
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
Property_Area        0
Loan_Status          0
dtype: int64

In [5]:
#s3:fill all the missing values
df['Self_Employed'].fillna(method='ffill',inplace=True)

In [6]:
df['LoanAmount'].fillna(df['LoanAmount'].mean(),inplace=True)

In [7]:
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean(),inplace=True)

In [8]:
df['Credit_History'].fillna(method='ffill',inplace=True)

In [9]:
#s4:Recheck for null values
df.isnull().sum()

Education           0
Self_Employed       0
ApplicantIncome     0
LoanAmount          0
Loan_Amount_Term    0
Credit_History      0
Property_Area       0
Loan_Status         0
dtype: int64

In [10]:
df.shape

(614, 8)

In [11]:
#s5:Convert the string values to float values(i.e encode the string(yes/no) values to (1/0)using dummies
Education = pd.get_dummies(df['Education'],drop_first=True)

In [12]:
Self_Employed = pd.get_dummies(df['Self_Employed'],drop_first=True)

In [13]:
Property_Area = pd.get_dummies(df['Property_Area'],drop_first=True)

In [14]:
Loan_Status =pd.get_dummies(df['Loan_Status'],drop_first=True)

In [15]:
#s6:concat the columns which you encoded(mod_df is the new dataframe formed after concat that is why not using inplace)
mod_df = pd.concat([df,Education,Self_Employed,Property_Area,Loan_Status],axis=1)

In [16]:
mod_df.head(2)

Unnamed: 0,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Not Graduate,Yes,Semiurban,Urban,Y
0,Graduate,No,5849,146.412162,360.0,1.0,Urban,Y,0,0,0,1,1
1,Graduate,No,4583,128.0,360.0,1.0,Rural,N,0,0,0,0,0


In [17]:
mod_df.drop(["Education","Self_Employed","Property_Area","Loan_Status"],axis = 1 ,inplace = True)

In [18]:
mod_df

Unnamed: 0,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Not Graduate,Yes,Semiurban,Urban,Y
0,5849,146.412162,360.0,1.0,0,0,0,1,1
1,4583,128.000000,360.0,1.0,0,0,0,0,0
2,3000,66.000000,360.0,1.0,0,1,0,1,1
3,2583,120.000000,360.0,1.0,1,0,0,1,1
4,6000,141.000000,360.0,1.0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...
609,2900,71.000000,360.0,1.0,0,0,0,0,1
610,4106,40.000000,180.0,1.0,0,0,0,0,1
611,8072,253.000000,360.0,1.0,0,0,0,1,1
612,7583,187.000000,360.0,1.0,0,0,0,1,1


In [19]:
#s8:check for null values in the new dataframe
mod_df.isnull().sum()

ApplicantIncome     0
LoanAmount          0
Loan_Amount_Term    0
Credit_History      0
Not Graduate        0
Yes                 0
Semiurban           0
Urban               0
Y                   0
dtype: int64

In [20]:
#s9:check the datatype for the new dataframe
mod_df.dtypes

ApplicantIncome       int64
LoanAmount          float64
Loan_Amount_Term    float64
Credit_History      float64
Not Graduate          uint8
Yes                   uint8
Semiurban             uint8
Urban                 uint8
Y                     uint8
dtype: object

In [21]:
mod_df.shape

(614, 9)

In [22]:
X = mod_df.iloc[:,0:8]
Y = mod_df.iloc[:,8]

In [23]:
print(type(X))

<class 'pandas.core.frame.DataFrame'>


In [24]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)

In [25]:
print(type(rescaledX))

<class 'numpy.ndarray'>


In [26]:
print(rescaledX)
#print(Y)

[[ 0.07299082  0.          0.27985054 ... -0.40358244 -0.7820157
   1.42814704]
 [-0.13441195 -0.21927331  0.27985054 ... -0.40358244 -0.7820157
  -0.70020801]
 [-0.39374734 -0.957641    0.27985054 ...  2.47780848 -0.7820157
   1.42814704]
 ...
 [ 0.43717437  1.26937121  0.27985054 ... -0.40358244 -0.7820157
   1.42814704]
 [ 0.35706382  0.4833669   0.27985054 ... -0.40358244 -0.7820157
   1.42814704]
 [-0.13441195 -0.15972753  0.27985054 ...  2.47780848  1.2787467
  -0.70020801]]


In [27]:
print(type(rescaledX))

<class 'numpy.ndarray'>


In [28]:
print(Y)

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Y, Length: 614, dtype: uint8


In [29]:
X_train,X_test,Y_train,Y_test = train_test_split(rescaledX,Y,random_state=0,test_size=0.2)

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(rescaledX, Y, test_size=0.30, random_state=42, stratify = Y)
maxK = 15
accuracies = []
for k in range(1,maxK+1):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    acc = knn.score(X_test, Y_test)*100
    print('For k =',k, 'Accuracy = ', acc, '%')
    accuracies.append(acc)

For k = 1 Accuracy =  77.29729729729729 %
For k = 2 Accuracy =  70.8108108108108 %
For k = 3 Accuracy =  79.45945945945945 %
For k = 4 Accuracy =  75.13513513513513 %
For k = 5 Accuracy =  83.24324324324324 %
For k = 6 Accuracy =  81.08108108108108 %
For k = 7 Accuracy =  82.16216216216216 %
For k = 8 Accuracy =  81.08108108108108 %
For k = 9 Accuracy =  82.16216216216216 %
For k = 10 Accuracy =  81.62162162162161 %
For k = 11 Accuracy =  82.16216216216216 %
For k = 12 Accuracy =  81.08108108108108 %
For k = 13 Accuracy =  82.70270270270271 %
For k = 14 Accuracy =  82.16216216216216 %
For k = 15 Accuracy =  82.16216216216216 %


In [31]:
import math
math.sqrt(len(Y_test))

13.601470508735444

In [32]:
classifier = KNeighborsClassifier(n_neighbors=11, p=2,metric='euclidean')

In [33]:
classifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [34]:
Y_pred = classifier.predict(X_test)

In [35]:
#print(len(Y_pred))
#print(len(Y_train))
#print(Y_test)
print(len(Y_test))
#print(Y_pred)

185


In [36]:
print(f1_score(Y_test, Y_pred))

0.8808664259927798


In [37]:
print(accuracy_score(Y_test,Y_pred))

0.8216216216216217


In [38]:
classifier.predict([[5849,146.412162,360.0,1.0,0,0,0,1]])

array([1], dtype=uint8)

In [39]:
classifier.predict([[4583,128.000000,360.0,1.0,0,0,0,0]])

array([1], dtype=uint8)

In [40]:
##predicting using logistic regression

In [41]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [42]:
model.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
y_predicted = model.predict(X_test)

In [44]:
print(f1_score(Y_test, Y_pred))

0.8808664259927798


In [45]:
model.score(X_test,Y_test)

0.8432432432432433

In [46]:
###applying the bayes 

In [47]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, Y_train)


GaussianNB(priors=None, var_smoothing=1e-09)

In [48]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [49]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)


In [50]:
cm

array([[ 33,  25],
       [  5, 122]], dtype=int64)

In [51]:
print(f1_score(Y_test, y_pred))

0.8905109489051095
