In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
loan = pd.read_csv('LoanDatasets.csv')
loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
loan.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [4]:
loan.isnull()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,False,False,False,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,False,False,False,False,False,False,False,False,False,False,False,False,False
627,False,False,False,False,False,False,False,False,False,False,False,False,False
628,False,False,False,False,False,False,False,False,False,False,False,False,False
629,False,False,False,False,False,False,False,False,False,False,False,False,False


In [5]:
print('We have {} Null values in data'.format(loan.isnull().values.sum()))

We have 152 Null values in data


In [6]:
loan.dropna(axis=1, inplace=True)

In [7]:
loan.head()

Unnamed: 0,Loan_ID,Education,ApplicantIncome,CoapplicantIncome,Property_Area,Loan_Status
0,LP001002,Graduate,5849,0.0,Urban,Y
1,LP001003,Graduate,4583,1508.0,Rural,N
2,LP001005,Graduate,3000,0.0,Urban,Y
3,LP001006,Not Graduate,2583,2358.0,Urban,Y
4,LP001008,Graduate,6000,0.0,Urban,Y


In [8]:
print('No of duplicates in data: {}'.format(sum(loan.duplicated())))

No of duplicates in data: 17


In [9]:
loan.drop_duplicates(inplace = True)

In [10]:
print('No of duplicates in data: {}'.format(sum(loan.duplicated())))

No of duplicates in data: 0


In [11]:
loan[["Education"]] = loan[["Education"]].apply(lambda col:pd.Categorical(col).codes)
loan[["Property_Area"]] = loan[["Property_Area"]].apply(lambda col:pd.Categorical(col).codes)
loan[["Loan_Status"]] = loan[["Loan_Status"]].apply(lambda col:pd.Categorical(col).codes)

In [12]:
loan.head()

Unnamed: 0,Loan_ID,Education,ApplicantIncome,CoapplicantIncome,Property_Area,Loan_Status
0,LP001002,0,5849,0.0,2,1
1,LP001003,0,4583,1508.0,0,0
2,LP001005,0,3000,0.0,2,1
3,LP001006,1,2583,2358.0,2,1
4,LP001008,0,6000,0.0,2,1


In [13]:
a=loan.drop("Loan_ID", axis=1)
a

Unnamed: 0,Education,ApplicantIncome,CoapplicantIncome,Property_Area,Loan_Status
0,0,5849,0.0,2,1
1,0,4583,1508.0,0,0
2,0,3000,0.0,2,1
3,1,2583,2358.0,2,1
4,0,6000,0.0,2,1
...,...,...,...,...,...
609,0,2900,0.0,0,1
610,0,4106,0.0,0,1
611,0,8072,240.0,2,1
612,0,7583,0.0,2,1


In [14]:
x_train=a.drop("Loan_Status", axis=1)
x_train

Unnamed: 0,Education,ApplicantIncome,CoapplicantIncome,Property_Area
0,0,5849,0.0,2
1,0,4583,1508.0,0
2,0,3000,0.0,2
3,1,2583,2358.0,2
4,0,6000,0.0,2
...,...,...,...,...
609,0,2900,0.0,0
610,0,4106,0.0,0
611,0,8072,240.0,2
612,0,7583,0.0,2


In [15]:
y_train=loan["Loan_Status"]
y_train

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int8

In [16]:
loan_test = pd.read_csv('LoanDataset_test (1).csv')
loan_test

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


In [17]:
loan_test.isnull()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
362,False,False,False,False,False,False,False,False,False,False,False,False
363,False,False,False,False,False,False,False,False,False,False,False,False
364,False,False,False,False,False,False,False,False,False,False,True,False
365,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
print('We have {} Null values in data'.format(loan_test.isnull().values.sum()))

We have 84 Null values in data


In [19]:
loan_test.dropna(axis=1, inplace=True)

In [20]:
loan_test

Unnamed: 0,Loan_ID,Married,Education,ApplicantIncome,CoapplicantIncome,Property_Area
0,LP001015,Yes,Graduate,5720,0,Urban
1,LP001022,Yes,Graduate,3076,1500,Urban
2,LP001031,Yes,Graduate,5000,1800,Urban
3,LP001035,Yes,Graduate,2340,2546,Urban
4,LP001051,No,Not Graduate,3276,0,Urban
...,...,...,...,...,...,...
362,LP002971,Yes,Not Graduate,4009,1777,Urban
363,LP002975,Yes,Graduate,4158,709,Urban
364,LP002980,No,Graduate,3250,1993,Semiurban
365,LP002986,Yes,Graduate,5000,2393,Rural


In [21]:
print('We have {} Null values in data'.format(loan_test.isnull().values.sum()))


We have 0 Null values in data


In [22]:
loan_test.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
362    False
363    False
364    False
365    False
366    False
Length: 367, dtype: bool

In [23]:
print('No of duplicates in data: {}'.format(sum(loan_test.duplicated())))

No of duplicates in data: 0


In [24]:
loan_test[["Education"]] = loan_test[["Education"]].apply(lambda col:pd.Categorical(col).codes)
loan_test[["Property_Area"]] = loan_test[["Property_Area"]].apply(lambda col:pd.Categorical(col).codes)

In [25]:
x_test=loan_test.drop("Loan_ID", axis=1)
x_test

Unnamed: 0,Married,Education,ApplicantIncome,CoapplicantIncome,Property_Area
0,Yes,0,5720,0,2
1,Yes,0,3076,1500,2
2,Yes,0,5000,1800,2
3,Yes,0,2340,2546,2
4,No,1,3276,0,2
...,...,...,...,...,...
362,Yes,1,4009,1777,2
363,Yes,0,4158,709,2
364,No,0,3250,1993,1
365,Yes,0,5000,2393,0


In [26]:
x_test=x_test.drop("Married", axis=1)

In [27]:
x_test

Unnamed: 0,Education,ApplicantIncome,CoapplicantIncome,Property_Area
0,0,5720,0,2
1,0,3076,1500,2
2,0,5000,1800,2
3,0,2340,2546,2
4,1,3276,0,2
...,...,...,...,...
362,1,4009,1777,2
363,0,4158,709,2
364,0,3250,1993,1
365,0,5000,2393,0


In [28]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Education          367 non-null    int8 
 1   ApplicantIncome    367 non-null    int64
 2   CoapplicantIncome  367 non-null    int64
 3   Property_Area      367 non-null    int8 
dtypes: int64(2), int8(2)
memory usage: 6.6 KB


In [29]:
from sklearn.ensemble import RandomForestClassifier
rf =RandomForestClassifier(n_estimators=100)
rf.fit(x_train,y_train)

RandomForestClassifier()

In [30]:
y_pred=rf.predict(x_test)

In [31]:
y_pred

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [35]:
## Decision Tree Classifier 

In [37]:
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()
dtc.fit(x_train, y_train)
dtc_pred=dtc.predict(x_test)
dtc_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,

In [38]:
# Naive_Bayes classifier 

In [40]:
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()
nb.fit(x_train,y_train)
nb_pred = nb.predict(x_test)
nb_pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,

In [41]:
## Regression

In [42]:
from sklearn import datasets, linear_model, metrics
linearreg= linear_model.LinearRegression()
linearreg.fit(x_train, y_train)
linearreg_pred=linearreg.predict(x_test)
linearreg_pred

array([0.74181021, 0.73069176, 0.72374601, 0.72080588, 0.64330006,
       0.6084617 , 0.62965603, 0.61080443, 0.72657636, 0.60338729,
       0.64365621, 0.71656874, 0.74480192, 0.71372166, 0.72817389,
       0.74191225, 0.71248961, 0.72626973, 0.64952239, 0.69715811,
       0.6284669 , 0.64559678, 0.74529669, 0.64235865, 0.69509716,
       0.46215367, 0.7309155 , 0.69786192, 0.72089381, 0.72141907,
       0.62917089, 0.73192917, 0.71267923, 0.71820251, 0.73089558,
       0.73109244, 0.69513589, 0.71828821, 0.74159267, 0.72459713,
       0.73314631, 0.74580686, 0.60870161, 0.69654823, 0.74576451,
       0.7024786 , 0.62565398, 0.60885326, 0.7036769 , 0.61583868,
       0.72971981, 0.61820219, 0.69716939, 0.73409245, 0.63941699,
       0.74752796, 0.60980142, 0.6919596 , 0.69221447, 0.72806993,
       0.61069551, 0.73615992, 0.72759249, 0.70123789, 0.6409629 ,
       0.69282407, 0.62489317, 0.73052452, 0.72239839, 0.66159594,
       0.73089993, 0.7285647 , 0.72578154, 0.69135591, 0.58607