### Importing dataset

In [136]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df=pd.read_csv("loan_prediction_train.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Handling missing values

In [137]:
#number of missing values
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [138]:
df['Gender'].value_counts(dropna=False)


Male      489
Female    112
NaN        13
Name: Gender, dtype: int64

In [139]:
df['Gender'].fillna(value='Male',inplace=True)

In [140]:
df['Dependents'].value_counts(dropna=False)

0      345
1      102
2      101
3+      51
NaN     15
Name: Dependents, dtype: int64

In [141]:
df['Dependents'].fillna(value=0,inplace=True)

In [142]:
df['Married'].value_counts(dropna=False)

Yes    398
No     213
NaN      3
Name: Married, dtype: int64

In [143]:
df['Married'].fillna(value='Yes',inplace=True)

In [144]:
df['Self_Employed'].value_counts(dropna=False)

No     500
Yes     82
NaN     32
Name: Self_Employed, dtype: int64

In [145]:
df['Self_Employed'].fillna(value='No',inplace=True)

In [146]:
df['Loan_Amount_Term'].value_counts(dropna=False)

 360.0    512
 180.0     44
 480.0     15
NaN        14
 300.0     13
 84.0       4
 240.0      4
 120.0      3
 36.0       2
 60.0       2
 12.0       1
Name: Loan_Amount_Term, dtype: int64

In [147]:
df['Loan_Amount_Term'].fillna(value=360.0,inplace=True)

In [148]:
df['Credit_History'].value_counts(dropna=False)

 1.0    475
 0.0     89
NaN      50
Name: Credit_History, dtype: int64

In [149]:
df['Credit_History'].fillna(value=1,inplace=True)

In [150]:
df['LoanAmount'].fillna(value=df['LoanAmount'].mean(),inplace=True)

In [151]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [152]:
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


###### No missing values

### Handling caltegorical data

In [153]:
#create a list of features to dummy
dummy_list=['Gender','Married','Education','Self_Employed','Credit_History','Loan_Status']
#function to dummy all categorical variable used for modelling
def get_dummy(df,dummy_list):
    for x in dummy_list:
        dummies=pd.get_dummies(df[x],drop_first=True)
        
        df[x]=dummies
        #df=pd.concat([df,dummies],axis=1)
    return df
df2=get_dummy(df,dummy_list)
df2.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1,Urban,1
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1,Rural,0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1,Urban,1
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1,Urban,1
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1,Urban,1
5,LP001011,1,1,2,0,1,5417,4196.0,267.0,360.0,1,Urban,1
6,LP001013,1,1,0,1,0,2333,1516.0,95.0,360.0,1,Urban,1
7,LP001014,1,1,3+,0,0,3036,2504.0,158.0,360.0,0,Semiurban,0
8,LP001018,1,1,2,0,0,4006,1526.0,168.0,360.0,1,Urban,1
9,LP001020,1,1,1,0,0,12841,10968.0,349.0,360.0,1,Semiurban,0


In [154]:
dummies=pd.get_dummies(df2['Property_Area'],prefix='Property_Area')
df2=df2.drop('Property_Area',1)
df2=pd.concat([df2,dummies],axis=1)


In [155]:
dummies=pd.get_dummies(df2['Dependents'],prefix='Dependents',drop_first=True)
df2=df2.drop('Dependents',1)
df2=pd.concat([df2,dummies],axis=1)
df2.head()

Unnamed: 0,Loan_ID,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Dependents_0,Dependents_1,Dependents_2,Dependents_3+
0,LP001002,1,0,0,0,5849,0.0,146.412162,360.0,1,1,0,0,1,1,0,0,0
1,LP001003,1,1,0,0,4583,1508.0,128.0,360.0,1,0,1,0,0,0,1,0,0
2,LP001005,1,1,0,1,3000,0.0,66.0,360.0,1,1,0,0,1,1,0,0,0
3,LP001006,1,1,1,0,2583,2358.0,120.0,360.0,1,1,0,0,1,1,0,0,0
4,LP001008,1,0,0,0,6000,0.0,141.0,360.0,1,1,0,0,1,1,0,0,0


### Removing unnecessary columns

In [156]:
df2.drop('Loan_ID',axis=1,inplace=True)
df2.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Dependents_0,Dependents_1,Dependents_2,Dependents_3+
0,1,0,0,0,5849,0.0,146.412162,360.0,1,1,0,0,1,1,0,0,0
1,1,1,0,0,4583,1508.0,128.0,360.0,1,0,1,0,0,0,1,0,0
2,1,1,0,1,3000,0.0,66.0,360.0,1,1,0,0,1,1,0,0,0
3,1,1,1,0,2583,2358.0,120.0,360.0,1,1,0,0,1,1,0,0,0
4,1,0,0,0,6000,0.0,141.0,360.0,1,1,0,0,1,1,0,0,0


### Splitting dependent and independent variable

In [157]:
x_train=df2.drop(['Loan_Status'],axis=1)
y_train=df2['Loan_Status']