In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
def OneHotEncode(data, column_name):
    """Swap one categorical column for its indicator (dummy) columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode. The input frame is not modified.
    column_name : object
        Label of the column to encode; it is passed through ``str`` before
        being used for the lookup and the drop.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the encoded column has been removed and the
        columns produced by ``pd.get_dummies`` have been joined (on the
        index) after the remaining columns.
    """
    label = str(column_name)
    indicators = pd.get_dummies(data[label])
    # Remove the source column first, then attach the indicators index-wise.
    return data.drop(label, axis=1).join(indicators)

In [3]:
def DropDummy(data, column_to_drop, column_to_keep, new_column):
    """Reduce a pair of dummy columns to a single column under a new name.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both dummy columns. The input frame is not modified.
    column_to_drop : label
        Dummy column that is discarded.
    column_to_keep : label
        Dummy column whose values are retained.
    new_column : label
        Name under which the retained values are stored.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column_to_drop`` and ``column_to_keep``; the
        retained values live in ``new_column`` (appended as the last column
        when that label did not exist before).
    """
    remaining = data.drop(columns=column_to_drop)
    # Copy the kept values under the new label before removing the old one.
    remaining[new_column] = remaining[column_to_keep]
    return remaining.drop(columns=column_to_keep)

In [4]:
# Load the loan data; 'train.csv' is a relative path, so it is resolved
# against the working directory the notebook is run from.
df = pd.read_csv('train.csv')

In [5]:
# Preview the first rows of the data as loaded.
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
# Number of missing values in each column.
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
# Keep only complete rows: a row is removed as soon as any of its values is missing.
df = df.dropna(axis=0, how='any')

In [8]:
# Replace 'Gender' with its indicator columns (one column per category value).
df = OneHotEncode(df, 'Gender')

In [9]:
# Check the indicator columns that replaced 'Gender'.
df.head()

Unnamed: 0,Loan_ID,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Female,Male
1,LP001003,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0,1
2,LP001005,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,0,1
3,LP001006,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,0,1
4,LP001008,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,0,1
5,LP001011,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,0,1


In [10]:
# Discard the 'Female' indicator and keep the 'Male' indicator under the name
# 'Gender', so a value of 1 in 'Gender' marks a male applicant.
df = DropDummy(df, 'Female', 'Male', 'Gender')

In [11]:
# Confirm that a single 'Gender' indicator column remains.
df.head()

Unnamed: 0,Loan_ID,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender
1,LP001003,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1
2,LP001005,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1
3,LP001006,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1
4,LP001008,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1
5,LP001011,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,1


In [12]:
# Encoding other columns & removing dummies
#
# Each categorical column is expanded into indicator columns and one indicator
# per column is then discarded. 'Married' must be fully collapsed before
# 'Self_Employed' is encoded, because both produce indicators named 'No'/'Yes'.

# Education -> keep only the 'Graduate' indicator
df = OneHotEncode(df, 'Education').drop(columns='Not Graduate')

# Married -> single 'Married' indicator taken from the 'Yes' column
df = DropDummy(OneHotEncode(df, 'Married'), 'No', 'Yes', 'Married')

# Self_Employed -> single 'Self_Employed' indicator taken from the 'Yes' column
df = DropDummy(OneHotEncode(df, 'Self_Employed'), 'No', 'Yes', 'Self_Employed')

# Property_Area -> drop the 'Semiurban' indicator, keep the others
df = OneHotEncode(df, 'Property_Area').drop(columns='Semiurban')

In [13]:
# Preview the frame after the remaining categorical columns were encoded.
df.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender,Graduate,Married,Self_Employed,Rural,Urban
1,LP001003,1,4583,1508.0,128.0,360.0,1.0,N,1,1,1,0,1,0
2,LP001005,0,3000,0.0,66.0,360.0,1.0,Y,1,1,1,1,0,1
3,LP001006,0,2583,2358.0,120.0,360.0,1.0,Y,1,0,1,0,0,1
4,LP001008,0,6000,0.0,141.0,360.0,1.0,Y,1,1,0,0,0,1
5,LP001011,2,5417,4196.0,267.0,360.0,1.0,Y,1,1,1,1,0,1


In [14]:
# Standardizing Dependents type
#
# 'Dependents' contains the open-ended category '3+', which is mapped to 3.
# The column is addressed by name and converted in one vectorised step rather
# than by looping over a hard-coded row count and column position, both of
# which break silently when the number of rows or the column order changes.
# Casting to int also leaves the column with one numeric dtype instead of a
# mix of strings and integers.
df['Dependents'] = df['Dependents'].replace('3+', '3').astype(int)

In [15]:
# Create the label column as a placeholder at the end of the frame; its values
# are filled in by the label-encoding cell that follows.
df['Loan_Granted'] = ' '

In [16]:
# Encoding label
#
# 'Loan_Granted' is 1 where 'Loan_Status' is 'Y' and 0 for every other value.
# The comparison is vectorised and uses column names, so it does not depend on
# a hard-coded row count or on the position of 'Loan_Status' in the frame.
# The label is stored as integers (0/1) rather than the strings '0'/'1'.
df['Loan_Granted'] = (df['Loan_Status'] == 'Y').astype(int)

In [17]:
# Preview the frame with the 'Loan_Granted' label column filled in.
df.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender,Graduate,Married,Self_Employed,Rural,Urban,Loan_Granted
1,LP001003,1,4583,1508.0,128.0,360.0,1.0,N,1,1,1,0,1,0,0
2,LP001005,0,3000,0.0,66.0,360.0,1.0,Y,1,1,1,1,0,1,1
3,LP001006,0,2583,2358.0,120.0,360.0,1.0,Y,1,0,1,0,0,1,1
4,LP001008,0,6000,0.0,141.0,360.0,1.0,Y,1,1,0,0,0,1,1
5,LP001011,2,5417,4196.0,267.0,360.0,1.0,Y,1,1,1,1,0,1,1


In [18]:
# 'Loan_Status' is superseded by 'Loan_Granted', and 'Loan_ID' is an identifier
# rather than a feature; remove both in a single call.
df = df.drop(columns=['Loan_Status', 'Loan_ID'])

In [19]:
# Preview the table used for modelling: feature columns followed by the label.
df.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender,Graduate,Married,Self_Employed,Rural,Urban,Loan_Granted
1,1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,1,0,0
2,0,3000,0.0,66.0,360.0,1.0,1,1,1,1,0,1,1
3,0,2583,2358.0,120.0,360.0,1.0,1,0,1,0,0,1,1
4,0,6000,0.0,141.0,360.0,1.0,1,1,0,0,0,1,1
5,2,5417,4196.0,267.0,360.0,1.0,1,1,1,1,0,1,1


In [20]:
# Preparing variables for model
#
# The target is selected by name instead of by position, so the split stays
# correct even if columns are added or reordered earlier in the notebook.
X = df.drop(columns='Loan_Granted')
y = df['Loan_Granted']

In [21]:
# Using Logistic Regression
#
# The estimator is built with library defaults; the repr printed after fitting
# lists the effective settings (e.g. penalty='l2', C=1.0, max_iter=100).
# NOTE(review): no feature scaling step appears before fitting in the cells
# shown here; confirm this is intended for the income/amount columns.

model = LogisticRegression()

In [22]:
# Fit on every prepared row; the cells shown here do not set aside a hold-out split.
model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# NOTE: predictions are made on the same rows the model was fitted on, so the
# scores computed from y_pred are in-sample (training) figures and should not
# be read as an estimate of performance on unseen data.
y_pred = model.predict(X)

In [24]:
# Training-set accuracy. Arguments follow scikit-learn's (y_true, y_pred)
# order, matching the confusion_matrix call in this notebook; the value is
# the same either way because accuracy is symmetric in its two arguments.
print(accuracy_score(y, y_pred))

0.8145833333333333


In [25]:
# Rows are the true classes and columns the predicted classes (scikit-learn
# convention), with classes in sorted label order.
confusion_matrix(y, y_pred)

array([[ 66,  82],
       [  7, 325]], dtype=int64)