In [120]:
import pandas as pd
import numpy as np
import statistics
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): no category is given, so pandas/scikit-learn warnings raised by
# later cells are hidden as well.
warnings.filterwarnings('ignore')

# Read both splits and tag every row with its origin, so the two can be
# separated again after being cleaned together.
train = pd.read_csv('train.csv').assign(Type='train')
test = pd.read_csv('test.csv').assign(Type='test')

In [121]:
# Stack the train and test rows into a single frame so the cleansing steps are
# applied to both at once. sort=True orders the columns alphabetically; each
# split keeps its own row labels.
fulldata = pd.concat([train, test], axis=0, sort=True)

# Total income = applicant income + co-applicant income.
fulldata = fulldata.assign(
    Total_Income=fulldata['ApplicantIncome'] + fulldata['CoapplicantIncome']
)
fulldata.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed,Type,Total_Income
0,5849,0.0,1.0,0,Graduate,Male,,360.0,LP001002,Y,No,Urban,No,train,5849.0
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No,train,6091.0
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes,train,3000.0
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No,train,4941.0
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No,train,6000.0


In [122]:
#define income group based on income range
def process_income_group(row, low_cutoff=4000, high_cutoff=8000):
    """Return the income band for one record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a ``'Total_Income'`` entry.
    low_cutoff : number, default 4000
        Incomes strictly below this value are ``'Low'``.
    high_cutoff : number, default 8000
        Incomes from ``low_cutoff`` up to (but excluding) this value are
        ``'Medium'``; incomes at or above it are ``'High'``.

    Returns
    -------
    str or float
        ``'Low'``, ``'Medium'`` or ``'High'``; ``NaN`` when the income is
        missing.
    """
    total_income = row['Total_Income']

    # A missing income fails every comparison below and would otherwise fall
    # through to 'High'; report it as missing instead.
    if pd.isna(total_income):
        return np.nan

    if total_income < low_cutoff:
        return 'Low'
    if total_income < high_cutoff:
        return 'Medium'
    return 'High'

# Label each row with its income band; the helper reads row['Total_Income'],
# so it is applied row-wise (axis=1).
income_groups = fulldata.apply(process_income_group, axis=1)
fulldata['Income_Group'] = income_groups

In [123]:
# Preview the first five rows, now including the Income_Group column.
fulldata.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed,Type,Total_Income,Income_Group
0,5849,0.0,1.0,0,Graduate,Male,,360.0,LP001002,Y,No,Urban,No,train,5849.0,Medium
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No,train,6091.0,Medium
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes,train,3000.0,Low
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No,train,4941.0,Medium
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No,train,6000.0,Medium


In [124]:
# Count the missing values in every column to see which ones need imputing.
fulldata.isna().sum()

ApplicantIncome        0
CoapplicantIncome      0
Credit_History        79
Dependents            25
Education              0
Gender                24
LoanAmount            27
Loan_Amount_Term      20
Loan_ID                0
Loan_Status          367
Married                3
Property_Area          0
Self_Employed         55
Type                   0
Total_Income           0
Income_Group           0
dtype: int64

In [125]:
#Replace nulls in the categorical columns with each column's most frequent value.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on `fulldata.<col>`: that form is a chained in-place
# write, which is not guaranteed to reach `fulldata` and never does under
# pandas copy-on-write. Series.mode() ignores NaN, so a null can never be
# picked as the fill value.
mode_filled_cols = ['Gender', 'Married', 'Dependents', 'Credit_History', 'Self_Employed']
for col in mode_filled_cols:
    most_frequent = fulldata[col].mode(dropna=True)
    if not most_frequent.empty:  # an all-null column has no mode; leave it untouched
        fulldata[col] = fulldata[col].fillna(most_frequent.iloc[0])

In [126]:
# Re-count missing values per column after the imputation step.
fulldata.isna().sum()

ApplicantIncome        0
CoapplicantIncome      0
Credit_History         0
Dependents             0
Education              0
Gender                 0
LoanAmount            27
Loan_Amount_Term      20
Loan_ID                0
Loan_Status          367
Married                0
Property_Area          0
Self_Employed          0
Type                   0
Total_Income           0
Income_Group           0
dtype: int64

In [127]:
# Turn each categorical column into integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder object is re-fitted for every column, so after the loop it only
# remembers the mapping of the last column processed.
number = LabelEncoder()
cat_cols = [
    'Credit_History',
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Property_Area',
    'Self_Employed',
    'Income_Group',
]

for column in cat_cols:
    # Cast to str first so numeric and text categories are handled uniformly.
    as_text = fulldata[column].astype('str')
    fulldata[column] = number.fit_transform(as_text)

# NOTE(review): the displayed Income_Group codes follow alphabetical label order
# (High=0, Low=1, Medium=2), not income order.
fulldata.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed,Type,Total_Income,Income_Group
0,5849,0.0,1,0,0,1,,360.0,LP001002,Y,0,2,0,train,5849.0,2
1,4583,1508.0,1,1,0,1,128.0,360.0,LP001003,N,1,0,0,train,6091.0,2
2,3000,0.0,1,0,0,1,66.0,360.0,LP001005,Y,1,2,1,train,3000.0,1
3,2583,2358.0,1,0,1,1,120.0,360.0,LP001006,Y,1,2,0,train,4941.0,2
4,6000,0.0,1,0,0,1,141.0,360.0,LP001008,Y,0,2,0,train,6000.0,2


In [128]:
#Separate standardized train & test data.
# .copy() makes each split an independent frame, so the Loan_Status assignment
# below (and the later write of predictions into test_modified) does not
# operate on a filtered slice of fulldata.
train_modified = fulldata[fulldata['Type'] == 'train'].copy()

# Encode the target labels as integers for the classifier.
train_modified['Loan_Status'] = number.fit_transform(train_modified['Loan_Status'].astype('str'))

test_modified = fulldata[fulldata['Type'] == 'test'].copy()

In [129]:
#Model creation & Loan_Status prediction
from sklearn.linear_model import LogisticRegression

# Encoded feature columns fed to the classifier.
predictor_logistics = ['Credit_History', 'Education', 'Gender', 'Married', 'Property_Area', 'Income_Group']

x_train = train_modified[predictor_logistics].values
y_train = train_modified['Loan_Status'].values
x_test = test_modified[predictor_logistics].values

model = LogisticRegression()
model.fit(x_train, y_train)

# Despite the name, y_test holds the model's predictions for the test rows,
# not ground-truth labels.
y_test = model.predict(x_test)

#Final result of test dataset
# assign() builds a new frame instead of writing into test_modified, which may
# still be a filtered slice of fulldata.
# NOTE(review): codes are presumably 0 = 'N', 1 = 'Y' (LabelEncoder label order) - confirm.
test_modified = test_modified.assign(Loan_Status=y_test)

In [130]:
# How many test rows were predicted for each Loan_Status code.
test_modified['Loan_Status'].value_counts()

1    308
0     59
Name: Loan_Status, dtype: int64

In [131]:
# Preview the first test rows whose predicted Loan_Status code is 0.
test_modified.loc[test_modified['Loan_Status'].eq(0)].head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed,Type,Total_Income,Income_Group
7,3881,0.0,0,2,1,1,147.0,360.0,LP001056,0,1,0,0,test,3881.0,1
13,12173,0.0,0,2,0,1,166.0,360.0,LP001094,0,1,1,0,test,12173.0,0
25,0,24000.0,0,0,0,1,148.0,360.0,LP001153,0,0,0,0,test,24000.0,0
35,3150,0.0,0,0,0,1,176.0,360.0,LP001203,0,0,1,0,test,3150.0,1
55,2750,0.0,0,0,0,1,130.0,360.0,LP001313,0,0,2,0,test,2750.0,1
