In [111]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif
%matplotlib inline

In [112]:
df = pd.read_csv('loan_DT_NB.csv')
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


### Preprocessing

In [113]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [114]:
# NOTE: TAKEN FROM LECTURE NOTES
df['Gender']=df['Gender'].map({'Male':1,'Female':0})
df['Married']=df['Married'].map({'Yes':1,'No':0})
df['Education']=df['Education'].map({'Graduate':1,'Not Graduate':0})
df['Dependents'].replace('3+',3,inplace=True)
df['Self_Employed']=df['Self_Employed'].map({'Yes':1,'No':0})
df['Property_Area']=df['Property_Area'].map({'Semiurban':1,'Urban':2,'Rural':3})
df['Loan_Status']=df['Loan_Status'].map({'Y':1,'N':0})

# NOTE: TAKEN FROM LECTURE NOTES
null_col=['Gender','Married','Dependents','Self_Employed','Credit_History','LoanAmount','Loan_Amount_Term']
df[null_col]=df[null_col].replace({np.nan:df['Gender'].mode(),
                                   np.nan:df['Married'].mode(),
                                   np.nan:df['Dependents'].mode(),
                                   np.nan:df['Self_Employed'].mode(),
                                   np.nan:df['Credit_History'].mode(),
                                   np.nan:df['LoanAmount'].mean(),
                                   np.nan:df['Loan_Amount_Term'].mean()})

In [115]:
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1.0,0.0,0,1,0.0,5849,0.0,342.0,360.0,1.0,2,1
1,LP001003,1.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,3,0
2,LP001005,1.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1.0,1.0,0,0,0.0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1.0,1.0,2,1,1.0,5417,4196.0,267.0,360.0,1.0,2,1
6,LP001013,1.0,1.0,0,0,0.0,2333,1516.0,95.0,360.0,1.0,2,1
7,LP001014,1.0,1.0,3,1,0.0,3036,2504.0,158.0,360.0,0.0,1,0
8,LP001018,1.0,1.0,2,1,0.0,4006,1526.0,168.0,360.0,1.0,2,1
9,LP001020,1.0,1.0,1,1,0.0,12841,10968.0,349.0,360.0,1.0,1,0


In [116]:
# defining dep and indep variables
# feature engineering
X_df = df.drop(columns=['Loan_ID','Property_Area','Loan_Status'])
Y_df = df[['Loan_Status']]

In [118]:
# Normalizing the numerical columns of X
X_num_cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
for col in X_num_cols:
    X_df[col] = MinMaxScaler().fit_transform(np.array(X_df[col].values).reshape(-1,1))

In [119]:
# X_df after normalization
X_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1.0,0.0,0,1,0.0,0.070489,0.000000,0.481910,0.743590,1.0
1,1.0,1.0,1,1,0.0,0.054830,0.036192,0.172214,0.743590,1.0
2,1.0,1.0,0,1,1.0,0.035250,0.000000,0.082489,0.743590,1.0
3,1.0,1.0,0,0,0.0,0.030093,0.056592,0.160637,0.743590,1.0
4,1.0,0.0,0,1,0.0,0.072356,0.000000,0.191027,0.743590,1.0
...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0,1,0.0,0.034014,0.000000,0.089725,0.743590,1.0
610,1.0,1.0,3,1,0.0,0.048930,0.000000,0.044863,0.358974,1.0
611,1.0,1.0,1,1,0.0,0.097984,0.005760,0.353111,0.743590,1.0
612,1.0,1.0,2,1,0.0,0.091936,0.000000,0.257598,0.743590,1.0


In [120]:
# numeric and categorical data to apply feature selection
X_df_numeric = X_df[['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Dependents']]
X_df_cat = X_df[['Gender','Married','Education','Self_Employed']]

In [121]:
X_df_numeric

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Dependents
0,0.070489,0.000000,0.481910,0.743590,0
1,0.054830,0.036192,0.172214,0.743590,1
2,0.035250,0.000000,0.082489,0.743590,0
3,0.030093,0.056592,0.160637,0.743590,0
4,0.072356,0.000000,0.191027,0.743590,0
...,...,...,...,...,...
609,0.034014,0.000000,0.089725,0.743590,0
610,0.048930,0.000000,0.044863,0.358974,3
611,0.097984,0.005760,0.353111,0.743590,1
612,0.091936,0.000000,0.257598,0.743590,2


In [122]:
X_df_cat

Unnamed: 0,Gender,Married,Education,Self_Employed
0,1.0,0.0,1,0.0
1,1.0,1.0,1,0.0
2,1.0,1.0,1,1.0
3,1.0,1.0,0,0.0
4,1.0,0.0,1,0.0
...,...,...,...,...
609,0.0,0.0,1,0.0
610,1.0,1.0,1,0.0
611,1.0,1.0,1,0.0
612,1.0,1.0,1,0.0


In [123]:
#Input Categorical and Output categorical
Selector_cat = SelectKBest(chi2, k=3)
Selector_cat.fit(X_df_cat.values, Y_df.values.ravel())
mask_cat = Selector_cat.get_support()
print(mask_cat)

[ True  True False  True]


In [124]:
#Input numerical and output Categorical so applying ANOVA
Selector_num = SelectKBest(f_classif, k=4)
Selector_num.fit(X_df_numeric.values, Y_df.values.ravel())
mask_num = Selector_num.get_support()
print(mask_num)

[False  True  True  True  True]


**Feature selection suggests that we should remove applicant income! Income is an important part of acquiring Loan, Hence not applying Feature selection for numerical data**

In [125]:
#Applycing feature selection for categorical data hence the final df is:
X_df = X_df.drop(columns=['Education'])

In [126]:
#Train and Test
X = X_df.values
Y = Y_df.values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# Models

### Naive Bayes

In [105]:
clf_gnb = GaussianNB()
clf_gnb.fit(X_train, Y_train.ravel())
y_gnb = clf_gnb.predict(X_test)

In [106]:
print(f'f1_score for Naive Bayes is {f1_score(y_gnb, Y_test)}')
print(f'accuracy score for Naive Bayes is {accuracy_score(y_gnb, Y_test)}')

f1_score for Naive Bayes is 0.04347826086956522
accuracy score for Naive Bayes is 0.2845528455284553


### Random Forest

In [107]:
clf_rf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=0)
clf_rf.fit(X_train, Y_train.ravel())
y_rf = clf_rf.predict(X_test)

In [108]:
print(f'f1_score for Random Forest is {f1_score(y_rf, Y_test)}')
print(f'accuracy score for Random Forest is {accuracy_score(y_rf, Y_test)}')

f1_score for Random Forest is 0.8934010152284264
accuracy score for Random Forest is 0.8292682926829268


### Decision Tree

In [109]:
clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_train, Y_train.ravel())
y_dt = clf_dt.predict(X_test)

In [110]:
print(f'f1_score for Decesion Tree is {f1_score(y_dt, Y_test)}')
print(f'accuracy score for Decesion Tree is {accuracy_score(y_dt, Y_test)}')

f1_score for Decesion Tree is 0.7602339181286549
accuracy score for Decesion Tree is 0.6666666666666666


### Logistic Regression

In [30]:
# used different max iter and solver to make to algorithm converge
clf_lr = LogisticRegression(solver='newton-cg', max_iter=200)
clf_lr.fit(X_train, Y_train.ravel())
y_lr = clf_lr.predict(X_test)

In [33]:
print(f'f1_score for Logistic Regression is {f1_score(y_lr, Y_test)}')
print(f'accuracy score for Logistic Regression is {accuracy_score(y_lr, Y_test)}')

f1_score for Logistic Regression is 0.8450704225352113
accuracy score for Logistic Regression is 0.7317073170731707


## Conclusion

Naive Bayes is performing worse while Random Forest has the best performance. Logistic regression wasn't converging so changes the method to newton-cg and max iteration from 100 to 200. Decesion Tree has better results than Naive Bayes but but worse than logistic and Random Forest. Random Forest is Ensemble model, it uses weak learners like Decision Tree and based on average or majority it makes the prediction.