In [79]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

In [18]:
df = pd.read_csv('loan_DT_NB.csv')
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


### Preprocessing

In [7]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [19]:
# NOTE: TAKEN FROM LECTURE NOTES
df['Gender']=df['Gender'].map({'Male':1,'Female':0})
df['Married']=df['Married'].map({'Yes':1,'No':0})
df['Education']=df['Education'].map({'Graduate':1,'Not Graduate':0})
df['Dependents'].replace('3+',3,inplace=True)
df['Self_Employed']=df['Self_Employed'].map({'Yes':1,'No':0})
df['Property_Area']=df['Property_Area'].map({'Semiurban':1,'Urban':2,'Rural':3})
df['Loan_Status']=df['Loan_Status'].map({'Y':1,'N':0})

# NOTE: TAKEN FROM LECTURE NOTES
null_col=['Gender','Married','Dependents','Self_Employed','Credit_History','LoanAmount','Loan_Amount_Term']
df[null_col]=df[null_col].replace({np.nan:df['Gender'].mode(),
                                   np.nan:df['Married'].mode(),
                                   np.nan:df['Dependents'].mode(),
                                   np.nan:df['Self_Employed'].mode(),
                                   np.nan:df['Credit_History'].mode(),
                                   np.nan:df['LoanAmount'].mean(),
                                   np.nan:df['Loan_Amount_Term'].mean()})

In [20]:
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1.0,0.0,0,1,0.0,5849,0.0,342.0,360.0,1.0,2,1
1,LP001003,1.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,3,0
2,LP001005,1.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1.0,1.0,0,0,0.0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1.0,1.0,2,1,1.0,5417,4196.0,267.0,360.0,1.0,2,1
6,LP001013,1.0,1.0,0,0,0.0,2333,1516.0,95.0,360.0,1.0,2,1
7,LP001014,1.0,1.0,3,1,0.0,3036,2504.0,158.0,360.0,0.0,1,0
8,LP001018,1.0,1.0,2,1,0.0,4006,1526.0,168.0,360.0,1.0,2,1
9,LP001020,1.0,1.0,1,1,0.0,12841,10968.0,349.0,360.0,1.0,1,0


In [27]:
# defining dep and indep variables
# feature engineering
X = df.drop(columns=['Loan_ID','Property_Area','Loan_Status']).values
Y = df[['Loan_Status']].values

In [82]:
X

array([[1.0, 0.0, '0', ..., 342.0, 360.0, 1.0],
       [1.0, 1.0, '1', ..., 128.0, 360.0, 1.0],
       [1.0, 1.0, '0', ..., 66.0, 360.0, 1.0],
       ...,
       [1.0, 1.0, '1', ..., 253.0, 360.0, 1.0],
       [1.0, 1.0, '2', ..., 187.0, 360.0, 1.0],
       [0.0, 0.0, '0', ..., 133.0, 360.0, 0.0]], dtype=object)

In [81]:
# Normalizing the numerical columns of X
X_num_cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
for col in X_num_cols:
    X[col] = MinMaxScaler().fit_transform(np.array(X[col].values).reshape(-1,1))

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [32]:
#Train and Test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# Models

### Naive Bayes

In [55]:
clf_gnb = GaussianNB()
clf_gnb.fit(X_train, Y_train.ravel())
y_gnb = clf_gnb.predict(X_test)

In [56]:
print(f'f1_score for Naive Bayes is {f1_score(y_gnb, Y_test)}')
print(f'accuracy score for Naive Bayes is {accuracy_score(y_gnb, Y_test)}')

f1_score for Naive Bayes is 0.06451612903225806
accuracy score for Naive Bayes is 0.2926829268292683


### Random Forest

In [73]:
clf_rf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=0)
clf_rf.fit(X_train, Y_train.ravel())
y_rf = clf_rf.predict(X_test)

In [74]:
print(f'f1_score for Random Forest is {f1_score(y_rf, Y_test)}')
print(f'accuracy score for Random Forest is {accuracy_score(y_rf, Y_test)}')

f1_score for Random Forest is 0.8934010152284264
accuracy score for Random Forest is 0.8292682926829268


### Decision Tree

In [75]:
clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_train, Y_train.ravel())
y_dt = clf_dt.predict(X_test)

In [76]:
print(f'f1_score for Decesion Tree is {f1_score(y_dt, Y_test)}')
print(f'accuracy score for Decesion Tree is {accuracy_score(y_dt, Y_test)}')

f1_score for Decesion Tree is 0.7647058823529411
accuracy score for Decesion Tree is 0.6747967479674797


### Logistic Regression

In [78]:
clf_lr = LogisticRegression()
clf_lr.fit(X_train, Y_train.ravel())
y_lr = clf_lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
