In [15]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn import ensemble # required for RandomForest
from sklearn import naive_bayes # required for naive bayes
from sklearn import neighbors # for kNN
from sklearn import linear_model
from sklearn import model_selection
from sklearn import feature_selection # required for RFE
from sklearn import preprocessing
from sklearn import metrics
# NOTE(review): this silences every warning for the rest of the session,
# including pandas chained-assignment and deprecation warnings that are relevant
# to the preprocessing cells below. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [16]:
def printmetrics(actual, predicted, scores=None):
    """Print AUC, accuracy, precision, recall and F1 for a binary classifier.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    predicted : array-like
        Predicted hard class labels, aligned with ``actual``.
    scores : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]``). When supplied, AUC is computed from
        these scores. When omitted, AUC falls back to ``predicted``, which
        matches the previous behaviour but only reflects a single operating
        point rather than the classifier's ranking quality.

    Returns
    -------
    None
        Each metric is rounded to 4 decimals and printed.
    """
    # ROC AUC is a ranking metric, so prefer continuous scores when available.
    auc_input = predicted if scores is None else scores
    print('AUC : ', np.round(metrics.roc_auc_score(actual, auc_input), 4))
    print('Accuracy :', np.round(metrics.accuracy_score(actual, predicted), 4))
    print('Precision : ', np.round(metrics.precision_score(actual, predicted), 4))
    print('Recall : ', np.round(metrics.recall_score(actual, predicted), 4))
    print('F1 : ', np.round(metrics.f1_score(actual, predicted), 4))

In [17]:
# Load the loan data, drop the identifier column, and keep only complete rows.
df = (
    pd.read_csv('loan_data_set.csv')
    .drop(columns='Loan_ID')
    .dropna()
)

# Short column names, assigned positionally: the list order must match the
# order of the CSV columns that remain once Loan_ID is dropped.
newcols = [
    'gender', 'married', 'dependents', 'edu', 'selfemp', 'appinc',
    'coappinc', 'amnt', 'term', 'chistory', 'proparea', 'status',
]
df.columns = newcols

# Encode the target explicitly ('Y' -> 1, 'N' -> 0). Using .map avoids the
# implicit object-to-int downcasting that Series.replace relies on, and lets
# unexpected labels be caught here instead of surfacing later in the metrics.
status_codes = df['status'].map({'Y': 1, 'N': 0})
if status_codes.isna().any():
    raise ValueError("Loan status must be 'Y' or 'N' for every row")
df['status'] = status_codes.astype('int64')

# Feature matrix and target vector.
X = df.drop(columns='status')
y = df['status']

#X['gender']=X['gender'].replace({'Male':1,'Female':0})
#X['married']=X['married'].replace({'Yes':1,'No':0})
#X['dependents']=X['dependents'].replace({'3+':3})
#X['dependents']=X['dependents'].astype(int)  # np.int was removed from NumPy; use the builtin int
#X['edu']=X['edu'].replace({'Graduate':1,'Not Graduate':2})
#X['selfemp']=X['selfemp'].replace({'Yes':1,'No':0})
#X['proparea']=X['proparea'].replace({'Rural':1,'Urban':2,'Semiurban':3})

# Split the feature names by dtype: object columns are treated as categorical,
# every other column as numeric.
catcols = X.select_dtypes(include='object').columns.tolist()
numcols = X.select_dtypes(exclude='object').columns.tolist()

# Xtrain,Xtest,ytrain,ytest=model_selection.train_test_split(X,y,test_size=.15,random_state=42)

### kNN: scaling required, OHE (one-hot encoding) required

In [18]:
# 'chistory' has a non-object dtype, so it was selected into numcols, but it is
# handled as a categorical feature from here on. The membership checks keep
# this cell safe to re-run: without them a second run raises ValueError on
# remove() and appends a duplicate entry to catcols.
if 'chistory' in numcols:
    numcols.remove('chistory')
if 'chistory' not in catcols:
    catcols.append('chistory')

In [19]:
# Display the categorical column list (now including 'chistory').
catcols

['gender', 'married', 'dependents', 'edu', 'selfemp', 'proparea', 'chistory']

In [20]:
# Display the numeric column list (with 'chistory' removed).
numcols

['appinc', 'coappinc', 'amnt', 'term']

## Implement one-hot encoding (OHE)

In [21]:
# One-hot encode the categorical columns. drop_first=True drops one level per
# feature so the dummy columns are not perfectly collinear.
Xohe = pd.get_dummies(
    data=X,
    columns=catcols,
    drop_first=True,
)

In [22]:
# Summarise the encoded frame: column names, non-null counts and dtypes.
Xohe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 480 entries, 1 to 613
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   appinc              480 non-null    int64  
 1   coappinc            480 non-null    float64
 2   amnt                480 non-null    float64
 3   term                480 non-null    float64
 4   gender_Male         480 non-null    uint8  
 5   married_Yes         480 non-null    uint8  
 6   dependents_1        480 non-null    uint8  
 7   dependents_2        480 non-null    uint8  
 8   dependents_3+       480 non-null    uint8  
 9   edu_Not Graduate    480 non-null    uint8  
 10  selfemp_Yes         480 non-null    uint8  
 11  proparea_Semiurban  480 non-null    uint8  
 12  proparea_Urban      480 non-null    uint8  
 13  chistory_1.0        480 non-null    uint8  
dtypes: float64(3), int64(1), uint8(10)
memory usage: 23.4 KB


In [23]:
# Hold out 15% of the encoded rows for testing; the fixed seed keeps the split
# reproducible across runs.
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    Xohe,
    y,
    test_size=0.15,
    random_state=42,
)

## implement scaling
- standard scaler

In [24]:
# Work on explicit copies so the column assignments below write to independent
# frames rather than to objects derived from Xohe by train_test_split (which
# pandas may flag as chained assignment).
Xtrain = Xtrain.copy()
Xtest = Xtest.copy()

# Standardise the numeric columns. The scaler is fitted on the training split
# only, and the same fitted statistics are then applied to the test split.
scaler = preprocessing.StandardScaler()
Xtrain[numcols] = scaler.fit_transform(Xtrain[numcols])
Xtest[numcols] = scaler.transform(Xtest[numcols])

## kNN

In [25]:
# k-nearest-neighbours classifier with k = 7 (an odd k avoids tied votes between
# two classes); neighbour search uses a ball tree.
model = neighbors.KNeighborsClassifier(
    algorithm='ball_tree',
    n_neighbors=7,
)

In [26]:
# Fit the classifier on the training split.
model.fit(Xtrain, ytrain) # the model was configured with algorithm='ball_tree' for neighbour search

KNeighborsClassifier(algorithm='ball_tree', n_neighbors=7)

In [32]:
# Hard class predictions for the training split and the held-out test split.
predtrain, predtest = model.predict(Xtrain), model.predict(Xtest)

In [33]:
# Metrics on the training split (data the model has already seen).
printmetrics(ytrain, predtrain)

AUC :  0.6726
Accuracy : 0.7868
Precision :  0.774
Recall :  0.9751
F1 :  0.863


In [34]:
# Metrics on the held-out test split.
printmetrics(ytest, predtest)

AUC :  0.6134
Accuracy : 0.75
Precision :  0.7619
Recall :  0.9412
F1 :  0.8421
