In [5]:
# Apply XGBoost classification algorithm on the
# loan data set and compare its performance
# against SVM and Logistic Regression by suitably
# splitting the data into train and test sets.

In [58]:

# all necessary imports

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [7]:
dataset = pd.read_excel('loan.xlsx')
new_dataset = dataset
dataset.head()

Unnamed: 0,Sex,Age,Time_at_address,Res_status,Telephone,Occupation,Job_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision
0,M,50.75,0.585,owner,given,unemploye,unemploye,0,0,f,given,145,0,reject
1,M,19.67,10.0,rent,not_given,labourer,governmen,0,0,t,given,140,0,reject
2,F,52.830002,15.0,owner,given,creative_,private_s,5,14,f,given,0,2200,accept
3,M,22.67,2.54,rent,not_given,creative_,governmen,2,0,f,given,0,0,accept
4,M,29.25,13.0,owner,given,driver,governmen,0,0,f,given,228,0,reject


In [8]:
# dataset summary
dataset.describe()

Unnamed: 0,Age,Time_at_address,Time_employed,Time_bank,Home_Expn,Balance
count,429.0,429.0,429.0,429.0,429.0,429.0
mean,31.510163,4.650758,1.871795,2.27972,176.727273,898.382284
std,11.843595,4.804037,3.254023,3.966105,142.590659,3814.56534
min,15.17,0.0,0.0,0.0,0.0,0.0
25%,22.67,1.0,0.0,0.0,80.0,0.0
50%,28.5,2.75,1.0,0.0,160.0,10.0
75%,38.25,7.0,2.0,3.0,272.0,484.0
max,76.75,25.209999,20.0,23.0,760.0,51100.0


In [9]:
# finding out null data => missing data
dataset.isnull().sum()

# inference => no missing data

Sex                0
Age                0
Time_at_address    0
Res_status         0
Telephone          0
Occupation         0
Job_status         0
Time_employed      0
Time_bank          0
Liab_ref           0
Acc_ref            0
Home_Expn          0
Balance            0
Decision           0
dtype: int64

In [10]:
dataset.shape

# before calculating correlation, we shall one hot encode Sex, Res_Status, Telephone, Occupation, Job Status, Acc_ref, Decision

(429, 14)

In [11]:
# one hot encoding
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# label encoder
label_encoder = LabelEncoder()

In [12]:
# sex
print('Sex: ', dataset['Sex'].nunique())

new_dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])

Sex:  2


In [13]:
# Res_Status
print('Res_status: ', dataset['Res_status'].nunique())

dataset['Res_status'] = label_encoder.fit_transform(dataset['Res_status'])

Res_status:  2


In [14]:
# Telephone
print('Telephone: ', dataset['Telephone'].nunique())

dataset['Telephone'] = label_encoder.fit_transform(dataset['Telephone'])

Telephone:  2


In [15]:
# Acc_ref
print('Acc_ref: ', dataset['Acc_ref'].nunique())

dataset['Acc_ref'] = label_encoder.fit_transform(dataset['Acc_ref'])

Acc_ref:  2


In [16]:
# Liab_ref
print('Liab_ref: ', dataset['Liab_ref'].nunique())

dataset['Liab_ref'] = label_encoder.fit_transform(dataset['Liab_ref'])

Liab_ref:  2


In [17]:
# Decision
print(dataset['Decision'].dtype)
print('Decision: ', dataset['Decision'].nunique())

dataset['Decision'] = label_encoder.fit_transform(dataset['Decision'])

object
Decision:  2


In [18]:
# Job_status

print('Job_status: ', dataset['Job_status'].unique())

Job_status:  ['unemploye' 'governmen' 'private_s' 'self_empl' 'retired' 'student'
 'military']


In [19]:
# Occupation
print('Occupation: ', dataset['Occupation'].unique())

# dataset['Occupation'] = label_encoder.fit_transform(dataset['Res_Status'])

Occupation:  ['unemploye' 'labourer' 'creative_' 'driver' 'professio' 'manager'
 'guard_etc' 'executive' 'office_st' 'productio' 'semi_pro' 'sales']


In [20]:
features_array = onehot_encoder.fit_transform(dataset[['Job_status', 'Occupation']]).toarray()

In [21]:
feature_labels = ['governmen', 'military', 'private_s', 'retired', 'self_empl',
        'student', 'unemploye', 'creative_', 'driver', 'executive', 'guard_etc', 'labourer',
        'manager', 'office_st', 'productio', 'professio', 'sales',
        'semi_pro', 'unemploye']
feature_labels

['governmen',
 'military',
 'private_s',
 'retired',
 'self_empl',
 'student',
 'unemploye',
 'creative_',
 'driver',
 'executive',
 'guard_etc',
 'labourer',
 'manager',
 'office_st',
 'productio',
 'professio',
 'sales',
 'semi_pro',
 'unemploye']

In [22]:
features = pd.DataFrame(features_array, columns = feature_labels)

In [23]:
dataset = pd.concat([dataset, features], axis=1)

In [24]:
dataset.drop(['Occupation','Job_status'], axis=1, inplace=True)

In [25]:
dataset.columns

Index(['Sex', 'Age', 'Time_at_address', 'Res_status', 'Telephone',
       'Time_employed', 'Time_bank', 'Liab_ref', 'Acc_ref', 'Home_Expn',
       'Balance', 'Decision', 'governmen', 'military', 'private_s', 'retired',
       'self_empl', 'student', 'unemploye', 'creative_', 'driver', 'executive',
       'guard_etc', 'labourer', 'manager', 'office_st', 'productio',
       'professio', 'sales', 'semi_pro', 'unemploye'],
      dtype='object')

In [26]:
# data prepared for correlation

corr = dataset.corr()
print(corr)

                      Sex       Age  Time_at_address  Res_status  Telephone  \
Sex              1.000000  0.025167        -0.031594    0.121713   0.121713   
Age              0.025167  1.000000         0.217342   -0.080265  -0.080265   
Time_at_address -0.031594  0.217342         1.000000   -0.100614  -0.100614   
Res_status       0.121713 -0.080265        -0.100614    1.000000   1.000000   
Telephone        0.121713 -0.080265        -0.100614    1.000000   1.000000   
Time_employed    0.077239  0.426890         0.278849   -0.074690  -0.074690   
Time_bank       -0.072303  0.208736         0.202083   -0.132413  -0.132413   
Liab_ref         0.010508  0.049109        -0.049978    0.018827   0.018827   
Acc_ref          0.095826 -0.040445        -0.144258   -0.000201  -0.000201   
Home_Expn        0.115355 -0.072254        -0.267470   -0.006477  -0.006477   
Balance         -0.045248  0.101819         0.072235   -0.068829  -0.068829   
Decision         0.052564 -0.181241        -0.163298

In [27]:
# dropping columns based on correlations

# also dropping Time_at_address as it plays trivial role in decision making

dataset.drop(['Telephone', 'Time_at_address'], axis=1, inplace=True)

In [28]:
dataset.columns

# inference => we have 33 variables

Index(['Sex', 'Age', 'Res_status', 'Time_employed', 'Time_bank', 'Liab_ref',
       'Acc_ref', 'Home_Expn', 'Balance', 'Decision', 'governmen', 'military',
       'private_s', 'retired', 'self_empl', 'student', 'unemploye',
       'creative_', 'driver', 'executive', 'guard_etc', 'labourer', 'manager',
       'office_st', 'productio', 'professio', 'sales', 'semi_pro',
       'unemploye'],
      dtype='object')

In [29]:
dataset.head()

Unnamed: 0,Sex,Age,Res_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision,...,executive,guard_etc,labourer,manager,office_st,productio,professio,sales,semi_pro,unemploye
0,1,50.75,0,0,0,0,0,145,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,19.67,1,0,0,1,0,140,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,52.830002,0,5,14,0,0,0,2200,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,22.67,1,2,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,29.25,0,0,0,0,0,228,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
X = dataset[['Sex', 'Age', 'Res_status', 'Time_employed', 'Time_bank', 'Liab_ref',
       'Acc_ref', 'Home_Expn', 'Balance', 'governmen', 'military',
       'private_s', 'retired', 'self_empl', 'student',
       'creative_', 'driver', 'executive', 'guard_etc', 'labourer', 'manager',
       'office_st', 'productio', 'professio', 'sales', 'semi_pro']]

y = dataset[['Decision']]

In [81]:
X.nunique()

Sex                2
Age              274
Res_status         2
Time_employed     19
Time_bank         20
Liab_ref           2
Acc_ref            2
Home_Expn        126
Balance          171
governmen          2
military           2
private_s          2
retired            2
self_empl          2
student            2
creative_          2
driver             2
executive          2
guard_etc          2
labourer           2
manager            2
office_st          2
productio          2
professio          2
sales              2
semi_pro           2
dtype: int64

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [93]:

# fit model no training data
model = XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

model.fit(X_train, y_train)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.3,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=10, n_jobs=0,
              num_parallel_tree=1, objective='reg:logistic', predictor='auto',
              random_state=0, ...)

In [94]:
predictions = model.predict(X_test)

In [95]:
model.score(X, y)

0.7948717948717948

In [96]:
# calculating the statistical metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

Mean Absolute Error: 0.19767441860465115
Mean Squared Error: 0.19767441860465115
Root Mean Squared Error: 0.44460591382105025


In [97]:
#Confusion matrix:
print("Confusion Matrix : \n", confusion_matrix(y_test, predictions))

Confusion Matrix : 
 [[27 14]
 [ 3 42]]


In [98]:
r2 = r2_score(y_test, predictions)
print("R squared value : ", r2)

R squared value :  0.20758807588075912


In [99]:
from scipy.stats import linregress
linregress(y_test.to_numpy().ravel(), np.array(predictions))

LinregressResult(slope=0.5918699186991868, intercept=0.3414634146341465, rvalue=0.6202543406978069, pvalue=1.9027902709052533e-10, stderr=0.08166854504861733, intercept_stderr=0.059076104470194984)