In [1]:
# Logistic Regression:
# Apply logistic regression on the Loan dataset with
# "Decision" as the dependent variable. Do the necessary
# one-hot encoding for the categorical variables and
# discard the irrelevant variables. Use whatever
# libraries are appropriate when coding in Python.

In [2]:
# all necessary imports

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [3]:
dataset = pd.read_excel('loan.xlsx')
new_dataset = dataset
dataset.head()

Unnamed: 0,Sex,Age,Time_at_address,Res_status,Telephone,Occupation,Job_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision
0,M,50.75,0.585,owner,given,unemploye,unemploye,0,0,f,given,145,0,reject
1,M,19.67,10.0,rent,not_given,labourer,governmen,0,0,t,given,140,0,reject
2,F,52.830002,15.0,owner,given,creative_,private_s,5,14,f,given,0,2200,accept
3,M,22.67,2.54,rent,not_given,creative_,governmen,2,0,f,given,0,0,accept
4,M,29.25,13.0,owner,given,driver,governmen,0,0,f,given,228,0,reject


In [4]:
# dataset summary
dataset.describe()

Unnamed: 0,Age,Time_at_address,Time_employed,Time_bank,Home_Expn,Balance
count,429.0,429.0,429.0,429.0,429.0,429.0
mean,31.510163,4.650758,1.871795,2.27972,176.727273,898.382284
std,11.843595,4.804037,3.254023,3.966105,142.590659,3814.56534
min,15.17,0.0,0.0,0.0,0.0,0.0
25%,22.67,1.0,0.0,0.0,80.0,0.0
50%,28.5,2.75,1.0,0.0,160.0,10.0
75%,38.25,7.0,2.0,3.0,272.0,484.0
max,76.75,25.209999,20.0,23.0,760.0,51100.0


In [5]:
# finding out null data => missing data
dataset.isnull().sum()

# inference => no missing data

Sex                0
Age                0
Time_at_address    0
Res_status         0
Telephone          0
Occupation         0
Job_status         0
Time_employed      0
Time_bank          0
Liab_ref           0
Acc_ref            0
Home_Expn          0
Balance            0
Decision           0
dtype: int64

In [6]:
dataset.shape

# before calculating correlation, we shall one hot encode Sex, Res_Status, Telephone, Occupation, Job Status, Acc_ref, Decision

(429, 14)

In [7]:
# one hot encoding
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# label encoder
label_encoder = LabelEncoder()

In [8]:
# sex
print('Sex: ', dataset['Sex'].nunique())

new_dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])

Sex:  2


In [9]:
# Res_Status
print('Res_status: ', dataset['Res_status'].nunique())

dataset['Res_status'] = label_encoder.fit_transform(dataset['Res_status'])

Res_status:  2


In [10]:
# Telephone
print('Telephone: ', dataset['Telephone'].nunique())

dataset['Telephone'] = label_encoder.fit_transform(dataset['Telephone'])

Telephone:  2


In [11]:
# Acc_ref
print('Acc_ref: ', dataset['Acc_ref'].nunique())

dataset['Acc_ref'] = label_encoder.fit_transform(dataset['Acc_ref'])

Acc_ref:  2


In [12]:
# Liab_ref
print('Liab_ref: ', dataset['Liab_ref'].nunique())

dataset['Liab_ref'] = label_encoder.fit_transform(dataset['Liab_ref'])

Liab_ref:  2


In [13]:
# Decision
print(dataset['Decision'].dtype)
print('Decision: ', dataset['Decision'].nunique())

dataset['Decision'] = label_encoder.fit_transform(dataset['Decision'])

object
Decision:  2


In [14]:
# Job_status

print('Job_status: ', dataset['Job_status'].unique())

Job_status:  ['unemploye' 'governmen' 'private_s' 'self_empl' 'retired' 'student'
 'military']


In [15]:
# Occupation
print('Occupation: ', dataset['Occupation'].unique())

# dataset['Occupation'] = label_encoder.fit_transform(dataset['Res_Status'])

Occupation:  ['unemploye' 'labourer' 'creative_' 'driver' 'professio' 'manager'
 'guard_etc' 'executive' 'office_st' 'productio' 'semi_pro' 'sales']


In [16]:
features_array = onehot_encoder.fit_transform(dataset[['Job_status', 'Occupation']]).toarray()

In [17]:
feature_labels = ['governmen', 'military', 'private_s', 'retired', 'self_empl',
        'student', 'unemploye', 'creative_', 'driver', 'executive', 'guard_etc', 'labourer',
        'manager', 'office_st', 'productio', 'professio', 'sales',
        'semi_pro', 'unemploye']
feature_labels

['governmen',
 'military',
 'private_s',
 'retired',
 'self_empl',
 'student',
 'unemploye',
 'creative_',
 'driver',
 'executive',
 'guard_etc',
 'labourer',
 'manager',
 'office_st',
 'productio',
 'professio',
 'sales',
 'semi_pro',
 'unemploye']

In [18]:
features = pd.DataFrame(features_array, columns = feature_labels)

In [19]:
dataset = pd.concat([dataset, features], axis=1)

In [20]:
dataset.drop(['Occupation','Job_status'], axis=1, inplace=True)

In [21]:
dataset.columns

Index(['Sex', 'Age', 'Time_at_address', 'Res_status', 'Telephone',
       'Time_employed', 'Time_bank', 'Liab_ref', 'Acc_ref', 'Home_Expn',
       'Balance', 'Decision', 'governmen', 'military', 'private_s', 'retired',
       'self_empl', 'student', 'unemploye', 'creative_', 'driver', 'executive',
       'guard_etc', 'labourer', 'manager', 'office_st', 'productio',
       'professio', 'sales', 'semi_pro', 'unemploye'],
      dtype='object')

In [22]:
# data prepared for correlation

corr = dataset.corr()
corr

Unnamed: 0,Sex,Age,Time_at_address,Res_status,Telephone,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,...,executive,guard_etc,labourer,manager,office_st,productio,professio,sales,semi_pro,unemploye
Sex,1.0,0.025167,-0.031594,0.121713,0.121713,0.077239,-0.072303,0.010508,0.095826,0.115355,...,0.033932,0.022947,-0.00822,0.141564,-0.239058,0.089171,-0.028788,-0.116177,0.057602,-0.08511
Age,0.025167,1.0,0.217342,-0.080265,-0.080265,0.42689,0.208736,0.049109,-0.040445,-0.072254,...,0.195693,0.028675,-0.035318,0.056127,-0.085265,-0.025142,-0.008012,-0.061906,-0.0029,0.210038
Time_at_address,-0.031594,0.217342,1.0,-0.100614,-0.100614,0.278849,0.202083,-0.049978,-0.144258,-0.26747,...,0.194672,-0.074592,-0.027105,-0.062565,0.060743,0.007001,-0.023361,0.018662,-0.069254,0.028853
Res_status,0.121713,-0.080265,-0.100614,1.0,1.0,-0.07469,-0.132413,0.018827,-0.000201,-0.006477,...,-0.013801,-0.059316,0.103196,-0.007468,-0.141542,0.015006,0.058154,-0.043244,0.012631,0.048437
Telephone,0.121713,-0.080265,-0.100614,1.0,1.0,-0.07469,-0.132413,0.018827,-0.000201,-0.006477,...,-0.013801,-0.059316,0.103196,-0.007468,-0.141542,0.015006,0.058154,-0.043244,0.012631,0.048437
Time_employed,0.077239,0.42689,0.278849,-0.07469,-0.07469,1.0,0.291542,0.122561,-0.025,-0.110933,...,0.158402,-0.048504,-0.076296,0.085916,0.045731,-0.005221,0.065455,-0.080398,0.001919,-0.042615
Time_bank,-0.072303,0.208736,0.202083,-0.132413,-0.132413,0.291542,1.0,0.051,-0.158334,-0.111761,...,0.070908,-0.079085,-0.102461,-0.004365,0.115238,-0.035694,0.030341,-0.087884,0.042998,-0.026283
Liab_ref,0.010508,0.049109,-0.049978,0.018827,0.018827,0.122561,0.051,1.0,0.033197,0.127133,...,-0.047445,0.051444,0.044265,0.041821,-0.005693,0.037271,0.030484,-0.057168,-0.003053,-0.086354
Acc_ref,0.095826,-0.040445,-0.144258,-0.000201,-0.000201,-0.025,-0.158334,0.033197,1.0,0.096426,...,0.023691,-0.017403,0.029235,0.254292,-0.071914,-0.036777,-0.063317,-0.077889,-0.019012,-0.041226
Home_Expn,0.115355,-0.072254,-0.26747,-0.006477,-0.006477,-0.110933,-0.111761,0.127133,0.096426,1.0,...,-0.022788,0.070586,-0.024372,0.121765,-0.017424,0.048673,0.112318,-0.049655,0.094591,-0.139506


In [23]:
# dropping columns based on correlations

# also dropping Time_at_address as it plays trivial role in decision making

dataset.drop(['Telephone', 'Time_at_address'], axis=1, inplace=True)

In [24]:
dataset.columns

# inference => we have 33 variables

Index(['Sex', 'Age', 'Res_status', 'Time_employed', 'Time_bank', 'Liab_ref',
       'Acc_ref', 'Home_Expn', 'Balance', 'Decision', 'governmen', 'military',
       'private_s', 'retired', 'self_empl', 'student', 'unemploye',
       'creative_', 'driver', 'executive', 'guard_etc', 'labourer', 'manager',
       'office_st', 'productio', 'professio', 'sales', 'semi_pro',
       'unemploye'],
      dtype='object')

In [25]:
dataset.head()

Unnamed: 0,Sex,Age,Res_status,Time_employed,Time_bank,Liab_ref,Acc_ref,Home_Expn,Balance,Decision,...,executive,guard_etc,labourer,manager,office_st,productio,professio,sales,semi_pro,unemploye
0,1,50.75,0,0,0,0,0,145,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,19.67,1,0,0,1,0,140,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,52.830002,0,5,14,0,0,0,2200,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,22.67,1,2,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,29.25,0,0,0,0,0,228,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
X = dataset[['Sex', 'Age', 'Res_status', 'Time_employed', 'Time_bank', 'Liab_ref',
       'Acc_ref', 'Home_Expn', 'Balance', 'governmen', 'military',
       'private_s', 'retired', 'self_empl', 'student', 'unemploye',
       'creative_', 'driver', 'executive', 'guard_etc', 'labourer', 'manager',
       'office_st', 'productio', 'professio', 'sales', 'semi_pro',
       'unemploye']]

y = dataset[['Decision']]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [28]:
# applying logistic regression
logistic_regression = LogisticRegression()
regression_model = logistic_regression.fit(X_train,y_train)

regression_model.score(X, y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7972027972027972

In [29]:
predictions = logistic_regression.predict(X_test)

In [30]:
# calculating the statistical metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

Mean Absolute Error: 0.16279069767441862
Mean Squared Error: 0.16279069767441862
Root Mean Squared Error: 0.4034732923929645


In [31]:
#Confusion matrix:
print("Confusion Matrix : \n", confusion_matrix(y_test, predictions))

Confusion Matrix : 
 [[21 12]
 [ 2 51]]


In [33]:
r2 = r2_score(y_test, predictions)
print("R squared value : ", r2)

R squared value :  0.31160663236134944


In [71]:
from scipy.stats import linregress
linregress(y_test.to_numpy().ravel(), np.array(predictions))


LinregressResult(slope=0.5986277873070323, intercept=0.3636363636363638, rvalue=0.6576844658854166, pvalue=6.012999211574679e-12, stderr=0.07481072828931758, intercept_stderr=0.05872899808288119)