In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_excel("CreditWorthiness.xlsx")

In [3]:
data.columns

Index(['Cbal', 'Cdur', 'Chist', 'Cpur', 'Camt', 'Sbal', 'Edur', 'InRate',
       'MSG', 'Oparties', 'Rdur', 'Prop', 'age', 'inPlans', 'Htype', 'NumCred',
       'JobType', 'Ndepend', 'telephone', 'foreign', 'creditScore'],
      dtype='object')

In [4]:
data['Sbal'].value_counts()

Sbal
Rs. < 1000              603
no savings account      183
1000 <= Rs. < 5,000     103
5000 <= Rs. < 10,000     63
Rs. >= 10,000            48
Name: count, dtype: int64

In [5]:
data['Sbal'].nunique()

5

In [6]:
data['Chist'].value_counts()

Chist
all settled till now      618
dues not paid earlier     293
all settled                49
none taken/all settled     40
Name: count, dtype: int64

In [7]:
filtered_df = data[data['Chist'] == 'dues not paid earlier']

In [8]:
filtered_df['age'].mean()

np.float64(38.436860068259385)

In [9]:
data = pd.read_excel("CreditWorthiness_full.xlsx", sheet_name = 'Data')

In [10]:
data['creditScore'] = data['creditScore'].map({'good':1, 'bad':0})

In [11]:
data.head()

Unnamed: 0,Cbal,Cdur,Chist,Cpur,Camt,Sbal,Edur,InRate,MSG,Oparties,...,Prop,age,inPlans,Htype,NumCred,JobType,Ndepend,telephone,foreign,creditScore
0,0 <= Rs. < 2000,9,all settled till now,Business,13790,Rs. < 1000,1 to 4 years,2,married or widowed male,no one,...,real estate,27,bank,own,1,employee with official position,1,yes,no,1
1,0 <= Rs. < 2000,15,dues not paid earlier,electronics,15250,no savings account,more than 7 years,4,single male,"yes, guarantor",...,real estate,50,none,own,2,employee with official position,1,yes,no,1
2,0 <= Rs. < 2000,36,none taken/all settled,Business,19410,Rs. < 1000,more than 7 years,4,single male,no one,...,Unknown,61,none,free,1,"employed either in management, self or in high...",1,yes,no,0
3,0 <= Rs. < 2000,48,none taken/all settled,Business,144090,Rs. < 1000,1 to 4 years,2,single male,no one,...,Other cars etc.,25,none,own,1,employee with official position,1,yes,no,0
4,no checking account,24,all settled till now,electronics,31690,Rs. < 1000,less than 1 year,4,divorced or separated or married female,no one,...,life insurance/building society,26,none,own,1,employee with official position,1,yes,no,1


In [12]:
data.columns

Index(['Cbal', 'Cdur', 'Chist', 'Cpur', 'Camt', 'Sbal', 'Edur', 'InRate',
       'MSG', 'Oparties', 'Rdur', 'Prop', 'age', 'inPlans', 'Htype', 'NumCred',
       'JobType', 'Ndepend', 'telephone', 'foreign', 'creditScore'],
      dtype='object')

In [13]:
X = pd.get_dummies(data.drop(columns = ['creditScore']), drop_first = True)
Y = data['creditScore']

In [14]:
X.columns

Index(['Cdur', 'Camt', 'InRate', 'age', 'NumCred', 'Ndepend',
       'Cbal_0 <= Rs. < 2000', 'Cbal_Rs. >=2000', 'Cbal_no checking account',
       'Chist_all settled till now', 'Chist_dues not paid earlier',
       'Chist_none taken/all settled', 'Cpur_domestic needs', 'Cpur_education',
       'Cpur_electronics', 'Cpur_furniture', 'Cpur_miscellaneous',
       'Cpur_new vehicle', 'Cpur_renovation', 'Cpur_retaining',
       'Cpur_second hand vehicle', 'Sbal_5000 <= Rs. < 10,000',
       'Sbal_Rs. < 1000', 'Sbal_Rs. >= 10,000', 'Sbal_no savings account',
       'Edur_4 to 7 years', 'Edur_less than 1 year', 'Edur_more than 7 years',
       'Edur_not employed', 'MSG_divorced or separated or married female',
       'MSG_married or widowed male', 'MSG_single male',
       'Oparties_yes, co-applicant', 'Oparties_yes, guarantor',
       'Rdur_2 to 3 years', 'Rdur_less than a year', 'Rdur_more than 3 years',
       'Prop_Unknown', 'Prop_life insurance/building society',
       'Prop_real estate'

In [15]:
data.describe()

Unnamed: 0,Cdur,Camt,InRate,age,NumCred,Ndepend,creditScore
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,32592.58,2.973,35.546,1.407,1.155,0.7
std,12.058814,28227.36876,1.118715,11.375469,0.577654,0.362086,0.458487
min,4.0,2380.0,1.0,19.0,1.0,1.0,0.0
25%,12.0,13535.0,2.0,27.0,1.0,1.0,0.0
50%,18.0,23075.0,3.0,33.0,1.0,1.0,1.0
75%,24.0,39602.5,4.0,42.0,2.0,1.0,1.0
max,72.0,184120.0,4.0,75.0,4.0,2.0,1.0


In [16]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size = 0.25, random_state = 1)

In [17]:
logis_mod = LogisticRegression(max_iter = 10000)
logis_mod.fit(train_x, train_y)
prediction_log = logis_mod.predict(test_x)

In [18]:
con = classification_report(test_y, prediction_log)
print(con)

              precision    recall  f1-score   support

           0       0.55      0.53      0.54        62
           1       0.85      0.86      0.85       188

    accuracy                           0.78       250
   macro avg       0.70      0.69      0.70       250
weighted avg       0.77      0.78      0.77       250



In [19]:
confusion_matrix_logr = confusion_matrix(test_y, prediction_log)
tn, fp, fn, tp = confusion_matrix_logr.ravel()
print(confusion_matrix_logr)
print('tp: ', tp, 'tn: ', tn, 'fp: ', fp, 'fn: ', fn)

[[ 33  29]
 [ 27 161]]
tp:  161 tn:  33 fp:  29 fn:  27


In [20]:
acc_score_logr = accuracy_score(test_y, prediction_log)
print(round(acc_score_logr * 100, 2))

77.6


In [21]:
lr_precision = tp/(tp+fp)
lr_recall = tp/(tp+fn)
lr_f1_score = 2/(1/lr_precision + 1/lr_recall)
print("Precision is: ", round(lr_precision*100, 2),
      "Recall is: ", round(lr_recall*100, 2),
      "F1 Score is: ", round(lr_f1_score*100, 2))


Precision is:  84.74 Recall is:  85.64 F1 Score is:  85.19


In [22]:
print("Misclassified samples : %d"%(test_y != prediction_log).sum())

Misclassified samples : 56


In [29]:
data1 = data.drop('creditScore', axis = 1)
col_names = data1.describe().columns.tolist()
features = train_x[col_names]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)
train_x.loc[:, col_names] = features.astype(float)

In [31]:
features = test_x[col_names]
features = scaler.transform(features.values)
test_x.loc[:, col_names] = features

In [32]:
logis_mod_stn = LogisticRegression(max_iter = 10000)
logis_mod_stn.fit(train_x, train_y)
prediction_log_stn = logis_mod_stn.predict(test_x)

In [33]:
print("Misclassified samples : %d"%(test_y != prediction_log_stn).sum())

Misclassified samples : 188


In [34]:
(188 - 56 )/ 56 

2.357142857142857