In [82]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from datetime import date, timedelta
import math
from sklearn.model_selection import StratifiedShuffleSplit

In [36]:
# Load the preprocessed data set; the relative path is resolved against the
# notebook's current working directory.
traindata = pd.read_csv('./dataset/processed_tempdata.csv')

In [37]:
# Sanity check: (rows, columns) of the freshly loaded frame.
traindata.shape

(1671, 12)

In [38]:
# Preview the first rows to see which columns still hold text values.
traindata.head()

Unnamed: 0,termdays,bank_account_type,bank_name_clients,employment_status_clients,rankloanamt,rankloannum,isreffered,totalduerank,agerank,avgloanamtrank,repaymentrank,good_bad_flag
0,15,Other,EcoBank,Permanent,2,1,0,1,2,2,2,Good
1,15,Savings,Access Bank,Unemployed,1,1,0,1,2,2,3,Good
2,30,Other,First Bank,Self-Employed,3,2,0,2,2,3,3,Good
3,30,Savings,Union Bank,Permanent,2,1,0,1,1,2,3,Good
4,30,Other,First Bank,Permanent,2,2,0,1,2,2,3,Good


#### One-hot encoding for nominal categorical values

In [40]:
# Build one indicator column per distinct value of 'bank_account_type'.
one_hot_bankAccType = pd.get_dummies(traindata['bank_account_type'])

In [41]:
# The text column is superseded by its indicator columns, so remove it.
traindata = traindata.drop(columns=['bank_account_type'])

In [42]:
# join() aligns on the row index; the indicator frame was built from a column
# of traindata, so both frames carry the same index.
traindata = traindata.join(one_hot_bankAccType)

In [43]:
# Check that the account-type indicator columns were appended on the right.
traindata.head()

Unnamed: 0,termdays,bank_name_clients,employment_status_clients,rankloanamt,rankloannum,isreffered,totalduerank,agerank,avgloanamtrank,repaymentrank,good_bad_flag,Current,Other,Savings
0,15,EcoBank,Permanent,2,1,0,1,2,2,2,Good,0,1,0
1,15,Access Bank,Unemployed,1,1,0,1,2,2,3,Good,0,0,1
2,30,First Bank,Self-Employed,3,2,0,2,2,3,3,Good,0,1,0
3,30,Union Bank,Permanent,2,1,0,1,1,2,3,Good,0,0,1
4,30,First Bank,Permanent,2,2,0,1,2,2,3,Good,0,1,0


In [44]:
# Expand the client's bank name into one indicator column per bank, then swap
# those columns in for the original text column.
one_hot_bnc = pd.get_dummies(traindata['bank_name_clients'])
traindata = traindata.drop(columns=['bank_name_clients']).join(one_hot_bnc)

In [54]:
# The label column must NOT be dropped here: 'good_bad_flag' is still read
# further down to derive the numeric 'class' column, and it is removed only
# after that. Dropping it at this point makes a top-to-bottom run fail with a
# KeyError, so this cell only verifies that the label is still present.
assert 'good_bad_flag' in traindata.columns, "label column 'good_bad_flag' is missing"

In [46]:
# Same treatment for the employment status: build the indicator columns, then
# replace the text column with them.
one_hot_esc = pd.get_dummies(traindata['employment_status_clients'])
traindata = traindata.drop(columns=['employment_status_clients']).join(one_hot_esc)

In [47]:
# Preview the frame after all three categorical columns have been encoded.
traindata.head()

Unnamed: 0,termdays,rankloanamt,rankloannum,isreffered,totalduerank,agerank,avgloanamtrank,repaymentrank,good_bad_flag,Current,...,Union Bank,Unity Bank,Wema Bank,Zenith Bank,Contract,Permanent,Retired,Self-Employed,Student,Unemployed
0,15,2,1,0,1,2,2,2,Good,0,...,0,0,0,0,0,1,0,0,0,0
1,15,1,1,0,1,2,2,3,Good,0,...,0,0,0,0,0,0,0,0,0,1
2,30,3,2,0,2,2,3,3,Good,0,...,0,0,0,0,0,0,0,1,0,0
3,30,2,1,0,1,1,2,3,Good,0,...,1,0,0,0,0,1,0,0,0,0
4,30,2,2,0,1,2,2,3,Good,0,...,0,0,0,0,0,1,0,0,0,0


In [48]:
def numericClass(gbf):
    """Convert a ``good_bad_flag`` value into a numeric label.

    Returns 1 when ``gbf`` equals the string ``'Good'`` and 0 for any other
    value.
    """
    return 1 if gbf == 'Good' else 0

In [49]:
# Element-wise conversion of the text label into the 0/1 'class' column.
traindata['class'] = traindata['good_bad_flag'].map(numericClass)

In [None]:
# 'class' now carries the label, so the original text column can go.
traindata = traindata.drop(columns=['good_bad_flag'])

In [62]:
# Confirm that 'class' replaced 'good_bad_flag' as the last column.
traindata.head()

Unnamed: 0,termdays,rankloanamt,rankloannum,isreffered,totalduerank,agerank,avgloanamtrank,repaymentrank,Current,Other,...,Unity Bank,Wema Bank,Zenith Bank,Contract,Permanent,Retired,Self-Employed,Student,Unemployed,class
0,15,2,1,0,1,2,2,2,0,1,...,0,0,0,0,1,0,0,0,0,1
1,15,1,1,0,1,2,2,3,0,0,...,0,0,0,0,0,0,0,0,1,1
2,30,3,2,0,2,2,3,3,0,1,...,0,0,0,0,0,0,1,0,0,1
3,30,2,1,0,1,1,2,3,0,0,...,0,0,0,0,1,0,0,0,0,1
4,30,2,2,0,1,2,2,3,0,1,...,0,0,0,0,1,0,0,0,0,1


In [61]:
# Row count should be unchanged by the encoding; only the column count grows.
traindata.shape

(1671, 36)

In [60]:
# Save the encoded frame; index=False keeps the row labels out of the file.
traindata.to_csv('./dataset/numeric_traindata.csv', index=False)

In [65]:
def getSample(size, df, random_state=None):
    """Draw a class-balanced random sample of ``size`` rows from ``df``.

    ``int(round(size * 0.50))`` rows are drawn from the rows where
    ``class == 1`` and the remaining rows from those where ``class == 0``.
    Both draws are made without replacement.

    Parameters
    ----------
    size : int
        Total number of rows to return.
    df : pandas.DataFrame
        Source frame; must contain a ``class`` column with 0/1 labels.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample`` for both draws so the sample can be
        reproduced. ``None`` (the default) keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled class-1 rows followed by the sampled class-0 rows, keeping
        their original index labels.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a class holds fewer rows
        than requested.
    """
    size1 = int(round(size * 0.50))
    size2 = size - size1
    data_sample_good = df[df['class'] == 1].sample(n=size1, random_state=random_state)
    data_sample_bad = df[df['class'] == 0].sample(n=size2, random_state=random_state)
    return pd.concat((data_sample_good, data_sample_bad))

In [66]:
# DataFrame.sample draws from NumPy's global RNG when it is given no
# random_state, so seed that RNG first; otherwise every run produces a
# different sample and the fold files and metrics below cannot be reproduced.
np.random.seed(0)
train_600 = getSample(600, traindata)

In [67]:
# Persist the balanced sample.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here, so the row labels are written as an extra leading column.
# Confirm this is intended before reading the file back.
train_600.to_csv('./dataset/python/train_600.csv')

In [70]:
# List the column order; the positional feature slices below depend on it.
train_600.columns

Index(['termdays', 'rankloanamt', 'rankloannum', 'isreffered', 'totalduerank',
       'agerank', 'avgloanamtrank', 'repaymentrank', 'Current', 'Other',
       'Savings', 'Access Bank', 'Diamond Bank', 'EcoBank', 'FCMB',
       'Fidelity Bank', 'First Bank', 'GT Bank', 'Heritage Bank',
       'Keystone Bank', 'Skye Bank', 'Stanbic IBTC', 'Standard Chartered',
       'Sterling Bank', 'UBA', 'Union Bank', 'Unity Bank', 'Wema Bank',
       'Zenith Bank', 'Contract', 'Permanent', 'Retired', 'Self-Employed',
       'Student', 'Unemployed', 'class'],
      dtype='object')

In [75]:
# Feature matrix: every column except the label. Selecting by name instead of
# the hard-coded slice 0:35 keeps this correct if the number of encoded
# columns changes (e.g. a new bank appears in the data).
X = train_600.drop(columns=['class'])

In [77]:
# Target vector; also used by the splitter below for stratification.
Y = train_600['class']

In [83]:
# Three stratified random train/test splits with a fixed seed. An integer
# test_size is an absolute row count (200 test rows per split).
# NOTE(review): these are independent shuffles rather than disjoint k-fold
# partitions, so the test sets of different "folds" may overlap.
sss = StratifiedShuffleSplit(n_splits=3, test_size=200, random_state=0)

In [84]:
# Write every stratified split to disk as a train/test CSV pair, numbered from 1.
i = 1
for train_index, test_index in sss.split(X, Y):
    print(i, "TRAIN:", train_index, "TEST:", test_index)
    # Rows are taken from the full frame, so despite the X_ prefix both files
    # contain the feature columns as well as the 'class' column.
    X_train = train_600.iloc[train_index]
    X_test = train_600.iloc[test_index]
    X_train.to_csv(f"./dataset/python/fold{i}_train.csv", index=False)
    X_test.to_csv(f"./dataset/python/fold{i}_test.csv", index=False)
    i += 1

1 TRAIN: [294 355 366 129 168 380 261 123 564 247 552 283 588 316 187 100 412 189
  98 322 143 586 118 439 471 459 456  54 271  73 416 350 553 268 562 236
 518 232  45  76   0  25 250 102 397 579 286 568 592 499  50 374 152 522
  20 224 326  77 249 406   3 393 101  47 304 132 526 145  29 520 276 126
 115 279 204 112 151 169 346  82 297 479  62 175 494 593 396 215 234 217
 214 524 134 155 570 555 257  33 354 103 452  86 377 125 407 449 446 389
 394 437 434 512 362 430 545  75 539 516 119 281 226 110 202 336 114 595
  11  40 159 292  96 352 505 464 550 225 205  99  23  67 231 153 359 519
 161 392 137 315  36 200  85 481 424 506 109 344 429  89 188  35 361   7
  88 221 580 425 211 270 468 182  71 333 457 411 445 176 178 222  12 418
 299 140 289 517 142  69 436 275 514 476  95 330 510 254   2 240 489 107
 206  37 308 401 458 383 453 482 462 303  31 535 566 523 122  21 210 184
 258 572 548 537 529 329 591 291 242  97 363 435 334 478 438 128 197 149
  30 408  39 298 312 513 560 170 475 376 5

In [88]:
# Show the names selected by the positional slice 0:35 (all columns but 'class').
train_600.columns[0:35].values

array(['termdays', 'rankloanamt', 'rankloannum', 'isreffered',
       'totalduerank', 'agerank', 'avgloanamtrank', 'repaymentrank',
       'Current', 'Other', 'Savings', 'Access Bank', 'Diamond Bank',
       'EcoBank', 'FCMB', 'Fidelity Bank', 'First Bank', 'GT Bank',
       'Heritage Bank', 'Keystone Bank', 'Skye Bank', 'Stanbic IBTC',
       'Standard Chartered', 'Sterling Bank', 'UBA', 'Union Bank',
       'Unity Bank', 'Wema Bank', 'Zenith Bank', 'Contract', 'Permanent',
       'Retired', 'Self-Employed', 'Student', 'Unemployed'], dtype=object)

In [111]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
def testModel(train, test, treefile, criteria, random_state=None):
    """Fit a decision tree on one fold and print its test-set metrics.

    Parameters
    ----------
    train, test : str
        Paths of the CSV files holding the training and test rows. Both files
        must contain the feature columns and a ``class`` column of 0/1 labels.
    treefile : str
        Path of the Graphviz ``.dot`` file the fitted tree is written to.
    criteria : str
        Split criterion passed to ``DecisionTreeClassifier`` (the notebook
        uses ``'gini'`` and ``'entropy'``).
    random_state : int or None, optional
        Seed passed to ``DecisionTreeClassifier``. ``None`` (the default)
        keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The confusion-matrix counts and derived metrics are printed.
    """
    from sklearn import tree  # local import: only needed for the Graphviz export

    fold_train = pd.read_csv(train)
    fold_test = pd.read_csv(test)

    # Take the feature names from the fold file itself rather than from the
    # notebook-global train_600 frame, so the function works on a fresh kernel
    # with nothing but the CSV files.
    feature_cols = [c for c in fold_train.columns if c != "class"]

    clf = DecisionTreeClassifier(criterion=criteria, random_state=random_state)
    clf.fit(fold_train[feature_cols], fold_train["class"])

    # export_graphviz writes to the open handle and returns None, so its result
    # must not be assigned back to the handle.
    with open(treefile, "w") as f:
        tree.export_graphviz(clf, feature_names=feature_cols, out_file=f)

    actual = fold_test["class"]
    predictions = clf.predict(fold_test[feature_cols])

    # Fixing the label order guarantees a 2x2 matrix even when one class is
    # absent from both the test labels and the predictions.
    tn, fp, fn, tp = confusion_matrix(actual, predictions, labels=[0, 1]).ravel()

    print(criteria)
    print("TP -", tp, "TN - ", tn, "FP - ", fp, "FN - ", fn)
    print("Accuracy -", round(accuracy_score(actual, predictions), 2))
    print("Recall -", round(recall_score(actual, predictions), 2))
    print("Precision -", round(precision_score(actual, predictions), 2))
    print("F1 -", round(f1_score(actual, predictions), 2))
    print("Sensitivity -", round(tp / (tp + fn), 2))
    print("Specificity -", round(tn / (tn + fp), 2))

In [112]:
# Split 1, Gini criterion; the fitted tree is exported to fold1-gini.dot.
testModel('./dataset/python/fold1_train.csv', './dataset/python/fold1_test.csv', 'fold1-gini.dot','gini')

gini
TP - 45 TN -  55 FP -  45 FN -  55
Accuracy - 0.5
Recall - 0.45
Precision - 0.5
F1 - 0.47
Sensitivity - 0.45
Specificity - 0.55


In [113]:
# Split 1, entropy criterion; the fitted tree is exported to fold1-entropy.dot.
testModel('./dataset/python/fold1_train.csv', './dataset/python/fold1_test.csv', 'fold1-entropy.dot','entropy')

entropy
TP - 51 TN -  49 FP -  51 FN -  49
Accuracy - 0.5
Recall - 0.51
Precision - 0.5
F1 - 0.5
Sensitivity - 0.51
Specificity - 0.49


In [114]:
# Split 2, Gini criterion; the fitted tree is exported to fold2-gini.dot.
testModel('./dataset/python/fold2_train.csv', './dataset/python/fold2_test.csv', 'fold2-gini.dot','gini')

gini
TP - 52 TN -  46 FP -  54 FN -  48
Accuracy - 0.49
Recall - 0.52
Precision - 0.49
F1 - 0.5
Sensitivity - 0.52
Specificity - 0.46


In [115]:
# Split 2, entropy criterion; the fitted tree is exported to fold2-entropy.dot.
testModel('./dataset/python/fold2_train.csv', './dataset/python/fold2_test.csv', 'fold2-entropy.dot','entropy')

entropy
TP - 46 TN -  40 FP -  60 FN -  54
Accuracy - 0.43
Recall - 0.46
Precision - 0.43
F1 - 0.45
Sensitivity - 0.46
Specificity - 0.4


In [116]:
# Split 3, Gini criterion; the fitted tree is exported to fold3-gini.dot.
testModel('./dataset/python/fold3_train.csv', './dataset/python/fold3_test.csv', 'fold3-gini.dot','gini')

gini
TP - 41 TN -  49 FP -  51 FN -  59
Accuracy - 0.45
Recall - 0.41
Precision - 0.45
F1 - 0.43
Sensitivity - 0.41
Specificity - 0.49


In [117]:
# Split 3, entropy criterion; the fitted tree is exported to fold3-entropy.dot.
testModel('./dataset/python/fold3_train.csv', './dataset/python/fold3_test.csv', 'fold3-entropy.dot','entropy')

entropy
TP - 48 TN -  49 FP -  51 FN -  52
Accuracy - 0.48
Recall - 0.48
Precision - 0.48
F1 - 0.48
Sensitivity - 0.48
Specificity - 0.49
