In [40]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from datetime import date, timedelta
import math

In [2]:
traindata = pd.read_csv('./dataset/train.csv')

In [3]:
traindata.shape

(4368, 21)

In [4]:
traindata.head()

Unnamed: 0,approveddate,creationdate,customerid,loanamount,loannumber,referredby,systemloanid,termdays,totaldue,birthdate,...,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients,avg_loanamount,avg_totaldue,repaymentdays,good_bad_flag
0,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,8a2a81a74ce8c05d014cfb32a0da1049,30000.0,12,,301994762,30,34500.0,1972-01-15 00:00:00.000000,...,3.43201,6.433055,Diamond Bank,,Permanent,Post-Graduate,18181.818182,22081.818182,10.0,Good
1,2017-07-05 17:04:41.000000,2017-07-05 16:04:18.000000,8a85886e54beabf90154c0a29ae757c0,15000.0,2,,301965204,30,17250.0,1985-08-23 00:00:00.000000,...,3.885298,7.3207,GT Bank,"DUGBE,IBADAN",Permanent,Graduate,0.0,0.0,0.0,Good
2,2017-07-06 14:52:57.000000,2017-07-06 13:52:51.000000,8a8588f35438fe12015444567666018e,20000.0,7,,301966580,15,22250.0,1984-09-18 00:00:00.000000,...,11.13935,10.292041,EcoBank,,Permanent,,10000.0,11750.0,-5.0,Good
3,2017-07-27 19:00:41.000000,2017-07-27 18:00:35.000000,8a85890754145ace015429211b513e16,10000.0,3,,301999343,15,11500.0,1977-10-10 00:00:00.000000,...,3.98577,7.491708,First Bank,,Permanent,,10000.0,12250.0,-15.0,Good
4,2017-07-03 23:42:45.000000,2017-07-03 22:42:39.000000,8a858970548359cc0154883481981866,40000.0,9,,301962360,30,44000.0,1986-09-07 00:00:00.000000,...,7.457913,9.076574,GT Bank,,Permanent,Primary,18750.0,23550.0,25.0,Good


In [18]:
traindata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4368 entries, 0 to 4367
Data columns (total 21 columns):
approveddate                  4368 non-null object
creationdate                  4368 non-null object
customerid                    4368 non-null object
loanamount                    4368 non-null float64
loannumber                    4368 non-null int64
referredby                    587 non-null object
systemloanid                  4368 non-null int64
termdays                      4368 non-null int64
totaldue                      4368 non-null float64
birthdate                     4368 non-null object
bank_account_type             4368 non-null object
longitude_gps                 4368 non-null float64
latitude_gps                  4368 non-null float64
bank_name_clients             4368 non-null object
bank_branch_clients           43 non-null object
employment_status_clients     3711 non-null object
level_of_education_clients    605 non-null object
avg_loanamount               

### ---- Feature creation and processing ---

### 1- approveddate and creationdate are same for almost all records, will not carry much information - we can remove them

### 2 - Finding Impact of loanamount on class label

In [8]:
pd.crosstab(traindata.loanamount,traindata.good_bad_flag)

good_bad_flag,Bad,Good
loanamount,Unnamed: 1_level_1,Unnamed: 2_level_1
10000.0,655,1807
15000.0,11,71
20000.0,127,653
25000.0,4,27
30000.0,96,488
35000.0,1,9
40000.0,46,287
45000.0,0,1
50000.0,11,66
60000.0,1,7


#### Most of the bad loan are for 10000, we can create three ordinal values 1 for <= 20000 and 2 for <= 40000 and 3 for <=60000

In [117]:
def getLoanAmtRank(loanAmt):
    if( loanAmt <= 20000 ):
        return 1
    elif( loanAmt <= 40000):
        return 2
    else:
        return 3

In [118]:
traindata['rankloanamt'] = traindata['loanamount'].apply(getLoanAmtRank)

In [119]:
pd.crosstab(traindata.rankloanamt,traindata.good_bad_flag)

good_bad_flag,Bad,Good
rankloanamt,Unnamed: 1_level_1,Unnamed: 2_level_1
1,793,2531
2,147,811
3,12,74


### 3 - Finding impact of loan number on class label

In [10]:
pd.crosstab(traindata.loannumber,traindata.good_bad_flag)

good_bad_flag,Bad,Good
loannumber,Unnamed: 1_level_1,Unnamed: 2_level_1
2,347,1046
3,179,491
4,111,332
5,52,274
6,56,191
7,41,211
8,37,183
9,30,150
10,28,163
11,16,97


#### Loan number > 18 are pure. We can divide this column into two ordinal values 1 for <= 18 and 2 for > 18

In [120]:
def getLoanNumberRank(loannumber):
    if( loannumber < 18 ):
        return 1
    else:
        return 2

In [121]:
traindata['rankloannum'] = traindata['loannumber'].apply(getLoanNumberRank)

In [122]:
pd.crosstab(traindata.rankloannum, traindata.good_bad_flag)

good_bad_flag,Bad,Good
rankloannum,Unnamed: 1_level_1,Unnamed: 2_level_1
1,950,3399
2,2,17


### 4 - Impact of referredby on class label

In [137]:
pd.crosstab(traindata.referredby.isnull(),traindata.good_bad_flag)

good_bad_flag,Bad,Good
referredby,Unnamed: 1_level_1,Unnamed: 2_level_1
False,134,453
True,818,2963


#### There is relation, if loan is through rferral then most of the loans are bad, we can create binary attribute for this

#### if referred then 1 else 0

In [150]:
traindata['referredby'] = traindata['referredby'].fillna(0)

In [151]:
def isReferred(reffered):
    if( reffered == 0 ):
        return 0
    else:
        return 1

In [153]:
traindata['isreffered'] = traindata['referredby'].apply(isReferred)

In [156]:
pd.crosstab(traindata.isreffered,traindata.good_bad_flag)

good_bad_flag,Bad,Good
isreffered,Unnamed: 1_level_1,Unnamed: 2_level_1
0,818,2963
1,134,453


### 5 - Impact of systemloanid on class label

In [15]:
traindata['systemloanid'].unique().shape

(4368,)

#### This is unique number and it has no impact on class label and we can drop this column

### 6 - Impact of termdays on class label

In [17]:
pd.crosstab(traindata.termdays,traindata.good_bad_flag)

good_bad_flag,Bad,Good
termdays,Unnamed: 1_level_1,Unnamed: 2_level_1
15,208,711
30,673,2456
60,65,223
90,6,26


#### There is no clear relation here, we can let this categorical data as it is

### 7 - Impact of totaldue on class label

#### totaldue - loanamount can give interest amount - we can find impact of interest amount on class label

In [19]:
traindata['interest'] = traindata['totaldue'] - traindata['loanamount']

In [20]:
traindata.head()

Unnamed: 0,approveddate,creationdate,customerid,loanamount,loannumber,referredby,systemloanid,termdays,totaldue,birthdate,...,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients,avg_loanamount,avg_totaldue,repaymentdays,good_bad_flag,interest
0,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,8a2a81a74ce8c05d014cfb32a0da1049,30000.0,12,,301994762,30,34500.0,1972-01-15 00:00:00.000000,...,6.433055,Diamond Bank,,Permanent,Post-Graduate,18181.818182,22081.818182,10.0,Good,4500.0
1,2017-07-05 17:04:41.000000,2017-07-05 16:04:18.000000,8a85886e54beabf90154c0a29ae757c0,15000.0,2,,301965204,30,17250.0,1985-08-23 00:00:00.000000,...,7.3207,GT Bank,"DUGBE,IBADAN",Permanent,Graduate,0.0,0.0,0.0,Good,2250.0
2,2017-07-06 14:52:57.000000,2017-07-06 13:52:51.000000,8a8588f35438fe12015444567666018e,20000.0,7,,301966580,15,22250.0,1984-09-18 00:00:00.000000,...,10.292041,EcoBank,,Permanent,,10000.0,11750.0,-5.0,Good,2250.0
3,2017-07-27 19:00:41.000000,2017-07-27 18:00:35.000000,8a85890754145ace015429211b513e16,10000.0,3,,301999343,15,11500.0,1977-10-10 00:00:00.000000,...,7.491708,First Bank,,Permanent,,10000.0,12250.0,-15.0,Good,1500.0
4,2017-07-03 23:42:45.000000,2017-07-03 22:42:39.000000,8a858970548359cc0154883481981866,40000.0,9,,301962360,30,44000.0,1986-09-07 00:00:00.000000,...,9.076574,GT Bank,,Permanent,Primary,18750.0,23550.0,25.0,Good,4000.0


In [21]:
pd.crosstab(traindata.interest,traindata.good_bad_flag)

good_bad_flag,Bad,Good
interest,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0,1
250.0,0,1
500.0,0,3
750.0,0,14
1000.0,0,19
1125.0,2,37
1500.0,174,555
1687.5,2,16
1750.0,1,2
2000.0,0,13


#### No clear relation for this continuous attribute. We can drop totaldue and interest

### 8 - Impact of birthdate (age) on class label

#### age is loan approve date - birth date

In [23]:
def createDate(strDate):
    strDate = strDate[:10]
    s = strDate.split('-')
    s = [int(y) for y in s]  
    s = date(s[0], s[1], s[2])
    return s

In [52]:
def getAgeInYears(strDays):
    strDays = str(strDays)
    s = strDays.split(" ")
    return math.floor(int(s[0])/365)

In [25]:
traindata['approveddate'] = traindata['approveddate'].apply(createDate)

In [26]:
traindata['birthdate'] = traindata['birthdate'].apply(createDate)

In [53]:
traindata['age'] = traindata['approveddate'] - traindata['birthdate']

In [54]:
traindata.head()

Unnamed: 0,approveddate,creationdate,customerid,loanamount,loannumber,referredby,systemloanid,termdays,totaldue,birthdate,...,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients,avg_loanamount,avg_totaldue,repaymentdays,good_bad_flag,interest,age
0,2017-07-25,2017-07-25 07:22:47.000000,8a2a81a74ce8c05d014cfb32a0da1049,30000.0,12,,301994762,30,34500.0,1972-01-15,...,Diamond Bank,,Permanent,Post-Graduate,18181.818182,22081.818182,10.0,Good,4500.0,16628 days
1,2017-07-05,2017-07-05 16:04:18.000000,8a85886e54beabf90154c0a29ae757c0,15000.0,2,,301965204,30,17250.0,1985-08-23,...,GT Bank,"DUGBE,IBADAN",Permanent,Graduate,0.0,0.0,0.0,Good,2250.0,11639 days
2,2017-07-06,2017-07-06 13:52:51.000000,8a8588f35438fe12015444567666018e,20000.0,7,,301966580,15,22250.0,1984-09-18,...,EcoBank,,Permanent,,10000.0,11750.0,-5.0,Good,2250.0,11979 days
3,2017-07-27,2017-07-27 18:00:35.000000,8a85890754145ace015429211b513e16,10000.0,3,,301999343,15,11500.0,1977-10-10,...,First Bank,,Permanent,,10000.0,12250.0,-15.0,Good,1500.0,14535 days
4,2017-07-03,2017-07-03 22:42:39.000000,8a858970548359cc0154883481981866,40000.0,9,,301962360,30,44000.0,1986-09-07,...,GT Bank,,Permanent,Primary,18750.0,23550.0,25.0,Good,4000.0,11257 days


In [55]:
traindata['age'] = traindata['age'].apply(getAgeInYears)

In [56]:
traindata.head()

Unnamed: 0,approveddate,creationdate,customerid,loanamount,loannumber,referredby,systemloanid,termdays,totaldue,birthdate,...,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients,avg_loanamount,avg_totaldue,repaymentdays,good_bad_flag,interest,age
0,2017-07-25,2017-07-25 07:22:47.000000,8a2a81a74ce8c05d014cfb32a0da1049,30000.0,12,,301994762,30,34500.0,1972-01-15,...,Diamond Bank,,Permanent,Post-Graduate,18181.818182,22081.818182,10.0,Good,4500.0,45
1,2017-07-05,2017-07-05 16:04:18.000000,8a85886e54beabf90154c0a29ae757c0,15000.0,2,,301965204,30,17250.0,1985-08-23,...,GT Bank,"DUGBE,IBADAN",Permanent,Graduate,0.0,0.0,0.0,Good,2250.0,31
2,2017-07-06,2017-07-06 13:52:51.000000,8a8588f35438fe12015444567666018e,20000.0,7,,301966580,15,22250.0,1984-09-18,...,EcoBank,,Permanent,,10000.0,11750.0,-5.0,Good,2250.0,32
3,2017-07-27,2017-07-27 18:00:35.000000,8a85890754145ace015429211b513e16,10000.0,3,,301999343,15,11500.0,1977-10-10,...,First Bank,,Permanent,,10000.0,12250.0,-15.0,Good,1500.0,39
4,2017-07-03,2017-07-03 22:42:39.000000,8a858970548359cc0154883481981866,40000.0,9,,301962360,30,44000.0,1986-09-07,...,GT Bank,,Permanent,Primary,18750.0,23550.0,25.0,Good,4000.0,30


In [57]:
pd.crosstab(traindata.age,traindata.good_bad_flag)

good_bad_flag,Bad,Good
age,Unnamed: 1_level_1,Unnamed: 2_level_1
21,6,17
22,30,42
23,26,83
24,42,112
25,54,123
26,45,130
27,56,163
28,60,212
29,49,227
30,68,212


#### old adult > 45 have more pure values. We can divide this into two categorical values - 1 for <= 45 and 2 for > 45

In [159]:
def getAgeRank(age):
    if( age <= 45 ):
        return 1
    else:
        return 2

In [160]:
traindata['agerank'] = traindata['age'].apply(getAgeRank)

In [161]:
pd.crosstab(traindata.agerank,traindata.good_bad_flag)

good_bad_flag,Bad,Good
agerank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,927,3291
2,25,125


### 9 - Impact of bank account type

In [60]:
pd.crosstab(traindata.bank_account_type,traindata.good_bad_flag)

good_bad_flag,Bad,Good
bank_account_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Current,7,57
Other,131,756
Savings,814,2603


#### we can use this categorical column as it is

### 10 - impact of longitude_gps & latitude_gps  on class label

In [61]:
pd.crosstab(traindata.longitude_gps,traindata.good_bad_flag)

good_bad_flag,Bad,Good
longitude_gps,Unnamed: 1_level_1,Unnamed: 2_level_1
-149.033700,1,0
-118.247009,0,1
-117.822509,1,0
-98.495394,1,0
-95.737058,0,1
-95.693698,0,1
-95.609958,0,1
-95.337212,0,1
-92.391699,1,0
-84.637253,0,1


In [62]:
pd.crosstab(traindata.latitude_gps,traindata.good_bad_flag)

good_bad_flag,Bad,Good
latitude_gps,Unnamed: 1_level_1,Unnamed: 2_level_1
-33.868818,0,1
-8.348493,1,0
4.384586,0,1
4.400570,0,1
4.401684,1,0
4.417736,0,1
4.432237,0,1
4.438133,1,0
4.505915,0,1
4.545390,0,1


#### both these continous attribute doesnt carry much information in relation with class label

### 11 - impact of bank_name_clients on class label

In [64]:
pd.crosstab(traindata.bank_name_clients,traindata.good_bad_flag)

good_bad_flag,Bad,Good
bank_name_clients,Unnamed: 1_level_1,Unnamed: 2_level_1
Access Bank,101,328
Diamond Bank,42,256
EcoBank,41,117
FCMB,29,89
Fidelity Bank,27,76
First Bank,133,442
GT Bank,348,1256
Heritage Bank,3,19
Keystone Bank,4,16
Skye Bank,32,79


#### data is spread uniformally across all bank - no much impact on claa label

### 12 - Impact of bank branch client on class label

In [67]:
pd.crosstab(traindata.bank_branch_clients,traindata.good_bad_flag)

good_bad_flag,Bad,Good
bank_branch_clients,Unnamed: 1_level_1,Unnamed: 2_level_1
IDI - ORO MUSHIN,1,0
"17, SANUSI FAFUNWA STREET, VICTORIA ISLAND, LAGOS",0,1
"3, OBA AKRAN",0,1
"40,SAPELE ROAD ,OPPOSITE DUMAZ JUNCTION BENIN CITY EDO STATE.",0,1
ABEOKUTA,0,1
ABULE EGBA,0,1
"ACCESS BANK PLC, CHALLENGE ROUNDABOUT IBADAN, OYO STATE.",1,0
ADEOLA HOPEWELL,0,1
ADETOKUNBO ADEMOLA,0,1
AJOSE ADEOGUN,0,1


#### we can remove this attribute as there is very less data available 

### 13 - Impact of employment_status_clients on class label

In [71]:
traindata['employment_status_clients'].isnull().sum()

657

### Fill empty with next valid observation

In [80]:
traindata['employment_status_clients'] = traindata['employment_status_clients'].fillna(method='bfill')

In [81]:
pd.crosstab(traindata.employment_status_clients,traindata.good_bad_flag)

good_bad_flag,Bad,Good
employment_status_clients,Unnamed: 1_level_1,Unnamed: 2_level_1
Contract,1,3
Permanent,792,2896
Retired,4,6
Self-Employed,87,339
Student,53,117
Unemployed,14,55


#### we can use these categorical values as it is

### 14 - Impact of level_of_education_clients on class label

In [84]:
traindata['level_of_education_clients'].isnull().sum()

3763

#### there are many missing values - we can remove this attribute

### 15 - Impact of avg_loanamount and avg_totaldue

In [85]:
pd.crosstab(traindata.avg_loanamount,traindata.good_bad_flag)

good_bad_flag,Bad,Good
avg_loanamount,Unnamed: 1_level_1,Unnamed: 2_level_1
0.000000,3,6
5000.000000,0,1
7500.000000,0,1
8000.000000,0,1
8333.333333,0,2
8750.000000,0,2
9000.000000,0,4
9166.666667,0,1
9250.000000,0,1
9333.333333,0,1


In [112]:
def getAvgLoanamtRank(avgLoanamt):
    if( avgLoanamt < 10000 ):
        return 1
    elif( avgLoanamt < 20000):
        return 2
    elif( avgLoanamt < 30000):
        return 3
    else:
        return 4

In [113]:
traindata['avgloanamtrank'] = traindata['avg_loanamount'].apply(getAvgLoanamtRank)

In [114]:
pd.crosstab(traindata.avgloanamtrank,traindata.good_bad_flag)

good_bad_flag,Bad,Good
avgloanamtrank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,25
2,875,2945
3,66,422
4,5,24


In [94]:
traindata['avg_interest'] = traindata['avg_totaldue'] - traindata['avg_loanamount']

In [95]:
def getInt(num):
    return math.floor(num)

In [96]:
traindata['avg_interest'] = traindata['avg_interest'].apply(getInt)

In [98]:
pd.crosstab(traindata.avg_interest,traindata.good_bad_flag)

good_bad_flag,Bad,Good
avg_interest,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3,6
750,0,1
1125,0,1
1225,0,1
1241,0,1
1250,0,2
1312,0,3
1332,0,1
1333,1,1
1339,0,1


#### Try creating four categories 

In [107]:
def getInterestRank(avgInterest):
    if( avgInterest < 2500 ):
        return 1
    elif( avgInterest < 5000):
        return 2
    elif( avgInterest < 7500):
        return 3
    else:
        return 4

In [108]:
traindata['avginterestrank'] = traindata['avg_interest'].apply(getInterestRank)

In [109]:
pd.crosstab(traindata.avginterestrank,traindata.good_bad_flag)

good_bad_flag,Bad,Good
avginterestrank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,341,1102
2,596,2273
3,14,40
4,1,1


#### Not much information with this attribute - We can drop avg_interest, avginterestrank

### 16 - Impact of repaymentdays on class label

In [99]:
pd.crosstab(traindata.repaymentdays,traindata.good_bad_flag)

good_bad_flag,Bad,Good
repaymentdays,Unnamed: 1_level_1,Unnamed: 2_level_1
-363.0,1,0
-285.0,1,0
-227.0,1,0
-191.0,0,1
-171.0,0,1
-164.0,1,0
-162.0,1,0
-160.0,0,1
-152.0,0,1
-142.0,0,1


#### repayment days show impact on class label. negative repayment days have more bad loan

#### we can split range (-350 to 400) into 4 categorical rank 1 < -200, 2 < 0, 3 < 200, 4 < 400 

In [100]:
def getRepaymetRank(repaymentDays):
    if( repaymentDays < -200 ):
        return 1
    elif( repaymentDays < 0):
        return 2
    elif( repaymentDays < 200):
        return 3
    else:
        return 4

In [101]:
traindata['repaymentrank'] = traindata['repaymentdays'].apply(getRepaymetRank)

In [102]:
pd.crosstab(traindata.repaymentrank,traindata.good_bad_flag)

good_bad_flag,Bad,Good
repaymentrank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,0
2,381,518
3,566,2896
4,2,2


#### repaymentrank is pure for rank 1, we can drop repaymentdays

In [163]:
traindata.head()

Unnamed: 0,approveddate,creationdate,customerid,loanamount,loannumber,referredby,systemloanid,termdays,totaldue,birthdate,...,interest,age,avg_interest,repaymentrank,avginterestrank,avgloanamtrank,rankloanamt,rankloannum,isreffered,agerank
0,2017-07-25,2017-07-25 07:22:47.000000,8a2a81a74ce8c05d014cfb32a0da1049,30000.0,12,0,301994762,30,34500.0,1972-01-15,...,4500.0,45,3900,3,2,2,2,1,0,1
1,2017-07-05,2017-07-05 16:04:18.000000,8a85886e54beabf90154c0a29ae757c0,15000.0,2,0,301965204,30,17250.0,1985-08-23,...,2250.0,31,0,3,1,1,1,1,0,1
2,2017-07-06,2017-07-06 13:52:51.000000,8a8588f35438fe12015444567666018e,20000.0,7,0,301966580,15,22250.0,1984-09-18,...,2250.0,32,1750,2,1,2,1,1,0,1
3,2017-07-27,2017-07-27 18:00:35.000000,8a85890754145ace015429211b513e16,10000.0,3,0,301999343,15,11500.0,1977-10-10,...,1500.0,39,2250,2,1,2,1,1,0,1
4,2017-07-03,2017-07-03 22:42:39.000000,8a858970548359cc0154883481981866,40000.0,9,0,301962360,30,44000.0,1986-09-07,...,4000.0,30,4800,3,2,2,2,1,0,1


### --- Feature Reduction - Removing unwanted feature ---

In [None]:
traindata.drop(['approveddate','creationdate','customerid','loanamount','loannumber','referredby',
                'systemloanid','totaldue','birthdate','longitude_gps','latitude_gps','bank_name_clients',
               'bank_branch_clients','level_of_education_clients','avg_loanamount','avg_totaldue',
                'avg_interest','avginterestrank'],axis=1, inplace=True)

In [167]:
traindata.drop(['interest','age'],axis=1, inplace=True)

In [169]:
traindata.drop(['repaymentdays'],axis=1, inplace=True)

In [170]:
traindata.head()

Unnamed: 0,termdays,bank_account_type,employment_status_clients,good_bad_flag,repaymentrank,avgloanamtrank,rankloanamt,rankloannum,isreffered,agerank
0,30,Other,Permanent,Good,3,2,2,1,0,1
1,30,Savings,Permanent,Good,3,1,1,1,0,1
2,15,Other,Permanent,Good,2,2,1,1,0,1
3,15,Savings,Permanent,Good,2,2,1,1,0,1
4,30,Other,Permanent,Good,3,2,2,1,0,1


In [171]:
traindata.shape

(4368, 10)

In [172]:
def getSample(size, df):
    size1 = int(round(size * 0.50))
    size2 = size - size1
    data_good = df[(df['good_bad_flag'] == 'Good')]
    data_bad = df[(df['good_bad_flag'] == 'Bad')]
    data_sample_good = data_good.sample(n=size1)
    data_sample_bad = data_bad.sample(n=size2)
    return pd.concat((data_sample_good, data_sample_bad))

### Reordering to move class label at end

In [174]:
columns = [column for column in traindata.columns if column != 'good_bad_flag']

In [175]:
columns = columns + ['good_bad_flag']

In [176]:
traindata = traindata[columns]

In [177]:
traindata.head()

Unnamed: 0,termdays,bank_account_type,employment_status_clients,repaymentrank,avgloanamtrank,rankloanamt,rankloannum,isreffered,agerank,good_bad_flag
0,30,Other,Permanent,3,2,2,1,0,1,Good
1,30,Savings,Permanent,3,1,1,1,0,1,Good
2,15,Other,Permanent,2,2,1,1,0,1,Good
3,15,Savings,Permanent,2,2,1,1,0,1,Good
4,30,Other,Permanent,3,2,2,1,0,1,Good


In [178]:
train_600 = getSample(600, traindata)