In [310]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

### Load data

In [365]:
adult_dataset = pd.read_csv('./data/adult_dataset.csv')

In [366]:
adult_dataset.shape

(48842, 15)

In [367]:
adult_dataset

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [368]:
adult_dataset[(adult_dataset['class'] == '<=50K')].shape

(37155, 15)

In [369]:
adult_dataset[(adult_dataset['class'] == '>50K')].shape

(11687, 15)

## Attribute selection

In [370]:
pd.crosstab(adult_dataset.age,adult_dataset['class'])

class,<=50K,>50K
age,Unnamed: 1_level_1,Unnamed: 2_level_1
17,595,0
18,862,0
19,1050,3
20,1112,1
21,1090,6
22,1161,17
23,1307,22
24,1162,44
25,1119,76
26,1068,85


### Age helps predict the class - almost everyone in the youngest age groups earns <=50K

In [374]:
adult_dataset['ageState'] = np.where(adult_dataset['age'] > 22, 'Old', 'Young')

In [375]:
pd.crosstab(adult_dataset['ageState'] , adult_dataset['class'])

class,<=50K,>50K
ageState,Unnamed: 1_level_1,Unnamed: 2_level_1
Old,31285,11660
Young,5870,27


In [376]:
pd.crosstab(adult_dataset.relationship, adult_dataset['class'])

class,<=50K,>50K
relationship,Unnamed: 1_level_1,Unnamed: 2_level_1
Husband,10870,8846
Not-in-family,11307,1276
Other-relative,1454,52
Own-child,7470,111
Unmarried,4816,309
Wife,1238,1093


In [377]:
def checkRelationship(rel):
    """Bucket a relationship label into 'Family' or 'NonFamily'.

    'Husband' and 'Wife' (ignoring surrounding whitespace) map to 'Family';
    every other label maps to 'NonFamily'.
    """
    spouse_labels = ('Husband', 'Wife')
    return 'Family' if rel.strip() in spouse_labels else 'NonFamily'
adult_dataset['relationshipState'] = adult_dataset['relationship'].apply(checkRelationship)

In [378]:
pd.crosstab(adult_dataset.relationshipState, adult_dataset['class'])

class,<=50K,>50K
relationshipState,Unnamed: 1_level_1,Unnamed: 2_level_1
Family,12108,9939
NonFamily,25047,1748


In [322]:
pd.crosstab(adult_dataset['capital-gain'], adult_dataset['class'])

class,<=50K,>50K
capital-gain,Unnamed: 1_level_1,Unnamed: 2_level_1
0,35611,9196
114,8,0
401,5,0
594,52,0
914,10,0
991,6,0
1055,37,0
1086,8,0
1111,1,0
1151,13,0


In [379]:
def checkCapitalGain(gain):
    """Bin a capital-gain amount into 'Unknown', 'Low' or 'High'.

    A gain of exactly 0 is labelled 'Unknown', any other value below 7000
    is 'Low', and everything else is 'High'.
    """
    if gain == 0:
        return 'Unknown'
    return 'Low' if gain < 7000 else 'High'
adult_dataset['capitalGainState'] = adult_dataset['capital-gain'].apply(checkCapitalGain)

In [380]:
pd.crosstab(adult_dataset['capitalGainState'], adult_dataset['class'])

class,<=50K,>50K
capitalGainState,Unnamed: 1_level_1,Unnamed: 2_level_1
High,28,2027
Low,1516,464
Unknown,35611,9196


In [381]:
pd.crosstab(adult_dataset['native-country'], adult_dataset['class'])

class,<=50K,>50K
native-country,Unnamed: 1_level_1,Unnamed: 2_level_1
?,637,220
Cambodia,19,9
Canada,119,63
China,86,36
Columbia,81,4
Cuba,104,34
Dominican-Republic,98,5
Ecuador,39,6
El-Salvador,144,11
England,80,47


In [382]:
def checkCountry(ctry):
    """Map a native-country label to a coarse group: 'Unknown', 'T1' or 'T2'.

    '?' (ignoring surrounding whitespace) becomes 'Unknown'; the countries
    listed below become 'T1'; anything else becomes 'T2'.
    """
    # NOTE(review): presumably the countries with a low share of >50K earners
    # in the crosstab above - confirm the selection rule.
    t1_countries = [
        'Honduras', 'Holand-Netherlands', 'Jamaica', 'Laos',
        'Dominican-Republic', 'Vietnam', 'Columbia', 'El-Salvador',
        'Guatemala', 'Haiti', 'Outlying-US(Guam-USVI-etc)', 'Puerto-Rico',
        'Mexico',
    ]
    name = ctry.strip()
    if name == '?':
        return 'Unknown'
    return 'T1' if name in t1_countries else 'T2'
adult_dataset['geoLoc'] = adult_dataset['native-country'].apply(checkCountry)

In [383]:
pd.crosstab(adult_dataset['geoLoc'], adult_dataset['class'])

class,<=50K,>50K
geoLoc,Unnamed: 1_level_1,Unnamed: 2_level_1
T1,1774,126
T2,34744,11341
Unknown,637,220


In [384]:
pd.crosstab(adult_dataset['education-num'], adult_dataset['class'])

class,<=50K,>50K
education-num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,82,1
2,239,8
3,482,27
4,893,62
5,715,41
6,1302,87
7,1720,92
8,609,48
9,13281,2503
10,8815,2063


In [385]:
adult_dataset['edState'] = np.where(adult_dataset['education-num'] < 9, 'Low', 'High')

In [386]:
pd.crosstab(adult_dataset['edState'], adult_dataset['class'])

class,<=50K,>50K
edState,Unnamed: 1_level_1,Unnamed: 2_level_1
High,31113,11321
Low,6042,366


In [387]:
pd.crosstab(adult_dataset['marital-status'], adult_dataset['class'])

class,<=50K,>50K
marital-status,Unnamed: 1_level_1,Unnamed: 2_level_1
Divorced,5962,671
Married-AF-spouse,23,14
Married-civ-spouse,12395,9984
Married-spouse-absent,570,58
Never-married,15384,733
Separated,1431,99
Widowed,1390,128


In [388]:
def checkMarriedState(st):
    """Collapse a marital-status label into 'M' or 'S'.

    'Married-AF-spouse' and 'Married-civ-spouse' (ignoring surrounding
    whitespace) map to 'M'; every other status maps to 'S'.
    """
    married_labels = ('Married-AF-spouse', 'Married-civ-spouse')
    return 'M' if st.strip() in married_labels else 'S'
adult_dataset['maritalStat'] = adult_dataset['marital-status'].apply(checkMarriedState)

In [389]:
pd.crosstab(adult_dataset['maritalStat'], adult_dataset['class'])

class,<=50K,>50K
maritalStat,Unnamed: 1_level_1,Unnamed: 2_level_1
M,12418,9998
S,24737,1689


In [390]:
pd.crosstab(adult_dataset['workclass'], adult_dataset['class'])

class,<=50K,>50K
workclass,Unnamed: 1_level_1,Unnamed: 2_level_1
?,2534,265
Federal-gov,871,561
Local-gov,2209,927
Never-worked,10,0
Private,26519,7387
Self-emp-inc,757,938
Self-emp-not-inc,2785,1077
State-gov,1451,530
Without-pay,19,2


In [391]:
pd.crosstab(adult_dataset['race'], adult_dataset['class'])

class,<=50K,>50K
race,Unnamed: 1_level_1,Unnamed: 2_level_1
Amer-Indian-Eskimo,415,55
Asian-Pac-Islander,1110,409
Black,4119,566
Other,356,50
White,31155,10607


In [392]:
pd.crosstab(adult_dataset['occupation'], adult_dataset['class'])

class,<=50K,>50K
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
?,2544,265
Adm-clerical,4843,768
Armed-Forces,10,5
Craft-repair,4729,1383
Exec-managerial,3178,2908
Farming-fishing,1317,173
Handlers-cleaners,1934,138
Machine-op-inspct,2650,372
Other-service,4719,204
Priv-house-serv,239,3


In [393]:
adult_dataset['class-new'] = adult_dataset['class']

In [394]:
adult_dataset['class'] = np.where(adult_dataset['class-new'] == '<=50K', '0', '1')

In [395]:
adult_dataset[(adult_dataset['class'] == '0')].shape

(37155, 22)

In [396]:
adult_dataset[(adult_dataset['class'] == '1')].shape

(11687, 22)

In [397]:
adult_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 22 columns):
age                  48842 non-null int64
workclass            48842 non-null object
fnlwgt               48842 non-null int64
education            48842 non-null object
education-num        48842 non-null int64
marital-status       48842 non-null object
occupation           48842 non-null object
relationship         48842 non-null object
race                 48842 non-null object
sex                  48842 non-null object
capital-gain         48842 non-null int64
capital-loss         48842 non-null int64
hours-per-week       48842 non-null int64
native-country       48842 non-null object
class                48842 non-null object
ageState             48842 non-null object
relationshipState    48842 non-null object
capitalGainState     48842 non-null object
geoLoc               48842 non-null object
edState              48842 non-null object
maritalStat          48842 non-null o

### Drop unwanted features and reorder required features

In [398]:
adult_dataset.drop(['age','workclass','marital-status','race','occupation','fnlwgt','education','education-num','relationship','class-new',
                    'capital-gain','capital-loss','hours-per-week','native-country'],axis=1, inplace=True)

In [399]:
adult_dataset

Unnamed: 0,sex,class,ageState,relationshipState,capitalGainState,geoLoc,edState,maritalStat
0,Male,0,Old,NonFamily,Low,T2,High,S
1,Male,0,Old,Family,Unknown,T2,High,M
2,Male,0,Old,NonFamily,Unknown,T2,High,S
3,Male,0,Old,Family,Unknown,T2,Low,M
4,Female,0,Old,Family,Unknown,T2,High,M
5,Female,0,Old,Family,Unknown,T2,High,M
6,Female,0,Old,NonFamily,Unknown,T1,Low,S
7,Male,1,Old,Family,Unknown,T2,High,M
8,Female,1,Old,NonFamily,High,T2,High,S
9,Male,1,Old,Family,Low,T2,High,M


In [400]:
columns = [column for column in adult_dataset.columns if column != 'class']

In [401]:
columns = columns + ['class']

In [402]:
# Take an explicit copy: the reordered frame is assigned into column-wise later
# on, which would otherwise risk a SettingWithCopyWarning on the derived frame.
adult_dataset = adult_dataset[columns].copy()

In [403]:
adult_dataset

Unnamed: 0,sex,ageState,relationshipState,capitalGainState,geoLoc,edState,maritalStat,class
0,Male,Old,NonFamily,Low,T2,High,S,0
1,Male,Old,Family,Unknown,T2,High,M,0
2,Male,Old,NonFamily,Unknown,T2,High,S,0
3,Male,Old,Family,Unknown,T2,Low,M,0
4,Female,Old,Family,Unknown,T2,High,M,0
5,Female,Old,Family,Unknown,T2,High,M,0
6,Female,Old,NonFamily,Unknown,T1,Low,S,0
7,Male,Old,Family,Unknown,T2,High,M,1
8,Female,Old,NonFamily,High,T2,High,S,1
9,Male,Old,Family,Low,T2,High,M,1


In [404]:
adult_dataset[(adult_dataset['class'] == '0')].shape

(37155, 8)

In [405]:
adult_dataset[(adult_dataset['class'] == '1')].shape

(11687, 8)

In [406]:
def catToNum(series):
    """Encode a Series as integer category codes.

    The values are converted to the pandas 'category' dtype and the
    per-row integer codes are returned.
    """
    return series.astype('category').cat.codes

In [407]:
temp_dataset = adult_dataset[['sex','ageState','relationshipState','capitalGainState','geoLoc','edState','maritalStat']].apply(catToNum)
adult_dataset[['sex','ageState','relationshipState','capitalGainState','geoLoc','edState','maritalStat']] = temp_dataset

In [408]:
adult_dataset

Unnamed: 0,sex,ageState,relationshipState,capitalGainState,geoLoc,edState,maritalStat,class
0,1,0,1,1,1,0,1,0
1,1,0,0,2,1,0,0,0
2,1,0,1,2,1,0,1,0
3,1,0,0,2,1,1,0,0
4,0,0,0,2,1,0,0,0
5,0,0,0,2,1,0,0,0
6,0,0,1,2,0,1,1,0
7,1,0,0,2,1,0,0,1
8,0,0,1,0,1,0,1,1
9,1,0,0,1,1,0,0,1


In [409]:
adult_dataset.to_csv('./data/adult_dataset_preprocessed.csv',index=False)

In [410]:
def getSample(size, df, random_state=None):
    """Draw a class-balanced random sample of ``size`` rows from ``df``.

    Parameters
    ----------
    size : int
        Total number of rows to return. ``int(round(size * 0.50))`` rows are
        taken from class '0' and the remainder from class '1'.
    df : pandas.DataFrame
        Frame with a 'class' column holding the string labels '0' and '1'.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for both draws. Pass a fixed value
        to make the sample reproducible; the default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The class-'0' rows followed by the class-'1' rows, with their
        original index labels.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer rows than
        requested (sampling is without replacement).
    """
    n_less50 = int(round(size * 0.50))
    n_more50 = size - n_less50
    less50 = df[df['class'] == '0']
    more50 = df[df['class'] == '1']
    return pd.concat((
        less50.sample(n=n_less50, random_state=random_state),
        more50.sample(n=n_more50, random_state=random_state),
    ))

In [411]:
adult_ds_300 = getSample(300, adult_dataset)

In [None]:
adult_ds_300.to_csv('./data/adult_ds300_preprocessed.csv',index=False)

In [420]:
adult_ds_300.head()

Unnamed: 0,sex,ageState,relationshipState,capitalGainState,geoLoc,edState,maritalStat,class
9878,1,0,0,2,1,0,0,0
22114,1,0,0,2,1,0,0,0
38528,0,0,1,2,1,0,1,0
29258,1,0,1,2,1,0,1,0
41183,1,0,1,2,1,0,1,0


In [421]:
X = adult_ds_300[adult_ds_300.columns[0:7]] 

In [422]:
y = adult_ds_300['class']

In [423]:
# StratifiedShuffleSplit is not imported anywhere else in the notebook.
from sklearn.model_selection import StratifiedShuffleSplit

# Five stratified random splits; each holds out 120 rows for testing.
sss = StratifiedShuffleSplit(n_splits=5, test_size=120, random_state=0)

In [424]:
import os

# Make sure the output directory exists before any fold file is written.
os.makedirs("./data/fold", exist_ok=True)

# The target Series is `y` (lower case); `Y` was never defined.
for i, (train_index, test_index) in enumerate(sss.split(X, y), start=1):
    print(i, "TRAIN:", train_index, "TEST:", test_index)
    # The split indices are positional, hence .iloc. Despite the X_ prefix,
    # both frames keep every column of adult_ds_300, including 'class'.
    X_train, X_test = adult_ds_300.iloc[train_index], adult_ds_300.iloc[test_index]
    X_train.to_csv("./data/fold/fold" + str(i) + "_train.csv", index=False)
    X_test.to_csv("./data/fold/fold" + str(i) + "_test.csv", index=False)

1 TRAIN: [ 27  90 287 135 156  60 185 166 153 203  94  20 155 150 220 288 290 243
 197 138  33 238 244 154  49  66 240  80 276 123 257 149   4 277 218 107
 233  62 200  51  86 190 134 298 122 178 215 143 170  95 194  41 296  59
 101 144 183 209 136  30   5  15 262 139 282  56 168  14  93  43  55 202
  37 116  71 231 261  16   3 113 229 284 179  76 226  45  52 181  13 237
 145  50 225 157 281   2 137  46 207 159   7  10 263  63 236 201 239  18
  69  19 204 250  83 294 242  40  61 227 212 241 110  92 247  74 161   0
  89  73  44 274 280 189  24  97 152  75 283 111  64 199 291  54 224 187
  26 176 265 126 104 125  96 297 112 169  22   8 279 124 255 109  98 230
 196 267 118 130 106 162 253 158 108 146 214 141  12 206 129 268 272 160] TEST: [ 77 114  57  35 198 147  88 140  65  47 251 192   9  39 245  29 260 164
  23 264  32 254  81 269  17 292 208 270 128 115   6  99 121  38  42 132
 120 171 234 222  85 180 188  91 232 217  28 191  72 252 131 216 256 210
  84  68 246  34 221 127  79 186 29

In [493]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
def testModel(train, test, treefile, criteria):
    """Fit a decision tree on one fold, export it as Graphviz and print test metrics.

    Parameters
    ----------
    train : str
        Path of the training CSV; must contain the seven feature columns
        listed below plus a 'class' column.
    test : str
        Path of the test CSV with the same columns.
    treefile : str
        Path of the Graphviz .dot file to write (overwritten if present).
    criteria : str
        Forwarded to ``DecisionTreeClassifier`` as ``criterion``
        (the notebook uses 'gini' and 'entropy').

    Returns
    -------
    None
        Accuracy, recall, precision, F1, sensitivity and specificity are
        printed, each rounded to two decimals.
    """
    from sklearn import tree

    # Single source of truth for the model inputs (fit, export and predict).
    features = ["sex", "ageState", "relationshipState", "capitalGainState",
                "geoLoc", "edState", "maritalStat"]

    fold_train = pd.read_csv(train)
    fold_test = pd.read_csv(test)

    clf = DecisionTreeClassifier(criterion=criteria)
    clf.fit(fold_train[features], fold_train["class"])

    # export_graphviz writes into the open handle; its return value is not
    # needed, so the handle is no longer rebound inside the with-block.
    with open(treefile, "w") as f:
        tree.export_graphviz(clf, feature_names=features, out_file=f)

    actual = fold_test["class"]
    predictions = clf.predict(fold_test[features])

    # Pin the label order so the matrix is always 2x2 (tn, fp, fn, tp), even
    # when a fold's test rows or predictions contain only one class.
    tn, fp, fn, tp = confusion_matrix(actual, predictions, labels=[0, 1]).ravel()

    print("Accuracy -", round(accuracy_score(actual, predictions), 2))
    print("Recall -", round(recall_score(actual, predictions), 2))
    print("Precision -", round(precision_score(actual, predictions), 2))
    print("F1 -", round(f1_score(actual, predictions), 2))
    # Sensitivity is the same quantity as recall, computed from the matrix.
    print("Sensitivity -", round(tp / (tp + fn), 2))
    print("Specificity -", round(tn / (tn + fp), 2))

In [483]:
testModel('./data/fold/fold1_train.csv', './data/fold/fold1_test.csv', 'fold1-gini.dot','gini')

Accuracy - 0.8
Recall - 0.81
Precision - 0.81
F1 - 0.81
Sensitivity - 0.81
Specificity - 0.79


In [484]:
testModel('./data/fold/fold1_train.csv', './data/fold/fold1_test.csv', 'fold1-entropy.dot','entropy')

Accuracy - 0.8
Recall - 0.81
Precision - 0.81
F1 - 0.81
Sensitivity - 0.81
Specificity - 0.79


In [485]:
testModel('./data/fold/fold2_train.csv', './data/fold/fold2_test.csv', 'fold2-gini.dot','gini')

Accuracy - 0.85
Recall - 0.87
Precision - 0.84
F1 - 0.85
Sensitivity - 0.87
Specificity - 0.83


In [486]:
testModel('./data/fold/fold2_train.csv', './data/fold/fold2_test.csv', 'fold2-entropy.dot','entropy')

Accuracy - 0.87
Recall - 0.9
Precision - 0.84
F1 - 0.87
Sensitivity - 0.9
Specificity - 0.83


In [487]:
testModel('./data/fold/fold3_train.csv', './data/fold/fold3_test.csv', 'fold3-gini.dot','gini')

Accuracy - 0.78
Recall - 0.86
Precision - 0.73
F1 - 0.79
Sensitivity - 0.86
Specificity - 0.72


In [488]:
testModel('./data/fold/fold3_train.csv', './data/fold/fold3_test.csv', 'fold3-entropy.dot','entropy')

Accuracy - 0.78
Recall - 0.86
Precision - 0.73
F1 - 0.79
Sensitivity - 0.86
Specificity - 0.72


In [489]:
testModel('./data/fold/fold4_train.csv', './data/fold/fold4_test.csv', 'fold4-gini.dot','gini')

Accuracy - 0.78
Recall - 0.81
Precision - 0.75
F1 - 0.78
Sensitivity - 0.81
Specificity - 0.74


In [490]:
testModel('./data/fold/fold4_train.csv', './data/fold/fold4_test.csv', 'fold4-entropy.dot','entropy')

Accuracy - 0.78
Recall - 0.81
Precision - 0.75
F1 - 0.78
Sensitivity - 0.81
Specificity - 0.74


In [491]:
testModel('./data/fold/fold5_train.csv', './data/fold/fold5_test.csv', 'fold5-gini.dot','gini')

Accuracy - 0.82
Recall - 0.83
Precision - 0.81
F1 - 0.82
Sensitivity - 0.83
Specificity - 0.8


In [492]:
testModel('./data/fold/fold5_train.csv', './data/fold/fold5_test.csv', 'fold5-entropy.dot','entropy')

Accuracy - 0.81
Recall - 0.82
Precision - 0.8
F1 - 0.81
Sensitivity - 0.82
Specificity - 0.8
