# EDA - Income Census Adult Dataset 

In [169]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.inspection import permutation_importance
import shap

from scipy import stats
from pandas_profiling import ProfileReport

In [170]:
adult_trainset = pd.read_csv('./data/adult.data.csv')
adult_testset = pd.read_csv('./data/adult.test.csv')

In [171]:
#Fix test set index
adult_testset.reset_index(inplace=True)

In [172]:
# Adding the columns
adult_trainset.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
adult_testset.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

#Remove the . in the income
adult_testset["income"] = adult_testset["income"].str.replace(".","")

#Replace ' ?' with NAN
adult_trainset.replace(' ?', np.NaN, inplace=True)

The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


In [175]:
for i in adult_trainset.columns:
    print(adult_trainset[i].value_counts())

36    898
31    888
34    886
23    877
35    876
     ... 
83      6
88      3
85      3
86      1
87      1
Name: age, Length: 73, dtype: int64
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1297
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
164190    13
203488    13
123011    13
148995    12
126675    12
          ..
325573     1
140176     1
318264     1
329205     1
257302     1
Name: fnlwgt, Length: 21647, dtype: int64
 HS-grad         10501
 Some-college     7291
 Bachelors        5354
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64
9     10501
10     7291
1

In [176]:
adult_trainset.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [177]:
adult_trainset["workclass"].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1297
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [178]:
adult_trainset["occupation"].value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3769
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

In [179]:
adult_trainset["native-country"].value_counts()

 United-States                 29169
 Mexico                          643
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 France                           29
 

In [180]:
adult_trainset.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [215]:
#Imputing missing values

#WORK CLASS
#Method: by frequency of tuples
workclass_frequency = adult_trainset["workclass"].value_counts(normalize=True)
workclass_labels = workclass_frequency.index
workclass_freqvalues = workclass_frequency.values
workclass_nan_rows = adult_trainset["workclass"].isna()
workclass_size = len(adult_trainset[workclass_nan_rows])

adult_trainset.loc[workclass_nan_rows, 'workclass'] = np.random.choice(workclass_labels, size=workclass_size, p=workclass_freqvalues)

#OCCUPATION
#Method: by frequency of tuples
occupation_frequency = adult_trainset["occupation"].value_counts(normalize=True)
occupation_labels = occupation_frequency.index
occupation_freqvalues = occupation_frequency.values
occupation_nan_rows = adult_trainset["occupation"].isna()
occupation_size = len(adult_trainset[occupation_nan_rows])

adult_trainset.loc[occupation_nan_rows, 'occupation'] = np.random.choice(occupation_labels, size=occupation_size, p=occupation_freqvalues)

#NATIVE COUNTRY
#Method: by mode
adult_trainset['native-country'] = adult_trainset['native-country'].fillna(adult_trainset['native-country'].mode()[0])

In [217]:
adult_trainset.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [218]:
adult_trainset.to_csv("adult_train_imputed.csv", index=False)

In [219]:
# A helper function to split datasets into X records and y labels

def X_y_split(df):
    """
    Splits a dataframe into X and y, e.g. X_train and y_train
    """
    X = df.drop([df.columns[-1]], axis = 1)
    y = df[df.columns[-1]]
           
    return X, y

In [220]:
X_train, y_train = X_y_split(adult_trainset)

In [221]:
X_test, y_test = X_y_split(adult_testset)

In [222]:
len_features = X_train.shape[1]

# columns with missing values: occupation, native country, workclass
# Imputation

# Monday tasks
- Look at correlation of those missing values with other dimensions.
    - Look if there are any patterns with the missing values
        ex. Workclass has missing values. If all of them were female, that would inform missing value imputations.
        KNN might be helpful.
        
can reveal systemic issues to the dataset. 

In [154]:
X_train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [225]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

categorical_feats = []
for i in X_train.columns:
    if type(X_train[i][0]) == str:
        categorical_feats.append(i)
        
for i in categorical_feats:
    X_train[i] = le.fit_transform(X_train[i])
    
for i in categorical_feats:
    X_test[i] = le.fit_transform(X_test[i])

In [226]:
X_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,38
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,38
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,38
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,38
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,4,215419,9,13,0,10,1,4,0,0,0,36,38
16277,64,0,321403,11,9,6,0,2,2,1,0,0,40,38
16278,38,4,374983,9,13,2,10,0,4,1,0,0,50,38
16279,44,4,83891,9,13,0,1,3,1,1,5455,0,40,38


In [227]:
ytrain = le.fit_transform(ytrain)
ytest = le.fit_transform(ytest)
ytrain #<=50k --> 0

array([0, 0, 0, ..., 0, 0, 1])

In [159]:
ytrain

array([0, 0, 0, ..., 0, 0, 1])

In [160]:
ytest

array([0, 0, 1, ..., 0, 0, 1])

In [231]:
mtry

[1, 3, 7]

In [230]:
ntree = [500, 1000]
mtry = [int(0.5*np.sqrt(len_features)), int(np.sqrt(len_features)), int(2*np.sqrt(len_features))]

for i in ntree:
    for j in mtry:
        RF_clf = RandomForestClassifier(n_estimators=i, max_features=j, oob_score=True)
        RF_clf.fit(X_train, y_train)
        print(f"Random Forest Model ntree={i} mtry={j}")
        print(f"RF score {RF_clf.score(X_test, y_test)}")
        print(f"OOB score {RF_clf.oob_score_}")
        print("-----------------------------------\n")

Random Forest Model ntree=500 mtry=1
RF score 0.8508076899453351
OOB score 0.856081081081081
-----------------------------------

Random Forest Model ntree=500 mtry=3
RF score 0.8490264725753947
OOB score 0.8585995085995086
-----------------------------------

Random Forest Model ntree=500 mtry=7
RF score 0.8462010932989374
OOB score 0.8569103194103194
-----------------------------------

Random Forest Model ntree=1000 mtry=1
RF score 0.8499477919046742
OOB score 0.8577088452088452
-----------------------------------

Random Forest Model ntree=1000 mtry=3
RF score 0.8505005835022419
OOB score 0.8589373464373464
-----------------------------------

Random Forest Model ntree=1000 mtry=7
RF score 0.8463853571647934
OOB score 0.8584152334152334
-----------------------------------



BEST MODEL: ntree 1000, mtry 3

In [232]:
RF_clf_1 = RandomForestClassifier(n_estimators=1000, 
                                  max_features=int(np.sqrt(len_features)), 
                                  oob_score=True)

In [233]:
cv_scores = cross_val_score(RF_clf_1, X_train, y_train, cv=StratifiedKFold(n_splits=3, shuffle=True))

In [234]:
cv_scores

array([0.855445  , 0.85819589, 0.85911729])

In [236]:
skf = StratifiedKFold(n_splits=3, random_state=44, shuffle=True)
counter = 0
for train_index, test_index in skf.split(X_train, y_train):
    counter += 1
    print(f"FOLD {counter}")
    X_train_skf, X_test_skf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_skf, y_test_skf = y_train.iloc[train_index], y_train.iloc[test_index]
    print("X_train shape", X_train_skf.shape)
    print("y_train shape", y_train_skf.shape)
    print("X_test shape", X_test_skf.shape)
    print("y_test shape", y_test_skf.shape)
    
    RF_clf_1.fit(X_train_skf, y_train_skf)
    y_pred = RF_clf_1.predict(X_test_skf)
    print(f"Confusion Matrix for FOLD {counter}")
    print(confusion_matrix(y_test_skf, y_pred))
    print(classification_report(y_test_skf,y_pred))
    print(f"---------------------------------------------------------")

FOLD 1
X_train shape (21706, 14)
y_train shape (21706,)
X_test shape (10854, 14)
y_test shape (10854,)
Confusion Matrix for FOLD 1
[[7703  537]
 [ 998 1616]]
              precision    recall  f1-score   support

       <=50K       0.89      0.93      0.91      8240
        >50K       0.75      0.62      0.68      2614

    accuracy                           0.86     10854
   macro avg       0.82      0.78      0.79     10854
weighted avg       0.85      0.86      0.85     10854

---------------------------------------------------------
FOLD 2
X_train shape (21707, 14)
y_train shape (21707,)
X_test shape (10853, 14)
y_test shape (10853,)
Confusion Matrix for FOLD 2
[[7619  621]
 [ 956 1657]]
              precision    recall  f1-score   support

       <=50K       0.89      0.92      0.91      8240
        >50K       0.73      0.63      0.68      2613

    accuracy                           0.85     10853
   macro avg       0.81      0.78      0.79     10853
weighted avg       0.85    

In [None]:
explainer = shap.TreeExplainer(RF_clf_1)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=10)