In this binary classification problem we use two imputers, one of which is a custom class based one. We also handle our unbalanced nature of our dataset.

ref.: http://archive.ics.uci.edu/ml/datasets/Adult

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.utils import class_weight
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [2]:
columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
]

df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                      header=None, names=columns, sep=' *, *', engine='python')
df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                      header=None, names=columns, sep=' *, *', engine='python', skiprows=1)

In [3]:
df_train.shape, df_test.shape

((32561, 15), (16281, 15))

In [4]:
df_train.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Here we have inbalanced datasets. We handle this later:

In [5]:
df_train['Income'].value_counts(), df_test['Income'].value_counts()

(<=50K    24720
 >50K      7841
 Name: Income, dtype: int64,
 <=50K.    12435
 >50K.      3846
 Name: Income, dtype: int64)

In [6]:
df_train.dtypes

Age               int64
WorkClass        object
fnlwgt            int64
Education        object
EducationNum      int64
MaritalStatus    object
Occupation       object
Relationship     object
Race             object
Gender           object
CapitalGain       int64
CapitalLoss       int64
HoursPerWeek      int64
NativeCountry    object
Income           object
dtype: object

First 8 features are categorical, next 4 continuous, 2 dropped:

In [7]:
feature_names = ["WorkClass", "Education", "MaritalStatus", "Occupation", "Relationship", "Race", "Gender", "NativeCountry",
    "Age","fnlwgt", "EducationNum", "HoursPerWeek"]
X_train = df_train[feature_names].values
y_train = df_train['Income'].values
X_test = df_test[feature_names].values
y_test = df_test['Income'].values

Rows containing '?' values:

In [8]:
X_train[X_train == '?'].shape

(4262,)

In [9]:
X_test[X_test == '?'].shape

(2203,)

### SKLearn Imputer for demonstration purposes only:

In this case we know which columns are intented as we have to specify columns for the transformer:

In [10]:
tr_imp = ColumnTransformer([
    ('imp', SimpleImputer(missing_values='?', strategy='most_frequent', verbose=1), [0, 3, 7])],
    remainder='passthrough'
)

In [11]:
X_train_tr = tr_imp.fit_transform(X_train)

Folling structure should contain no rows:

In [12]:
X_train_tr[X_train_tr == '?'].shape

(0,)

### Class-based imputer. We will use this solution onwards:

In [13]:
class myCategoricalImputer(BaseEstimator, TransformerMixin):

    def find_col_max(self, x):
        uvalues, cnt = np.unique(x, return_counts=True)
        qm_idx = np.where(uvalues == '?')
        uvalues = np.delete(uvalues, qm_idx)
        cnt = np.delete(cnt, qm_idx)
        mvalue = uvalues[np.argmax(cnt)]
        return mvalue

    def fit(self, X, y=None):
        self.mvalues = {}
        for i in range(0, np.ma.size(X, axis=1)):
            self.mvalues[i] = self.find_col_max(X[:, i])
        return self

    def transform(self, X):
        X_copy = X.copy()
        for i in range(0, np.ma.size(X, axis=1)):
            X_col = X_copy[:, i]
            X_col[X_col == '?'] = self.mvalues[i]
            X_copy[:, i] = X_col
        return X_copy

In [14]:
tr_oenc = ColumnTransformer([
    ('g1', OrdinalEncoder(), [0, 1, 2, 3, 4, 5, 6, 7])],
    remainder='passthrough'
)

tr_std = ColumnTransformer([
    ('g1', StandardScaler(), [-1, -2, -3, -4])],
    remainder='passthrough'
)

In [15]:
pipe = make_pipeline(myCategoricalImputer(), tr_oenc, tr_std)

X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

y_train = LabelEncoder().fit_transform(y_train)
y_test = LabelEncoder().fit_transform(y_test)

Handle the unbalanced data set:

In [16]:
cw = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

In [17]:
model = Sequential()
model.add(Dense(12, input_dim=12, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
opt = SGD(learning_rate=0.01)
model.compile(optimizer=opt, loss='binary_crossentropy')

In [18]:
history = model.fit(x=X_train, y=y_train,
                    batch_size=32, epochs=50, verbose=0, class_weight=cw)

In [19]:
y_pred = model.predict_classes(X_test)

In [20]:
print(confusion_matrix(y_test, y_pred))

[[10604  1831]
 [ 1350  2496]]


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87     12435
           1       0.58      0.65      0.61      3846

    accuracy                           0.80     16281
   macro avg       0.73      0.75      0.74     16281
weighted avg       0.81      0.80      0.81     16281

