In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

## Load Data

In [4]:
census = pd.read_csv('census_data.csv')
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Check Data

In [10]:
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
age               32561 non-null int64
workclass         32561 non-null object
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
gender            32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
income_bracket    32561 non-null object
dtypes: int64(5), object(9)
memory usage: 3.5+ MB


In [7]:
census.shape

(32561, 14)

In [8]:
census.isnull().sum()

age               0
workclass         0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
gender            0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income_bracket    0
dtype: int64

## Preprocessing
### `income_bucket`  0-1 encoding

In [9]:
census.income_bracket.value_counts()

 <=50K    24720
 >50K      7841
Name: income_bracket, dtype: int64

In [13]:
census['high_income'] = pd.get_dummies(census.income_bracket, drop_first=True)
census.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket,high_income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


### Train test stplit

In [14]:
from sklearn.model_selection import train_test_split

In [16]:
X = census.drop(['income_bracket', 'high_income'], axis=1)
y = census['high_income']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(22792, 13) (9769, 13)
(22792,) (9769,)


## TF Model
### Feature columns

In [20]:
X.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country'],
      dtype='object')

In [21]:
cat_cols = ['workclass', 'education', 'marital_status', 
            'occupation', 'relationship', 'race', 
            'gender', 'native_country']
cont_cols = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

feat_cols = [tf.feature_column.categorical_column_with_hash_bucket(cat, 1000) for cat in cat_cols] + \
            [tf.feature_column.numeric_column(cont) for cont in cont_cols]
feat_cols

[_HashedCategoricalColumn(key='workclass', hash_bucket_size=1000, dtype=tf.string),
 _HashedCategoricalColumn(key='education', hash_bucket_size=1000, dtype=tf.string),
 _HashedCategoricalColumn(key='marital_status', hash_bucket_size=1000, dtype=tf.string),
 _HashedCategoricalColumn(key='occupation', hash_bucket_size=1000, dtype=tf.string),
 _HashedCategoricalColumn(key='relationship', hash_bucket_size=1000, dtype=tf.string),
 _HashedCategoricalColumn(key='race', hash_bucket_size=1000, dtype=tf.string),
 _HashedCategoricalColumn(key='gender', hash_bucket_size=1000, dtype=tf.string),
 _HashedCategoricalColumn(key='native_country', hash_bucket_size=1000, dtype=tf.string),
 _NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _Numeric

### Input functions

In [25]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=100, num_epochs=100, shuffle=True)

### Estimator model

In [26]:
lin_model = tf.estimator.LinearClassifier(feature_columns=feat_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': 1, '_save_summary_steps': 100, '_model_dir': '/var/folders/n_/yc8w5lvd0939ndnjzj5wm8mr0000gn/T/tmpug6vqbsu'}


### Train model

In [44]:
lin_model.train(input_fn=input_func, steps=20000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /var/folders/n_/yc8w5lvd0939ndnjzj5wm8mr0000gn/T/tmpug6vqbsu/model.ckpt-2000
INFO:tensorflow:Saving checkpoints for 2001 into /var/folders/n_/yc8w5lvd0939ndnjzj5wm8mr0000gn/T/tmpug6vqbsu/model.ckpt.
INFO:tensorflow:step = 2001, loss = 109.807
INFO:tensorflow:global_step/sec: 152.216
INFO:tensorflow:step = 2101, loss = 29.1162 (0.658 sec)
INFO:tensorflow:global_step/sec: 164.275
INFO:tensorflow:step = 2201, loss = 57.2364 (0.609 sec)
INFO:tensorflow:global_step/sec: 163.167
INFO:tensorflow:step = 2301, loss = 106.041 (0.613 sec)
INFO:tensorflow:global_step/sec: 163.307
INFO:tensorflow:step = 2401, loss = 104.327 (0.612 sec)
INFO:tensorflow:global_step/sec: 161.932
INFO:tensorflow:step = 2501, loss = 133.915 (0.618 sec)
INFO:tensorflow:global_step/sec: 163.24
INFO:tensorflow:step = 2601, loss = 29.3066 (0.613 sec)
INFO:tensorflow:global_step/sec: 165.042
INFO:tensorflow:step = 2701, loss = 77.0501 (0.60

INFO:tensorflow:step = 10201, loss = 29.2047 (0.594 sec)
INFO:tensorflow:global_step/sec: 169.891
INFO:tensorflow:step = 10301, loss = 30.9145 (0.589 sec)
INFO:tensorflow:global_step/sec: 172.14
INFO:tensorflow:step = 10401, loss = 83.9577 (0.581 sec)
INFO:tensorflow:global_step/sec: 172.068
INFO:tensorflow:step = 10501, loss = 119.567 (0.582 sec)
INFO:tensorflow:global_step/sec: 175.355
INFO:tensorflow:step = 10601, loss = 35.0192 (0.570 sec)
INFO:tensorflow:global_step/sec: 171.887
INFO:tensorflow:step = 10701, loss = 34.5632 (0.582 sec)
INFO:tensorflow:global_step/sec: 173.068
INFO:tensorflow:step = 10801, loss = 70.3529 (0.578 sec)
INFO:tensorflow:global_step/sec: 174.348
INFO:tensorflow:step = 10901, loss = 39.4592 (0.574 sec)
INFO:tensorflow:global_step/sec: 171.141
INFO:tensorflow:step = 11001, loss = 44.6822 (0.584 sec)
INFO:tensorflow:global_step/sec: 171.886
INFO:tensorflow:step = 11101, loss = 49.7237 (0.582 sec)
INFO:tensorflow:global_step/sec: 168.194
INFO:tensorflow:step 

INFO:tensorflow:step = 18601, loss = 36.7745 (0.586 sec)
INFO:tensorflow:global_step/sec: 168.027
INFO:tensorflow:step = 18701, loss = 37.6207 (0.595 sec)
INFO:tensorflow:global_step/sec: 163.896
INFO:tensorflow:step = 18801, loss = 38.7191 (0.610 sec)
INFO:tensorflow:global_step/sec: 161.916
INFO:tensorflow:step = 18901, loss = 30.0858 (0.618 sec)
INFO:tensorflow:global_step/sec: 163.068
INFO:tensorflow:step = 19001, loss = 21.2462 (0.613 sec)
INFO:tensorflow:global_step/sec: 162.399
INFO:tensorflow:step = 19101, loss = 49.0121 (0.616 sec)
INFO:tensorflow:global_step/sec: 165.235
INFO:tensorflow:step = 19201, loss = 46.4197 (0.605 sec)
INFO:tensorflow:global_step/sec: 166.473
INFO:tensorflow:step = 19301, loss = 31.8609 (0.601 sec)
INFO:tensorflow:global_step/sec: 165.238
INFO:tensorflow:step = 19401, loss = 38.8551 (0.605 sec)
INFO:tensorflow:global_step/sec: 165.622
INFO:tensorflow:step = 19501, loss = 22.411 (0.604 sec)
INFO:tensorflow:global_step/sec: 168.193
INFO:tensorflow:step 

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x121642a90>

## Evaluate

### Predict

In [45]:
# eval_func  = tf.estimator.inputs.pandas_input_fn(x=X_test,  y=y_test,  batch_size=100, shuffle=False)
pred_func  = tf.estimator.inputs.pandas_input_fn(x=X_test,  batch_size=X_test.shape[0], shuffle=False)

In [46]:
y_pred_gen = lin_model.predict(input_fn=pred_func)
y_pred_gen

<generator object Estimator.predict at 0x12087e8e0>

In [47]:
y_pred = [pred['class_ids'][0] for pred in y_pred_gen]

INFO:tensorflow:Restoring parameters from /var/folders/n_/yc8w5lvd0939ndnjzj5wm8mr0000gn/T/tmpug6vqbsu/model.ckpt-22000


### Evaluate

In [48]:
from sklearn.metrics import classification_report

In [49]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.89      0.92      0.90      7436
          1       0.70      0.64      0.67      2333

avg / total       0.85      0.85      0.85      9769

