In [135]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [136]:
data = pd.read_csv('census_data.csv')

In [137]:
data.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'gender', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income_bracket'],
      dtype='object')

In [138]:
data['income_bracket'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [139]:
def fix_label(label):
    if label == ' <=50K':
        return 0
    else:
        return 1
    
data['income_bracket'] = data['income_bracket'].apply(fix_label)
data

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,0
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,1
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,1
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1


In [140]:
#Test train split!

from sklearn.model_selection import train_test_split

x_data = data.drop('income_bracket', axis=1)
y_labels = data['income_bracket']

X_train, X_test, y_train, y_test = train_test_split(x_data, y_labels, test_size=0.33, random_state=42)

In [141]:
# Continuous columns: age, education_num, capital_gain, capital_loss, hours_per_week

age = tf.feature_column.numeric_column('age')
education_num = tf.feature_column.numeric_column('education_num')
capital_gain = tf.feature_column.numeric_column('capital_gain')
capital_loss = tf.feature_column.numeric_column('capital_loss')
hours_per_week = tf.feature_column.numeric_column('hours_per_week')

In [142]:
# Categoric columns: workclass, education, marital_status, occupation, relationship, race, gender, native_country

workclass = tf.feature_column.categorical_column_with_hash_bucket('workclass', hash_bucket_size=1000)
education = tf.feature_column.categorical_column_with_hash_bucket('education', hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket('marital_status', hash_bucket_size=1000)
occupation = tf.feature_column.categorical_column_with_hash_bucket('occupation', hash_bucket_size=1000)
relationship = tf.feature_column.categorical_column_with_hash_bucket('relationship', hash_bucket_size=1000)
race = tf.feature_column.categorical_column_with_hash_bucket('race', hash_bucket_size=1000)
gender = tf.feature_column.categorical_column_with_vocabulary_list('gender', ['Female', 'Male'])
native_country = tf.feature_column.categorical_column_with_hash_bucket('native_country', hash_bucket_size=1000)

In [143]:
workclass = tf.feature_column.embedding_column(categorical_column=workclass, dimension=1000)
education = tf.feature_column.embedding_column(categorical_column=education, dimension=1000)
marital_status = tf.feature_column.embedding_column(categorical_column=marital_status, dimension=1000)
occupation = tf.feature_column.embedding_column(categorical_column=occupation, dimension=1000)
relationship = tf.feature_column.embedding_column(categorical_column=relationship, dimension=1000)
race = tf.feature_column.embedding_column(categorical_column=race, dimension=1000)
gender = tf.feature_column.embedding_column(categorical_column=gender, dimension=2)
native_country = tf.feature_column.embedding_column(categorical_column=native_country, dimension=1000)

feature_cols = [age, education_num, capital_gain, capital_loss, hours_per_week, workclass, education, marital_status, occupation, relationship, race, gender, native_country]

In [144]:
input_function = tf.estimator.inputs.pandas_input_fn(x = X_train, y = y_train, batch_size=100, num_epochs=10000, shuffle=True)

In [145]:
model = tf.estimator.DNNClassifier(hidden_units=[9, 12, 15, 12, 9], feature_columns=feature_cols, n_classes=2)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp0wccvj_q', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f28d55a25f8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [146]:
model.train(input_fn=input_function, steps=20000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp0wccvj_q/model.ckpt.
INFO:tensorflow:loss = 128.61047, step = 1
INFO:tensorflow:global_step/sec: 26.8274
INFO:tensorflow:loss = 35.38564, step = 101 (3.730 sec)
INFO:tensorflow:global_step/sec: 31.7211
INFO:tensorflow:loss = 31.376417, step = 201 (3.151 sec)
INFO:tensorflow:global_step/sec: 31.8144
INFO:tensorflow:loss = 41.479572, step = 301 (3.143 sec)
INFO:tensorflow:global_step/sec: 31.5348
INFO:tensorflow:loss = 35.575203, step = 401 (3.171 sec)
INFO:tensorflow:global_step/sec: 32.3526
INFO:tensorflow:loss = 31.42372, step = 501 (3.094 sec)
INFO:tensorflow:global_step/sec: 31.9439
INFO:tensorflow:loss = 29.813374, step = 601 (3.128 sec)
INFO:tensorflow:global_step/sec: 32.0195
INFO:tensorflow:lo

INFO:tensorflow:global_step/sec: 32.8712
INFO:tensorflow:loss = 25.827145, step = 8101 (3.043 sec)
INFO:tensorflow:global_step/sec: 32.5037
INFO:tensorflow:loss = 40.512398, step = 8201 (3.080 sec)
INFO:tensorflow:global_step/sec: 33.1565
INFO:tensorflow:loss = 41.430508, step = 8301 (3.011 sec)
INFO:tensorflow:global_step/sec: 32.8001
INFO:tensorflow:loss = 41.20863, step = 8401 (3.048 sec)
INFO:tensorflow:global_step/sec: 32.0917
INFO:tensorflow:loss = 29.55718, step = 8501 (3.116 sec)
INFO:tensorflow:global_step/sec: 32.3487
INFO:tensorflow:loss = 28.556652, step = 8601 (3.091 sec)
INFO:tensorflow:global_step/sec: 32.6333
INFO:tensorflow:loss = 34.803, step = 8701 (3.067 sec)
INFO:tensorflow:global_step/sec: 32.0771
INFO:tensorflow:loss = 42.37947, step = 8801 (3.116 sec)
INFO:tensorflow:global_step/sec: 32.7024
INFO:tensorflow:loss = 26.759926, step = 8901 (3.058 sec)
INFO:tensorflow:global_step/sec: 32.5723
INFO:tensorflow:loss = 37.789303, step = 9001 (3.072 sec)
INFO:tensorflow:

INFO:tensorflow:global_step/sec: 32.7619
INFO:tensorflow:loss = 31.848196, step = 16401 (3.051 sec)
INFO:tensorflow:global_step/sec: 32.8468
INFO:tensorflow:loss = 28.0031, step = 16501 (3.047 sec)
INFO:tensorflow:global_step/sec: 32.7027
INFO:tensorflow:loss = 31.958261, step = 16601 (3.057 sec)
INFO:tensorflow:global_step/sec: 33.2883
INFO:tensorflow:loss = 24.679653, step = 16701 (3.005 sec)
INFO:tensorflow:global_step/sec: 33.5387
INFO:tensorflow:loss = 28.521957, step = 16801 (2.980 sec)
INFO:tensorflow:global_step/sec: 33.2989
INFO:tensorflow:loss = 21.442686, step = 16901 (3.003 sec)
INFO:tensorflow:global_step/sec: 32.8516
INFO:tensorflow:loss = 22.3093, step = 17001 (3.045 sec)
INFO:tensorflow:global_step/sec: 33.2851
INFO:tensorflow:loss = 25.834879, step = 17101 (3.011 sec)
INFO:tensorflow:global_step/sec: 33.488
INFO:tensorflow:loss = 27.90376, step = 17201 (2.979 sec)
INFO:tensorflow:global_step/sec: 33.7145
INFO:tensorflow:loss = 33.081017, step = 17301 (2.966 sec)
INFO:t

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f28d5223e10>

In [147]:
eval_input_function = tf.estimator.inputs.pandas_input_fn(x = X_test, y = y_test, batch_size=100, num_epochs=1, shuffle=False)

In [148]:
# Evaluation
model.evaluate(eval_input_function)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-04-07:36:46
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp0wccvj_q/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-07-04-07:36:50
INFO:tensorflow:Saving dict for global step 20000: accuracy = 0.8564117, accuracy_baseline = 0.7627024, auc = 0.90902054, auc_precision_recall = 0.77773607, average_loss = 0.31359363, global_step = 20000, label/mean = 0.2372976, loss = 31.202566, precision = 0.73299396, prediction/mean = 0.2525909, recall = 0.6211765


{'accuracy': 0.8564117,
 'accuracy_baseline': 0.7627024,
 'auc': 0.90902054,
 'auc_precision_recall': 0.77773607,
 'average_loss': 0.31359363,
 'label/mean': 0.2372976,
 'loss': 31.202566,
 'precision': 0.73299396,
 'prediction/mean': 0.2525909,
 'recall': 0.6211765,
 'global_step': 20000}

In [151]:
# Prediction

pred_function = tf.estimator.inputs.pandas_input_fn(x=X_test, batch_size=len(X_test), shuffle=False)
pred_generator = model.predict(input_fn=pred_function)

In [152]:
preds = list(pred_generator)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp0wccvj_q/model.ckpt-20000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [153]:
preds

[{'logits': array([-4.074352], dtype=float32),
  'logistic': array([0.01671896], dtype=float32),
  'probabilities': array([0.9832811 , 0.01671896], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([0.07377889], dtype=float32),
  'logistic': array([0.5184364], dtype=float32),
  'probabilities': array([0.48156366, 0.5184364 ], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object)},
 {'logits': array([1.1399422], dtype=float32),
  'logistic': array([0.757669], dtype=float32),
  'probabilities': array([0.24233095, 0.757669  ], dtype=float32),
  'class_ids': array([1]),
  'classes': array([b'1'], dtype=object)},
 {'logits': array([-4.0515685], dtype=float32),
  'logistic': array([0.01709765], dtype=float32),
  'probabilities': array([0.9829024 , 0.01709765], dtype=float32),
  'class_ids': array([0]),
  'classes': array([b'0'], dtype=object)},
 {'logits': array([-4.9601917], dtype=float32),
  'logistic': arr

In [157]:
class_preds = [pred['class_ids'][0] for pred in preds] # prediction of the classes
#class_preds

In [158]:
from sklearn.metrics import classification_report
print(classification_report(y_test, class_preds))

             precision    recall  f1-score   support

          0       0.89      0.93      0.91      8196
          1       0.73      0.62      0.67      2550

avg / total       0.85      0.86      0.85     10746

