In [2]:
import tempfile

import pandas as pd
import tensorflow as tf

In [3]:
# Names assigned, in file order, to the columns of both CSV files
# (passed to read_csv through `names=`).
COLS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

# Name of the derived binary target column added to each frame below.
LABEL_COL = "label"

# Columns fed to the model as categorical (string) features.
CAT_COLS = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "gender", "native_country",
]
# Columns fed to the model as continuous (numeric) features.
CON_COLS = [
    "age", "education_num", "capital_gain", "capital_loss", "hours_per_week",
]

df_train = pd.read_csv("./dataset/adult.data", names=COLS, skipinitialspace=True)
# The first line of the test file is skipped before parsing.
df_test = pd.read_csv("./dataset/adult.test", names=COLS, skipinitialspace=True, skiprows=1)

# Target is 1 when the income bracket text contains ">50K", otherwise 0.
# A substring test is used, so any characters around ">50K" do not matter.
train_over_50k = df_train["income_bracket"].apply(lambda bracket: ">50K" in bracket)
df_train[LABEL_COL] = train_over_50k.astype(int)

test_over_50k = df_test["income_bracket"].apply(lambda bracket: ">50K" in bracket)
df_test[LABEL_COL] = test_over_50k.astype(int)

In [4]:
def input_fn(df):
    """Turn a DataFrame into the (features, label) pair consumed by the estimator.

    Args:
        df: DataFrame holding every column named in CON_COLS, CAT_COLS and
            LABEL_COL.

    Returns:
        A tuple ``(feat_cols, label)``. ``feat_cols`` maps each column name to
        a tensor: a ``tf.constant`` for every CON_COLS entry and a
        ``tf.SparseTensor`` with one value per row for every CAT_COLS entry.
        ``label`` is a ``tf.constant`` built from the LABEL_COL values.
    """
    # Every categorical column carries exactly one value per row, so all of
    # them share the same [row, 0] index layout; build it once instead of once
    # per column.
    row_count = len(df)
    indices = [[row, 0] for row in range(row_count)]

    feat_cols = {name: tf.constant(df[name].values) for name in CON_COLS}
    # Insert the categorical tensors into the same dict. The original merged
    # the two dicts with `items() + items()`, which only works on Python 2.
    for name in CAT_COLS:
        # NOTE(review): newer TensorFlow releases appear to name this keyword
        # `dense_shape` rather than `shape` -- confirm against the installed version.
        feat_cols[name] = tf.SparseTensor(
            indices=indices,
            values=df[name].values,
            shape=[row_count, 1])

    label = tf.constant(df[LABEL_COL].values)
    return feat_cols, label

def train_input_fn():
    """Return the (features, label) pair that input_fn builds from df_train."""
    features, labels = input_fn(df_train)
    return features, labels

def test_input_fn():
    """Return the (features, label) pair that input_fn builds from df_test."""
    features, labels = input_fn(df_test)
    return features, labels

In [5]:
def _hashed_column(name, bucket_count):
    """Sparse column for `name` whose values are hashed into `bucket_count` buckets."""
    return tf.contrib.layers.sparse_column_with_hash_bucket(
        column_name=name, hash_bucket_size=bucket_count)


def _numeric_column(name):
    """Real-valued column for `name`."""
    return tf.contrib.layers.real_valued_column(column_name=name)


# Categorical columns with an explicit list of keys.
gender = tf.contrib.layers.sparse_column_with_keys(
    column_name="gender",
    keys=["Female", "Male"])
race = tf.contrib.layers.sparse_column_with_keys(
    column_name="race",
    keys=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])

# Categorical columns without a key list: values are hashed into buckets.
education = _hashed_column("education", 1000)
relationship = _hashed_column("relationship", 100)
workclass = _hashed_column("workclass", 100)
occupation = _hashed_column("occupation", 1000)
native_country = _hashed_column("native_country", 1000)

# Continuous columns.
age = _numeric_column("age")
# Bucketized view of `age`, split at the listed boundaries.
age_buckets = tf.contrib.layers.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
education_num = _numeric_column("education_num")
capital_gain = _numeric_column("capital_gain")
capital_loss = _numeric_column("capital_loss")
hours_per_week = _numeric_column("hours_per_week")



In [7]:
# Feature crosses for the linear ("wide") part; each cross is hashed into the
# given number of buckets.
education_x_occupation = tf.contrib.layers.crossed_column(
    [education, occupation],
    hash_bucket_size=int(1e4))
native_country_x_occupation = tf.contrib.layers.crossed_column(
    [native_country, occupation],
    hash_bucket_size=int(1e4))
age_x_education_x_occupation = tf.contrib.layers.crossed_column(
    [age_buckets, education, occupation],
    hash_bucket_size=int(1e6))

# Wide columns = base sparse columns + bucketized age + the crosses above.
wide_columns = [
    gender,
    native_country,
    education,
    occupation,
    workclass,
    relationship,
    age_buckets,
    education_x_occupation,
    native_country_x_occupation,
    age_x_education_x_occupation,
]



In [9]:
# Deep columns = embeddings of the categorical columns + the continuous columns.
# As an alternative to the fixed dimension of 8, the dimension can be set to
# $log_2(n)$ or $k n^{1/4}$, where n is the number of unique features and
# k is a small constant (< 10).
deep_columns = [
    tf.contrib.layers.embedding_column(sparse_col, dimension=8)
    for sparse_col in (workclass, education, gender, relationship,
                       native_country, occupation)
] + [age, education_num, capital_gain, capital_loss, hours_per_week]



In [10]:
# Fresh temporary directory handed to the estimator as its model_dir.
# `tempfile` is imported in the notebook's import cell at the top.
model_dir = tempfile.mkdtemp()

In [12]:
# Combined linear + DNN classifier: the linear part consumes wide_columns, the
# DNN part consumes deep_columns and has two hidden layers (100 and 50 units).
hidden_layer_sizes = [100, 50]
m = tf.contrib.learn.DNNLinearCombinedClassifier(
    dnn_hidden_units=hidden_layer_sizes,
    dnn_feature_columns=deep_columns,
    linear_feature_columns=wide_columns,
    model_dir=model_dir,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'save_summary_steps': 100, '_num_ps_replicas': 0, '_task_type': None, '_environment': 'local', '_is_chief': True, 'save_checkpoints_secs': 600, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdb3faeb5d0>, 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_task_id': 0, 'tf_random_seed': None, 'keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', 'save_checkpoints_steps': None, '_master': '', 'keep_checkpoint_max': 5}


In [15]:
# Train for 200 steps on the training frame.
m.fit(input_fn=train_input_fn, steps=200)

# Evaluate with a single step: test_input_fn builds its tensors from the whole
# of df_test, so one step is given every test row.
results = m.evaluate(input_fn=test_input_fn, steps=1)

# Parenthesised form: prints the same text under the Python 2 print statement
# and is also a valid Python 3 print() call (the bare `print results` is a
# SyntaxError on Python 3).
print(results)

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scal

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:loss = 25.9798, step = 2
INFO:tensorflow:Saving checkpoints for 2 into /tmp/tmpWSkcuL/model.ckpt.
INFO:tensorflow:global_step/sec: 2.02156
INFO:tensorflow:loss = 0.534488, step = 202
INFO:tensorflow:global_step/sec: 2.12443
INFO:tensorflow:Saving checkpoints for 202 into /tmp/tmpWSkcuL/model.ckpt.
INFO:tensorflow:Loss for final step: 0.534488.
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Es

Instructions for updating:
The default behavior of sparse_feature_cross is changing, the default
value for hash_key will change to SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY.
From that point on sparse_feature_cross will always use FingerprintCat64
to concatenate the feature fingerprints. And the underlying
_sparse_feature_cross_op.sparse_feature_cross operation will be marked
as deprecated.
INFO:tensorflow:Restored model from /tmp/tmpWSkcuL
INFO:tensorflow:Eval steps [0,1) for training step 202.
INFO:tensorflow:Saving evaluation summary for step 202: accuracy = 0.827038, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.827038, auc = 0.812116, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.197441, loss = 0.438026, precision/positive_threshold_0.500000_mean = 0.756986, recall/positive_threshold_0.500000_mean = 0.394436
{'accuracy/baseline_label_mean': 0.23622628, 'accuracy/threshold_0.500000_mean': 0.82703763, 'auc': 0.81211591, 'global_step': 202