# TensorFlow Wide & Deep Learning Tutorial

https://www.tensorflow.org/versions/r0.10/tutorials/wide_and_deep/index.html

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

**Reading The Census Data**

In [2]:
import os
import shutil

HOME_DIR = 'census'
DATA_DIR = os.path.join(HOME_DIR, 'data')

CENSUS_TRAINING = "adult.data"
CENSUS_TEST = "adult.test"

TRAINING_FILE = os.path.join(DATA_DIR, CENSUS_TRAINING)
TEST_FILE = os.path.join(DATA_DIR, CENSUS_TEST)

from tensorflow.contrib.learn.python.learn.datasets.base import maybe_download
maybe_download(CENSUS_TRAINING, DATA_DIR, 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')
maybe_download(CENSUS_TEST, DATA_DIR, 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test')

MODEL_DIR = os.path.join(HOME_DIR, 'model', 'wide_n_deep')

if os.path.isdir(MODEL_DIR):
    shutil.rmtree(MODEL_DIR)

In [3]:
COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket"
]

CATEGORICAL_COLUMNS = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country"
]

CONTINUOUS_COLUMNS = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week"
]

record_defaults = []

for i, colname in enumerate(COLUMNS):
    if colname in CATEGORICAL_COLUMNS:
        record_defaults.append([""])
    elif colname in CONTINUOUS_COLUMNS:
        record_defaults.append([0.0])
    else:
        print("Undefined column {}:{} (assuming CATEGORICAL type)".format(colname, i))
        record_defaults.append([""])

def read_census_csv(filename_queue, skip_header=False):
    reader = tf.TextLineReader(skip_header_lines=int(skip_header))
    _, value = reader.read(filename_queue)

    cols = tf.decode_csv(value, record_defaults=record_defaults)

    label_raw = cols.pop() # income_bracket
    label = tf.py_func(lambda x: int(">50K" in x), [label_raw], [tf.int64])[0]
    label.set_shape([])

    cols.append(label)
    return cols

def input_pipeline(filenames, batch_size, skip_header=False):
    filename_queue = tf.train.string_input_producer(filenames)
    input_cols = read_census_csv(filename_queue, skip_header)
    
    example = tf.train.batch(input_cols, batch_size=batch_size)
    label = example.pop()
    
    feature_cols = dict(zip(COLUMNS, example))
    
    indices = [[i, 0] for i in range(batch_size)]
    for colname in CATEGORICAL_COLUMNS:
        tensor = feature_cols[colname]
        feature_cols[colname] = tf.SparseTensor(indices, tensor, [batch_size, 1])

    return feature_cols, label

def train_input_fn():
    return input_pipeline([TRAINING_FILE], 128, False)

def eval_input_fn():
    return input_pipeline([TEST_FILE], 128, True)

Undefined column fnlwgt:2 (assuming CATEGORICAL type)
Undefined column income_bracket:14 (assuming CATEGORICAL type)


**Define Base Feature Columns**

In [4]:
# Categorical base columns

workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)

gender = tf.contrib.layers.sparse_column_with_keys(
    column_name="gender",
    keys=["female", "male"])

race = tf.contrib.layers.sparse_column_with_keys(
    column_name="race",
    keys=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])

# Continuous base columns

age = tf.contrib.layers.real_valued_column("age")
education_num = tf.contrib.layers.real_valued_column("education_num")
capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")

**The Wide Model: Linear Model with Crossed Feature Columns**

In [5]:
age_buckets = tf.contrib.layers.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

education_x_occupation = tf.contrib.layers.crossed_column(
    [education, occupation],
    hash_bucket_size=int(1e4))

native_country_x_occupation = tf.contrib.layers.crossed_column(
    [native_country, occupation],
    hash_bucket_size=int(1e4))

age_buckets_x_race_x_occupation = tf.contrib.layers.crossed_column(
    [age_buckets, race, occupation],
    hash_bucket_size=int(1e6))

wide_columns = [
    gender,
    native_country,
    education,
    occupation,
    workclass,
    marital_status,
    relationship,
    age_buckets,
    education_x_occupation,
    native_country_x_occupation,
    age_buckets_x_race_x_occupation]

wide_columns

[_SparseColumn(column_name='gender', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('female', 'male'), num_oov_buckets=0, vocab_size=2, default_value=-1), combiner='sum', dtype=tf.string),
 _SparseColumn(column_name='native_country', is_integerized=False, bucket_size=1000, lookup_config=None, combiner='sum', dtype=tf.string),
 _SparseColumn(column_name='education', is_integerized=False, bucket_size=1000, lookup_config=None, combiner='sum', dtype=tf.string),
 _SparseColumn(column_name='occupation', is_integerized=False, bucket_size=1000, lookup_config=None, combiner='sum', dtype=tf.string),
 _SparseColumn(column_name='workclass', is_integerized=False, bucket_size=100, lookup_config=None, combiner='sum', dtype=tf.string),
 _SparseColumn(column_name='marital_status', is_integerized=False, bucket_size=100, lookup_config=None, combiner='sum', dtype=tf.string),
 _SparseColumn(column_name='relationship', is_integerized=False, bucket_siz

**The Deep Model: Neural Network with Embeddings**

In [6]:
deep_columns = [
    tf.contrib.layers.embedding_column(workclass, dimension=8),
    tf.contrib.layers.embedding_column(education, dimension=8),
    tf.contrib.layers.embedding_column(marital_status, dimension=8),
    tf.contrib.layers.embedding_column(gender, dimension=8),
    tf.contrib.layers.embedding_column(relationship, dimension=8),
    tf.contrib.layers.embedding_column(race, dimension=8),
    tf.contrib.layers.embedding_column(native_country, dimension=8),
    tf.contrib.layers.embedding_column(occupation, dimension=8),
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week]

deep_columns

[_EmbeddingColumn(sparse_id_column=_SparseColumn(column_name='workclass', is_integerized=False, bucket_size=100, lookup_config=None, combiner='sum', dtype=tf.string), dimension=8, combiner='mean', initializer=<function _initializer at 0x7f70768e97d0>, ckpt_to_load_from=None, tensor_name_in_ckpt=None),
 _EmbeddingColumn(sparse_id_column=_SparseColumn(column_name='education', is_integerized=False, bucket_size=1000, lookup_config=None, combiner='sum', dtype=tf.string), dimension=8, combiner='mean', initializer=<function _initializer at 0x7f70768ee230>, ckpt_to_load_from=None, tensor_name_in_ckpt=None),
 _EmbeddingColumn(sparse_id_column=_SparseColumn(column_name='marital_status', is_integerized=False, bucket_size=100, lookup_config=None, combiner='sum', dtype=tf.string), dimension=8, combiner='mean', initializer=<function _initializer at 0x7f70768ee398>, ckpt_to_load_from=None, tensor_name_in_ckpt=None),
 _EmbeddingColumn(sparse_id_column=_SparseColumn(column_name='gender', is_integerized

**Combining Wide and Deep Models into One**

In [7]:
m = tf.contrib.learn.DNNLinearCombinedClassifier(
    model_dir=MODEL_DIR,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])

**Training and Evaluating The Model**

In [8]:
m.fit(input_fn=train_input_fn, steps=200)

results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print('{}: {}'.format(key, results[key]))

accuracy: 0.9296875
accuracy/baseline_target_mean: 0.2109375
accuracy/threshold_0.500000_mean: 0.9296875
auc: 0.963696360588
global_step: 200
labels/actual_target_mean: 0.2109375
labels/prediction_mean: 0.220046311617
loss: 3.95447969437
precision/positive_threshold_0.500000_mean: 0.909090936184
recall/positive_threshold_0.500000_mean: 0.740740716457
