# TensorFlow Linear Model Tutorial

https://www.tensorflow.org/tutorials/wide

https://www.tensorflow.org/tutorials/linear

In [1]:
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

tf.__version__

'1.1.0'

**Reading The Census Data**

In [2]:
import os
import shutil

HOME_DIR = 'census'
DATA_DIR = os.path.join(HOME_DIR, 'data')

CENSUS_TRAINING = "adult.data"
CENSUS_TEST = "adult.test"

TRAINING_FILE = os.path.join(DATA_DIR, CENSUS_TRAINING)
TEST_FILE = os.path.join(DATA_DIR, CENSUS_TEST)

from tensorflow.contrib.learn.python.learn.datasets.base import maybe_download
maybe_download(CENSUS_TRAINING,
               DATA_DIR,
               'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')
maybe_download(CENSUS_TEST,
               DATA_DIR,
               'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test')

MODEL_DIR = os.path.join(HOME_DIR, 'model', 'wide')

if os.path.isdir(MODEL_DIR):
    shutil.rmtree(MODEL_DIR)

In [3]:
COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket"
]

CATEGORICAL_COLUMNS = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country"
]

CONTINUOUS_COLUMNS = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week"
]

record_defaults = []

for i, colname in enumerate(COLUMNS):
    if colname in CATEGORICAL_COLUMNS:
        record_defaults.append([""])
    elif colname in CONTINUOUS_COLUMNS:
        record_defaults.append([0.0])
    else:
        print("Undefined column {}:{} (assuming CATEGORICAL type)".format(colname, i))
        record_defaults.append([""])

record_defaults

Undefined column fnlwgt:2 (assuming CATEGORICAL type)
Undefined column income_bracket:14 (assuming CATEGORICAL type)


[[0.0],
 [''],
 [''],
 [''],
 [0.0],
 [''],
 [''],
 [''],
 [''],
 [''],
 [0.0],
 [0.0],
 [0.0],
 [''],
 ['']]

In [4]:
def read_census_csv(filename_queue, skip_header=False):
    reader = tf.TextLineReader(skip_header_lines=int(skip_header))
    _, value = reader.read(filename_queue)

    cols = tf.decode_csv(value, record_defaults=record_defaults)

    label_raw = cols.pop() # income_bracket
    label = tf.py_func(lambda x: int('>50K' in x.decode('utf-8')), [label_raw], [tf.int64])[0]
    label.set_shape([])

    cols.append(label)
    return cols

def input_pipeline(filenames, batch_size, skip_header=False):
    filename_queue = tf.train.string_input_producer(filenames)
    input_cols = read_census_csv(filename_queue, skip_header)
    
    example = tf.train.batch(input_cols, batch_size=batch_size)
    label = example.pop()
    
    feature_cols = dict(zip(COLUMNS, example))
    
    indices = [[i, 0] for i in range(batch_size)]
    for colname in CATEGORICAL_COLUMNS:
        tensor = feature_cols[colname]
        feature_cols[colname] = tf.SparseTensor(indices, tensor, [batch_size, 1])

    return feature_cols, label

def train_input_fn():
    return input_pipeline([TRAINING_FILE], 128, False)

def eval_input_fn():
    return input_pipeline([TEST_FILE], 128, True)

**Selecting and Engineering Features for the Model**

*Base Categorical Feature Columns*

In [5]:
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)

gender = tf.contrib.layers.sparse_column_with_keys(
    column_name="gender",
    keys=["female", "male"])

race = tf.contrib.layers.sparse_column_with_keys(
    column_name="race",
    keys=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])

*Base Continuous Feature Columns*

In [6]:
age = tf.contrib.layers.real_valued_column("age")

*Making Continuous Features Categorical through Bucketization*

In [7]:
age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

*Intersecting Multiple Columns with CrossedColumn*

In [8]:
education_x_occupation = tf.contrib.layers.crossed_column(
    [education, occupation],
    hash_bucket_size=int(1e4))

age_buckets_x_race_x_occupation = tf.contrib.layers.crossed_column(
    [age_buckets, race, occupation],
    hash_bucket_size=int(1e6))

**Defining The Logistic Regression Model**

In [9]:
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[
        gender,
        native_country,
        education,
        occupation,
        workclass,
        marital_status,
        race,
        age_buckets,
        education_x_occupation,
        age_buckets_x_race_x_occupation],
    model_dir=MODEL_DIR)

**Training and Evaluating Our Model**

In [10]:
m.fit(input_fn=train_input_fn, steps=200)

results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print('{}: {}'.format(key, results[key]))

accuracy: 0.8828125
accuracy/baseline_label_mean: 0.2109375
accuracy/threshold_0.500000_mean: 0.8828125
auc: 0.9033737182617188
global_step: 200
labels/actual_label_mean: 0.2109375
labels/prediction_mean: 0.20066951215267181
loss: 0.31832727789878845
precision/positive_threshold_0.500000_mean: 0.875
recall/positive_threshold_0.500000_mean: 0.5185185074806213
