# Reading data

This notebook follows the TensorFlow "Reading data" how-to: https://www.tensorflow.org/versions/r0.10/how_tos/reading_data/index.html

Dataset: UCI Census Income: https://archive.ics.uci.edu/ml/datasets/Census+Income

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

In [2]:
import os

from tensorflow.contrib.learn.python.learn.datasets.base import maybe_download

# Local directory layout: the data directory sits under the notebook's home directory.
HOME_DIR = 'census'
DATA_DIR = os.path.join(HOME_DIR, 'data')

# File name of the training split and its expected local path.
CENSUS_TRAINING = "adult.data"
TRAINING_FILE = os.path.join(DATA_DIR, CENSUS_TRAINING)

# maybe_download receives the file name, the target directory and the source URL.
# It is the last expression of the cell, so its return value is what gets displayed.
maybe_download(
    CENSUS_TRAINING,
    DATA_DIR,
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
)

'census/data/adult.data'

In [3]:
# Names of the CSV columns, in the order they appear in each record.
COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

# Columns that get a string default ([""]).
CATEGORICAL_COLUMNS = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "gender", "native_country",
]

# Columns that get a float default ([0.0]).
CONTINUOUS_COLUMNS = [
    "age", "education_num", "capital_gain", "capital_loss", "hours_per_week",
]


def _default_for(index, name):
    """Return the one-element default list for the column `name` at position `index`.

    Categorical columns map to [""], continuous columns to [0.0]. A column
    listed in neither group is reported on stdout and treated as categorical.
    """
    if name in CATEGORICAL_COLUMNS:
        return [""]
    if name in CONTINUOUS_COLUMNS:
        return [0.0]
    print("Undefined column {}:{} (assuming CATEGORICAL type)".format(name, index))
    return [""]


# One default per column, aligned with COLUMNS.
record_defaults = [_default_for(index, name) for index, name in enumerate(COLUMNS)]

record_defaults

Undefined column fnlwgt:2 (assuming CATEGORICAL type)
Undefined column income_bracket:14 (assuming CATEGORICAL type)


[[0.0],
 [''],
 [''],
 [''],
 [0.0],
 [''],
 [''],
 [''],
 [''],
 [''],
 [0.0],
 [0.0],
 [0.0],
 [''],
 ['']]

In [4]:
# Queue of input file names consumed by the line reader below.
filename_queue = tf.train.string_input_producer([TRAINING_FILE])

# Each read yields a (key, value) pair; only the text line (value) is used.
reader = tf.TextLineReader()
_, value = reader.read(filename_queue)

# Split the line into one tensor per column, typed by record_defaults.
example = tf.decode_csv(value, record_defaults=record_defaults)

# The last column is the label; removing it leaves only the feature columns in `example`.
label_raw = example.pop() # income_bracket
# 1 when the raw label contains ">50K", otherwise 0.
# NOTE(review): under Python 3 the py_func argument is presumably bytes, in which
# case the str membership test would raise — confirm before running on Python 3.
label = tf.py_func(lambda x: int(">50K" in x), [label_raw], [tf.int64])[0]

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        # Pull and print ten records, one per session run.
        for i in range(10):
            example_value, label_value = sess.run([example, label])
            print(example_value, label_value)
    finally:
        # Always stop and join the queue-runner threads, even if a run above
        # raised, so they are not left running against the session.
        coord.request_stop()
        coord.join(threads)

[39.0, ' State-gov', ' 77516', ' Bachelors', 13.0, ' Never-married', ' Adm-clerical', ' Not-in-family', ' White', ' Male', 2174.0, 0.0, 40.0, ' United-States'] 0
[50.0, ' Self-emp-not-inc', ' 83311', ' Bachelors', 13.0, ' Married-civ-spouse', ' Exec-managerial', ' Husband', ' White', ' Male', 0.0, 0.0, 13.0, ' United-States'] 0
[38.0, ' Private', ' 215646', ' HS-grad', 9.0, ' Divorced', ' Handlers-cleaners', ' Not-in-family', ' White', ' Male', 0.0, 0.0, 40.0, ' United-States'] 0
[53.0, ' Private', ' 234721', ' 11th', 7.0, ' Married-civ-spouse', ' Handlers-cleaners', ' Husband', ' Black', ' Male', 0.0, 0.0, 40.0, ' United-States'] 0
[28.0, ' Private', ' 338409', ' Bachelors', 13.0, ' Married-civ-spouse', ' Prof-specialty', ' Wife', ' Black', ' Female', 0.0, 0.0, 40.0, ' Cuba'] 0
[37.0, ' Private', ' 284582', ' Masters', 14.0, ' Married-civ-spouse', ' Exec-managerial', ' Wife', ' White', ' Female', 0.0, 0.0, 40.0, ' United-States'] 0
[49.0, ' Private', ' 160187', ' 9th', 5.0, ' Married-

In [5]:
def read_census_csv(filename_queue):
    """Build the ops that read one census CSV line and split it into columns.

    Args:
        filename_queue: queue of file names passed to `TextLineReader.read`.

    Returns:
        A list of column tensors decoded with the module-level
        `record_defaults`, where the last (income_bracket) column is replaced
        by an int64 scalar label: 1 when the raw value contains ">50K",
        otherwise 0.
    """
    line_reader = tf.TextLineReader()
    _key, csv_line = line_reader.read(filename_queue)

    fields = tf.decode_csv(csv_line, record_defaults=record_defaults)

    # The trailing field is income_bracket; everything before it is kept as-is.
    feature_cols, income_raw = fields[:-1], fields[-1]

    def _to_label(raw):
        return int(">50K" in raw)

    income_label = tf.py_func(_to_label, [income_raw], [tf.int64])[0]
    # Declare the label's static shape as scalar.
    income_label.set_shape([])

    return feature_cols + [income_label]

def input_pipeline(filenames, batch_size):
    """Assemble batched input ops for the given census CSV files.

    Args:
        filenames: list of file names given to `tf.train.string_input_producer`.
        batch_size: batch size forwarded to `tf.train.batch`.

    Returns:
        A pair `(example_batch, label_batch)`: the list of batched columns
        without the label, and the batched label (the last column produced by
        `read_census_csv`).
    """
    columns = read_census_csv(tf.train.string_input_producer(filenames))

    batched = tf.train.batch(columns, batch_size=batch_size)

    # The label is the last batched column; the rest are the example columns.
    return batched[:-1], batched[-1]

# Batched feature columns and labels, ten records per batch.
example, label = input_pipeline([TRAINING_FILE], 10)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        # Fetch and print a single batch.
        example_value, label_value = sess.run([example, label])
        print(example_value, label_value)
    finally:
        # Always stop and join the queue-runner threads, even if the run above
        # raised, so they are not left running against the session.
        coord.request_stop()
        coord.join(threads)

[array([ 39.,  50.,  38.,  53.,  28.,  37.,  49.,  52.,  31.,  42.], dtype=float32), array([' State-gov', ' Self-emp-not-inc', ' Private', ' Private',
       ' Private', ' Private', ' Private', ' Self-emp-not-inc', ' Private',
       ' Private'], dtype=object), array([' 77516', ' 83311', ' 215646', ' 234721', ' 338409', ' 284582',
       ' 160187', ' 209642', ' 45781', ' 159449'], dtype=object), array([' Bachelors', ' Bachelors', ' HS-grad', ' 11th', ' Bachelors',
       ' Masters', ' 9th', ' HS-grad', ' Masters', ' Bachelors'], dtype=object), array([ 13.,  13.,   9.,   7.,  13.,  14.,   5.,   9.,  14.,  13.], dtype=float32), array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-civ-spouse', ' Married-civ-spouse', ' Married-civ-spouse',
       ' Married-spouse-absent', ' Married-civ-spouse', ' Never-married',
       ' Married-civ-spouse'], dtype=object), array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Handlers-cleaners', ' Prof-special