# Reading data

https://www.tensorflow.org/programmers_guide/reading_data

https://archive.ics.uci.edu/ml/datasets/Census+Income

In [1]:
import tensorflow as tf

tf.VERSION

'1.7.0'

In [2]:
import os

HOME_DIR = 'census'
DATA_DIR = os.path.join(HOME_DIR, 'data')

CENSUS_TRAINING = 'adult.data'

TRAINING_FILE = os.path.join(DATA_DIR, CENSUS_TRAINING)

from tensorflow.contrib.learn.python.learn.datasets.base import maybe_download
maybe_download(CENSUS_TRAINING, DATA_DIR,
               'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')

Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:
Please write your own downloading logic.


'census/data/adult.data'

In [3]:
COLUMNS = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income_bracket'
]

CATEGORICAL_COLUMNS = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native_country'
]

CONTINUOUS_COLUMNS = [
    'age',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week'
]

record_defaults = []

for i, colname in enumerate(COLUMNS):
    if colname in CATEGORICAL_COLUMNS:
        record_defaults.append([''])
    elif colname in CONTINUOUS_COLUMNS:
        record_defaults.append([0.0])
    else:
        print('Undefined column {}:{} (assuming CATEGORICAL type)'.format(colname, i))
        record_defaults.append([''])

record_defaults

Undefined column fnlwgt:2 (assuming CATEGORICAL type)
Undefined column income_bracket:14 (assuming CATEGORICAL type)


[[0.0],
 [''],
 [''],
 [''],
 [0.0],
 [''],
 [''],
 [''],
 [''],
 [''],
 [0.0],
 [0.0],
 [0.0],
 [''],
 ['']]

In [4]:
filename_queue = tf.train.string_input_producer([TRAINING_FILE])

reader = tf.TextLineReader()
_, value = reader.read(filename_queue)

example = tf.decode_csv(value, record_defaults=record_defaults)

label_raw = example.pop() # income_bracket
label = tf.py_func(lambda x: int('>50K' in x.decode('utf-8')), [label_raw], [tf.int64])[0]

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    for i in range(10):
        example_value, label_value = sess.run([example, label])
        print(example_value, label_value)

    coord.request_stop()
    coord.join(threads)

[39.0, b' State-gov', b' 77516', b' Bachelors', 13.0, b' Never-married', b' Adm-clerical', b' Not-in-family', b' White', b' Male', 2174.0, 0.0, 40.0, b' United-States'] 0
[50.0, b' Self-emp-not-inc', b' 83311', b' Bachelors', 13.0, b' Married-civ-spouse', b' Exec-managerial', b' Husband', b' White', b' Male', 0.0, 0.0, 13.0, b' United-States'] 0
[38.0, b' Private', b' 215646', b' HS-grad', 9.0, b' Divorced', b' Handlers-cleaners', b' Not-in-family', b' White', b' Male', 0.0, 0.0, 40.0, b' United-States'] 0
[53.0, b' Private', b' 234721', b' 11th', 7.0, b' Married-civ-spouse', b' Handlers-cleaners', b' Husband', b' Black', b' Male', 0.0, 0.0, 40.0, b' United-States'] 0
[28.0, b' Private', b' 338409', b' Bachelors', 13.0, b' Married-civ-spouse', b' Prof-specialty', b' Wife', b' Black', b' Female', 0.0, 0.0, 40.0, b' Cuba'] 0
[37.0, b' Private', b' 284582', b' Masters', 14.0, b' Married-civ-spouse', b' Exec-managerial', b' Wife', b' White', b' Female', 0.0, 0.0, 40.0, b' United-States'] 0

In [5]:
def read_census_csv(filename_queue):
    reader = tf.TextLineReader()
    _, value = reader.read(filename_queue)

    cols = tf.decode_csv(value, record_defaults=record_defaults)

    label_raw = cols.pop() # income_bracket
    label = tf.py_func(lambda x: int('>50K' in x.decode('utf-8')), [label_raw], [tf.int64])[0]
    label.set_shape([])

    cols.append(label)
    return cols

def input_pipeline(filenames, batch_size):
    filename_queue = tf.train.string_input_producer(filenames)
    input_cols = read_census_csv(filename_queue)
    
    example_batch = tf.train.batch(input_cols, batch_size=batch_size)
    label_batch = example_batch.pop()
    
    return example_batch, label_batch

example, label = input_pipeline([TRAINING_FILE], 10)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    example_value, label_value = sess.run([example, label])
    print(example_value, label_value)

    coord.request_stop()
    coord.join(threads)

[array([39., 50., 38., 53., 28., 37., 49., 52., 31., 42.], dtype=float32), array([b' State-gov', b' Self-emp-not-inc', b' Private', b' Private',
       b' Private', b' Private', b' Private', b' Self-emp-not-inc',
       b' Private', b' Private'], dtype=object), array([b' 77516', b' 83311', b' 215646', b' 234721', b' 338409',
       b' 284582', b' 160187', b' 209642', b' 45781', b' 159449'],
      dtype=object), array([b' Bachelors', b' Bachelors', b' HS-grad', b' 11th', b' Bachelors',
       b' Masters', b' 9th', b' HS-grad', b' Masters', b' Bachelors'],
      dtype=object), array([13., 13.,  9.,  7., 13., 14.,  5.,  9., 14., 13.], dtype=float32), array([b' Never-married', b' Married-civ-spouse', b' Divorced',
       b' Married-civ-spouse', b' Married-civ-spouse',
       b' Married-civ-spouse', b' Married-spouse-absent',
       b' Married-civ-spouse', b' Never-married', b' Married-civ-spouse'],
      dtype=object), array([b' Adm-clerical', b' Exec-managerial', b' Handlers-cleaners',
  