In [1]:
import tempfile
import urllib
import pandas as pd
import tensorflow as tf

In [2]:
# Column names applied to both CSV files via `names=`, in file order.
COLS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

# `skipinitialspace=True` skips the spaces that follow each delimiter.
df_train = pd.read_csv(
    "./dataset/adult.data",
    names=COLS,
    skipinitialspace=True,
)

# Same settings as the training file, except that the first line is skipped.
df_test = pd.read_csv(
    "./dataset/adult.test",
    names=COLS,
    skipinitialspace=True,
    skiprows=1,
)

In [3]:
# Name of the binary target column added to both frames.
LABEL_COL = "label"

# Label is 1 when the income bracket text contains the substring ">50K", else 0.
# The vectorized `.str.contains` replaces a per-row Python lambda; `regex=False`
# keeps it a plain substring test, matching the original `">50K" in x` check.
df_train[LABEL_COL] = df_train["income_bracket"].str.contains(">50K", regex=False).astype(int)
df_test[LABEL_COL] = df_test["income_bracket"].str.contains(">50K", regex=False).astype(int)

In [4]:
# Columns fed to the model as sparse (string-valued) features.
CAT_COLS = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "native_country",
]

# Columns fed to the model as dense numeric tensors.
CON_COLS = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

In [6]:
def input_fn(df):
    """Convert a DataFrame into the (features, label) pair passed to the estimator.

    Args:
        df: DataFrame holding every column named in ``CON_COLS`` and
            ``CAT_COLS`` plus the ``LABEL_COL`` column.

    Returns:
        A tuple ``(feat_cols, label)``. ``feat_cols`` is a dict keyed by column
        name whose values are ``tf.constant`` tensors for the ``CON_COLS``
        entries and ``tf.SparseTensor`` objects for the ``CAT_COLS`` entries.
        ``label`` is a ``tf.constant`` built from the ``LABEL_COL`` values.
    """
    num_rows = len(df)

    cont_cols = {k: tf.constant(df[k].values) for k in CON_COLS}

    # Every row contributes exactly one value, placed at column 0 of a
    # [num_rows, 1] sparse tensor. The index list is the same for every
    # categorical column, so it is built once.
    row_indices = [[i, 0] for i in range(num_rows)]
    # NOTE(review): `shape=` is the keyword this file already used; some
    # TensorFlow releases name this argument `dense_shape` -- confirm against
    # the installed version before changing it.
    cat_cols = {k: tf.SparseTensor(indices=row_indices,
                                   values=df[k].values,
                                   shape=[num_rows, 1])
                for k in CAT_COLS}

    # Merge with copy + update: `a.items() + b.items()` only works where
    # `items()` returns a list (Python 2); this form works on Python 2 and 3
    # and yields the same dict (categorical entries win on a key clash).
    feat_cols = dict(cont_cols)
    feat_cols.update(cat_cols)

    label = tf.constant(df[LABEL_COL].values)
    return feat_cols, label

def train_input_fn():
    """Return the (features, label) pair built from ``df_train``."""
    features, label = input_fn(df_train)
    return features, label

def test_input_fn():
    """Return the (features, label) pair built from ``df_test``."""
    features, label = input_fn(df_test)
    return features, label

In [7]:
# Categorical column with an explicit list of keys.
gender = tf.contrib.layers.sparse_column_with_keys(
    column_name="gender",
    keys=["Female", "Male"],
)

# Categorical columns hashed into 100 buckets.
# NOTE: `relationship` is defined here but is not among the feature columns
# passed to the classifier in the cells shown in this notebook.
workclass = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="workclass", hash_bucket_size=100)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="marital_status", hash_bucket_size=100)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="relationship", hash_bucket_size=100)
race = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="race", hash_bucket_size=100)

# Categorical columns hashed into 1000 buckets.
education = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="education", hash_bucket_size=1000)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="occupation", hash_bucket_size=1000)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket(
    column_name="native_country", hash_bucket_size=1000)



In [8]:
# Real-valued feature columns, one per entry of the continuous column list.
# In the cells shown, only `age` is used afterwards (to build `age_buckets`).
age = tf.contrib.layers.real_valued_column(
    column_name="age")
education_num = tf.contrib.layers.real_valued_column(
    column_name="education_num")
capital_gain = tf.contrib.layers.real_valued_column(
    column_name="capital_gain")
capital_loss = tf.contrib.layers.real_valued_column(
    column_name="capital_loss")
hours_per_week = tf.contrib.layers.real_valued_column(
    column_name="hours_per_week")

In [9]:
# Bucketize `age` at the listed boundaries.
age_buckets = tf.contrib.layers.bucketized_column(
    age,
    boundaries=[
        18, 25, 30, 35, 40,
        45, 50, 55, 60, 65,
    ],
)

In [10]:
# Cross of education and occupation, hashed into 10,000 buckets.
education_x_occupation = tf.contrib.layers.crossed_column(
    [education, occupation],
    hash_bucket_size=10000,
)

# Three-way cross of age bucket, education and occupation, hashed into
# 1,000,000 buckets. The variable name is misspelled ("oocupation") but is
# kept as-is because the classifier cell refers to it by this exact name.
age_buckets_x_education_oocupation = tf.contrib.layers.crossed_column(
    [age_buckets, education, occupation],
    hash_bucket_size=1000000,
)



In [16]:
# Fresh temporary directory passed to the classifier as its `model_dir`.
model_dir = tempfile.mkdtemp()

# Linear classifier over the base categorical columns, the bucketized age and
# the two crossed columns, using an FTRL optimizer with L1 and L2 strengths of 1.
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[
        gender,
        native_country,
        education,
        occupation,
        workclass,
        marital_status,
        race,
        age_buckets,
        education_x_occupation,
        age_buckets_x_education_oocupation,
    ],
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1,
        l2_regularization_strength=1,
    ),
    model_dir=model_dir,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'save_summary_steps': 100, '_num_ps_replicas': 0, '_task_type': None, '_environment': 'local', '_is_chief': True, 'save_checkpoints_secs': 600, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe1ad9f2bd0>, 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_task_id': 0, 'tf_random_seed': None, 'keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', 'save_checkpoints_steps': None, '_master': '', 'keep_checkpoint_max': 5}


In [17]:
# Fit the classifier for 200 steps on the tensors from `train_input_fn`.
# Left as a bare expression so the cell still displays the returned estimator.
m.fit(
    input_fn=train_input_fn,
    steps=200,
)

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
The default behavior of sparse_feature_cross is changing, the

<tensorflow.contrib.learn.python.learn.estimators.linear.LinearClassifier at 0x7fe1ad9f2b90>

In [18]:
# Evaluate on the tensors from `test_input_fn` for a single step, then list
# every returned metric in alphabetical order of its name.
results = m.evaluate(input_fn=test_input_fn, steps=1)
for key in sorted(results):
    # A formatted print() call emits the same "<name> <value>" text as the old
    # `print key, value` statement, and is valid on both Python 2 and Python 3.
    print("%s %s" % (key, results[key]))

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
The default behavior of sparse_feature_cross is changing, the