In [1]:
# Import necessary packages and modules
import csv
import tensorflow as tf
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Download adult.data and adult.test from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/
# Provide the path to these files in the lines below
train_data = pd.read_csv('adult.data', header=None)
test_data = pd.read_csv('adult.test', header=None, skiprows=[0])

In [3]:
# Add column names
train_data.columns = ['age', 'workclass', 'observation-weight', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
test_data.columns = ['age', 'workclass', 'observation-weight', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']

In [4]:
# View a sample of the data
train_data.head(5)

Unnamed: 0,age,workclass,observation-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Drop the observation weight column
train_data = train_data.drop('observation-weight', axis = 1)
test_data = test_data.drop('observation-weight', axis = 1)

In [6]:
# Create the list of variables/column names of different datatypes 
# This is used to perform datatype specific processing
binary_variables = ['class']
categorical_variables = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
numeric_variables = ['hours-per-week','age','capital-gain','capital-loss']

In [7]:
# Convert the datatype of categorical and binary variables to string
# Remove whitespaces from the beginning and the end
for variable in categorical_variables + binary_variables:
    train_data[variable] = train_data[variable].astype(np.str)
    train_data[variable] = train_data[variable].map(lambda x : x.strip())
    test_data[variable] = test_data[variable].astype(np.str)
    test_data[variable] = test_data[variable].map(lambda x : x.strip())

In [8]:
# Replace unknown values indicated by ? with nans to be later imputed with appropriate value
for variable in train_data.columns:
    train_data[variable] = train_data[variable].replace('?', np.nan)
for variable in test_data.columns:
    test_data[variable] = test_data[variable].replace('?', np.nan)

In [9]:
# Identify the columns with nans
train_data.columns[pd.isnull(train_data).any()].tolist()

['workclass', 'occupation', 'native-country']

In [10]:
# Replace nans with mode of the values in the respective variables/columns
imputed_variables = ['workclass','occupation','native-country']
imputed_values = train_data[imputed_variables].mode()
for variable in imputed_variables:
    train_data[variable].fillna(value=imputed_values[variable][0], inplace=True)
    test_data[variable].fillna(value=imputed_values[variable][0], inplace=True)

In [11]:
for variable in numeric_variables:
    train_data[variable] = train_data[variable].astype(np.float)
    test_data[variable] = test_data[variable].astype(np.float)

In [12]:
# Convert each of the categorical variables to a set of binary variables, one for each value 
# the categorical variable takes
# Create and update keys to contain the final set of variables to be used to build the model
keys = []
for variable in categorical_variables:
    vals_uniq = list(train_data[variable].unique())
    temp = train_data[variable].map(lambda x: np.eye(len(vals_uniq))[vals_uniq.index(x)])
    mat = [x for x in temp.as_matrix()]
    df = pd.DataFrame(mat)
    df.columns = vals_uniq
    train_data = pd.concat([train_data,df],axis=1)
    
    temp = test_data[variable].map(lambda x: np.eye(len(vals_uniq))[vals_uniq.index(x)])
    mat = [x for x in temp.as_matrix()]
    df = pd.DataFrame(mat)
    df.columns = vals_uniq
    test_data = pd.concat([test_data,df],axis=1)
    
    keys+=vals_uniq

In [13]:
# Add numerical variables' names to the final set of variables to be used to build the model
keys+=numeric_variables

In [14]:
print keys
print len(keys)

['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked', 'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th', '13', '9', '7', '14', '5', '10', '12', '11', '4', '16', '15', '3', '6', '2', '1', '8', 'Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed', 'Adm-clerical', 'Exec-managerial', 'Handlers-cleaners', 'Prof-specialty', 'Other-service', 'Sales', 'Craft-repair', 'Transport-moving', 'Farming-fishing', 'Machine-op-inspct', 'Tech-support', 'Protective-serv', 'Armed-Forces', 'Priv-house-serv', 'Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative', 'White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Male', 'Female', 'United-States', 'Cuba', 'Jamaica', 'India', 'Mexico', 'South', 'Puerto-Ri

In [15]:
# Convert the values of the class variable to 0-1 format 
train_data['class'] = train_data['class'].map(lambda x : '0' if x == '<=50K' else '1')
test_data['class'] = test_data['class'].map(lambda x : '0' if x == '<=50K.' else '1')

In [16]:
# Create training and test set with only the final set of variables
tr=train_data.reindex(index=train_data.index,columns=keys)
tt=test_data.reindex(index=test_data.index,columns=keys)
print tr.shape
print tt.shape

(32561, 119)
(16281, 119)


In [17]:
# Convert the class type from string to int
tr['class'] = train_data['class'].astype(int)
tt['class'] = test_data['class'].astype(int)

In [18]:
# View a sample of the data to be used to build the model
tr.head(5)

Unnamed: 0,State-gov,Self-emp-not-inc,Private,Federal-gov,Local-gov,Self-emp-inc,Without-pay,Never-worked,Bachelors,HS-grad,...,Vietnam,Hong,Ireland,Hungary,Holand-Netherlands,hours-per-week,age,capital-gain,capital-loss,class
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.0,39.0,2174.0,0.0,0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.0,50.0,0.0,0.0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,40.0,38.0,0.0,0.0,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.0,53.0,0.0,0.0,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,40.0,28.0,0.0,0.0,0


In [19]:
# Set model parameters
learning_rate = 0.1
training_epochs = 10
batch_size = 100
display_step = 1

In [20]:
# Define tf Graph Input
X = tf.placeholder(tf.float32, [None, 119]) # Each data point is of 119 dimension
Y = tf.placeholder(tf.float32, [None]) # Each label is binary
 
# Set model weights and bias
W = tf.random_normal([119,1], mean=0.1,stddev=0.1)
b = tf.Variable(tf.zeros([1]))
 
# Construct model
Y_pred = tf.nn.sigmoid(tf.matmul(X, W) + b)
 
# Minimize error using cross entropy or L2_loss (Currently commented)
# For cross entropy, note the addition of 1e-10 to avoid NaNs
cost_function = tf.reduce_mean(-tf.reduce_sum(Y*tf.log(Y_pred + 1e-10)+(1-Y)*tf.log((1-Y_pred)+1e-10), reduction_indices=1))
#cost_function = tf.nn.l2_loss(Y_pred-Y)

# Set threshold to make prediction
thres = 0.7*tf.ones([tf.shape(X)[0]])
Y_pred_thres = tf.greater(Y_pred,thres)

# Note accuracy
correct_prediction = tf.equal(Y,tf.cast(Y_pred_thres,tf.float32))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# Learn weights and bias using Gradient Descent
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_function)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)
 
# Initializing the variables
init = tf.initialize_all_variables()

In [22]:
# Inside a tensorflow session, learn the model and note the accuracy on test data
with tf.Session() as sess:
    sess.run(init)
 
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        avg_acc = 0.
        total_batch = int(tr.shape[0]/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            train_batch = tr.sample(batch_size)
            # Fit training using batch data
            _, c, a = sess.run([optimizer,cost_function,accuracy], feed_dict={X: [x for x in train_batch[keys].values],
                                    Y: [x for x in train_batch['class'].values]})
            # Compute average loss
            avg_cost += c / total_batch
            avg_acc += a / total_batch
           
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print "Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost) , "acc=", "{:.9f}".format(avg_acc) 
 
    print "Optimization Finished!"
    
    # Note the performance of the model on a sample of the test set
    avg_acc = 0.
    test_sample = tt.sample(5000)
    c,a = sess.run([cost_function,accuracy], feed_dict={X: [x for x in test_sample[keys].values], 
                                    Y: [x for x in test_sample['class'].values]})
    avg_acc = a
    print "acc=", "{:.9f}".format(avg_acc)

Epoch: 0001 cost= 452.519021184 acc= 0.518870769
Epoch: 0002 cost= 367.085918673 acc= 0.619656617
Epoch: 0003 cost= 372.865044931 acc= 0.614692309
Epoch: 0004 cost= 381.609871544 acc= 0.616646770
Epoch: 0005 cost= 369.399025175 acc= 0.619521231
Epoch: 0006 cost= 372.605332736 acc= 0.610646771
Epoch: 0007 cost= 372.122348445 acc= 0.620913847
Epoch: 0008 cost= 366.052592539 acc= 0.608152001
Epoch: 0009 cost= 381.747739680 acc= 0.619923077
Epoch: 0010 cost= 385.824755202 acc= 0.615708309
Optimization Finished!
acc= 0.687935531
