In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import ml_project as ml
import time
from sklearn.decomposition import PCA
# Read the raw table into a DataFrame; skipinitialspace=True tells pandas to skip spaces after each delimiter.
csv_data = pd.read_csv(filepath_or_buffer="adult.data.txt", skipinitialspace=True)

This dataset is used to predict whether a person's annual income is >50K or <=50K

These are the features we'll use:

age: continuous.

workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.

fnlwgt: continuous.

education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.

education-num: continuous.

marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.

occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.

relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.

race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.

sex: Female, Male.

capital-gain: continuous.

capital-loss: continuous.

hours-per-week: continuous.

native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

In [None]:
# preview the first ten samples of the raw table
csv_data.head(n=10)

In order to use this to do classification with POLK and deep nets we'll transform all of these features to be vectors

In [None]:
# For simplicity, toss out any sample that contains an unknown value, represented as '?'.
# A vectorised membership test flags those rows in one pass instead of scanning every cell
# in a Python loop, and each offending row label is collected once even when the row
# holds several '?' entries.
drop_inds = csv_data.index[csv_data.isin(['?']).any(axis=1)]
csv_data = csv_data.drop(drop_inds)

# Nominal variables stored as strings have to become numerical values.
# Every category is encoded as its position in the ordered lists below,
# which follow the feature descriptions given earlier in the notebook.

# 'education' is dropped because it is equivalent to education_num
csv_data = csv_data.drop(columns=['education'])

category_orders = {
    'workclass': [
        'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
        'Local-gov', 'State-gov', 'Without-pay', 'Never-worked',
    ],
    'marital-status': [
        'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
        'Widowed', 'Married-spouse-absent', 'Married-AF-spouse',
    ],
    'occupation': [
        'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
        'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct',
        'Adm-clerical', 'Farming-fishing', 'Transport-moving', 'Priv-house-serv',
        'Protective-serv', 'Armed-Forces',
    ],
    'relationship': [
        'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried',
    ],
    'race': ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'],
    'sex': ['Female', 'Male'],
    'native-country': [
        'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada',
        'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece',
        'South', 'China', 'Cuba', 'Iran', 'Honduras',
        'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam',
        'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic',
        'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia',
        'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand',
        'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong',
        'Holand-Netherlands',
    ],
    'class': ['<=50K', '>50K'],
}

# column name -> {category string -> integer code}
replace_dict = {
    column: {category: code for code, category in enumerate(ordered)}
    for column, ordered in category_orders.items()
}

csv_data = csv_data.replace(replace_dict)
# separate the target column from the feature columns
labels = csv_data['class']
data = csv_data.drop(columns=['class'])

Next, we'll normalize all the features by taking the column-wise mean and subtracting it from each value, then dividing by the standard deviation

In [None]:
# Standardise each feature column: subtract the column mean, then DIVIDE by the column
# standard deviation, as the accompanying text describes.
# NOTE(review): a constant column has a standard deviation of 0 and would yield NaN/inf here.
data = data.transform(lambda x: (x - x.mean()) / x.std())

When breaking the data into training and test sets, we need to make some changes to account for the class imbalance

In [None]:
data = np.array(data)
labels = np.array(labels)
# find the row indices of each class
zero_inds = np.where(labels == 0)[0]
one_inds = np.where(labels == 1)[0]
data_0, data_1 = data[zero_inds], data[one_inds]
label_0, label_1 = labels[zero_inds], labels[one_inds]
# We want an even split of each class. Since there are fewer ones, we split those
# 80/20 and randomly sample the same number of zeros.
N = len(one_inds)
train_slice = int(.8 * N)
test_slice = N - train_slice
# preallocate; in both arrays the first half holds the ones and the second half the zeros
train_data = np.zeros((2*train_slice, data.shape[1]))
test_data = np.zeros((2*test_slice, data.shape[1]))
# split up the ones (they go into the FIRST half of each array, matching the labels below)
train_data[:train_slice,:] = data_1[:train_slice,:]
test_data[:test_slice,:] = data_1[train_slice:,:]
# randomly choose as many zeros as we have ones; seeded so the saved split is reproducible
np.random.seed(0)
chosen_inds = np.random.choice(zero_inds, N, replace=False)
# split them between training and test (chosen_inds index into the full data array)
train_data[train_slice:,:] = data[chosen_inds[:train_slice],:]
test_data[test_slice:,:] = data[chosen_inds[train_slice:],:]
# put the labels together: ones first, zeros second
train_labels = np.zeros(2*train_slice)
train_labels[:train_slice] = 1
test_labels = np.zeros(2*test_slice)
test_labels[:test_slice] = 1
# shuffle everything; one permutation is applied to both the rows and their labels,
# so the pairing is kept explicitly
np.random.seed(101)
train_order = np.random.permutation(2*train_slice)
train_data, train_labels = train_data[train_order], train_labels[train_order]
np.random.seed(202)
test_order = np.random.permutation(2*test_slice)
test_data, test_labels = test_data[test_order], test_labels[test_order]

Save the transformation

In [None]:
# Write the four split arrays to disk; np.save appends the '.npy' extension to each name.
for _stem, _array in (
    ('train_data', train_data),
    ('train_labels', train_labels),
    ('test_data', test_data),
    ('test_labels', test_labels),
):
    np.save(_stem, _array)


In [None]:
# Read the four split arrays back from their .npy files.
train_data, train_labels, test_data, test_labels = (
    np.load(_path)
    for _path in ('train_data.npy', 'train_labels.npy', 'test_data.npy', 'test_labels.npy')
)

Next, we'll define a function to train a neural net to classify this data set

In [None]:
def train_mlp(step_size, num_layers, epochs, data):
    """Train a fully connected network on the two-class task and print test metrics.

    The network stacks `num_layers` 12-unit fully connected layers (with an L2 weight
    regularizer) followed by a 2-unit linear output layer, and is optimised with Adam
    on mini-batches of 10 samples. After every training step the whole test set is
    evaluated as well.

    Args:
        step_size: learning rate passed to the Adam optimizer.
        num_layers: number of 12-unit hidden layers.
        epochs: number of passes over the training set (must be >= 1).
        data: tuple (train_data, train_labels, test_data, test_labels). The arrays
            are not modified; shuffling is done on reordered copies.

    Returns:
        (train_losses, test_losses, train_errs, test_errs, step_times), one entry per
        training step. The lists are re-created at the start of each epoch, so they
        describe the final epoch only.

    Raises:
        ValueError: if no training step could run (epochs < 1 or fewer training
            samples than one batch), since no metrics would exist to report.
    """
    train_data, train_labels, test_data, test_labels = data
    BS = 10  # mini-batch size
    if epochs < 1 or train_data.shape[0] < BS:
        raise ValueError('train_mlp needs epochs >= 1 and at least {} training samples'.format(BS))
    print('*************************************\ntraining mlp with {} layers and step_size={}...'.format(num_layers,step_size))
    # start from an empty graph so repeated calls do not accumulate variables
    tf.reset_default_graph()

    inputs = tf.placeholder(tf.float64, shape=(None,train_data.shape[1]),name='input_placeholder')
    labels = tf.placeholder(dtype=tf.int32, shape=(None,))
    one_hot_labels = tf.one_hot(labels,2)

    h = inputs

    for layer in range(num_layers):
        h = tf.contrib.layers.fully_connected(inputs=h, num_outputs=12, weights_regularizer=tf.nn.l2_loss)
    # output layer: raw logits, one per class
    h = tf.contrib.layers.fully_connected(inputs=h, num_outputs=2, activation_fn=None)

    regularizer_loss = tf.losses.get_regularization_loss()
    alpha = 1e-6  # weight of the L2 regularization term
    loss = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(multi_class_labels=one_hot_labels, logits=h)) + alpha * regularizer_loss

    optimizer = tf.train.AdamOptimizer(learning_rate=step_size)
    minimizer = optimizer.minimize(loss)

    prediction = tf.argmax(h,1)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, tf.cast(labels,tf.int64)),tf.float64))

    # The context manager closes the session (and frees its resources) even if training
    # raises; this matters because the function is called many times during a grid search.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print( "Total number of variables used ", np.sum([v.get_shape().num_elements() for v in tf.trainable_variables()]) )
        for epoch in range(epochs):
            # Reorder samples and labels with one shared, epoch-seeded permutation.
            # Indexing builds new arrays, so the caller's arrays are left untouched.
            np.random.seed(epoch)
            order = np.random.permutation(train_data.shape[0])
            train_data, train_labels = train_data[order], train_labels[order]
            # NOTE(review): the stat lists are reset every epoch, so only the last epoch
            # is returned; train_POLK accumulates over all epochs - confirm this is intended.
            train_losses = []
            test_losses = []
            train_errs = []
            test_errs = []
            step_times = []
            # a trailing partial batch is skipped
            for i in range(0, train_data.shape[0]-BS+1, BS):
                batch_data, batch_labels = train_data[i:i+BS], train_labels[i:i+BS]
                start = time.time()
                _, _train_loss, _train_acc = sess.run([minimizer, loss, accuracy], feed_dict={inputs: batch_data, labels: batch_labels})
                end = time.time()
                # update training stats
                train_losses.append(_train_loss)
                train_errs.append(1-_train_acc)
                step_times.append(end-start)
                # update test stats (full test set, evaluated after every step)
                test_loss, test_acc, pred_vec = sess.run([loss, accuracy, prediction], feed_dict={inputs: test_data, labels: test_labels})
                test_losses.append(test_loss)
                test_errs.append(1-test_acc)

    # test_acc and pred_vec come from the evaluation after the very last training step
    print('Test accuracy: {:.3f}.'.format(test_acc))
    print('final confusion matrix:')
    TP = np.sum(np.logical_and(pred_vec == 1, test_labels == 1))
    FP = np.sum(np.logical_and(pred_vec == 1, test_labels == 0))
    TN = np.sum(np.logical_and(pred_vec == 0, test_labels == 0))
    FN = np.sum(np.logical_and(pred_vec == 0, test_labels == 1))
    print('TP: {}. FP: {}.\nTN: {}. FN: {}.'.format(TP,FP,TN,FN))
    return train_losses, test_losses, train_errs, test_errs, step_times

Now we want to try the same thing with POLK

In [None]:
def train_POLK(step_size, sigma, eps, data, num_epochs=None):
    """Train a POLK model with mini-batch SGD and record statistics after every step.

    Args:
        step_size: step size passed to `sgd.fit`.
        sigma: argument of `ml.gaussian_kernel` (presumably the kernel bandwidth - confirm
            against ml_project).
        eps: third argument of `ml.sklr_model`; the log message calls it the error threshold.
        data: tuple (train_data, train_labels, test_data, test_labels). The arrays are
            not modified; shuffling and reshaping act on local arrays only.
        num_epochs: number of passes over the training set. When omitted, the
            notebook-level `epochs` variable is used, as in earlier versions.

    Returns:
        (train_losses, test_losses, train_errors, test_errors, step_times, model_orders),
        each with one entry per SGD step, accumulated over all epochs.
    """
    train_data, train_labels, test_data, test_labels = data
    if num_epochs is None:
        # backward-compatible fallback to the notebook-level setting
        num_epochs = epochs
    train_losses = []
    test_losses = []
    train_errors = []
    test_errors = []
    model_orders = []
    step_times = []
    BS = 10  # mini-batch size
    kernel = ml.gaussian_kernel(sigma)
    # NOTE(review): the meaning of the 1e-9 argument is defined in ml_project - confirm there
    model = ml.sklr_model(kernel, 1e-9, eps)
    sgd = ml.SGD(model)
    print('*********************')
    print('training POLK with step szie: {}. sigma: {}. error threshold: {}. '.format(step_size, sigma, eps))
    for e in range(num_epochs):
        print('epoch: ', e)
        epoch_start = time.time()
        # Reorder samples and labels with one shared, epoch-seeded permutation.
        # Indexing builds new arrays, so the caller's arrays are left untouched.
        np.random.seed(e)
        order = np.random.permutation(train_data.shape[0])
        train_data, train_labels = train_data[order], train_labels[order]
        for i in range(0, train_data.shape[0], BS):
            start = time.time()
            sgd.fit(step_size, train_data[i:i+BS], train_labels[i:i+BS])
            end = time.time()
            # add the time to compute sgd
            step_times.append(end-start)
            # calculate training and test loss on the full sets
            train_losses.append(model.loss(train_data, train_labels))
            test_losses.append(model.loss(test_data, test_labels))
            # calculate training accuracy; outputs >= .5 count as class 1
            predictions = model.predict(train_data) >= .5
            # match the labels to the prediction shape via a local reshape, so the
            # caller's array keeps its original shape
            train_labels = train_labels.reshape(predictions.shape)
            correct = (predictions == train_labels).sum()
            train_errors.append(1 - (correct/(train_labels.shape[0])))
            # calculate test accuracy
            predictions = model.predict(test_data) >= .5
            test_labels = test_labels.reshape(predictions.shape)
            correct = (predictions == test_labels).sum()
            test_errors.append(1 - (correct/(test_labels.shape[0])))
            # add the current model order (number of rows in the model dictionary)
            model_orders.append(model.dictionary().shape[0])
        epoch_end = time.time()
        print('time to run epoch: {} seconds'.format(epoch_end - epoch_start))
        print('training loss: {}. test loss: {}'.format(train_losses[-1],test_losses[-1]))
        print('model order: ',model.dictionary().shape[0])
        print('test error: {}'.format(test_errors[-1]))
    return train_losses, test_losses, train_errors, test_errors, step_times, model_orders

In [None]:
# both methods will be trained over this series of step sizes (learning rates)
step_sizes = [
    1, .5, .3, .1,
    .03, .01, .003, .001,
    .0003, .0001, .00003, .00001,
]
# number of passes over the training set
epochs = 30
# bundle the split in the order the training functions unpack it
data = (train_data, train_labels, test_data, test_labels)

In [None]:
# grid search over network depth and step size; results are keyed by (num_layers, step_size)
mlp_stats = {}
layers = range(2,6)
for num_layers in layers:
    for step_size in step_sizes:
        result = train_mlp(step_size, num_layers, epochs, data)
        train_losses, test_losses, train_errors, test_errors, step_times = result
        # collect this run's curves under descriptive keys
        stat = {
            'train_loss': train_losses,
            'test_loss': test_losses,
            'train_errors': train_errors,
            'test_errors': test_errors,
            'step_times': step_times,
        }
        mlp_stats[num_layers,step_size] = stat



In [None]:
# grid search over error threshold, kernel sigma and step size;
# results are keyed by (epsilon-error, sigma, step_size)
polk_stats = {}
epsilons = [.1,.01,.001,.0009,.0008,.0007,.0006]
# NOTE(review): these keys are floats produced by np.arange, so look results up by
# iterating over `sigmas` rather than typing literals such as 0.3
sigmas = np.arange(.1,1.1,.1)
for eps in epsilons:
    for sigma in sigmas:
        for step_size in step_sizes:
            result = train_POLK(step_size, sigma, eps, data)
            train_losses, test_losses, train_errors, test_errors, step_times, model_orders = result
            stat = {}
            stat['train_loss'] = train_losses
            stat['test_loss'] = test_losses
            stat['train_errors'] = train_errors
            stat['test_errors'] = test_errors
            stat['step_times'] = step_times
            stat['model_order'] = model_orders
            # store under the dictionary created above
            polk_stats[eps,sigma,step_size] = stat