In [37]:
import numpy as np

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD

# Schema of one census row, in file column order. Each entry pairs a column
# name with either the ("continuous",) marker or the tuple of its categories.
inputs = (
    ("age", ("continuous",)),
    ("workclass", (
        "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
        "Local-gov", "State-gov", "Without-pay", "Never-worked",
    )),
    ("fnlwgt", ("continuous",)),
    ("education", (
        "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
        "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
        "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool",
    )),
    ("education-num", ("continuous",)),
    ("marital-status", (
        "Married-civ-spouse", "Divorced", "Never-married", "Separated",
        "Widowed", "Married-spouse-absent", "Married-AF-spouse",
    )),
    ("occupation", (
        "Tech-support", "Craft-repair", "Other-service", "Sales",
        "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
        "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
        "Transport-moving", "Priv-house-serv", "Protective-serv",
        "Armed-Forces",
    )),
    ("relationship", (
        "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
        "Unmarried",
    )),
    ("race", (
        "White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black",
    )),
    ("sex", ("Female", "Male")),
    ("capital-gain", ("continuous",)),
    ("capital-loss", ("continuous",)),
    ("hours-per-week", ("continuous",)),
    ("native-country", (
        "United-States", "Cambodia", "England", "Puerto-Rico", "Canada",
        "Germany", "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece",
        "South", "China", "Cuba", "Iran", "Honduras", "Philippines", "Italy",
        "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal", "Ireland",
        "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti",
        "Columbia", "Hungary", "Guatemala", "Nicaragua", "Scotland",
        "Thailand", "Yugoslavia", "El-Salvador", "Trinadad&Tobago", "Peru",
        "Hong", "Holand-Netherlands",
    )),
)

# Number of model inputs each column expands to: 1 for a continuous column,
# otherwise one input per category. Their sum is the width of the input layer.
input_shape = [len(categories) for _name, categories in inputs]
input_dim = sum(input_shape)
print("input_shape: ", input_shape)
print("input_dim: ", input_dim, end="\n\n")


# The two income classes to predict; the network gets one output per class.
outputs = (">50K", "<=50K")
output_dim = len(outputs)
print("output_dim: ", output_dim, end="\n\n")


# Train and test rows were merged into a single file beforehand; the split
# is left to Keras at training time, later in this script.
all_data = np.genfromtxt(
    'data/adult.all.txt',
    delimiter=', ',
    dtype=str,
    autostrip=True,
)
print("training data count: ", len(all_data))

# The last column is the label; every column before it is a feature.
X_train, y_train = all_data[:, :-1], all_data[:, -1:]


# Build the classifier: one fully connected layer mapping the `input_dim`
# encoded features directly to `output_dim` units, followed by a sigmoid.
# There is no hidden layer.
model = Sequential()

# NOTE(review): `output_dim=` and `init=` look like legacy Keras keyword
# names -- confirm the installed Keras version still accepts them.
model.add(Dense(output_dim=output_dim, init='uniform', input_dim=input_dim))
model.add(Activation('sigmoid'))

# Each sigmoid output is scored independently with binary cross-entropy.
model.compile(loss='binary_crossentropy', optimizer='rmsprop')


input_shape:  [1, 8, 1, 16, 1, 7, 14, 6, 5, 2, 1, 1, 1, 41]
input_dim:  105

output_dim:  2

training data count:  48842


In [38]:
def isFloat(string):
    """Return True if `string` can be converted with `float()`, else False.

    Used to tell numeric cells (e.g. '38') from categorical or missing ones
    (e.g. 'Private', '?').

    Values that `float()` rejects with a TypeError, such as None, are also
    reported as not-a-float instead of raising.
    """
    # http://stackoverflow.com/questions/2356925/how-to-check-whether-string-might-be-type-cast-to-float-in-python
    try:
        float(string)
    except (TypeError, ValueError):
        # ValueError: unparsable text; TypeError: not a string/number at all.
        return False
    return True
    
def find_means_for_continuous_types(X_train):
    """Compute, per column, the mean of the cells that parse as numbers.

    Parameters
    ----------
    X_train : 2-D array-like of str
        One row per person, one column per feature. Anything accepted by
        `np.asarray` works (NumPy array or nested lists).

    Returns
    -------
    list of float
        One mean per column. Cells that `float()` cannot parse (category
        names, '?') are skipped. A column with no numeric cell at all
        yields 0.0. Empty non-2-D input (e.g. `[]`) yields `[]`.
    """
    data = np.asarray(X_train)
    if data.ndim != 2 and data.size == 0:
        # Nothing to average and no column count to report.
        return []

    means = []
    for col in range(data.shape[1]):
        total = 0.0
        count = 0
        for value in data[:, col]:
            # Parse once: a failed conversion marks a non-numeric cell.
            try:
                total += float(value)
            except (TypeError, ValueError):
                continue
            count += 1
        # Explicit guard instead of seeding the divisor with a tiny epsilon.
        means.append(total / count if count else 0.0)
    return means

# Per-column means over the cells that parse as numbers; columns without any
# numeric cell come back as 0.0 (see the recorded output below).
means = find_means_for_continuous_types(X_train)
print("mean values for data types (if continuous): ", means)

mean values for data types (if continuous):  [38.64358543876172, 0.0, 189664.13459727284, 0.0, 10.078088530363212, 0.0, 0.0, 0.0, 0.0, 0.0, 1079.0676262233324, 87.50231358257237, 40.422382375824085, 0.0]


In [60]:

def flatten_persons_inputs_for_model(person_inputs, feature_spec=None, feature_means=None):
    """Encode one raw census row as a flat list of floats for the network.

    Parameters
    ----------
    person_inputs : sequence of str
        Raw cell values of one person, in the same column order as the spec.
    feature_spec : sequence of (name, categories) pairs, optional
        Column schema. Defaults to the notebook-level `inputs`. A column
        whose `categories` has exactly one entry is treated as continuous.
    feature_means : sequence of float, optional
        Per-column means. Defaults to the notebook-level `means`.

    Returns
    -------
    list of float
        * Continuous column -> one value, `x / (2 * mean)`, so the column
          mean maps to 0.5. A missing/unparsable cell is imputed with the
          mean and therefore encoded as 0.5, on the same scale as real
          values. If the mean is 0 the value is passed through unscaled
          (missing -> 0.0), because no scale factor can be derived.
        * Categorical column with n categories -> n values: 1.0 for the
          matching category and -1/n for every other one. An unknown
          category (e.g. '?') yields -1/n in all n slots.
    """
    spec = inputs if feature_spec is None else feature_spec
    column_means = means if feature_means is None else feature_means

    float_inputs = []
    for i, (_name, categories) in enumerate(spec):
        raw_value = person_inputs[i]
        n_categories = len(categories)

        if n_categories == 1:
            # Continuous column.
            mean = column_means[i]
            try:
                value = float(raw_value)
            except (TypeError, ValueError):
                # Missing value: use the scaled mean, not the raw mean.
                float_inputs.append(0.5 if mean else 0.0)
                continue
            scale_factor = 1 / (2 * mean) if mean else 1.0
            float_inputs.append(value * scale_factor)
        else:
            # Categorical column: one slot per known category.
            for category in categories:
                if category == raw_value:
                    float_inputs.append(1.)
                else:
                    float_inputs.append(-1. / n_categories)

    return float_inputs
    
# Encode every person; row order matches X_train (and therefore y_train).
new_X_train = [flatten_persons_inputs_for_model(row) for row in X_train]

In [61]:
# Show one person before and after encoding. Index 4 is an arbitrary row; in
# the recorded output below it is a person whose native country is Cuba.
print("Original training data format example: ")
print(X_train[4])
print()

print("New training data format example: ")
print(new_X_train[4])
print()

# The text matches what flatten_persons_inputs_for_model writes: 1 for the
# matching category and -1/n (not +1/n) for each of the other n-1 nodes.
print(
    "In fact, we just crushed the data in such a way that it will optimise the neural network (model). \n"
    "It is crushed according to the `input_shape` variable: \n"
    "    say, if there are 41 native countries in the dataset, there will be 41 input dimensions for the \n"
    "    neural network with a value of -1/41 for each of these 41 input nodes for a given person, except that the \n"
    "    node representing the real country of the person will have a value of 1."
)

# Sanity check: every encoded row must be exactly as wide as the input layer.
for encoded_row in new_X_train:
    if len(encoded_row) != input_dim:
        raise Exception(
            "Every person should have {} data fields now. {} here.".format(
                input_dim, len(encoded_row)))

Original training data format example: 
['28' 'Private' '338409' 'Bachelors' '13' 'Married-civ-spouse'
 'Prof-specialty' 'Wife' 'Black' 'Female' '0' '0' '40' 'Cuba']

New training data format example: 
[0.36228522382287026, 1.0, -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, 0.8921270242224961, 1.0, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, -0.0625, 0.6449635742350183, 1.0, -0.14285714285714285, -0.14285714285714285, -0.14285714285714285, -0.14285714285714285, -0.14285714285714285, -0.14285714285714285, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, 1.0, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, -0.07142857142857142, 1.0, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666666666, -0.16666666666

In [None]:
# Keras cannot train on the raw string labels in `y_train`, and the model has
# one output per class, so build a one-hot target with one column per entry
# in `outputs`.
# NOTE(review): the trailing "." is stripped on the assumption that the merged
# test rows spell the classes ">50K." / "<=50K." -- confirm against the file.
labels = [str(label).rstrip('.') for label in y_train.flatten()]
unknown_labels = set(labels) - set(outputs)
if unknown_labels:
    raise ValueError("Unexpected class labels: {}".format(sorted(unknown_labels)))
y_onehot = np.array(
    [[float(label == name) for name in outputs] for label in labels],
    dtype='float32')

# The encoded rows are a list of lists; hand Keras a real 2-D float array.
model.fit(np.array(new_X_train, dtype='float32'), y_onehot, nb_epoch=3, batch_size=16,
          validation_split=0.2, show_accuracy=True, verbose=2)
