#! /usr/bin/env python3
"""Example of using different input layers for different input types."""
import tempfile
import urllib.request as req

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import log_loss, accuracy_score

import aboleth as ab
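
# NOTE: this example is written against the TensorFlow 1.x graph API
# (tf.placeholder, tf.Session, tf.ConfigProto) and will not run unmodified
# under TensorFlow 2.x.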

# Data properties
COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
           "marital_status", "occupation", "relationship", "race", "gender",
           "capital_gain", "capital_loss", "hours_per_week", "native_country",
           "income_bracket"]
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status",
                       "occupation", "relationship", "race", "gender",
                       "native_country"]
CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss",
                      "hours_per_week"]
LABEL_COLUMN = "label"

# Algorithm properties
RSEED = 666
ab.set_hyperseed(RSEED)

# Net sampling and training settings
T_SAMPLES = 1  # Number of random samples to draw through an Aboleth net
EMBED_DIMS = 5  # Number of dimensions to embed the categorical columns into
BSIZE = 50  # Mini-batch size
NITER = 60000  # Number of training iterations (mini-batch views)
P_SAMPLES = 50  # Number of net samples to use for prediction
CONFIG = tf.ConfigProto(device_count={'GPU': 0})  # Run on CPU only


def main():
    """Run the demo."""
    # Get the continuous and categorical data
    df_train, df_test = fetch_data()
    df = pd.concat((df_train, df_test))
    X_con, X_cat, n_cats, Y = input_fn(df)
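
    # A placeholder with a default lets us draw T_SAMPLES samples through
    # the net during training, but feed in the larger P_SAMPLES at
    # prediction time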
    n_samples_ = tf.placeholder_with_default(T_SAMPLES, [])

    # Define the continuous layers
    con_layer = (
        ab.InputLayer(name='con', n_samples=n_samples_) >>
        ab.RandomFourier(100, kernel=ab.RBF(learn_lenscale=True)) >>
        ab.Dense(output_dim=16, init_fn="autonorm")
    )
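    # (RandomFourier projects the inputs through random RBF-kernel basis
    # functions, so this branch approximates a kernel machine with a
    # learned length scale)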

    # Now define the categorical layers, which we embed
    # Note every Embed call can be different, this is just "lazy"
    cat_layer_list = [ab.Embed(EMBED_DIMS, i, init_fn="autonorm")
                      for i in n_cats]
    cat_layer = (
        ab.InputLayer(name='cat', n_samples=n_samples_) >>
        ab.PerFeature(*cat_layer_list) >>  # Assign columns to embedding layers
        ab.Activation(tf.nn.selu) >>
        ab.Dense(16, init_fn="autonorm")
    )
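    # (each Embed maps one integer-coded column, with i possible categories,
    # to EMBED_DIMS dense features)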

    # Now we can feed the initial continuous and categorical layers to
    # further "joint" layers after we concatenate them
    net = (
        ab.Concat(con_layer, cat_layer) >>
        ab.Activation(tf.nn.selu) >>
        ab.DenseVariational(output_dim=1)
    )
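    # (DenseVariational is a Bayesian output layer; its weight posterior
    # supplies the KL term that net(...) returns below alongside the logits)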

    # Split data into training and testing
    Xt_con, Xs_con = np.split(X_con, [len(df_train)], axis=0)
    Xt_cat, Xs_cat = np.split(X_cat, [len(df_train)], axis=0)
    Yt, Ys = np.split(Y, [len(df_train)], axis=0)

    # Graph placeholders
    X_con_ = tf.placeholder(tf.float32, [None, Xt_con.shape[1]])
    X_cat_ = tf.placeholder(tf.int32, [None, Xt_cat.shape[1]])
    Y_ = tf.placeholder(tf.float32, [None, 1])

    # Feed dicts
    train_dict = {X_con_: Xt_con, X_cat_: Xt_cat, Y_: Yt}
    test_dict = {X_con_: Xs_con, X_cat_: Xs_cat, n_samples_: P_SAMPLES}
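    # (the test dict feeds no labels, and overrides n_samples_ with P_SAMPLES
    # so predictions are averaged over many posterior samples)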

    # Make model
    N = len(Xt_con)
    nn, kl = net(con=X_con_, cat=X_cat_)
    likelihood = tf.distributions.Bernoulli(logits=nn)
    prob = ab.sample_mean(likelihood.probs)
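
    # The loss is the negative ELBO: the expected log-likelihood plus the KL
    # penalty from the variational layers, with N scaling mini-batch
    # estimates up to the full dataset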
    loss = ab.elbo(likelihood.log_prob(Y_), kl, N)
    optimizer = tf.train.AdamOptimizer()
    train = optimizer.minimize(loss)
    init = tf.global_variables_initializer()

    with tf.Session(config=CONFIG):
        init.run()

        # We're going to just use a feed_dict to feed in batches, which we
        # generate here
        batches = ab.batch(
            train_dict,
            batch_size=BSIZE,
            n_iter=NITER)

        for i, data in enumerate(batches):
            train.run(feed_dict=data)
            if i % 1000 == 0:
                loss_val = loss.eval(feed_dict=data)
                print("Iteration {}, loss = {}".format(i, loss_val))

        # Predict
        Ep = prob.eval(feed_dict=test_dict)
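        # (prediction is evaluated inside the session block so that eval()
        # has a default session to run in)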

    Ey = Ep > 0.5  # Max probability assignment

    acc = accuracy_score(Ys.flatten(), Ey.flatten())
    logloss = log_loss(Ys.flatten(), np.hstack((1 - Ep, Ep)))
    print("Accuracy = {}, log loss = {}".format(acc, logloss))


def fetch_data():
    """Download the data."""
    train_file = tempfile.NamedTemporaryFile()
    test_file = tempfile.NamedTemporaryFile()
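    # NOTE: the mlr.cs.umass.edu mirror used below may no longer be
    # reachable; the same "Adult" dataset files are also hosted by the UCI
    # Machine Learning Repository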
    req.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases"
                    "/adult/adult.data", train_file.name)
    req.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/"
                    "adult/adult.test", test_file.name)

    df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
    df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True,
                          skiprows=1)

    df_train[LABEL_COLUMN] = (df_train["income_bracket"]
                              .apply(lambda x: ">50K" in x)).astype(int)
    df_test[LABEL_COLUMN] = (df_test["income_bracket"]
                             .apply(lambda x: ">50K" in x)).astype(int)

    return df_train, df_test


def input_fn(df):
    """Format the downloaded data."""
    # Stack the values of each continuous feature column into an (N, D)
    # float array
    continuous_cols = [df[k].values for k in CONTINUOUS_COLUMNS]
    X_con = np.stack(continuous_cols).astype(np.float32).T

    # Standardise
    X_con -= X_con.mean(axis=0)
    X_con /= X_con.std(axis=0)

    # Integer-code each categorical feature column
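    # (pd.get_dummies one-hot encodes a column; np.where(...)[1] then
    # recovers the integer index of the active category for every row)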
    categ_cols = [np.where(pd.get_dummies(df[k]).values)[1][:, np.newaxis]
                  for k in CATEGORICAL_COLUMNS]
    n_values = [np.amax(c) + 1 for c in categ_cols]
    X_cat = np.concatenate(categ_cols, axis=1).astype(np.int32)

    # Extract the label column as an (N, 1) integer array
    label = df[LABEL_COLUMN].values[:, np.newaxis]

    # Return the feature arrays, the category counts and the labels
    return X_con, X_cat, n_values, label


if __name__ == "__main__":
    main()