In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Load the MNIST train/test splits through the Keras dataset helper.
mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Divide the pixel values by 255.0 (both splits get the same scaling).
X_train, X_test = X_train / 255.0, X_test / 255.0

In [3]:
# Number of samples in each split; n_train drives the batches-per-epoch computation.
n_train = len(X_train)
n_test = len(X_test)

In [4]:
def accuracy(y, yhat):
    """Return the fraction of positions where the prediction equals the label.

    Args:
        y: true labels (NumPy array or any array-like such as a list).
        yhat: predicted labels, element-wise comparable with ``y``.

    Returns:
        The mean of the element-wise equality ``y == yhat``.
    """
    # Coerce to arrays first: comparing two plain lists yields a single bool,
    # which has no .mean() and would not be an element-wise comparison anyway.
    return np.mean(np.asarray(y) == np.asarray(yhat))

## DNN

In [5]:
# Flatten each image into one row of 28*28 values for the dense network.
X_train, X_test = (images.reshape(-1, 28*28) for images in (X_train, X_test))

In [6]:
# Start from an empty default graph so re-running this cell does not pile up ops.
tf.reset_default_graph()

# Inputs: flattened images and integer class labels.
X = tf.placeholder(tf.float32, shape=(None, 28*28))
# NOTE: the original shape argument `(None)` is just None (not a tuple),
# i.e. the label placeholder has a fully unspecified shape.
y = tf.placeholder(tf.int32, shape=None)

# Two ReLU hidden layers followed by a linear 10-unit output layer.
hidden1 = tf.layers.dense(inputs=X, units=300, activation=tf.nn.relu)
hidden2 = tf.layers.dense(inputs=hidden1, units=100, activation=tf.nn.relu)
logits = tf.layers.dense(inputs=hidden2, units=10)

# Predicted class = index of the largest logit; loss = sparse softmax cross-entropy.
yhat = tf.argmax(logits, axis=1)
loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [7]:
# Optimizer settings.
learning_rate, momentum = 0.01, 0.9
# Mini-batch schedule.
batch_size = 50
n_epochs = 30
# Ceiling division: a trailing partial batch still counts as one batch.
n_batches = -(-n_train // batch_size)

In [8]:
# SGD with momentum; training_op performs one parameter update on `loss`.
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
training_op = optimizer.minimize(loss)

In [9]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_batches):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            sess.run(training_op, {X:X_batch, y:y_batch})
        # Per-epoch evaluation on the full train and test sets.
        # (The previously fetched `logits_train` was never used and cost an
        # extra full forward pass per epoch, so it has been dropped.)
        yhat_train = sess.run(yhat, {X:X_train})
        yhat_test = sess.run(yhat, {X:X_test})
        print(epoch, accuracy(y_train, yhat_train), accuracy(y_test, yhat_test))

0 0.9584666666666667 0.9545
1 0.9682166666666666 0.962
2 0.9819833333333333 0.973
3 0.9869 0.976
4 0.9878166666666667 0.976
5 0.99295 0.9797
6 0.9938166666666667 0.9794
7 0.99535 0.9801
8 0.9977166666666667 0.9808
9 0.9976 0.9802
10 0.9987833333333334 0.9807
11 0.9992666666666666 0.981
12 0.99965 0.9828
13 0.9997666666666667 0.9821
14 0.9998333333333334 0.982
15 0.9999666666666667 0.9824
16 0.9999666666666667 0.9824
17 0.9999833333333333 0.9825
18 1.0 0.9827
19 1.0 0.9827
20 1.0 0.9823
21 1.0 0.9823
22 1.0 0.9824
23 1.0 0.9822
24 1.0 0.9821
25 1.0 0.9826
26 1.0 0.9821
27 1.0 0.9826
28 1.0 0.9825
29 1.0 0.9827


## DNN (Dropout)

In [5]:
# Flatten each image into one row of 28*28 values for the dense network.
X_train, X_test = (images.reshape(-1, 28*28) for images in (X_train, X_test))

In [None]:
def mlp_dropout(X, n_neurons, training, rate=0.0, activation=tf.nn.relu, output_activation=None):
    """Build a dense network with dropout applied to the input of every layer.

    Args:
        X: input tensor.
        n_neurons: list with the number of units of each dense layer, in order.
        training: value/tensor forwarded to ``tf.layers.dropout(training=...)``.
        rate: dropout rate forwarded to ``tf.layers.dropout(rate=...)``.
        activation: activation used on every layer except the last.
        output_activation: activation used on the last layer (None = linear).

    Returns:
        The output tensor of the last dense layer.
    """
    last = len(n_neurons) - 1
    for l, units in enumerate(n_neurons):
        # Use the `rate` parameter; the previous code read an undefined
        # `dropout_rate` name and so ignored the caller's setting.
        X_dropout = tf.layers.dropout(X, rate=rate, training=training)
        X = tf.layers.dense(X_dropout, units, activation=(activation if l < last else output_activation))
    return X

In [8]:
# Fresh graph for the dropout variant.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, 28*28))
# `(None)` in the original is plain None: the label shape is left unspecified.
y = tf.placeholder(tf.int32, shape=None)
# Scalar flag that defaults to False; feed True to enable dropout during training steps.
training = tf.placeholder_with_default(False, shape=())

layer_sizes = [300, 100, 10]
logits = mlp_dropout(X, layer_sizes, training, rate=0.2)
yhat = tf.argmax(logits, axis=1)
loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)

In [12]:
# Optimizer settings.
learning_rate, momentum = 0.01, 0.9
# Mini-batch schedule.
batch_size = 50
n_epochs = 30
# Ceiling division: a trailing partial batch still counts as one batch.
n_batches = -(-n_train // batch_size)

In [13]:
# SGD with momentum; training_op performs one parameter update on `loss`.
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
training_op = optimizer.minimize(loss)

In [14]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_batches):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            # training:True is fed only for the update step.
            sess.run(training_op, {X:X_batch, y:y_batch, training:True})
        # Evaluation leaves `training` at its default (False).
        # (The previously fetched `logits_train` was never used and cost an
        # extra full forward pass per epoch, so it has been dropped.)
        yhat_train = sess.run(yhat, {X:X_train})
        yhat_test = sess.run(yhat, {X:X_test})
        print(epoch, accuracy(y_train, yhat_train), accuracy(y_test, yhat_test))

0 0.9500166666666666 0.951
1 0.96755 0.9637
2 0.9744833333333334 0.9716
3 0.9786 0.974
4 0.9816 0.9754
5 0.9849333333333333 0.9785
6 0.9854333333333334 0.9765
7 0.9883 0.9806
8 0.9888 0.9807
9 0.9897666666666667 0.9812
10 0.99055 0.9814
11 0.9914666666666667 0.982
12 0.9925833333333334 0.9836
13 0.9931 0.9841
14 0.9938 0.9835
15 0.9934833333333334 0.9828
16 0.9943166666666666 0.9816
17 0.9948166666666667 0.984
18 0.9948333333333333 0.9813
19 0.9952833333333333 0.9833
20 0.9956833333333334 0.9836
21 0.9961333333333333 0.9844
22 0.9964666666666666 0.9855
23 0.9963 0.9846
24 0.9962833333333333 0.9851
25 0.9970166666666667 0.9844
26 0.9970333333333333 0.9857
27 0.9968166666666667 0.9838
28 0.9977833333333334 0.9848
29 0.9978166666666667 0.9853


## DNN (L2 regularization)

In [27]:
# Flatten each image into one row of 28*28 values for the dense network.
X_train, X_test = (images.reshape(-1, 28*28) for images in (X_train, X_test))

In [28]:
# Fresh graph for the L2-regularized variant.
tf.reset_default_graph()
X = tf.placeholder(tf.float32, (None, 28*28))
y = tf.placeholder(tf.int32, (None))
hidden1 = tf.layers.dense(X, 300, activation=tf.nn.relu)
hidden2 = tf.layers.dense(hidden1, 100, activation=tf.nn.relu)
logits = tf.layers.dense(hidden2, 10)
yhat = tf.argmax(logits, axis=1)

# L2 penalty: 0.5 * scale * (sum of squared entries of every dense kernel).
# Only variables whose name contains 'kernel:0' are penalized, so biases are excluded.
scale = 1E-3
l2_sum = [tf.reduce_sum(tf.pow(w, 2)) for w in tf.global_variables() if 'kernel:0' in w.name]
reg_loss = 0.5 * scale * tf.reduce_sum(l2_sum)

# The cross-entropy op is built exactly once; the earlier duplicate `loss`
# assignment was dead code that only added a second, unused loss op to the graph.
base_loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)
loss = base_loss + reg_loss

In [29]:
# Optimizer settings.
learning_rate, momentum = 0.01, 0.9
# Mini-batch schedule.
batch_size = 50
n_epochs = 30
# Ceiling division: a trailing partial batch still counts as one batch.
n_batches = -(-n_train // batch_size)

In [30]:
# SGD with momentum; training_op performs one parameter update on `loss`.
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
training_op = optimizer.minimize(loss)

In [31]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_batches):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            sess.run(training_op, {X:X_batch, y:y_batch})
        # Per-epoch evaluation on the full train and test sets.
        # (The previously fetched `logits_train` was never used and cost an
        # extra full forward pass per epoch, so it has been dropped.)
        yhat_train = sess.run(yhat, {X:X_train})
        yhat_test = sess.run(yhat, {X:X_test})
        print(epoch, accuracy(y_train, yhat_train), accuracy(y_test, yhat_test))

0 0.95385 0.9516
1 0.9642 0.9607
2 0.9738833333333333 0.9688
3 0.9787166666666667 0.9708
4 0.9796833333333334 0.9717
5 0.9859833333333333 0.9777
6 0.98655 0.9755
7 0.9891833333333333 0.978
8 0.99055 0.9792
9 0.9893666666666666 0.9785
10 0.98855 0.9779
11 0.9915333333333334 0.9796
12 0.9913833333333333 0.98
13 0.9919166666666667 0.981
14 0.9927166666666667 0.979
15 0.9925833333333334 0.9802
16 0.9920333333333333 0.9791
17 0.9917833333333334 0.9795
18 0.9917666666666667 0.9796
19 0.9920333333333333 0.9809
20 0.99325 0.9817
21 0.9926833333333334 0.9792
22 0.9935333333333334 0.9803
23 0.9931166666666666 0.9796
24 0.9939166666666667 0.9795
25 0.9935 0.98
26 0.9853833333333334 0.9732
27 0.99195 0.9789
28 0.99395 0.981
29 0.9948 0.9831


## DNN (Batch Normalization)

In [26]:
# Flatten each image into one row of 28*28 values for the dense network.
X_train, X_test = (images.reshape(-1, 28*28) for images in (X_train, X_test))

In [None]:
def mlp_bn(X, n_neurons, training, momentum=0.99, activation=tf.nn.relu, output_activation=None):
    """Build a dense network where every layer is dense -> batch norm -> activation.

    Args:
        X: input tensor.
        n_neurons: list with the number of units of each dense layer, in order.
        training: value/tensor forwarded to ``tf.layers.batch_normalization``.
        momentum: momentum forwarded to ``tf.layers.batch_normalization``.
        activation: activation applied after batch norm on all but the last layer.
        output_activation: activation applied after batch norm on the last layer;
            None returns the batch-normalized output unchanged.

    Returns:
        The output tensor of the last layer.
    """
    last = len(n_neurons) - 1
    for l, units in enumerate(n_neurons):
        # The dense layer is linear; the non-linearity comes after normalization.
        X_dense = tf.layers.dense(X, units)
        X_bn = tf.layers.batch_normalization(X_dense, training=training, momentum=momentum)
        if l < last:
            X = activation(X_bn)
        elif output_activation is None:
            X = X_bn
        else:
            X = output_activation(X_bn)
    return X

In [None]:
# Fresh graph for the batch-normalization variant.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, 28*28))
# `(None)` in the original is plain None: the label shape is left unspecified.
y = tf.placeholder(tf.int32, shape=None)
# Scalar flag that defaults to False; the training loop feeds True for update steps.
training = tf.placeholder_with_default(False, shape=())

layer_sizes = [300,100,10]
logits = mlp_bn(X, layer_sizes, training)
yhat = tf.argmax(logits, axis=1)
loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)


In [None]:
learning_rate = 1E-3
batch_size = 100
# Ceiling division so a trailing partial batch is still trained on; the previous
# int(len / batch_size) truncated and would silently skip the leftover samples.
# The (misspelled) name `n_bathces` is kept because the training-loop cell reads it.
n_bathces = -(-len(X_train) // batch_size)
n_epochs = 30

In [None]:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
minimize_op = optimizer.minimize(loss)
# Ops registered in the UPDATE_OPS collection are fetched together with the
# optimizer step, so both run on every training call.
bn_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
training_op = [minimize_op, bn_update_ops]

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_bathces):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            # training:True is fed only for the update step.
            sess.run(training_op, feed_dict={X:X_batch, y:y_batch, training:True})
        # Evaluation leaves `training` at its default (False).
        yhat_train = sess.run(yhat, feed_dict={X:X_train})
        yhat_test = sess.run(yhat, feed_dict={X:X_test})
        print(epoch, accuracy(y_train, yhat_train), accuracy(y_test, yhat_test))

## DNN (Polyak)

In [16]:
# Flatten each image into one row of 28*28 values for the dense network.
X_train, X_test = (images.reshape(-1, 28*28) for images in (X_train, X_test))

In [17]:
# Fresh graph for the Polyak-averaging experiment (same architecture as the plain DNN).
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, 28*28))
# `(None)` in the original is plain None: the label shape is left unspecified.
y = tf.placeholder(tf.int32, shape=None)

hidden1 = tf.layers.dense(inputs=X, units=300, activation=tf.nn.relu)
hidden2 = tf.layers.dense(inputs=hidden1, units=100, activation=tf.nn.relu)
logits = tf.layers.dense(inputs=hidden2, units=10)

yhat = tf.argmax(logits, axis=1)
loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)

In [18]:
learning_rate = 1E-3
batch_size = 100
# Ceiling division so a trailing partial batch is still trained on; the previous
# int(len / batch_size) truncated and would silently skip the leftover samples.
# The (misspelled) name `n_bathces` is kept because the training-loop cell reads it.
n_bathces = -(-len(X_train) // batch_size)
n_epochs = 30

In [19]:
# Adam optimizer; training_op performs one parameter update on `loss`.
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

In [20]:
# Model parameters: every global variable whose name contains 'kernel:0' or 'bias:0'.
params = [v for v in tf.global_variables() if 'kernel:0' in v.name or 'bias:0' in v.name]
# Zero-initialized companions with the same shapes as the parameters:
#   cache_params - scratch copy of the current weights,
#   pol_params   - exponential moving average of the weights.
cache_params = [tf.Variable(tf.zeros(v.shape.dims)) for v in params]
pol_params = [tf.Variable(tf.zeros(v.shape.dims)) for v in params]

# avg <- 0.999 * avg + 0.001 * weight
params_update_op = [tf.assign(avg, 0.999*avg+0.001*weight) for avg, weight in zip(pol_params, params)]
# cache <- weight
assign_cache_op = [tf.assign(saved, weight) for saved, weight in zip(cache_params, params)]
# weight <- avg
assign_polyak_op = [tf.assign(weight, avg) for avg, weight in zip(pol_params, params)]
# weight <- cache
assign_origin_op = [tf.assign(weight, saved) for saved, weight in zip(cache_params, params)]

In [21]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        for batch in np.random.permutation(n_bathces):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            # Ops fetched in a single sess.run have no guaranteed relative order,
            # so the optimizer step and the averaging update are run back to back:
            # the average then always sees the freshly updated weights.
            sess.run(training_op, {X:X_batch, y:y_batch})
            sess.run(params_update_op)

        # Accuracy with the raw (current) weights.
        yhat_train,  yhat_test = sess.run(yhat, {X:X_train}), sess.run(yhat, {X:X_test})
        acc_train, acc_test = accuracy(y_train, yhat_train), accuracy(y_test, yhat_test)

        # Save the raw weights BEFORE overwriting them with the averages. Running
        # both assignments in one sess.run could cache the already-overwritten
        # weights and make the later restore a no-op.
        sess.run(assign_cache_op)
        sess.run(assign_polyak_op)

        # Accuracy with the averaged weights.
        yhat_train, yhat_test = sess.run(yhat, {X:X_train}), sess.run(yhat, {X:X_test})
        acc_train_pol, acc_test_pol = accuracy(y_train, yhat_train), accuracy(y_test, yhat_test)

        # Restore the raw weights so training continues from where it left off.
        sess.run(assign_origin_op)
        print(epoch, acc_train, acc_test)
        print(epoch, acc_train_pol, acc_test_pol)
        print()

0 0.9624166666666667 0.9583
0 0.95775 0.9549

1 0.9755333333333334 0.9678
1 0.9732333333333333 0.9666

2 0.9813 0.9753
2 0.9803 0.9734

3 0.97625 0.966
3 0.9853666666666666 0.9766

4 0.9901166666666666 0.976
4 0.9896666666666667 0.9795

5 0.9917666666666667 0.9802
5 0.9931333333333333 0.9807

6 0.99025 0.976
6 0.9954666666666667 0.9823

7 0.9929833333333333 0.9782
7 0.9975 0.984

8 0.99435 0.98
8 0.9983166666666666 0.9845

9 0.99555 0.9796
9 0.9990833333333333 0.985

10 0.9949166666666667 0.9787
10 0.9993666666666666 0.9847

11 0.9972 0.9817
11 0.9995 0.9854

12 0.9952333333333333 0.9782
12 0.9996833333333334 0.9854

13 0.9977 0.9824
13 0.9998166666666667 0.9851

14 0.9941333333333333 0.9769
14 0.9998 0.9848

15 0.9956666666666667 0.9781
15 0.9999 0.9853

16 0.9963333333333333 0.9809
16 0.9999333333333333 0.9853

17 0.99535 0.9787
17 0.9999333333333333 0.9855

18 0.9933666666666666 0.976
18 0.9999166666666667 0.9856

19 0.9944166666666666 0.9785
19 0.99995 0.9855

20 0.9963666666666666

## CNN

In [17]:
# Reshape to (samples, 28, 28, 1): the conv layers take a trailing channel axis.
X_train, X_test = (images.reshape([-1, 28, 28, 1]) for images in (X_train, X_test))

In [18]:
# Fresh graph for the convolutional network.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
# `(None)` in the original is plain None: the label shape is left unspecified.
y = tf.placeholder(tf.int32, shape=None)

# Layer names: C = convolution, S = max-pooling, F = dense
# (this looks like the LeNet-5 naming scheme).
C1 = tf.layers.conv2d(X, filters=6, kernel_size=[5,5], padding='SAME', activation=tf.nn.relu)
S2 = tf.layers.max_pooling2d(C1, pool_size=[2,2], strides=2)
C3 = tf.layers.conv2d(S2, filters=16, kernel_size=[5,5], activation=tf.nn.relu)
S4 = tf.layers.max_pooling2d(C3, pool_size=[2,2], strides=2)
C5 = tf.layers.conv2d(S4, filters=120, kernel_size=[5,5], activation=tf.nn.relu)
# The dense layers are applied to the conv output without flattening it first,
# so the final 10-unit output is reshaped to (batch, 10) afterwards.
F6 = tf.layers.dense(C5, 84, activation=tf.nn.relu)
class_scores = tf.layers.dense(F6, 10)
logits = tf.reshape(class_scores, [-1, 10])

yhat = tf.argmax(logits, axis=1)
loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)

In [19]:
# Optimizer settings.
learning_rate, momentum = 0.01, 0.9
# Mini-batch schedule.
batch_size = 50
n_epochs = 40
# Ceiling division: a trailing partial batch still counts as one batch.
n_batches = -(-n_train // batch_size)

In [20]:
# SGD with momentum; training_op performs one parameter update on `loss`.
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
training_op = optimizer.minimize(loss)

In [21]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_batches):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            sess.run(training_op, {X:X_batch, y:y_batch})
        # Per-epoch evaluation on the full train and test sets.
        # (The previously fetched `logits_train` was never used and cost an
        # extra full forward pass per epoch, so it has been dropped.)
        yhat_train = sess.run(yhat, {X:X_train})
        yhat_test = sess.run(yhat, {X:X_test})
        print(epoch, accuracy(y_train, yhat_train), accuracy(y_test, yhat_test))

0 0.9781833333333333 0.9792
1 0.9880333333333333 0.9865
2 0.9880166666666667 0.9875
3 0.9917333333333334 0.9861
4 0.99295 0.9876
5 0.9927166666666667 0.9887
6 0.99575 0.9894
7 0.9951333333333333 0.9893
8 0.99535 0.9886
9 0.9965166666666667 0.988
10 0.9981333333333333 0.9909
11 0.9968 0.9906
12 0.9956833333333334 0.9874
13 0.9990333333333333 0.9921
14 0.9974666666666666 0.9909
15 0.99765 0.9899
16 0.9981833333333333 0.9908
17 0.9993666666666666 0.9918
18 0.9988666666666667 0.9914
19 0.9996 0.9917
20 0.9992 0.9919
21 0.9996833333333334 0.9915
22 0.9992833333333333 0.9904
23 0.9975166666666667 0.9884
24 0.9987 0.9907
25 0.9995333333333334 0.9914
26 0.9995166666666667 0.9919
27 0.9999333333333333 0.9926
28 0.9999666666666667 0.9929
29 1.0 0.9927
30 1.0 0.9927
31 1.0 0.9928
32 1.0 0.993
33 1.0 0.9929
34 1.0 0.993
35 1.0 0.993
36 1.0 0.993
37 1.0 0.9929
38 1.0 0.993
39 1.0 0.9929


## RNN

In [49]:
# Reshape to (samples, 28, 28): each image is fed to the RNN as a sequence of 28 rows of 28 values.
X_train, X_test = (images.reshape([-1, 28, 28]) for images in (X_train, X_test))

In [50]:
# Fresh graph for the recurrent network.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, 28, 28))
# `(None)` in the original is plain None: the label shape is left unspecified.
y = tf.placeholder(tf.int32, shape=None)

# Single basic RNN cell with 300 units (a MultiRNNCell stack was tried and left disabled).
rnn_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=300)
rnn_outputs, _ = tf.nn.dynamic_rnn(rnn_cell, X, dtype=tf.float32)

# Classify from the output at the last position along axis 1.
last_output = rnn_outputs[:,-1,:]
logits = tf.layers.dense(last_output, 10)
yhat = tf.argmax(logits, axis=1)
loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)

In [51]:
# Mini-batch schedule.
batch_size, n_epochs = 50, 40
# Ceiling division: a trailing partial batch still counts as one batch.
n_batches = -(-n_train // batch_size)

In [52]:
# SGD with momentum (an AdamOptimizer alternative was tried and left disabled).
optimizer = tf.train.MomentumOptimizer(0.005, 0.9)
training_op = optimizer.minimize(loss)

In [53]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_batches):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            sess.run(training_op, {X:X_batch, y:y_batch})
        # Per-epoch evaluation on the full train and test sets.
        # (The previously fetched `logits_train` was never used and cost an
        # extra full forward pass per epoch, so it has been dropped.)
        yhat_train = sess.run(yhat, {X:X_train})
        yhat_test = sess.run(yhat, {X:X_test})
        print(epoch, accuracy(y_train, yhat_train), accuracy(y_test, yhat_test))

0 0.9311833333333334 0.9341
1 0.9595166666666667 0.9548
2 0.9684666666666667 0.9635
3 0.9670166666666666 0.9633
4 0.9769166666666667 0.971
5 0.9768166666666667 0.97
6 0.9809 0.975
7 0.98615 0.9778
8 0.9810333333333333 0.9759
9 0.9797166666666667 0.9743
10 0.9864166666666667 0.9758
11 0.9867166666666667 0.9785
12 0.98895 0.9796
13 0.9884833333333334 0.9789
14 0.9852 0.9763
15 0.9885833333333334 0.9753
16 0.9911333333333333 0.9814
17 0.9916333333333334 0.9814
18 0.9917166666666667 0.9821
19 0.9798833333333333 0.9719
20 0.99325 0.9788
21 0.9907666666666667 0.9786
22 0.9861666666666666 0.9757
23 0.99165 0.979
24 0.9934666666666667 0.9803
25 0.9918833333333333 0.9797
26 0.99705 0.9849
27 0.99485 0.9837
28 0.99635 0.9813
29 0.9919333333333333 0.9802
30 0.9924833333333334 0.9819
31 0.9950833333333333 0.9817
32 0.99455 0.9834
33 0.9965 0.9829
34 0.9865833333333334 0.9744
35 0.9913666666666666 0.9794
36 0.9927 0.9802
37 0.99735 0.981
38 0.9941333333333333 0.9832
39 0.9938333333333333 0.9823


## LSTM

In [22]:
# Reshape to (samples, 28, 28): each image is fed to the LSTM as a sequence of 28 rows of 28 values.
X_train, X_test = (images.reshape([-1, 28, 28]) for images in (X_train, X_test))

In [27]:
# Fresh graph for the stacked-LSTM network.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, 28, 28))
# `(None)` in the original is plain None: the label shape is left unspecified.
y = tf.placeholder(tf.int32, shape=None)

# Three stacked LSTM cells of 300 units each.
lstm_layers = [tf.nn.rnn_cell.BasicLSTMCell(num_units=300) for _ in range(3)]
lstm_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_layers)
lstm_outputs, _ = tf.nn.dynamic_rnn(lstm_cell, X, dtype=tf.float32)

# Classify from the output at the last position along axis 1.
last_output = lstm_outputs[:,-1,:]
logits = tf.layers.dense(last_output, 10)
yhat = tf.argmax(logits, axis=1)
loss = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)

In [24]:
# Mini-batch schedule.
batch_size, n_epochs = 50, 40
# Ceiling division: a trailing partial batch still counts as one batch.
n_batches = -(-n_train // batch_size)

In [25]:
# Adam with the library's default settings; one call to training_op is one update on `loss`.
training_op = tf.train.AdamOptimizer().minimize(loss)
optimizer = tf.train.AdamOptimizer
optimizer = None

In [26]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_batches):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            y_batch = y_train[batch*batch_size : (batch+1)*batch_size]
            sess.run(training_op, {X:X_batch, y:y_batch})
        # Per-epoch evaluation on the full train and test sets.
        # (The previously fetched `logits_train` was never used and cost an
        # extra full forward pass per epoch, so it has been dropped.)
        yhat_train = sess.run(yhat, {X:X_train})
        yhat_test = sess.run(yhat, {X:X_test})
        print(epoch, accuracy(y_train, yhat_train), accuracy(y_test, yhat_test))

0 0.9744833333333334 0.9749
1 0.9808833333333333 0.9756
2 0.9887333333333334 0.9874
3 0.9889333333333333 0.9874
4 0.9918833333333333 0.9889
5 0.9918333333333333 0.9897
6 0.9931666666666666 0.991
7 0.9949166666666667 0.9903
8 0.99405 0.9884
9 0.9948666666666667 0.9894
10 0.9946666666666667 0.9906
11 0.9958333333333333 0.9894
12 0.9968 0.992
13 0.9947 0.9887
14 0.9962666666666666 0.99
15 0.9973833333333333 0.9898
16 0.99645 0.9927
17 0.9962666666666666 0.9917
18 0.9979 0.9923
19 0.99725 0.9896
20 0.99805 0.9917
21 0.9984 0.9932
22 0.9986666666666667 0.9931
23 0.9986166666666667 0.9913
24 0.99725 0.9902
25 0.9989333333333333 0.9922
26 0.9959 0.9907
27 0.99865 0.9917
28 0.9983666666666666 0.9913
29 0.9984666666666666 0.9918
30 0.9978333333333333 0.9901
31 0.9983666666666666 0.9919
32 0.9990666666666667 0.9934
33 0.9987333333333334 0.9923
34 0.9990833333333333 0.9926
35 0.9991666666666666 0.9929
36 0.9978 0.9918
37 0.9992666666666666 0.9937
38 0.9987 0.992
39 0.9990666666666667 0.9914


## Autoencoder

In [5]:
# Flatten each image into one row of 28*28 values for the dense autoencoder.
X_train, X_test = (images.reshape(-1, 28*28) for images in (X_train, X_test))

In [6]:
def mlp_encoder(X, n_neurons, activation=tf.nn.relu, encoder_activation=None, output_activation=None):
    """Build a stack of dense layers with special activations at two positions.

    Args:
        X: input tensor.
        n_neurons: list with the number of units of each dense layer, in order.
        activation: activation for all layers not covered by the two cases below.
        encoder_activation: activation of layer index ``len(n_neurons) // 2 - 1``
            (the middle "code" layer of a symmetric layout).
        output_activation: activation of the last layer.

    Returns:
        The output tensor of the last dense layer.
    """
    code_index = len(n_neurons) // 2 - 1
    last_index = len(n_neurons) - 1
    for i, units in enumerate(n_neurons):
        # The code-layer check comes first, so it wins if both indices coincide.
        if i == code_index:
            layer_activation = encoder_activation
        elif i == last_index:
            layer_activation = output_activation
        else:
            layer_activation = activation
        X = tf.layers.dense(X, units, activation=layer_activation)
    return X

In [7]:
# Fresh graph for the autoencoder.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, 28*28))

# Symmetric layer sizes around a 30-unit code layer; sigmoid on the reconstruction.
layer_sizes = [1000, 500, 250, 30, 250, 500, 1000, 28*28]
Xprime = mlp_encoder(X, layer_sizes, output_activation=tf.nn.sigmoid)

# Training objective: log loss between the input and its reconstruction.
loss = tf.losses.log_loss(labels=X, predictions=Xprime)
# Reported metric: squared reconstruction error summed per sample, averaged over samples.
squared_error = tf.square(X-Xprime)
mse = tf.reduce_mean(tf.reduce_sum(squared_error, axis=1))

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [8]:
# Mini-batch schedule.
batch_size, n_epochs = 250, 80
# Ceiling division: a trailing partial batch still counts as one batch.
n_batches = -(-n_train // batch_size)

In [9]:
# Adam with the library's default settings; training_op performs one update on `loss`.
optimizer = tf.train.AdamOptimizer()
training_op = optimizer.minimize(loss=loss)

In [10]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        # Visit the fixed mini-batch slices in a new random order every epoch.
        for batch in np.random.permutation(n_batches):
            X_batch = X_train[batch*batch_size : (batch+1)*batch_size]
            # Only the images are fed: the input is also the reconstruction target.
            sess.run(training_op, feed_dict={X:X_batch})
        # Per-epoch reconstruction error on the full train and test sets.
        mse_train = sess.run(mse, feed_dict={X:X_train})
        mse_test = sess.run(mse, feed_dict={X:X_test})
        print(epoch, mse_train, mse_test)

0 20.07217 19.846277
1 13.080404 12.902016
2 10.2545805 10.193475
3 8.718241 8.739309
4 7.7372847 7.855645
5 6.8676724 7.0577517
6 6.2416186 6.4690638
7 6.0546355 6.2870126
8 5.602324 5.885851
9 5.370997 5.695169
10 5.22408 5.5844564
11 5.00054 5.3623633
12 4.8410463 5.2473145
13 4.6245346 5.084189
14 4.446176 4.9216037
15 4.3591657 4.8550835
16 4.184349 4.7242165
17 4.1613975 4.7043295
18 4.2076035 4.7748194
19 3.9965494 4.5960464
20 3.8906047 4.4964
21 3.8281355 4.472819
22 3.664503 4.3293343
23 3.6297214 4.313102
24 3.6219845 4.3172398
25 3.9576273 4.63473
26 3.5268195 4.247198
27 3.4828596 4.2176876
28 3.5239084 4.2703557
29 3.3245168 4.0841084
30 3.2587354 4.041132
31 3.3717606 4.1503873
32 3.4732218 4.244524
33 3.276084 4.0742364
34 3.2300558 4.0523634
35 3.113078 3.9477625
36 3.1138964 3.9615939
37 3.0708404 3.9237337
38 3.2329943 4.0761957
39 3.0825796 3.9554892
40 2.9855735 3.864628
41 3.0011406 3.898843
42 3.003555 3.9101617
43 2.9002304 3.8263953
44 2.902273 3.830543
45 2.91

## LightGBM

In [9]:
import lightgbm as lgb

In [10]:
# Flatten each image into one row of 28*28 feature values for the tree model.
X_train, X_test = (images.reshape(-1, 28*28) for images in (X_train, X_test))

In [11]:
# Wrap both splits as LightGBM datasets (features + labels).
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_test = lgb.Dataset(data=X_test, label=y_test)

In [13]:
# Booster configuration: 10-class softmax objective, evaluated with multi_error.
params = dict(
    boosting_type='gbdt',
    objective='softmax',
    metric='multi_error',
    num_class=10,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    num_iterations=500,
)

# Train while evaluating on both datasets; metrics are printed every 10 rounds
# and early stopping uses a patience of 10 rounds.
# NOTE(review): the test split doubles as the early-stopping validation set.
gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_test],
    early_stopping_rounds=10,
    verbose_eval=10,
)

Training until validation scores don't improve for 10 rounds.
[10]	training's multi_error: 0.0741	valid_1's multi_error: 0.0839
[20]	training's multi_error: 0.06145	valid_1's multi_error: 0.072
[30]	training's multi_error: 0.0549333	valid_1's multi_error: 0.0655
[40]	training's multi_error: 0.0487167	valid_1's multi_error: 0.0593
[50]	training's multi_error: 0.0436833	valid_1's multi_error: 0.0544
[60]	training's multi_error: 0.0388667	valid_1's multi_error: 0.051
[70]	training's multi_error: 0.0343333	valid_1's multi_error: 0.0483
[80]	training's multi_error: 0.03005	valid_1's multi_error: 0.0452
[90]	training's multi_error: 0.02635	valid_1's multi_error: 0.0436
[100]	training's multi_error: 0.0229333	valid_1's multi_error: 0.041
[110]	training's multi_error: 0.0194	valid_1's multi_error: 0.0391
[120]	training's multi_error: 0.0168167	valid_1's multi_error: 0.0367
[130]	training's multi_error: 0.0139833	valid_1's multi_error: 0.0349
[140]	training's multi_error: 0.0118	valid_1's multi