In [5]:
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import numpy as np

# Read the MNIST data set from the "MNIST_data" directory; labels come back one-hot encoded
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

# Graph inputs: a batch of flattened images (784 values each) and the
# matching one-hot labels (10 classes). The leading None leaves the batch size open.
with tf.name_scope("input/labels"):
    x = tf.placeholder(dtype=tf.float32, shape=[None, 784])
    y_ = tf.placeholder(dtype=tf.float32, shape=[None, 10])

# Graph-level random seed. It has to be set before the first random op is
# created: ops built earlier do not pick it up, so seeding only after the
# graph is constructed leaves the weight initialisation non-reproducible.
tf.set_random_seed(1)

# Weights/bias for hidden layer
with tf.name_scope("hidden_layer_1_weights"):
    # 784 inputs -> 10 hidden units; truncated-normal samples scaled by 1/sqrt(784)
    W1 = tf.Variable(tf.truncated_normal([784, 10])*(1/tf.sqrt(784.0)))
    # One bias per hidden unit, also drawn from a truncated normal
    b1 = tf.Variable(tf.truncated_normal([10]))
    # Histogram summaries of the parameters for TensorBoard
    w1_hist = tf.summary.histogram("hidden_layer_1_W1", W1)
    b1_hist = tf.summary.histogram("hidden_layer_1_B1", b1)

# Weights/bias for output layer
with tf.name_scope("output_layer_weights"):
    # 10 hidden units -> 10 classes; truncated-normal samples scaled by 1/sqrt(10)
    W2 = tf.Variable(tf.truncated_normal([10, 10])*(1/tf.sqrt(10.0)))
    # One bias per output class, drawn from a truncated normal
    b2 = tf.Variable(tf.truncated_normal([10]))
    # Histogram summaries of the parameters for TensorBoard
    w2_hist = tf.summary.histogram("output_layer_W2", W2)
    b2_hist = tf.summary.histogram("output_layer_B2", b2)

# Computational graph: hidden layer = affine transform -> batch normalization -> ReLU
with tf.name_scope("hidden_1"):
    # Pre-activation of the hidden layer, one row per example in the batch
    Z2 = tf.matmul(x, W1) + b1
    z2_hist = tf.summary.histogram("hidden_layer_1_Z2", Z2)

    # Batch Normalization layer
    # NOTE(review): the mean subtraction below cancels any constant added per
    # unit, so b1 has no influence on bn_2; beta_2 plays the role of the bias.
    # NOTE(review): the statistics always come from whatever batch is fed; no
    # moving averages are kept, so evaluation on the test set normalizes with
    # the test set's own mean/variance.
    bn_mean_2, bn_var_2 = tf.nn.moments(Z2, [0]) # Per-unit mean and variance of Z2, reduced over axis 0 (the batch axis)
    scale_2 = tf.Variable(tf.ones([10])) # Learned scale, one per hidden unit, initialised to 1
    beta_2 = tf.Variable(tf.zeros([10])) # Learned offset (shift), one per hidden unit, initialised to 0
    epsilon = 0.0001 # Added to the variance to avoid dividing by zero
    bn_2 = tf.nn.batch_normalization(Z2, bn_mean_2, bn_var_2, beta_2, scale_2, epsilon)

    # Variant without batch normalization, kept for comparison:
    #A2 = tf.nn.relu(Z2) # without batch normalization
    A2 = tf.nn.relu(bn_2)
    a2_hist = tf.summary.histogram("hidden_layer_1_A2", A2)

# Computational graph: output layer
with tf.name_scope("output"):
    # Unnormalised class scores (logits); the softmax is applied inside the cost function
    y = tf.matmul(A2, W2) + b2
    y_hist = tf.summary.histogram("output_y", y)

# Cost function
with tf.name_scope("cost"):
    # Softmax cross-entropy between the one-hot labels y_ and the logits y
    ce_cost = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
    # Scalar loss: mean of ce_cost over the fed batch
    cross_entropy = tf.reduce_mean(ce_cost)

# Optimizer
with tf.name_scope("train"):
    # Gradients of the loss w.r.t. each layer's parameters. They are only used
    # for the histogram summaries below; the optimizer derives its own.
    w1_grads, b1_grads = tf.gradients(cross_entropy, [W1, b1])
    w2_grads, b2_grads = tf.gradients(cross_entropy, [W2, b2])
    w1_grads_hist = tf.summary.histogram("w1_gradients", w1_grads)
    b1_grads_hist = tf.summary.histogram("b1_gradients", b1_grads)
    w2_grads_hist = tf.summary.histogram("w2_gradients", w2_grads)
    b2_grads_hist = tf.summary.histogram("b2_gradients", b2_grads)

    # Step counter driving the learning-rate schedule (not a trainable parameter)
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.01
    # Staircase schedule: the learning rate is halved every 500 optimizer steps
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 500, 0.5, staircase=True)
    # global_step must be handed to minimize() so that every update increments
    # it; otherwise it stays at 0 and the learning rate never decays.
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy, global_step=global_step)

# Create session
sess = tf.Session()
# NOTE(review): a graph-level seed is only picked up by ops created after it is
# set. Every random initializer of this graph is created before this line, so
# this call alone does not make the initialisation reproducible.
tf.set_random_seed(1)
# Run the initializers of all variables defined so far
sess.run(tf.global_variables_initializer())

# Accuracy of the model: fraction of examples whose highest-scoring class
# matches the class marked in the one-hot label
with tf.name_scope("accuracy"):
    predicted_class = tf.argmax(y, 1)
    labelled_class = tf.argmax(y_, 1)
    correct_prediction = tf.equal(predicted_class, labelled_class)
    # Cast the booleans to 0.0/1.0 so that their mean is the hit rate
    hits = tf.cast(correct_prediction, tf.float32)
    accuracy = tf.reduce_mean(hits)

# Tensorboard: event files are written to ./bn
# NOTE(review): the graph is handed to the writer here, before the summary ops
# on the following lines exist -- presumably they are missing from the
# TensorBoard graph view; confirm if that matters.
writer = tf.summary.FileWriter("./bn", graph=tf.get_default_graph())
# Scalar summaries for the loss and the accuracy
cost_summary = tf.summary.scalar("cross-entropy", cross_entropy)
acc_summary = tf.summary.scalar("accuracy", accuracy)
# Histogram summaries grouped so that each group is fetched as a single tensor
params_summary = tf.summary.merge([w1_hist, b1_hist, w2_hist, b2_hist, z2_hist, a2_hist, y_hist])
grad_summary = tf.summary.merge([w1_grads_hist, b1_grads_hist, w2_grads_hist, b2_grads_hist])



Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


# Sanity check of the batch-normalization calculations. Comment out these cells if you want to run the training below

In [None]:
# Draw one mini-batch of 100 training examples
batch_xs, batch_ys = mnist.train.next_batch(100)
# Fetch the pre-activation (Z2) and its batch-normalised version (bn_2) in a
# single run: one forward pass instead of two, and both arrays are guaranteed
# to come from the same evaluation. Neither tensor depends on the labels, so
# only x needs to be fed.
z2_, bn2_ = sess.run([Z2, bn_2], feed_dict={x: batch_xs})

In [None]:
# Per-unit mean of the batch-normalised activations over the batch
# (should be close to 0 while scale_2/beta_2 still hold their initial values)
bn2_.mean(axis=0)

In [None]:
# Per-unit variance of the batch-normalised activations over the batch
# (should be close to 1 while scale_2/beta_2 still hold their initial values)
bn2_.var(axis=0)

In [None]:
# Per-unit mean of the raw pre-activations over the batch, for comparison
z2_.mean(axis=0)

In [None]:
# Per-unit variance of the raw pre-activations over the batch, for comparison
z2_.var(axis=0)

# Train

In [2]:
# Train for 500 mini-batch steps of 100 examples each. After every step the
# loss and histogram summaries of that step, and the accuracy on the full
# test set, are logged for TensorBoard.
for step in range(500):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    train_feed = {x: batch_xs, y_: batch_ys}

    # One optimizer update; the loss and the summaries are fetched in the same run
    _, cost, cross_entropy_summary, params_hist_summary, gradients_hist_summary = sess.run(
        [train_step, cross_entropy, cost_summary, params_summary, grad_summary],
        feed_dict=train_feed)

    # Accuracy on the complete test set (no training op is run here)
    test_feed = {x: mnist.test.images, y_: mnist.test.labels}
    test_acc, test_acc_summary = sess.run([accuracy, acc_summary], feed_dict=test_feed)

    writer.add_summary(cross_entropy_summary, step)
    writer.add_summary(test_acc_summary, step)
    writer.add_summary(params_hist_summary, step)
    writer.add_summary(gradients_hist_summary, step)
    # Formatted print: same text as the Python 2 print statement, but also
    # valid syntax on Python 3.
    print("%s %s %s" % (step, cost, test_acc))

# Push any buffered events to disk so TensorBoard sees the whole run
writer.flush()

0 2.88863 0.1143
1 2.96416 0.1209
2 2.86708 0.1252
3 2.69219 0.1286
4 2.77342 0.1356
5 2.60345 0.142
6 2.74622 0.1498
7 2.71651 0.1579
8 2.63498 0.1677
9 2.66297 0.1727
10 2.60155 0.1782
11 2.44917 0.1831
12 2.53236 0.1894
13 2.51441 0.1925
14 2.49089 0.1966
15 2.31554 0.1987
16 2.41269 0.2013
17 2.33383 0.2034
18 2.55344 0.2055
19 2.35927 0.2064
20 2.27483 0.2085
21 2.43961 0.2112
22 2.50538 0.2137
23 2.40121 0.216
24 2.33549 0.2194
25 2.29155 0.2234
26 2.24471 0.227
27 2.32553 0.2297
28 2.25375 0.234
29 2.31413 0.2374
30 2.20564 0.2412
31 2.00827 0.2441
32 2.26211 0.2486
33 2.06503 0.2521
34 2.36415 0.2554
35 2.04373 0.2574
36 2.2507 0.2589
37 2.44489 0.2606
38 2.18389 0.2637
39 2.14119 0.2682
40 2.13096 0.2724
41 2.12657 0.2747
42 2.14146 0.2763
43 2.43236 0.2788
44 1.99696 0.2825
45 2.09842 0.2854
46 2.09897 0.2871
47 2.08322 0.2905
48 2.15861 0.2928
49 1.90472 0.2945
50 2.13452 0.2973
51 1.98231 0.3003
52 2.02409 0.3017
53 1.96896 0.3035
54 2.14343 0.3054
55 2.0167 0.3069
56 1.961