# Batch Normalization
$ \begin{split} 1.\quad & \mathbf{\mu}_B = \dfrac{1}{m_B}\sum\limits_{i=1}^{m_B}{\mathbf{x}^{(i)}}\\ 2.\quad & {\mathbf{\sigma}_B}^2 = \dfrac{1}{m_B}\sum\limits_{i=1}^{m_B}{(\mathbf{x}^{(i)} - \mathbf{\mu}_B)^2}\\ 3.\quad & \hat{\mathbf{x}}^{(i)} = \dfrac{\mathbf{x}^{(i)} - \mathbf{\mu}_B}{\sqrt{{\mathbf{\sigma}_B}^2 + \epsilon}}\\ 4.\quad & \mathbf{z}^{(i)} = \gamma \hat{\mathbf{x}}^{(i)} + \beta \end{split} $

In [7]:
from functools import partial

import tensorflow as tf

tf.reset_default_graph()

# Neural network dimensions
n_inputs = 28 * 28  # MNIST dataset: one flattened 28x28 image per row
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')

# `training` placeholder: feed True during training, leave at the default
# (False) otherwise. It selects whether batch normalization uses the current
# mini-batch's mean and standard deviation or the statistics accumulated
# over training.
training = tf.placeholder_with_default(False, shape=(), name='training')

# Shared batch-normalization factory, applied to every layer *before* its
# activation function.
my_batch_norm = partial(tf.layers.batch_normalization, training=training, momentum=0.9)

# Hidden layer 1: dense (no activation) -> batch norm -> ELU
hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
bn1 = my_batch_norm(hidden1)
bn1_act = tf.nn.elu(bn1)  # ELU is used instead of ReLU

# Hidden layer 2: dense (no activation) -> batch norm -> ELU
hidden2 = tf.layers.dense(bn1_act, n_hidden2, name='hidden2')
bn2 = my_batch_norm(hidden2)
bn2_act = tf.nn.elu(bn2)

# Logits (output layer). It must consume the *activated* output of hidden
# layer 2 (`bn2_act`); feeding `bn2` would silently skip the second ELU.
logits_pre_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
logits = my_batch_norm(logits_pre_bn)

In [11]:
from functools import partial

import tensorflow as tf

tf.reset_default_graph()

# Neural network dimensions
n_inputs = 28 * 28  # MNIST dataset: one flattened 28x28 image per row
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

learning_rate = 0.1

# Training data
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
# Note the trailing comma: `(None)` is just `None` (completely unknown shape),
# whereas `(None,)` declares the intended 1-D vector of integer class ids.
y = tf.placeholder(tf.int64, shape=(None,), name='y')

# `training` placeholder: feed True during training, leave at the default
# (False) otherwise. It selects whether batch normalization uses the current
# mini-batch's mean and standard deviation or the statistics accumulated
# over training.
training = tf.placeholder_with_default(False, shape=(), name='training')


with tf.name_scope('DNN'):
    # He initialization for the dense kernels
    he_init = tf.contrib.layers.variance_scaling_initializer()

    # Shared batch-normalization factory, applied to every layer *before*
    # its activation function.
    my_batch_norm = partial(tf.layers.batch_normalization, training=training, momentum=0.9)

    # Dense layer factory using the He initializer
    my_dense_layer = partial(tf.layers.dense, kernel_initializer=he_init)

    # Hidden layer 1: dense -> batch norm -> ELU
    hidden1 = my_dense_layer(X, n_hidden1, name='hidden1')
    bn1 = my_batch_norm(hidden1)
    bn1_act = tf.nn.elu(bn1)

    # Hidden layer 2: dense -> batch norm -> ELU
    hidden2 = my_dense_layer(bn1_act, n_hidden2, name='hidden2')
    bn2 = my_batch_norm(hidden2)
    bn2_act = tf.nn.elu(bn2)

    # Logits (output layer): dense -> batch norm, no activation
    logits_pre_bn = my_dense_layer(bn2_act, n_outputs, name='outputs')
    logits = my_batch_norm(logits_pre_bn)


with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    # NOTE(review): this op is the mean cross-entropy, not a mean squared
    # error; the name 'mse' is misleading but is kept so that any lookup of
    # the op by name keeps working.
    loss = tf.reduce_mean(xentropy, name='mse')


with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # Batch normalization registers its moving-average updates in UPDATE_OPS.
    # Making the training op depend on them guarantees they run on every
    # training step, even when `training_op` is fetched on its own; fetching
    # the update ops explicitly alongside it remains harmless.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        training_op = optimizer.minimize(loss)


with tf.name_scope('eval'):
    # A prediction is correct when the true class has the highest logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))


init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [12]:
# Training schedule: number of passes over the training set, and the number
# of examples requested per mini-batch.
n_epochs, batch_size = 20, 200

In [None]:
from tensorflow.examples.tutorials.mnist import input_data

# Load MNIST via the TensorFlow tutorial helper; the resulting object exposes
# the `train`, `validation` and `test` splits used by the training cell.
mnist_dir = "/tmp/data/"
mnist = input_data.read_data_sets(mnist_dir)

In [21]:
# Ops registered under UPDATE_OPS (batch normalization puts its moving-average
# updates there); they are fetched together with the training op on each step.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    sess.run(init)

    n_batches = mnist.train.num_examples // batch_size
    validation_feed = {X: mnist.validation.images, y: mnist.validation.labels}

    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            train_feed = {training: True, X: X_batch, y: y_batch}
            sess.run([training_op, extra_update_ops], feed_dict=train_feed)

        # End-of-epoch report. `training` is not fed here, so it keeps its
        # default value (False). The "batch" accuracy is measured on the last
        # mini-batch of the epoch only.
        acc_train = sess.run(accuracy, feed_dict={X: X_batch, y: y_batch})
        acc_valid = sess.run(accuracy, feed_dict=validation_feed)
        print(epoch, "Batch accuracy:", acc_train, "Validation accuracy:", acc_valid)

    # Final evaluation on the held-out test split, then checkpoint the model.
    test_feed = {X: mnist.test.images, y: mnist.test.labels}
    acc_test = sess.run(accuracy, feed_dict=test_feed)
    print("\nTest accuracy:", acc_test)
    save_path = saver.save(sess, './my_model_final.ckpt')

0 Batch accuracy: 0.96 Validation accuracy: 0.9448
1 Batch accuracy: 1.0 Validation accuracy: 0.9612
2 Batch accuracy: 1.0 Validation accuracy: 0.9676
3 Batch accuracy: 1.0 Validation accuracy: 0.9736
4 Batch accuracy: 0.995 Validation accuracy: 0.9728
5 Batch accuracy: 0.99 Validation accuracy: 0.9748
6 Batch accuracy: 1.0 Validation accuracy: 0.9752
7 Batch accuracy: 1.0 Validation accuracy: 0.9758
8 Batch accuracy: 1.0 Validation accuracy: 0.9768
9 Batch accuracy: 1.0 Validation accuracy: 0.9754
10 Batch accuracy: 1.0 Validation accuracy: 0.9768
11 Batch accuracy: 1.0 Validation accuracy: 0.9786
12 Batch accuracy: 1.0 Validation accuracy: 0.9784
13 Batch accuracy: 1.0 Validation accuracy: 0.9776
14 Batch accuracy: 0.995 Validation accuracy: 0.9798
15 Batch accuracy: 1.0 Validation accuracy: 0.9796
16 Batch accuracy: 1.0 Validation accuracy: 0.9776
17 Batch accuracy: 1.0 Validation accuracy: 0.9812
18 Batch accuracy: 1.0 Validation accuracy: 0.9786
19 Batch accuracy: 1.0 Validation a

In [22]:
# Show the name of every trainable variable TensorFlow currently reports
# (dense kernels/biases and the batch-norm beta/gamma parameters).
[variable.name for variable in tf.trainable_variables()]

['hidden1/kernel:0',
 'hidden1/bias:0',
 'batch_normalization/beta:0',
 'batch_normalization/gamma:0',
 'hidden2/kernel:0',
 'hidden2/bias:0',
 'batch_normalization_1/beta:0',
 'batch_normalization_1/gamma:0',
 'outputs/kernel:0',
 'outputs/bias:0',
 'batch_normalization_2/beta:0',
 'batch_normalization_2/gamma:0']