In [1]:
#Vanilla network on MNIST

In [2]:
#Import 
import tensorflow as tf                                         # For Machine Learning
from tensorflow.examples.tutorials.mnist import input_data      # MNIST dataset object

In [3]:
#Loading MNIST dataset into a variable with one-hot encoding i.e 20x28 image is a single vector of 784 and 
#each output is a vector of 10 labels with one at the label and zero elsewhere 
MNIST = input_data.read_data_sets("MNIST_data/", one_hot = True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [4]:
#Batch_size, epochs, learning_rate

batch_size = 128
n_epochs = 25
learning_rate = 0.01

In [5]:
#Define inputs
X = tf.placeholder(dtype = tf.float32, shape = [batch_size,784], name = 'images')
Y = tf.placeholder(dtype = tf.float32, shape = [batch_size,10], name = 'labels' )

In [6]:
#No of hidden units
n_hidden_units = 2000

In [7]:
#Weights and biases, Here we've used Xavier initialization which works well with relu activation function

w1 = tf.get_variable(dtype = tf.float32, shape = [784,n_hidden_units], 
                     initializer = tf.contrib.layers.xavier_initializer(), name = 'hidden_weights')

b1 = tf.Variable(tf.zeros(shape = [1,n_hidden_units]), name = "hidden_biases")

w2 = tf.get_variable(dtype = tf.float32, shape = [n_hidden_units,10], 
                     initializer = tf.contrib.layers.xavier_initializer(), name = 'output_weights')

b2 = tf.Variable(tf.zeros(shape = [1,10]), name = "output_biases")


In [8]:
#Compute the hidden layer scores
hidden_logits = tf.matmul(X,w1) + b1
#Compute activations at hidden layer
hidden_entropy = tf.nn.relu(hidden_logits)
#Compute scores at output layer
logits = tf.matmul(hidden_entropy,w2) + b2
#Compute Softmax loss at the output layer
output_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = Y)

In [9]:
#Loss calculation
loss = tf.reduce_mean(output_entropy)

In [10]:
#Optimizer for the network, We didn't use GradientDescentOptimizer here because it is comparitively slower to converge
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)

In [11]:
#Total Correct Predictions count
total_correct_preds = 0

In [12]:
#Initializer
init = tf.global_variables_initializer()

In [13]:
#Loss per epoch
loss_sum = 0

In [14]:
with tf.Session() as sess:
    sess.run(init)
    #To save the model, create Saver object
    saver = tf.train.Saver()
    #Training Model
    n_batches = int(MNIST.train.num_examples/batch_size)       #number of batches to train
    for i in range(n_epochs):           # training the network on the dataset for n_epochs 
        for _ in range(n_batches):
            X_batch, Y_batch = MNIST.train.next_batch(batch_size)   #next_batch generates data batch of batch_size
            opt, loss_value = sess.run([optimizer, loss], feed_dict = {X:X_batch, Y:Y_batch})
            loss_sum += loss_value
        loss_mean = loss_sum/n_batches
        print("Loss at epoch {} = ".format(i),loss_mean)
        loss_sum = 0
        
    saver.save(sess, './models/model.ckpt')
    
    #Testing Model
    n_batches = int(MNIST.test.num_examples/batch_size)   # test data batches
    for i in range(n_batches):
        X_batch, Y_batch = MNIST.test.next_batch(batch_size)
        opt, loss_batch, logits_batch = sess.run([optimizer, loss, logits], feed_dict = {X:X_batch, Y:Y_batch})
        preds = tf.nn.softmax(logits_batch)     #getting predictions for each image
        correct_preds = tf.equal(tf.argmax(preds,1), tf.argmax(Y_batch,1))  #Comparing predictions with actual labels
        accuracy = tf.reduce_sum(tf.cast(correct_preds,tf.float32))#Computing accuracy by suming over an array of ones
        total_correct_preds += sess.run(accuracy)
    print("Accuracy {}".format(total_correct_preds/MNIST.test.num_examples))

Loss at epoch 0 =  0.248375402568
Loss at epoch 1 =  0.11257444993
Loss at epoch 2 =  0.0999627288123
Loss at epoch 3 =  0.0751139060898
Loss at epoch 4 =  0.0760831276749
Loss at epoch 5 =  0.06545011892
Loss at epoch 6 =  0.0613262419699
Loss at epoch 7 =  0.059895885775
Loss at epoch 8 =  0.0535014577062
Loss at epoch 9 =  0.0452394372409
Loss at epoch 10 =  0.0514961969113
Loss at epoch 11 =  0.0413180849549
Loss at epoch 12 =  0.0589651852595
Loss at epoch 13 =  0.0469580705179
Loss at epoch 14 =  0.0427544364754
Loss at epoch 15 =  0.0428658477728
Loss at epoch 16 =  0.0488559990582
Loss at epoch 17 =  0.0417805833628
Loss at epoch 18 =  0.0362437764856
Loss at epoch 19 =  0.0470676963222
Loss at epoch 20 =  0.041320434305
Loss at epoch 21 =  0.0551784383337
Loss at epoch 22 =  0.0385405582677
Loss at epoch 23 =  0.0395969516026
Loss at epoch 24 =  0.0340368059119
Accuracy 0.9639


The accuracy on training a simple model for classification with right initializations and faster Optimization strategies seems very good. With a few more epochs to train it could go close to 97%. But MNIST classification is a solved problem with state of the art technologies reaching accuracies to about 99.80%. This is as far as we could go on MNIST with a Vanilla Nueral Network. One can try increasing the capacity of the model by adding more layers to the model, combining with dropout and batch normalization , but its unlikely that a Vanilla Neural Network would serve more than 97%. An attempt to add batch_normalization to the previous model is shown below. The accuracy actually reduced!

In [6]:
#Define inputs
X = tf.placeholder(dtype = tf.float32, shape = [batch_size,784], name = 'images')
Y = tf.placeholder(dtype = tf.float32, shape = [batch_size,10], name = 'labels' )

#No of hidden units
n_hidden_units = 2000

#Weights and biases, Here we've used Xavier initialization which works well with relu activation function

w1 = tf.get_variable(dtype = tf.float32, shape = [784,n_hidden_units], 
                     initializer = tf.contrib.layers.xavier_initializer(), name = 'hidden_weights')

b1 = tf.Variable(tf.zeros(shape = [1,n_hidden_units]), name = "hidden_biases")

w2 = tf.get_variable(dtype = tf.float32, shape = [n_hidden_units,10], 
                     initializer = tf.contrib.layers.xavier_initializer(), name = 'output_weights')

b2 = tf.Variable(tf.zeros(shape = [1,10]), name = "output_biases")

#Scale Batch Norm of inputs by gamma1
gamma1 = tf.Variable(tf.ones(shape = [batch_size,784]), dtype = tf.float32, trainable = True, name = 'gamma1')
#Shift Batch Norm of inouts by beta1
beta1 = tf.Variable(tf.ones(shape = [batch_size,784]), dtype = tf.float32, trainable = True, name = 'beta1')
#Scale Batch Norm of hidden activations by gamma2
gamma2 = tf.Variable(tf.ones(shape = [batch_size, n_hidden_units]), dtype = tf.float32, trainable = True, name = 'gamma2')
#Scale Batch Norm of hidden activations by beta2
beta2 = tf.Variable(tf.ones(shape = [batch_size,n_hidden_units]), dtype = tf.float32, trainable = True, name = 'beta2')
#Scalar to add to Variance
epsilon = 1e-3

#Compute the hidden layer scores
X_input = batch_norm(X,gamma1,beta1,epsilon)
hidden_logits = tf.matmul(X_input,w1) + b1
#Compute activations at hidden layer
hidden_entropy = tf.nn.relu(hidden_logits)
#Compute dropout
# drop_out = tf.nn.dropout(hidden_entropy, keep_prob = 0.5)/0.5 #Here, we are scaling up the activations because at test
#time we'll be using the entire network and not sub-samples
#Compute scores at output layer
hidden_acts = batch_norm(hidden_entropy,gamma2,beta2,epsilon)
logits = tf.matmul(hidden_acts,w2) + b2
#Compute Softmax loss at the output layer
output_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = Y)

#Loss calculation
loss = tf.reduce_mean(output_entropy)

#Optimizer for the network, We didn't use GradientDescentOptimizer here because it is comparitively slower to converge
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)

#Total Correct Predictions count
total_correct_preds = 0

#Initializer
init = tf.global_variables_initializer()

#Loss per epoch
loss_sum = 0

with tf.Session() as sess:
    sess.run(init)
    #To save the model, create Saver object
    saver = tf.train.Saver()
    #Training Model
    n_batches = int(MNIST.train.num_examples/batch_size)       #number of batches to train
    for i in range(n_epochs):           # training the network on the dataset for n_epochs 
        for _ in range(n_batches):
            X_batch, Y_batch = MNIST.train.next_batch(batch_size)   #next_batch generates data batch of batch_size
            opt, loss_value = sess.run([optimizer, loss], feed_dict = {X:X_batch, Y:Y_batch})
            loss_sum += loss_value
        loss_mean = loss_sum/n_batches
        print("Loss at epoch {} = ".format(i),loss_mean)
        loss_sum = 0
        
    saver.save(sess, './models/model.ckpt')
    
    #Testing Model
    n_batches = int(MNIST.test.num_examples/batch_size)   # test data batches
    for i in range(n_batches):
        X_batch, Y_batch = MNIST.test.next_batch(batch_size)
        X_test_input = batch_norm(X_batch,gamma1,beta1,epsilon) 
        opt, loss_batch, logits_batch = sess.run([optimizer, loss, logits], feed_dict = {X:sess.run(X_test_input), Y:Y_batch})
        preds = tf.nn.softmax(logits_batch)     #getting predictions for each image
        correct_preds = tf.equal(tf.argmax(preds,1), tf.argmax(Y_batch,1))  #Comparing predictions with actual labels
        accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32))#Computing accuracy by suming over an array of ones
        total_correct_preds += sess.run(accuracy)
    print("Accuracy {}".format(total_correct_preds/MNIST.test.num_examples))

Loss at epoch 0 =  1.26931818528
Loss at epoch 1 =  0.874514361084
Loss at epoch 2 =  0.713222288655
Loss at epoch 3 =  0.500095380492
Loss at epoch 4 =  0.378616767295
Loss at epoch 5 =  0.348061662853
Loss at epoch 6 =  0.297546567534
Loss at epoch 7 =  0.261044002079
Loss at epoch 8 =  0.254309315625
Loss at epoch 9 =  0.250619057045
Loss at epoch 10 =  0.25709687522
Loss at epoch 11 =  0.252009694158
Loss at epoch 12 =  0.273056957463
Loss at epoch 13 =  0.284037641669
Loss at epoch 14 =  0.276851629973
Loss at epoch 15 =  0.27925591509
Loss at epoch 16 =  0.266370114676
Loss at epoch 17 =  0.247087856482
Loss at epoch 18 =  0.266512148106
Loss at epoch 19 =  0.268455316255
Loss at epoch 20 =  0.264345035733
Loss at epoch 21 =  0.351741983528
Loss at epoch 22 =  0.396248425582
Loss at epoch 23 =  0.329770478511
Loss at epoch 24 =  0.322122983058
Accuracy 0.8732


In [5]:
def batch_norm(input_data, gamma, beta, epsilon):
    input_tensor = tf.convert_to_tensor(input_data)
    batch_mean , batch_var = tf.nn.moments(input_tensor, axes = [0])
    X_inputs = tf.nn.batch_normalization(input_tensor, mean = batch_mean, variance = batch_var, offset = beta, scale = gamma, variance_epsilon = epsilon)
    return X_inputs

### Batch Normalization

![](batch_norm.png)