This notebook contains code to train a neural network with one hidden layer on MNIST. At the end is a short exercise to add a second layer.

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import numpy as np
import math

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
sess = None
def ResetSession():
    tf.reset_default_graph()
    global sess
    if sess is not None: sess.close()
    sess = tf.InteractiveSession()

### Code for a single hidden layer nn

In [6]:
ResetSession()

mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

NUM_CLASSES = 10
NUM_PIXELS = 28*28
TRAIN_STEPS = 1000
BATCH_SIZE = 100

HIDDEN1_UNITS = 128
LEARNING_RATE = 0.5

x = tf.placeholder(dtype = tf.float32, shape = [None, NUM_PIXELS], name = "pixels")
y_ = tf.placeholder(dtype = tf.float32, shape = [None, NUM_CLASSES], name = "labels")

def weight_variable(inputs, outputs, name):
    initial = tf.truncated_normal(shape = [inputs, outputs], stddev=1.0/ math.sqrt(float(inputs)))
    return tf.Variable(initial, name=name)

def bias_variable(shape, name):
    initial = tf.constant(0.0, shape = [shape])
    return tf.Variable(initial, name=name)

weights1 = weight_variable(NUM_PIXELS, HIDDEN1_UNITS, "weights1")
biases1 = bias_variable(HIDDEN1_UNITS, "biases1")
hidden1 = tf.nn.relu(tf.matmul(x, weights1) + biases1, name="hidden1")

weights2 = weight_variable(HIDDEN1_UNITS, NUM_CLASSES, "weights2")
biases2 = bias_variable(NUM_CLASSES, "biases2")

y = tf.add(tf.matmul(hidden1, weights2), biases2)

summary_writer = tf.train.SummaryWriter("summaries/single_hidden_layer", graph=tf.get_default_graph())
summary_writer.close()

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

sess.run(tf.initialize_all_variables())

for i in range(TRAIN_STEPS):
    batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE)
    _, loss = sess.run([train_step, cross_entropy], feed_dict={x: batch_xs, y_: batch_ys})
    if i%200 == 0:
        print("loss is %f", loss)
        
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

print("Accuracy is %f", sess.run(accuracy, feed_dict = {x:mnist.test.images, y_: mnist.test.labels}))

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
loss is %f 2.34853
loss is %f 0.177169
loss is %f 0.273267
loss is %f 0.241952
loss is %f 0.0798445
Accuracy is %f 0.9672


### Exercise

Add a second hidden layer to the above code, with 64 units. Experiment with the parameters (batch size, steps, learning rate, units per layer) to see if you can achieve higher accuracy than the single hidden layer model. Keep in mind there's randomness between runs.

In [34]:
def run(train_steps, batch_size, units1, units2, learning_rate):
    ResetSession()

    mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

    NUM_CLASSES = 10
    NUM_PIXELS = 28*28
    TRAIN_STEPS = train_steps
    BATCH_SIZE = batch_size

    HIDDEN1_UNITS = units1
    HIDDEN2_UNITS = units2
    LEARNING_RATE = learning_rate

    x = tf.placeholder(dtype = tf.float32, shape = [None, NUM_PIXELS], name = "pixels")
    y_ = tf.placeholder(dtype = tf.float32, shape = [None, NUM_CLASSES], name = "labels")

    def weight_variable(inputs, outputs, name):
        initial = tf.truncated_normal(shape = [inputs, outputs], stddev=1.0/ math.sqrt(float(inputs)))
        return tf.Variable(initial, name=name)

    def bias_variable(shape, name):
        initial = tf.constant(0.0, shape = [shape])
        return tf.Variable(initial, name=name)

    weights1 = weight_variable(NUM_PIXELS, HIDDEN1_UNITS, "weights1")
    biases1 = bias_variable(HIDDEN1_UNITS, "biases1")
    hidden1 = tf.nn.relu(tf.matmul(x, weights1) + biases1, name="hidden1")

    weights2 = weight_variable(HIDDEN1_UNITS, HIDDEN2_UNITS, "weights2")
    biases2 = bias_variable(HIDDEN2_UNITS, "biases2")
    hidden2 = tf.nn.relu(tf.matmul(hidden1, weights2) + biases2, name = "hidden2")

    weights3 = weight_variable(HIDDEN2_UNITS, NUM_CLASSES, "weights3")
    biases3 = bias_variable(NUM_CLASSES, "biases3")

    y = tf.add(tf.matmul(hidden2, weights3), biases3)

    summary_writer = tf.train.SummaryWriter("summaries/single_hidden_layer", graph=tf.get_default_graph())
    summary_writer.close()

    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

    sess.run(tf.initialize_all_variables())

    for i in range(TRAIN_STEPS):
        batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE)
        _, loss = sess.run([train_step, cross_entropy], feed_dict={x: batch_xs, y_: batch_ys})
        if i%200 == 0:
            print("At step %d, loss is %f" % (i, loss))

    correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    print("Accuracy is %f", sess.run(accuracy, feed_dict = {x:mnist.test.images, y_: mnist.test.labels}))
    return sess.run(accuracy, feed_dict = {x:mnist.test.images, y_: mnist.test.labels})

In [46]:
class GsRunner():
    
    def __init__(
        self,
        train_steps = [1000,2000,5000,],
        batch_size = [100, 250, 500, 1000,],
        units1 = [4, 8, 16, 32, 64, 128, 256,],
        units2 = [4, 8, 16, 32, 64, 128, 256,],
        learning_rate = [0.3, 0.4, 0.5, 0.6]
                ):
        self.best = {
            "val": 0, 
            "train_steps": None, 
            "batch_size": None, 
            "units1": None, 
            "units2": None, 
            "learning_rate": None
        }
        self.HP = {
            "train_steps": train_steps,
            "batch_size": batch_size,
            "units1": units1,
            "units2": units2,
            "learning_rate": learning_rate,
            
        }

    def update(self, val, **kwargs):
        if val>self.best["val"]:
            self.best["val"] = val
            for k in kwargs.keys():
                self.best[k] = kwargs[k]
            return

    def runner(self):
        for train_steps in self.HP["train_steps"]:
            for batch_size in self.HP["batch_size"]:
                for units1 in self.HP["units1"]:
                    for units2 in self.HP["units2"]:
                        for learning_rate in self.HP["learning_rate"]:
                            kwargs={
                                    "train_steps": train_steps,
                                    "batch_size": batch_size,
                                    "units1": units1,
                                    "units2": units2,
                                    "learning_rate": learning_rate,
                                    
                                }
                            self.update(run(train_steps, batch_size, units1, units2, learning_rate),
                                       **kwargs)
        return self.best

In [47]:
x = GsRunner(train_steps=[1000], batch_size=[50], units1=[4,8], units2=[4,8], learning_rate=[0.5,0.8])
x.runner()

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
At step 0, loss is 2.313547
At step 200, loss is 1.130637
At step 400, loss is 1.025067
At step 600, loss is 1.173610
At step 800, loss is 2.207865
Accuracy is %f 0.2126
Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
At step 0, loss is 2.313341
At step 200, loss is 2.320842
At step 400, loss is 2.297128
At step 600, loss is 1.738284
At step 800, loss is 1.741569
Accuracy is %f 0.2839
Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
At step 0, loss is 2.300416
At step 200, loss is 0.956067
At step 400, loss is 1.146738
At

{'batch_size': 50,
 'learning_rate': 0.5,
 'train_steps': 1000,
 'units1': 8,
 'units2': 8,
 'val': 0.83179998}

In [None]:
y = GsRunner()
thebest = y.runner()
print("Best accuracy found is ", thebest)

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
At step 0, loss is 2.317746
At step 200, loss is 2.298059
At step 400, loss is 2.299014
At step 600, loss is 2.310136
At step 800, loss is 2.299799
Accuracy is %f 0.1135
Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
At step 0, loss is 2.317506
At step 200, loss is 1.921731
At step 400, loss is 2.709044
At step 600, loss is 1.386878
At step 800, loss is 1.441304
Accuracy is %f 0.4215
Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
At step 0, loss is 2.312166
At step 200, loss is 2.086142
At step 400, loss is 1.855787
At