In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
#Reloading the data we generated in notMNIST.ipynb for simple gradient descent

pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save #hint to help gc free up memory
    print("Training set", train_dataset.shape, train_labels.shape)
    print("Validation set", valid_dataset.shape, valid_labels.shape)
    print("Test set", test_dataset.shape, test_labels.shape)

Training set (20000, 28, 28) (20000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


In [3]:
#Reformatting into a shape that's more adapted to the models we are going to train,i.e.,
#1. Data as flat matrix, and
#2. Labels as float 1-hot encodings.

image_size = 28
num_labels = 10

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size*image_size)).astype(np.float32)
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

print("Training set", train_dataset.shape, train_labels.shape)
print("Validation set", valid_dataset.shape, valid_labels.shape)
print("Test set", test_dataset.shape, test_labels.shape)

Training set (20000, 784) (20000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
#First training multinomial logistic regression using Gradient Descent.
#TensorFlow works like this...

#Firstly, we need to describe the computation we want to see performed: what the inputs, the variables and 
#the operations looks like. These get created as nodes over a computation graph,i.e., with graph.as_default():

#Secondly, after creating nodes on the graph, we can run the operations on the graph as many times as we want by
#calling session.run() , providing it's outputs to fetch from the graph that get returned. The run time operation is:
#    with tf.Session(graph=graph) as session:..

#Now loading all the data into the tensorflow and building the computation graph corresponding to our training.

#with gradient descent training this much data is prohibitive, hence, we are taking the subset of data for faster turnaround.

train_subset = 10000

graph = tf.Graph()
with graph.as_default():
    
    #loading the traing, validation and test data into constants that are attached to the graph.
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    #Variables
    #These are the parameters that are going to be trained.The weight matrix will be initialized using random values
    #following a (truncated) normal distribution. The biases get initialized to zero.
    weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    
    #Training computation.
    #We multiply the inputs with the weight matrix,and add biases. WEe compute the softmax and cross-entropy(it's one 
    #operation in tensorflow). We take the average of this cross-entropy across all training examples(that'sour loss).
    logits = tf.matmul(tf_train_dataset, weights) + biases
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
    #finding the minimum of this loss using gradient descent with learning rate as low as ~0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    #predictions for training, validation and test data. These are not part of training nut merely here to report
    #accuracy figure as we train.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [5]:
#running the above computation with iterating 801 time.

num_steps = 801

def accuracy(predictions, labels):
    return (100.0*np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0])

with tf.Session(graph=graph) as session:
    #this is a one time operation which ensures the parameters get initialized as we describe in the graph:
    #random weights for the matrix and zeros for the biases.
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        #Running the computations. We tell .run() that we want to run the optimizer, get the loss value and the training
        #predictions returned as numpy array.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if(step % 100 == 0):
            print("Loss at step %d: %f" % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(predictions, train_labels[:train_subset, :]))
            #calling .eval() on vallid_prediction is basically like callin run(), but just to get 
            #that one numpy array. It recomputes all it's graph dependencies.
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
            
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 13.042050
Training accuracy: 15.2%
Validation accuracy: 22.0%
Loss at step 100: 2.334176
Training accuracy: 72.1%
Validation accuracy: 71.0%
Loss at step 200: 1.876901
Training accuracy: 74.8%
Validation accuracy: 73.3%
Loss at step 300: 1.636075
Training accuracy: 76.3%
Validation accuracy: 74.3%
Loss at step 400: 1.473008
Training accuracy: 77.1%
Validation accuracy: 74.7%
Loss at step 500: 1.350556
Training accuracy: 77.6%
Validation accuracy: 74.7%
Loss at step 600: 1.253723
Training accuracy: 78.2%
Validation accuracy: 75.0%
Loss at step 700: 1.174543
Training accuracy: 78.7%
Validation accuracy: 75.2%
Loss at step 800: 1.108290
Training accuracy: 78.9%
Validation accuracy: 75.3%
Test accuracy: 82.8%


In [6]:
#switching to Stochastic Gradient Descent (SGD) training, which is much faster. 
#The graph will be similar, except that instead of holding all the training data in the constant node,
#we create a placeholder nodewhich will be fed actual data at every call of session.run()

batch_size = 128

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
  biases = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  logits = tf.matmul(tf_train_dataset, weights) + biases
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(
    tf.matmul(tf_valid_dataset, weights) + biases)
  test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [7]:
#running the above SGD computation and iterating it over 3000 times.

num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        #picking an offset with training data which has been randomized (we could use better randomization across epoch).
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        #generating a minbatch
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        #preparing a dictionary, telling the session where to fit the minibatch.
        #The key of the dictionary is the placeholder node of the graph to be fed,
        #and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict = feed_dict)
        if(step%100 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" %accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" %accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" %accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 16.489494
Minibatch accuracy: 4.7%
Validation accuracy: 11.8%
Minibatch loss at step 100: 2.819848
Minibatch accuracy: 66.4%
Validation accuracy: 70.7%
Minibatch loss at step 200: 1.133360
Minibatch accuracy: 79.7%
Validation accuracy: 72.8%
Minibatch loss at step 300: 2.144049
Minibatch accuracy: 71.1%
Validation accuracy: 73.3%
Minibatch loss at step 400: 1.580330
Minibatch accuracy: 77.3%
Validation accuracy: 74.4%
Minibatch loss at step 500: 0.934724
Minibatch accuracy: 80.5%
Validation accuracy: 73.6%
Minibatch loss at step 600: 1.603681
Minibatch accuracy: 77.3%
Validation accuracy: 74.5%
Minibatch loss at step 700: 1.337184
Minibatch accuracy: 75.8%
Validation accuracy: 74.2%
Minibatch loss at step 800: 1.727438
Minibatch accuracy: 74.2%
Validation accuracy: 75.2%
Minibatch loss at step 900: 1.515379
Minibatch accuracy: 71.9%
Validation accuracy: 74.9%
Minibatch loss at step 1000: 1.150156
Minibatch accuracy: 75.8%
Validation accuracy: 75.2%

In [13]:
#turning the logistic regression with SGD into a 1-hidden layer neural network with rectified linear units nn.relu() and 1024 hidden nodes. This model should improve our validation/test accuracy.

num_nodes = 1024
batch_size = 128

graph = tf.Graph()
with graph.as_default():
    #input data
    #For the training data, we use a placeholder that will be fed at runtime with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    #variables
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_nodes]))
    biases1 = tf.Variable(tf.zeros([num_nodes]))
    weights2 = tf.Variable(tf.truncated_normal([num_nodes, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    
    #training computation
    logits1 = tf.matmul(tf_train_dataset, weights1) + biases1
    relu_layer = tf.nn.relu(logits1)
    logits2 = tf.matmul(relu_layer, weights2) + biases2
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf_train_labels, logits = logits2))
    
    #optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    #predictions for training
    train_prediction = tf.nn.softmax(logits2)
    
    #predictions for validation
    logits1 = tf.matmul(tf_valid_dataset, weights1) + biases1
    relu_layer = tf.nn.relu(logits1)
    logits2 = tf.matmul(relu_layer, weights2) + biases2
    valid_prediction = tf.nn.softmax(logits2)
    
    #predictions for test
    logits1 = tf.matmul(tf_test_dataset, weights1) + biases1
    relu_layer = tf.nn.relu(logits1)
    logits2 = tf.matmul(relu_layer, weights2) + biases2
    test_prediction = tf.nn.softmax(logits2)

In [15]:
#running and iterating the relu computation

num_steps = 3001

with tf.Session(graph = graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        #picking an offset with training data that has been normalized(we could use better randomization across epochs)
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        #generating minibatch
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        #preparing a dixtionary. Telling a session where to feed the minibatch. The key of the dictionary is the placeholder node of the graph to be fed, and the value is th numpy array to fed it
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict = feed_dict)
        if(step%100 ==0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" %accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" %accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" %accuracy(test_prediction.eval(), test_labels))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Minibatch loss at step 0: 395.735962
Minibatch accuracy: 8.6%
Validation accuracy: 40.5%
Minibatch loss at step 100: 35.624657
Minibatch accuracy: 78.1%
Validation accuracy: 68.6%
Minibatch loss at step 200: 23.435997
Minibatch accuracy: 82.0%
Validation accuracy: 78.9%
Minibatch loss at step 300: 21.818375
Minibatch accuracy: 78.9%
Validation accuracy: 80.1%
Minibatch loss at step 400: 6.638374
Minibatch accuracy: 85.2%
Validation accuracy: 79.6%
Minibatch loss at step 500: 2.314669
Minibatch accuracy: 93.0%
Validation accuracy: 80.1%
Minibatch loss at step 600: 7.265114
Minibatch accuracy: 87.5%
Validation accuracy: 80.4%
Minibatch loss at step 700: 6.322563
Minibatch accuracy: 89.1%
Validation accuracy: 79.8%
Minibatch loss at step 800: 2.709992
Minibatch accuracy: 87.5%
Validation accuracy: 80.6%
Minibatch loss at step 900: 2.385769
Minibatch accuracy: 90.6%
Validation accuracy: 80.9%
Minibatch lo