# Deep Learning
## Assignment 3
Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.
The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in 1_notmnist.ipynb.

In [2]:
pickle_file = 'notMNIST.pickle'

# Read the pickled dictionary of arrays; the file handle is only needed for
# the load itself.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Unpack the three splits into the notebook-level names used by later cells.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dict reference so the gc can reclaim it

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:

* data as a flat matrix,
* labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  """Flatten the images and one-hot encode the labels.

  Args:
    dataset: array reshaped to rows of ``image_size * image_size`` values.
    labels: 1-D array of integer class ids in ``[0, num_labels)``.

  Returns:
    ``(flat_dataset, one_hot_labels)``, both ``float32``. Row ``i`` of
    ``one_hot_labels`` is 1.0 in column ``labels[i]`` and 0.0 elsewhere.
  """
  flat_dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Broadcasting a column of labels against the row of class ids gives one
  # boolean row per example, e.g. 1 -> [0, 1, 0, ...], 2 -> [0, 0, 1, ...].
  class_ids = np.arange(num_labels)
  one_hot_labels = (labels[:, None] == class_ids).astype(np.float32)
  return flat_dataset, one_hot_labels
# Reformat only while the labels are still 1-D class ids. This keeps the cell
# idempotent: feeding already one-hot labels back into reformat() would
# silently produce arrays of the wrong shape instead of raising.
if train_labels.ndim == 1:
  train_dataset, train_labels = reformat(train_dataset, train_labels)
  valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
  test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label.

  Args:
    predictions: 2-D array of per-class scores, one row per example.
    labels: 2-D array of per-class targets (e.g. one-hot), same row order.

  Returns:
    ``100 * correct / predictions.shape[0]``.
  """
  predicted_class = np.argmax(predictions, 1)
  true_class = np.argmax(labels, 1)
  num_correct = np.sum(predicted_class == true_class)
  return 100.0 * num_correct / predictions.shape[0]

## Problem 1
Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `tf.nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

In [5]:
##### logistic model
image_size = 28
def create_log_model_and_run(graph,
                             train_dataset,
                             train_labels,
                             valid_dataset,
                             valid_labels,
                             test_dataset,
                             test_labels,
                             beta,
                             num_steps,
                             num_labels=10, batch_size=128):
    """Build, train and evaluate an L2-penalised softmax (logistic) classifier.

    The model is one affine layer followed by softmax. It is built inside
    ``graph`` and trained with plain gradient descent (learning rate 0.5) on
    consecutive minibatches sliced from ``train_dataset``. Progress is printed
    every 500 steps.

    Args:
      graph: graph the ops are added to (entered via ``graph.as_default()``).
      train_dataset, train_labels: training inputs / targets, sliced row-wise
        into minibatches of ``batch_size`` rows.
      valid_dataset, valid_labels: validation split; the inputs are embedded
        in the graph as a constant.
      test_dataset, test_labels: test split; the inputs are embedded in the
        graph as a constant.
      beta: multiplier applied to ``tf.nn.l2_loss(weights)`` in the loss.
      num_steps: number of minibatch updates to run.
      num_labels: number of output classes.
      batch_size: rows per minibatch.

    Returns:
      Test accuracy, as computed by ``accuracy()``, after the last step.
    """
    num_features = image_size * image_size

    with graph.as_default():
        # Minibatches are fed at run time; the evaluation splits are constants.
        batch_inputs = tf.placeholder(tf.float32, shape=(batch_size, num_features))
        batch_targets = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        valid_inputs = tf.constant(valid_dataset)
        test_inputs = tf.constant(test_dataset)

        # Model parameters.
        weights = tf.Variable(tf.truncated_normal([num_features, num_labels]))
        biases = tf.Variable(tf.zeros([num_labels]))

        def affine(inputs):
            """Logits of the linear model for ``inputs``."""
            return tf.matmul(inputs, weights) + biases

        # Cross-entropy plus the weighted L2 penalty, averaged over the batch.
        logits = affine(batch_inputs)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, batch_targets)
        loss = tf.reduce_mean(cross_entropy + beta * tf.nn.l2_loss(weights))

        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        # Class probabilities for each split.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(affine(valid_inputs))
        test_prediction = tf.nn.softmax(affine(test_inputs))

    with tf.Session(graph=graph) as session:
        session.run(tf.initialize_all_variables())
        print("Initialized")
        usable_rows = train_labels.shape[0] - batch_size
        for step in range(num_steps):
            # Walk through the training data one batch at a time, wrapping
            # around before a slice could run past the end.
            start = (step * batch_size) % usable_rows
            rows = slice(start, start + batch_size)
            batch_data = train_dataset[rows, :]
            batch_labels = train_labels[rows, :]
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction],
                feed_dict={batch_inputs: batch_data, batch_targets: batch_labels})
            if step % 500 != 0:
                continue
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                session.run(valid_prediction), valid_labels))
        test_accuracy = accuracy(session.run(test_prediction), test_labels)
        print("Test accuracy: %.1f%%" % test_accuracy)

    return test_accuracy
    

In [6]:
num_steps = 3001

# Train one logistic model per regularisation strength and record its test
# accuracy at the matching index.
# NOTE(review): picking beta on test accuracy leaks the test set into model
# selection; validation accuracy would be the usual criterion.
betas = [0.001,0.01,0.1,1,10]
test_accuracy = np.zeros(len(betas))
for i, beta in enumerate(betas):
  # beta is a plain coefficient, not a percentage, so the header carries no '%'.
  print("\n>>>>>>>>>> Beta: %f" % beta)
  # A fresh graph per run keeps the models independent of each other.
  graph = tf.Graph()
  test_accuracy[i] = create_log_model_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         beta,
                         num_steps)


>>>>>>>>>> Beta: 0.001000%
Initialized
Minibatch loss at step 0: 22.028656
Minibatch accuracy: 9.4%
Validation accuracy: 17.7%
Minibatch loss at step 500: 2.538383
Minibatch accuracy: 80.5%
Validation accuracy: 76.0%
Minibatch loss at step 1000: 1.742854
Minibatch accuracy: 78.9%
Validation accuracy: 78.0%
Minibatch loss at step 1500: 0.959154
Minibatch accuracy: 84.4%
Validation accuracy: 79.6%
Minibatch loss at step 2000: 0.834365
Minibatch accuracy: 89.8%
Validation accuracy: 80.9%
Minibatch loss at step 2500: 0.818891
Minibatch accuracy: 80.5%
Validation accuracy: 81.4%
Minibatch loss at step 3000: 0.783801
Minibatch accuracy: 84.4%
Validation accuracy: 81.7%
Test accuracy: 88.9%

>>>>>>>>>> Beta: 0.010000%
Initialized
Minibatch loss at step 0: 50.588020
Minibatch accuracy: 9.4%
Validation accuracy: 13.0%
Minibatch loss at step 500: 0.728273
Minibatch accuracy: 86.7%
Validation accuracy: 81.5%
Minibatch loss at step 1000: 0.799771
Minibatch accuracy: 80.5%
Validation accuracy: 81.

In [7]:
# Report the beta with the highest recorded test accuracy, and that accuracy.
print("*** Best beta:%s -- accuracy:%s" % (betas[np.argmax(test_accuracy)], np.max(test_accuracy)))

*** Best beta:0.001 -- accuracy:88.9


In [8]:
##### nn model 
def create_nn_model_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         beta,
                         num_steps,
                         num_labels=10,batch_size = 128):
    """Build, train and evaluate a one-hidden-layer ReLU network with L2 penalty.

    The network (inputs -> 1024 ReLU units -> ``num_labels`` logits) is built
    inside ``graph`` and trained with gradient descent (learning rate 0.5) on
    consecutive minibatches of ``train_dataset``. Progress is printed every
    500 steps.

    Args:
      graph: graph the ops are added to (entered via ``graph.as_default()``).
      train_dataset, train_labels: training inputs / targets, sliced row-wise
        into minibatches of ``batch_size`` rows.
      valid_dataset, valid_labels: validation split; inputs become a graph
        constant.
      test_dataset, test_labels: test split; inputs become a graph constant.
      beta: multiplier applied to the summed ``tf.nn.l2_loss`` of both weight
        matrices.
      num_steps: number of minibatch updates to run.
      num_labels: number of output classes.
      batch_size: rows per minibatch.

    Returns:
      Test accuracy, as computed by ``accuracy()``, of the model after the
      final training step.
    """
    num_features = image_size * image_size  # image_size is a notebook-level global
    num_hidden = 1024

    with graph.as_default():
        # Input data. The training minibatch is fed at run time through
        # placeholders; validation and test inputs are graph constants.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Hidden layer. `name` is given to tf.Variable so the variable itself
        # (not just its initializer op) carries the label.
        weights_1 = tf.Variable(tf.truncated_normal([num_features, num_hidden]),
                                name='weights_1')
        biases_1 = tf.Variable(tf.zeros([num_hidden]), name='biases_1')

        # Output layer, sized from num_labels so it always matches the label
        # placeholder above.
        weights_2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels]),
                                name='weights_2')
        biases_2 = tf.Variable(tf.zeros([num_labels]), name='biases_2')

        def forward(inputs):
            """Logits of the two-layer network for ``inputs``."""
            hidden = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)
            return tf.matmul(hidden, weights_2) + biases_2

        # Cross-entropy plus the weighted L2 penalty, averaged over the batch.
        logits = forward(tf_train_dataset)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) +
            beta * (tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2)))

        # Optimizer.
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        # Predictions for the training, validation, and test data.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(forward(tf_valid_dataset))
        test_prediction = tf.nn.softmax(forward(tf_test_dataset))

    with tf.Session(graph=graph) as session:
        session.run(tf.initialize_all_variables())
        print("Initialized")
        for step in range(num_steps):
            # Walk through the training data one batch at a time, wrapping
            # around before a slice could run past the end.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Map each placeholder to the numpy minibatch fed into it.
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if (step % 500 == 0):
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(
                    valid_prediction.eval(), valid_labels))
        # Score the test set once, after the last update, so the result always
        # reflects the final weights regardless of num_steps.
        test_accuracy = accuracy(test_prediction.eval(), test_labels)
        print("Test accuracy: %.1f%%" % test_accuracy)
    return test_accuracy

In [9]:
# Train one hidden-layer network per regularisation strength and record its
# test accuracy at the matching index.
# NOTE(review): picking beta on test accuracy leaks the test set into model
# selection; validation accuracy would be the usual criterion.
betas = [0.001,0.01,0.1,1,10]
test_accuracy = np.zeros(len(betas))
for i, beta in enumerate(betas):
  # beta is a plain coefficient, not a percentage, so the header carries no '%'.
  print("\n>>>>>>>>>> Beta: %f" % beta)
  # A fresh graph per run keeps the models independent of each other.
  graph = tf.Graph()
  test_accuracy[i] = create_nn_model_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         beta,
                         num_steps)


>>>>>>>>>> Beta: 0.001000%
Initialized
Minibatch loss at step 0: 741.934448
Minibatch accuracy: 6.2%
Validation accuracy: 31.2%
Minibatch loss at step 500: 194.260941
Minibatch accuracy: 82.8%
Validation accuracy: 78.9%
Minibatch loss at step 1000: 114.849434
Minibatch accuracy: 81.2%
Validation accuracy: 80.8%
Minibatch loss at step 1500: 68.388474
Minibatch accuracy: 88.3%
Validation accuracy: 82.1%
Minibatch loss at step 2000: 41.218170
Minibatch accuracy: 89.8%
Validation accuracy: 84.8%
Minibatch loss at step 2500: 25.179317
Minibatch accuracy: 85.9%
Validation accuracy: 86.0%
Minibatch loss at step 3000: 15.523145
Minibatch accuracy: 87.5%
Validation accuracy: 86.4%
Test accuracy: 93.0%

>>>>>>>>>> Beta: 0.010000%
Initialized
Minibatch loss at step 0: 3456.065186
Minibatch accuracy: 11.7%
Validation accuracy: 26.9%
Minibatch loss at step 500: 21.216394
Minibatch accuracy: 85.9%
Validation accuracy: 84.2%
Minibatch loss at step 1000: 0.951375
Minibatch accuracy: 82.0%
Validation 

In [10]:
# Report the beta with the highest recorded test accuracy, and that accuracy.
print("*** Best beta:%s -- accuracy:%s" % (betas[np.argmax(test_accuracy)], np.max(test_accuracy)))

*** Best beta:0.001 -- accuracy:93.02


## Problem 2
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

In [11]:
def create_nn_model_overfit_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         beta,
                         num_steps,
                         num_labels=10,batch_size = 128):
    """Train the one-hidden-layer network on a single fixed minibatch.

    Same model and loss as the regular L2-regularised network (inputs -> 1024
    ReLU units -> ``num_labels`` logits), but every training step is fed the
    same ``batch_size`` rows of ``train_dataset`` to provoke overfitting.
    Progress is printed every 500 steps.

    Args:
      graph: graph the ops are added to (entered via ``graph.as_default()``).
      train_dataset, train_labels: training inputs / targets; only one
        ``batch_size``-row slice is ever used.
      valid_dataset, valid_labels: validation split; inputs become a graph
        constant.
      test_dataset, test_labels: test split; inputs become a graph constant.
      beta: multiplier applied to the summed ``tf.nn.l2_loss`` of both weight
        matrices.
      num_steps: number of gradient updates to run.
      num_labels: number of output classes.
      batch_size: rows in the fixed minibatch.

    Returns:
      Test accuracy, as computed by ``accuracy()``, of the model after the
      final training step.
    """
    num_features = image_size * image_size  # image_size is a notebook-level global
    num_hidden = 1024

    with graph.as_default():
        # Input data. The training minibatch is fed at run time through
        # placeholders; validation and test inputs are graph constants.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Hidden layer. `name` is given to tf.Variable so the variable itself
        # (not just its initializer op) carries the label.
        weights_1 = tf.Variable(tf.truncated_normal([num_features, num_hidden]),
                                name='weights_1')
        biases_1 = tf.Variable(tf.zeros([num_hidden]), name='biases_1')

        # Output layer, sized from num_labels so it always matches the label
        # placeholder above.
        weights_2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels]),
                                name='weights_2')
        biases_2 = tf.Variable(tf.zeros([num_labels]), name='biases_2')

        def forward(inputs):
            """Logits of the two-layer network for ``inputs``."""
            hidden = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)
            return tf.matmul(hidden, weights_2) + biases_2

        # Cross-entropy plus the weighted L2 penalty, averaged over the batch.
        logits = forward(tf_train_dataset)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) +
            beta * (tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2)))

        # Optimizer.
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        # Predictions for the training, validation, and test data.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(forward(tf_valid_dataset))
        test_prediction = tf.nn.softmax(forward(tf_test_dataset))

    with tf.Session(graph=graph) as session:
        session.run(tf.initialize_all_variables())
        print("Initialized")
        # The offset does not depend on the step, so the very same minibatch is
        # used for every update; build it (and its feed dict) once.
        offset = (1 * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        for step in range(num_steps):
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if (step % 500 == 0):
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(
                    valid_prediction.eval(), valid_labels))
        # Score the test set once, after the last update, so the result always
        # reflects the final weights regardless of num_steps.
        test_accuracy = accuracy(test_prediction.eval(), test_labels)
        print("Test accuracy: %.1f%%" % test_accuracy)
    return test_accuracy

In [12]:
# Repeat the single-minibatch (overfitting) experiment for each
# regularisation strength and record the test accuracy at the matching index.
# NOTE(review): picking beta on test accuracy leaks the test set into model
# selection; validation accuracy would be the usual criterion.
betas = [0.001,0.01,0.1,1,10]
test_accuracy = np.zeros(len(betas))
for i, beta in enumerate(betas):
  # beta is a plain coefficient, not a percentage, so the header carries no '%'.
  print("\n>>>>>>>>>> Beta: %f" % beta)
  # A fresh graph per run keeps the models independent of each other.
  graph = tf.Graph()
  test_accuracy[i] = create_nn_model_overfit_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         beta,
                         num_steps)


>>>>>>>>>> Beta: 0.001000%
Initialized
Minibatch loss at step 0: 741.652832
Minibatch accuracy: 11.7%
Validation accuracy: 30.1%
Minibatch loss at step 500: 190.727005
Minibatch accuracy: 100.0%
Validation accuracy: 63.2%
Minibatch loss at step 1000: 115.667313
Minibatch accuracy: 100.0%
Validation accuracy: 63.2%
Minibatch loss at step 1500: 70.146988
Minibatch accuracy: 100.0%
Validation accuracy: 63.0%
Minibatch loss at step 2000: 42.541008
Minibatch accuracy: 100.0%
Validation accuracy: 63.0%
Minibatch loss at step 2500: 25.799240
Minibatch accuracy: 100.0%
Validation accuracy: 63.0%
Minibatch loss at step 3000: 15.646235
Minibatch accuracy: 100.0%
Validation accuracy: 63.1%
Test accuracy: 70.6%

>>>>>>>>>> Beta: 0.010000%
Initialized
Minibatch loss at step 0: 3489.166016
Minibatch accuracy: 4.7%
Validation accuracy: 36.2%
Minibatch loss at step 500: 21.016916
Minibatch accuracy: 100.0%
Validation accuracy: 68.3%
Minibatch loss at step 1000: 0.340512
Minibatch accuracy: 100.0%
Val

In [13]:
# Report the beta with the highest recorded test accuracy, and that accuracy.
print("*** Best beta:%s -- accuracy:%s" % (betas[np.argmax(test_accuracy)], np.max(test_accuracy)))

*** Best beta:0.01 -- accuracy:79.21


## Problem 3
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `tf.nn.dropout()` for that, but you have to make sure it's only inserted during training.
What happens to our extreme overfitting case?


In [14]:
##### nn model with dropout 
def create_nn_model_dropout_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         dropout,
                         num_steps,
                         num_labels=10,batch_size = 128):
    """Build, train and evaluate a one-hidden-layer ReLU network with dropout.

    The network (inputs -> 1024 ReLU units -> dropout -> ``num_labels``
    logits) is built inside ``graph`` and trained with gradient descent
    (learning rate 0.5) on consecutive minibatches. Dropout is applied only on
    the training path; the validation and test predictions use the full
    hidden layer. No L2 penalty is used. Progress is printed every 500 steps.

    Args:
      graph: graph the ops are added to (entered via ``graph.as_default()``).
      train_dataset, train_labels: training inputs / targets, sliced row-wise
        into minibatches of ``batch_size`` rows.
      valid_dataset, valid_labels: validation split; inputs become a graph
        constant.
      test_dataset, test_labels: test split; inputs become a graph constant.
      dropout: passed as the second positional argument of ``tf.nn.dropout``
        (the keep probability in the TensorFlow releases this notebook's API
        calls target -- confirm if upgrading).
      num_steps: number of minibatch updates to run.
      num_labels: number of output classes.
      batch_size: rows per minibatch.

    Returns:
      Test accuracy, as computed by ``accuracy()``, of the model after the
      final training step.
    """
    num_features = image_size * image_size  # image_size is a notebook-level global
    num_hidden = 1024

    with graph.as_default():
        # Input data. The training minibatch is fed at run time through
        # placeholders; validation and test inputs are graph constants.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Hidden layer. `name` is given to tf.Variable so the variable itself
        # (not just its initializer op) carries the label.
        weights_1 = tf.Variable(tf.truncated_normal([num_features, num_hidden]),
                                name='weights_1')
        biases_1 = tf.Variable(tf.zeros([num_hidden]), name='biases_1')

        # Output layer, sized from num_labels so it always matches the label
        # placeholder above.
        weights_2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels]),
                                name='weights_2')
        biases_2 = tf.Variable(tf.zeros([num_labels]), name='biases_2')

        def forward(inputs, use_dropout=False):
            """Logits for ``inputs``; dropout on the hidden layer is optional."""
            hidden = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)
            if use_dropout:
                hidden = tf.nn.dropout(hidden, dropout)
            return tf.matmul(hidden, weights_2) + biases_2

        # Training path: dropout active, plain cross-entropy loss.
        logits = forward(tf_train_dataset, use_dropout=True)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

        # Optimizer.
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        # Predictions for the training, validation, and test data.
        # NOTE: train_prediction is taken from the dropped-out logits, so the
        # reported minibatch accuracy is measured with dropout still active.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(forward(tf_valid_dataset))
        test_prediction = tf.nn.softmax(forward(tf_test_dataset))

    with tf.Session(graph=graph) as session:
        session.run(tf.initialize_all_variables())
        print("Initialized")
        for step in range(num_steps):
            # Walk through the training data one batch at a time, wrapping
            # around before a slice could run past the end.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Map each placeholder to the numpy minibatch fed into it.
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if (step % 500 == 0):
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(
                    valid_prediction.eval(), valid_labels))
        # Score the test set once, after the last update, so the result always
        # reflects the final weights regardless of num_steps.
        test_accuracy = accuracy(test_prediction.eval(), test_labels)
        print("Test accuracy: %.1f%%" % test_accuracy)
    return test_accuracy

In [15]:
# Train the dropout network on the full training set; 0.5 is the value handed
# to tf.nn.dropout for the hidden layer.
graph = tf.Graph()
test_accuracy = create_nn_model_dropout_and_run(
    graph,
    train_dataset, train_labels,
    valid_dataset, valid_labels,
    test_dataset, test_labels,
    dropout=0.5,
    num_steps=num_steps)
   

Initialized
Minibatch loss at step 0: 474.254456
Minibatch accuracy: 12.5%
Validation accuracy: 27.3%
Minibatch loss at step 500: 24.837698
Minibatch accuracy: 79.7%
Validation accuracy: 80.0%
Minibatch loss at step 1000: 14.139109
Minibatch accuracy: 74.2%
Validation accuracy: 80.1%
Minibatch loss at step 1500: 18.663691
Minibatch accuracy: 75.8%
Validation accuracy: 79.5%
Minibatch loss at step 2000: 5.172076
Minibatch accuracy: 76.6%
Validation accuracy: 79.5%
Minibatch loss at step 2500: 8.879705
Minibatch accuracy: 78.1%
Validation accuracy: 80.0%
Minibatch loss at step 3000: 4.006877
Minibatch accuracy: 68.0%
Validation accuracy: 79.1%
Test accuracy: 87.1%


In [16]:
def create_nn_model_overfit_dropout_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         dropout,
                         num_steps,
                         num_labels=10,batch_size = 128):
    """Train the dropout network on a single fixed minibatch.

    Same model as the regular dropout network (inputs -> 1024 ReLU units ->
    dropout -> ``num_labels`` logits, no L2 penalty), but every training step
    is fed the same ``batch_size`` rows of ``train_dataset`` to provoke
    overfitting. Dropout is applied only on the training path. Progress is
    printed every 500 steps.

    Args:
      graph: graph the ops are added to (entered via ``graph.as_default()``).
      train_dataset, train_labels: training inputs / targets; only one
        ``batch_size``-row slice is ever used.
      valid_dataset, valid_labels: validation split; inputs become a graph
        constant.
      test_dataset, test_labels: test split; inputs become a graph constant.
      dropout: passed as the second positional argument of ``tf.nn.dropout``
        (the keep probability in the TensorFlow releases this notebook's API
        calls target -- confirm if upgrading).
      num_steps: number of gradient updates to run.
      num_labels: number of output classes.
      batch_size: rows in the fixed minibatch.

    Returns:
      Test accuracy, as computed by ``accuracy()``, of the model after the
      final training step.
    """
    num_features = image_size * image_size  # image_size is a notebook-level global
    num_hidden = 1024

    with graph.as_default():
        # Input data. The training minibatch is fed at run time through
        # placeholders; validation and test inputs are graph constants.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        tf_valid_dataset = tf.constant(valid_dataset)
        tf_test_dataset = tf.constant(test_dataset)

        # Hidden layer. `name` is given to tf.Variable so the variable itself
        # (not just its initializer op) carries the label.
        weights_1 = tf.Variable(tf.truncated_normal([num_features, num_hidden]),
                                name='weights_1')
        biases_1 = tf.Variable(tf.zeros([num_hidden]), name='biases_1')

        # Output layer, sized from num_labels so it always matches the label
        # placeholder above.
        weights_2 = tf.Variable(tf.truncated_normal([num_hidden, num_labels]),
                                name='weights_2')
        biases_2 = tf.Variable(tf.zeros([num_labels]), name='biases_2')

        def forward(inputs, use_dropout=False):
            """Logits for ``inputs``; dropout on the hidden layer is optional."""
            hidden = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)
            if use_dropout:
                hidden = tf.nn.dropout(hidden, dropout)
            return tf.matmul(hidden, weights_2) + biases_2

        # Training path: dropout active, plain cross-entropy loss.
        logits = forward(tf_train_dataset, use_dropout=True)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

        # Optimizer.
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        # Predictions for the training, validation, and test data.
        # NOTE: train_prediction is taken from the dropped-out logits, so the
        # reported minibatch accuracy is measured with dropout still active.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(forward(tf_valid_dataset))
        test_prediction = tf.nn.softmax(forward(tf_test_dataset))

    with tf.Session(graph=graph) as session:
        session.run(tf.initialize_all_variables())
        print("Initialized")
        # The offset does not depend on the step, so the very same minibatch is
        # used for every update; build it (and its feed dict) once.
        offset = (1 * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        for step in range(num_steps):
            _, l, predictions = session.run(
                [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if (step % 500 == 0):
                print("Minibatch loss at step %d: %f" % (step, l))
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(
                    valid_prediction.eval(), valid_labels))
        # Score the test set once, after the last update, so the result always
        # reflects the final weights regardless of num_steps.
        test_accuracy = accuracy(test_prediction.eval(), test_labels)
        print("Test accuracy: %.1f%%" % test_accuracy)
    return test_accuracy

In [17]:
# Repeat the single-minibatch (overfitting) experiment with dropout; 0.5 is
# the value handed to tf.nn.dropout for the hidden layer.
graph = tf.Graph()
test_accuracy = create_nn_model_overfit_dropout_and_run(
    graph,
    train_dataset, train_labels,
    valid_dataset, valid_labels,
    test_dataset, test_labels,
    dropout=0.5,
    num_steps=num_steps)

Initialized
Minibatch loss at step 0: 463.747772
Minibatch accuracy: 9.4%
Validation accuracy: 23.8%
Minibatch loss at step 500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 68.4%
Minibatch loss at step 1000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 69.7%
Minibatch loss at step 1500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 70.4%
Minibatch loss at step 2000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 70.2%
Minibatch loss at step 2500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 69.6%
Minibatch loss at step 3000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 69.8%
Test accuracy: 76.7%


## Problem 4
Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.
One avenue you can explore is to add multiple layers.
Another one is to use learning rate decay:
    

global_step = tf.Variable(0)  # count the number of steps taken.

learning_rate = tf.train.exponential_decay(0.5, global_step, ...)

optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

In [74]:
##### nn model 

import math 

def create_multi_nn_model_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         beta,
                         num_steps,
                         num_labels=10,batch_size = 128,hidden_size=1024,
                         initial_learning_rate=0.5,
                         decay_steps=100000,
                         decay_rate=0.96):
    """Build, train and evaluate a two-hidden-layer ReLU network.

    The network (inputs -> ``hidden_size`` ReLU -> ``hidden_size`` ReLU ->
    ``num_labels`` logits) is built inside ``graph``. Weights are drawn from a
    truncated normal with stddev ``1 / sqrt(fan_in)``; biases start at zero.
    Training uses gradient descent with a staircase exponential learning-rate
    schedule and an L2 penalty on all three weight matrices. Progress is
    printed every 500 steps and on the last step.

    Args:
      graph: graph the ops are added to (entered via ``graph.as_default()``).
      train_dataset, train_labels: training inputs / targets, sliced row-wise
        into minibatches of ``batch_size`` rows.
      valid_dataset, valid_labels: validation split; inputs become a graph
        constant.
      test_dataset, test_labels: test split; inputs become a graph constant.
      beta: multiplier applied to the summed ``tf.nn.l2_loss`` of the three
        weight matrices.
      num_steps: number of minibatch updates to run.
      num_labels: number of output classes.
      batch_size: rows per minibatch.
      hidden_size: number of units in each hidden layer.
      initial_learning_rate: starting learning rate of the schedule.
      decay_steps: number of steps between learning-rate reductions. With the
        staircase schedule the rate stays at ``initial_learning_rate`` for
        any run shorter than ``decay_steps``, so the default of 100000 means
        no decay happens in a run of a few thousand steps; lower it to make
        the decay effective.
      decay_rate: factor applied to the learning rate every ``decay_steps``.

    Returns:
      Test accuracy, as computed by ``accuracy()``, of the model after the
      final training step.
    """
    num_features = image_size * image_size  # image_size is a notebook-level global

    with graph.as_default():
      # Input data. The training minibatch is fed at run time through
      # placeholders; validation and test inputs are graph constants.
      tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, num_features))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)

      # Hidden 1
      with tf.name_scope('hidden1'):
        weights1 = tf.Variable(
            tf.truncated_normal([num_features, hidden_size],
                                stddev=1.0 / math.sqrt(float(num_features))),
            name='weights1')
        biases1 = tf.Variable(tf.zeros([hidden_size]),
                             name='biases1')

      # Hidden 2
      with tf.name_scope('hidden2'):
        weights2 = tf.Variable(
            tf.truncated_normal([hidden_size, hidden_size],
                                stddev=1.0 / math.sqrt(float(hidden_size))),
            name='weights2')
        biases2 = tf.Variable(tf.zeros([hidden_size]),
                             name='biases2')

      # Linear
      with tf.name_scope('softmax_linear'):
        weightsS = tf.Variable(
            tf.truncated_normal([hidden_size, num_labels],
                                stddev=1.0 / math.sqrt(float(hidden_size))),
            name='weightsS')
        biasesS = tf.Variable(tf.zeros([num_labels]),name='biasesS')

      def forward(inputs):
        """Logits of the three-layer network for ``inputs``."""
        hidden1 = tf.nn.relu(tf.matmul(inputs, weights1) + biases1)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, weights2) + biases2)
        return tf.matmul(hidden2, weightsS) + biasesS

      # Cross-entropy plus the weighted L2 penalty, averaged over the batch.
      logits = forward(tf_train_dataset)
      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)+
                            beta*(tf.nn.l2_loss(weights1)+tf.nn.l2_loss(weights2)+
                                  tf.nn.l2_loss(weightsS)) )

      # Optimizer. global_step is incremented by minimize() on every update
      # and drives the learning-rate schedule.
      global_step = tf.Variable(0) # count the number of steps taken.
      learning_rate = tf.train.exponential_decay(
          initial_learning_rate, global_step, decay_steps, decay_rate, staircase=True)
      optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

      # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits)
      valid_prediction = tf.nn.softmax(forward(tf_valid_dataset))
      test_prediction = tf.nn.softmax(forward(tf_test_dataset))

    with tf.Session(graph=graph) as session:
        session.run(tf.initialize_all_variables())
        print("Initialized")
        for step in range(num_steps):
            # Walk through the training data one batch at a time, wrapping
            # around before a slice could run past the end.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Map each placeholder to the numpy minibatch fed into it.
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
            _, l, predictions = session.run(
              [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if (step % 500 == 0 or step ==(num_steps-1)):
              print("Minibatch loss at step %d: %f" % (step, l))
              print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
              print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
        # Score the test set once, after the last update, instead of at every
        # logging step; only the final value is ever reported or returned.
        test_accuracy = accuracy(test_prediction.eval(), test_labels)
        print("Test accuracy: %.1f%%" % test_accuracy)
    return test_accuracy

In [76]:
# Train one two-hidden-layer network per regularisation strength and record
# its test accuracy at the matching index.
# NOTE(review): picking beta on test accuracy leaks the test set into model
# selection; validation accuracy would be the usual criterion.
betas = [0,0.001,0.1,1]
test_accuracy = np.zeros(len(betas))
for i, beta in enumerate(betas):
  # beta is a plain coefficient, not a percentage, so the header carries no '%'.
  print("\n>>>>>>>>>> Beta: %f" % beta)
  # A fresh graph per run keeps the models independent of each other.
  graph = tf.Graph()
  test_accuracy[i] = create_multi_nn_model_and_run(graph,
                         train_dataset,
                         train_labels,
                         valid_dataset,
                         valid_labels,
                         test_dataset,
                         test_labels,
                         beta,
                         num_steps=3300,hidden_size=900)


>>>>>>>>>> Beta: 0.000000%
Initialized
Minibatch loss at step 0: 2.291443
Minibatch accuracy: 10.9%
Validation accuracy: 37.5%
Minibatch loss at step 500: 0.349952
Minibatch accuracy: 89.1%
Validation accuracy: 86.1%
Minibatch loss at step 1000: 0.471818
Minibatch accuracy: 85.9%
Validation accuracy: 87.2%
Minibatch loss at step 1500: 0.239537
Minibatch accuracy: 93.0%
Validation accuracy: 88.3%
Minibatch loss at step 2000: 0.239688
Minibatch accuracy: 94.5%
Validation accuracy: 88.7%
Minibatch loss at step 2500: 0.301067
Minibatch accuracy: 91.4%
Validation accuracy: 89.2%
Minibatch loss at step 3000: 0.318308
Minibatch accuracy: 88.3%
Validation accuracy: 89.2%
Minibatch loss at step 3299: 0.184534
Minibatch accuracy: 93.0%
Validation accuracy: 89.4%
Test accuracy: 95.1%

>>>>>>>>>> Beta: 0.001000%
Initialized
Minibatch loss at step 0: 3.020672
Minibatch accuracy: 7.8%
Validation accuracy: 31.8%
Minibatch loss at step 500: 0.825769
Minibatch accuracy: 89.1%
Validation accuracy: 85.6

In [77]:
# Report the beta with the highest recorded test accuracy, and that accuracy.
print("*** Best beta:%s -- accuracy:%s" % (betas[np.argmax(test_accuracy)], np.max(test_accuracy)))

*** Best beta:0 -- accuracy:95.08
