Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

# NOTE: pickle.load can execute arbitrary code; only open pickle files that
# you generated yourself (here: the output of 1_notmnist.ipynb).
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three (dataset, labels) splits out of the unpickled dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dictionary reference; hint to help gc free up memory

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  """Flatten the images and one-hot encode the labels.

  Args:
    dataset: numpy array whose elements can be reshaped into rows of
      image_size * image_size values (e.g. shape (N, 28, 28) or (N, 784)).
    labels: numpy array of labels. A 1-D array of integer class ids is
      expanded to a float32 one-hot matrix of shape (N, num_labels). An array
      with any other rank is assumed to be one-hot encoded already and is only
      cast to float32, so calling this function again on its own output (for
      instance when the notebook cell is re-run) is harmless.

  Returns:
    (dataset, labels) as float32 arrays of shape (N, image_size * image_size)
    and (N, num_labels).
  """
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  if labels.ndim == 1:
    # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
  else:
    # Already encoded: re-encoding would broadcast to a bogus (N, 1, num_labels) array.
    labels = labels.astype(np.float32)
  return dataset, labels
# Flatten every split and one-hot encode its labels (rebinds the globals).
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the (dataset, labels) shapes of each split after reformatting.
print('Training set', *(arr.shape for arr in (train_dataset, train_labels)))
print('Validation set', *(arr.shape for arr in (valid_dataset, valid_labels)))
print('Test set', *(arr.shape for arr in (test_dataset, test_labels)))

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label.

  Both arguments are 2-D arrays with one row per example; the class of a row
  is the index of its largest entry along axis 1.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [5]:
# Keep references to the full splits so later cells can shrink the working
# copies and restore them afterwards.
train_dataset_original, train_labels_original = train_dataset, train_labels
valid_dataset_original, valid_labels_original = valid_dataset, valid_labels
test_dataset_original, test_labels_original = test_dataset, test_labels

In [6]:
# Logistic Model

batch_size = 128

def logistic_model_train_test(batch_size, beta=1e-02, num_steps=3001):
    """Train an L2-regularized softmax (logistic) classifier and print its accuracy.

    Builds a fresh TensorFlow graph with a single weights/biases layer, trains
    it with minibatch gradient descent (learning rate 0.5) and prints the
    minibatch loss, minibatch accuracy and validation accuracy every 500
    steps, followed by the test accuracy once training ends.

    The data is not passed in: the function reads the module-level
    train_dataset, train_labels, valid_dataset, valid_labels, test_dataset and
    test_labels (as well as image_size, num_labels and accuracy) at call time.

    Args:
      batch_size: number of training examples fed per step.
      beta: multiplier of tf.nn.l2_loss(weights) added to the loss;
        0 disables the penalty. Biases are not penalized.
      num_steps: number of training steps to run.

    Returns:
      None. Results are only printed.
    """
    graph = tf.Graph()
    with graph.as_default():

      # Input data. For the training data, we use a placeholder that will be fed
      # at run time with a training minibatch.
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, image_size * image_size))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      # The whole validation and test sets are embedded in the graph as constants.
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)

      # Variables.
      weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]))
      biases = tf.Variable(tf.zeros([num_labels]))

      # Training computation.
      logits = tf.matmul(tf_train_dataset, weights) + biases
      loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
      # regularization
      loss = loss + beta*tf.nn.l2_loss(weights)

      # Optimizer.
      optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

      # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits)
      valid_prediction = tf.nn.softmax(
        tf.matmul(tf_valid_dataset, weights) + biases)
      test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

    ###############################################################################

    # Let TensorFlow grow its GPU memory allocation on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(graph=graph, config=config) as session:
      tf.global_variables_initializer().run()
      print("Initialized")
      for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Report progress every 500 steps; the printed loss includes the L2 term.
        if (step % 500 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy(
            valid_prediction.eval(), valid_labels))
      print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [7]:
logistic_model_train_test(batch_size, beta=1e-02)

Initialized
Minibatch loss at step 0: 51.961105
Minibatch accuracy: 8.6%
Validation accuracy: 14.3%
Minibatch loss at step 500: 1.181631
Minibatch accuracy: 69.5%
Validation accuracy: 80.9%
Minibatch loss at step 1000: 0.808629
Minibatch accuracy: 83.6%
Validation accuracy: 81.7%
Minibatch loss at step 1500: 0.725195
Minibatch accuracy: 82.0%
Validation accuracy: 81.5%
Minibatch loss at step 2000: 0.760283
Minibatch accuracy: 79.7%
Validation accuracy: 81.3%
Minibatch loss at step 2500: 0.804104
Minibatch accuracy: 79.7%
Validation accuracy: 81.3%
Minibatch loss at step 3000: 0.849642
Minibatch accuracy: 79.7%
Validation accuracy: 81.1%
Test accuracy: 87.4%


Test accuracy with regularization ~ 87.4%

Test accuracy without regularization ~ 84.6% (from assignment 2)

In [8]:
# Neural network model

batch_size = 128
hidden_nodes = 1024

def graph_computation(inputs, weights_1, biases_1, weights_2, biases_2, labels=None, dropout_keep_pb=1):
    """Forward pass of the one-hidden-layer network: linear -> ReLU -> dropout -> linear.

    Args:
      inputs: input batch, multiplied by weights_1.
      weights_1, biases_1: parameters of the hidden layer.
      weights_2, biases_2: parameters of the output layer.
      labels: optional one-hot targets. When given, the mean softmax
        cross-entropy between labels and the logits is returned as the loss.
      dropout_keep_pb: keep probability handed to tf.nn.dropout for the hidden
        activations; the default 1 keeps every element (no dropout).

    Returns:
      A (logits, loss) pair. loss is None when labels is None.
    """
    first_layer_out = tf.matmul(inputs, weights_1) + biases_1
    relu_out = tf.nn.relu(first_layer_out)
    # use dropout: https://www.tensorflow.org/get_started/mnist/pros
    # if dropout_keep_pb == 1, all elements in Tensor are kept, otherwise dropout is applied
    dropout_out = tf.nn.dropout(relu_out, dropout_keep_pb)
    logits = tf.matmul(dropout_out, weights_2) + biases_2
    if labels is not None:
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        return logits, loss
    # No labels, no loss. The previous `return logits, _` only worked because
    # IPython defines a global `_` (last cell output); elsewhere it is a NameError.
    return logits, None

def neural_network_model_train_test(batch_size, hidden_nodes, beta, keep_training_prob=1, num_steps=3001):
    """Train a one-hidden-layer ReLU network and print its accuracy.

    Builds a fresh TensorFlow graph (input -> hidden ReLU layer -> dropout ->
    output layer, via graph_computation), trains it with minibatch gradient
    descent (learning rate 0.5) and prints the minibatch loss, minibatch
    accuracy and validation accuracy every 500 steps, followed by the test
    accuracy once training ends.

    The data is not passed in: the function reads the module-level
    train_dataset, train_labels, valid_dataset, valid_labels, test_dataset and
    test_labels (as well as image_size, num_labels and accuracy) at call time.

    Args:
      batch_size: number of training examples fed per step.
      hidden_nodes: number of units in the hidden layer.
      beta: multiplier applied to the L2 loss of both weight matrices;
        0 disables the penalty. Biases are not penalized.
      keep_training_prob: dropout keep probability fed during training steps.
        The validation and test predictions are built with graph_computation's
        default keep probability of 1.
      num_steps: number of training steps to run.

    Returns:
      None. Results are only printed.
    """
    graph = tf.Graph()
    with graph.as_default():
      # Input data. For the training data, we use a placeholder that will be fed
      # at run time with a training minibatch.
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, image_size * image_size))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)
        
      #dropout prob
      # Placeholder so the keep probability is supplied per run through feed_dict.
      keep_prob = tf.placeholder(tf.float32)

      # Variables.
      weights_1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, hidden_nodes]))
      biases_1 = tf.Variable(tf.zeros([hidden_nodes]))

      weights_2 = tf.Variable(
        tf.truncated_normal([hidden_nodes, num_labels]))
      biases_2 = tf.Variable(tf.zeros([num_labels]))

      # Training computation.
      logits, loss = graph_computation(tf_train_dataset, 
                                       weights_1, biases_1, weights_2, biases_2, 
                                       labels=tf_train_labels, dropout_keep_pb=keep_prob)
      # regularization (if beta==0 we have no regularization term) 
      loss = loss + beta*tf.nn.l2_loss(weights_1) + beta*tf.nn.l2_loss(weights_2)

      # Optimizer.
      optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

      # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits)

      # Evaluation passes share the trained variables and use no labels, so the
      # second value returned by graph_computation is discarded.
      valid_logits, _ = graph_computation(tf_valid_dataset, 
                                          weights_1, biases_1, weights_2, biases_2)
      valid_prediction = tf.nn.softmax(valid_logits)

      test_logits, _ = graph_computation(tf_test_dataset, 
                                          weights_1, biases_1, weights_2, biases_2)

      test_prediction = tf.nn.softmax(test_logits)


    # Let TensorFlow grow its GPU memory allocation on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(graph=graph, config=config) as session:
      tf.global_variables_initializer().run()
      print("Initialized")
      for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : keep_training_prob}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Report progress every 500 steps; the printed loss includes the L2 terms.
        if (step % 500 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy(
            valid_prediction.eval(), valid_labels))
      print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [9]:
neural_network_model_train_test(batch_size, hidden_nodes, beta=1e-03)

Initialized
Minibatch loss at step 0: 715.210327
Minibatch accuracy: 4.7%
Validation accuracy: 32.5%
Minibatch loss at step 500: 201.939026
Minibatch accuracy: 75.0%
Validation accuracy: 78.2%
Minibatch loss at step 1000: 114.155663
Minibatch accuracy: 86.7%
Validation accuracy: 82.7%
Minibatch loss at step 1500: 68.577736
Minibatch accuracy: 85.9%
Validation accuracy: 82.1%
Minibatch loss at step 2000: 41.379742
Minibatch accuracy: 83.6%
Validation accuracy: 85.1%
Minibatch loss at step 2500: 25.388046
Minibatch accuracy: 86.7%
Validation accuracy: 86.2%
Minibatch loss at step 3000: 15.552467
Minibatch accuracy: 82.8%
Validation accuracy: 86.9%
Test accuracy: 92.7%


Test accuracy with regularization ~ 92.7%

Test accuracy without regularization ~ 87% (from assignment 2)

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

Note the difference between Minibatch accuracy and Validation and Testing accuracies

Minibatch accuracies reach 100%, while validation and testing accuracies drop to ~60%, ~70%

Test accuracies also drop compared with training on the whole original dataset (see Problem 1 accuracies)

---

In [10]:
# Restrict your training data to just a few batches
# NOTE(review): the validation and test splits are truncated to the same
# batch_size*2 examples, so the accuracies printed by the following cells are
# measured on that small subset only -- confirm this is intended.
(train_dataset, train_labels,
 valid_dataset, valid_labels,
 test_dataset, test_labels) = (
    full_split[:batch_size*2]
    for full_split in (train_dataset_original, train_labels_original,
                       valid_dataset_original, valid_labels_original,
                       test_dataset_original, test_labels_original))

In [11]:
logistic_model_train_test(batch_size, beta=1e-02, num_steps=3001)

Initialized
Minibatch loss at step 0: 46.801533
Minibatch accuracy: 11.7%
Validation accuracy: 14.1%
Minibatch loss at step 500: 0.316294
Minibatch accuracy: 100.0%
Validation accuracy: 64.8%
Minibatch loss at step 1000: 0.120715
Minibatch accuracy: 100.0%
Validation accuracy: 65.2%
Minibatch loss at step 1500: 0.119184
Minibatch accuracy: 100.0%
Validation accuracy: 64.8%
Minibatch loss at step 2000: 0.118989
Minibatch accuracy: 100.0%
Validation accuracy: 64.8%
Minibatch loss at step 2500: 0.118851
Minibatch accuracy: 100.0%
Validation accuracy: 64.5%
Minibatch loss at step 3000: 0.118750
Minibatch accuracy: 100.0%
Validation accuracy: 64.5%
Test accuracy: 82.8%


In [12]:
neural_network_model_train_test(batch_size, hidden_nodes, beta=1e-03)

Initialized
Minibatch loss at step 0: 583.121704
Minibatch accuracy: 11.7%
Validation accuracy: 33.6%
Minibatch loss at step 500: 190.537430
Minibatch accuracy: 100.0%
Validation accuracy: 64.5%
Minibatch loss at step 1000: 115.552414
Minibatch accuracy: 100.0%
Validation accuracy: 64.8%
Minibatch loss at step 1500: 70.077324
Minibatch accuracy: 100.0%
Validation accuracy: 65.2%
Minibatch loss at step 2000: 42.498714
Minibatch accuracy: 100.0%
Validation accuracy: 65.6%
Minibatch loss at step 2500: 25.773546
Minibatch accuracy: 100.0%
Validation accuracy: 66.0%
Minibatch loss at step 3000: 15.630507
Minibatch accuracy: 100.0%
Validation accuracy: 66.4%
Test accuracy: 79.3%


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

If we use Dropout while removing L2 regularization, the network keeps overfitting, although not as much as the case with no dropout but L2 regularization (Training acc vs. Validation acc is 100% vs. ~70.3% ). The test accuracy is better than the case with no dropout but L2 regularization ( ~75.8% vs. ~84.0%). Minibatch loss is 0 from step 500 to the end of training.

If we use Dropout while keeping L2 regularization, the network keeps overfitting (Training acc vs. Validation acc is 100% vs. ~68.0%). However, the minibatch loss does not reach zero in any iteration. Test accuracy is slightly different from the previous case (which applies Dropout and no L2 regularization); sometimes it is better than the previous case and sometimes it is worse.

---

In [13]:
# Dropout only: beta=0 removes the L2 penalty, half of the hidden activations are kept
neural_network_model_train_test(
    batch_size,
    hidden_nodes,
    beta=0,
    keep_training_prob=0.5,
)

Initialized
Minibatch loss at step 0: 526.441162
Minibatch accuracy: 8.6%
Validation accuracy: 29.3%
Minibatch loss at step 500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 66.8%
Minibatch loss at step 1000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 67.6%
Minibatch loss at step 1500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 2000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 66.8%
Minibatch loss at step 2500: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 67.6%
Minibatch loss at step 3000: 0.000000
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Test accuracy: 82.0%


In [14]:
# Dropout combined with the L2 penalty (beta=1e-03)
neural_network_model_train_test(
    batch_size,
    hidden_nodes,
    beta=1e-03,
    keep_training_prob=0.5,
)

Initialized
Minibatch loss at step 0: 762.987549
Minibatch accuracy: 7.0%
Validation accuracy: 39.8%
Minibatch loss at step 500: 191.642883
Minibatch accuracy: 100.0%
Validation accuracy: 66.8%
Minibatch loss at step 1000: 116.228081
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 1500: 70.490662
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Minibatch loss at step 2000: 42.751842
Minibatch accuracy: 100.0%
Validation accuracy: 68.8%
Minibatch loss at step 2500: 26.011810
Minibatch accuracy: 99.2%
Validation accuracy: 68.4%
Minibatch loss at step 3000: 15.724291
Minibatch accuracy: 100.0%
Validation accuracy: 68.0%
Test accuracy: 84.4%


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [15]:
# Restore original data: rebind the working names to the full splits saved earlier
train_dataset, train_labels = train_dataset_original, train_labels_original
valid_dataset, valid_labels = valid_dataset_original, valid_labels_original
test_dataset, test_labels = test_dataset_original, test_labels_original

In [23]:
# Neural network model with learning rate decay and a second hidden layer

def graph_computation_layer1(inputs, weights_1, biases_1):
    """Return the ReLU activations of the first hidden layer: relu(inputs . weights_1 + biases_1)."""
    pre_activation = tf.matmul(inputs, weights_1) + biases_1
    return tf.nn.relu(pre_activation)

def graph_computation_layer2(inputs, weights_1, biases_1, weights_2, biases_2, labels=None, dropout_keep_pb=1):
    """Apply a hidden layer and an output layer: linear -> ReLU -> dropout -> linear.

    Args:
      inputs: input batch, multiplied by weights_1.
      weights_1, biases_1: parameters of the hidden layer computed here.
      weights_2, biases_2: parameters of the output layer.
      labels: optional one-hot targets. When given, the mean softmax
        cross-entropy between labels and the logits is returned as the loss.
      dropout_keep_pb: keep probability handed to tf.nn.dropout for the hidden
        activations; the default 1 keeps every element (no dropout).

    Returns:
      A (logits, loss) pair. loss is None when labels is None.
    """
    first_layer_out = tf.matmul(inputs, weights_1) + biases_1
    relu_out = tf.nn.relu(first_layer_out)
    # use dropout: https://www.tensorflow.org/get_started/mnist/pros
    # if dropout_keep_pb == 1, all elements in Tensor are kept, otherwise dropout is applied
    dropout_out = tf.nn.dropout(relu_out, dropout_keep_pb)
    logits = tf.matmul(dropout_out, weights_2) + biases_2
    if labels is not None:
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        return logits, loss
    # No labels, no loss. The previous `return logits, _` only worked because
    # IPython defines a global `_` (last cell output); elsewhere it is a NameError.
    return logits, None

def neural_network_model2_train_test(batch_size, hidden_nodes, betas=None, keep_training_prob=1, num_steps=3001, lr0=None):
    """Train a two-hidden-layer ReLU network and print its accuracy.

    Builds a fresh TensorFlow graph (input -> hidden ReLU layer 1 -> hidden
    ReLU layer 2 -> dropout -> output layer), trains it with minibatch
    gradient descent and prints the minibatch loss, minibatch accuracy and
    validation accuracy every 1000 steps, followed by the test accuracy once
    training ends.

    The data is not passed in: the function reads the module-level
    train_dataset, train_labels, valid_dataset, valid_labels, test_dataset and
    test_labels (as well as image_size, num_labels and accuracy) at call time.

    Args:
      batch_size: number of training examples fed per step.
      hidden_nodes: must be a list with two values, the sizes of the first and
        second hidden layers.
      betas: must be a list containing three values, one L2 multiplier for each
        of the three weight matrices (in layer order). None disables the
        regularization term.
      keep_training_prob: dropout keep probability fed during training steps;
        dropout is applied to the second hidden layer's activations. The
        validation and test predictions use the default keep probability of 1.
      num_steps: number of training steps to run.
      lr0: initial learning rate for exponential decay (multiplied by 0.95
        every 1000 steps, non-staircase). None selects a constant rate of 0.5.

    Returns:
      None. Results are only printed.
    """
    graph = tf.Graph()
    with graph.as_default():
      # Input data. For the training data, we use a placeholder that will be fed
      # at run time with a training minibatch.
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, image_size * image_size))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)

      # Dropout keep probability, supplied per run through feed_dict.
      keep_prob = tf.placeholder(tf.float32)

      # Variables.
      weights_1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, hidden_nodes[0]]))
      biases_1 = tf.Variable(tf.zeros([hidden_nodes[0]]))

      weights_2 = tf.Variable(
        tf.truncated_normal([hidden_nodes[0], hidden_nodes[1]]))
      biases_2 = tf.Variable(tf.zeros([hidden_nodes[1]]))

      weights_3 = tf.Variable(
        tf.truncated_normal([hidden_nodes[1], num_labels]))
      biases_3 = tf.Variable(tf.zeros([num_labels]))

      # Training computation.
      first_layer_out = graph_computation_layer1(tf_train_dataset, weights_1, biases_1)
      logits, loss = graph_computation_layer2(first_layer_out,
                                       weights_2, biases_2, weights_3, biases_3,
                                       labels=tf_train_labels, dropout_keep_pb=keep_prob)
      # regularization (if betas==None we have no regularization term)
      if betas is None:
        betas = [0, 0, 0]
      loss = loss + betas[0]*tf.nn.l2_loss(weights_1) + betas[1]*tf.nn.l2_loss(weights_2) + betas[2]*tf.nn.l2_loss(weights_3)

      # Optimizer.
      # Identity test instead of `== None`: `==` may be overloaded by the argument's type.
      if lr0 is None:
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
      else:
        global_step = tf.Variable(0)  # count the number of steps taken
        learning_rate = tf.train.exponential_decay(lr0, global_step, 1000, 0.95)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

      # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits)

      # Evaluation passes share the trained variables and use no labels, so the
      # second value returned by graph_computation_layer2 is discarded.
      partial_computation_v = graph_computation_layer1(tf_valid_dataset, weights_1, biases_1)
      valid_logits, _ = graph_computation_layer2(partial_computation_v,
                                          weights_2, biases_2, weights_3, biases_3)
      valid_prediction = tf.nn.softmax(valid_logits)

      partial_computation = graph_computation_layer1(tf_test_dataset, weights_1, biases_1)
      test_logits, _ = graph_computation_layer2(partial_computation,
                                          weights_2, biases_2, weights_3, biases_3)

      test_prediction = tf.nn.softmax(test_logits)


    # Let TensorFlow grow its GPU memory allocation on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(graph=graph, config=config) as session:
      tf.global_variables_initializer().run()
      print("Initialized")
      for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : keep_training_prob}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Report progress every 1000 steps; the printed loss includes the L2 terms.
        if (step % 1000 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy(
            valid_prediction.eval(), valid_labels))
      print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

In [32]:
# Two hidden layers (1024 and 32 units), per-layer L2 multipliers and a decaying learning rate
neural_network_model2_train_test(
    batch_size=128,
    hidden_nodes=[1024, 32],
    betas=[1e-02, 1e-03, 1e-04],
    keep_training_prob=1,
    num_steps=20001,
    lr0=0.03,
)

Initialized
Minibatch loss at step 0: 4168.687500
Minibatch accuracy: 10.2%
Validation accuracy: 17.6%
Minibatch loss at step 1000: 1744.224976
Minibatch accuracy: 18.0%
Validation accuracy: 17.3%
Minibatch loss at step 2000: 1005.779358
Minibatch accuracy: 23.4%
Validation accuracy: 28.7%
Minibatch loss at step 3000: 597.887512
Minibatch accuracy: 39.1%
Validation accuracy: 37.1%
Minibatch loss at step 4000: 366.133545
Minibatch accuracy: 45.3%
Validation accuracy: 42.4%
Minibatch loss at step 5000: 231.222717
Minibatch accuracy: 50.0%
Validation accuracy: 52.9%
Minibatch loss at step 6000: 150.562973
Minibatch accuracy: 67.2%
Validation accuracy: 64.2%
Minibatch loss at step 7000: 100.988342
Minibatch accuracy: 67.2%
Validation accuracy: 72.4%
Minibatch loss at step 8000: 69.890175
Minibatch accuracy: 71.9%
Validation accuracy: 70.8%
Minibatch loss at step 9000: 50.261005
Minibatch accuracy: 73.4%
Validation accuracy: 77.5%
Minibatch loss at step 10000: 37.232098
Minibatch accuracy: 

### Conclusion Problem 4

Need to test different values for neural_network_model2_train_test input arguments, and also for tf.train.exponential_decay

Deeper net --> more training steps, lower learning rate, smaller beta (regularization parameter)


tf.train.exponential_decay(lr0, global_step, 1000, 0.90, staircase=True)

hidden nodes layer 2: 512

beta=1e-04, keep_training_prob=0.5, num_steps=10000, lr0=0.075 --> Test acc: 29.3%

beta=1e-04, keep_training_prob=0.5, num_steps=10000, lr0=0.055 --> Test acc: 45.3%

beta=1e-04, keep_training_prob=1, num_steps=10000, lr0=0.055 --> Test acc: 60.1%

beta=1e-05, keep_training_prob=1, num_steps=10000, lr0=0.055 --> Test acc: 72.6%

beta=1e-05, keep_training_prob=1, num_steps=10000, lr0=0.025 --> Test acc: 75.0%

beta=1e-06, keep_training_prob=1, num_steps=10000, lr0=0.025 --> Test acc: 76.5%

beta=1e-06, keep_training_prob=1, num_steps=10000, lr0=0.01 --> Test acc: 75.5%

hidden nodes layer 2: 256

beta=1e-06, keep_training_prob=1, num_steps=10000, lr0=0.01 --> Test acc: 73.8%

beta=1e-07, keep_training_prob=1, num_steps=10000, lr0=0.01 --> Test acc: 74.0%

beta=1e-07, keep_training_prob=1, num_steps=10000, lr0=0.005 --> Test acc: 76.2%

beta=1e-07, keep_training_prob=1, num_steps=10000, lr0=0.001 --> Test acc: 84.0%

beta=1e-07, keep_training_prob=1, num_steps=20000, lr0=0.001 --> Test acc: 83.5%

beta=1e-07, keep_training_prob=1, num_steps=20000, lr0=0.0005 --> Test acc: 84.5%

beta=1e-06, keep_training_prob=1, num_steps=10000, lr0=0.001 --> Test acc: 83%

change exponential decay: 

tf.train.exponential_decay(lr0, global_step, 1000, 0.65)

beta=1e-06, keep_training_prob=1, num_steps=10000, lr0=0.001 --> Test acc: 84.7%

beta=1e-06, keep_training_prob=1, num_steps=10000, lr0=0.01 --> Test acc: 67%

change betas: multiply beta3 times 1e-02

beta=1e-02, keep_training_prob=1, num_steps=10000, lr0=0.001 --> Test acc: 85.1%

tf.train.exponential_decay(lr0, global_step, 1000, 0.95)

beta=1e-02, keep_training_prob=1, num_steps=10001, lr0=0.001 --> Test acc: 83.4%

beta=1e-02, keep_training_prob=1, num_steps=10001, lr0=0.01 --> Test acc: 82.8%

beta=1e-02, keep_training_prob=1, num_steps=15001, lr0=0.01 --> Test acc: 85.8%

beta=1e-02, keep_training_prob=1, num_steps=20001, lr0=0.01 --> Test acc: 87.6%

beta=1e-02, keep_training_prob=1, num_steps=20001, lr0=0.025 --> Test acc: 92.1%

https://discussions.udacity.com/t/assignment-3-problem-4/46128/4 ?

change: hidden_nodes provided as list, and betas also provided as a list

hidden_nodes=[1024, 512], betas=[1e-02, 1e-02, 1e-04], keep_training_prob=1, num_steps=20001, lr0=0.025: Test: 92.5%

hidden_nodes=[1024, 256], betas=[1e-02, 1e-02, 1e-04], keep_training_prob=1, num_steps=15001, lr0=0.03: Test: 92.1%

hidden_nodes=[1024, 128], betas=[1e-02, 1e-02, 1e-04], keep_training_prob=1, num_steps=15001, lr0=0.03: Test 92.3%

hidden_nodes=[1024, 64], betas=[1e-02, 1e-02, 1e-04], keep_training_prob=1, num_steps=18001, lr0=0.03; Test: 92.4%

hidden_nodes=[1024, 64], betas=[1e-01, 1e-02, 1e-03], keep_training_prob=1, num_steps=18001, lr0=0.03; Test: 91.3%

hidden_nodes=[1024, 64], betas=[1e-02, 1e-03, 1e-04], keep_training_prob=1, num_steps=18001, lr0=0.03; Test: 91.8%

hidden_nodes=[1024, 64], betas=[1e-03, 1e-04, 1e-05], keep_training_prob=1, num_steps=20001, lr0=0.03; Test: 67.8%

hidden_nodes=[1024, 64], betas=[1e-02, 1e-03, 1e-04], keep_training_prob=1, num_steps=20001, lr0=0.03; Test: 92.6%

hidden_nodes=[1024, 32], betas=[1e-02, 1e-03, 1e-04], keep_training_prob=1, num_steps=20001, lr0=0.03; Test: 92.7%