Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
imgsize = 28
n_labels = 10
n_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, imgsize, imgsize, n_channels)).astype(np.float32)
  labels = (np.arange(n_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [5]:
bsize = 16
psize = 5
depth = 16
n_hidden = 64

graph = tf.Graph()

with graph.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(bsize, imgsize, imgsize, n_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(bsize, n_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weight1 = tf.Variable(tf.truncated_normal(
                  [psize, psize, n_channels, depth], stddev=0.1))
  bias1 = tf.Variable(tf.zeros([depth]))
  
  weight2 = tf.Variable(tf.truncated_normal(
                  [psize, psize, depth, depth], stddev=0.1))
  bias2 = tf.Variable(tf.constant(1.0, shape=[depth]))
  
  weight3 = tf.Variable(tf.truncated_normal(
                  [imgsize // 4 * imgsize // 4 * depth, n_hidden], stddev=0.1))
  bias3 = tf.Variable(tf.constant(1.0, shape=[n_hidden]))
  
  weight4 = tf.Variable(tf.truncated_normal(
                  [n_hidden, n_labels], stddev=0.1))
  bias4 = tf.Variable(tf.constant(1.0, shape=[n_labels]))
  
  # Model.
  def model(data):
    conv = tf.nn.conv2d(data, weight1, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + bias1)
    
    conv = tf.nn.conv2d(hidden, weight2, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + bias2)
    
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    
    hidden = tf.nn.relu(tf.matmul(reshape, weight3) + bias3)
    
    return tf.matmul(hidden, weight4) + bias4
  
  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [6]:
n_steps = 1001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Graph initialized')
  
  for step in range(n_steps):
    offset = (step * bsize) % (train_labels.shape[0] - bsize)
    
    batch_data = train_dataset[offset:(offset + bsize), :, :, :]
    batch_labels = train_labels[offset:(offset + bsize), :]
    
    dic = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=dic)
    
    if (step % 100 == 0):
      print('Step %d' % (step))
      print('Minibatch loss at: %.3f' % (l))
      print('Minibatch accuracy: %.2f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.2f%%' % accuracy(valid_prediction.eval(), valid_labels))
      
  print('Test accuracy: %.2f%%' % accuracy(test_prediction.eval(), test_labels))

Graph initialized
Step 0
Minibatch loss at: 3.470
Minibatch accuracy: 6.25%
Validation accuracy: 10.00%
Step 100
Minibatch loss at: 1.059
Minibatch accuracy: 68.75%
Validation accuracy: 70.27%
Step 200
Minibatch loss at: 1.763
Minibatch accuracy: 68.75%
Validation accuracy: 68.93%
Step 300
Minibatch loss at: 0.893
Minibatch accuracy: 81.25%
Validation accuracy: 79.77%
Step 400
Minibatch loss at: 0.758
Minibatch accuracy: 75.00%
Validation accuracy: 80.46%
Step 500
Minibatch loss at: 1.014
Minibatch accuracy: 62.50%
Validation accuracy: 80.12%
Step 600
Minibatch loss at: 0.619
Minibatch accuracy: 81.25%
Validation accuracy: 81.41%
Step 700
Minibatch loss at: 0.897
Minibatch accuracy: 75.00%
Validation accuracy: 81.42%
Step 800
Minibatch loss at: 0.468
Minibatch accuracy: 81.25%
Validation accuracy: 83.20%
Step 900
Minibatch loss at: 0.376
Minibatch accuracy: 81.25%
Validation accuracy: 82.03%
Step 1000
Minibatch loss at: 0.657
Minibatch accuracy: 87.50%
Validation accuracy: 82.41%
Test 

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [65]:
bsize = 16 # batch size
psize = 5  # patch size
depth = 16
n_hidden = 64

# Variables.
def weight_variable(shape):
  init_val = tf.truncated_normal(shape, stddev=0.1)
  return tf.Variable(init_val)

def bias_variable(shape,val):
  init_val = tf.constant(val, shape=shape)
  return tf.Variable(init_val)

# Model.
def layer_model(x, w, b):
  conv = tf.nn.conv2d(x, w, [1,1,1,1], padding='SAME') 
  conv = tf.nn.relu(conv + b)
  pool = tf.nn.max_pool(conv, [1,2,2,1], [1,2,2,1], padding='SAME')
  return pool

def mlp_network(data):
  pool1 = layer_model(data,  weight1, bias1) #(16,7,7,16) 
  pool2 = layer_model(pool1, weight2, bias2) #(16,2,2,16)
  
  d1,d2,d3,d4 = pool2.get_shape().as_list()
  pool2rs = tf.reshape(pool2, [d1, d2*d3*d4]) #(16,64)
  layer3 = tf.nn.relu(tf.matmul(pool2rs, weight3) + bias3)
    
  #pool3 = layer_model(pool2, weight3, bias3)
  #pool4 = layer_model(pool3, weight4, bias4)

  return tf.matmul(layer3, weight4) + bias4

  

In [70]:
graph = tf.Graph()

with graph.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(bsize, imgsize, imgsize, n_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(bsize, n_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weight1 = weight_variable([psize, psize, n_channels, depth]) #(5,5,1,16)
  weight2 = weight_variable([psize, psize, depth, depth]) #(5,5,16,16)
  weight3 = weight_variable([(imgsize//4)* (imgsize//4)* depth, n_hidden]) #(784,64)
  weight4 = weight_variable([n_hidden, n_labels]) #(16,10)
  
  #bias1 = tf.Variable(tf.zeros([depth]))
  bias1 = bias_variable([depth], 0)      #(16,)
  bias2 = bias_variable([depth], 0.1)    #(16,)
  bias3 = bias_variable([n_hidden], 0.1) #(64,)
  bias4 = bias_variable([n_labels], 0.1) #(10,)
  
  # Training computation.
  logits = mlp_network(tf_train_dataset)
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits,tf_train_labels))
    
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [71]:
n_steps = 2001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Graph initialized')
  
  for step in range(n_steps):
    offset = (step * bsize) % (train_labels.shape[0] - bsize)
    
    batch_data = train_dataset[offset:(offset + bsize), :, :, :]
    batch_labels = train_labels[offset:(offset + bsize), :]
    
    dic = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=dic)
    
    if (step % 400 == 0):
      print('Step %d' % (step))
      print('Minibatch loss: %.3f' % (l))
      print('Minibatch accuracy: %.2f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.2f%%' % accuracy(valid_prediction.eval(), valid_labels))
      
  print('Test accuracy: %.2f%%' % accuracy(test_prediction.eval(), test_labels))

Graph initialized
Step 0
Minibatch loss: 2.271
Minibatch accuracy: 6.25%
Validation accuracy: 12.07%
Step 400
Minibatch loss: 0.541
Minibatch accuracy: 81.25%
Validation accuracy: 70.74%
Step 800
Minibatch loss: 0.486
Minibatch accuracy: 87.50%
Validation accuracy: 74.22%
Step 1200
Minibatch loss: 0.486
Minibatch accuracy: 81.25%
Validation accuracy: 76.59%
Step 1600
Minibatch loss: 0.248
Minibatch accuracy: 87.50%
Validation accuracy: 76.49%
Step 2000
Minibatch loss: 0.416
Minibatch accuracy: 87.50%
Validation accuracy: 75.27%
Test accuracy: 81.00%


---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

In [21]:
bsize = 16 # batch size
psize = 5  # patch size
depth = 16
n_hidden = 64
size3 = ((imgsize-psize + 1) // 2 - psize + 1) // 2  #4

# Variables.
def weight_variable(shape):
  init_val = tf.truncated_normal(shape, stddev=0.1)
  return tf.Variable(init_val)

def bias_variable(shape, val):
  if val==0:
    init_val = tf.zeros([depth])
  else:
    init_val = tf.constant(val, shape=shape)
  return tf.Variable(init_val)

# Model.
def layer_model(x, w, b):
  conv = tf.nn.conv2d(x, w, [1,1,1,1], padding='VALID') 
  conv = tf.nn.relu(conv + b)
  pool = tf.nn.avg_pool(conv, [1,2,2,1], [1,2,2,1], padding='VALID')
  return pool

def conv_net(data):
  pool1 = layer_model(data,  weight1, bias1) #(16,7,7,16) 
  pool2 = layer_model(pool1, weight2, bias2) #(16,2,2,16)
  
  d1,d2,d3,d4 = pool2.get_shape().as_list()
  pool2rs = tf.reshape(pool2, [d1, d2*d3*d4]) #(16,64)
  layer3 = tf.nn.relu(tf.matmul(pool2rs, weight3) + bias3)

  return tf.matmul(layer3, weight4) + bias4

  

In [22]:
graph = tf.Graph()

with graph.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(tf.float32, shape=(bsize, imgsize, imgsize, n_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(bsize, n_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  weight1 = weight_variable([psize, psize, n_channels, depth]) #(5,5,1,16)
  weight2 = weight_variable([psize, psize, depth, depth]) #(5,5,16,16)
  weight3 = weight_variable([size3* size3* depth, n_hidden]) #(64,64)
  weight4 = weight_variable([n_hidden, n_labels]) #(16,10)
  
  bias1 = bias_variable([depth], 0)     #(16,)
  bias2 = bias_variable([depth], 1.0)    #(16,)
  bias3 = bias_variable([n_hidden], 1.0) #(64,)
  bias4 = bias_variable([n_labels], 1.0) #(10,)
  
  # Training computation.
  logits = conv_net(tf_train_dataset)
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits,tf_train_labels))
    
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [27]:
n_steps = 100001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Graph initialized')
  
  for step in range(n_steps):
    offset = (step * bsize) % (train_labels.shape[0] - bsize)
    
    batch_data = train_dataset[offset:(offset + bsize), :, :, :]
    batch_labels = train_labels[offset:(offset + bsize), :]
    
    dic = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=dic)
    
    if (step % 400 == 0):
      print('Step %d' % (step))
      print('Minibatch loss: %.3f' % (l))
      print('Minibatch accuracy: %.2f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.2f%%' % accuracy(valid_prediction.eval(), valid_labels))
      
  print('Test accuracy: %.2f%%' % accuracy(test_prediction.eval(), test_labels))
  print('Done.')
  

Graph initialized
Step 0
Minibatch loss: 2.385
Minibatch accuracy: 0.00%
Validation accuracy: 10.00%
Step 400
Minibatch loss: 0.731
Minibatch accuracy: 75.00%
Validation accuracy: 76.71%
Step 800
Minibatch loss: 0.659
Minibatch accuracy: 81.25%
Validation accuracy: 81.49%
Step 1200
Minibatch loss: 0.588
Minibatch accuracy: 81.25%
Validation accuracy: 81.77%
Step 1600
Minibatch loss: 0.681
Minibatch accuracy: 87.50%
Validation accuracy: 83.20%
Step 2000
Minibatch loss: 0.610
Minibatch accuracy: 87.50%
Validation accuracy: 84.67%
Step 2400
Minibatch loss: 0.532
Minibatch accuracy: 81.25%
Validation accuracy: 84.51%
Step 2800
Minibatch loss: 0.462
Minibatch accuracy: 81.25%
Validation accuracy: 85.30%
Step 3200
Minibatch loss: 0.317
Minibatch accuracy: 87.50%
Validation accuracy: 85.84%
Step 3600
Minibatch loss: 0.789
Minibatch accuracy: 68.75%
Validation accuracy: 87.00%
Step 4000
Minibatch loss: 0.933
Minibatch accuracy: 81.25%
Validation accuracy: 86.96%
Step 4400
Minibatch loss: 0.473