Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import tensorflow as tf
from six.moves import cPickle as pickle
import matplotlib
import matplotlib.pyplot as plt
import math
import nn

In [2]:
# Notebook-wide configuration constants.
BREAKS = 50    # divisor used to derive the metric-logging interval of the training loops
CHANNELS = 1   # NOTE(review): not referenced elsewhere in the visible notebook — presumably grayscale; confirm
LABELS = 10    # number of classes (columns of the one-hot label matrix)
SIZE = 28      # side length, in pixels, of one square input image

First reload the data we generated in `1_notmnist.ipynb`.

In [3]:
# Location of the pickled notMNIST bundle.
# NOTE(review): absolute, machine-specific path; pickle.load executes arbitrary code,
# so only point this at a file you generated yourself.
pickle_file = '/Users/desiredewaele/Google Drive/Datasets/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Keep only the two arrays we need, then drop the reference to the container dict.
data, labels = save['data'], save['labels']
del save

---
### Data Preprocessing
Reformat into a TensorFlow-friendly shape:
- all input should be numpy
- labels as float 1-hot encodings.

In [4]:
# Sanity check: shapes of the image and label arrays as loaded, before any reshaping.
print('Dataset', *[arr.shape for arr in (data, labels)])

Dataset (220000, 28, 28) (220000,)


In [5]:
# Flatten every image into a single feature row. Inferring the row length from the
# array itself (instead of SIZE * SIZE) keeps this cell correct even after SIZE is
# rebound to the flattened length later in the notebook, and makes a re-run a no-op.
data = data.reshape(data.shape[0], -1)

# One-hot encode the integer class labels as float32, as the section text asks for
# ("labels as float 1-hot encodings"); pd.get_dummies alone returns uint8/bool columns.
# The ndim guard lets this cell be re-run after the labels are already one-hot.
if labels.ndim == 1:
    labels = pd.get_dummies(labels).values.astype(np.float32)

In [6]:
# Sanity check: shapes after flattening the images and one-hot encoding the labels.
print('Dataset', *[arr.shape for arr in (data, labels)])

Dataset (220000, 784) (220000, 10)


In [7]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: first hold out 10,000 test rows, then hold out
# 10,000 validation rows from what is left; the remainder is the training set.
# The intermediate remainder gets its own names so `data` / `labels` are never
# overwritten — re-running this cell therefore always produces the same split
# instead of re-splitting an already reduced dataset.
restX, testX, restY, testY = train_test_split(data, labels, test_size=10000, random_state=100)
trainX, validX, trainY, validY = train_test_split(restX, restY, test_size=10000, random_state=100)

# The intermediate arrays are no longer needed; release their memory.
del restX, restY

In [8]:
# Report the feature / label shapes of each split produced above.
for title, features, targets in (
        ('Training set:', trainX, trainY),
        ('Validation set:', validX, validY),
        ('Testing set:', testX, testY)):
    print(title, features.shape, targets.shape)

Training set: (200000, 784) (200000, 10)
Validation set: (10000, 784) (10000, 10)
Testing set: (10000, 784) (10000, 10)


---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `tf.nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [None]:
batch_size = 128

# Problem dimensions are read from the split arrays (trainX / trainY) rather than from
# names this notebook never defines (image_size, num_labels), so the graph always
# matches the data it will be fed.
num_features = trainX.shape[1]
num_labels = trainY.shape[1]

graph = tf.Graph()
with graph.as_default():

    # Input data. The training minibatch and the L2 strength are placeholders fed at
    # run time; validation / test data are baked into the graph as float32 constants
    # (the cast guards against non-float label or pixel dtypes).
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(validX.astype(np.float32))
    tf_valid_labels = tf.constant(validY.astype(np.float32))
    tf_test_dataset = tf.constant(testX.astype(np.float32))
    beta_regul = tf.placeholder(tf.float32)

    # Variables: a single linear layer (multinomial logistic regression).
    w1 = tf.Variable(tf.truncated_normal([num_features, num_labels]))
    b1 = tf.Variable(tf.zeros([num_labels]))

    # Training computation. The same L2 penalty on the weights is added to both the
    # training and the validation loss so the two curves are directly comparable.
    tLogits = tf.matmul(tf_train_dataset, w1) + b1
    vLogits = tf.matmul(tf_valid_dataset, w1) + b1
    l2_penalty = beta_regul * tf.nn.l2_loss(w1)
    lossTrain = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=tLogits, labels=tf_train_labels)) + l2_penalty
    lossValid = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=vLogits, labels=tf_valid_labels)) + l2_penalty

    # Optimizer: only the training loss is minimized.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(lossTrain)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(tLogits)
    valid_prediction = tf.nn.softmax(vLogits)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, w1) + b1)

In [None]:
num_steps = 3000
# Log metrics about BREAKS times per run; max() keeps the interval from becoming zero
# (which would raise ZeroDivisionError) when BREAKS exceeds num_steps.
log_every = max(1, num_steps // BREAKS)


def batch_accuracy(predicted, onehot):
    """Return the percentage of rows whose arg-max class equals the one-hot label's class."""
    return 100.0 * np.mean(np.argmax(predicted, 1) == np.argmax(onehot, 1))


with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    # Per-checkpoint histories: train loss, valid loss, train accuracy, valid accuracy.
    ltList, lvList, atList, avList = [], [], [], []
    for step in range(num_steps):
        # Walk through the training set one minibatch at a time, wrapping around.
        offset = (step * batch_size) % (trainY.shape[0] - batch_size)
        batch_data = trainX[offset:(offset + batch_size), :]
        batch_labels = trainY[offset:(offset + batch_size), :]
        feed = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, beta_regul: 1e-3}
        session.run(optimizer, feed)
        if step % log_every == 0:
            # Training metrics are measured on the minibatch that was just used.
            lt, lv, batch_predictions = session.run([lossTrain, lossValid, train_prediction], feed)
            ltList.append(lt)
            lvList.append(lv)
            atList.append(batch_accuracy(batch_predictions, batch_labels))
            avList.append(batch_accuracy(valid_prediction.eval(), validY))
            print('.', end="")
    print('\nTest accuracy: %.1f%%' % batch_accuracy(test_prediction.eval(), testY))

In [None]:
# Hand the recorded histories (train accuracy, valid accuracy, train loss, valid loss)
# to the viewer helper.
# NOTE(review): `viewer` is not defined in the visible part of this notebook —
# presumably a plotting helper; confirm where it comes from.
viewer(atList, avList, ltList, lvList)

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [None]:
batch_size = 128
num_hidden_nodes = 1024

# Problem dimensions are read from the split arrays (trainX / trainY) rather than from
# names this notebook never defines (image_size, num_labels).
num_features = trainX.shape[1]
num_labels = trainY.shape[1]

graph = tf.Graph()
with graph.as_default():

    # Input data. The training minibatch is a placeholder fed at run time; validation
    # and test data are baked into the graph as float32 constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(validX.astype(np.float32))
    tf_valid_labels = tf.constant(validY.astype(np.float32))
    tf_test_dataset = tf.constant(testX.astype(np.float32))
    # Kept because the training cell feeds it; it is deliberately absent from the
    # losses below (this graph has no regularization).
    beta_regul = tf.placeholder(tf.float32)

    # Variables: one hidden layer followed by the output layer.
    w1 = tf.Variable(tf.truncated_normal([num_features, num_hidden_nodes]))
    w2 = tf.Variable(tf.truncated_normal([num_hidden_nodes, num_labels]))
    b1 = tf.Variable(tf.zeros([num_hidden_nodes]))
    b2 = tf.Variable(tf.zeros([num_labels]))

    # Model.
    def model(data):
        """Return the output logits of the one-hidden-layer ReLU network for `data`."""
        hidden = tf.nn.relu(tf.matmul(data, w1) + b1)
        return tf.matmul(hidden, w2) + b2

    # Training computation (plain cross-entropy, no penalty term).
    tLogits = model(tf_train_dataset)
    vLogits = model(tf_valid_dataset)
    lossTrain = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tLogits, labels=tf_train_labels))
    lossValid = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=vLogits, labels=tf_valid_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(lossTrain)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(tLogits)
    valid_prediction = tf.nn.softmax(vLogits)
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [None]:
num_steps = 101
num_batches = 3
# Log metrics about BREAKS times per run; max() keeps the interval from becoming zero.
log_every = max(1, num_steps // BREAKS)


def batch_accuracy(predicted, onehot):
    """Return the percentage of rows whose arg-max class equals the one-hot label's class."""
    return 100.0 * np.mean(np.argmax(predicted, 1) == np.argmax(onehot, 1))


with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    # Per-checkpoint histories: train loss, valid loss, train accuracy, valid accuracy.
    ltList, lvList, atList, avList = [], [], [], []
    for step in range(num_steps):
        # Overfitting demo: cycle through only the first `num_batches` minibatches
        # instead of the whole training set.
        offset = (step % num_batches) * batch_size
        batch_data = trainX[offset:(offset + batch_size), :]
        batch_labels = trainY[offset:(offset + batch_size), :]
        feed = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, beta_regul: 1e-3}
        session.run(optimizer, feed)
        if step % log_every == 0:
            # Training metrics are measured on the minibatch that was just used.
            lt, lv, batch_predictions = session.run([lossTrain, lossValid, train_prediction], feed_dict=feed)
            ltList.append(lt)
            lvList.append(lv)
            atList.append(batch_accuracy(batch_predictions, batch_labels))
            avList.append(batch_accuracy(valid_prediction.eval(), validY))
            print('.', end="")
    print('\nTest accuracy: %.1f%%' % batch_accuracy(test_prediction.eval(), testY))

In [None]:
# Hand the recorded histories (train accuracy, valid accuracy, train loss, valid loss)
# to the viewer helper.
# NOTE(review): `viewer` is not defined in the visible part of this notebook —
# presumably a plotting helper; confirm where it comes from.
viewer(atList, avList, ltList, lvList)

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `tf.nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [None]:
batch_size = 128
num_hidden_nodes = 1024

# Problem dimensions are read from the split arrays (trainX / trainY) rather than from
# names this notebook never defines (image_size, num_labels).
num_features = trainX.shape[1]
num_labels = trainY.shape[1]

graph = tf.Graph()
with graph.as_default():

    # Input data. The training minibatch is a placeholder fed at run time; validation
    # and test data are baked into the graph as float32 constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(validX.astype(np.float32))
    tf_valid_labels = tf.constant(validY.astype(np.float32))
    tf_test_dataset = tf.constant(testX.astype(np.float32))
    # Kept because the training cell feeds it; it is not used by the losses below.
    beta_regul = tf.placeholder(tf.float32)

    # Variables: one hidden layer followed by the output layer.
    w1 = tf.Variable(tf.truncated_normal([num_features, num_hidden_nodes]))
    w2 = tf.Variable(tf.truncated_normal([num_hidden_nodes, num_labels]))
    b1 = tf.Variable(tf.zeros([num_hidden_nodes]))
    b2 = tf.Variable(tf.zeros([num_labels]))

    # Model.
    def model(data, isTraining):
        """Return output logits for `data`.

        `isTraining` is a plain Python bool evaluated while the graph is built:
        when true, dropout (second argument 0.5) is applied to the hidden
        activations; otherwise the hidden layer is used as-is, so evaluation
        outputs stay deterministic.
        """
        hidden = tf.nn.relu(tf.matmul(data, w1) + b1)
        if isTraining:
            hidden = tf.nn.dropout(hidden, 0.5)
        return tf.matmul(hidden, w2) + b2

    # Training computation: dropout only on the training path.
    tLogits = model(tf_train_dataset, isTraining=True)
    vLogits = model(tf_valid_dataset, isTraining=False)
    lossTrain = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tLogits, labels=tf_train_labels))
    lossValid = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=vLogits, labels=tf_valid_labels))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(lossTrain)

    # Predictions for the training, validation, and test data.
    # NOTE: train_prediction is derived from the dropout path, so it is stochastic.
    train_prediction = tf.nn.softmax(tLogits)
    valid_prediction = tf.nn.softmax(vLogits)
    test_prediction = tf.nn.softmax(model(tf_test_dataset, isTraining=False))

In [None]:
num_steps = 101
num_batches = 3
# Log metrics about BREAKS times per run; max() keeps the interval from becoming zero.
log_every = max(1, num_steps // BREAKS)


def batch_accuracy(predicted, onehot):
    """Return the percentage of rows whose arg-max class equals the one-hot label's class."""
    return 100.0 * np.mean(np.argmax(predicted, 1) == np.argmax(onehot, 1))


with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    # Per-checkpoint histories: train loss, valid loss, train accuracy, valid accuracy.
    ltList, lvList, atList, avList = [], [], [], []
    for step in range(num_steps):
        # Same extreme-overfitting setup as before: cycle through only the first
        # `num_batches` minibatches instead of the whole training set.
        offset = (step % num_batches) * batch_size
        batch_data = trainX[offset:(offset + batch_size), :]
        batch_labels = trainY[offset:(offset + batch_size), :]
        feed = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, beta_regul: 1e-3}
        session.run(optimizer, feed)
        if step % log_every == 0:
            # Training metrics are measured on the minibatch that was just used.
            lt, lv, batch_predictions = session.run([lossTrain, lossValid, train_prediction], feed)
            ltList.append(lt)
            lvList.append(lv)
            atList.append(batch_accuracy(batch_predictions, batch_labels))
            avList.append(batch_accuracy(valid_prediction.eval(), validY))
            print('.', end="")
    print('\nTest accuracy: %.1f%%' % batch_accuracy(test_prediction.eval(), testY))

In [None]:
# Hand the recorded histories (train accuracy, valid accuracy, train loss, valid loss)
# to the viewer helper.
# NOTE(review): `viewer` is not defined in the visible part of this notebook —
# presumably a plotting helper; confirm where it comes from.
viewer(atList, avList, ltList, lvList)

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

In [None]:
# Shapes of the training features and labels. The arrays produced by the split are
# named trainX / trainY; train_dataset / train_labels are never defined in this notebook.
print(trainX.shape)
print(trainY.shape)

In [10]:
# Hyper-parameters for the multi-layer model.
# NOTE: this rebinds two names set near the top of the notebook — SIZE now holds the
# flattened input length (28 * 28) instead of the image side length, and BREAKS drops
# from 50 to 5.

# Training schedule.
STEPS = 18000   # number of optimizer steps
BATCH = 128     # rows per minibatch
BREAKS = 5      # divisor used to derive the metric-logging interval

# Network layout: input -> three hidden layers -> output.
SIZE = 28 * 28
HIDDEN1 = 1024
HIDDEN2 = 256
HIDDEN3 = 128
LABELS = 10




In [13]:
# NOTE(review): keep_prob is not referenced by the graph below (no dropout layer is
# built); kept so the name stays available to any other cell that may read it.
keep_prob = 0.5

graph = tf.Graph()
with graph.as_default():

    # Input data: any number of rows, fed at run time for training and evaluation alike.
    tfDataX = tf.placeholder(tf.float32, shape=(None, SIZE))
    tfDataY = tf.placeholder(tf.float32, shape=(None, LABELS))

    def relu_weights(fan_in, fan_out):
        """Create a [fan_in, fan_out] weight variable with stddev sqrt(2 / fan_in).

        The scale uses each layer's own input width; previously every layer used
        SIZE, which is only the input width of the first layer.
        """
        return tf.Variable(tf.truncated_normal([fan_in, fan_out], stddev=np.sqrt(2.0 / fan_in)))

    # Variables.
    w1 = relu_weights(SIZE, HIDDEN1)
    w2 = relu_weights(HIDDEN1, HIDDEN2)
    w3 = relu_weights(HIDDEN2, HIDDEN3)
    w4 = relu_weights(HIDDEN3, LABELS)
    b1 = tf.Variable(tf.zeros([HIDDEN1]))
    b2 = tf.Variable(tf.zeros([HIDDEN2]))
    b3 = tf.Variable(tf.zeros([HIDDEN3]))
    b4 = tf.Variable(tf.zeros([LABELS]))

    # Model.
    def model(x):
        """Return output logits of the three-hidden-layer ReLU network for input `x`."""
        x = tf.nn.relu(tf.matmul(x, w1) + b1)
        x = tf.nn.relu(tf.matmul(x, w2) + b2)
        x = tf.nn.relu(tf.matmul(x, w3) + b3)
        return tf.matmul(x, w4) + b4

    # Training computation.
    logits = model(tfDataX)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tfDataY))

    # The step counter must be handed to minimize() so it is incremented on every
    # update; with an anonymous, never-incremented counter the rate stays at 0.5 forever.
    global_step = tf.Variable(0, trainable=False)
    rate = tf.train.exponential_decay(0.5, global_step, 4000, 0.65, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(rate).minimize(loss, global_step=global_step)

    # Predictions and accuracy (in percent). Reuse `logits` instead of calling
    # model() again, which would add duplicate copies of the network ops to the graph.
    predictions = {"classes": tf.argmax(logits, axis=1), "probabilities": tf.nn.softmax(logits)}
    is_correct = tf.equal(predictions["classes"], tf.argmax(tfDataY, axis=1))
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32)) * 100

In [16]:
# Log metrics about BREAKS times per run; max() keeps the interval from becoming zero
# (which would raise ZeroDivisionError) when BREAKS exceeds STEPS.
log_every = max(1, STEPS // BREAKS)

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    # One (train acc, valid acc, train loss, valid loss) tuple per logged checkpoint.
    history = []
    for step in range(STEPS):
        # Walk through the training set one minibatch at a time, wrapping around.
        offset = (step * BATCH) % (trainY.shape[0] - BATCH)
        batchX = trainX[offset:(offset + BATCH), :]
        batchY = trainY[offset:(offset + BATCH), :]
        batch_feed = {tfDataX: batchX, tfDataY: batchY}
        # Train on the minibatch. Feeding the complete training set here made every
        # step a full-batch pass, which is why the run never got past its first checkpoint.
        session.run(optimizer, batch_feed)
        if step % log_every == 0:
            # Training metrics are measured on the current minibatch, which keeps the
            # checkpoint cheap; validation metrics use the full validation set.
            lt, at = session.run([loss, accuracy], batch_feed)
            lv, av = session.run([loss, accuracy], {tfDataX: validX, tfDataY: validY})
            history.append((at, av, lt, lv))
            print(".")
    # Store the evaluated outputs under a new name: rebinding `predictions` would
    # replace the graph's tensor dict with numpy arrays and break a re-run of this cell.
    validPredictions = session.run(predictions, {tfDataX: validX})
    # Test-set evaluation is currently disabled; uncomment to report it.
    #testAccuracy = session.run(accuracy, {tfDataX: testX, tfDataY: testY})
    #print('\nTest accuracy: %.2f%%' % testAccuracy)

# Expose the history as the four parallel lists read by the viewer cell that follows.
atList, avList, ltList, lvList = (
    (list(column) for column in zip(*history)) if history else ([], [], [], []))

.


KeyboardInterrupt: 

In [None]:
# Hand the metric histories (train accuracy, valid accuracy, train loss, valid loss)
# to the viewer helper; all four lists must already exist in the kernel.
# NOTE(review): `viewer` is not defined in the visible part of this notebook —
# presumably a plotting helper; confirm where it comes from.
viewer(atList, avList, ltList, lvList)