Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in _notmist.ipynb_.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [5]:
tot_samples = 100000
g1 = tf.Graph()
with g1.as_default():
    tf_train_dataset = tf.constant(train_dataset[:tot_samples,:])
    tf_train_labels = tf.constant(train_labels[:tot_samples,:])
    tf_val_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    beta_regul = tf.placeholder(tf.float32)
    
    weights = tf.Variable(tf.truncated_normal([784,10]))
    biases = tf.Variable(tf.zeros([10]))
    
    logits = tf.matmul(tf_train_dataset,weights) + biases
    cost = tf.nn.softmax_cross_entropy_with_logits(logits,tf_train_labels)
    loss = tf.reduce_mean(cost)# + tf.nn.l2_loss(weights)   
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
       
    train_prediction = tf.nn.softmax(logits)    
    valid_prediction = tf.matmul(tf_val_dataset,weights) + biases
    test_prediction = tf.matmul(tf_test_dataset,weights) + biases    

In [36]:
iterations = 1000
with tf.Session(graph=g1) as sess:
    tf.initialize_all_variables().run()
    
    for i in range(iterations):
        o = sess.run(optimizer)
        
        if i%100 == 0:
            print( "train loss at step",i," = ",loss.eval())
            print( "training accuracy at step",i," = ",accuracy(train_prediction.eval(),train_labels[:tot_samples,:]))
            print( "val accuracy at step",i," = ",accuracy(valid_prediction.eval(),valid_labels))
    print( "test accuracy at step",i," = ",accuracy(test_prediction.eval(),test_labels))

train loss at step 0  =  15.3441
training accuracy at step 0  =  8.476
val accuracy at step 0  =  7.86
train loss at step 100  =  2.4227
training accuracy at step 100  =  71.715
val accuracy at step 100  =  71.24
train loss at step 200  =  2.02616
training accuracy at step 200  =  74.288
val accuracy at step 200  =  73.25
train loss at step 300  =  1.8128
training accuracy at step 300  =  75.117
val accuracy at step 300  =  74.33
train loss at step 400  =  1.66666
training accuracy at step 400  =  75.693
val accuracy at step 400  =  74.91
train loss at step 500  =  1.55548
training accuracy at step 500  =  76.131
val accuracy at step 500  =  75.32
train loss at step 600  =  1.46621
training accuracy at step 600  =  76.462
val accuracy at step 600  =  75.7
train loss at step 700  =  1.39225
training accuracy at step 700  =  76.771
val accuracy at step 700  =  75.86
train loss at step 800  =  1.32971
training accuracy at step 800  =  77.004
val accuracy at step 800  =  76.3
train loss at

In [29]:
g2 = tf.Graph()
batch_size = 1000
with g2.as_default():
    tf_train_dataset = tf.placeholder(tf.float32,shape =(batch_size,784) )
    tf_train_labels = tf.placeholder(tf.float32,shape = (batch_size,10))
    tf_val_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    beta_regul = tf.placeholder(tf.float32)
    
    weights = tf.Variable(tf.truncated_normal([784,10]))
    biases = tf.Variable(tf.zeros([10]))    
    
    logits = tf.matmul(tf_train_dataset,weights) + biases
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits,tf_train_labels)) + beta_regul*tf.nn.l2_loss(weights)
    
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_dataset,weights)+biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_dataset,weights)+biases)

In [54]:
iterations = 500
batch_size = 1000
with tf.Session(graph=g4) as sess:
    sess.run(tf.initialize_all_variables())
    print("initilized")
    for i in range(iterations):
        offset = (i*batch_size)% (train_labels.shape[0] - batch_size)
        
        batch_data = train_dataset[offset:(offset+batch_size),:]
        batch_labels = train_labels[offset:(offset+batch_size),:]        
        feed_d = {tf_train_dataset:batch_data, tf_train_labels:batch_labels,beta_regul:(1e-1)/(i+1)}
    #_, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        _, l = sess.run([optimizer, loss],feed_dict=feed_d)
        
        if i%100 == 0:
            print ("loss in iteration",i," = ",l)
            print ("train accuracy", accuracy(sess.run(train_prediction,feed_dict=feed_d),batch_labels))
            print ("valid accuracy", accuracy(valid_prediction.eval(),valid_labels))
    print ("test accuracy", accuracy(test_prediction.eval(),test_labels))

initilized
loss in iteration 0  =  31837.1
train accuracy 37.1
valid accuracy 33.92
loss in iteration 100  =  194.468
train accuracy 81.7
valid accuracy 80.48
loss in iteration 200  =  92.4018
train accuracy 84.7
valid accuracy 81.78
loss in iteration 300  =  59.0532
train accuracy 81.8
valid accuracy 79.14
loss in iteration 400  =  43.4074
train accuracy 85.9
valid accuracy 83.25
test accuracy 90.31


In [76]:
def feedforward(data,h1_l_w,h1_l_b,ol_w,ol_b):
    temp = tf.matmul(data,h1_l_w) + h1_l_b
    temp = tf.nn.relu(temp)
    temp = tf.matmul(temp,ol_w) + ol_b
    return temp


g3 = tf.Graph()
with g3.as_default():
    tf_train_dataset = tf.placeholder(tf.float32,shape = (batch_size,784))
    tf_train_labels = tf.placeholder(tf.float32,shape = (batch_size,10))
    tf_val_dataset = valid_dataset
    tf_test_dataset = test_dataset
    
    h1_layer_w = tf.Variable(tf.truncated_normal([784,1024]))
    h1_layer_b = tf.Variable(tf.zeros([1024]))
    output_layer_w = tf.Variable(tf.truncated_normal([1024,10]))
    output_layer_b = tf.Variable(tf.zeros([10]))
    
    ff = feedforward(tf_train_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b)
    train_prediction = tf.nn.softmax(ff)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(ff,tf_train_labels))
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    valid_prediction = tf.nn.softmax(feedforward(tf_val_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b))
    test_prediction = tf.nn.softmax(feedforward(tf_test_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b))

In [53]:
g4 = tf.Graph()
with g4.as_default():
    tf_train_dataset = tf.placeholder(tf.float32,shape = (batch_size,784))
    tf_train_labels = tf.placeholder(tf.float32,shape = (batch_size,10))
    tf_val_dataset = valid_dataset
    tf_test_dataset = test_dataset
    beta_regul = tf.placeholder(tf.float32)
    
    h1_layer_w = tf.Variable(tf.truncated_normal([784,1024]))
    h1_layer_b = tf.Variable(tf.zeros([1024]))
    output_layer_w = tf.Variable(tf.truncated_normal([1024,10]))
    output_layer_b = tf.Variable(tf.zeros([10]))
    
    ff = feedforward(tf_train_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b)
    train_prediction = tf.nn.softmax(ff)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(ff,tf_train_labels)) + beta_regul*(tf.nn.l2_loss(h1_layer_w) +
                                                                                                     tf.nn.l2_loss(output_layer_w))
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    valid_prediction = tf.nn.softmax(feedforward(tf_val_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b))
    test_prediction = tf.nn.softmax(feedforward(tf_test_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b))

In [42]:
1e-3

0.001

In [47]:
(1e-1)/400

0.00025

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [63]:
#run graph g4 before running this block

iterations = 500
batch_size = 10
with tf.Session(graph=g5) as sess:
    sess.run(tf.initialize_all_variables())
    print("initilized")
    for i in range(iterations):
        offset = (i*batch_size)% (train_labels.shape[0] - batch_size)
        
        batch_data = train_dataset[offset:(offset+batch_size),:]
        batch_labels = train_labels[offset:(offset+batch_size),:]        
        feed_d = {tf_train_dataset:batch_data, tf_train_labels:batch_labels,beta_regul:(1e-1)/(i+1)}
    #_, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        _, l = sess.run([optimizer, loss],feed_dict=feed_d)
        
        if i%100 == 0:
            print ("loss in iteration",i," = ",l)
            print ("train accuracy", accuracy(sess.run(train_prediction,feed_dict=feed_d),batch_labels))
            print ("valid accuracy", accuracy(valid_prediction.eval(),valid_labels))
    print ("test accuracy", accuracy(test_prediction.eval(),test_labels))

initilized
loss in iteration 0  =  31925.2
train accuracy 70.0
valid accuracy 18.56
loss in iteration 100  =  7.42408e+13
train accuracy 10.0
valid accuracy 20.92
loss in iteration 200  =  1.18485e+22
train accuracy 50.0
valid accuracy 17.93
loss in iteration 300  =  1.92519e+31
train accuracy 40.0
valid accuracy 16.9
loss in iteration 400  =  nan
train accuracy 10.0
valid accuracy 10.0
test accuracy 10.0


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [77]:
def feedforward_train(data,h1_l_w,h1_l_b,ol_w,ol_b):
    temp = tf.matmul(data,h1_l_w) + h1_l_b
    temp = tf.nn.relu(temp)
    temp = tf.nn.dropout(temp,0.5)
    temp = tf.matmul(temp,ol_w) + ol_b
    return temp


batch_size = 1000
g5 = tf.Graph()
with g5.as_default():
    tf_train_dataset = tf.placeholder(tf.float32,shape = (batch_size,784))
    tf_train_labels = tf.placeholder(tf.float32,shape = (batch_size,10))
    tf_val_dataset = valid_dataset
    tf_test_dataset = test_dataset
    beta_regul = tf.placeholder(tf.float32)
    
    h1_layer_w = tf.Variable(tf.truncated_normal([784,1024]))
    h1_layer_b = tf.Variable(tf.zeros([1024]))
    output_layer_w = tf.Variable(tf.truncated_normal([1024,10]))
    output_layer_b = tf.Variable(tf.zeros([10]))
    
    ff = feedforward_train(tf_train_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b)
    train_prediction = tf.nn.softmax(ff)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(ff,tf_train_labels)) + beta_regul*(tf.nn.l2_loss(h1_layer_w) +
                                                                                                     tf.nn.l2_loss(output_layer_w))
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    valid_prediction = tf.nn.softmax(feedforward(tf_val_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b))
    test_prediction = tf.nn.softmax(feedforward(tf_test_dataset,h1_layer_w,h1_layer_b,output_layer_w,output_layer_b))

In [78]:
iterations = 300

with tf.Session(graph=g5) as sess:
    sess.run(tf.initialize_all_variables())
    for i in range(iterations):
        start = (i*batch_size)% (train_labels.shape[0] - batch_size)
        end = start+batch_size
        train_subset = train_dataset[start:end,:]
        train_l_subset = train_labels[start:end,:]
        #beta_regul = 1e-3
        feed_d = {tf_train_dataset:train_subset,tf_train_labels:train_l_subset,beta_regul:0}
        _,l = sess.run([optimizer,loss],feed_dict=feed_d)
        
        if i%100 == 0:
            print ("loss in iteration",i," = ",l)
            print ("train accuracy", accuracy(sess.run(train_prediction,feed_dict=feed_d),train_l_subset))
            print ("valid accuracy", accuracy(valid_prediction.eval(),valid_labels))
    print ("test accuracy", accuracy(test_prediction.eval(),test_labels))

loss in iteration 0  =  553.477
train accuracy 45.3
valid accuracy 45.39
loss in iteration 100  =  40.099
train accuracy 76.0
valid accuracy 80.95
loss in iteration 200  =  18.5795
train accuracy 77.8
valid accuracy 83.27
test accuracy 90.28


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [None]:
   
    h1_layer_w = tf.Variable(tf.random_normal([784,600]))
    h1_layer_b = tf.Variable(tf.zeros([600]))
    h2_layer_w = tf.Variable(tf.random_normal([600,400]))
    h2_layer_b = tf.Variable(tf.zeros([400]))
    h3_layer_w = tf.Variable(tf.random_normal([400,100]))
    h3_layer_b = tf.Variable(tf.zeros([100]))
    output_layer_w = tf.Variable(tf.random_normal([100,10]))
    output_layer_b = tf.Variable(tf.zeros([10]))

In [23]:
def feedforward_train(data,h1_l_w,h1_l_b,h2_l_w,h2_l_b,h3_l_w,h3_l_b,ol_w,ol_b):
    temp = tf.nn.relu(tf.matmul(data,h1_l_w) + h1_l_b)
    temp = tf.nn.dropout(temp,0.5)
    temp = tf.nn.relu(tf.matmul(temp,h2_l_w) + h2_l_b)
    temp = tf.nn.dropout(temp,0.5)
    temp = tf.nn.relu(tf.matmul(temp,h3_l_w) + h3_l_b)
    temp = tf.nn.dropout(temp,0.5)
    temp = tf.matmul(temp,ol_w) + ol_b
    return temp

def feedforward(data,h1_l_w,h1_l_b,h2_l_w,h2_l_b,h3_l_w,h3_l_b,ol_w,ol_b):
    temp = tf.nn.relu(tf.matmul(data,h1_l_w) + h1_l_b)
    temp = tf.nn.relu(tf.matmul(temp,h2_l_w) + h2_l_b)
    temp = tf.nn.relu(tf.matmul(temp,h3_l_w) + h3_l_b)
    temp = tf.matmul(temp,ol_w) + ol_b
    return temp

batch_size = 200
g6 = tf.Graph()

with g6.as_default():
    tf_train_dataset = tf.placeholder(tf.float32,shape = (batch_size,784))
    tf_train_labels = tf.placeholder(tf.float32,shape = (batch_size,10))
    tf_val_dataset = valid_dataset
    tf_test_dataset = test_dataset
    beta_regul = tf.placeholder(tf.float32)
    param1 = tf.placeholder(tf.float32)
    
    h1_layer_w = tf.Variable(tf.truncated_normal([784,600],stddev=np.sqrt(2.0 / 784)))
    h1_layer_b = tf.Variable(tf.zeros([600]))
    h2_layer_w = tf.Variable(tf.truncated_normal([600,400],stddev=np.sqrt(2.0 / 600)))
    h2_layer_b = tf.Variable(tf.zeros([400]))
    h3_layer_w = tf.Variable(tf.truncated_normal([400,100],stddev=np.sqrt(2.0 / 400)))
    h3_layer_b = tf.Variable(tf.zeros([100]))
    output_layer_w = tf.Variable(tf.truncated_normal([100,10],stddev=np.sqrt(2.0 / 100)))
    output_layer_b = tf.Variable(tf.zeros([10]))
     
    
    ff = feedforward_train(tf_train_dataset,h1_layer_w,h1_layer_b,h2_layer_w,h2_layer_b,h3_layer_w,h3_layer_b,output_layer_w,output_layer_b)
    train_prediction = tf.nn.softmax(ff)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(ff,tf_train_labels)) + beta_regul*(tf.nn.l2_loss(h1_layer_w) +
                                                                                                     tf.nn.l2_loss(output_layer_w))
    optimizer = tf.train.GradientDescentOptimizer(0.5/param1).minimize(loss)
    #optimizer = tf.train.AdamOptimizer(0.5/param1).minimize(loss)

    valid_prediction = tf.nn.softmax(feedforward(tf_val_dataset,h1_layer_w,h1_layer_b,h2_layer_w,h2_layer_b,h3_layer_w,h3_layer_b,output_layer_w,output_layer_b))
    test_prediction = tf.nn.softmax(feedforward(tf_test_dataset,h1_layer_w,h1_layer_b,h2_layer_w,h2_layer_b,h3_layer_w,h3_layer_b,output_layer_w,output_layer_b))

In [24]:
iterations = 3000

with tf.Session(graph=g6) as sess:
    sess.run(tf.initialize_all_variables())
    for i in range(iterations):
        start = (i*batch_size)% (train_labels.shape[0] - batch_size)
        end = start+batch_size
        train_subset = train_dataset[start:end,:]
        train_l_subset = train_labels[start:end,:]
        #beta_regul = 1e-3
        feed_d = {tf_train_dataset:train_subset,tf_train_labels:train_l_subset,beta_regul:0,param1:1.0/(i+10)}
        _,l = sess.run([optimizer,loss],feed_dict=feed_d)
        
        if i%50 == 0:
            print ("loss in iteration",i," = ",l)
            print ("train accuracy", accuracy(sess.run(train_prediction,feed_dict=feed_d),train_l_subset))
            print ("valid accuracy", accuracy(valid_prediction.eval(),valid_labels))
    print ("test accuracy", accuracy(test_prediction.eval(),test_labels))

loss in iteration 0  =  2.7802
train accuracy 12.0
valid accuracy 10.0
loss in iteration 50  =  6.0187
train accuracy 10.0
valid accuracy 10.0
loss in iteration 100  =  34.0447
train accuracy 9.5
valid accuracy 10.0
loss in iteration 150  =  32.7403
train accuracy 14.5
valid accuracy 10.0
loss in iteration 200  =  45.0668
train accuracy 11.5
valid accuracy 10.0
loss in iteration 250  =  59.4793
train accuracy 16.0
valid accuracy 10.0
loss in iteration 300  =  62.3253
train accuracy 10.5
valid accuracy 10.0
loss in iteration 350  =  101.406
train accuracy 11.0
valid accuracy 10.0
loss in iteration 400  =  93.3206
train accuracy 13.5
valid accuracy 10.0
loss in iteration 450  =  120.289
train accuracy 12.5
valid accuracy 10.0
loss in iteration 500  =  179.108
train accuracy 12.0
valid accuracy 10.0
loss in iteration 550  =  117.143
train accuracy 8.0
valid accuracy 10.0
loss in iteration 600  =  168.517
train accuracy 11.0
valid accuracy 10.0
loss in iteration 650  =  177.821
train accur

In [180]:
def feedforward_train(data,   h1_l_w,h1_l_b,   h2_l_w,h2_l_b,    h3_l_w,h3_l_b,    ol_w,ol_b):
    layer_1_result = tf.nn.relu(tf.matmul(data,h1_l_w) + h1_l_b)
    layer_1_result = tf.nn.dropout(layer_1_result,0.5)
    layer_2_result = tf.nn.relu(tf.matmul(layer_1_result,h2_l_w) + h2_l_b)
    layer_2_result = tf.nn.dropout(layer_2_result,0.5)
    temp = tf.matmul(layer_2_result,ol_w) + ol_b
    return temp

def feedforward(data,   h1_l_w,h1_l_b,   h2_l_w,h2_l_b,    h3_l_w,h3_l_b,    ol_w,ol_b):
    layer_1_result = tf.nn.relu(tf.matmul(data,h1_l_w) + h1_l_b)
    layer_2_result = tf.nn.relu(tf.matmul(layer_1_result,h2_l_w) + h2_l_b)
    temp = tf.matmul(layer_2_result,ol_w) + ol_b
    return temp


batch_size = 2000
g7 = tf.Graph()
with g7.as_default():
    tf_train_dataset = tf.placeholder(tf.float32,shape = (batch_size,784))
    tf_train_labels = tf.placeholder(tf.float32,shape = (batch_size,10))
    tf_val_dataset = valid_dataset
    tf_test_dataset = test_dataset
    #beta_regul = tf.placeholder(tf.float32)
    
    h1_layer_w = tf.Variable(tf.truncated_normal([784,1024], stddev=np.sqrt(2.0 / 784)))
    h1_layer_b = tf.Variable(tf.zeros([1024]))
    h2_layer_w = tf.Variable(tf.truncated_normal([1024,100], stddev=np.sqrt(2.0 / 1024)))
    h2_layer_b = tf.Variable(tf.zeros([100]))
    h3_layer_w = 1#tf.Variable(tf.truncated_normal([400,200]))
    h3_layer_b = 1#tf.Variable(tf.zeros([200]))
    output_layer_w = tf.Variable(tf.truncated_normal([100,10], stddev=np.sqrt(2.0 / 100)))
    output_layer_b = tf.Variable(tf.zeros([10]))
    
    
    weights1 = tf.Variable(
    tf.truncated_normal(
        [image_size * image_size, 1024],
        stddev=np.sqrt(2.0 / (image_size * image_size)))
    )
    biases1 = tf.Variable(tf.zeros([1024]))
    weights2 = tf.Variable(
    tf.truncated_normal([1024, 100], stddev=np.sqrt(2.0 / num_hidden_nodes1)))
    biases2 = tf.Variable(tf.zeros([100]))
    weights3 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes2, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes2)))
    biases3 = tf.Variable(tf.zeros([num_labels]))

    
    logits = feedforward_train(tf_train_dataset,h1_layer_w,h1_layer_b,h2_layer_w,h2_layer_b,h3_layer_w,h3_layer_b,output_layer_w,output_layer_b)
    #logits = feedforward_train(tf_train_dataset,weights1,biases1,weights2,biases2,1,1,weights3,biases3)
    train_prediction = tf.nn.softmax(logits)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits,tf_train_labels))
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    
    valid_prediction = tf.nn.softmax(feedforward(tf_val_dataset,h1_layer_w,h1_layer_b,h2_layer_w,h2_layer_b,h3_layer_w,h3_layer_b,output_layer_w,output_layer_b))
    test_prediction = tf.nn.softmax(feedforward(tf_test_dataset,h1_layer_w,h1_layer_b,h2_layer_w,h2_layer_b,h3_layer_w,h3_layer_b,output_layer_w,output_layer_b))
    
    #valid_prediction = tf.nn.softmax(feedforward(tf_val_dataset,weights1,biases1,weights2,biases2,1,1,weights3,biases3))

    #test_prediction = tf.nn.softmax(feedforward(tf_test_dataset,weights1,biases1,weights2,biases2,1,1,weights3,biases3))

In [179]:
iterations = 300

with tf.Session(graph=g7) as sess:
    sess.run(tf.initialize_all_variables())
    for i in range(iterations):
        start = (i*batch_size)% (train_labels.shape[0] - batch_size)
        end = start+batch_size
        train_subset = train_dataset[start:end,:]
        train_l_subset = train_labels[start:end,:]
        #beta_regul = 1e-3
        feed_d = {tf_train_dataset:train_subset,tf_train_labels:train_l_subset}
        _,l = sess.run([optimizer,loss],feed_dict=feed_d)
        
        if i%100 == 0:
            print ("loss in iteration",i," = ",l)
            print ("train accuracy", accuracy(sess.run(train_prediction,feed_dict=feed_d),train_l_subset))
            print ("valid accuracy", accuracy(valid_prediction.eval(),valid_labels))
    print ("test accuracy", accuracy(test_prediction.eval(),test_labels))

loss in iteration 0  =  2.73074
train accuracy 28.65
valid accuracy 42.7
loss in iteration 100  =  0.63239
train accuracy 80.75
valid accuracy 84.35
loss in iteration 200  =  0.560704
train accuracy 83.25
valid accuracy 85.17
test accuracy 91.9


In [162]:
def feedforward_train(data,   h1_l_w,h1_l_b,   h2_l_w,h2_l_b,    h3_l_w,h3_l_b,    ol_w,ol_b):
    layer_1_result = tf.nn.relu(tf.matmul(data,h1_l_w) + h1_l_b)
    layer_1_result = tf.nn.dropout(layer_1_result,0.5)
    layer_2_result = tf.nn.relu(tf.matmul(layer_1_result,h2_l_w) + h2_l_b)
    layer_2_result = tf.nn.dropout(layer_2_result,0.5)
    temp = tf.matmul(layer_2_result,ol_w) + ol_b
    return temp

def feedforward(data,   h1_l_w,h1_l_b,   h2_l_w,h2_l_b,    h3_l_w,h3_l_b,    ol_w,ol_b):
    layer_1_result = tf.nn.relu(tf.matmul(data,h1_l_w) + h1_l_b)
    layer_2_result = tf.nn.relu(tf.matmul(layer_1_result,h2_l_w) + h2_l_b)
    temp = tf.matmul(layer_2_result,ol_w) + ol_b
    return temp

In [175]:
batch_size = 128
num_hidden_nodes1 = 1024
num_hidden_nodes2 = 100
beta_regul = 1e-3

g8 = tf.Graph()
with g8.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  global_step = tf.Variable(0)

  # Variables.
  weights1 = tf.Variable(
    tf.truncated_normal(
        [image_size * image_size, num_hidden_nodes1] ,stddev=np.sqrt(2.0 / (image_size * image_size))))
  biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]))
  weights2 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2], stddev=np.sqrt(2.0 / num_hidden_nodes1)))
  biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]))
  weights3 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes2, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes2)))
  biases3 = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  #lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
  #lay2_train = tf.nn.relu(tf.matmul(lay1_train, weights2) + biases2)
  #logits = tf.matmul(lay2_train, weights3) + biases3
  logits = feedforward_train(tf_train_dataset,weights1,biases1,weights2,biases2,1,1,weights3,biases3)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + \
      beta_regul * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(weights3))
  
  # Optimizer.
  learning_rate = tf.train.exponential_decay(0.5, global_step, 1000, 0.65, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  #lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
  #lay2_valid = tf.nn.relu(tf.matmul(lay1_valid, weights2) + biases2)
  valid_prediction = tf.nn.softmax(feedforward(tf_valid_dataset,weights1,biases1,weights2,biases2,1,1,weights3,biases3))
  #lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
  #lay2_test = tf.nn.relu(tf.matmul(lay1_test, weights2) + biases2)
  test_prediction = tf.nn.softmax(feedforward_train(tf_test_dataset,weights1,biases1,weights2,biases2,1,1,weights3,biases3))

In [177]:
iterations = 300

with tf.Session(graph=g8) as sess:
    sess.run(tf.initialize_all_variables())
    print ("initilized")
    for i in range(iterations):
        start = (i*batch_size)% (train_labels.shape[0] - batch_size)
        end = start+batch_size
        train_subset = train_dataset[start:end,:]
        train_l_subset = train_labels[start:end,:]
        #beta_regul = 1e-3
        feed_d = {tf_train_dataset:train_subset,tf_train_labels:train_l_subset}
        _,l = sess.run([optimizer,loss],feed_dict=feed_d)
        
        if i%100 == 0:
            print ("loss in iteration",i," = ",l)
            print ("train accuracy", accuracy(sess.run(train_prediction,feed_dict=feed_d),train_l_subset))
            print ("valid accuracy", accuracy(valid_prediction.eval(),valid_labels))
    print ("test accuracy", accuracy(test_prediction.eval(),test_labels))

initilized
loss in iteration 0  =  3.47102
train accuracy 35.9375
valid accuracy 27.51
loss in iteration 100  =  1.75462
train accuracy 75.78125
valid accuracy 81.74
loss in iteration 200  =  1.48833
train accuracy 84.375
valid accuracy 83.44
test accuracy 87.4


In [176]:
num_steps = 500

with tf.Session(graph=g8) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 100 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3.513977
Minibatch accuracy: 10.2%
Validation accuracy: 37.7%
Minibatch loss at step 100: 1.711253
Minibatch accuracy: 75.0%
Validation accuracy: 81.8%
Minibatch loss at step 200: 1.508261
Minibatch accuracy: 81.2%
Validation accuracy: 83.2%
Minibatch loss at step 300: 1.731425
Minibatch accuracy: 75.8%
Validation accuracy: 83.1%
Minibatch loss at step 400: 1.395843
Minibatch accuracy: 78.9%
Validation accuracy: 83.7%
Test accuracy: 89.1%


In [186]:
batch_size = 128
num_hidden_nodes1 = 1024
num_hidden_nodes2 = 100
beta_regul = 1e-3

graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, image_size * image_size))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  global_step = tf.Variable(0)

  # Variables.
  weights1 = tf.Variable(
    tf.truncated_normal(
        [image_size * image_size, num_hidden_nodes1],
        stddev=np.sqrt(2.0 / (image_size * image_size)))
    )
  biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]))
  weights2 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2], stddev=np.sqrt(2.0 / num_hidden_nodes1)))
  biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]))
  weights3 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes2, num_labels], stddev=np.sqrt(2.0 / num_hidden_nodes2)))
  biases3 = tf.Variable(tf.zeros([num_labels]))
  
  # Training computation.
  lay1_train = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
  lay2_train = tf.nn.relu(tf.matmul(lay1_train, weights2) + biases2)
  logits = tf.matmul(lay2_train, weights3) + biases3
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + \
      beta_regul * (tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(weights3))
  
  # Optimizer.
  learning_rate = tf.train.exponential_decay(0.5, global_step, 1000, 0.65, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  lay1_valid = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
  lay2_valid = tf.nn.relu(tf.matmul(lay1_valid, weights2) + biases2)
  valid_prediction = tf.nn.softmax(tf.matmul(lay2_valid, weights3) + biases3)
  lay1_test = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
  lay2_test = tf.nn.relu(tf.matmul(lay1_test, weights2) + biases2)
  test_prediction = tf.nn.softmax(tf.matmul(lay2_test, weights3) + biases3)

In [187]:
num_steps = 9001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 1000 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3.312616
Minibatch accuracy: 7.8%
Validation accuracy: 27.1%
Minibatch loss at step 1000: 0.916887
Minibatch accuracy: 81.2%
Validation accuracy: 87.2%
Minibatch loss at step 2000: 0.632886
Minibatch accuracy: 87.5%
Validation accuracy: 88.5%
Minibatch loss at step 3000: 0.561764
Minibatch accuracy: 89.1%
Validation accuracy: 89.4%
Minibatch loss at step 4000: 0.396751
Minibatch accuracy: 93.0%
Validation accuracy: 89.7%
Minibatch loss at step 5000: 0.468482
Minibatch accuracy: 89.8%
Validation accuracy: 90.0%
Minibatch loss at step 6000: 0.465835
Minibatch accuracy: 87.5%
Validation accuracy: 90.1%
Minibatch loss at step 7000: 0.441074
Minibatch accuracy: 88.3%
Validation accuracy: 90.6%
Minibatch loss at step 8000: 0.608504
Minibatch accuracy: 86.7%
Validation accuracy: 90.5%
Minibatch loss at step 9000: 0.354365
Minibatch accuracy: 93.8%
Validation accuracy: 90.5%
Test accuracy: 95.6%
