Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is make the neural network convolutional.

In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
from datetime import datetime as dt

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

In [3]:
# ==============================================================================
# First reload the data we generated in 1_notmnist.ipynb.
# ==============================================================================

data_root = '../../input/notMNIST/' # Change me to store data elsewhere
pickle_file = data_root + 'notMNIST.pickle'

label_dic = {0:'A', 1:'B', 2:'C', 3:'D', 4:'E', 5:'F', 6:'G', 7:'H', 8:'I', 9:'J'}

print('pickle file location: %s' % pickle_file)

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset_rw = save['train_dataset']
    train_labels_rw = save['train_labels']
    valid_dataset_rw = save['valid_dataset']
    valid_labels_rw = save['valid_labels']
    test_dataset_rw = save['test_dataset']
    test_labels_rw = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset_rw.shape, train_labels_rw.shape)
    print('Validation set', valid_dataset_rw.shape, valid_labels_rw.shape)
    print('Test set', test_dataset_rw.shape, test_labels_rw.shape)

pickle file location: ../../input/notMNIST/notMNIST.pickle
Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [4]:

# ==============================================================================
# Reformat into a shape that's more adapted to the models we're going to train:
#
#   @ data as a flat matrix,
#   @ labels as float 1-hot encodings.
# ==============================================================================

train_dataset = {}
valid_dataset = {}
test_dataset = {}

image_size = 28
num_labels = 10
num_channels = 1  # grayscale
layers_info_text = ''


def reformat(dataset, labels, name):
    dataset = dataset.reshape((-1, image_size, image_size, num_channels)).astype(np.float32)

    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    print(name, 'dataset shape :', dataset.shape, labels.shape)
    return dataset, labels


train_dataset["Input_X"], train_dataset["Labels"] = reformat(train_dataset_rw, train_labels_rw, 'train')
valid_dataset["Input_X"], valid_dataset["Labels"] = reformat(valid_dataset_rw, valid_labels_rw, 'valid')
test_dataset["Input_X"], test_dataset["Labels"] = reformat(test_dataset_rw, test_labels_rw, 'test')

# print('Validation set', valid_dataset["Input_X"].shape, test_dataset["Labels"].shape)
# print('Test set', test_dataset["Input_X"].shape, test_dataset["Labels"].shape)


train dataset shape : (200000, 28, 28, 1) (200000, 10)
valid dataset shape : (10000, 28, 28, 1) (10000, 10)
test dataset shape : (10000, 28, 28, 1) (10000, 10)


In [5]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

In [6]:
def show_result(predictions, labels):
    print([label_dic[key] for key in np.argmax(predictions, 1)])
    print([label_dic[key] for key in np.argmax(labels, 1)])

In [7]:
# Define a fully connected layer
def fc_layer(input_data, channels_in, channels_out, act_fun=None, dropout_kp=None,
             layer_name='Full_Connection_Layer', logs=None):
    with tf.name_scope(layer_name):
        # It is not a good idea to set initial value as zero
        # It will cause problem during the learning activity
        # w = tf.Variable(tf.zeros([channels_in, channels_out]))

        # These are the parameters that we are going to be training. The weight
        # matrix will be initialized using random values following a (truncated)
        # normal distribution.

        with tf.variable_scope(layer_name):
            # weights = tf.Variable(tf.truncated_normal([channels_in, channels_out], seed=1),
            #                       name='W')
            weights = tf.get_variable(name='Weights', shape=[channels_in, channels_out],
                                      initializer=tf.truncated_normal_initializer(stddev=0.1, seed=1))
            # The biases get initialized to zero.
            # biases = tf.Variable(tf.zeros([channels_out]), name='B')
            biases = tf.get_variable(name='Biases', shape=[channels_out],
                                     initializer=tf.zeros_initializer())
            if logs == 'Y':
                tf.summary.histogram("Weights", weights)
                tf.summary.histogram("Biases", biases)

        fc_conn = tf.matmul(input_data, weights) + biases
        print("full connection Layer")

        if act_fun == 'relu':
            fc_conn = tf.nn.relu(fc_conn, name='Relu')
            print("act fun: ", act_fun)

        result = fc_conn

        if (dropout_kp > 0) and (dropout_kp < 1):
            result = tf.nn.dropout(result, keep_prob=dropout_kp, name='Dropout_Act')
            print("dropout_kp: ", dropout_kp)

        print("output: ", channels_out)
        # tf.summary.scalar('output_sum', tf.reduce_sum(result))

    return result, channels_out


In [8]:
# Define a Convolutional layer
def conv2d_layer(input_x, filter, filter_num, strides, padding, act_fun=None, dropout_kp=None,
                 layer_name='Conv_Layer', logs=None, reshape_to=None):
    with tf.name_scope(layer_name):
        with tf.variable_scope(layer_name):
            weights = tf.get_variable(name='Weights', shape=filter + [filter_num],
                                      initializer=tf.truncated_normal_initializer(stddev=0.1, seed=1))
            biases = tf.get_variable(name='Biases', shape=[filter_num],
                                     initializer=tf.zeros_initializer())

            if logs == 'Y':
                tf.summary.histogram("Weights", weights)
                tf.summary.histogram("Biases", biases)

        conv_conn = tf.nn.conv2d(input_x, weights, strides=strides, padding=padding)

        if act_fun == 'relu':
            act = tf.nn.relu(conv_conn + biases, name='Relu')
            print('act fun: ', act_fun)
        else:
            act = conv_conn + biases

        result = act
        shape = result.get_shape().as_list()
        channels_out = shape[1:]

        # TODO: Log the image for hidden layer
        if logs == 'Y':
            with tf.name_scope('To_Image'):
                # tensorboard logging for first 3 images
                # image shape: [-1, image_size, image_size, num_channels]
                split0, split1, split2, _ = tf.split(result, [1 , 1, 1, shape[3]-3], axis=3)
                tf.summary.image('conv_out_1', split0, 3)
                tf.summary.image('conv_out_2', split1, 3)
                tf.summary.image('conv_out_3', split2, 3)

        if (dropout_kp > 0) and (dropout_kp < 1):
            result = tf.nn.dropout(result, keep_prob=dropout_kp, name='Dropout_Act')
            print("dropout_kp: ", dropout_kp)

        if reshape_to == 'FC':
            print("output: ", channels_out)
            print("reshape to")
            shape = result.get_shape().as_list()
            # need to reshape to match the datashape from CNN to FC.
            result = tf.reshape(result, [-1, shape[1] * shape[2] * shape[3]])
            channels_out = shape[1] * shape[2] * shape[3]

        print("output: ", channels_out)
        # tf.summary.scalar('output_sum', tf.reduce_sum(result))

    return result, channels_out


In [9]:
# Define a Convolutional layer
def pool_layer(input_x, pool_type, pool_patch, strides, padding, layer_name='pool_layer', act_fun=None,
               dropout_kp=None, logs=None, reshape_to=None):
    with tf.name_scope(layer_name):
        if pool_type == 'MAX':
            conv_conn = tf.nn.max_pool(input_x, ksize=pool_patch, strides=strides, padding=padding)
        elif pool_type == 'AVG':
            conv_conn = tf.nn.avg_pool(input_x, ksize=pool_patch, strides=strides, padding=padding)
        else:
            conv_conn = input_x

        act = conv_conn

        if act_fun == 'relu':
            act = tf.nn.relu(act, name='Relu')
            print('act fun: ', act_fun)

        result = act
        shape = result.get_shape().as_list()
        channels_out = shape[1:]

        # TODO: Log the image for hidden layer
        if logs == 'Y':
            with tf.name_scope('To_Image'):
                # tensorboard logging for first 3 images
                # image shape: [-1, image_size, image_size, num_channels]
                split0, split1, split2, _ = tf.split(result, [1 , 1, 1, shape[3]-3], axis=3)
                tf.summary.image('conv_out_1', split0, 3)
                tf.summary.image('conv_out_2', split1, 3)
                tf.summary.image('conv_out_3', split2, 3)

        if (dropout_kp > 0) and (dropout_kp < 1):
            result = tf.nn.dropout(result, keep_prob=dropout_kp, name='Dropout_Act')
            print("dropout_kp: ", dropout_kp)

        if reshape_to == 'FC':
            print("output: ", channels_out)
            print("reshape to")
            shape = result.get_shape().as_list()
            # need to reshape to match the datashape from CNN to FC.
            result = tf.reshape(result, [-1, shape[1] * shape[2] * shape[3]])
            channels_out = shape[1] * shape[2] * shape[3]

        print("output: ", channels_out)
        # tf.summary.scalar('output_sum', tf.reduce_sum(result))

    return result, channels_out


Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [10]:
# build the network graph
def build_neural_network_graph(input_data, label, hidden_layers, loss, train):
    global layers_info_text

    """
        Implements a multilayer neural network with different hidden sizes
        It also adds the dropout and learning rate regularization
        techniques in the computational graph.
    """
    graph = tf.Graph()

    with graph.as_default():

        # Input data. For the training data, we use a placeholder that will be fed
        # at run time with a training minibatch.
        with tf.name_scope('Input_X'):
            tf_train_dataset = tf.placeholder(tf.float32,
                                              shape=(None,) + input_data['node_size'],
                                              name=input_data['name'])

        # label data. For the training data, we use a placeholder that will be fed
        # at run time with a training minibatch.
        with tf.name_scope('Labels_y'):
            tf_train_labels = tf.placeholder(tf.float32,
                                             shape=(None, label['node_size']),
                                             name=label['name'])

        tf_beta_l2_regu = tf.constant(loss['beta_l2_regu'], name='beta_l2_regu')

        with tf.name_scope('Input_X/To_Image'):
            # tensorboard logging for first 3 images
            # image shape: [-1, image_size, image_size, num_channels]
            tf.summary.image('input', tf_train_dataset, 3)

        # Model
        prev_size = input_data['node_size']
        print("Input: ", prev_size)
        tf_data_layer = tf_train_dataset
        layers_info_text = "%d_Layers" % (len(hidden_layers) + 1)

        for index, layer in enumerate(hidden_layers):
            layer['prev_size'] = prev_size
            print("Build layer:", layer['name'])
            if layer['layer_type'] == 'FC':
                tf_data_layer, prev_size = fc_layer(tf_data_layer, prev_size, layer['node_size'],
                                                    layer_name=layer['name'],
                                                    act_fun=layer['act_fun'],
                                                    dropout_kp=layer['dropout_kp'],
                                                    logs=layer['logs'])
            elif layer['layer_type'] == 'CNN':
                tf_data_layer, prev_size = conv2d_layer(tf_data_layer, filter=layer['filter'],
                                                        filter_num=layer['filter_num'],
                                                        strides=layer['strides'], padding=layer['padding'],
                                                        layer_name=layer['name'], act_fun=layer['act_fun'],
                                                        dropout_kp=layer['dropout_kp'], logs=layer['logs'],
                                                        reshape_to=layer['reshape_to'])
            elif layer['layer_type'] == 'POOL':
                tf_data_layer, prev_size = pool_layer(tf_data_layer, pool_type=layer['pool_type'],
                                                      pool_patch=layer['pool_patch'],
                                                      strides=layer['strides'], padding=layer['padding'],
                                                      layer_name=layer['name'], act_fun=layer['act_fun'],
                                                      dropout_kp=layer['dropout_kp'], logs=layer['logs'],
                                                      reshape_to=layer['reshape_to'])
            else:
                print("Error: Unknown Type [%s] of Layer No %d" % (layer['type'], index))

        logits = tf_data_layer

        # Loss
        with tf.name_scope('loss_function'):
            l2_loss = 0
            for index, layer in enumerate(hidden_layers):
                if layer['layer_type'] == 'FC' and layer['L2_regularization'] == 'Y':
                    with tf.variable_scope(layer['name'], reuse=True):
                        weights_for_l2_regu = tf.get_variable("Weights", [layer['prev_size'], layer['node_size']])
                    l2_loss += tf.nn.l2_loss(weights_for_l2_regu)

                if layer['layer_type'] == 'CNN' and layer['L2_regularization'] == 'Y':
                    # Cnn: No need to add weight for regularization
                    print("CNN: no need to add weight for regularization")

            softmax_cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_train_labels, logits=logits,
                                                                               name='cross_entropy')

            loss = tf.add(tf.reduce_mean(softmax_cross_entropy, name='loss'), tf_beta_l2_regu * l2_loss,
                          name='total_loss')

            tf.summary.scalar('total_loss', loss)
            tf.summary.scalar('l2_loss', l2_loss)

        # Optimizer.
        with tf.name_scope('Optimizer'):
            global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
            learning_rate = tf.train.exponential_decay(train['learning_rate'],
                                                       global_step,
                                                       train['learning_rate_decay_step'],
                                                       train['learning_rate_decay_rate'], staircase=True)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
            # tf.summary.scalar('global_step', global_step)
            tf.summary.scalar('learning_rate', learning_rate)

        # Accuracy
        with tf.name_scope('Accuracy'):
            correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(tf_train_labels, 1))
            accuracy_res = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            tf.summary.scalar('Accuracy_Result', accuracy_res)

        info = {
            "GRAPH": graph,
            "TF_TRAIN_DATASET": tf_train_dataset,
            "TF_TRAIN_LABELS": tf_train_labels,
            "LOSS": loss,
            # Optimizer.
            "OPTIMIZER": optimizer,
            # Predictions for the training, validation, and test data.
            "PREDICTION": tf.nn.softmax(logits),
            # "VALID": tf.nn.softmax(valid_logits),
            # "TEST": tf.nn.softmax(test_logits)
        }
    return info


In [11]:
def train_model(model_info, train_dataset, valid_dataset, test_dataset, batch_size, train_steps,
                log_steps, tb_log_steps):
    """
        Initializes and runs the tensor's graph
    """
    with tf.Session(graph=model_info['GRAPH']) as session:

        # Initialize all the variables
        # session.run(tf.global_variables_initializer())
        tf.global_variables_initializer().run()
        print("Initialized")

        # Make the tensorboard log writer
        session_log_dir = "logs/4_1/" + layers_info_text + "/" + dt.today().strftime('%m%d_%H%M')
        writer = tf.summary.FileWriter(session_log_dir)
        print("Logging Directory : %s" % session_log_dir)

        writer.add_graph(session.graph)

        # Merge all the tf summary
        merged_summary = tf.summary.merge_all()

        # Data Set
        # Minibatch will be built in loop

        tf_train_dataset = model_info['TF_TRAIN_DATASET']
        tf_train_labels = model_info['TF_TRAIN_LABELS']

        valid_feed_dict = {tf_train_dataset: valid_dataset["Input_X"], tf_train_labels: valid_dataset["Labels"]}

        test_feed_dict = {tf_train_dataset: test_dataset["Input_X"], tf_train_labels: test_dataset["Labels"]}

        for step in range(train_steps):

            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (train_dataset["Input_X"].shape[0] - batch_size)

            # Generate a minibatch.
            batch_data = train_dataset["Input_X"][offset:(offset + batch_size), :]
            batch_labels = train_dataset["Labels"][offset:(offset + batch_size), :]

            # print("batch_data shape :", batch_data.shape)
            # print("batch_labels shape : ", batch_labels.shape)

            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.

            train_feed_dict = {tf_train_dataset: batch_data,
                               tf_train_labels: batch_labels}

            if step % tb_log_steps == 0:
                s = session.run(merged_summary, feed_dict=train_feed_dict)
                writer.add_summary(s, step)

            targets = [model_info["OPTIMIZER"], model_info["LOSS"], model_info["PREDICTION"]]
            _, l, train_prediction = session.run(targets, feed_dict=train_feed_dict)

            if step % log_steps == 0:
                # Predictions for the validation, and test data.
                valid_prediction = session.run(model_info["PREDICTION"], feed_dict=valid_feed_dict)
                print("Minibatch loss at step %d: %f" % (step, l))
                # print(show_result(train_prediction, batch_labels))
                print("Minibatch accuracy: %.1f%%" % accuracy(train_prediction, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(valid_prediction, valid_dataset["Labels"]))

        test_prediction = session.run(model_info["PREDICTION"], feed_dict=test_feed_dict)
        print("Test accuracy: %.1f%%" % accuracy(test_prediction, test_dataset["Labels"]))

        writer.close()
        session.close()

In [12]:
input_x = {
    'name': 'input',
    'layer_type': 'input',
    'node_size': (image_size, image_size, num_channels)
}

label_y = {
    'name': 'label',
    'layer_type': 'label',
    'node_size': num_labels
}

# build the deep learning network
hidden_1 = {
    'name': '1_Conv',
    'layer_type': 'CNN',
    'filter_num': 16,
    'filter': [5, 5, 1],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_2 = {
    'name': '2_Max_Pool',
    'layer_type': 'POOL',
    'pool_type': 'MAX',
    'pool_patch': [1, 5, 5, 1],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_3 = {
    'name': '3_Conv',
    'layer_type': 'CNN',
    'filter_num': 16,
    'filter': [5, 5, 16],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': 'FC'
}

hidden_4 = {
    'name': '4_Max_Pool',
    'layer_type': 'POOL',
    'pool_type': 'MAX',
    'pool_patch': [1, 5, 5, 1],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': 'FC'
}

hidden_5 = {
    'name': '5_FC_hidden',
    'layer_type': 'FC',
    'node_size': 64,
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'N'
}

output_y = {
    'name': '6_FC_output',
    'layer_type': 'FC',
    'node_size': num_labels,
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'N'
}

layer_design = [
    hidden_1,
    hidden_3,
    hidden_5,
    output_y
]

loss_params = {
    'beta_l2_regu': 0.001
}

train_params = {
    'learning_rate': 0.05,
    'learning_rate_decay_step': 10000,
    'learning_rate_decay_rate': 1
}

model_information = build_neural_network_graph(input_data=input_x, label=output_y, hidden_layers=layer_design,
                                               loss=loss_params, train=train_params)

# run the deep learning network
batch_size = 16
num_steps = 1001
log_steps = 50
tb_log_steps = 5

train_model(model_info=model_information, train_dataset=train_dataset, valid_dataset=valid_dataset,
            test_dataset=test_dataset, batch_size=batch_size, train_steps=num_steps, log_steps=log_steps,
            tb_log_steps=tb_log_steps)

Input:  (28, 28, 1)
Build layer: 1_Conv
output:  [14, 14, 16]
Build layer: 3_Conv
output:  [7, 7, 16]
reshape to
output:  784
Build layer: 5_FC_hidden
full connection Layer
act fun:  relu
output:  64
Build layer: 6_FC_output
full connection Layer
output:  10
Initialized
Logging Directory : logs/4_1/5_Layers/1103_1556
Minibatch loss at step 0: 2.378084
Minibatch accuracy: 0.0%
Validation accuracy: 15.1%
Minibatch loss at step 50: 1.008829
Minibatch accuracy: 68.8%
Validation accuracy: 73.7%
Minibatch loss at step 100: 0.649673
Minibatch accuracy: 81.2%
Validation accuracy: 77.2%
Minibatch loss at step 150: 0.291959
Minibatch accuracy: 87.5%
Validation accuracy: 77.3%
Minibatch loss at step 200: 0.826853
Minibatch accuracy: 75.0%
Validation accuracy: 79.3%
Minibatch loss at step 250: 1.055810
Minibatch accuracy: 68.8%
Validation accuracy: 79.0%
Minibatch loss at step 300: 0.350152
Minibatch accuracy: 93.8%
Validation accuracy: 79.9%
Minibatch loss at step 350: 0.485494
Minibatch accuracy

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [14]:
input_x = {
    'name': 'input',
    'layer_type': 'input',
    'node_size': (image_size, image_size, num_channels)
}

label_y = {
    'name': 'label',
    'layer_type': 'label',
    'node_size': num_labels
}

# build the deep learning network
hidden_1 = {
    'name': '1_Conv',
    'layer_type': 'CNN',
    'filter_num': 16,
    'filter': [5, 5, 1],
    'strides': [1, 1, 1, 1],
    'padding': 'SAME',
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_2 = {
    'name': '2_Max_Pool',
    'layer_type': 'POOL',
    'pool_type': 'MAX',
    'pool_patch': [1, 5, 5, 1],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_3 = {
    'name': '3_Conv',
    'layer_type': 'CNN',
    'filter_num': 16,
    'filter': [5, 5, 16],
    'strides': [1, 1, 1, 1],
    'padding': 'SAME',
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_4 = {
    'name': '4_Max_Pool',
    'layer_type': 'POOL',
    'pool_type': 'MAX',
    'pool_patch': [1, 5, 5, 1],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': 'FC'
}

hidden_5 = {
    'name': '5_FC_hidden',
    'layer_type': 'FC',
    'node_size': 64,
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'N'
}

output_y = {
    'name': '6_FC_output',
    'layer_type': 'FC',
    'node_size': num_labels,
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'N'
}

layer_design = [
    hidden_1,
    hidden_2,
    hidden_3,
    hidden_4,
    hidden_5,
    output_y
]

loss_params = {
    'beta_l2_regu': 0.001
}

train_params = {
    'learning_rate': 0.05,
    'learning_rate_decay_step': 10000,
    'learning_rate_decay_rate': 1
}

model_information = build_neural_network_graph(input_data=input_x, label=output_y, hidden_layers=layer_design,
                                               loss=loss_params, train=train_params)

# run the deep learning network
batch_size = 16
num_steps = 1001
log_steps = 50
tb_log_steps = 5

train_model(model_info=model_information, train_dataset=train_dataset, valid_dataset=valid_dataset,
            test_dataset=test_dataset, batch_size=batch_size, train_steps=num_steps, log_steps=log_steps,
            tb_log_steps=tb_log_steps)

Input:  (28, 28, 1)
Build layer: 1_Conv
output:  [28, 28, 16]
Build layer: 2_Max_Pool
act fun:  relu
output:  [14, 14, 16]
Build layer: 3_Conv
output:  [14, 14, 16]
Build layer: 4_Max_Pool
act fun:  relu
output:  [7, 7, 16]
reshape to
output:  784
Build layer: 5_FC_hidden
full connection Layer
act fun:  relu
output:  64
Build layer: 6_FC_output
full connection Layer
output:  10
Initialized
Logging Directory : logs/4_1/7_Layers/1103_1607
Minibatch loss at step 0: 2.576654
Minibatch accuracy: 12.5%
Validation accuracy: 10.4%
Minibatch loss at step 50: 1.326884
Minibatch accuracy: 50.0%
Validation accuracy: 48.6%
Minibatch loss at step 100: 0.782306
Minibatch accuracy: 81.2%
Validation accuracy: 69.8%
Minibatch loss at step 150: 0.485366
Minibatch accuracy: 81.2%
Validation accuracy: 79.3%
Minibatch loss at step 200: 0.877959
Minibatch accuracy: 75.0%
Validation accuracy: 80.9%
Minibatch loss at step 250: 1.109884
Minibatch accuracy: 56.2%
Validation accuracy: 80.9%
Minibatch loss at step

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

![title](leNET_5.jpg)

In [20]:
input_x = {
    'name': 'input',
    'layer_type': 'input',
    'node_size': (image_size, image_size, num_channels)
}

label_y = {
    'name': 'label',
    'layer_type': 'label',
    'node_size': num_labels
}

# build the deep learning network
hidden_1 = {
    'name': '1_Conv',
    'layer_type': 'CNN',
    'filter_num': 6,
    'filter': [8, 8, 1],
    'strides': [1, 1, 1, 1],
    'padding': 'SAME',
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_2 = {
    'name': '2_Max_Pool',
    'layer_type': 'POOL',
    'pool_type': 'MAX',
    'pool_patch': [1, 3, 3, 1],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_3 = {
    'name': '3_Conv',
    'layer_type': 'CNN',
    'filter_num': 16,
    'filter': [9, 9, 6],
    'strides': [1, 1, 1, 1],
    'padding': 'SAME',
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': None
}

hidden_4 = {
    'name': '4_Max_Pool',
    'layer_type': 'POOL',
    'pool_type': 'MAX',
    'pool_patch': [1, 3, 3, 1],
    'strides': [1, 2, 2, 1],
    'padding': 'SAME',
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'Y',
    'reshape_to': 'FC'
}

hidden_5 = {
    'name': '5_FC_hidden',
    'layer_type': 'FC',
    'node_size': 64,
    'act_fun': 'relu',
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'N'
}

output_y = {
    'name': '6_FC_output',
    'layer_type': 'FC',
    'node_size': num_labels,
    'act_fun': None,
    'dropout_kp': 1,
    'L2_regularization': 'N',
    'logs': 'N'
}

layer_design = [
    hidden_1,
    hidden_2,
    hidden_3,
    hidden_4,
    hidden_5,
    output_y
]

loss_params = {
    'beta_l2_regu': 0.001
}

train_params = {
    'learning_rate': 0.05,
    'learning_rate_decay_step': 300,
    'learning_rate_decay_rate': 0.5
}

model_information = build_neural_network_graph(input_data=input_x, label=output_y, hidden_layers=layer_design,
                                               loss=loss_params, train=train_params)

# run the deep learning network
batch_size = 16
num_steps = 1001
log_steps = 50
tb_log_steps = 5

train_model(model_info=model_information, train_dataset=train_dataset, valid_dataset=valid_dataset,
            test_dataset=test_dataset, batch_size=batch_size, train_steps=num_steps, log_steps=log_steps,
            tb_log_steps=tb_log_steps)

Input:  (28, 28, 1)
Build layer: 1_Conv
output:  [28, 28, 6]
Build layer: 2_Max_Pool
act fun:  relu
output:  [14, 14, 6]
Build layer: 3_Conv
output:  [14, 14, 16]
Build layer: 4_Max_Pool
act fun:  relu
output:  [7, 7, 16]
reshape to
output:  784
Build layer: 5_FC_hidden
full connection Layer
act fun:  relu
output:  64
Build layer: 6_FC_output
full connection Layer
output:  10
Initialized
Logging Directory : logs/4_1/7_Layers/1103_2042
Minibatch loss at step 0: 2.673327
Minibatch accuracy: 6.2%
Validation accuracy: 13.5%
Minibatch loss at step 50: 1.004596
Minibatch accuracy: 62.5%
Validation accuracy: 68.1%
Minibatch loss at step 100: 0.702034
Minibatch accuracy: 81.2%
Validation accuracy: 76.1%
Minibatch loss at step 150: 0.726387
Minibatch accuracy: 87.5%
Validation accuracy: 76.7%
Minibatch loss at step 200: 0.831116
Minibatch accuracy: 75.0%
Validation accuracy: 78.5%
Minibatch loss at step 250: 1.123239
Minibatch accuracy: 62.5%
Validation accuracy: 79.1%
Minibatch loss at step 30

# CASE 1
- batch_size = 16
- num_steps = 1001

|1_Conv  |2_Pool   |3_Conv     | 4_Pool  | 5_FC     | 6_FC    |Decay Step |Decay Rate | Train Steps | T Accu    | Test Acc   |
|--------|---------|-----------|---------|----------|---------|-----------|-----------|-------------|-----------|------------|
|5,5,1,6 | 1,3,3,1 | 6,6,6,16  | 1,3,3,1 | 784 - 64 | 64 - 10 | 10000     | 0.9       | 1001        | 87%       | 90.6%      |
|1,1,1,1 | 1,2,2,1 | 1,1,1,1   | 1,2,2,1 | -        | -       | -         | -         | -           | -         | -          |
|--------|---------|-----------|---------|----------|---------|-----------|-----------|-------------|-----------|------------|
|2,2,1,6 | 1,3,3,1 | 3,3,6,16  | 1,3,3,1 | 784 - 64 | 64 - 10 | 10000     | 0.9       | 1001        | 85%       | 89.0%      |
|1,1,1,1 | 1,2,2,1 | 1,1,1,1   | 1,2,2,1 | -        | -       | -         | -         | -           | -         | -          |
|--------|---------|-----------|---------|----------|---------|-----------|-----------|-------------|-----------|------------|
|8,8,1,6 | 1,3,3,1 | 9,9,6,16  | 1,3,3,1 | 784 - 64 | 64 - 10 | 10000     | 0.9       | 1001        | 87%       | 90.7%      |
|1,1,1,1 | 1,2,2,1 | 1,1,1,1   | 1,2,2,1 | -        | -       | -         | -         | -           | -         | -          |
|--------|---------|-----------|---------|----------|---------|-----------|-----------|-------------|-----------|------------|
|8,8,1,6 | 1,3,3,1 | 9,9,6,16  | 1,3,3,1 | 784 - 64 | 64 - 10 | 200       | 0.9       | 1001        | 88%       | 91.0%      |
|1,1,1,1 | 1,2,2,1 | 1,1,1,1   | 1,2,2,1 | -        | -       | -         | -         | -           | -         | -          |