## PART A: 2,3 - RMSProp and Adam

This notebook was primarily used for creating and testing of variations of Gradient Descent Learning Rule - RMSProp[1] and Adam[2].

As noted in previous experiments, large number of deep layers results in large disturbances in convergence.


[1] T. Tieleman and G. E. Hinton. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent
magnitude. COURSERA: Neural Networks for Machine Learning, 4(2), 2012. URL https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf.

[2] Diederik P. Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In ICML, 2015. URL
https://arxiv.org/abs/1412.6980.


### Setup

In [2]:
%load_ext autoreload
%autoreload 2
from __future__ import print_function

In [4]:
from mlp.learning_rules import *
%aimport mlp.learning_rules # enable autoreloading

In [5]:
import numpy as np
import logging
from mlp.data_providers import MNISTDataProvider, EMNISTDataProvider

# Seed a random number generator
seed = 10102016
rng = np.random.RandomState(seed)
batch_size = 100
# Set up a logger object to print info about the training run to stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [logging.StreamHandler()]

# Create data provider objects for the MNIST data set
train_data = EMNISTDataProvider('train', batch_size=batch_size, rng=rng)
valid_data = EMNISTDataProvider('valid', batch_size=batch_size, rng=rng)
test_data = EMNISTDataProvider('test', batch_size=batch_size, rng=rng)

['inputs', 'targets']
['inputs', 'targets']
['inputs', 'targets']


In [12]:
from mlp.layers import AffineLayer, SoftmaxLayer, ELULayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import ConstantInit, GlorotUniformInit
from mlp.learning_rules import GradientDescentLearningRule
from mlp.optimisers import Optimiser

#setup hyperparameters
learning_rate = 0.1
num_epochs = 50 # TODO: CHANGED HERE FOR TESTING ONLY!
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100

In [7]:
# Create a single function to test a simple MLP with RELULayer:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

def train_model_and_plot_stats(model,
                               error,
                               learning_rule,
                               train_data,
                               valid_data,
                               test_data,
                               num_epochs,
                               stats_interval,
                               notebook=True):

    # As well as monitoring the error over training also monitor classification
    # accuracy i.e. proportion of most-probable predicted classes being equal to targets
    data_monitors = {'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}

    # Use the created objects to initialise a new Optimiser instance.
    optimiser = Optimiser(
        model,
        error,
        learning_rule,
        train_data,
        valid_data,
        test_data,
        data_monitors,
        notebook=notebook)

    # Run the optimiser for 5 epochs (full passes through the training set)
    # printing statistics every epoch.
    stats, keys, run_time = optimiser.train(
        num_epochs=num_epochs, stats_interval=stats_interval)

    # Plot the change in the validation and training set error over training.
    fig_1 = plt.figure(figsize=(8, 4))
    ax_1 = fig_1.add_subplot(111)
    for k in ['error(train)', 'error(valid)']:
        ax_1.plot(
            np.arange(1, stats.shape[0]) * stats_interval,
            stats[1:, keys[k]],
            label=k)
    ax_1.legend(loc=0)
    ax_1.set_xlabel('Epoch number')

    # Plot the change in the validation and training set accuracy over training.
    fig_2 = plt.figure(figsize=(8, 4))
    ax_2 = fig_2.add_subplot(111)
    for k in ['acc(train)', 'acc(valid)']:
        ax_2.plot(
            np.arange(1, stats.shape[0]) * stats_interval,
            stats[1:, keys[k]],
            label=k)
    ax_2.legend(loc=0)
    ax_2.set_xlabel('Epoch number')

    return stats, keys, run_time, fig_1, ax_1, fig_2, ax_2

### Baseline

Run an experiment using SGD learning rule for deep hidden layer (say 10)

In [13]:
expt = {} # store all the experiments
func = ELULayer()
i = 9 # 10 layers

In [None]:
train_data.reset()
test_data.reset()
valid_data.reset()

# Initialise the weights and biases:
weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

input_layer = [
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init)
]
output_layer = [
    func,
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
]
each_hidden_layer = [
    func,
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init)
]

# create the MLP:
model = MultipleLayerModel(input_layer + each_hidden_layer * i +
                           output_layer)
print(model, '{} layers'.format(i + 1))

error = CrossEntropySoftmaxError()
learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)

expt['SGD'] = train_model_and_plot_stats(
    model,
    error,
    learning_rule,
    train_data,
    valid_data,
    test_data,
    num_epochs,
    stats_interval,
    notebook=False)

MultiLayerModel(
    AffineLayer(input_dim=784, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=100)
    ELULayer
    AffineLayer(input_dim=100, output_dim=47)
) 10 layers


Epoch 1: 13.8s to complete
    error(train)=8.58e-01, acc(train)=7.30e-01, error(valid)=8.85e-01, acc(valid)=7.24e-01, error(test)=9.09e-01, acc(test)=7.13e-01
Epoch 2: 14.5s to complete
    error(train)=7.65e-01, acc(train)=7.46e-01, error(valid)=8.04e-01, acc(valid)=7.32e-01, error(test)=8.29e-01, acc(test)=7.30e-01
Epoch 3: 13.7s to complete
    error(train)=6.49e-01, acc(train)=7.80e-01, error(valid)=7.06e-01, acc(valid)=7.69e-01, error(test)=7.32e-01, acc(test)=7.56e-01
Epoch 4: 13.7s to complete
    error(train)=6.20e-01, acc(train)=7.90e-01, error(valid)=6.94e-01, acc(valid)=7.71e-01, error(test)=7.15e-01, acc(test)=7.66e-01
Epoch 5: 13.6s to complete
    error(train)=5.47e-01, acc(train)=8.18e-01, error(valid)=6.24e-01, acc(valid)=7.97e-01, error(test)=6.50e-01, acc(test)=7.90e-01
Epoch 6: 13.5s to complete
    error(train)=5.63e-01, acc(train)=8.08e-01, error(valid)=6.43e-01, acc(valid)=7.88e-01, error(test)=6.72e-01, acc(test)=7.81e-01
Epoch 7: 13.5s to complete
    error(tra

### Momentum SGD

Repeat the experiment with momentum SGD:

In [None]:
train_data.reset()
test_data.reset()
valid_data.reset()

learning_rule = MomentumLearningRule(learning_rate=0.02, mom_coeff=.9)

expt['MomentumSGD'] = train_model_and_plot_stats(
    model,
    error,
    learning_rule,
    train_data,
    valid_data,
    test_data,
    num_epochs,
    stats_interval,
    notebook=False)

### RMSProp

In [None]:
train_data.reset()
test_data.reset()
valid_data.reset()

learning_rule = RMSPropLearningRule(learning_rate=1e-3, beta=0.9, epsilon=1e-8)

expt['RMSProp'] = train_model_and_plot_stats(
    model,
    error,
    learning_rule,
    train_data,
    valid_data,
    test_data,
    num_epochs,
    stats_interval,
    notebook=False)

### Adam

In [None]:
train_data.reset()
test_data.reset()
valid_data.reset()

learning_rule = AdamLearningRule(learning_rate=1e-3, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8)

expt['adam'] = train_model_and_plot_stats(
    model,
    error,
    learning_rule,
    train_data,
    valid_data,
    test_data,
    num_epochs,
    stats_interval,
    notebook=False)