## TRAIN AND VALIDATION

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

def train_model_and_plot_stats(
        model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, fig_name, notebook=True):
    
    # As well as monitoring the error over training also monitor classification
    # accuracy i.e. proportion of most-probable predicted classes being equal to targets
    data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}

    # Use the created objects to initialise a new Optimiser instance.
    optimiser = Optimiser(
        model, error, learning_rule, train_data, valid_data, None, data_monitors, notebook=notebook)

    # Run the optimiser for 5 epochs (full passes through the training set)
    # printing statistics every epoch.
    stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)

    # Plot the change in the validation and training set error over training.
    fig_1 = plt.figure(figsize=(8, 4))
    ax_1 = fig_1.add_subplot(111)
    for k in ['error(train)', 'error(valid)']:
        ax_1.plot(np.arange(1, stats.shape[0]) * stats_interval, 
                  stats[1:, keys[k]], label=k)
#     ax_1.legend(loc=0)
    ax_1.grid('on') # Turn axes grid on
    ax_1.legend(loc='best', fontsize=11) # Add a legend
    ax_1.set_xlabel('Epoch number')

    fig_1.tight_layout() # This minimises whitespace around the axes.
    fig_1.savefig('err_' + fig_name) # Save figure to current directory in PDF format
    
    # Plot the change in the validation and training set accuracy over training.
    fig_2 = plt.figure(figsize=(8, 4))
    ax_2 = fig_2.add_subplot(111)
    for k in ['acc(train)', 'acc(valid)']:
        ax_2.plot(np.arange(1, stats.shape[0]) * stats_interval, 
                  stats[1:, keys[k]], label=k)
#     ax_2.legend(loc=0)
    ax_2.grid('on') # Turn axes grid on
    ax_2.legend(loc='best', fontsize=11) # Add a legend
    ax_2.set_xlabel('Epoch number')
    
    fig_2.tight_layout() # This minimises whitespace around the axes.
    fig_2.savefig('acc_' + fig_name) # Save figure to current directory in PDF format
    
    return stats, keys, run_time, fig_1, ax_1, fig_2, ax_2

## TRAIN AND TEST

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

def test_model(model, error, learning_rule, train_data, test_data, num_epochs, stats_interval, notebook=True):
    
    # As well as monitoring the error over training also monitor classification
    # accuracy i.e. proportion of most-probable predicted classes being equal to targets
    data_monitors={'acc': lambda y, t: (y.argmax(-1) == t.argmax(-1)).mean()}

    # Use the created objects to initialise a new Optimiser instance.
    optimiser = Optimiser(
        model, error, learning_rule, train_data, test_data, None,data_monitors, notebook=notebook)

    # Run the optimiser for 5 epochs (full passes through the training set)
    # printing statistics every epoch.
    stats, keys, run_time = optimiser.train(num_epochs=num_epochs, stats_interval=stats_interval)
    
    return stats, keys, run_time

In [3]:
# The below code will set up the data providers, random number
# generator and logger objects needed for training runs. As
# loading the data from file take a little while you generally
# will probably not want to reload the data providers on
# every training run. If you wish to reset their state you
# should instead use the .reset() method of the data providers.
import numpy as np
from itertools import product
import logging
from mlp.data_providers import MNISTDataProvider, EMNISTDataProvider

# Seed a random number generator
seed = 11102018 
rng = np.random.RandomState(seed)
# Set up a logger object to print info about the training run to stdout
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [logging.StreamHandler()]

In [4]:
# The model set up code below is provided as a starting point.
# You will probably want to add further code cells for the
# different experiments you run.

from mlp.layers import AffineLayer, SoftmaxLayer, SigmoidLayer, ReluLayer, LeakyReluLayer
from mlp.errors import CrossEntropySoftmaxError
from mlp.models import MultipleLayerModel
from mlp.initialisers import ConstantInit, GlorotUniformInit
from mlp.learning_rules import AdamLearningRule, RMSPropLearningRule, GradientDescentLearningRule, AdaGradLearningRule, AdamLearningRuleWithWeightDecay
from mlp.optimisers import Optimiser

### batch_size = 100

In [None]:
# Set batch size
batch_size = 100

# Create data provider objects for the MNIST data set
train_data = EMNISTDataProvider('train', batch_size=batch_size, rng=rng)
valid_data = EMNISTDataProvider('valid', batch_size=batch_size, rng=rng)

# RMSprop

In [None]:
# setup hyperparameters
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100

learning_rates = [0.000001, 0.0000015, 0.00001, 0.000015, 0.0001, 0.00015]  # scale for random parameter initialisation
decay_rates = np.linspace(0.1, 0.9, 5)
final_errors_train = []
final_errors_valid = []
final_accs_train = []
final_accs_valid = []

for i, element in enumerate(product(learning_rates, decay_rates)):
    
    print('-' * 80)
    print('learning_rate={0:.2e}, decay_rate={1:.2e}'
          .format(element[0], element[1]))
    print('-' * 80)
    
    # Reset random number generator and data provider states on each run
    # to ensure reproducibility of results
    rng.seed(seed)
    train_data.reset()
    valid_data.reset()

    weights_init = GlorotUniformInit(rng=rng)
    biases_init = ConstantInit(0.)

    # Create a model with three hidden layers
    model = MultipleLayerModel([
        AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
    ])

    # Initialise a cross entropy error object
    error = CrossEntropySoftmaxError()

    # Use a basic gradient descent learning rule
    learning_rule = RMSPropLearningRule(learning_rate=element[0], beta=element[1])

    # Remember to use notebook=False when you write a script to be run in a terminal
    stats, keys, run_time, fig_1, ax_1, fig_2, ax_2 = train_model_and_plot_stats(
        model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, fig_name='RMSprop_' + str(i) + '.pdf', notebook=True)

    plt.show()

    print('    final error(train) = {0:.2e}'.format(stats[-1, keys['error(train)']]))
    print('    final error(valid) = {0:.2e}'.format(stats[-1, keys['error(valid)']]))
    print('    final acc(train)   = {0:.2e}'.format(stats[-1, keys['acc(train)']]))
    print('    final acc(valid)   = {0:.2e}'.format(stats[-1, keys['acc(valid)']]))
    print('    run time per epoch = {0:.2f}'.format(run_time * 1. / num_epochs))

    final_errors_train.append(stats[-1, keys['error(train)']])
    final_errors_valid.append(stats[-1, keys['error(valid)']])
    final_accs_train.append(stats[-1, keys['acc(train)']])
    final_accs_valid.append(stats[-1, keys['acc(valid)']])

In [None]:
j = 0
print('| batch_size | learning_rate | decay_rate | final error(train) | final error(valid) | final acc(train) | final acc(valid) |')
print('|------------|---------------|------------|--------------------|--------------------|------------------|------------------|')
for element in product(learning_rates, decay_rates):
    print('| {0}   | {1:2f}   | {2:.2e}   | {3:.2e}   | {4:.2e}   | {5:.2e}   |  {6:.2f}      | {7:.2f}       |'
          .format(j, batch_size, element[0], element[1], 
                  final_errors_train[j], final_errors_valid[j],
                  final_accs_train[j], final_accs_valid[j]))
    j += 1

# Adam

In [None]:
# setup hyperparameters
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100

learning_rates = [0.000001, 0.0000015, 0.00001, 0.000015]  # scale for random parameter initialisation
decay_rates_1 = [0.85, 0.9, 0.95]
decay_rates_2 = [0.95, 0.975, 0.999]
final_errors_train = []
final_errors_valid = []
final_accs_train = []
final_accs_valid = []

for i, element in enumerate(product(learning_rates, decay_rates_1, decay_rates_2)):
    
    print('-' * 80)
    print('learning_rate={0:.2e}, decay_rate={1:.2e}'
          .format(element[0], element[1]))
    print('-' * 80)
    
    # Reset random number generator and data provider states on each run
    # to ensure reproducibility of results
    rng.seed(seed)
    train_data.reset()
    valid_data.reset()

    weights_init = GlorotUniformInit(rng=rng)
    biases_init = ConstantInit(0.)

    # Create a model with three hidden layers
    model = MultipleLayerModel([
        AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
    ])

    # Initialise a cross entropy error object
    error = CrossEntropySoftmaxError()

    # Use a basic gradient descent learning rule
    learning_rule = AdamLearningRule(learning_rate=element[0], beta_1=element[1], beta_2=element[2])

    # Remember to use notebook=False when you write a script to be run in a terminal
    stats, keys, run_time, fig_1, ax_1, fig_2, ax_2 = train_model_and_plot_stats(
        model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, fig_name='Adam_' + str(i) + '.pdf', notebook=True)

    plt.show()

    print('    final error(train) = {0:.2e}'.format(stats[-1, keys['error(train)']]))
    print('    final error(valid) = {0:.2e}'.format(stats[-1, keys['error(valid)']]))
    print('    final acc(train)   = {0:.2e}'.format(stats[-1, keys['acc(train)']]))
    print('    final acc(valid)   = {0:.2e}'.format(stats[-1, keys['acc(valid)']]))
    print('    run time per epoch = {0:.2f}'.format(run_time * 1. / num_epochs))

    final_errors_train.append(stats[-1, keys['error(train)']])
    final_errors_valid.append(stats[-1, keys['error(valid)']])
    final_accs_train.append(stats[-1, keys['acc(train)']])
    final_accs_valid.append(stats[-1, keys['acc(valid)']])

In [None]:
j = 0
print('| batch_size | learning_rate | dr_1 | dr_2 | final error(train) | final error(valid) | final acc(train) | final acc(valid) |')
print('|------------|---------------|------|------|--------------------|--------------------|------------------|------------------|')
for element in product(learning_rates, decay_rates_1, decay_rates_2):
    print('| {0}   | {1:2f}   | {2:.2e}   | {3:.2e}   | {4:.2e}   | {5:.2e}   | {6:.2e}    | {7:.2f}      | {8:.2f}       |'
          .format(j, batch_size, element[0], element[1], element[2], 
                  final_errors_train[j], final_errors_valid[j],
                  final_accs_train[j], final_accs_valid[j]))
    j += 1

### Aditional training on Adam

In [None]:
# setup hyperparameters
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100

learning_rates = [0.000001, 0.0000015, 0.00001, 0.000015]  # scale for random parameter initialisation
decay_rates_1 = [0.99]#[0.85, 0.9, 0.95]
decay_rates_2 = [0.999]#[0.95, 0.975, 0.999]
final_errors_train = []
final_errors_valid = []
final_accs_train = []
final_accs_valid = []

for i, element in enumerate(product(learning_rates, decay_rates_1, decay_rates_2)):
    
    print('-' * 80)
    print('learning_rate={0:.2e}, decay_rate={1:.2e}'
          .format(element[0], element[1]))
    print('-' * 80)
    
    # Reset random number generator and data provider states on each run
    # to ensure reproducibility of results
    rng.seed(seed)
    train_data.reset()
    valid_data.reset()

    weights_init = GlorotUniformInit(rng=rng)
    biases_init = ConstantInit(0.)

    # Create a model with three hidden layers
    model = MultipleLayerModel([
        AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
        ReluLayer(),
        AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
    ])

    # Initialise a cross entropy error object
    error = CrossEntropySoftmaxError()

    # Use a basic gradient descent learning rule
    learning_rule = AdamLearningRule(learning_rate=element[0], beta_1=element[1], beta_2=element[2])

    # Remember to use notebook=False when you write a script to be run in a terminal
    stats, keys, run_time, fig_1, ax_1, fig_2, ax_2 = train_model_and_plot_stats(
        model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, fig_name='Adam_' + str(i) + '.pdf', notebook=True)

    plt.show()

    print('    final error(train) = {0:.2e}'.format(stats[-1, keys['error(train)']]))
    print('    final error(valid) = {0:.2e}'.format(stats[-1, keys['error(valid)']]))
    print('    final acc(train)   = {0:.2e}'.format(stats[-1, keys['acc(train)']]))
    print('    final acc(valid)   = {0:.2e}'.format(stats[-1, keys['acc(valid)']]))
    print('    run time per epoch = {0:.2f}'.format(run_time * 1. / num_epochs))

    final_errors_train.append(stats[-1, keys['error(train)']])
    final_errors_valid.append(stats[-1, keys['error(valid)']])
    final_accs_train.append(stats[-1, keys['acc(train)']])
    final_accs_valid.append(stats[-1, keys['acc(valid)']])

In [None]:
j = 0
print('| batch_size | learning_rate | dr_1 | dr_2 | final error(train) | final error(valid) | final acc(train) | final acc(valid) |')
print('|------------|---------------|------|------|--------------------|--------------------|------------------|------------------|')
for element in product(learning_rates, decay_rates_1, decay_rates_2):
    print('| {0}   | {1:2f}   | {2:.2e}   | {3:.2e}   | {4:.2e}   | {5:.2e}   | {6:.2e}    | {7:.2f}      | {8:.2f}       |'
          .format(j, batch_size, element[0], element[1], element[2], 
                  final_errors_train[j], final_errors_valid[j],
                  final_accs_train[j], final_accs_valid[j]))
    j += 1

## Adam With Weight Decay

In [None]:
#setup hyperparameters
learning_rate = 0.000015
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100


print('-' * 80)
print('learning_rate={0:.2e}'
      .format(learning_rate))
print('-' * 80)

# Reset random number generator and data provider states on each run
# to ensure reproducibility of results
rng.seed(seed)
train_data.reset()
valid_data.reset()

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Create a model with three hidden layers
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
learning_rule = AdamLearningRuleWithWeightDecay(learning_rate=learning_rate)

#Remember to use notebook=False when you write a script to be run in a terminal
stats, keys, run_time, fig_1, ax_1, fig_2, ax_2 = train_model_and_plot_stats(
    model, error, learning_rule, train_data, valid_data, num_epochs, stats_interval, fig_name='AdamWD_three_hidden_layers.pdf', notebook=True)

plt.show()

print('    final error(train) = {0:.2e}'.format(stats[-1, keys['error(train)']]))
print('    final error(valid) = {0:.2e}'.format(stats[-1, keys['error(valid)']]))
print('    final acc(train)   = {0:.2e}'.format(stats[-1, keys['acc(train)']]))
print('    final acc(valid)   = {0:.2e}'.format(stats[-1, keys['acc(valid)']]))
print('    run time per epoch = {0:.2f}'.format(run_time * 1. / num_epochs))

# final_errors_train.append(stats[-1, keys['error(train)']])
# final_errors_valid.append(stats[-1, keys['error(valid)']])
# final_accs_train.append(stats[-1, keys['acc(train)']])
# final_accs_valid.append(stats[-1, keys['acc(valid)']])

## TEST ALL THREE MODELS

### batch_size = 100

In [5]:
# Set batch size
batch_size = 100

# Create data provider objects for the MNIST data set
train_data = EMNISTDataProvider('train', batch_size=batch_size, rng=rng)
valid_data = EMNISTDataProvider('valid', batch_size=batch_size, rng=rng)
test_data = EMNISTDataProvider('test', batch_size=batch_size, rng=rng)

KeysView(<numpy.lib.npyio.NpzFile object at 0x1039d88d0>)
KeysView(<numpy.lib.npyio.NpzFile object at 0x114452eb8>)
KeysView(<numpy.lib.npyio.NpzFile object at 0x114452eb8>)


### SGD

In [6]:
#setup hyperparameters
learning_rate = 0.01
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100


print('-' * 80)
print('learning_rate={0:.2e}'
      .format(learning_rate))
print('-' * 80)

# Reset random number generator and data provider states on each run
# to ensure reproducibility of results
rng.seed(seed)
train_data.reset()
test_data.reset()

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Create a model with three hidden layers
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
learning_rule = GradientDescentLearningRule(learning_rate=learning_rate)

#Remember to use notebook=False when you write a script to be run in a terminal
stats_1, keys_1, run_time_1 = test_model(
    model, error, learning_rule, train_data, test_data, num_epochs, stats_interval, notebook=True)

print('    final error(train) = {0:.2e}'.format(stats_1[-1, keys_1['error(train)']]))
print('    final error(test)  = {0:.2e}'.format(stats_1[-1, keys_1['error(valid)']]))
print('    final acc(train)   = {0:.2e}'.format(stats_1[-1, keys_1['acc(train)']]))
print('    final acc(test)    = {0:.2e}'.format(stats_1[-1, keys_1['acc(valid)']]))
print('    run time per epoch = {0:.2f}'.format(run_time_1 * 1. / num_epochs))

--------------------------------------------------------------------------------
learning_rate=1.00e-02
--------------------------------------------------------------------------------


HBox(children=(IntProgress(value=0), HTML(value='')))




AttributeError: 'dict' object has no attribute 'update_learning_rule'

### RMSprop

In [None]:
#setup hyperparameters
learning_rate = 0.00001
decay_rate = 0.9
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100


print('-' * 80)
print('learning_rate={0:.2e}, decay_rate={1:.2e}'
      .format(learning_rate, decay_rate))
print('-' * 80)

# Reset random number generator and data provider states on each run
# to ensure reproducibility of results
rng.seed(seed)
train_data.reset()
valid_data.reset()

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Create a model with three hidden layers
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
# learning_rule = RMSPropLearningRule()
learning_rule = RMSPropLearningRule(learning_rate=learning_rate, beta=decay_rate)

#Remember to use notebook=False when you write a script to be run in a terminal
stats_2, keys_2, run_time_2 = test_model(
    model, error, learning_rule, train_data, test_data, num_epochs, stats_interval, notebook=True)

print('    final error(train) = {0:.2e}'.format(stats_2[-1, keys_2['error(train)']]))
print('    final error(test)  = {0:.2e}'.format(stats_2[-1, keys_2['error(valid)']]))
print('    final acc(train)   = {0:.2e}'.format(stats_2[-1, keys_2['acc(train)']]))
print('    final acc(test)    = {0:.2e}'.format(stats_2[-1, keys_2['acc(valid)']]))
print('    run time per epoch = {0:.2f}'.format(run_time_2 * 1. / num_epochs))

### Adam

In [8]:
#setup hyperparameters
learning_rate = 0.000015
decay_rate_1 = 0.9
decay_rate_2 = 0.95
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100


print('-' * 80)
print('learning_rate={0:.2e}, decay_rate_1={1:.2e}, decay_rate_2={2:.2e}'
      .format(learning_rate, decay_rate_1, decay_rate_2))
print('-' * 80)

# Reset random number generator and data provider states on each run
# to ensure reproducibility of results
rng.seed(seed)
train_data.reset()
valid_data.reset()

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Create a model with three hidden layers
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
# learning_rule = AdamLearningRule()
learning_rule = AdamLearningRule(learning_rate=learning_rate, beta_1=decay_rate_1, beta_2=decay_rate_2)

#Remember to use notebook=False when you write a script to be run in a terminal
stats_3, keys_3, run_time_3 = test_model(
    model, error, learning_rule, train_data, test_data, num_epochs, stats_interval, notebook=True)

print('    final error(train) = {0:.2e}'.format(stats_3[-1, keys_3['error(train)']]))
print('    final error(test)  = {0:.2e}'.format(stats_3[-1, keys_3['error(valid)']]))
print('    final acc(train)   = {0:.2e}'.format(stats_3[-1, keys_3['acc(train)']]))
print('    final acc(test)    = {0:.2e}'.format(stats_3[-1, keys_3['acc(valid)']]))
print('    run time per epoch = {0:.2f}'.format(run_time_3 * 1. / num_epochs))

--------------------------------------------------------------------------------
learning_rate=1.50e-05, decay_rate_1=9.00e-01, decay_rate_2=9.50e-01
--------------------------------------------------------------------------------


HBox(children=(IntProgress(value=0), HTML(value='')))




AttributeError: 'dict' object has no attribute 'update_learning_rule'

### Plot Graph

In [None]:
# Plot the change in the validation and training set error over training.
fig_1 = plt.figure(figsize=(8, 4))
ax_1 = fig_1.add_subplot(111)
for k in ['error(valid)']:
    ax_1.plot(np.arange(1, stats_1.shape[0]) * stats_interval, 
              stats_1[1:, keys_1[k]], label='SGD: error(test)')
    ax_1.plot(np.arange(1, stats_2.shape[0]) * stats_interval, 
              stats_2[1:, keys_2[k]], label='RMSprop: error(test)')
    ax_1.plot(np.arange(1, stats_3.shape[0]) * stats_interval, 
              stats_3[1:, keys_3[k]], label='Adam: error(test)')
#     ax_1.legend(loc=0)
ax_1.grid('on') # Turn axes grid on
ax_1.legend(loc='best', fontsize=11) # Add a legend
ax_1.set_xlabel('Epoch number')

fig_1.tight_layout() # This minimises whitespace around the axes.
fig_1.savefig('err_test_models') # Save figure to current directory in PDF format

# Plot the change in the validation and training set accuracy over training.
fig_2 = plt.figure(figsize=(8, 4))
ax_2 = fig_2.add_subplot(111)
for k in ['acc(valid)']:
    ax_2.plot(np.arange(1, stats_1.shape[0]) * stats_interval, 
              stats_1[1:, keys_1[k]], label='SGD: acc(test)')
    ax_2.plot(np.arange(1, stats_2.shape[0]) * stats_interval, 
              stats_2[1:, keys_2[k]], label='RMSprop: acc(test)')
    ax_2.plot(np.arange(1, stats_3.shape[0]) * stats_interval, 
              stats_3[1:, keys_3[k]], label='Adam: acc(test)')
#     ax_2.legend(loc=0)
ax_2.grid('on') # Turn axes grid on
ax_2.legend(loc='best', fontsize=11) # Add a legend
ax_2.set_xlabel('Epoch number')

fig_2.tight_layout() # This minimises whitespace around the axes.
fig_2.savefig('acc_test_models') # Save figure to current directory in PDF format


plt.show()

### Adagrad

In [None]:
#setup hyperparameters
learning_rate = 0.000015
decay_rate_1 = 0.9
decay_rate_2 = 0.95
num_epochs = 100
stats_interval = 1
input_dim, output_dim, hidden_dim = 784, 47, 100


print('-' * 80)
print('learning_rate={0:.2e}, decay_rate_1={1:.2e}, decay_rate_2={2:.2e}'
      .format(learning_rate, decay_rate_1, decay_rate_2))
print('-' * 80)

# Reset random number generator and data provider states on each run
# to ensure reproducibility of results
rng.seed(seed)
train_data.reset()
valid_data.reset()

weights_init = GlorotUniformInit(rng=rng)
biases_init = ConstantInit(0.)

# Create a model with three hidden layers
model = MultipleLayerModel([
    AffineLayer(input_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, hidden_dim, weights_init, biases_init), 
    ReluLayer(),
    AffineLayer(hidden_dim, output_dim, weights_init, biases_init)
])

error = CrossEntropySoftmaxError()
# Use a basic gradient descent learning rule
# learning_rule = AdamLearningRule()
learning_rule = AdaGradLearningRule()

#Remember to use notebook=False when you write a script to be run in a terminal
stats_4, keys_4, run_time_4 = test_model(
    model, error, learning_rule, train_data, test_data, num_epochs, stats_interval, notebook=True)

print('    final error(train) = {0:.2e}'.format(stats_4[-1, keys_4['error(train)']]))
print('    final error(test)  = {0:.2e}'.format(stats_4[-1, keys_4['error(valid)']]))
print('    final acc(train)   = {0:.2e}'.format(stats_4[-1, keys_4['acc(train)']]))
print('    final acc(test)    = {0:.2e}'.format(stats_4[-1, keys_4['acc(valid)']]))
print('    run time per epoch = {0:.2f}'.format(run_time_4 * 1. / num_epochs))

In [1]:
# Plot the change in the validation and training set error over training.
fig_1 = plt.figure(figsize=(8, 4))
ax_1 = fig_1.add_subplot(111)
for k in ['error(valid)']:
    ax_1.plot(np.arange(1, stats_1.shape[0]) * stats_interval, 
              stats_1[1:, keys_1[k]], label='SGD: error(test)')
    ax_1.plot(np.arange(1, stats_2.shape[0]) * stats_interval, 
              stats_2[1:, keys_2[k]], label='RMSprop: error(test)')
    ax_1.plot(np.arange(1, stats_3.shape[0]) * stats_interval, 
              stats_3[1:, keys_3[k]], label='Adam: error(test)')
    ax_1.plot(np.arange(1, stats_4.shape[0]) * stats_interval, 
              stats_4[1:, keys_4[k]], label='Adagrad: error(test)')
#     ax_1.legend(loc=0)
ax_1.grid('on') # Turn axes grid on
ax_1.legend(loc='best', fontsize=11) # Add a legend
ax_1.set_xlabel('Epoch number')

fig_1.tight_layout() # This minimises whitespace around the axes.
fig_1.savefig('full_err_test_models.pdf') # Save figure to current directory in PDF format

# Plot the change in the validation and training set accuracy over training.
fig_2 = plt.figure(figsize=(8, 4))
ax_2 = fig_2.add_subplot(111)
for k in ['acc(valid)']:
    ax_2.plot(np.arange(1, stats_1.shape[0]) * stats_interval, 
              stats_1[1:, keys_1[k]], label='SGD: acc(test)')
    ax_2.plot(np.arange(1, stats_2.shape[0]) * stats_interval, 
              stats_2[1:, keys_2[k]], label='RMSprop: acc(test)')
    ax_2.plot(np.arange(1, stats_3.shape[0]) * stats_interval, 
              stats_3[1:, keys_3[k]], label='Adam: acc(test)')
    ax_2.plot(np.arange(1, stats_4.shape[0]) * stats_interval, 
              stats_4[1:, keys_4[k]], label='Adagrad: acc(test)')
#     ax_2.legend(loc=0)
ax_2.grid('on') # Turn axes grid on
ax_2.legend(loc='best', fontsize=11) # Add a legend
ax_2.set_xlabel('Epoch number')

fig_2.tight_layout() # This minimises whitespace around the axes.
fig_2.savefig('full_acc_test_models.pdf') # Save figure to current directory in PDF format


plt.show()

NameError: name 'plt' is not defined

In [None]:
learning_rates = [0.000001, 0.0000015, 0.00001, 0.000015]  # scale for random parameter initialisation
decay_rates_1 = [0.99, 0,999]#[0.85, 0.9, 0.95]
decay_rates_2 = [0.999]#[0.95, 0.975, 0.999]

for i, element in enumerate(product(learning_rates, decay_rates_1, decay_rates_2)):
    print(i, element[0], element[1], element[2])

In [None]:
import numpy as np
from itertools import product
a = np.linspace(0.1, 0.9, 5) #np.array([1, 2, 3, 4, 5 , 6])
b = [0.000001, 0.0000015, 0.00001, 0.000015, 0.0001, 0.00015] #np.array(['a', 'b', 'c'])

for i, element in enumerate(product(a, b)):
    print(i, element)