# Tune a CNN on MNIST

This tutorial walks through using Ax to tune two hyperparameters (learning rate and momentum) for a PyTorch CNN on the MNIST dataset trained using SGD with momentum.


In [1]:
import torch
import numpy as np
from ax import (
    ParameterType,
    RangeParameter,
    SearchSpace,
    SimpleExperiment,
    modelbridge,
    models,
)
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.utils.notebook.plotting import render, init_notebook_plotting
from ax.utils.tutorials.cnn_utils import load_mnist, train, evaluate


In [3]:
# Set up notebook plotting; per the INFO log below, this injects the Plotly library into the cell output.
init_notebook_plotting()

[INFO 04-01 16:11:38] ipy_plotting: Injecting Plotly library into cell. Do not overwrite or delete cell.


In [4]:
# `dtype` and `device` are passed to `train`/`evaluate`; `device` is also handed to the GP+EI model.
dtype = torch.float
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Load MNIST data
First, we need to load the MNIST data and partition it into training, validation, and test sets.

Note: this will download the dataset if necessary.

In [5]:
# `train_loader` and `valid_loader` are the default loaders of `train_evaluate` (used during tuning);
# `test_loader` is used for the final evaluation of the best arm.
train_loader, valid_loader, test_loader = load_mnist()

## Define function to optimize
We need to define the function we want to optimize. In this tutorial, we want to optimize classification accuracy on the validation set as a function of the learning rate and momentum. The function takes in a parameterization (set of parameter values), computes the classification accuracy, and returns a dictionary of metric name ('accuracy') to a tuple with the mean and standard error.

In [10]:
def train_evaluate(
    parameterization,
    weight=None,
    train_loader=train_loader,
    eval_loader=valid_loader,
):
    """Train a network with the given hyperparameters and score it.

    Args:
        parameterization: dict of parameter names to values of those parameters;
            forwarded to `train` as `params`.
        weight: unused here; required by the evaluation function signature.
        train_loader: DataLoader with examples and labels for training.
        eval_loader: DataLoader with examples and labels for evaluation
            (defaults to the validation loader).

    Returns:
        A dict mapping the metric name 'accuracy' to a `(mean, standard error)`
        tuple; the standard error is always reported as 0.0.
    """
    trained_net = train(
        train_loader=train_loader, params=parameterization, dtype=dtype, device=device
    )
    score = evaluate(net=trained_net, data_loader=eval_loader, dtype=dtype, device=device)
    return {'accuracy': (score, 0.0)}

## Define the search space for the parameters
Here, we set the bounds on the learning rate and momentum and set the parameter space for the learning rate to be on a log scale.

In [11]:
# Learning rate: float in [1e-6, 0.4], searched on a log scale.
lr_parameter = RangeParameter(
    name='lr',
    parameter_type=ParameterType.FLOAT,
    lower=1e-6,
    upper=0.4,
    log_scale=True,
)
# Momentum: float in [0.0, 1.0]; no log scaling is requested.
momentum_parameter = RangeParameter(
    name='momentum',
    parameter_type=ParameterType.FLOAT,
    lower=0.0,
    upper=1.0,
)
search_space = SearchSpace(parameters=[lr_parameter, momentum_parameter])

## Setup experiment
Finally, we set up a SimpleExperiment with our search space and evaluation function. Note that SimpleExperiment can be used here instead of Experiment because points tried in optimization are computed synchronously via the evaluation function.

In [12]:
# The objective name matches the 'accuracy' key of the dict returned by `train_evaluate`.
exp = SimpleExperiment(
    name='tune_cnn',
    objective_name='accuracy',
    evaluation_function=train_evaluate,
    search_space=search_space,
)

## Setup optimization loop

In [13]:
# The Sobol generator is instantiated only once, because its underlying model does not need to be
# re-fit every time new data is added to the experiment.
sobol = modelbridge.get_sobol(search_space=exp.search_space)
print(f"Running Sobol initialization trials...")
for _ in range(5):
    sobol_run = sobol.gen(1)
    exp.new_trial(generator_run=sobol_run)

for i in range(15):
    print(f"Running GP+EI optimization trial {i+1}/15...")
    # The underlying GP model has to be re-fit whenever new data is added to the experiment,
    # so a fresh GP+EI model is built on every iteration from the data evaluated so far.
    observed_data = exp.eval()
    # Note: a device can be specified here so that Bayesian optimization is performed on the
    # same device (e.g. on a gpu).
    gpei = modelbridge.get_GPEI(experiment=exp, data=observed_data, device=device)
    gpei_run = gpei.gen(1)
    exp.new_trial(generator_run=gpei_run)

Running Sobol initialization trials...
Running GP+EI optimization trial 1/15...
Running GP+EI optimization trial 2/15...
Running GP+EI optimization trial 3/15...
Running GP+EI optimization trial 4/15...
Running GP+EI optimization trial 5/15...
Running GP+EI optimization trial 6/15...
Running GP+EI optimization trial 7/15...
Running GP+EI optimization trial 8/15...
Running GP+EI optimization trial 9/15...
Running GP+EI optimization trial 10/15...
Running GP+EI optimization trial 11/15...
Running GP+EI optimization trial 12/15...
Running GP+EI optimization trial 13/15...
Running GP+EI optimization trial 14/15...
Running GP+EI optimization trial 15/15...


## Plot response surface

In [14]:
# Contour plot of the 'accuracy' metric over lr (x-axis) and momentum (y-axis), using the last
# GP+EI model fitted in the optimization loop.
contour_plot = plot_contour(
    model=gpei,
    param_x='lr',
    param_y='momentum',
    metric_name='accuracy',
)
render(contour_plot)

## Plot best objective as function of the iteration

In [15]:
# `optimization_trace_single_method` expects a 2-d array of means, because it expects to average
# means from multiple optimization runs, so we wrap our objectives array in another array
# (a single row holding the objective mean of every trial).
objective_means = [trial.objective_mean for trial in exp.trials.values()]
best_objectives = np.array([objective_means])
# Running maximum along the trial axis: the best objective seen up to each iteration.
running_best = np.maximum.accumulate(best_objectives, axis=1)
best_objective_plot = optimization_trace_single_method(y=running_best)
render(best_objective_plot)

## Train CNN with best hyperparameters and evaluate on test set

In [16]:
data = exp.fetch_data()
df = data.df
# Select the rows whose mean equals the maximum, then take the first matching arm name.
is_best = df['mean'] == df['mean'].max()
best_arm_name = df.loc[is_best, 'arm_name'].iloc[0]
best_arm = exp.arms_by_name[best_arm_name]
best_arm

Arm(name=17_0, params={'lr': 0.0017335090006982416, 'momentum': 0.5598888204187598})

In [17]:
# Re-train with the best arm's parameters and score on the test loader instead of the validation
# loader. The result is the metric dict returned by `train_evaluate`.
test_accuracy = train_evaluate(parameterization=best_arm.params, eval_loader=test_loader)

In [18]:
# Element 0 of the 'accuracy' tuple is the mean; convert it to a percentage rounded to 2 places.
test_accuracy_pct = round(test_accuracy['accuracy'][0] * 100, 2)
print(f"Classification Accuracy (test set): {test_accuracy_pct}%")

Classification Accuracy (test set): 97.1%
