# A short example on using hyperopt/hyperband with pytorch 

The hyperband algorithm randomly generates a set of hyperparameter configurations and then tests them with some model on a given dataset. Initially, the tests are run using very few iterations. Only the best-performing fraction of the configurations (the top 1/eta, i.e. one third with the default `eta = 3` used below) is kept and the rest are discarded (successive halving).

In further steps the number of generated configurations is reduced and the number of iterations to evaluate their performance is increased. This is repeated until only one configuration remains. Please see this [post](https://people.eecs.berkeley.edu/~kjamieson/hyperband.html) with a link to the paper for more details.

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
from torch.autograd import Variable
from torchvision.datasets import MNIST
from torchvision import transforms
from hyperopt import hp
from hyperopt.pyll.stochastic import sample
import pickle

The following code for the hyperband algorithm is adapted from [here](https://github.com/zygmuntz/hyperband). I slightly modified it to support python 3.

In [2]:
from __future__ import print_function

import numpy as np

from random import random
from math import log, ceil
from time import time, ctime



class Hyperband:
    """Hyperband search: several brackets of successive halving.

    Each bracket draws a batch of random configurations via ``get_params``,
    evaluates all of them with ``try_params`` for a small number of
    iterations, keeps the best ``1 / eta`` of them and repeats with
    ``eta`` times more iterations.

    Args:
        get_params_function: zero-argument callable returning one
            configuration (any object; it is passed back to
            ``try_params_function`` unchanged).
        try_params_function: callable ``(n_iterations, params) -> dict``.
            The returned dict must contain the key ``'loss'`` (lower is
            better) and may contain ``'early_stop'``; configurations whose
            result has a truthy ``'early_stop'`` are dropped from the bracket.
        max_iter: maximum iterations per configuration (default 81).
        eta: configuration downsampling rate (default 3); must be > 1.

    Raises:
        ValueError: if ``max_iter < 1`` or ``eta <= 1``.
    """

    def __init__(self, get_params_function, try_params_function,
                 max_iter=81, eta=3):
        if max_iter < 1:
            raise ValueError("max_iter must be >= 1, got {!r}".format(max_iter))
        if eta <= 1:
            raise ValueError("eta must be > 1, got {!r}".format(eta))

        self.get_params = get_params_function
        self.try_params = try_params_function

        self.max_iter = max_iter    # maximum iterations per configuration
        self.eta = eta              # defines configuration downsampling rate

        self.logeta = lambda x: log(x) / log(self.eta)

        # s_max = floor(log_eta(max_iter)). Counting with exact comparisons
        # avoids the float error of int(log(x) / log(eta)), which floors an
        # exact power one too low for some inputs (e.g. max_iter=243, eta=3).
        s_max = 0
        while self.eta ** (s_max + 1) <= self.max_iter:
            s_max += 1
        self.s_max = s_max
        self.B = (self.s_max + 1) * self.max_iter

        self.results = []    # list of dicts, accumulated over all run() calls
        self.counter = 0
        self.best_loss = np.inf
        self.best_counter = -1

    # can be called multiple times
    def run(self, skip_last=0, dry_run=False):
        """Run every bracket once and return all results gathered so far.

        Args:
            skip_last: number of final halving rounds to skip in each bracket
                (the longest-running evaluations).
            dry_run: if true, ``try_params`` is not called; each evaluation
                gets random ``'loss'``, ``'log_loss'`` and ``'auc'`` values.

        Returns:
            ``self.results``: one dict per evaluation, i.e. the dict returned
            by ``try_params`` extended with the keys ``'counter'``,
            ``'seconds'``, ``'params'`` and ``'iterations'``. The list also
            contains the results of earlier ``run()`` calls on this instance.
        """
        for s in reversed(range(self.s_max + 1)):

            # initial number of configurations
            n = int(ceil(self.B / self.max_iter / (s + 1) * self.eta ** s))

            # initial number of iterations per config
            r = self.max_iter * self.eta ** (-s)

            # n random configurations
            T = [self.get_params() for _ in range(n)]

            for i in range((s + 1) - int(skip_last)):    # changed from s + 1

                # Run each of the n configs for <iterations>
                # and keep best (n_configs / eta) configurations

                n_configs = n * self.eta ** (-i)
                n_iterations = r * self.eta ** (i)

                print("\n*** {} configurations x {:.1f} iterations each".format(n_configs, n_iterations))

                val_losses = []
                early_stops = []

                for t in T:

                    self.counter += 1
                    print("\n{} | {} | lowest loss so far: {:.4f} (run {})\n".format(self.counter, ctime(), self.best_loss, self.best_counter))

                    start_time = time()

                    if dry_run:
                        result = {'loss': random(), 'log_loss': random(), 'auc': random()}
                    else:
                        result = self.try_params(n_iterations, t)        # <---

                    assert isinstance(result, dict)
                    assert 'loss' in result

                    seconds = int(round(time() - start_time))
                    print("\n{} seconds.".format(seconds))

                    loss = result['loss']
                    val_losses.append(loss)

                    early_stop = result.get('early_stop', False)
                    early_stops.append(early_stop)

                    # keeping track of the best result so far (for display only)
                    if loss < self.best_loss:
                        self.best_loss = loss
                        self.best_counter = self.counter

                    # annotate the caller's dict in place before storing it
                    result['counter'] = self.counter
                    result['seconds'] = seconds
                    result['params'] = t
                    result['iterations'] = n_iterations

                    self.results.append(result)

                # select a number of best configurations for the next loop
                # filter out early stops, if any
                ranking = np.argsort(val_losses)
                T = [T[j] for j in ranking if not early_stops[j]]
                T = T[0:int(n_configs / self.eta)]

        return self.results

## The model
Here I define a very simple model (based on the pytorch example for MNIST) that is constructed based on some hyperparameters (e.g. number of convolution filters, dropout or initialization scheme).

In [36]:
class Net(nn.Module):
    """Small CNN (two 5x5 conv layers + two linear layers) with configurable
    width, dropout rate, activation function and weight initialisation.

    Args:
        dropout: drop probability used both for the 2d dropout after the
            second convolution and for the dropout after the first linear layer.
        conv1_n: number of output channels of the first convolution.
        conv2_n: number of output channels of the second convolution.
        fc_n: number of units in the hidden linear layer.
        act_fn: one of ``'relu'``, ``'lrelu'``, ``'elu'``.
        init_fn: one of ``'xavier_uniform'``, ``'xavier_normal'``,
            ``'he_normal'``, ``'he_uniform'``.

    The flattened size ``4 * 4 * conv2_n`` assumes single-channel 28x28
    inputs (e.g. MNIST): 28 -> conv5 -> 24 -> pool -> 12 -> conv5 -> 8 -> pool -> 4.
    """

    def __init__(self, dropout, conv1_n, conv2_n, fc_n, act_fn, init_fn):
        assert act_fn in ['relu', 'lrelu', 'elu']
        assert init_fn in ['xavier_uniform', 'xavier_normal', 'he_normal', 'he_uniform']

        super(Net, self).__init__()

        # Pick the activation and the matching gain for Xavier initialisation.
        if act_fn == 'relu':
            self.act_fn = F.relu
            gain = init.calculate_gain('relu')
        elif act_fn == 'lrelu':
            self.act_fn = F.leaky_relu
            gain = init.calculate_gain('leaky_relu')
        else:
            self.act_fn = F.elu
            # NOTE(review): the leaky_relu gain is reused for ELU, presumably
            # because calculate_gain offers no 'elu' option -- confirm.
            gain = init.calculate_gain('leaky_relu')

        # Use the in-place ``*_`` initialisers; the names without the trailing
        # underscore are deprecated aliases.
        # init_layers is applied to hidden layers, init_last to the output layer
        # (Xavier variants use gain=1 for the output layer).
        if init_fn == 'xavier_uniform':
            init_layers = lambda tensor: init.xavier_uniform_(tensor, gain=gain)
            init_last = lambda tensor: init.xavier_uniform_(tensor, gain=1)
        elif init_fn == 'xavier_normal':
            init_layers = lambda tensor: init.xavier_normal_(tensor, gain=gain)
            init_last = lambda tensor: init.xavier_normal_(tensor, gain=1)
        elif init_fn == 'he_uniform':
            init_layers = init.kaiming_uniform_
            init_last = init.kaiming_uniform_
        else:
            init_layers = init.kaiming_normal_
            init_last = init.kaiming_normal_

        self.conv1 = nn.Conv2d(1, conv1_n, kernel_size=5, bias=False)
        init_layers(self.conv1.weight)
        self.conv2 = nn.Conv2d(conv1_n, conv2_n, kernel_size=5, bias=False)
        init_layers(self.conv2.weight)
        self.conv2_drop = nn.Dropout2d(p=dropout)
        self.dropout = dropout
        self.n_flat = 4 * 4 * conv2_n
        self.fc1 = nn.Linear(self.n_flat, fc_n)
        init_layers(self.fc1.weight)
        self.fc2 = nn.Linear(fc_n, 10)
        init_last(self.fc2.weight)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 10)``."""
        x = self.act_fn(F.max_pool2d(self.conv1(x), 2))
        x = self.act_fn(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, self.n_flat)
        x = self.act_fn(self.fc1(x))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc2(x)
        # Normalise over the class dimension explicitly; the implicit-dim
        # form of log_softmax is deprecated.
        return F.log_softmax(x, dim=1)

## The hyperparameter space
First, we have to define the hyperparameter space, i.e. which hyperparameters we are interested in and how they should be sampled. I use [hyperopt](http://hyperopt.github.io/hyperopt/) to define the search space.

In [37]:
# Hyperparameter search space.
# Every hyperopt label (the first argument of each hp.* call) must be unique
# within a space, so each entry uses its own dictionary key as its label.
space = {
    # dropout probability on a 0.1 grid between 0 and 0.5
    'dropout': hp.quniform('dropout', 0, 0.5, 0.1),
    'batchsize': hp.choice('batchsize', (128, 256)),
    'fc_n': hp.choice('fc_n', (32, 64, 128)),
    'conv1_n': hp.choice('conv1_n', (16, 32, 64)),
    'conv2_n': hp.choice('conv2_n', (16, 32, 64)),
    'init_fn': hp.choice('init_fn', ('xavier_uniform', 'xavier_normal', 'he_normal', 'he_uniform')),
    'act_fn': hp.choice('act_fn', ('relu', 'lrelu', 'elu')),
    # log-uniform: the sampled value is exp(u) with u drawn uniformly from [-10, -2]
    'lr': hp.loguniform('lr', -10, -2),
    'l2': hp.loguniform('l2', -10, -2),
}

The hyperband algorithm requires two functions (`get_params` and `try_params`).
As the names imply, `get_params` simply returns  a configuration sampled from the hyperparameter (search) space.
`try_params` takes a set of hyperparameters and then uses them to construct and train a model before returning the loss on the validation set. I used the test set for demonstration purposes.

In [38]:
def get_params():
    """Draw one random configuration from the module-level search ``space``."""
    configuration = sample(space)
    return configuration

In [39]:
def try_params(n_iterations, params):
    """Train a fresh ``Net`` on MNIST and report its loss on the MNIST test split.

    Args:
        n_iterations: number of training epochs; rounded to the nearest int
            because Hyperband passes it as a float.
        params: configuration dict with the keys ``'batchsize'``,
            ``'dropout'``, ``'conv1_n'``, ``'conv2_n'``, ``'fc_n'``,
            ``'act_fn'``, ``'init_fn'``, ``'lr'`` and ``'l2'``.

    Returns:
        dict with ``'loss'`` (mean negative log-likelihood per test sample)
        and ``'accuracy'`` (percentage of correctly classified test samples).

    Note:
        The score is computed on the test split, so it is not an unbiased
        estimate for the selected configuration.
    """
    from torch.utils.data import DataLoader

    n_iterations = int(round(n_iterations))
    print("iterations: ", n_iterations)
    print("params: ", params)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # One shared preprocessing pipeline for both splits.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    batch_size = params['batchsize']
    # Shuffle the training data so each epoch sees the batches in a new order.
    train_loader = DataLoader(
        MNIST('data', train=True, download=True, transform=transform),
        batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(
        MNIST('data', train=False, download=True, transform=transform),
        batch_size=batch_size)

    model = Net(params['dropout'], params['conv1_n'], params['conv2_n'],
                params['fc_n'], params['act_fn'], params['init_fn']).to(device)
    optimizer = optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['l2'])

    model.train()
    for _ in range(n_iterations):
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            loss = F.nll_loss(model(data), target)
            loss.backward()
            optimizer.step()

    model.eval()
    test_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not average) per batch so the final division by the
            # dataset size yields the exact per-sample mean.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()

    n_samples = len(test_loader.dataset)
    test_loss /= n_samples
    accuracy = 100. * correct / n_samples

    return {'loss': test_loss, 'accuracy': accuracy}

## Running hyperband
We can now run our experiments in order to hopefully find a good configuration of hyperparameters.

In [40]:
hyperband = Hyperband(get_params, try_params)
results = hyperband.run()


*** 81 configurations x 1.0 iterations each

1 | Wed Oct  4 17:25:13 2017 | lowest loss so far: inf (run -1)

iterations:  1
params:  {'lr': 0.08355500484510042, 'conv1_n': 32, 'init_fn': 'xavier_uniform', 'fc_n': 128, 'dropout': 0.4, 'conv2_n': 32, 'batchsize': 128, 'l2': 0.00018721158029257873, 'act_fn': 'lrelu'}

4 seconds.

2 | Wed Oct  4 17:25:18 2017 | lowest loss so far: 1.4036 (run 1)

iterations:  1
params:  {'lr': 0.0004112842617591849, 'conv1_n': 16, 'init_fn': 'xavier_normal', 'fc_n': 64, 'dropout': 0.2, 'conv2_n': 64, 'batchsize': 128, 'l2': 0.0002400717559614904, 'act_fn': 'lrelu'}

4 seconds.

3 | Wed Oct  4 17:25:22 2017 | lowest loss so far: 0.0864 (run 2)

iterations:  1
params:  {'lr': 0.0008600033044224089, 'conv1_n': 16, 'init_fn': 'he_normal', 'fc_n': 64, 'dropout': 0.2, 'conv2_n': 16, 'batchsize': 256, 'l2': 0.00234224380412097, 'act_fn': 'relu'}

4 seconds.

4 | Wed Oct  4 17:25:26 2017 | lowest loss so far: 0.0864 (run 2)

iterations:  1
params:  {'lr': 0.0001


4 seconds.

30 | Wed Oct  4 17:27:14 2017 | lowest loss so far: 0.0757 (run 12)

iterations:  1
params:  {'lr': 0.0007971812450578527, 'conv1_n': 64, 'init_fn': 'he_uniform', 'fc_n': 32, 'dropout': 0.4, 'conv2_n': 32, 'batchsize': 128, 'l2': 0.0002679302922015169, 'act_fn': 'lrelu'}

4 seconds.

31 | Wed Oct  4 17:27:18 2017 | lowest loss so far: 0.0757 (run 12)

iterations:  1
params:  {'lr': 0.1331963340992927, 'conv1_n': 64, 'init_fn': 'xavier_uniform', 'fc_n': 128, 'dropout': 0.1, 'conv2_n': 16, 'batchsize': 256, 'l2': 0.041651811394610716, 'act_fn': 'elu'}

4 seconds.

32 | Wed Oct  4 17:27:22 2017 | lowest loss so far: 0.0757 (run 12)

iterations:  1
params:  {'lr': 0.00017346293690085896, 'conv1_n': 32, 'init_fn': 'xavier_uniform', 'fc_n': 64, 'dropout': 0.2, 'conv2_n': 32, 'batchsize': 128, 'l2': 0.002823612683110437, 'act_fn': 'relu'}

4 seconds.

33 | Wed Oct  4 17:27:26 2017 | lowest loss so far: 0.0757 (run 12)

iterations:  1
params:  {'lr': 0.10541501955523469, 'conv1_n'


4 seconds.

59 | Wed Oct  4 17:29:15 2017 | lowest loss so far: 0.0685 (run 44)

iterations:  1
params:  {'lr': 0.07176505431643829, 'conv1_n': 32, 'init_fn': 'he_uniform', 'fc_n': 128, 'dropout': 0.0, 'conv2_n': 16, 'batchsize': 256, 'l2': 0.00028672526970305095, 'act_fn': 'relu'}

4 seconds.

60 | Wed Oct  4 17:29:19 2017 | lowest loss so far: 0.0685 (run 44)

iterations:  1
params:  {'lr': 0.06413748603338029, 'conv1_n': 16, 'init_fn': 'xavier_uniform', 'fc_n': 64, 'dropout': 0.4, 'conv2_n': 16, 'batchsize': 128, 'l2': 9.719298663485391e-05, 'act_fn': 'relu'}

4 seconds.

61 | Wed Oct  4 17:29:23 2017 | lowest loss so far: 0.0685 (run 44)

iterations:  1
params:  {'lr': 0.01319580553118669, 'conv1_n': 16, 'init_fn': 'xavier_normal', 'fc_n': 32, 'dropout': 0.30000000000000004, 'conv2_n': 32, 'batchsize': 256, 'l2': 0.0008840899621622176, 'act_fn': 'elu'}

4 seconds.

62 | Wed Oct  4 17:29:27 2017 | lowest loss so far: 0.0685 (run 44)

iterations:  1
params:  {'lr': 0.096746977218802


11 seconds.

88 | Wed Oct  4 17:31:59 2017 | lowest loss so far: 0.0481 (run 82)

iterations:  3
params:  {'lr': 0.0013278193607813074, 'conv1_n': 32, 'init_fn': 'he_normal', 'fc_n': 128, 'dropout': 0.4, 'conv2_n': 32, 'batchsize': 256, 'l2': 0.00024153331338579306, 'act_fn': 'elu'}

11 seconds.

89 | Wed Oct  4 17:32:09 2017 | lowest loss so far: 0.0411 (run 88)

iterations:  3
params:  {'lr': 0.01034764149541634, 'conv1_n': 64, 'init_fn': 'xavier_uniform', 'fc_n': 128, 'dropout': 0.2, 'conv2_n': 32, 'batchsize': 256, 'l2': 0.00016832489221715, 'act_fn': 'relu'}

11 seconds.

90 | Wed Oct  4 17:32:20 2017 | lowest loss so far: 0.0411 (run 88)

iterations:  3
params:  {'lr': 0.0004112842617591849, 'conv1_n': 16, 'init_fn': 'xavier_normal', 'fc_n': 64, 'dropout': 0.2, 'conv2_n': 64, 'batchsize': 128, 'l2': 0.0002400717559614904, 'act_fn': 'lrelu'}

12 seconds.

91 | Wed Oct  4 17:32:32 2017 | lowest loss so far: 0.0406 (run 90)

iterations:  3
params:  {'lr': 0.0002300591925480189, 'co


34 seconds.

117 | Wed Oct  4 17:40:16 2017 | lowest loss so far: 0.0276 (run 110)

iterations:  9
params:  {'lr': 0.002546565521290165, 'conv1_n': 64, 'init_fn': 'he_uniform', 'fc_n': 32, 'dropout': 0.4, 'conv2_n': 64, 'batchsize': 256, 'l2': 0.00019420322312303122, 'act_fn': 'elu'}

31 seconds.

*** 3.0 configurations x 27.0 iterations each

118 | Wed Oct  4 17:40:47 2017 | lowest loss so far: 0.0276 (run 110)

iterations:  27
params:  {'lr': 0.0004112842617591849, 'conv1_n': 16, 'init_fn': 'xavier_normal', 'fc_n': 64, 'dropout': 0.2, 'conv2_n': 64, 'batchsize': 128, 'l2': 0.0002400717559614904, 'act_fn': 'lrelu'}

101 seconds.

119 | Wed Oct  4 17:42:27 2017 | lowest loss so far: 0.0256 (run 118)

iterations:  27
params:  {'lr': 0.0013278193607813074, 'conv1_n': 32, 'init_fn': 'he_normal', 'fc_n': 128, 'dropout': 0.4, 'conv2_n': 32, 'batchsize': 256, 'l2': 0.00024153331338579306, 'act_fn': 'elu'}

91 seconds.

120 | Wed Oct  4 17:43:58 2017 | lowest loss so far: 0.0256 (run 118)

i


11 seconds.

145 | Wed Oct  4 17:54:13 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  3
params:  {'lr': 0.0004130988965427741, 'conv1_n': 64, 'init_fn': 'xavier_normal', 'fc_n': 128, 'dropout': 0.0, 'conv2_n': 16, 'batchsize': 128, 'l2': 7.115528328092609e-05, 'act_fn': 'relu'}

12 seconds.

146 | Wed Oct  4 17:54:25 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  3
params:  {'lr': 0.11232905181221525, 'conv1_n': 16, 'init_fn': 'xavier_normal', 'fc_n': 128, 'dropout': 0.30000000000000004, 'conv2_n': 32, 'batchsize': 128, 'l2': 0.016141870628517848, 'act_fn': 'elu'}

12 seconds.

147 | Wed Oct  4 17:54:36 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  3
params:  {'lr': 4.577394798930015e-05, 'conv1_n': 32, 'init_fn': 'xavier_normal', 'fc_n': 128, 'dropout': 0.4, 'conv2_n': 16, 'batchsize': 256, 'l2': 0.0023646627638231996, 'act_fn': 'relu'}

11 seconds.

148 | Wed Oct  4 17:54:47 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  3
params:  {'lr':


34 seconds.

173 | Wed Oct  4 18:13:26 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  9
params:  {'lr': 0.00016587687507210025, 'conv1_n': 32, 'init_fn': 'he_uniform', 'fc_n': 64, 'dropout': 0.4, 'conv2_n': 64, 'batchsize': 256, 'l2': 9.821919477548803e-05, 'act_fn': 'lrelu'}

31 seconds.

174 | Wed Oct  4 18:13:56 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  9
params:  {'lr': 5.684008369849797e-05, 'conv1_n': 32, 'init_fn': 'he_normal', 'fc_n': 64, 'dropout': 0.1, 'conv2_n': 32, 'batchsize': 256, 'l2': 0.01937548373008604, 'act_fn': 'relu'}

31 seconds.

175 | Wed Oct  4 18:14:27 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  9
params:  {'lr': 0.007425477299592599, 'conv1_n': 64, 'init_fn': 'he_uniform', 'fc_n': 64, 'dropout': 0.4, 'conv2_n': 16, 'batchsize': 256, 'l2': 0.00030136605782824806, 'act_fn': 'lrelu'}

31 seconds.

176 | Wed Oct  4 18:14:58 2017 | lowest loss so far: 0.0249 (run 121)

iterations:  9
params:  {'lr': 0.00012758721407422747,


303 seconds.

201 | Wed Oct  4 18:50:56 2017 | lowest loss so far: 0.0203 (run 191)

iterations:  81
params:  {'lr': 0.005215494834695938, 'conv1_n': 64, 'init_fn': 'xavier_normal', 'fc_n': 32, 'dropout': 0.1, 'conv2_n': 64, 'batchsize': 128, 'l2': 0.00012785054663949893, 'act_fn': 'elu'}

304 seconds.

*** 5 configurations x 81.0 iterations each

202 | Wed Oct  4 18:56:00 2017 | lowest loss so far: 0.0203 (run 191)

iterations:  81
params:  {'lr': 0.0006828671855591531, 'conv1_n': 16, 'init_fn': 'he_normal', 'fc_n': 64, 'dropout': 0.1, 'conv2_n': 16, 'batchsize': 128, 'l2': 0.10146351901746466, 'act_fn': 'elu'}

305 seconds.

203 | Wed Oct  4 19:01:04 2017 | lowest loss so far: 0.0203 (run 191)

iterations:  81
params:  {'lr': 0.00043521122298378505, 'conv1_n': 16, 'init_fn': 'xavier_normal', 'fc_n': 32, 'dropout': 0.0, 'conv2_n': 16, 'batchsize': 128, 'l2': 9.825493085017422e-05, 'act_fn': 'elu'}

299 seconds.

204 | Wed Oct  4 19:06:04 2017 | lowest loss so far: 0.0203 (run 191)

i

In [41]:
with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)

As you can see we found a reasonable setting of hyperparameters for our model. If we wanted to further explore the hyperparameter space we could call `hyperband.run()` again.

In [43]:
sorted(results, key=lambda r: r['loss'])[:5]

[{'accuracy': 99.44,
  'counter': 191,
  'iterations': 81.0,
  'loss': 0.02028474977016449,
  'params': {'act_fn': 'lrelu',
   'batchsize': 256,
   'conv1_n': 32,
   'conv2_n': 64,
   'dropout': 0.4,
   'fc_n': 64,
   'init_fn': 'he_uniform',
   'l2': 9.821919477548803e-05,
   'lr': 0.00016587687507210025},
  'seconds': 270},
 {'accuracy': 99.28,
  'counter': 192,
  'iterations': 27.0,
  'loss': 0.02320907483100891,
  'params': {'act_fn': 'lrelu',
   'batchsize': 128,
   'conv1_n': 16,
   'conv2_n': 16,
   'dropout': 0.30000000000000004,
   'fc_n': 128,
   'init_fn': 'he_uniform',
   'l2': 0.0005953039730324044,
   'lr': 0.0008042245680506868},
  'seconds': 101},
 {'accuracy': 99.39,
  'counter': 121,
  'iterations': 81.0,
  'loss': 0.024867862105369568,
  'params': {'act_fn': 'lrelu',
   'batchsize': 256,
   'conv1_n': 16,
   'conv2_n': 64,
   'dropout': 0.2,
   'fc_n': 128,
   'init_fn': 'xavier_normal',
   'l2': 6.147400023396258e-05,
   'lr': 0.00030599368840705983},
  'seconds': 2