In [1]:
!pip install -q dask_cuda torch torchtext skorch
!pip -q install dask[dataframe] --upgrade

# Hyperparameter optimization with Skorch


## Setup Dask

In [2]:
import torch
from dask_cuda import LocalCUDACluster
from distributed import Client

# if you have GPU(s), use dask_cuda to automatically make use of them in your dask cluster
if torch.cuda.is_available():
    cluster = LocalCUDACluster()
    client = Client(cluster)
else:
    client = Client(processes=False, threads_per_worker=4,
                    n_workers=1, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:42413  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.68 GB


In [3]:
# for reproducibility
import random
import numpy as np

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Create Data

In [4]:
import torchtext
from torchtext import data
from torchtext import datasets

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# takes approx. 10 minutes to download data and embeddings (will be cached for re-use)

# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True, )
LABEL = data.Field(sequential=False, unk_token=None)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# will be used to initialize model embeddings layer
vocab = torchtext.vocab.GloVe(name='6B', dim=100)

# build the vocabulary
max_size = 25_000 # shorten for demonstrative purposes
TEXT.build_vocab(train, vectors=vocab, max_size=max_size)
LABEL.build_vocab(train)

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_sizes=(32, 64), device='cpu')

In [7]:
# itos := index-to-string
# note the 2 extra tokens added for us: '<unk>', '<pad>'
TEXT.vocab.itos[:5]

['<unk>', '<pad>', 'the', 'a', 'and']

In [8]:
assert (len(TEXT.vocab.itos) == max_size + 2)

In [9]:
# peek at the data
print(train.examples[0].text)
print()
print(train.examples[0].label)

['a', 'masterful', 'treatment', 'of', 'james', "caine's", '"the', 'postman', 'always', 'rings', 'twice"', 'as', 'luchino', "visconti's", 'first', 'film', 'shot', 'primarily', 'around', 'ferrara', 'in', 'a', 'soulless', 'war-torn', 'italy.', 'the', 'original', 'negative', 'was', 'thought', 'destroyed', 'but', 'visconti', 'saved', 'a', 'print', 'and', 'fortunately', 'we', 'can', 'see', 'this', 'early', 'neo-realist', 'work', 'today.', 'a', 'ruggedly', 'handsome', 'massimo', 'girotti', 'and', 'clara', 'calamai', '(who', 'had', 'recently', 'revealed', 'her', 'breasts', 'in', 'la', 'cena', 'delle', 'beffe"', '(1941),', 'star', 'as', 'the', 'sensually-charged', 'and', 'ill-fated', 'lovers', 'who', 'plot', 'to', 'kill', 'her', 'husband.', 'unusual', 'ending', 'in', 'which,', 'although', 'crime', 'does', 'not', 'pay,', 'one', 'pays', 'in', 'a', 'way', 'not', 'directly', 'linked', 'to', 'the', 'crime.', 'excellent', 'direction,', 'script,', 'acting,', 'and', 'cinematography.', 'reportedly', 'no

In [10]:
# sadly train_iter isn't actually an iter..
# peek at a batch of data
batch = next(iter(train_iter))

In [11]:
# numericalized tokens
print(batch.text[0])
# sequence lengths
print(batch.text[1])

tensor([[  542,  5101,    41,  ...,     1,     1,     1],
        [   10,     7,    31,  ...,     1,     1,     1],
        [    0,    49, 15842,  ...,     1,     1,     1],
        ...,
        [   25,    86,   311,  ...,     1,     1,     1],
        [    9,   157,   168,  ...,     1,     1,     1],
        [   10,    49,   235,  ...,     1,     1,     1]])
tensor([132, 121,  72, 343, 112,  90, 226, 982, 118, 341, 145,  64, 149,  87,
        244, 270, 122,  94, 141,  76, 154, 151, 429,  97, 104,  76, 120, 125,
        120, 118, 134, 154])


In [12]:
batch.label

tensor([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
        0, 1, 0, 1, 1, 0, 1, 1])

In [13]:
# stoi := string-to-index
# check on the meaning of these zeroes and ones
LABEL.vocab.stoi

defaultdict(None, {'neg': 0, 'pos': 1})

## Define your network

In [14]:
# was having trouble with when model was defined in the notebook
# Can't get attribute ‘CNN' on <module ‘__main__'
from model import CNN

In [15]:
# smoketest
model = CNN(pretrained_embeddings=TEXT.vocab.vectors).to(device)
gpu_batch = batch.text[0].to(device)
model(gpu_batch)

tensor([[0.4698, 0.5302],
        [0.3791, 0.6209],
        [0.2715, 0.7285],
        [0.3220, 0.6780],
        [0.3979, 0.6021],
        [0.4197, 0.5803],
        [0.3409, 0.6591],
        [0.2695, 0.7305],
        [0.4400, 0.5600],
        [0.4209, 0.5791],
        [0.4640, 0.5360],
        [0.4475, 0.5525],
        [0.4261, 0.5739],
        [0.3032, 0.6968],
        [0.3452, 0.6548],
        [0.3803, 0.6197],
        [0.3741, 0.6259],
        [0.3796, 0.6204],
        [0.5005, 0.4995],
        [0.3706, 0.6294],
        [0.3142, 0.6858],
        [0.3772, 0.6228],
        [0.3281, 0.6719],
        [0.3190, 0.6810],
        [0.3340, 0.6660],
        [0.3948, 0.6052],
        [0.2351, 0.7649],
        [0.4538, 0.5462],
        [0.2321, 0.7679],
        [0.4226, 0.5774],
        [0.2449, 0.7551],
        [0.3471, 0.6529]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [16]:
del model

In [17]:
del gpu_batch

In [18]:
torch.cuda.empty_cache()

## Quick attempt at model training

In [19]:
import skorch
from skorch import NeuralNetClassifier
import torch.optim as optim

In [20]:
# this is a really unfortunate hack to make torchtext batching semantics work with skorch and dask
# the downside here is that we're no longer padding to the longest sequence in the batch, rather
# we're padding to the longest sequence in the *dataset*, which results in signifcantly more
# computation and thus significantly more time to train a model
# of course, you could set a max sequence length but that's not an ideal solution
# another solution would be to create a different dataset object, but then you can't use torchtext,
# which really is quite handy

# train=True shuffles the data
train_iter_skorch = torchtext.data.Iterator(train, batch_size=len(train), train=True, sort=False, device='cpu')
test_iter_skorch = torchtext.data.Iterator(test, batch_size=len(test), train=False, sort=False, device='cpu')

In [21]:
# takes some time to numericalize the whole dataset

# also notice that skorch and dask expect numpy arrays, which isn't ideal since it ties you to the cpu.
# meanwhile, projects like https://rapids.ai/ are moving toward all GPU computation, avoiding the cpu altogether.
for batch in train_iter_skorch:
    X_train = batch.text[0].numpy()
    y_train = batch.label.numpy()

for batch in test_iter_skorch:
    X_test = batch.text[0].numpy()
    y_test = batch.label.numpy()

In [22]:
# notice how awfully large the second dimension is
X_train.shape

(25000, 2470)

In [23]:
import torch.nn as nn

In [24]:
# NB: not ideal to be using softmax + log + NLLLoss
# see discussion: https://github.com/skorch-dev/skorch/issues/637
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                max_epochs=2,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                train_split=skorch.dataset.CVSplit(.2), # NB: this witholds 20% of the training data for validation
                module__n_filters=100,
                module__filter_sizes=(2,3,4),
                module__dropout=0.2,
                module__pretrained_embeddings=TEXT.vocab.vectors,
                batch_size=32,
                verbose=2)

In [25]:
skorch_model.fit(X_train, y_train)

  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.4689[0m       [32m0.8370[0m        [35m0.3656[0m  60.5744
      2        [36m0.3466[0m       [32m0.8524[0m        [35m0.3413[0m  58.6526


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=CNN(
    (embedding): Embedding(25002, 100)
    (conv_0): Conv1d(1, 100, kernel_size=(2, 100), stride=(1,))
    (conv_1): Conv1d(1, 100, kernel_size=(3, 100), stride=(1,))
    (conv_2): Conv1d(1, 100, kernel_size=(4, 100), stride=(1,))
    (fc): Linear(in_features=300, out_features=2, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  ),
)

In [26]:
# quick check on the test set accuracy
skorch_model.score(X_test, y_test)

0.85512

In [27]:
# random guessing would 50% accuracy so the model is indeed training
np.unique(y_test, return_counts=True)

(array([0, 1]), array([12500, 12500]))

In [28]:
# NB: this has no effect on GPU memory usage. If I keyboard interrupt, the workers get
# restarted and memory usage goes down. Deleting these "handler" objects doesn't delete
# GPU memory references on the workers. 
del skorch_model

## Grid search with Hyperband

In [29]:
# https://ml.dask.org/hyper-parameter-search.html#hyperband-parameters-rule-of-thumb
EPOCHS = 5
NUM_TRAINING_EXAMPLES = len(train)*.8
n_examples = EPOCHS * NUM_TRAINING_EXAMPLES
n_params = 8

# it's not immediately obvious to beginners how all these parameters interact with each other
max_iter = n_params
chunk_size = n_examples // n_params

In [30]:
# suppose we want to set max_iter to be the commensurate with the number of examples required
# for the model converge (as cited in the documentation)

# it's a bit unclear how n_params relates to BOTH the number of data points required
# for the model to converge AND how many hyperparameters to try out (i.e. n_iter in RandomizedSearchCV)

In [31]:
import math

In [32]:
# choose chunk size so that the remainder is not a tiny number
print(f'Chunk size: {chunk_size}')
print(f'Total chunks: {math.ceil(len(X_train) / chunk_size)}')
last_chunk_size = len(X_train) % chunk_size
if last_chunk_size == 0: # i.e. chunk_size evenly divides X_train
    last_chunk_size = chunk_size
print(f'Last chunk size: {last_chunk_size}')

assert (len(X_train) % chunk_size > 10 or len(X_train) % chunk_size == 0), 'Choose another chunk size'

Chunk size: 12500.0
Total chunks: 2
Last chunk size: 12500.0


In [33]:
import dask.array as da

X = da.from_array(X_train, chunks=(chunk_size, X_train.shape[-1]))
y = da.from_array(y_train, chunks=(chunk_size))

In [34]:
X

Unnamed: 0,Array,Chunk
Bytes,494.00 MB,247.00 MB
Shape,"(25000, 2470)","(12500, 2470)"
Count,3 Tasks,2 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 494.00 MB 247.00 MB Shape (25000, 2470) (12500, 2470) Count 3 Tasks 2 Chunks Type int64 numpy.ndarray",2470  25000,

Unnamed: 0,Array,Chunk
Bytes,494.00 MB,247.00 MB
Shape,"(25000, 2470)","(12500, 2470)"
Count,3 Tasks,2 Chunks
Type,int64,numpy.ndarray


In [35]:
# reinitialize and set train_split=None to let hyperband handle validation set splitting
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                train_split=None, # let hyperband handle it
                module__n_filters=100,
                module__filter_sizes=(2, 3, 4),
                module__dropout=0.2,
                module__pretrained_embeddings=TEXT.vocab.vectors,
                batch_size=32,
                verbose=2)

In [36]:
from scipy.stats import loguniform

In [37]:
# define parameter grid
params = {'module__filter_sizes': [(1, 2, 3), (2, 3, 4), (3, 4, 5)], 
          'module__n_filters': [25, 50, 100],
          'module__dropout': loguniform(1e-1, 3e-1),
          'batch_size': [32, 64],
         }

In [38]:
from dask_ml.model_selection import HyperbandSearchCV

In [39]:
search = HyperbandSearchCV(
    skorch_model,
    params,
    max_iter=max_iter,
    verbose=True,
    test_size=0.2 # validation size
)

In [40]:
search.metadata["partial_fit_calls"]

26

In [41]:
search.metadata['n_models']

5

In [42]:
import time

In [43]:
# to clear up any confusion, every time partial_fit is called, we're passing in chunk_size number of
# data points. Then skorch handles the batch size either by being set explicitly or as part of the param grid.

# to compare this grid search to number of epochs, we have 26 partial_fit calls * 10k data points = 260k examples
# with a training set size of 25k * .8 = 20k data points, this is 13 epochs!
# considering that it takes approximately 5 epochs to train a model, you would get through less than 3 sets of 
# hyperparameters if manually searching. Instead we'll search through ~5.

In [44]:
# it's been erroring out with a less than helpful error message. This started happening right around 
# when I started passing module__pretrained_embeddings=vocab.vectors. Unclear if this is the culprit.

# NB: this took ~3800 seconds on a single Nvidia GTX 980 Ti
# notice how the number of training datapoints relates to the chunk size and our test_size
# e.g. Validation set size: 2500 = 12500*.2, Train set size: 10000 = 12500 - 2500
start = time.time()
search.fit(X, y)
end = time.time()
duration = round(end - start, 2)
print(f'Time to complete grid search: {duration} seconds')

[CV, bracket=1] creating 3 models


  [<class 'skorch.classifier.NeuralNetClassifier'>[u ... .0721]]),
), 0]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
  % (format_bytes(len(b)), s)


[CV, bracket=0] creating 2 models
[CV, bracket=0] For training there are between 10000 and 10000 examples in each chunk
[CV, bracket=1] For training there are between 10000 and 10000 examples in each chunk
[CV, bracket=0] validation score of 0.8386 received after 1 partial_fit calls
[CV, bracket=1] validation score of 0.8306 received after 1 partial_fit calls
[CV, bracket=0] validation score of 0.8446 received after 8 partial_fit calls
[CV, bracket=1] validation score of 0.8430 received after 2 partial_fit calls
[CV, bracket=1] validation score of 0.8456 received after 6 partial_fit calls
Time to complete grid search: 3813.07 seconds


In [50]:
3800 / 60

63.333333333333336

## Integration

`HyperbandSearchCV` follows the Scikit-learn API and mirrors Scikit-learn's `RandomizedSearchCV`. This means that it "just works". All the Scikit-learn attributes and methods are available:

In [45]:
search.best_score_

0.8456

In [46]:
search.best_estimator_

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=CNN(
    (embedding): Embedding(25002, 100)
    (conv_0): Conv1d(1, 50, kernel_size=(3, 100), stride=(1,))
    (conv_1): Conv1d(1, 50, kernel_size=(4, 100), stride=(1,))
    (conv_2): Conv1d(1, 50, kernel_size=(5, 100), stride=(1,))
    (fc): Linear(in_features=150, out_features=2, bias=True)
    (dropout): Dropout(p=0.22181138802671369, inplace=False)
  ),
)

In [47]:
import pandas as pd

In [48]:
search.cv_results_

{'param_module__n_filters': array([50, 50, 50, 25, 50]),
 'test_score': array([0.8456, 0.7982, 0.733 , 0.843 , 0.8446]),
 'std_score_time': array([0.06216696, 0.19731486, 0.56393194, 1.33936262, 0.06445384]),
 'rank_test_score': array([1, 2, 3, 2, 1]),
 'mean_score_time': array([3.82462454, 4.0282222 , 5.47523713, 4.41082978, 5.29884744]),
 'params': array([{'batch_size': 64, 'module__dropout': 0.22181138802671369, 'module__filter_sizes': (3, 4, 5), 'module__n_filters': 50},
        {'batch_size': 64, 'module__dropout': 0.19188105885296006, 'module__filter_sizes': (3, 4, 5), 'module__n_filters': 50},
        {'batch_size': 32, 'module__dropout': 0.11893575283135104, 'module__filter_sizes': (3, 4, 5), 'module__n_filters': 50},
        {'batch_size': 32, 'module__dropout': 0.10900910968190201, 'module__filter_sizes': (2, 3, 4), 'module__n_filters': 25},
        {'batch_size': 32, 'module__dropout': 0.10009422737462631, 'module__filter_sizes': (3, 4, 5), 'module__n_filters': 50}],
       

In [49]:
cv_results = pd.DataFrame(search.cv_results_)
cv_results.head()

Exception: Data must be 1-dimensional

In [60]:
# issue with numpy converting array of tuples into a 2d array
search.cv_results_['param_module__filter_sizes'] = search.cv_results_['param_module__filter_sizes'].tolist()

In [61]:
cv_results = pd.DataFrame(search.cv_results_)
cv_results.head()

Unnamed: 0,param_module__n_filters,test_score,std_score_time,rank_test_score,mean_score_time,params,partial_fit_calls,param_module__dropout,std_partial_fit_time,mean_partial_fit_time,model_id,bracket,param_batch_size,param_module__filter_sizes
0,50,0.8456,0.062167,1,3.824625,"{'batch_size': 64, 'module__dropout': 0.221811...",6,0.221811,3.628802,93.614171,bracket=1-0,1,64,"[3, 4, 5]"
1,50,0.7982,0.197315,2,4.028222,"{'batch_size': 64, 'module__dropout': 0.191881...",2,0.191881,1.520352,96.362607,bracket=1-1,1,64,"[3, 4, 5]"
2,50,0.733,0.563932,3,5.475237,"{'batch_size': 32, 'module__dropout': 0.118935...",2,0.118936,11.099511,136.895765,bracket=1-2,1,32,"[3, 4, 5]"
3,25,0.843,1.339363,2,4.41083,"{'batch_size': 32, 'module__dropout': 0.109009...",8,0.109009,15.075696,143.696262,bracket=0-0,0,32,"[2, 3, 4]"
4,50,0.8446,0.064454,1,5.298847,"{'batch_size': 32, 'module__dropout': 0.100094...",8,0.100094,5.982249,130.647277,bracket=0-1,0,32,"[3, 4, 5]"


In [62]:
search.score(X_test, y_test)

0.83816

In [63]:
search.predict(X_test)

Unnamed: 0,Array,Chunk
Bytes,200.00 kB,200.00 kB
Shape,"(25000,)","(25000,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 200.00 kB 200.00 kB Shape (25000,) (25000,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",25000  1,

Unnamed: 0,Array,Chunk
Bytes,200.00 kB,200.00 kB
Shape,"(25000,)","(25000,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray


In [64]:
search.predict(X_test).compute()

array([0, 1, 1, ..., 0, 0, 0])

It also has some other attributes.

In [65]:
hist = pd.DataFrame(search.history_)
hist.head()

Unnamed: 0,model_id,params,partial_fit_calls,partial_fit_time,score,score_time,elapsed_wall_time,bracket
0,bracket=0-0,"{'batch_size': 32, 'module__dropout': 0.109009...",1,128.620566,0.8272,3.071467,358.527012,0
1,bracket=0-1,"{'batch_size': 32, 'module__dropout': 0.100094...",1,124.665028,0.8386,5.234394,358.527018,0
2,bracket=1-0,"{'batch_size': 64, 'module__dropout': 0.221811...",1,95.602639,0.7558,3.739039,460.180206,1
3,bracket=1-1,"{'batch_size': 64, 'module__dropout': 0.191881...",1,94.842256,0.7496,4.225537,460.180212,1
4,bracket=1-2,"{'batch_size': 32, 'module__dropout': 0.118935...",1,125.796254,0.8306,6.039169,460.180215,1


This illustrates the history after every `partial_fit` call. There's also an attributed `model_history_` that records the history for each model (it's a reorganization of `history_`).

## Learn more

This notebook covered basic usage `HyperbandSearchCV`. The following documentation and resources might be useful to learn more about `HyperbandSearchCV`, including some of the finer use cases:

* [A talk](https://www.youtube.com/watch?v=x67K9FiPFBQ) introducing `HyperbandSearchCV` to the SciPy 2019 audience and the [corresponding paper](https://conference.scipy.org/proceedings/scipy2019/pdfs/scott_sievert.pdf)
* [HyperbandSearchCV's documentation](https://ml.dask.org/modules/generated/dask_ml.model_selection.HyperbandSearchCV.html)

Performance comparisons can be found in the SciPy 2019 talk/paper.