In [1]:
!pip install -q dask_cuda torch torchtext skorch
!pip -q install dask[dataframe] --upgrade

# Hyperparameter optimization with Skorch


## Setup Dask

In [2]:
import torch
from dask_cuda import LocalCUDACluster
from distributed import Client

# if you have GPU(s), use dask_cuda to automatically make use of them in your dask cluster
if torch.cuda.is_available():
    cluster = LocalCUDACluster()
    client = Client(cluster)
else:
    client = Client(processes=False, threads_per_worker=4,
                    n_workers=1, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:42667  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.68 GB


In [3]:
# for reproducibility
import random
import numpy as np

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Create Data

In [4]:
import torchtext
from torchtext import data
from torchtext import datasets

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# takes approx. 10 minutes to download data and embeddings (will be cached for re-use)

# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False, unk_token=None)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# will be used to initialize model embeddings layer
vocab = torchtext.vocab.GloVe(name='6B', dim=100)

# build the vocabulary
max_size = 25_000 # shorten for demonstrative purposes
TEXT.build_vocab(train, vectors=vocab, max_size=max_size)
LABEL.build_vocab(train)

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_sizes=(32, 64), device=device)

In [7]:
# itos := index-to-string
# note the 2 extra tokens added for us: '<unk>', '<pad>'
TEXT.vocab.itos[:5]

['<unk>', '<pad>', 'the', 'a', 'and']

In [8]:
assert (len(TEXT.vocab.itos) == max_size + 2)

In [9]:
# peek at the data
print(train.examples[0].text)
print()
print(train.examples[0].label)

['a', 'masterful', 'treatment', 'of', 'james', "caine's", '"the', 'postman', 'always', 'rings', 'twice"', 'as', 'luchino', "visconti's", 'first', 'film', 'shot', 'primarily', 'around', 'ferrara', 'in', 'a', 'soulless', 'war-torn', 'italy.', 'the', 'original', 'negative', 'was', 'thought', 'destroyed', 'but', 'visconti', 'saved', 'a', 'print', 'and', 'fortunately', 'we', 'can', 'see', 'this', 'early', 'neo-realist', 'work', 'today.', 'a', 'ruggedly', 'handsome', 'massimo', 'girotti', 'and', 'clara', 'calamai', '(who', 'had', 'recently', 'revealed', 'her', 'breasts', 'in', 'la', 'cena', 'delle', 'beffe"', '(1941),', 'star', 'as', 'the', 'sensually-charged', 'and', 'ill-fated', 'lovers', 'who', 'plot', 'to', 'kill', 'her', 'husband.', 'unusual', 'ending', 'in', 'which,', 'although', 'crime', 'does', 'not', 'pay,', 'one', 'pays', 'in', 'a', 'way', 'not', 'directly', 'linked', 'to', 'the', 'crime.', 'excellent', 'direction,', 'script,', 'acting,', 'and', 'cinematography.', 'reportedly', 'no

In [10]:
# sadly train_iter isn't actually an iter..
# peek at a batch of data
batch = next(iter(train_iter))

In [11]:
# numericalized tokens
print(batch.text[0])
# sequence lengths
print(batch.text[1])

tensor([[  45,  123,   25,  ...,    1,    1,    1],
        [5311, 3868,    2,  ...,    1,    1,    1],
        [ 133,    8,   42,  ...,    1,    1,    1],
        ...,
        [   9,  176,   10,  ...,    1,    1,    1],
        [1467,   10,   30,  ...,    1,    1,    1],
        [  82, 1297,    9,  ...,    1,    1,    1]], device='cuda:0')
tensor([147, 347, 361,  66, 147, 262, 336, 105, 255, 132, 128, 162, 122, 184,
        195, 100, 226, 127, 152, 159, 120, 204, 267, 209, 129, 486,  95, 151,
        140, 247, 127, 167], device='cuda:0')


In [12]:
batch.label

tensor([1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0], device='cuda:0')

In [13]:
# stoi := string-to-index
# check on the meaning of these zeroes and ones
LABEL.vocab.stoi

defaultdict(None, {'neg': 0, 'pos': 1})

## Define your network

In [14]:
# was having trouble with when model was defined in the notebook
# Can't get attribute ‘CNN' on <module ‘__main__'
from model import CNN

In [15]:
# smoketest
model = CNN(pretrained_embeddings=vocab.vectors).to(device)
model(batch.text[0].to(device))

tensor([[0.4709, 0.5291],
        [0.2993, 0.7007],
        [0.2609, 0.7391],
        [0.3476, 0.6524],
        [0.4323, 0.5677],
        [0.3394, 0.6606],
        [0.2745, 0.7255],
        [0.3152, 0.6848],
        [0.4568, 0.5432],
        [0.4189, 0.5811],
        [0.4351, 0.5649],
        [0.4204, 0.5796],
        [0.3963, 0.6037],
        [0.3482, 0.6518],
        [0.3528, 0.6472],
        [0.4137, 0.5863],
        [0.3327, 0.6673],
        [0.3380, 0.6620],
        [0.4874, 0.5126],
        [0.2904, 0.7096],
        [0.2868, 0.7132],
        [0.3196, 0.6804],
        [0.3689, 0.6311],
        [0.2525, 0.7475],
        [0.2493, 0.7507],
        [0.3308, 0.6692],
        [0.2088, 0.7912],
        [0.4834, 0.5166],
        [0.2363, 0.7637],
        [0.4511, 0.5489],
        [0.2282, 0.7718],
        [0.3502, 0.6498]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [16]:
del model

## Quick attempt at model training

In [17]:
import skorch
from skorch import NeuralNetClassifier
import torch.optim as optim

In [18]:
# this is a really unfortunate hack to make torchtext batching semantics work with skorch and dask
# the downside here is that we're no longer padding to the longest sequence in the batch, rather
# we're padding to the longest sequence in the *dataset*, which results in signifcantly more
# computation and thus significantly more time to train a model
# of course, you could set a max sequence length but that's not an ideal solution
# another solution would be to create a different dataset object, but then you can't use torchtext,
# which really is quite handy

# train=True shuffles the data
train_iter_skorch = torchtext.data.Iterator(train, batch_size=len(train), train=True, sort=False, device='cpu')
test_iter_skorch = torchtext.data.Iterator(test, batch_size=len(test), train=False, sort=False, device='cpu')

In [19]:
# takes some time to numericalize the whole dataset

# also notice that skorch and dask expect numpy arrays, which isn't ideal since it ties you to the cpu.
# meanwhile, projects like https://rapids.ai/ are moving toward all GPU computation, avoiding the cpu altogether.
for batch in train_iter_skorch:
    X_train = batch.text[0].numpy()
    y_train = batch.label.numpy()

for batch in test_iter_skorch:
    X_test = batch.text[0].numpy()
    y_test = batch.label.numpy()

In [20]:
# notice how awfully large the second dimension is
X_train.shape

(25000, 2470)

In [21]:
import torch.nn as nn

In [22]:
# NB: not ideal to be using softmax + log + NLLLoss
# see discussion: https://github.com/skorch-dev/skorch/issues/637
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                max_epochs=2,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                train_split=skorch.dataset.CVSplit(.2), # NB: this witholds 20% of the training data for validation
                module__n_filters=100,
                module__filter_sizes=(2,3,4),
                module__dropout=0.2,
                module__pretrained_embeddings=vocab.vectors,
                batch_size=32,
                verbose=2)

In [23]:
skorch_model.fit(X_train, y_train)

  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.6302[0m       [32m0.7516[0m        [35m0.5243[0m  66.5468
      2        [36m0.4797[0m       [32m0.7994[0m        [35m0.4413[0m  59.9177


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=CNN(
    (embedding): Embedding(400000, 100)
    (conv_0): Conv1d(1, 100, kernel_size=(2, 100), stride=(1,))
    (conv_1): Conv1d(1, 100, kernel_size=(3, 100), stride=(1,))
    (conv_2): Conv1d(1, 100, kernel_size=(4, 100), stride=(1,))
    (fc): Linear(in_features=300, out_features=2, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  ),
)

In [24]:
# quick check on the test set accuracy
skorch_model.score(X_test, y_test)

0.79112

In [25]:
# random guessing would 50% accuracy so the model is indeed training
np.unique(y_test, return_counts=True)

(array([0, 1]), array([12500, 12500]))

In [26]:
del skorch_model

## Grid search with Hyperband

In [27]:
# https://ml.dask.org/hyper-parameter-search.html#hyperband-parameters-rule-of-thumb
EPOCHS = 5
NUM_TRAINING_EXAMPLES = len(train)*.8
n_examples = EPOCHS * NUM_TRAINING_EXAMPLES
n_params = 8

# it's not immediately obvious to beginners how all these parameters interact with each other
max_iter = n_params
chunk_size = n_examples // n_params

In [28]:
# suppose we want to set max_iter to be the commensurate with the number of examples required
# for the model converge (as cited in the documentation)

# it's a bit unclear how n_params relates to BOTH the number of data points required
# for the model to converge AND how many hyperparameters to try out (i.e. n_iter in RandomizedSearchCV)

In [29]:
import math

In [30]:
# choose chunk size so that the remainder is not a tiny number
print(f'Chunk size: {chunk_size}')
print(f'Total chunks: {math.ceil(len(X_train) / chunk_size)}')
last_chunk_size = len(X_train) % chunk_size
if last_chunk_size == 0: # i.e. chunk_size evenly divides X_train
    last_chunk_size = chunk_size
print(f'Last chunk size: {last_chunk_size}')

assert (len(X_train) % chunk_size > 10 or len(X_train) % chunk_size == 0), 'Choose another chunk size'

Chunk size: 12500.0
Total chunks: 2
Last chunk size: 12500.0


In [31]:
import dask.array as da

X = da.from_array(X_train, chunks=(chunk_size, X_train.shape[-1]))
y = da.from_array(y_train, chunks=(chunk_size))

In [32]:
X

Unnamed: 0,Array,Chunk
Bytes,494.00 MB,247.00 MB
Shape,"(25000, 2470)","(12500, 2470)"
Count,3 Tasks,2 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 494.00 MB 247.00 MB Shape (25000, 2470) (12500, 2470) Count 3 Tasks 2 Chunks Type int64 numpy.ndarray",2470  25000,

Unnamed: 0,Array,Chunk
Bytes,494.00 MB,247.00 MB
Shape,"(25000, 2470)","(12500, 2470)"
Count,3 Tasks,2 Chunks
Type,int64,numpy.ndarray


In [33]:
import torch.nn as nn

In [35]:
# reinitialize and set train_split=None to let hyperband handle validation set splitting
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                train_split=None, # let hyperband handle it
                module__n_filters=100,
                module__filter_sizes=(2, 3, 4),
                module__dropout=0.2,
                module__pretrained_embeddings=vocab.vectors,
                batch_size=32,
                verbose=2)

In [36]:
from scipy.stats import loguniform

In [37]:
# define parameter grid
params = {'module__filter_size': [(1,2,3), (2, 3, 4), (3, 4, 5)], 
          'module__n_filters': [25,50,100],
          'module__dropout': loguniform(1e-1, 3e-1),
          'batch_size': [32, 64],
         }

In [38]:
from dask_ml.model_selection import HyperbandSearchCV

In [39]:
search = HyperbandSearchCV(
    skorch_model,
    params,
    max_iter=max_iter,
    verbose=True,
    test_size=0.2 # validation size
)

In [40]:
search.metadata["partial_fit_calls"]

26

In [41]:
import time

In [42]:
# to clear up any confusion, every time partial_fit is called, we're passing in chunk_size number of
# data points. Then skorch handles the batch size either by being set explicitly or as part of the param grid.

# to compare this grid search to number of epochs, we have 26 partial_fit calls * 10k data points = 260k examples
# with a training set size of 25k * .8 = 20k data points, this is 13 epochs!
# considering that it takes approximately 5 epochs to train a model, you would get through less than 3 sets of 
# hyperparameters if manually searching. Instead we'll search through ~5.

In [43]:
# it's been erroring out with a less than helpful error message. This started happening right around 
# when I started passing module__pretrained_embeddings=vocab.vectors. Unclear if this is the culprit.

# NB: this took ~900 seconds on a single Nvidia GTX 980 Ti
# notice how the number of training datapoints relates to the chunk size and our test_size
# e.g. Validation set size: 2500 = 12500*.2, Train set size: 10000 = 12500 - 2500
start = time.time()
search.fit(X, y)
end = time.time()
duration = round(end - start, 2)
print(f'Time to complete grid search: {duration} seconds')

[CV, bracket=1] creating 3 models


  [<class 'skorch.classifier.NeuralNetClassifier'>[u ... .1570]]),
), 0]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
  % (format_bytes(len(b)), s)


[CV, bracket=0] creating 2 models
[CV, bracket=0] For training there are between 10000 and 10000 examples in each chunk
[CV, bracket=1] For training there are between 10000 and 10000 examples in each chunk


tornado.application - ERROR - Multiple exceptions in yield list
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 849, in callback
    result_list.append(f.result())
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1107, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.7/site-packages/dask_ml/model_selection/_incremental.py", line 625, in _fit
    prefix=self.prefix,
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1099, in run
    value = future.result()
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1107, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.7/site-packages/dask_ml/model_selection/_incremental.py", line 233, in _fit
    metas = yield client.gather(new_scores)
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1099, in run
    value = future.result()
  File "/opt/conda/lib/python3.7/s

BdbQuit: 

## Integration

`HyperbandSearchCV` follows the Scikit-learn API and mirrors Scikit-learn's `RandomizedSearchCV`. This means that it "just works". All the Scikit-learn attributes and methods are available:

In [None]:
search.best_score_

In [None]:
search.best_estimator_

In [None]:
import pandas as pd

In [None]:
search.cv_results_

In [None]:
cv_results = pd.DataFrame(search.cv_results_)
cv_results.head()

In [None]:
search.score(X_test, y_test)

In [None]:
search.predict(X_test)

In [None]:
search.predict(X_test).compute()

It also has some other attributes.

In [None]:
hist = pd.DataFrame(search.history_)
hist.head()

This illustrates the history after every `partial_fit` call. There's also an attributed `model_history_` that records the history for each model (it's a reorganization of `history_`).

## Learn more

This notebook covered basic usage `HyperbandSearchCV`. The following documentation and resources might be useful to learn more about `HyperbandSearchCV`, including some of the finer use cases:

* [A talk](https://www.youtube.com/watch?v=x67K9FiPFBQ) introducing `HyperbandSearchCV` to the SciPy 2019 audience and the [corresponding paper](https://conference.scipy.org/proceedings/scipy2019/pdfs/scott_sievert.pdf)
* [HyperbandSearchCV's documentation](https://ml.dask.org/modules/generated/dask_ml.model_selection.HyperbandSearchCV.html)

Performance comparisons can be found in the SciPy 2019 talk/paper.