In [1]:
# !pip install -q dask_cuda torch torchtext skorch
# !pip -q install dask[dataframe] --upgrade

# Hyperparameter optimization with Skorch


## Setup Dask Cluster

In [2]:
import math
import random
import time

import dask.array as da
from dask_cuda import LocalCUDACluster
from dask_ml.model_selection import HyperbandSearchCV
from distributed import Client
import numpy as np
import pandas as pd
from scipy.stats import loguniform
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import skorch
from skorch import NeuralNetClassifier
from skorch.helper import SliceDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext import data
from torchtext import datasets

In [3]:
# if you have GPU(s), use dask_cuda to automatically make use of them in your dask cluster
if torch.cuda.is_available():
    cluster = LocalCUDACluster()
    client = Client(cluster)
else:
    client = Client(processes=False, threads_per_worker=4,
                    n_workers=1, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:43999  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 31.63 GB


In [4]:
# for reproducibility
# NB: enabling reproducibility can significantly slow down runtimes
reproducible = False
if reproducible:
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

## Create Data

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# this solves many of our later problems but isn't an ideal solution
# accuracy will take a hit
FIX_LENGTH = 512

In [7]:
# a few seconds to download IMDB dataset (84Mb, will be cached)
# approx. 10 minutes to download glove embeddings (862Mb, will be cached)

# set up fields
TEXT = data.Field(lower=True, batch_first=True, fix_length=FIX_LENGTH)
LABEL = data.Field(sequential=False, unk_token=None)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# work with 5k datapoints for faster iteration times
split_ratio = 5_000 / len(train)
train, discard = train.split(split_ratio=split_ratio)

split_ratio = 5_000 / len(test)
test, discard = test.split(split_ratio=split_ratio)

# will be used to initialize model embeddings layer
vocab = torchtext.vocab.GloVe(name='6B', dim=100)

# build the vocabulary
max_size = 25_000 # shorten for demonstrative purposes
TEXT.build_vocab(train, vectors=vocab, max_size=max_size)
LABEL.build_vocab(train)

In [8]:
# itos := index-to-string
# note the 2 extra tokens added for us: '<unk>', '<pad>'
TEXT.vocab.itos[:5]

['<unk>', '<pad>', 'the', 'and', 'a']

In [9]:
# stoi := string-to-index
# check on the meaning of these zeroes and ones
LABEL.vocab.stoi

defaultdict(None, {'pos': 0, 'neg': 1})

In [10]:
assert (len(TEXT.vocab.itos) == max_size + 2)

In [11]:
# peek at the data
print(train.examples[0].text[:10], '...')
print()
print(train.examples[0].label)

['i', "couldn't", 'hold', 'back', 'the', 'tears', 'when', 'i', 'watched', 'this'] ...

pos


In [12]:
# custom dataset class required to work with Skorch
class TorchDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        example = self.dataset.examples[idx]
        return example.text, example.label
    
    def __len__(self):
        return len(self.dataset)

In [13]:
train_dataset = TorchDataset(train)
test_dataset = TorchDataset(test)

In [14]:
tokens, label = train_dataset[0]

In [15]:
print(tokens[:10], '...')
print()
print(label)

['i', "couldn't", 'hold', 'back', 'the', 'tears', 'when', 'i', 'watched', 'this'] ...

pos


In [16]:
# custom collate function for DataLoader
def pad_batch(batch, TEXT, LABEL):
    text, label = list(zip(*batch))
    # numericalized and padded text representation
    text_processed = TEXT.process(text)
    label_processed = LABEL.process(label)
    return text_processed, label_processed

from functools import partial

pad_batch_partial = partial(pad_batch, TEXT=TEXT, LABEL=LABEL)

In [17]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_batch_partial)

In [18]:
batch = next(iter(train_dataloader))

In [19]:
processed_examples, labels = batch

In [20]:
print(processed_examples[0][:10], '...')
print()
print(labels[0])

tensor([   9,   20,    7, 3765,   23,    8,   54,  692,    2, 2384]) ...

tensor(1)


## Define your network

In [21]:
# would have defined the class in this notebook but was getting the following error from Hyperband
# PicklingError: Can't pickle <class '__main__.CNN'>: attribute lookup CNN on __main__ failed
from model import CNN

In [22]:
# smoketest
model = CNN(pretrained_embeddings=TEXT.vocab.vectors).to(device)
gpu_batch = batch[0].to(device)
model_out = model(gpu_batch)
print(model_out)

tensor([[0.5000, 0.5000],
        [0.5174, 0.4826],
        [0.4599, 0.5401],
        [0.3472, 0.6528],
        [0.3888, 0.6112],
        [0.4259, 0.5741],
        [0.3734, 0.6266],
        [0.3527, 0.6473],
        [0.4275, 0.5725],
        [0.4277, 0.5723],
        [0.5281, 0.4719],
        [0.4183, 0.5817],
        [0.4409, 0.5591],
        [0.4205, 0.5795],
        [0.4820, 0.5180],
        [0.3552, 0.6448],
        [0.3843, 0.6157],
        [0.3047, 0.6953],
        [0.5312, 0.4688],
        [0.4069, 0.5931],
        [0.3691, 0.6309],
        [0.3541, 0.6459],
        [0.2763, 0.7237],
        [0.4770, 0.5230],
        [0.3749, 0.6251],
        [0.4165, 0.5835],
        [0.4208, 0.5792],
        [0.5268, 0.4732],
        [0.4046, 0.5954],
        [0.5047, 0.4953],
        [0.3795, 0.6205],
        [0.4030, 0.5970]], device='cuda:0', grad_fn=<SoftmaxBackward>)


In [23]:
del model

In [24]:
del gpu_batch

In [25]:
torch.cuda.empty_cache()

## Quick attempt at model training to debug any issues

In [26]:
# if you want to use a custom DataLoader, you must use NeuralNet
# also, not immediately obvious that for NeuralNet you are responsible for applying the log function
# whereas for NeuralNetClassifier, you are not

# NB: not ideal to be using softmax + log + NLLLoss
# see discussion: https://github.com/skorch-dev/skorch/issues/637
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                max_epochs=2,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                iterator_train=DataLoader,
                iterator_train__shuffle=True,
                iterator_train__batch_size=32,
                iterator_train__collate_fn=pad_batch_partial,
                iterator_train__num_workers=8,
                iterator_valid=DataLoader,
                iterator_valid__shuffle=False,
                iterator_valid__batch_size=64,
                iterator_valid__collate_fn=pad_batch_partial,
                iterator_valid__num_workers=8,
                train_split=skorch.dataset.CVSplit(.2), # NB: this witholds 20% of the training data for validation
                module__n_filters=100,
                module__filter_sizes=(2,3,4),
                module__dropout=0.2,
                module__pretrained_embeddings=TEXT.vocab.vectors,
                verbose=2)
# getting the following error when trying to compute accuracy
# ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets
#                 callbacks=callbacks)

In [27]:
skorch_model.fit(train_dataset, y=None)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.6258[0m       [32m0.7930[0m        [35m0.5002[0m  2.0929
      2        [36m0.4405[0m       [32m0.8250[0m        [35m0.3986[0m  1.9010


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=CNN(
    (embedding): Embedding(25002, 100)
    (conv_0): Conv1d(1, 100, kernel_size=(2, 100), stride=(1,))
    (conv_1): Conv1d(1, 100, kernel_size=(3, 100), stride=(1,))
    (conv_2): Conv1d(1, 100, kernel_size=(4, 100), stride=(1,))
    (fc): Linear(in_features=300, out_features=2, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  ),
)

In [28]:
# https://github.com/skorch-dev/skorch/issues/641

# skorch_model.score(test_dataset)
# TypeError: score() missing 1 required positional argument: 'y'
# skorch_model.score(test_dataset, y=None)
# ValueError: Expected array-like (array or non-string sequence), got None

# can monkey patch skorch_model to achieve native scoring
# def score(self, X, y=None): 
#     ds = self.get_dataset(X) 
#     target_iterator = self.get_iterator(ds, training=False) 
    
#     y_true = np.concatenate([skorch.utils.to_numpy(y) for _, y in target_iterator]) 
#     y_pred = self.predict(X)
    
#     return accuracy_score(y_true, y_pred) 

In [29]:
# score manually
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False, collate_fn=pad_batch_partial, num_workers=8)

In [30]:
# test set accuracy
test_preds = skorch_model.predict(test_dataset)
processed_test_data = next(iter(test_dataloader))
test_labels = processed_test_data[1].numpy()
print(accuracy_score(test_labels, test_preds))

0.8006


In [31]:
# random guessing would 50% accuracy so the model is indeed training well
np.unique(test_labels, return_counts=True)

(array([0, 1]), array([2554, 2446]))

In [32]:
# NB: this has no effect on GPU memory usage. If I keyboard interrupt, the workers get
# restarted and memory usage goes down. Deleting these "handler" objects doesn't delete
# GPU memory references on the workers. 
# del skorch_model

### Grid search with Skorch

In [33]:
# define parameter grid
params = {'module__filter_sizes': [(1, 2, 3), (2, 3, 4), (3, 4, 5)], 
          'module__n_filters': [25, 50, 100],
          'module__dropout': loguniform(1e-1, 3e-1),
          'batch_size': [32, 64],
         }

skorch_search = RandomizedSearchCV(skorch_model, params, n_iter=2, cv=5)

In [34]:
# This errors out with: TypeError: fit() missing 1 required positional argument: 'y'
# skorch_search.fit(train_dataset, y=None)

In [35]:
# https://github.com/skorch-dev/skorch/issues/605#issuecomment-650580286

In [36]:
# ValueError: Dataset does not have consistent lengths.
# dummy_y = np.zeros((len(train_dataset)))
# skorch_search.fit(train_dataset, y=dummy_y)

In [37]:
# ValueError: Dataset does not have consistent lengths.
# y = torch.cat([LABEL.process([pair[1]]) for pair in train_dataset]).numpy()
# skorch_search.fit(train_dataset, y=y)

In [38]:
# ValueError: Dataset does not have consistent lengths.
# skorch_search.fit(train_dataset, y=SliceDataset(train_dataset, idx=1))

## Grid search with Hyperband

This is a really unfortunate hack to make deep learning batching semantics work with `Skorch` and `Dask`. The downside here is that we're no longer padding to the longest sequence in the batch, rather we're padding to the longest sequence in the *dataset*, which results in signifcantly more computation and thus significantly more time to train a model.

Our solution was to set a max sequence length but that's not an ideal solution since you're still performing extra computation and accuracy does suffer.

In [39]:
# train=True shuffles the data
train_iter_skorch = torchtext.data.Iterator(train, batch_size=len(train), train=True, sort=False, device='cpu')
test_iter_skorch = torchtext.data.Iterator(test, batch_size=len(test), train=False, sort=False, device='cpu')

In [40]:
# takes some time to numericalize the whole dataset

# also notice that skorch and dask expect numpy arrays, which isn't ideal since it ties you to the cpu.
# meanwhile, projects like https://rapids.ai/ are moving toward all GPU computation, avoiding the cpu altogether.
for batch in train_iter_skorch:
    X_train = batch.text.numpy()
    y_train = batch.label.numpy()

In [41]:
for batch in test_iter_skorch:
    X_test = batch.text.numpy()
    y_test = batch.label.numpy()

In [42]:
# notice how awfully large the second dimension is
X_train.shape

(5000, 512)

In [49]:
# https://ml.dask.org/hyper-parameter-search.html#hyperband-parameters-rule-of-thumb
EPOCHS = 10
NUM_TRAINING_EXAMPLES = len(train)*.8
n_examples = EPOCHS * NUM_TRAINING_EXAMPLES
n_params = 12

# it's not immediately obvious to beginners how all these parameters interact with each other
max_iter = n_params
chunk_size = n_examples // n_params

In [50]:
# suppose we want to set max_iter to be the commensurate with the number of examples required
# for the model converge (as cited in the documentation)

# it's a bit unclear how n_params relates to BOTH the number of data points required
# for the model to converge AND how many hyperparameters to try out (i.e. n_iter in RandomizedSearchCV)

In [51]:
# choose chunk size so that the remainder is not a tiny number
print(f'Chunk size: {chunk_size}')
print(f'Total chunks: {math.ceil(len(train) / chunk_size)}')
last_chunk_size = len(train) % chunk_size
if last_chunk_size == 0: # i.e. chunk_size evenly divides X_train
    last_chunk_size = chunk_size
print(f'Last chunk size: {last_chunk_size}')

assert (len(train) % chunk_size > 10 or len(train) % chunk_size == 0), 'Choose another chunk size'

Chunk size: 3333.0
Total chunks: 2
Last chunk size: 1667.0


In [52]:
X = da.from_array(X_train, chunks=(chunk_size, X_train.shape[-1]))
y = da.from_array(y_train, chunks=(chunk_size))

In [53]:
X

Unnamed: 0,Array,Chunk
Bytes,20.48 MB,13.65 MB
Shape,"(5000, 512)","(3333, 512)"
Count,3 Tasks,2 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 20.48 MB 13.65 MB Shape (5000, 512) (3333, 512) Count 3 Tasks 2 Chunks Type int64 numpy.ndarray",512  5000,

Unnamed: 0,Array,Chunk
Bytes,20.48 MB,13.65 MB
Shape,"(5000, 512)","(3333, 512)"
Count,3 Tasks,2 Chunks
Type,int64,numpy.ndarray


TLDR; you can't use dask arrays with `torch.utils.data.Dataloader`, which means you have to do all your data preparation ahead of time

In [54]:
# raw_train_dataset = [x for x in train_dataset]
# raw_train_dataset_array = np.array(raw_train_dataset, dtype=object)
# dask_dataset = da.from_array(raw_train_dataset_array, chunks=(chunk_size))
# dask_dataset[0].compute()

# TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'dask.array.core.Array'>
# data_iter = DataLoader(dask_dataset)
# next(iter(data_iter))

# # TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object
# np_data_iter = DataLoader(raw_train_dataset_array)
# next(iter(np_data_iter))

In [55]:
# reinitialize and set train_split=None to let hyperband handle validation set splitting
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                iterator_train__batch_size=32,
                iterator_valid__batch_size=64,
                train_split=None, # let hyperband handle it
                module__n_filters=100,
                module__filter_sizes=(2, 3, 4),
                module__dropout=0.2,
                module__pretrained_embeddings=TEXT.vocab.vectors,
                # module__TEXT=TEXT,
                batch_size=32,
                verbose=2)

In [56]:
# define parameter grid
params = {'module__filter_sizes': [(1, 2, 3), (2, 3, 4), (3, 4, 5)], 
          'module__n_filters': [25, 50, 100],
          'module__dropout': loguniform(1e-1, 3e-1),
          'batch_size': [32, 64],
         }

In [57]:
search = HyperbandSearchCV(
    skorch_model,
    params,
    max_iter=max_iter,
    verbose=True,
    test_size=0.2 # validation size
)

In [58]:
search.metadata["partial_fit_calls"]

85

In [59]:
search.metadata['n_models']

17

In [60]:
# to clear up any confusion, every time partial_fit is called, we're passing in chunk_size number of
# data points. Then skorch handles the batch size either by being set explicitly or as part of the param grid.

# to compare this grid search to number of epochs, we have 26 partial_fit calls * 10k data points = 260k examples
# with a training set size of 25k * .8 = 20k data points, this is 13 epochs!
# considering that it takes approximately 5 epochs to train a model, you would get through less than 3 sets of 
# hyperparameters if manually searching. Instead we'll search through ~5.

Running training on a Nvidia Tesla T4\ 

In [61]:
# notice how the number of training datapoints relates to the chunk size and our test_size
# Train set chunk size: 800 = 1000*(1-.2)
# Validation set chunk size: 200 = 1000*.2
start = time.time()
search.fit(X, y)
end = time.time()
duration = round(end - start, 2)
print(f'Time to complete grid search: {duration} seconds')

[CV, bracket=2] creating 9 models
[CV, bracket=1] creating 5 models


  [<class 'skorch.classifier.NeuralNetClassifier'>[u ... .0000]]),
), 0]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
  % (format_bytes(len(b)), s)


[CV, bracket=0] creating 3 models
[CV, bracket=0] For training there are between 1333 and 2666 examples in each chunk
[CV, bracket=1] For training there are between 1333 and 2666 examples in each chunk
[CV, bracket=2] For training there are between 1333 and 2666 examples in each chunk
[CV, bracket=0] validation score of 0.7982 received after 1 partial_fit calls
[CV, bracket=1] validation score of 0.8032 received after 1 partial_fit calls
[CV, bracket=2] validation score of 0.7842 received after 1 partial_fit calls
[CV, bracket=0] validation score of 0.8551 received after 12 partial_fit calls
[CV, bracket=1] validation score of 0.8322 received after 4 partial_fit calls
[CV, bracket=2] validation score of 0.8212 received after 3 partial_fit calls
[CV, bracket=1] validation score of 0.8162 received after 12 partial_fit calls
[CV, bracket=2] validation score of 0.8062 received after 9 partial_fit calls
Time to complete grid search: 372.02 seconds


## Integration

`HyperbandSearchCV` follows the Scikit-learn API and mirrors Scikit-learn's `RandomizedSearchCV`. This means that it "just works". All the Scikit-learn attributes and methods are available:

In [62]:
search.best_score_

0.8551448551448552

In [63]:
search.best_estimator_

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=CNN(
    (embedding): Embedding(25002, 100)
    (conv_0): Conv1d(1, 50, kernel_size=(1, 100), stride=(1,))
    (conv_1): Conv1d(1, 50, kernel_size=(2, 100), stride=(1,))
    (conv_2): Conv1d(1, 50, kernel_size=(3, 100), stride=(1,))
    (fc): Linear(in_features=150, out_features=2, bias=True)
    (dropout): Dropout(p=0.12476236679704862, inplace=False)
  ),
)

In [64]:
search.cv_results_

{'param_module__n_filters': array([100, 100,  25,  50,  25,  25,  25,  50,  50,  25, 100,  25, 100,
        100,  50, 100, 100]),
 'param_batch_size': array([32, 64, 64, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 32, 32, 64, 32]),
 'param_module__dropout': array([0.14233276, 0.15454961, 0.273504  , 0.26444226, 0.13400896,
        0.19302409, 0.15142034, 0.23147478, 0.28516434, 0.15385363,
        0.18147304, 0.29957988, 0.10990513, 0.29003914, 0.12476237,
        0.14516028, 0.10459089]),
 'std_partial_fit_time': array([0.        , 1.42792781, 0.        , 0.        , 0.8889792 ,
        0.94623792, 0.        , 0.        , 0.        , 0.84567356,
        1.6097213 , 0.79452193, 1.30084145, 1.54377656, 0.04432201,
        0.26346576, 0.31667006]),
 'param_module__filter_sizes': array([[3, 4, 5],
        [2, 3, 4],
        [1, 2, 3],
        [2, 3, 4],
        [2, 3, 4],
        [3, 4, 5],
        [1, 2, 3],
        [3, 4, 5],
        [1, 2, 3],
        [2, 3, 4],
        [2, 3, 4],
        

In [65]:
# issue with numpy converting array of tuples into a 2d array
try:    
    cv_results = pd.DataFrame(search.cv_results_)
    cv_results.head()
except Exception as e:
    print(e)

Data must be 1-dimensional


In [66]:
# issue with numpy converting array of tuples into a 2d array
search.cv_results_['param_module__filter_sizes'] = search.cv_results_['param_module__filter_sizes'].tolist()

In [67]:
cv_results = pd.DataFrame(search.cv_results_)
cv_results.head()

Unnamed: 0,param_module__n_filters,param_batch_size,param_module__dropout,std_partial_fit_time,param_module__filter_sizes,test_score,model_id,bracket,rank_test_score,std_score_time,params,mean_partial_fit_time,mean_score_time,partial_fit_calls
0,100,32,0.142333,0.0,"[3, 4, 5]",0.736264,bracket=2-0,2,9,0.0,"{'batch_size': 32, 'module__dropout': 0.142332...",3.082313,0.165004,1
1,100,64,0.15455,1.427928,"[2, 3, 4]",0.806194,bracket=2-1,2,1,0.00368,"{'batch_size': 64, 'module__dropout': 0.154549...",4.203769,0.176866,9
2,25,64,0.273504,0.0,"[1, 2, 3]",0.778222,bracket=2-2,2,5,0.0,"{'batch_size': 64, 'module__dropout': 0.273504...",1.613707,0.05371,1
3,50,32,0.264442,0.0,"[2, 3, 4]",0.779221,bracket=2-3,2,4,0.0,"{'batch_size': 32, 'module__dropout': 0.264442...",1.775565,0.075516,1
4,25,32,0.134009,0.888979,"[2, 3, 4]",0.806194,bracket=2-4,2,1,0.000247,"{'batch_size': 32, 'module__dropout': 0.134008...",2.550697,0.062964,3


In [68]:
search.score(X_test, y_test)

0.8106

In [69]:
search.predict(X_test)

Unnamed: 0,Array,Chunk
Bytes,40.00 kB,40.00 kB
Shape,"(5000,)","(5000,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 40.00 kB 40.00 kB Shape (5000,) (5000,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",5000  1,

Unnamed: 0,Array,Chunk
Bytes,40.00 kB,40.00 kB
Shape,"(5000,)","(5000,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray


In [70]:
search.predict(X_test).compute()

array([0, 1, 0, ..., 1, 0, 1])

It also has some other attributes.

In [71]:
hist = pd.DataFrame(search.history_)
hist.head()

Unnamed: 0,model_id,params,partial_fit_calls,partial_fit_time,score,score_time,elapsed_wall_time,bracket
0,bracket=0-0,"{'batch_size': 32, 'module__dropout': 0.124762...",1,3.679612,0.798202,0.069165,30.430613,0
1,bracket=0-1,"{'batch_size': 64, 'module__dropout': 0.145160...",1,6.269561,0.791209,0.167421,30.430616,0
2,bracket=0-2,"{'batch_size': 32, 'module__dropout': 0.104590...",1,6.193064,0.789211,0.166522,30.430617,0
3,bracket=1-0,"{'batch_size': 32, 'module__dropout': 0.153853...",1,1.683563,0.787213,0.062644,36.104408,1
4,bracket=1-1,"{'batch_size': 64, 'module__dropout': 0.181473...",1,3.002587,0.803197,0.171636,36.10441,1


This illustrates the history after every `partial_fit` call. There's also an attributed `model_history_` that records the history for each model (it's a reorganization of `history_`).

## Learn more

This notebook covered basic usage `HyperbandSearchCV`. The following documentation and resources might be useful to learn more about `HyperbandSearchCV`, including some of the finer use cases:

* [A talk](https://www.youtube.com/watch?v=x67K9FiPFBQ) introducing `HyperbandSearchCV` to the SciPy 2019 audience and the [corresponding paper](https://conference.scipy.org/proceedings/scipy2019/pdfs/scott_sievert.pdf)
* [HyperbandSearchCV's documentation](https://ml.dask.org/modules/generated/dask_ml.model_selection.HyperbandSearchCV.html)

Performance comparisons can be found in the SciPy 2019 talk/paper.