In [1]:
!pip install -q dask_cuda torch torchtext skorch
!pip -q install dask[dataframe] --upgrade

# Hyperparameter optimization with Skorch


## Setup Dask

In [2]:
import torch
from dask_cuda import LocalCUDACluster
from distributed import Client

# if you have GPU(s), use dask_cuda to automatically make use of them in your dask cluster
if torch.cuda.is_available():
    cluster = LocalCUDACluster()
    client = Client(cluster)
else:
    client = Client(processes=False, threads_per_worker=4,
                    n_workers=1, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:42917  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 33.68 GB


In [3]:
# for reproducibility
import random
import numpy as np

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Create Data

In [4]:
import torchtext
from torchtext import data
from torchtext import datasets
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# takes approx. 10 minutes to download data and embeddings (will be cached for re-use)

# set up fields
TEXT = data.Field(lower=True, batch_first=True, )
LABEL = data.Field(sequential=False, unk_token=None)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# will be used to initialize model embeddings layer
vocab = torchtext.vocab.GloVe(name='6B', dim=100)

# build the vocabulary
max_size = 25_000 # shorten for demonstrative purposes
TEXT.build_vocab(train, vectors=vocab, max_size=max_size)
LABEL.build_vocab(train)

# sadly I don't think we can use this 
# make iterator for splits
# train_iter, test_iter = data.BucketIterator.splits(
#     (train, test), batch_sizes=(32, 64), device='cpu')

In [7]:
# itos := index-to-string
# note the 2 extra tokens added for us: '<unk>', '<pad>'
TEXT.vocab.itos[:5]

['<unk>', '<pad>', 'the', 'a', 'and']

In [8]:
# stoi := string-to-index
# check on the meaning of these zeroes and ones
LABEL.vocab.stoi

defaultdict(None, {'neg': 0, 'pos': 1})

In [9]:
assert (len(TEXT.vocab.itos) == max_size + 2)

In [10]:
# peek at the data
print(train.examples[0].text)
print()
print(train.examples[0].label)

['a', 'masterful', 'treatment', 'of', 'james', "caine's", '"the', 'postman', 'always', 'rings', 'twice"', 'as', 'luchino', "visconti's", 'first', 'film', 'shot', 'primarily', 'around', 'ferrara', 'in', 'a', 'soulless', 'war-torn', 'italy.', 'the', 'original', 'negative', 'was', 'thought', 'destroyed', 'but', 'visconti', 'saved', 'a', 'print', 'and', 'fortunately', 'we', 'can', 'see', 'this', 'early', 'neo-realist', 'work', 'today.', 'a', 'ruggedly', 'handsome', 'massimo', 'girotti', 'and', 'clara', 'calamai', '(who', 'had', 'recently', 'revealed', 'her', 'breasts', 'in', 'la', 'cena', 'delle', 'beffe"', '(1941),', 'star', 'as', 'the', 'sensually-charged', 'and', 'ill-fated', 'lovers', 'who', 'plot', 'to', 'kill', 'her', 'husband.', 'unusual', 'ending', 'in', 'which,', 'although', 'crime', 'does', 'not', 'pay,', 'one', 'pays', 'in', 'a', 'way', 'not', 'directly', 'linked', 'to', 'the', 'crime.', 'excellent', 'direction,', 'script,', 'acting,', 'and', 'cinematography.', 'reportedly', 'no

In [11]:
# if you could use data.BucketIterator.splits

# sadly train_iter isn't actually an iter..
# peek at a batch of data
# batch = next(iter(train_iter))

# numericalized tokens
# print(batch.text)

# print(batch.label)

In [12]:
TEXT.process([train[0].text, train[1].text]).shape

torch.Size([2, 143])

In [13]:
# custom dataset class required to work with Skorch
class TorchDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        example = self.dataset.examples[idx]
        return example.text, example.label
    
    def __len__(self):
        return len(self.dataset)

In [14]:
# # custom dataset class required to work with Skorch
# class TorchDataset(torch.utils.data.Dataset):
#     def __init__(self, dataset, TEXT, LABEL):
#         self.dataset = dataset
#         self.TEXT = TEXT
#         self.LABEL = LABEL
    
#     def __getitem__(self, idx):
#         example = self.dataset.examples[idx]
#         X = TEXT.numericalize([example.text]).squeeze() # get rid of "batch" dimension
#         y = LABEL.numericalize([example.label])
#         return X, y
    
#     def __len__(self):
#         return len(self.dataset)

In [15]:
ex = train.examples[0]

In [16]:
' '.join(ex.text)

'a masterful treatment of james caine\'s "the postman always rings twice" as luchino visconti\'s first film shot primarily around ferrara in a soulless war-torn italy. the original negative was thought destroyed but visconti saved a print and fortunately we can see this early neo-realist work today. a ruggedly handsome massimo girotti and clara calamai (who had recently revealed her breasts in la cena delle beffe" (1941), star as the sensually-charged and ill-fated lovers who plot to kill her husband. unusual ending in which, although crime does not pay, one pays in a way not directly linked to the crime. excellent direction, script, acting, and cinematography. reportedly not as good as the french "le dernier tournant\' (1939) but probably better than the us version (1946) featuring lana turner and john garfield in the lead roles. highly recommended.'

In [17]:
train_dataset = TorchDataset(train)
test_dataset = TorchDataset(test)

In [18]:
print(train_dataset[0])

(['a', 'masterful', 'treatment', 'of', 'james', "caine's", '"the', 'postman', 'always', 'rings', 'twice"', 'as', 'luchino', "visconti's", 'first', 'film', 'shot', 'primarily', 'around', 'ferrara', 'in', 'a', 'soulless', 'war-torn', 'italy.', 'the', 'original', 'negative', 'was', 'thought', 'destroyed', 'but', 'visconti', 'saved', 'a', 'print', 'and', 'fortunately', 'we', 'can', 'see', 'this', 'early', 'neo-realist', 'work', 'today.', 'a', 'ruggedly', 'handsome', 'massimo', 'girotti', 'and', 'clara', 'calamai', '(who', 'had', 'recently', 'revealed', 'her', 'breasts', 'in', 'la', 'cena', 'delle', 'beffe"', '(1941),', 'star', 'as', 'the', 'sensually-charged', 'and', 'ill-fated', 'lovers', 'who', 'plot', 'to', 'kill', 'her', 'husband.', 'unusual', 'ending', 'in', 'which,', 'although', 'crime', 'does', 'not', 'pay,', 'one', 'pays', 'in', 'a', 'way', 'not', 'directly', 'linked', 'to', 'the', 'crime.', 'excellent', 'direction,', 'script,', 'acting,', 'and', 'cinematography.', 'reportedly', 'n

In [19]:
def pad_batch(batch, TEXT, LABEL):
    text, label = list(zip(*batch))
    # numericalized and padded text representation
    text_processed = TEXT.process(text)
    label_processed = LABEL.process(label)
    return text_processed, label_processed

from functools import partial

pad_batch_partial = partial(pad_batch, TEXT=TEXT, LABEL=LABEL)

In [20]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_batch_partial)

In [21]:
batch = next(iter(train_dataloader))

## Define your network

In [22]:
# was having trouble with when model was defined in the notebook
# Can't get attribute ‘CNN' on <module ‘__main__'
from model import CNN

In [23]:
# smoketest
model = CNN(pretrained_embeddings=TEXT.vocab.vectors).to(device)
gpu_batch = batch[0].to(device)
model_out = model(gpu_batch)
print(model_out)

tensor([[0.5732, 0.4268],
        [0.5372, 0.4628],
        [0.5615, 0.4385],
        [0.6806, 0.3194],
        [0.6352, 0.3648],
        [0.5678, 0.4322],
        [0.6505, 0.3495],
        [0.4325, 0.5675],
        [0.6019, 0.3981],
        [0.5857, 0.4143],
        [0.6010, 0.3990],
        [0.5662, 0.4338],
        [0.6494, 0.3506],
        [0.4390, 0.5610],
        [0.6041, 0.3959],
        [0.7121, 0.2879],
        [0.5781, 0.4219],
        [0.5727, 0.4273],
        [0.5169, 0.4831],
        [0.6531, 0.3469],
        [0.5080, 0.4920],
        [0.5611, 0.4389],
        [0.5560, 0.4440],
        [0.6448, 0.3552],
        [0.5697, 0.4303],
        [0.5798, 0.4202],
        [0.6656, 0.3344],
        [0.4933, 0.5067],
        [0.5143, 0.4857],
        [0.5518, 0.4482],
        [0.5806, 0.4194],
        [0.6070, 0.3930]], device='cuda:0', grad_fn=<SoftmaxBackward>)


In [24]:
del model

In [25]:
del gpu_batch

In [26]:
torch.cuda.empty_cache()

## Quick attempt at model training

In [27]:
import skorch
from skorch import NeuralNetClassifier 
import torch.optim as optim

In [28]:
import torch.nn as nn

In [29]:
# if you want to use a custom DataLoader, you must use NeuralNet
# also, not immediately obvious that for NeuralNet you are responsible for applying the log function
# whereas for NeuralNetClassifier, you are not

# NB: not ideal to be using softmax + log + NLLLoss
# see discussion: https://github.com/skorch-dev/skorch/issues/637
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                max_epochs=2,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                iterator_train=DataLoader,
                iterator_train__shuffle=True,
                iterator_train__batch_size=32,
                iterator_train__collate_fn=pad_batch_partial,
                iterator_train__num_workers=8,
                iterator_valid=DataLoader,
                iterator_valid__shuffle=False,
                iterator_valid__batch_size=64,
                iterator_valid__collate_fn=pad_batch_partial,
                iterator_valid__num_workers=8,
                train_split=skorch.dataset.CVSplit(.2), # NB: this witholds 20% of the training data for validation
                module__n_filters=100,
                module__filter_sizes=(2,3,4),
                module__dropout=0.2,
                module__pretrained_embeddings=TEXT.vocab.vectors,
                verbose=2)
# getting the following error when trying to compute accuracy
# ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets
#                 callbacks=callbacks)

In [30]:
# NB: now that we implemented our custom batching we reduced compute time from 60 seconds
# per epoch to 20 seconds per epoch. That's huge.
# ((60 - 20) / 60)*100 = ~66%
skorch_model.fit(train_dataset, y=None)

  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.4630[0m       [32m0.8312[0m        [35m0.3808[0m  33.8281
      2        [36m0.3454[0m       [32m0.8540[0m        [35m0.3361[0m  34.1240


<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=CNN(
    (embedding): Embedding(25002, 100)
    (conv_0): Conv1d(1, 100, kernel_size=(2, 100), stride=(1,))
    (conv_1): Conv1d(1, 100, kernel_size=(3, 100), stride=(1,))
    (conv_2): Conv1d(1, 100, kernel_size=(4, 100), stride=(1,))
    (fc): Linear(in_features=300, out_features=2, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  ),
)

In [31]:
# this isn't working as expected
# skorch_model.score(test_dataset)
# TypeError: score() missing 1 required positional argument: 'y'
# skorch_model.score(test_dataset, y=None)
# ValueError: Expected array-like (array or non-string sequence), got None

In [32]:
# score manually
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False, collate_fn=pad_batch_partial, num_workers=8)

In [33]:
# sadly processing the entire test set twice just to score the model
processed_test_data = next(iter(test_dataloader))

In [34]:
# quick check on the test set accuracy
test_preds = skorch_model.predict(test_dataset)

In [35]:
test_preds

array([0, 1, 1, ..., 0, 0, 0])

In [36]:
test_labels = processed_test_data[1].numpy()

In [37]:
from sklearn.metrics import accuracy_score

In [38]:
accuracy_score(test_labels, test_preds)

0.85048

In [39]:
# random guessing would 50% accuracy so the model is indeed training
np.unique(test_labels, return_counts=True)

(array([0, 1]), array([12500, 12500]))

In [40]:
# NB: this has no effect on GPU memory usage. If I keyboard interrupt, the workers get
# restarted and memory usage goes down. Deleting these "handler" objects doesn't delete
# GPU memory references on the workers. 
del skorch_model

## Grid search with Hyperband

In [41]:
# try to store the raw text in dask arrays and handle preprocessing inside the network
# this isn't ideal because feature engineering really should be separate from modeling

In [42]:
# raw text
train_text = []
train_label = []
for e in train.examples:
    tokenized_text = e.text
    text_string = ' '.join(tokenized_text)
    train_text.append(text_string)
    label = e.label
    # label = LABEL.process([e.label]).numpy()
    train_label.append(label)

In [43]:
train_text[:1]

['a masterful treatment of james caine\'s "the postman always rings twice" as luchino visconti\'s first film shot primarily around ferrara in a soulless war-torn italy. the original negative was thought destroyed but visconti saved a print and fortunately we can see this early neo-realist work today. a ruggedly handsome massimo girotti and clara calamai (who had recently revealed her breasts in la cena delle beffe" (1941), star as the sensually-charged and ill-fated lovers who plot to kill her husband. unusual ending in which, although crime does not pay, one pays in a way not directly linked to the crime. excellent direction, script, acting, and cinematography. reportedly not as good as the french "le dernier tournant\' (1939) but probably better than the us version (1946) featuring lana turner and john garfield in the lead roles. highly recommended.']

In [44]:
train_text = np.array(train_text)

In [45]:
train_text = np.expand_dims(train_text, axis=-1)

In [46]:
sample_batch = [train_text[0], train_text[10]]

In [47]:
sample_text_batch = [i[0] for i in sample_batch]

In [48]:
TEXT.process(sample_text_batch)

tensor([[    3,     0, 12500,  ...,     1,     1,     1],
        [ 1669,  3651,     9,  ...,  9300,  6517,   411]])

In [49]:
TEXT.process([train_text[0][0], train_text[1][0]]).shape

torch.Size([2, 858])

In [50]:
isinstance(train_text[:10], np.ndarray)

True

In [51]:
# TEXT.process(train_text[0])

In [52]:
train_label = np.array(train_label)

In [53]:
# # this is a really unfortunate hack to make torchtext batching semantics work with skorch and dask
# # the downside here is that we're no longer padding to the longest sequence in the batch, rather
# # we're padding to the longest sequence in the *dataset*, which results in signifcantly more
# # computation and thus significantly more time to train a model
# # of course, you could set a max sequence length but that's not an ideal solution
# # another solution would be to create a different dataset object, but then you can't use torchtext,
# # which really is quite handy

# # train=True shuffles the data
# train_iter_skorch = torchtext.data.Iterator(train, batch_size=len(train), train=True, sort=False, device='cpu')
# test_iter_skorch = torchtext.data.Iterator(test, batch_size=len(test), train=False, sort=False, device='cpu')

# # takes some time to numericalize the whole dataset

# # also notice that skorch and dask expect numpy arrays, which isn't ideal since it ties you to the cpu.
# # meanwhile, projects like https://rapids.ai/ are moving toward all GPU computation, avoiding the cpu altogether.
# for batch in train_iter_skorch:
#     X_train = batch.text[0].numpy()
#     y_train = batch.label.numpy()

# for batch in test_iter_skorch:
#     X_test = batch.text[0].numpy()
#     y_test = batch.label.numpy()

In [54]:
# notice how awfully large the second dimension is
# X_train.shape

In [55]:
# https://ml.dask.org/hyper-parameter-search.html#hyperband-parameters-rule-of-thumb
EPOCHS = 5
NUM_TRAINING_EXAMPLES = len(train)*.8
n_examples = EPOCHS * NUM_TRAINING_EXAMPLES
n_params = 8

# it's not immediately obvious to beginners how all these parameters interact with each other
max_iter = n_params
chunk_size = n_examples // n_params

In [56]:
# suppose we want to set max_iter to be the commensurate with the number of examples required
# for the model converge (as cited in the documentation)

# it's a bit unclear how n_params relates to BOTH the number of data points required
# for the model to converge AND how many hyperparameters to try out (i.e. n_iter in RandomizedSearchCV)

In [57]:
import math

In [58]:
# choose chunk size so that the remainder is not a tiny number
print(f'Chunk size: {chunk_size}')
print(f'Total chunks: {math.ceil(len(train) / chunk_size)}')
last_chunk_size = len(train) % chunk_size
if last_chunk_size == 0: # i.e. chunk_size evenly divides X_train
    last_chunk_size = chunk_size
print(f'Last chunk size: {last_chunk_size}')

assert (len(train) % chunk_size > 10 or len(train) % chunk_size == 0), 'Choose another chunk size'

Chunk size: 12500.0
Total chunks: 2
Last chunk size: 12500.0


In [59]:
import dask.array as da

X = da.from_array(train_text, chunks=(chunk_size))
y = da.from_array(train_label, chunks=(chunk_size))

In [60]:
X

Unnamed: 0,Array,Chunk
Bytes,1.37 GB,685.20 MB
Shape,"(25000, 1)","(12500, 1)"
Count,3 Tasks,2 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1.37 GB 685.20 MB Shape (25000, 1) (12500, 1) Count 3 Tasks 2 Chunks Type numpy.ndarray",1  25000,

Unnamed: 0,Array,Chunk
Bytes,1.37 GB,685.20 MB
Shape,"(25000, 1)","(12500, 1)"
Count,3 Tasks,2 Chunks
Type,numpy.ndarray,


In [61]:
# need new collate_fn due to the following error
# ValueError: Expected 2D array, got 1D array instead:
# array=['1' '1' '1' ... '1' '1' '1'].
# Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [62]:
def pad_batch_hyperband(batch, TEXT, LABEL):
    text, label = list(zip(*batch))
    # unnecessary extra dimension
    text = [i[0] for i in text]
    # numericalized and padded text representation
    text_processed = TEXT.process(text)
    label_processed = LABEL.process(label)
    return text_processed, label_processed

pad_batch_hyperband_partial = partial(pad_batch_hyperband, TEXT=TEXT, LABEL=LABEL)

In [63]:
# reinitialize and set train_split=None to let hyperband handle validation set splitting
skorch_model = NeuralNetClassifier(
                CNN,
                device=device,
                lr=0.001,
                optimizer=optim.Adam,
                criterion=nn.NLLLoss,
                iterator_train=DataLoader,
                iterator_train__shuffle=True,
                iterator_train__batch_size=32,
                iterator_train__collate_fn=pad_batch_hyperband_partial,
                iterator_valid=DataLoader,
                iterator_valid__collate_fn=pad_batch_hyperband_partial,
                iterator_valid__shuffle=False,
                iterator_valid__batch_size=64,
                train_split=None, # let hyperband handle it
                module__n_filters=100,
                module__filter_sizes=(2, 3, 4),
                module__dropout=0.2,
                module__pretrained_embeddings=TEXT.vocab.vectors,
                # module__TEXT=TEXT,
                batch_size=32,
                verbose=2)

In [64]:
from scipy.stats import loguniform

In [65]:
# define parameter grid
params = {'module__filter_sizes': [(1, 2, 3), (2, 3, 4), (3, 4, 5)], 
          'module__n_filters': [25, 50, 100],
          'module__dropout': loguniform(1e-1, 3e-1),
          'batch_size': [32, 64],
         }

In [66]:
from dask_ml.model_selection import HyperbandSearchCV

In [67]:
search = HyperbandSearchCV(
    skorch_model,
    params,
    max_iter=max_iter,
    verbose=True,
    test_size=0.2 # validation size
)

In [68]:
search.metadata["partial_fit_calls"]

26

In [69]:
search.metadata['n_models']

5

In [70]:
import time

In [71]:
# to clear up any confusion, every time partial_fit is called, we're passing in chunk_size number of
# data points. Then skorch handles the batch size either by being set explicitly or as part of the param grid.

# to compare this grid search to number of epochs, we have 26 partial_fit calls * 10k data points = 260k examples
# with a training set size of 25k * .8 = 20k data points, this is 13 epochs!
# considering that it takes approximately 5 epochs to train a model, you would get through less than 3 sets of 
# hyperparameters if manually searching. Instead we'll search through ~5.

In [72]:
# I feel like the GPU is spending a lot of time waiting for the CPU to preprocess data
# I've been watching GPU power consumption during training and it's not as high as during
# skorch training, so I tried to increase the number of workers the dataloader uses and
# got the following error: AssertionError: daemonic processes are not allowed to have children

In [83]:
y[0].compute()

'pos'

In [73]:
# I think I'm getting yelled at for passing in text
# TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <U13704

# NB: this took ~3800 seconds on a single Nvidia GTX 980 Ti
# notice how the number of training datapoints relates to the chunk size and our test_size
# e.g. Validation set size: 2500 = 12500*.2, Train set size: 10000 = 12500 - 2500
start = time.time()
search.fit(X, y)
end = time.time()
duration = round(end - start, 2)
print(f'Time to complete grid search: {duration} seconds')

[CV, bracket=1] creating 3 models


  sk_validation.check_array(sample, *args, **kwargs)
  sk_validation.check_array(sample, *args, **kwargs)
  sk_validation.check_array(sample, *args, **kwargs)
  sk_validation.check_array(sample, *args, **kwargs)
  sk_validation.check_array(sample, *args, **kwargs)
  [<class 'skorch.classifier.NeuralNetClassifier'>[u ... .0721]]),
), 0]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
  % (format_bytes(len(b)), s)


[CV, bracket=0] creating 2 models


  sk_validation.check_array(sample, *args, **kwargs)
  sk_validation.check_array(sample, *args, **kwargs)
  sk_validation.check_array(sample, *args, **kwargs)


[CV, bracket=0] For training there are between 10000 and 10000 examples in each chunk
[CV, bracket=1] For training there are between 10000 and 10000 examples in each chunk


tornado.application - ERROR - Multiple exceptions in yield list
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 849, in callback
    result_list.append(f.result())
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1107, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.7/site-packages/dask_ml/model_selection/_incremental.py", line 625, in _fit
    prefix=self.prefix,
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1099, in run
    value = future.result()
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1107, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.7/site-packages/dask_ml/model_selection/_incremental.py", line 233, in _fit
    metas = yield client.gather(new_scores)
  File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 1099, in run
    value = future.result()
  File "/opt/conda/lib/python3.7/s

KeyError: tensor([0.])

## Integration

`HyperbandSearchCV` follows the Scikit-learn API and mirrors Scikit-learn's `RandomizedSearchCV`. This means that it "just works". All the Scikit-learn attributes and methods are available:

In [None]:
search.best_score_

In [None]:
search.best_estimator_

In [None]:
import pandas as pd

In [None]:
search.cv_results_

In [None]:
cv_results = pd.DataFrame(search.cv_results_)
cv_results.head()

In [None]:
# issue with numpy converting array of tuples into a 2d array
search.cv_results_['param_module__filter_sizes'] = search.cv_results_['param_module__filter_sizes'].tolist()

In [None]:
cv_results = pd.DataFrame(search.cv_results_)
cv_results.head()

In [None]:
search.score(X_test, y_test)

In [None]:
search.predict(X_test)

In [None]:
search.predict(X_test).compute()

It also has some other attributes.

In [None]:
hist = pd.DataFrame(search.history_)
hist.head()

This illustrates the history after every `partial_fit` call. There's also an attributed `model_history_` that records the history for each model (it's a reorganization of `history_`).

## Learn more

This notebook covered basic usage `HyperbandSearchCV`. The following documentation and resources might be useful to learn more about `HyperbandSearchCV`, including some of the finer use cases:

* [A talk](https://www.youtube.com/watch?v=x67K9FiPFBQ) introducing `HyperbandSearchCV` to the SciPy 2019 audience and the [corresponding paper](https://conference.scipy.org/proceedings/scipy2019/pdfs/scott_sievert.pdf)
* [HyperbandSearchCV's documentation](https://ml.dask.org/modules/generated/dask_ml.model_selection.HyperbandSearchCV.html)

Performance comparisons can be found in the SciPy 2019 talk/paper.