# Hyperparameter selection with Ray Tune

In [1]:
import os
from datetime import datetime
import time
import random
import warnings
import joblib
import warnings

# My modules
from config import Config
from logger import init_logger
from common_utils import set_seeds, read_csvs, stratify_split, setup, get_loaders
from model import Model
from train_loop_functions import train_epoch, valid_epoch

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

import torch
from torch import nn
from torch.cuda.amp import GradScaler

# hyperparameter tuning
from functools import partial
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [2]:
# Re-import modules before each cell runs so edits to the local .py files
# imported by this notebook are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Setup

In [3]:
# Seed with the project-wide seed from Config before any other work is done.
# NOTE(review): set_seeds is project-local; presumably it seeds random/numpy/torch — confirm.
set_seeds(Config.seed)
# Module-level logger; train_main passes it to train_epoch and valid_epoch.
LOGGER = init_logger() # uses Python's logging framework

# Tuning

In [4]:
# `config` and `checkpoint_dir` are passed in by Tune; the remaining keyword
# arguments are bound by the caller (e.g. via functools.partial).
def train_main(config, checkpoint_dir=None, data_dir=None, num_epochs=10):
    """Train and validate one Ray Tune trial on a single fold.

    After every epoch the model/optimizer state is checkpointed through Tune
    and the validation loss and accuracy are reported.

    Args:
        config: Hyperparameters sampled by Tune. Keys read here are
            "batch_size", "lr", "is_amsgrad" and "accum_iter".
        checkpoint_dir: Directory passed by Ray Tune when a checkpoint should
            be restored; None for a fresh trial.
        data_dir: Root data directory. It is handed to read_csvs, and the
            training images are read from ``<data_dir>/train_images``.
        num_epochs: Number of train/validate epochs to run. Defaults to 10,
            the value that used to be hard-coded in the loop.

    Raises:
        ValueError: If ``data_dir`` is not provided.
    """
    assert config is not None
    if data_dir is None:
        # Fail with a clear message instead of a TypeError from string concatenation.
        raise ValueError("train_main requires data_dir (bind it with functools.partial)")

    # Only one fold is used for the hyperparameter search.
    valid_fold = 0

    # -------- DATASETS AND LOADERS --------
    train_df, test_df = read_csvs(data_dir, Config.debug)
    train_folds = stratify_split(train_df, Config.fold_num, Config.seed, Config.target_col)

    train_dataloader, valid_dataloader = get_loaders(train_folds, valid_fold,
                                                     config["batch_size"],
                                                     os.path.join(data_dir, 'train_images'))

    # Ground-truth labels of the validation fold (loop-invariant, so computed once
    # and before any expensive training starts).
    # NOTE(review): the original cell read an undefined `valid_df` here (NameError).
    # This assumes stratify_split returns the training frame with a "fold" column and
    # that get_loaders validates on the rows where fold == valid_fold, in frame
    # order — confirm against common_utils.
    valid_df = train_folds[train_folds["fold"] == valid_fold]
    validation_labels = valid_df[Config.target_col].values

    # -------- MODEL --------
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model, optimizer, scheduler, criterion = setup(Config.model_arch, config["lr"],
                                                   config["is_amsgrad"], train_df.label.nunique(), device)

    # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint should be restored.
    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        # map_location keeps the restore working when the trial lands on a different device.
        model_state, optimizer_state = torch.load(checkpoint, map_location=device)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # One scaler for the whole run: it carries the dynamic loss-scale state between
    # iterations, so re-creating it every epoch would reset that state.
    grad_scaler = GradScaler()

    # EPOCHS TRAIN
    for e in range(num_epochs):
        # TRAIN
        training_losses = train_epoch(train_dataloader, model,
                                      criterion, optimizer,
                                      scheduler, grad_scaler,
                                      config["accum_iter"], LOGGER,
                                      device)
        avg_training_loss = sum(training_losses) / len(train_dataloader)

        # VALIDATE
        validation_losses, preds = valid_epoch(valid_dataloader, model,
                                               criterion, config["accum_iter"],
                                               LOGGER, device)
        avg_validation_loss = sum(validation_losses) / len(valid_dataloader)

        accuracy = accuracy_score(y_true=validation_labels, y_pred=preds)

        # SAVE CHECKPOINT.
        # It is automatically registered with Ray Tune and will potentially
        # be passed as the `checkpoint_dir` parameter in future iterations.
        # A distinct local name avoids rebinding the `checkpoint_dir` parameter.
        with tune.checkpoint_dir(step=e) as step_dir:
            path = os.path.join(step_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        # `loss` and `accuracy` are the metrics the scheduler/reporter read;
        # `train_loss` is reported additionally for diagnosing over/under-fitting.
        tune.report(loss=avg_validation_loss, accuracy=accuracy,
                    train_loss=avg_training_loss)
    print("Finished Training")

In [5]:
def main(num_samples=20, max_num_epochs=10, gpus_per_trial=1):
    """Run the Ray Tune hyperparameter search and reload the best trial's model.

    Args:
        num_samples: Number of hyperparameter configurations to sample.
        max_num_epochs: `max_t` for the ASHA scheduler, i.e. the largest number
            of reported iterations a trial may run. train_main decides its own
            epoch count, so keep the two in sync.
        gpus_per_trial: GPUs requested from Ray for every trial.

    Raises:
        RuntimeError: If no trial reported a "loss" metric, so there is no
            best trial to restore.
    """
    data_dir = os.path.abspath('./data')

    # Search space; the keys must match what train_main reads from `config`.
    hyperconfig = {
        "is_amsgrad": tune.choice([False, True]),
        "accum_iter": tune.choice([1,2,4,8,16]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([4, 8, 16, 32])
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_main, data_dir=data_dir),
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        config=hyperconfig,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    if best_trial is None:
        # Happens when every trial failed before reporting; fail with a clear
        # message instead of an AttributeError on `None.config`.
        raise RuntimeError("No trial reported a 'loss' metric; check the trial error logs.")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    # Pick the device BEFORE building the model: the original moved the model to
    # `device` one line before `device` was assigned (UnboundLocalError).
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # NOTE(review): training sizes the head with train_df.label.nunique() while this
    # uses Config.num_labels — confirm they agree, otherwise load_state_dict fails.
    best_trained_model = Model(Config.model_arch, Config.num_labels, pretrained=True)
    if device != "cpu" and gpus_per_trial > 1:
        # NOTE(review): wrapping prefixes the state-dict keys with "module."; this only
        # matches the checkpoint if the training-side model was wrapped too — confirm.
        best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")

    # map_location lets a checkpoint written on a GPU be restored on this device.
    model_state, optimizer_state = torch.load(checkpoint_path, map_location=device)
    best_trained_model.load_state_dict(model_state)

    # `test_accuracy` is neither imported nor defined in this notebook, so calling it
    # unconditionally raised NameError. Use it only if a cell has defined it.
    evaluate = globals().get("test_accuracy")
    if callable(evaluate):
        test_acc = evaluate(best_trained_model, device)
        print("Best trial test set accuracy: {}".format(test_acc))
    else:
        print("test_accuracy() is not defined in this notebook; skipping test set evaluation.")

In [6]:
# Launch the search with the defaults declared on main():
# 20 sampled configurations, max_num_epochs=10, one GPU per trial.
main()

2020-12-24 23:05:01,429	INFO services.py:1092 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-12-24 23:05:02,071	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Memory usage on this node: 4.9/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1/8 CPUs, 1/1 GPUs, 0.0/17.63 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/ray_results/DEFAULT_2020-12-24_23-05-02
Number of trials: 1/20 (1 RUNNING)
+---------------------+----------+-------+--------------+--------------+--------------+-----------+
| Trial name          | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |        lr |
|---------------------+----------+-------+--------------+--------------+--------------+-----------|
| DEFAULT_73dea_00000 | RUNNING  |       |            1 |           32 | False        | 0.0122859 |
+---------------------+----------+-------+--------------+--------------+--------------+-----------+


[2m[36m(pid=6411)[0m Epoch     0: adjusting learning rate of group 0 to 1.2286e-02.


[2m[36m(pid=6411)[0m 2020-12-24 23:05:06,153	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6411)[0m Traceback (most recent call last):
[2m[36m(pid=6411)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6411)[0m     self._entrypoint()
[2m[36m(pid=6411)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6411)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6411)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6411)[0m     output = fn()
[2m[36m(pid=6411)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6411)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 21, in train_epoch
[2m[36m(pid=64

[2m[36m(pid=6410)[0m Epoch     0: adjusting learning rate of group 0 to 4.7924e-04.


[2m[36m(pid=6410)[0m 2020-12-24 23:05:14,125	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6410)[0m Traceback (most recent call last):
[2m[36m(pid=6410)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6410)[0m     self._entrypoint()
[2m[36m(pid=6410)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6410)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6410)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6410)[0m     output = fn()
[2m[36m(pid=6410)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6410)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 60, in train_epoch
[2m[36m(pid=64

== Status ==
Memory usage on this node: 7.1/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/1 GPUs, 0.0/17.63 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/ray_results/DEFAULT_2020-12-24_23-05-02
Number of trials: 3/20 (2 ERROR, 1 PENDING)
+---------------------+----------+-------+--------------+--------------+--------------+-------------+
| Trial name          | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |          lr |
|---------------------+----------+-------+--------------+--------------+--------------+-------------|
| DEFAULT_73dea_00002 | PENDING  |       |            1 |            8 | True         | 0.0144015   |
| DEFAULT_73dea_00000 | ERROR    |       |            1 |           32 | False        | 0.0122859   |
| DEFAULT_73dea_00001 | ERROR    |       |            4 |            8 | False        | 0.000

[2m[36m(pid=6406)[0m 2020-12-24 23:05:22,318	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6406)[0m Traceback (most recent call last):
[2m[36m(pid=6406)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6406)[0m     self._entrypoint()
[2m[36m(pid=6406)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6406)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6406)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6406)[0m     output = fn()
[2m[36m(pid=6406)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6406)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 60, in train_epoch
[2m[36m(pid=64

== Status ==
Memory usage on this node: 7.0/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/1 GPUs, 0.0/17.63 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/ray_results/DEFAULT_2020-12-24_23-05-02
Number of trials: 4/20 (3 ERROR, 1 PENDING)
+---------------------+----------+-------+--------------+--------------+--------------+-------------+
| Trial name          | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |          lr |
|---------------------+----------+-------+--------------+--------------+--------------+-------------|
| DEFAULT_73dea_00003 | PENDING  |       |            2 |            4 | False        | 0.0875573   |
| DEFAULT_73dea_00000 | ERROR    |       |            1 |           32 | False        | 0.0122859   |
| DEFAULT_73dea_00001 | ERROR    |       |            4 |            8 | False        | 0.000

[2m[36m(pid=6416)[0m 2020-12-24 23:05:31,207	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6416)[0m Traceback (most recent call last):
[2m[36m(pid=6416)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6416)[0m     self._entrypoint()
[2m[36m(pid=6416)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6416)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6416)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6416)[0m     output = fn()
[2m[36m(pid=6416)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6416)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 60, in train_epoch
[2m[36m(pid=64

== Status ==
Memory usage on this node: 6.9/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/1 GPUs, 0.0/17.63 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/ray_results/DEFAULT_2020-12-24_23-05-02
Number of trials: 5/20 (4 ERROR, 1 PENDING)
+---------------------+----------+-------+--------------+--------------+--------------+-------------+
| Trial name          | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |          lr |
|---------------------+----------+-------+--------------+--------------+--------------+-------------|
| DEFAULT_73dea_00004 | PENDING  |       |            1 |           16 | False        | 0.00277202  |
| DEFAULT_73dea_00000 | ERROR    |       |            1 |           32 | False        | 0.0122859   |
| DEFAULT_73dea_00001 | ERROR    |       |            4 |            8 | False        | 0.000

[2m[36m(pid=6408)[0m 2020-12-24 23:05:39,759	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6408)[0m Traceback (most recent call last):
[2m[36m(pid=6408)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6408)[0m     self._entrypoint()
[2m[36m(pid=6408)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6408)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6408)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6408)[0m     output = fn()
[2m[36m(pid=6408)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6408)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 60, in train_epoch
[2m[36m(pid=64

== Status ==
Memory usage on this node: 6.9/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/1 GPUs, 0.0/17.63 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/ray_results/DEFAULT_2020-12-24_23-05-02
Number of trials: 6/20 (5 ERROR, 1 PENDING)
+---------------------+----------+-------+--------------+--------------+--------------+-------------+
| Trial name          | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |          lr |
|---------------------+----------+-------+--------------+--------------+--------------+-------------|
| DEFAULT_73dea_00005 | PENDING  |       |            2 |            8 | False        | 0.00107037  |
| DEFAULT_73dea_00000 | ERROR    |       |            1 |           32 | False        | 0.0122859   |
| DEFAULT_73dea_00001 | ERROR    |       |            4 |            8 | False        | 0.000

[2m[36m(pid=6409)[0m 2020-12-24 23:05:47,775	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6409)[0m Traceback (most recent call last):
[2m[36m(pid=6409)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6409)[0m     self._entrypoint()
[2m[36m(pid=6409)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6409)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6409)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6409)[0m     output = fn()
[2m[36m(pid=6409)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6409)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 60, in train_epoch
[2m[36m(pid=64

== Status ==
Memory usage on this node: 6.9/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/1 GPUs, 0.0/17.63 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/ray_results/DEFAULT_2020-12-24_23-05-02
Number of trials: 7/20 (6 ERROR, 1 PENDING)
+---------------------+----------+-------+--------------+--------------+--------------+-------------+
| Trial name          | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |          lr |
|---------------------+----------+-------+--------------+--------------+--------------+-------------|
| DEFAULT_73dea_00006 | PENDING  |       |            1 |            4 | True         | 0.00206879  |
| DEFAULT_73dea_00000 | ERROR    |       |            1 |           32 | False        | 0.0122859   |
| DEFAULT_73dea_00001 | ERROR    |       |            4 |            8 | False        | 0.000

[2m[36m(pid=6407)[0m 2020-12-24 23:05:56,717	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6407)[0m Traceback (most recent call last):
[2m[36m(pid=6407)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6407)[0m     self._entrypoint()
[2m[36m(pid=6407)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6407)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6407)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6407)[0m     output = fn()
[2m[36m(pid=6407)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6407)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 60, in train_epoch
[2m[36m(pid=64

== Status ==
Memory usage on this node: 6.8/31.3 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/1 GPUs, 0.0/17.63 GiB heap, 0.0/6.05 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /opt/favordata/ray_results/DEFAULT_2020-12-24_23-05-02
Number of trials: 8/20 (7 ERROR, 1 PENDING)
+---------------------+----------+-------+--------------+--------------+--------------+-------------+
| Trial name          | status   | loc   |   accum_iter |   batch_size | is_amsgrad   |          lr |
|---------------------+----------+-------+--------------+--------------+--------------+-------------|
| DEFAULT_73dea_00007 | PENDING  |       |            1 |           32 | True         | 0.00156363  |
| DEFAULT_73dea_00000 | ERROR    |       |            1 |           32 | False        | 0.0122859   |
| DEFAULT_73dea_00001 | ERROR    |       |            4 |            8 | False        | 0.000

[2m[36m(pid=6405)[0m 2020-12-24 23:06:00,929	ERROR function_runner.py:254 -- Runner Thread raised error.
[2m[36m(pid=6405)[0m Traceback (most recent call last):
[2m[36m(pid=6405)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 248, in run
[2m[36m(pid=6405)[0m     self._entrypoint()
[2m[36m(pid=6405)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 316, in entrypoint
[2m[36m(pid=6405)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=6405)[0m   File "/opt/favordata/anaconda3/envs/kaggle/lib/python3.7/site-packages/ray/tune/function_runner.py", line 575, in _trainable_func
[2m[36m(pid=6405)[0m     output = fn()
[2m[36m(pid=6405)[0m   File "<ipython-input-4-85c201a07a96>", line 31, in train_main
[2m[36m(pid=6405)[0m   File "/opt/favordata/AI/Felix/kaggle-cassava/train_loop_functions.py", line 21, in train_epoch
[2m[36m(pid=64

KeyboardInterrupt: 