# Getting Started - Hyperparameter Tuning #
This notebook follows Ray's quickstart on hyperparameter tuning:
https://docs.ray.io/en/latest/train/dl_guide.html#hyperparameter-tuning-ray-tune

and also
https://docs.ray.io/en/latest/tune/getting-started.html#tune-tutorial

combined with Torch's Quickstart guide:
https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html

and PyTorch's own guide to hyperparameter tuning with Ray Tune:
https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html

In [1]:
import torch
import torch.nn as nn
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from ray import air
import torchmetrics
import numpy as np
import os

In [2]:
# FashionMNIST training split; download=True asks torchvision to fetch the
# files into ./data, and ToTensor() converts every image to a tensor.
training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())

# Companion split selected with train=False, stored under the same root.
test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

In [3]:
class NaiveDense(nn.Module):
    """Plain fully connected classifier: 28*28 inputs -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Collapses each sample to a flat vector before the dense layers.
        self.flatten = nn.Flatten()
        # Layers are created in the order they are applied; attribute names and
        # positions define the state_dict keys used by saved checkpoints.
        dense_layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*dense_layers)

    def forward(self, inputs):
        """Return the raw (un-normalised) class scores for ``inputs``."""
        return self.linear_relu_stack(self.flatten(inputs))

In [4]:
def train_fashion(config, checkpoint_dir=None, data_dir=None):
    """Train ``NaiveDense`` on ``training_data`` and report metrics to Ray Tune.

    After every epoch the model/optimizer state is written through
    ``tune.checkpoint_dir`` and the validation loss and accuracy are sent with
    ``tune.report``.

    Args:
        config: Hyperparameter dict. Required keys: ``"batch_size"``, ``"lr"``
            and ``"momentum"``. Optional keys: ``"num_epochs"`` (default 3) and
            ``"device"`` (default ``"cpu"``).
        checkpoint_dir: If truthy, a directory containing a ``"checkpoint"``
            file holding a ``(model_state, optimizer_state)`` tuple to resume
            from.
        data_dir: Unused; kept so existing callers keep working.

    NOTE(review): validation metrics are computed on ``test_data``, the same
    split ``test_accuracy`` uses for the final score, so that final score is
    not an independent estimate. Consider holding out part of the training set.
    """
    num_epochs = config.get("num_epochs", 3)
    batch_size = config["batch_size"]
    # Training always ran on the CPU (an earlier auto-detected device was
    # unconditionally overwritten); that stays the default, but a different
    # device can now be requested through the config.
    device = config.get("device", "cpu")

    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    model = NaiveDense().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"]
    )

    if checkpoint_dir:
        # Resume from a previously saved (model, optimizer) state pair.
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"), map_location=device)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    for epoch in range(num_epochs):  # loop over the dataset multiple times
        model.train()
        running_loss = 0.0
        running_steps = 0
        for i, (inputs, labels) in enumerate(train_dataloader):
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            running_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / running_steps))
                # Reset the step count together with the sum; otherwise later
                # averages divide a 2000-batch sum by the whole epoch's count.
                running_loss = 0.0
                running_steps = 0

        # Validation loss and accuracy for this epoch.
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        model.eval()
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # .item() keeps the accumulator a plain Python float.
                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        # Distinct name so the ``checkpoint_dir`` argument is not shadowed.
        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [5]:
def test_accuracy(net, device="cpu"):
    batch_size=64
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_dataloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [6]:
# Smoke test: run the training function once, directly, with fixed values.
# Outside tuner.fit() Ray logs a "Session not detected" warning for the
# checkpoint call (see the output below); the run still finishes.
config = dict(batch_size=64, lr=0.001, momentum=0.8)
train_fashion(config)

Session not detected. You should not be calling `checkpoint_dir` outside `tuner.fit()` or while using the class API. 
  File "/home/jim/mambaforge/envs/ray/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/jim/mambaforge/envs/ray/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/jim/mambaforge/envs/ray/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/jim/mambaforge/envs/ray/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
    app.start()
  File "/home/jim/mambaforge/envs/ray/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 725, in start
    self.io_loop.start()
  File "/home/jim/mambaforge/envs/ray/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/home/jim/mambaforge/envs/ray/lib/pytho

Finished Training


In [7]:
# Experiment budget.
num_samples = 10  # number of trials (sampled configurations)
max_num_epochs = 3  # ASHA max_t: cap on a trial's "training_iteration" count
gpus_per_trial = 0  # trials request no GPU; each one asks for 4 CPUs below

# Distribution every hyperparameter is sampled from.
search_space = dict(
    lr=tune.loguniform(1e-4, 1e-2),
    momentum=tune.uniform(0.1, 0.9),
    batch_size=tune.choice([16, 32, 64, 128]),
)

# Early stopping driven by the reported validation loss (lower is better).
scheduler = ASHAScheduler(
    metric="loss", mode="min", max_t=max_num_epochs, grace_period=1, reduction_factor=2
)

# Columns shown in the console progress table.
reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_fashion),
        resources={"cpu": 4, "gpu": gpus_per_trial},
    ),
    param_space=search_space,
    tune_config=tune.TuneConfig(num_samples=num_samples, scheduler=scheduler),
    run_config=air.RunConfig(progress_reporter=reporter),
)
result = tuner.fit()

2023-05-14 14:57:41,927	INFO worker.py:1616 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2023-05-14 14:57:45,311	INFO tune.py:218 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/latest/tune/api/trainable.html



== Status ==
Current time: 2023-05-14 14:57:54 (running for 00:00:08.20)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Logical resource usage: 4.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /home/jim/ray_results/train_fashion_2023-05-14_14-57-40
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------------+----------+----------------------+--------------+-------------+------------+
| Trial name                | status   | loc                  |   batch_size |          lr |   momentum |
|---------------------------+----------+----------------------+--------------+-------------+------------|
| train_fashion_369b0_00000 | RUNNING  | 192.168.0.135:481602 |          128 | 0.000330542 |   0.353592 |
| train_fashion_369b0_00001 | PENDING  |                      |           64 | 0.00186854  |   0.843826 |
| train_fashion_369b0_00002 | PENDING  |                      |          128 | 0.00029119  |   0.552573 |
| train_fashion_3

Trial name,accuracy,date,done,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
train_fashion_369b0_00000,0.1565,2023-05-14_14-58-14,False,jim-desktop,1,2.27271,192.168.0.135,481602,True,17.4771,17.4771,17.4771,1684090694,1,369b0_00000


== Status ==
Current time: 2023-05-14 14:58:19 (running for 00:00:32.69)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: -2.272705799416651
Logical resource usage: 16.0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /home/jim/ray_results/train_fashion_2023-05-14_14-57-40
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+---------------------------+----------+----------------------+--------------+-------------+------------+---------+------------+----------------------+
| Trial name                | status   | loc                  |   batch_size |          lr |   momentum |    loss |   accuracy |   training_iteration |
|---------------------------+----------+----------------------+--------------+-------------+------------+---------+------------+----------------------|
| train_fashion_369b0_00000 | RUNNING  | 192.168.0.135:481602 |          128 | 0.000330542 |   0.353592 | 2.27271 |     0.1565 |                    1 |
| train_fashion_369b0_00001 | 

2023-05-14 15:00:28,142	INFO tune.py:945 -- Total run time: 162.83 seconds (161.74 seconds for the tuning loop).


== Status ==
Current time: 2023-05-14 15:00:28 (running for 00:02:41.75)
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 2.000: -0.5909972912401911 | Iter 1.000: -0.9023655880788329
Logical resource usage: 0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /home/jim/ray_results/train_fashion_2023-05-14_14-57-40
Number of trials: 10/10 (10 TERMINATED)
+---------------------------+------------+----------------------+--------------+-------------+------------+----------+------------+----------------------+
| Trial name                | status     | loc                  |   batch_size |          lr |   momentum |     loss |   accuracy |   training_iteration |
|---------------------------+------------+----------------------+--------------+-------------+------------+----------+------------+----------------------|
| train_fashion_369b0_00000 | TERMINATED | 192.168.0.135:481602 |          128 | 0.000330542 |   0.353592 | 2.20102  |     0.2912 |                    3 |
| train_fas

In [8]:
# Select the trial whose last reported validation loss is the smallest.
best_trial = result.get_best_result(metric="loss", mode="min", scope="last")

# Summarise its sampled hyperparameters and final reported metrics.
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.metrics["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.metrics["accuracy"]))

Best trial config: {'lr': 0.007081667762167206, 'momentum': 0.7697592864340546, 'batch_size': 16}
Best trial final validation loss: 0.39953762774467466
Best trial final validation accuracy: 0.8551


In [9]:
# Rebuild the network and restore the weights saved by the best trial.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
best_trained_model = NaiveDense()

best_checkpoint_dir = best_trial.checkpoint.path
# The checkpoint is a (model_state, optimizer_state) tuple; only the model
# weights are needed for evaluation. map_location places the tensors on the
# evaluation device regardless of where they were saved.
model_state, optimizer_state = torch.load(
    os.path.join(best_checkpoint_dir, "checkpoint"), map_location=device)

# Load BEFORE any DataParallel wrapping: the checkpoint was saved from an
# unwrapped model, so its keys carry no "module." prefix and would not match
# a wrapped model's state_dict.
best_trained_model.load_state_dict(model_state)
best_trained_model.to(device)
if device != "cpu" and gpus_per_trial > 1:
    best_trained_model = nn.DataParallel(best_trained_model)

# NOTE(review): train_fashion validates on test_data as well, so this number
# repeats the best trial's validation accuracy instead of being an
# independent test estimate.
test_acc = test_accuracy(best_trained_model, device)
print("Best trial test set accuracy: {}".format(test_acc))

Best trial test set accuracy: 0.8551
