In [11]:
import os 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision.datasets import CIFAR10
from torch.utils.data import random_split
from torchvision import transforms
from torch.utils.data import DataLoader
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from functools import partial

In [12]:
!pip install -U "ray[tune]"
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler



Reference: Ray Tune Documentation
https://docs.ray.io/en/latest/tune/tutorials/tune-pytorch-cifar.html

Resnet Setup

In [3]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def project1_model():
    return ResNet(BasicBlock, [2, 2, 2, 2])

def test(net):
    totparam = 0
    for x in filter(lambda p: p.requires_grad, net.parameters()):
        totparam += np.prod(x.data.numpy().shape)
    print("# of params", totparam)
    print("# of layers", len(list(filter(lambda p: p.requires_grad and len(p.data.size())>1, net.parameters()))))

test(project1_model())

# of params 11173962
# of layers 21


Load Data

In [4]:
ROOT = '.data'

def load_data(data_dir=ROOT):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(30, 2),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    train_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform)

    test_set = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform)

    return train_set, test_set

In [5]:
load_data(ROOT)

Files already downloaded and verified
Files already downloaded and verified


(Dataset CIFAR10
     Number of datapoints: 50000
     Root location: .data
     Split: Train
     StandardTransform
 Transform: Compose(
                ToTensor()
                RandomHorizontalFlip(p=0.5)
                RandomCrop(size=(32, 32), padding=2)
                Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
            ), Dataset CIFAR10
     Number of datapoints: 10000
     Root location: .data
     Split: Test
     StandardTransform
 Transform: Compose(
                ToTensor()
                RandomHorizontalFlip(p=0.5)
                RandomCrop(size=(32, 32), padding=2)
                Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
            ))

In [13]:
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    net = project1_model()

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, testset = load_data(ROOT)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)

    for epoch in range(10): 
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 5000 == 4999:  # print every 5000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                # reset running_loss
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [14]:
def test_accuracy(net, device="cpu"):
    trainset, testset = load_data(ROOT)

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [None]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    load_data(ROOT)
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_cifar, data_dir=ROOT),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = project1_model()
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)

Files already downloaded and verified
Files already downloaded and verified


2022-03-25 02:52:10,205	INFO registry.py:70 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Current time: 2022-03-25 02:52:10 (running for 00:00:00.22)
Memory usage on this node: 1.7/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.68 GiB heap, 0.0/3.34 GiB objects
Result logdir: /root/ray_results/DEFAULT_2022-03-25_02-52-10
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+------+------+-------------+--------------+
| Trial name          | status   | loc             |   l1 |   l2 |          lr |   batch_size |
|---------------------+----------+-----------------+------+------+-------------+--------------|
| DEFAULT_90d48_00000 | RUNNING  | 172.28.0.2:4190 |  256 |    8 | 0.00237941  |           16 |
| DEFAULT_90d48_00001 | PENDING  |                 |  256 |  128 | 0.0985469   |            4 |
| DEFAULT_90d48_00002 | PENDING  |                 |   16 |  128 | 0.000290688 |            8 |
|

[2m[36m(func pid=4190)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 1024/170498071 [00:00<7:49:08, 6057.06it/s]
  0%|          | 33792/170498071 [00:00<24:40, 115143.70it/s]
  0%|          | 82944/170498071 [00:00<14:45, 192502.21it/s]
  0%|          | 214016/170498071 [00:00<06:48, 417249.43it/s]
  0%|          | 443392/170498071 [00:00<03:47, 746936.24it/s]
  1%|          | 902144/170498071 [00:01<02:01, 1398017.63it/s]
  1%|          | 1819648/170498071 [00:01<01:02, 2680910.47it/s]
  2%|▏         | 3671040/170498071 [00:01<00:31, 5246472.57it/s]
  4%|▎         | 6374400/170498071 [00:01<00:19, 8505414.87it/s]
  5%|▌         | 8586240/170498071 [00:01<00:16, 9843347.24it/s]


== Status ==
Current time: 2022-03-25 02:52:15 (running for 00:00:05.27)
Memory usage on this node: 2.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.68 GiB heap, 0.0/3.34 GiB objects
Result logdir: /root/ray_results/DEFAULT_2022-03-25_02-52-10
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+------+------+-------------+--------------+
| Trial name          | status   | loc             |   l1 |   l2 |          lr |   batch_size |
|---------------------+----------+-----------------+------+------+-------------+--------------|
| DEFAULT_90d48_00000 | RUNNING  | 172.28.0.2:4190 |  256 |    8 | 0.00237941  |           16 |
| DEFAULT_90d48_00001 | PENDING  |                 |  256 |  128 | 0.0985469   |            4 |
| DEFAULT_90d48_00002 | PENDING  |                 |   16 |  128 | 0.000290688 |            8 |
|

  6%|▋         | 10880000/170498071 [00:01<00:14, 10894105.63it/s]
  8%|▊         | 13386752/170498071 [00:02<00:13, 11990540.47it/s]
  9%|▉         | 16188416/170498071 [00:02<00:11, 13259111.82it/s]
 11%|█▏        | 19235840/170498071 [00:02<00:10, 14591158.48it/s]
 13%|█▎        | 21578752/170498071 [00:02<00:10, 14287796.12it/s]
 14%|█▍        | 24544256/170498071 [00:02<00:09, 15163933.16it/s]
 16%|█▌        | 27264000/170498071 [00:02<00:09, 15341808.30it/s]
 18%|█▊        | 30229504/170498071 [00:03<00:08, 15898006.79it/s]
 19%|█▉        | 33113088/170498071 [00:03<00:08, 16143770.61it/s]
 21%|██        | 36111360/170498071 [00:03<00:08, 16512049.45it/s]
 23%|██▎       | 39109632/170498071 [00:03<00:07, 16685381.25it/s]
 25%|██▍       | 42058752/170498071 [00:03<00:07, 16714203.71it/s]
 26%|██▋       | 45057024/170498071 [00:03<00:07, 17003770.71it/s]
 28%|██▊       | 47858688/170498071 [00:04<00:07, 16760295.50it/s]
 30%|██▉       | 50594816/170498071 [00:04<00:07, 16484377.53i

== Status ==
Current time: 2022-03-25 02:52:21 (running for 00:00:11.26)
Memory usage on this node: 2.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.68 GiB heap, 0.0/3.34 GiB objects
Result logdir: /root/ray_results/DEFAULT_2022-03-25_02-52-10
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+------+------+-------------+--------------+
| Trial name          | status   | loc             |   l1 |   l2 |          lr |   batch_size |
|---------------------+----------+-----------------+------+------+-------------+--------------|
| DEFAULT_90d48_00000 | RUNNING  | 172.28.0.2:4190 |  256 |    8 | 0.00237941  |           16 |
| DEFAULT_90d48_00001 | PENDING  |                 |  256 |  128 | 0.0985469   |            4 |
| DEFAULT_90d48_00002 | PENDING  |                 |   16 |  128 | 0.000290688 |            8 |
|

 66%|██████▌   | 112149504/170498071 [00:07<00:03, 17119187.39it/s]
 67%|██████▋   | 114721792/170498071 [00:08<00:03, 16449022.85it/s]
 69%|██████▉   | 117474304/170498071 [00:08<00:03, 16305311.92it/s]
 71%|███████   | 120407040/170498071 [00:08<00:03, 16509322.09it/s]
 72%|███████▏  | 123356160/170498071 [00:08<00:02, 16687326.18it/s]
 74%|███████▍  | 126387200/170498071 [00:08<00:02, 16947127.14it/s]
 76%|███████▌  | 129254400/170498071 [00:08<00:02, 16815240.71it/s]
 78%|███████▊  | 132318208/170498071 [00:09<00:02, 17029459.79it/s]
 79%|███████▉  | 135382016/170498071 [00:09<00:02, 17239437.88it/s]
 81%|████████  | 138511360/170498071 [00:09<00:01, 17511498.46it/s]
 83%|████████▎ | 141591552/170498071 [00:09<00:01, 17611233.76it/s]
 85%|████████▍ | 144655360/170498071 [00:09<00:01, 17661236.61it/s]
 87%|████████▋ | 147801088/170498071 [00:10<00:01, 17836198.24it/s]
 88%|████████▊ | 150766592/170498071 [00:10<00:01, 17628747.43it/s]
 90%|█████████ | 153781248/170498071 [00:10<00:0

[2m[36m(func pid=4190)[0m Extracting .data/cifar-10-python.tar.gz to .data
== Status ==
Current time: 2022-03-25 02:52:26 (running for 00:00:16.32)
Memory usage on this node: 2.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.68 GiB heap, 0.0/3.34 GiB objects
Result logdir: /root/ray_results/DEFAULT_2022-03-25_02-52-10
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+------+------+-------------+--------------+
| Trial name          | status   | loc             |   l1 |   l2 |          lr |   batch_size |
|---------------------+----------+-----------------+------+------+-------------+--------------|
| DEFAULT_90d48_00000 | RUNNING  | 172.28.0.2:4190 |  256 |    8 | 0.00237941  |           16 |
| DEFAULT_90d48_00001 | PENDING  |                 |  256 |  128 | 0.0985469   |            4 |
| DEFAULT_90d48_000

[2m[36m(func pid=4190)[0m   cpuset_checked))


== Status ==
Current time: 2022-03-25 02:52:31 (running for 00:00:21.35)
Memory usage on this node: 2.4/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/6.68 GiB heap, 0.0/3.34 GiB objects
Result logdir: /root/ray_results/DEFAULT_2022-03-25_02-52-10
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+------+------+-------------+--------------+
| Trial name          | status   | loc             |   l1 |   l2 |          lr |   batch_size |
|---------------------+----------+-----------------+------+------+-------------+--------------|
| DEFAULT_90d48_00000 | RUNNING  | 172.28.0.2:4190 |  256 |    8 | 0.00237941  |           16 |
| DEFAULT_90d48_00001 | PENDING  |                 |  256 |  128 | 0.0985469   |            4 |
| DEFAULT_90d48_00002 | PENDING  |                 |   16 |  128 | 0.000290688 |            8 |
|

In [None]:
## save model
model_path = './project1_model.pt'
torch.save(project1_model.state_dict(), model_path)

In [None]:
#And the saved file can be loaded with the following code.
## read model file

import torch
from project1_model import project1_model
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = project1_model().to(device)
model_path = './project1_model.pt'
model.load_state_dict(torch.load(model_path, map_location=device), strict=False)