In [1]:
import argparse
from typing import Dict
from ray.air import session
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import ray.train as train
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

In [2]:
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="~/data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="~/data",
    train=False,
    download=True,
    transform=ToTensor(),
)


# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_epoch(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset) // session.get_world_size()
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def validate_epoch(dataloader, model, loss_fn):
    size = len(dataloader.dataset) // session.get_world_size()
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(
        f"Test Error: \n "
        f"Accuracy: {(100 * correct):>0.1f}%, "
        f"Avg loss: {test_loss:>8f} \n"
    )
    return test_loss


def train_func(config: Dict):
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    worker_batch_size = batch_size // session.get_world_size()

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=worker_batch_size)
    test_dataloader = DataLoader(test_data, batch_size=worker_batch_size)

    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    # Create model.
    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        session.report(dict(loss=loss))


def train_fashion_mnist(num_workers=3, use_gpu=True):
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]

Extracting /home/ubuntu/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]

Extracting /home/ubuntu/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]

Extracting /home/ubuntu/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/5148 [00:00<?, ?it/s]

Extracting /home/ubuntu/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [3]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=3,
        help="Sets number of workers for training.",
    )
    parser.add_argument(
        "--use-gpu", action="store_true", default=True, help="Enables GPU training"
    )
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.",
    )

    args, _ = parser.parse_known_args()

    import ray
    if ray.is_initialized() == False:
        print("Connecting to Ray cluster...")
        service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
        service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
        ray.util.connect(f"{service_host}:{service_port}")


    # if args.smoke_test:
    #     # 2 workers + 1 for trainer.
    #     ray.init(num_cpus=3)
    #     train_fashion_mnist()
    # else:
    # ray.init(address=args.address)
    train_fashion_mnist(num_workers=args.num_workers, use_gpu=True)

Connecting to Ray cluster...




[2m[36m(TunerInternal pid=209)[0m == Status ==
[2m[36m(TunerInternal pid=209)[0m Current time: 2023-04-05 12:16:23 (running for 00:00:03.78)
[2m[36m(TunerInternal pid=209)[0m Memory usage on this node: 3.9/31.0 GiB
[2m[36m(TunerInternal pid=209)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=209)[0m Resources requested: 1.0/8 CPUs, 3.0/4 GPUs, 0.0/26.62 GiB heap, 0.0/11.82 GiB objects (0.0/4.0 accelerator_type:A10G)
[2m[36m(TunerInternal pid=209)[0m Result logdir: /home/ray/ray_results/TorchTrainer_2023-04-05_12-16-18
[2m[36m(TunerInternal pid=209)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(TunerInternal pid=209)[0m +--------------------------+----------+---------------+
[2m[36m(TunerInternal pid=209)[0m | Trial name               | status   | loc           |
[2m[36m(TunerInternal pid=209)[0m |--------------------------+----------+---------------|
[2m[36m(TunerInternal pid=209)[0m | TorchTrainer_580a4_00000 | RUNNING  | 10.0.62.72:81 |


[2m[36m(RayTrainWorker pid=299)[0m 2023-04-05 12:16:24,988	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=110, ip=10.0.62.72)[0m 2023-04-05 12:16:27,732	INFO train_loop_utils.py:300 -- Moving model to device: cuda:0
[2m[36m(RayTrainWorker pid=110, ip=10.0.62.72)[0m 2023-04-05 12:16:27,733	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=80, ip=10.0.38.132)[0m 2023-04-05 12:16:27,748	INFO train_loop_utils.py:300 -- Moving model to device: cuda:0
[2m[36m(RayTrainWorker pid=80, ip=10.0.38.132)[0m 2023-04-05 12:16:27,749	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=299)[0m 2023-04-05 12:16:27,793	INFO train_loop_utils.py:300 -- Moving model to device: cuda:0
[2m[36m(RayTrainWorker pid=299)[0m 2023-04-05 12:16:27,794	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


[2m[36m(TunerInternal pid=209)[0m == Status ==
[2m[36m(TunerInternal pid=209)[0m Current time: 2023-04-05 12:16:28 (running for 00:00:08.78)
[2m[36m(TunerInternal pid=209)[0m Memory usage on this node: 7.3/31.0 GiB
[2m[36m(TunerInternal pid=209)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=209)[0m Resources requested: 1.0/8 CPUs, 3.0/4 GPUs, 0.0/26.62 GiB heap, 0.0/11.82 GiB objects (0.0/4.0 accelerator_type:A10G)
[2m[36m(TunerInternal pid=209)[0m Result logdir: /home/ray/ray_results/TorchTrainer_2023-04-05_12-16-18
[2m[36m(TunerInternal pid=209)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(TunerInternal pid=209)[0m +--------------------------+----------+---------------+
[2m[36m(TunerInternal pid=209)[0m | Trial name               | status   | loc           |
[2m[36m(TunerInternal pid=209)[0m |--------------------------+----------+---------------|
[2m[36m(TunerInternal pid=209)[0m | TorchTrainer_580a4_00000 | RUNNING  | 10.0.62.72:81 |




[2m[36m(TunerInternal pid=209)[0m Result for TorchTrainer_580a4_00000:
[2m[36m(TunerInternal pid=209)[0m   _time_this_iter_s: 12.591781377792358
[2m[36m(TunerInternal pid=209)[0m   _timestamp: 1680722239
[2m[36m(TunerInternal pid=209)[0m   _training_iteration: 4
[2m[36m(TunerInternal pid=209)[0m   date: 2023-04-05_12-17-19
[2m[36m(TunerInternal pid=209)[0m   done: true
[2m[36m(TunerInternal pid=209)[0m   experiment_id: 33b15b3e374d40bf9191d7f3bcd739a4
[2m[36m(TunerInternal pid=209)[0m   experiment_tag: '0'
[2m[36m(TunerInternal pid=209)[0m   hostname: ray-642dbfaf53943c280b964b09-ray-worker-1
[2m[36m(TunerInternal pid=209)[0m   iterations_since_restore: 4
[2m[36m(TunerInternal pid=209)[0m   loss: 1.7373216264652755
[2m[36m(TunerInternal pid=209)[0m   node_ip: 10.0.62.72
[2m[36m(TunerInternal pid=209)[0m   pid: 81
[2m[36m(TunerInternal pid=209)[0m   time_since_restore: 55.84689283370972
[2m[36m(TunerInternal pid=209)[0m   time_this_iter_s: 12

[2m[36m(TunerInternal pid=209)[0m 2023-04-05 12:17:23,257	INFO tune.py:758 -- Total run time: 64.67 seconds (63.70 seconds for the tuning loop).
2023-04-05 19:17:23,269	ERROR checkpoint_manager.py:133 -- The requested checkpoint is not available on this node, most likely because you are using Ray client or disabled checkpoint synchronization. To avoid this, enable checkpoint synchronization to cloud storage by specifying a `SyncConfig`. The checkpoint may be available on a different node - please check this location on worker nodes: /home/ray/ray_results/TorchTrainer_2023-04-05_12-16-18/TorchTrainer_580a4_00000_0_2023-04-05_12-16-20/checkpoint_-00001


[2m[36m(TunerInternal pid=209)[0m == Status ==
[2m[36m(TunerInternal pid=209)[0m Current time: 2023-04-05 12:17:23 (running for 00:01:03.70)
[2m[36m(TunerInternal pid=209)[0m Memory usage on this node: 4.1/31.0 GiB
[2m[36m(TunerInternal pid=209)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=209)[0m Resources requested: 0/8 CPUs, 0/4 GPUs, 0.0/26.62 GiB heap, 0.0/11.82 GiB objects (0.0/4.0 accelerator_type:A10G)
[2m[36m(TunerInternal pid=209)[0m Result logdir: /home/ray/ray_results/TorchTrainer_2023-04-05_12-16-18
[2m[36m(TunerInternal pid=209)[0m Number of trials: 1/1 (1 TERMINATED)
[2m[36m(TunerInternal pid=209)[0m +--------------------------+------------+---------------+--------+------------------+---------+--------------+---------------------+
[2m[36m(TunerInternal pid=209)[0m | Trial name               | status     | loc           |   iter |   total time (s) |    loss |   _timestamp |   _time_this_iter_s |
[2m[36m(TunerInternal pid=209)