In [1]:
import argparse
from typing import Dict
from ray.air import session
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import ray.train as train
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

In [2]:
# Fetch both FashionMNIST splits from the open datasets (downloading them into
# ~/data when they are not already there): the training split is built first,
# then the test split. Each split gets its own ToTensor() transform instance.
training_data, test_data = (
    datasets.FashionMNIST(
        root="~/data",
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)


# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier producing 10 raw class scores per sample.

    Each sample is flattened and must contain 28 * 28 = 784 values. It is
    passed through two hidden layers of width 512 with ReLU activations and a
    final linear layer with 10 outputs.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            # No activation after the output layer: the scores are fed to
            # nn.CrossEntropyLoss, which expects unnormalized logits. A
            # trailing ReLU would clamp every negative logit to zero.
            nn.Linear(512, 10),
        )

    def forward(self, x):
        """Return the unnormalized class scores (logits) for the batch ``x``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_epoch(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    The model is switched to training mode, and for every batch the loss is
    computed, back-propagated, and one optimizer step is taken. On every
    100th batch (starting with the first) the batch loss is printed together
    with a progress counter measured against the dataset length divided by
    ``session.get_world_size()``.
    """
    samples_per_worker = len(dataloader.dataset) // session.get_world_size()
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(inputs), targets)

        # Reset stale gradients, back-propagate, update the parameters.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Only every 100th batch is logged.
        if step % 100 != 0:
            continue
        loss_value = batch_loss.item()
        samples_seen = step * len(inputs)
        print(f"loss: {loss_value:>7f}  [{samples_seen:>5d}/{samples_per_worker:>5d}]")


def validate_epoch(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and return the mean per-batch loss.

    The model is put into evaluation mode and gradients are disabled while
    iterating. Accuracy is computed against the number of samples this
    function actually iterated over, so it stays within 0-100% regardless of
    how the loader was sharded or padded (the previous denominator,
    ``len(dataset) // world_size``, could under-count what a worker receives).

    Args:
        dataloader: Iterable of ``(X, y)`` batches that supports ``len()``.
        model: Module mapping ``X`` to per-class scores; ``argmax(1)`` of its
            output is compared against ``y``.
        loss_fn: Callable ``loss_fn(pred, y)`` whose result exposes ``.item()``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct, num_samples = 0.0, 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).sum().item()
            # Count what was really evaluated instead of estimating the
            # shard size from the dataset length.
            num_samples += len(y)
    test_loss /= num_batches
    correct /= num_samples
    print(
        f"Test Error: \n "
        f"Accuracy: {(100 * correct):>0.1f}%, "
        f"Avg loss: {test_loss:>8f} \n"
    )
    return test_loss


def train_func(config: Dict):
    """Training loop executed on every Ray Train worker.

    Builds data loaders over the module-level ``training_data`` and
    ``test_data``, prepares them and a fresh ``NeuralNetwork`` through
    ``train.torch``, then alternates one training epoch and one validation
    epoch, reporting the validation loss through ``session.report`` after
    each epoch.

    Args:
        config: Must contain ``"batch_size"`` (global batch size, split evenly
            across workers), ``"lr"`` (SGD learning rate) and ``"epochs"``
            (number of train/validate rounds).
    """
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    # Split the global batch across workers. Clamp to 1 so that running with
    # more workers than the global batch size does not yield a batch size of
    # 0, which DataLoader rejects.
    worker_batch_size = max(1, batch_size // session.get_world_size())

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=worker_batch_size)
    test_dataloader = DataLoader(test_data, batch_size=worker_batch_size)

    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    # Create model.
    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        session.report(dict(loss=loss))


def train_fashion_mnist(num_workers=3, use_gpu=True):
    """Launch distributed training of ``train_func`` and print the final metrics.

    A ``TorchTrainer`` is configured with a fixed training-loop configuration
    (learning rate 1e-3, global batch size 64, 4 epochs) and a scaling
    configuration built from ``num_workers`` and ``use_gpu``; the metrics of
    the result returned by ``fit()`` are printed.
    """
    loop_config = {"lr": 1e-3, "batch_size": 64, "epochs": 4}
    scaling = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)
    fit_result = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=loop_config,
        scaling_config=scaling,
    ).fit()
    print(f"Last result: {fit_result.metrics}")

In [3]:
if __name__ == "__main__":
    # Entry point: parse options, make sure a Ray connection exists, then
    # start the distributed FashionMNIST training run.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=3,
        help="Sets number of workers for training.",
    )
    parser.add_argument(
        "--use-gpu", action="store_true", default=True, help="Enables GPU training"
    )
    # GPU training stays the default; this flag is the only way to turn it
    # off, because --use-gpu is a store_true option that already defaults to
    # True.
    parser.add_argument(
        "--no-gpu",
        dest="use_gpu",
        action="store_false",
        help="Disables GPU training.",
    )
    # NOTE: --smoke-test is still accepted so existing invocations keep
    # working, but nothing below acts on it.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.",
    )

    # parse_known_args tolerates extra arguments that are not defined above
    # (for example the ones a notebook kernel is launched with).
    args, _ = parser.parse_known_args()

    import ray

    if not ray.is_initialized():
        print("Connecting to Ray cluster...")
        if args.address:
            # An explicit address takes precedence over the environment.
            ray.init(address=args.address)
        else:
            # Fall back to the head-service coordinates from the environment
            # (presumably injected by the cluster deployment -- confirm).
            try:
                service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
                service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
            except KeyError as missing:
                raise RuntimeError(
                    "No Ray address available: pass --address or set "
                    "RAY_HEAD_SERVICE_HOST and RAY_HEAD_SERVICE_PORT "
                    f"(missing {missing})."
                ) from missing
            ray.util.connect(f"{service_host}:{service_port}")

    train_fashion_mnist(num_workers=args.num_workers, use_gpu=args.use_gpu)

Connecting to Ray cluster...




[2m[36m(TunerInternal pid=2109)[0m == Status ==
[2m[36m(TunerInternal pid=2109)[0m Current time: 2023-02-24 16:29:22 (running for 00:00:06.37)
[2m[36m(TunerInternal pid=2109)[0m Memory usage on this node: 5.2/59.9 GiB
[2m[36m(TunerInternal pid=2109)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=2109)[0m Resources requested: 1.0/8 CPUs, 3.0/4 GPUs, 0.0/26.63 GiB heap, 0.0/11.83 GiB objects (0.0/4.0 accelerator_type:V100)
[2m[36m(TunerInternal pid=2109)[0m Result logdir: /home/ray/ray_results/TorchTrainer_2023-02-24_16-29-14
[2m[36m(TunerInternal pid=2109)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(TunerInternal pid=2109)[0m +--------------------------+----------+----------------+
[2m[36m(TunerInternal pid=2109)[0m | Trial name               | status   | loc            |
[2m[36m(TunerInternal pid=2109)[0m |--------------------------+----------+----------------|
[2m[36m(TunerInternal pid=2109)[0m | TorchTrainer_6ef96_00000 | RUNNING  | 10

[2m[36m(RayTrainWorker pid=135, ip=10.0.52.196)[0m 2023-02-24 16:29:24,928	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]


[2m[36m(TunerInternal pid=2109)[0m == Status ==
[2m[36m(TunerInternal pid=2109)[0m Current time: 2023-02-24 16:29:27 (running for 00:00:11.37)
[2m[36m(TunerInternal pid=2109)[0m Memory usage on this node: 6.2/59.9 GiB
[2m[36m(TunerInternal pid=2109)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=2109)[0m Resources requested: 1.0/8 CPUs, 3.0/4 GPUs, 0.0/26.63 GiB heap, 0.0/11.83 GiB objects (0.0/4.0 accelerator_type:V100)
[2m[36m(TunerInternal pid=2109)[0m Result logdir: /home/ray/ray_results/TorchTrainer_2023-02-24_16-29-14
[2m[36m(TunerInternal pid=2109)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(TunerInternal pid=2109)[0m +--------------------------+----------+----------------+
[2m[36m(TunerInternal pid=2109)[0m | Trial name               | status   | loc            |
[2m[36m(TunerInternal pid=2109)[0m |--------------------------+----------+----------------|
[2m[36m(TunerInternal pid=2109)[0m | TorchTrainer_6ef96_00000 | RUNNING  | 10

[2m[36m(RayTrainWorker pid=135, ip=10.0.52.196)[0m 2023-02-24 16:29:29,240	INFO train_loop_utils.py:300 -- Moving model to device: cuda:0
[2m[36m(RayTrainWorker pid=135, ip=10.0.52.196)[0m 2023-02-24 16:29:29,242	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=2208)[0m 2023-02-24 16:29:29,257	INFO train_loop_utils.py:300 -- Moving model to device: cuda:0
[2m[36m(RayTrainWorker pid=2208)[0m 2023-02-24 16:29:29,258	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=246, ip=10.0.53.75)[0m 2023-02-24 16:29:29,380	INFO train_loop_utils.py:300 -- Moving model to device: cuda:0
[2m[36m(RayTrainWorker pid=246, ip=10.0.53.75)[0m 2023-02-24 16:29:29,382	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


[2m[36m(RayTrainWorker pid=246, ip=10.0.53.75)[0m loss: 2.300348  [    0/20000]
[2m[36m(RayTrainWorker pid=135, ip=10.0.52.196)[0m loss: 2.304214  [    0/20000]
[2m[36m(RayTrainWorker pid=2208)[0m loss: 2.296877  [    0/20000]
[2m[36m(RayTrainWorker pid=2208)[0m loss: 2.283382  [ 2100/20000]
[2m[36m(RayTrainWorker pid=246, ip=10.0.53.75)[0m loss: 2.280409  [ 2100/20000]
[2m[36m(RayTrainWorker pid=135, ip=10.0.52.196)[0m loss: 2.293032  [ 2100/20000]
[2m[36m(TunerInternal pid=2109)[0m == Status ==
[2m[36m(TunerInternal pid=2109)[0m Current time: 2023-02-24 16:29:32 (running for 00:00:16.38)
[2m[36m(TunerInternal pid=2109)[0m Memory usage on this node: 7.6/59.9 GiB
[2m[36m(TunerInternal pid=2109)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=2109)[0m Resources requested: 1.0/8 CPUs, 3.0/4 GPUs, 0.0/26.63 GiB heap, 0.0/11.83 GiB objects (0.0/4.0 accelerator_type:V100)
[2m[36m(TunerInternal pid=2109)[0m Result logdir: /home/ray/ray_result

2023-02-25 00:30:39,215	ERROR checkpoint_manager.py:133 -- The requested checkpoint is not available on this node, most likely because you are using Ray client or disabled checkpoint synchronization. To avoid this, enable checkpoint synchronization to cloud storage by specifying a `SyncConfig`. The checkpoint may be available on a different node - please check this location on worker nodes: /home/ray/ray_results/TorchTrainer_2023-02-24_16-29-14/TorchTrainer_6ef96_00000_0_2023-02-24_16-29-17/checkpoint_-00001


[2m[36m(TunerInternal pid=2109)[0m == Status ==
[2m[36m(TunerInternal pid=2109)[0m Current time: 2023-02-24 16:30:39 (running for 00:01:23.13)
[2m[36m(TunerInternal pid=2109)[0m Memory usage on this node: 4.7/59.9 GiB
[2m[36m(TunerInternal pid=2109)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=2109)[0m Resources requested: 0/8 CPUs, 0/4 GPUs, 0.0/26.63 GiB heap, 0.0/11.83 GiB objects (0.0/4.0 accelerator_type:V100)
[2m[36m(TunerInternal pid=2109)[0m Result logdir: /home/ray/ray_results/TorchTrainer_2023-02-24_16-29-14
[2m[36m(TunerInternal pid=2109)[0m Number of trials: 1/1 (1 TERMINATED)
[2m[36m(TunerInternal pid=2109)[0m +--------------------------+------------+----------------+--------+------------------+---------+--------------+---------------------+
[2m[36m(TunerInternal pid=2109)[0m | Trial name               | status     | loc            |   iter |   total time (s) |    loss |   _timestamp |   _time_this_iter_s |
[2m[36m(TunerInterna

[2m[36m(TunerInternal pid=2109)[0m 2023-02-24 16:30:39,200	INFO tune.py:758 -- Total run time: 84.87 seconds (83.12 seconds for the tuning loop).
