In [1]:
import argparse
from typing import Dict
from ray.air import session
import os

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import ray.train as train
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

In [2]:
# # Download training data from open datasets.
# training_data = datasets.FashionMNIST(
#     root="~/data",
#     train=True,
#     download=True,
#     transform=ToTensor(),
# )

# # Download test data from open datasets.
# test_data = datasets.FashionMNIST(
#     root="~/data",
#     train=False,
#     download=True,
#     transform=ToTensor(),
# )
def load_data(root="data"):
    """Download (if necessary) and return the FashionMNIST train and test splits.

    Args:
        root: Directory handed to ``datasets.FashionMNIST`` as its ``root``
            argument. Defaults to ``"data"``, the location that used to be
            hard-coded here, so existing callers are unaffected.

    Returns:
        A ``(training_data, test_data)`` tuple. Both datasets are created with
        ``download=True`` and ``ToTensor()`` as their transform.
    """

    def _split(train):
        # Both splits share every setting except the ``train`` flag.
        return datasets.FashionMNIST(
            root=root,
            train=train,
            download=True,
            transform=ToTensor(),
        )

    training_data = _split(True)
    test_data = _split(False)
    return training_data, test_data


# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier producing 10 raw class scores per sample.

    Each sample is flattened to ``28 * 28`` features, passed through two
    512-unit hidden layers with ReLU activations, and projected to 10 outputs
    by a final linear layer.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            # No activation after the output layer: the scores are fed to
            # nn.CrossEntropyLoss, which expects unnormalized logits. A ReLU
            # here would clamp every negative logit to zero.
            nn.Linear(512, 10),
        )

    def forward(self, x):
        """Return the unnormalized class scores (logits) for the batch ``x``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_epoch(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader``, updating ``model`` in place.

    The loss and a running sample count are printed on every 100th batch
    (starting with batch 0).
    """
    # Dataset length divided by the number of workers; only used as the
    # denominator in the progress message.
    samples_per_worker = len(dataloader.dataset) // session.get_world_size()
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(inputs), targets)

        # Clear stale gradients, backpropagate, then apply the update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % 100:
            continue
        loss_value = batch_loss.item()
        seen = step * len(inputs)
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{samples_per_worker:>5d}]")


def validate_epoch(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and return the mean per-batch loss.

    Accuracy is reported relative to the dataset length divided by the number
    of workers. A summary line with accuracy and average loss is printed.
    """
    shard_size = len(dataloader.dataset) // session.get_world_size()
    batch_count = len(dataloader)
    model.eval()

    loss_total = 0
    hits = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss_total += loss_fn(outputs, targets).item()
            # Count samples whose highest-scoring class matches the label.
            matches = outputs.argmax(1) == targets
            hits += matches.type(torch.float).sum().item()

    mean_loss = loss_total / batch_count
    accuracy = hits / shard_size
    print(
        f"Test Error: \n "
        f"Accuracy: {(100 * accuracy):>0.1f}%, "
        f"Avg loss: {mean_loss:>8f} \n"
    )
    return mean_loss


def train_func(config: Dict):
    """Training loop executed on each worker.

    Args:
        config: Must contain ``"batch_size"``, ``"lr"`` and ``"epochs"``.
            ``"batch_size"`` is divided evenly (floor) across the workers.

    After every epoch the validation loss is reported via ``session.report``
    under the key ``"loss"``.
    """
    global_batch_size = config["batch_size"]
    learning_rate = config["lr"]
    num_epochs = config["epochs"]

    # Each worker handles an equal slice of the requested batch size.
    per_worker_batch = global_batch_size // session.get_world_size()

    # Build the loaders and hand them to Ray Train for preparation.
    train_set, test_set = load_data()
    train_loader = train.torch.prepare_data_loader(
        DataLoader(train_set, batch_size=per_worker_batch)
    )
    test_loader = train.torch.prepare_data_loader(
        DataLoader(test_set, batch_size=per_worker_batch)
    )

    # Model, loss and optimizer.
    model = train.torch.prepare_model(NeuralNetwork())
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    for _epoch in range(num_epochs):
        train_epoch(train_loader, model, loss_fn, optimizer)
        val_loss = validate_epoch(test_loader, model, loss_fn)
        session.report({"loss": val_loss})


def train_fashion_mnist(num_workers=3, use_gpu=True):
    """Fit a ``TorchTrainer`` running ``train_func`` and print its final metrics.

    Args:
        num_workers: Number of workers passed to ``ScalingConfig``.
        use_gpu: Forwarded to ``ScalingConfig(use_gpu=...)``.
    """
    loop_config = {"lr": 1e-3, "batch_size": 64, "epochs": 4}
    scaling = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=loop_config,
        scaling_config=scaling,
    )
    fit_result = trainer.fit()
    print(f"Last result: {fit_result.metrics}")

In [6]:
if __name__ == "__main__":
    # Entry point: parse options, connect to the Ray cluster if this process
    # is not already connected, then launch distributed training.
    import ray

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address", required=False, type=str, help="the address to use for Ray"
    )
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=2,
        help="Sets number of workers for training.",
    )
    parser.add_argument(
        "--use-gpu", action="store_true", default=True, help="Enables GPU training"
    )
    # GPU training stays the default; this flag is the only way to turn it
    # off, since --use-gpu on its own can never produce False.
    parser.add_argument(
        "--no-gpu",
        dest="use_gpu",
        action="store_false",
        help="Disables GPU training.",
    )
    # NOTE: --smoke-test is still accepted for compatibility but is not
    # currently acted upon.
    parser.add_argument(
        "--smoke-test",
        action="store_true",
        default=False,
        help="Finish quickly for testing.",
    )

    # parse_known_args tolerates extra argv entries (e.g. those injected by a
    # notebook kernel).
    args, _ = parser.parse_known_args()

    if not ray.is_initialized():
        print("Connecting to Ray cluster...")
        if args.address:
            # An explicit --address takes precedence over the environment.
            address = args.address
        else:
            # Raises KeyError if the head-service variables are not set.
            service_host = os.environ["RAY_HEAD_SERVICE_HOST"]
            service_port = os.environ["RAY_HEAD_SERVICE_PORT"]
            address = f"{service_host}:{service_port}"
        ray.util.connect(address)

    train_fashion_mnist(num_workers=args.num_workers, use_gpu=args.use_gpu)

[2m[36m(TunerInternal pid=2185)[0m == Status ==
[2m[36m(TunerInternal pid=2185)[0m Current time: 2023-04-05 13:58:25 (running for 00:00:02.43)
[2m[36m(TunerInternal pid=2185)[0m Memory usage on this node: 3.7/31.0 GiB
[2m[36m(TunerInternal pid=2185)[0m Using FIFO scheduling algorithm.
[2m[36m(TunerInternal pid=2185)[0m Resources requested: 1.0/8 CPUs, 2.0/4 GPUs, 0.0/26.63 GiB heap, 0.0/11.83 GiB objects (0.0/4.0 accelerator_type:A10G)
[2m[36m(TunerInternal pid=2185)[0m Result logdir: /home/ray/ray_results/TorchTrainer_2023-04-05_13-58-23
[2m[36m(TunerInternal pid=2185)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(TunerInternal pid=2185)[0m +--------------------------+----------+----------------+
[2m[36m(TunerInternal pid=2185)[0m | Trial name               | status   | loc            |
[2m[36m(TunerInternal pid=2185)[0m |--------------------------+----------+----------------|
[2m[36m(TunerInternal pid=2185)[0m | TorchTrainer_9a55c_00000 | RUNNING  | 10

[2m[36m(RayTrainWorker pid=790, ip=10.0.40.106)[0m 2023-04-05 13:58:27,507	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=2]
[2m[36m(TorchTrainer pid=336, ip=10.0.57.80)[0m 2023-04-05 13:58:27,501	ERROR function_trainable.py:298 -- Runner Thread raised error.
[2m[36m(TorchTrainer pid=336, ip=10.0.57.80)[0m Traceback (most recent call last):
[2m[36m(TorchTrainer pid=336, ip=10.0.57.80)[0m   File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
[2m[36m(TorchTrainer pid=336, ip=10.0.57.80)[0m     self._entrypoint()
[2m[36m(TorchTrainer pid=336, ip=10.0.57.80)[0m   File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
[2m[36m(TorchTrainer pid=336, ip=10.0.57.80)[0m     return self._trainable_func(
[2m[36m(TorchTrainer pid=336, ip=10.0.57.80)[0m   File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/util/tr

In [2]:
# Print the TorchTrainer class documentation (interactive API lookup).
help(TorchTrainer)

Help on class TorchTrainer in module ray.train.torch.torch_trainer:

class TorchTrainer(ray.train.data_parallel_trainer.DataParallelTrainer)
 |  TorchTrainer(*args, **kwargs)
 |  
 |  A Trainer for data parallel PyTorch training.
 |  
 |  This Trainer runs the function ``train_loop_per_worker`` on multiple Ray
 |  Actors. These actors already have the necessary torch process group already
 |  configured for distributed PyTorch training.
 |  
 |  The ``train_loop_per_worker`` function is expected to take in either 0 or 1
 |  arguments:
 |  
 |  .. code-block:: python
 |  
 |      def train_loop_per_worker():
 |          ...
 |  
 |  .. code-block:: python
 |  
 |      def train_loop_per_worker(config: Dict):
 |          ...
 |  
 |  If ``train_loop_per_worker`` accepts an argument, then
 |  ``train_loop_config`` will be passed in as the argument. This is useful if you
 |  want to tune the values in ``train_loop_config`` as hyperparameters.
 |  
 |  If the ``datasets`` dict contains a tr

In [None]:

Parameters

        train_loop_per_worker – The training function to execute. This can either take in no arguments or a config dict.

        train_loop_config – Configurations to pass into train_loop_per_worker if it accepts an argument.

        torch_config – Configuration for setting up the PyTorch backend. If set to None, use the default configuration. This replaces the backend_config arg of DataParallelTrainer.

        scaling_config – Configuration for how to scale data parallel training.

        dataset_config – Configuration for dataset ingest.

        run_config – Configuration for the execution of the training run.

        datasets – Any Ray Datasets to use for training. Use the key “train” to denote which dataset is the training dataset. If a preprocessor is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the preprocessor if one is provided.

        preprocessor – A ray.data.Preprocessor to preprocess the provided datasets.

        resume_from_checkpoint – A checkpoint to resume training from.

