In [6]:
import ray
import numpy as np

In [2]:
ray.init()

2023-10-17 06:50:46,349	INFO worker.py:1458 -- Connecting to existing Ray cluster at address: 192.168.11.179:6379...
2023-10-17 06:50:46,362	INFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at [1m[32m192.168.11.179:8265 [39m[22m


0,1
Python version:,3.9.18
Ray version:,2.7.1
Dashboard:,http://192.168.11.179:8265


In [3]:
@ray.remote(num_gpus=2)
def f():
    print(ray.get_gpu_ids())

In [4]:
result = f.remote()
result

ObjectRef(c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000)

[2m[36m(f pid=1312)[0m [0, 1]


In [None]:
ray.get(result)

In [9]:
@ray.remote
def add(a, b):
    return np.add(a, b)

In [19]:
@ray.remote
def create_array():
    return np.ones(3)

In [20]:
a1 = create_array.remote()
a2 = create_array.remote()
sum = add.remote(a1, a2)
sum

ObjectRef(c54e76759b2a0c10ffffffffffffffffffffffff0100000001000000)

In [22]:
ray.get(sum)

array([2., 2., 2.])

In [9]:
import os
from filelock import FileLock
from typing import Dict

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor, Normalize
from tqdm import tqdm

import ray.train
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer


def get_dataloaders(batch_size):
    # Transform to normalize the input images
    transform = transforms.Compose([ToTensor(), Normalize((0.5,), (0.5,))])

    with FileLock(os.path.expanduser("~/data.lock")):
        # Download training data from open datasets.
        training_data = datasets.FashionMNIST(
            root="~/data",
            train=True,
            download=True,
            transform=transform,
        )

        # Download test data from open datasets.
        test_data = datasets.FashionMNIST(
            root="~/data",
            train=False,
            download=True,
            transform=transform,
        )

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    return train_dataloader, test_dataloader


# Model Definition
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(512, 10),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


def train_func_per_worker(config: Dict):
    lr = config["lr"]
    epochs = config["epochs"]
    batch_size = config["batch_size_per_worker"]

    # Get dataloaders inside worker training function
    train_dataloader, test_dataloader = get_dataloaders(batch_size=batch_size)

    # [1] Prepare Dataloader for distributed training
    # Shard the datasets among workers and move batches to the correct device
    # =======================================================================
    train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = ray.train.torch.prepare_data_loader(test_dataloader)

    model = NeuralNetwork()

    # [2] Prepare and wrap your model with DistributedDataParallel
    # Move the model the correct GPU/CPU device
    # ============================================================
    model = ray.train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    # Model training loop
    for epoch in range(epochs):
        model.train()
        for X, y in tqdm(train_dataloader, desc=f"Train Epoch {epoch}"):
            pred = model(X)
            loss = loss_fn(pred, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        test_loss, num_correct, num_total = 0, 0, 0
        with torch.no_grad():
            for X, y in tqdm(test_dataloader, desc=f"Test Epoch {epoch}"):
                pred = model(X)
                loss = loss_fn(pred, y)

                test_loss += loss.item()
                num_total += y.shape[0]
                num_correct += (pred.argmax(1) == y).sum().item()

        test_loss /= len(test_dataloader)
        accuracy = num_correct / num_total

        # [3] Report metrics to Ray Train
        # ===============================
        ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy})


def train_fashion_mnist(num_workers=2, use_gpu=False, epochs=10):
    global_batch_size = 32

    train_config = {
        "lr": 1e-3,
        "epochs": epochs,
        "batch_size_per_worker": global_batch_size // num_workers,
    }

    # Configure computation resources
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    # Initialize a Ray TorchTrainer
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        scaling_config=scaling_config,
    )

    # [4] Start Distributed Training
    # Run `train_func_per_worker` on all workers
    # =============================================
    result = trainer.fit()
    print(f"Training result: {result}")

In [10]:
train_fashion_mnist(num_workers=1, use_gpu=True, epochs=5)

0,1
Current time:,2023-10-16 19:27:37
Running for:,00:01:56.65
Memory:,3.0/47.1 GiB

Trial name,status,loc,iter,total time (s),loss,accuracy
TorchTrainer_cab2a_00000,TERMINATED,192.168.10.82:21207,5,110.78,0.401416,0.8538


[2m[36m(TorchTrainer pid=21207)[0m Starting distributed worker processes: ['21242 (192.168.10.82)']
[2m[36m(RayTrainWorker pid=21242)[0m Setting up process group for: env:// [rank=0, world_size=1]


[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]
  0%|          | 98304/26421880 [00:00<00:34, 765638.23it/s]
  2%|▏         | 458752/26421880 [00:00<00:13, 1936193.11it/s]
  3%|▎         | 753664/26421880 [00:00<00:12, 2081477.48it/s]
  4%|▍         | 1081344/26421880 [00:00<00:11, 2239969.28it/s]
  5%|▌         | 1376256/26421880 [00:00<00:11, 2248206.78it/s]
  6%|▋         | 1703936/26421880 [00:00<00:10, 2333113.76it/s]
  8%|▊         | 1998848/26421880 [00:00<00:10, 2303965.48it/s]
  9%|▉         | 2326528/26421880 [00:01<00:10, 2365731.96it/s]
 10%|█         | 2654208/26421880 [00:01<00:09, 2407427.94it/s]
 11%|█▏        | 3014656/26421880 [00:01<00:09, 2511157.95it/s]
 13%|█▎        | 3342336/26421880 [00:01<00:09, 2514126.92it/s]
 14%|█▍        | 3702784/26421880 [00:01<00:08, 2585116.92it/s]
 15%|█▌        | 4063232/26421880 [00:01<00:08, 2633430.95it/s]
 17%|█▋        | 4456448/26421880 [00:01<00:08, 2742713.64it/s]
 18%|█▊        | 4816896/26421880 [00:01<00:07, 2747746.23it/s]


[2m[36m(RayTrainWorker pid=21242)[0m Extracting /home/ubuntu/data/FashionMNIST/raw/train-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=21242)[0m 
[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz
[2m[36m(RayTrainWorker pid=21242)[0m Extracting /home/ubuntu/data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=21242)[0m 
[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 697471.32it/s]


[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]
  1%|▏         | 65536/4422102 [00:00<00:12, 352273.44it/s]
  3%|▎         | 131072/4422102 [00:00<00:21, 203975.10it/s]
  4%|▎         | 163840/4422102 [00:00<00:21, 193599.04it/s]
  4%|▍         | 196608/4422102 [00:00<00:22, 188724.39it/s]
  5%|▌         | 229376/4422102 [00:01<00:21, 196892.91it/s]
  6%|▌         | 262144/4422102 [00:01<00:21, 193181.52it/s]
  7%|▋         | 294912/4422102 [00:01<00:20, 203640.52it/s]
  7%|▋         | 327680/4422102 [00:01<00:19, 208021.40it/s]
  8%|▊         | 360448/4422102 [00:01<00:18, 215243.57it/s]
  9%|▉         | 393216/4422102 [00:01<00:18, 220799.47it/s]
 10%|▉         | 425984/4422102 [00:02<00:19, 210117.79it/s]
 10%|█         | 458752/4422102 [00:02<00:21, 183245.87it/s]
 11%|█         | 491520/4422102 [00:02<00:22, 177907.92it/s]
 12%|█▏        | 524288/4422102 [00:02<00:20, 190921.14it/s]
 13%|█▎        | 557056/4422102 [00:02<00:19, 200829.91it/s]
 13%|█▎        | 589824/4422102 [00:02<00:

[2m[36m(RayTrainWorker pid=21242)[0m Extracting /home/ubuntu/data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=21242)[0m 
[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz


100%|██████████| 4422102/4422102 [00:12<00:00, 354881.43it/s]


[2m[36m(RayTrainWorker pid=21242)[0m Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
[2m[36m(RayTrainWorker pid=21242)[0m Extracting /home/ubuntu/data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to /home/ubuntu/data/FashionMNIST/raw
[2m[36m(RayTrainWorker pid=21242)[0m 


100%|██████████| 5148/5148 [00:00<00:00, 28675002.65it/s]
[2m[36m(RayTrainWorker pid=21242)[0m Moving model to device: cuda:0
Train Epoch 0:   0%|          | 0/1875 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 1/1875 [00:00<06:06,  5.11it/s]
Train Epoch 0:   1%|          | 11/1875 [00:00<00:41, 45.00it/s]
Train Epoch 0:   1%|▏         | 24/1875 [00:00<00:24, 74.85it/s]
Train Epoch 0:   2%|▏         | 38/1875 [00:00<00:19, 95.11it/s]
Train Epoch 0:   3%|▎         | 52/1875 [00:00<00:16, 107.69it/s]
Train Epoch 0:   4%|▎         | 66/1875 [00:00<00:15, 115.82it/s]
Train Epoch 0:   4%|▍         | 80/1875 [00:00<00:14, 120.98it/s]
Train Epoch 0:   5%|▌         | 94/1875 [00:00<00:14, 124.43it/s]
Train Epoch 0:   6%|▌         | 108/1875 [00:01<00:13, 126.34it/s]
Train Epoch 0:   7%|▋         | 122/1875 [00:01<00:13, 128.29it/s]
Train Epoch 0:   7%|▋         | 136/1875 [00:01<00:13, 129.75it/s]
Train Epoch 0:   8%|▊         | 150/1875 [00:01<00:13, 130.31it/s]
Train Epoch 0:   9%|▊   

Training result: Result(
  metrics={'loss': 0.4014163756856141, 'accuracy': 0.8538},
  path='/home/ubuntu/ray_results/TorchTrainer_2023-10-16_19-25-40/TorchTrainer_cab2a_00000_0_2023-10-16_19-25-40',
  filesystem='local',
  checkpoint=None
)


In [11]:
train_fashion_mnist(num_workers=2, use_gpu=True, epochs=5)

0,1
Current time:,2023-10-16 19:28:46
Running for:,00:01:09.53
Memory:,4.1/47.1 GiB

Trial name,status,loc,iter,total time (s),loss,accuracy
TorchTrainer_10431_00000,TERMINATED,192.168.10.82:21297,5,63.1642,0.393816,0.8572


[2m[36m(TorchTrainer pid=21297)[0m Starting distributed worker processes: ['21331 (192.168.10.82)', '21332 (192.168.10.82)']
[2m[36m(RayTrainWorker pid=21331)[0m Setting up process group for: env:// [rank=0, world_size=2]
[2m[36m(RayTrainWorker pid=21331)[0m Moving model to device: cuda:0
[2m[36m(RayTrainWorker pid=21331)[0m Wrapping provided model in DistributedDataParallel.
Train Epoch 0:   0%|          | 0/1875 [00:00<?, ?it/s]
Train Epoch 0:   0%|          | 1/1875 [00:00<06:35,  4.74it/s]
Train Epoch 0:   0%|          | 0/1875 [00:00<?, ?it/s]
Train Epoch 0:  49%|████▉     | 928/1875 [00:05<00:05, 181.49it/s][32m [repeated 96x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
Train Epoch 0:  91%|█████████▏| 1714/1875 [00:09<00:00, 186.79it/s]
Train Epoch 0: 100%|██████████| 1875/1875 [00:10<00:00, 183.55it

Training result: Result(
  metrics={'loss': 0.39381615475367626, 'accuracy': 0.8572},
  path='/home/ubuntu/ray_results/TorchTrainer_2023-10-16_19-27-37/TorchTrainer_10431_00000_0_2023-10-16_19-27-37',
  filesystem='local',
  checkpoint=None
)
