# Original code

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download training data from open datasets.
# Fashion-MNIST is a dataset of 70,000 28×28 grayscale images of fashion
# products from 10 categories, with 7,000 images per category. The training
# set has 60,000 images and the test set has 10,000 images.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

In [3]:
# Number of samples per mini-batch, shared by both loaders.
batch_size = 64

# Wrap each dataset split in a loader that iterates over it in batches.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (training_data, test_data)
)

In [12]:
# Inspect the objects created above. Only the last expression is echoed as
# the cell output; the type of each object is noted alongside.
type(test_data)       # torchvision.datasets.mnist.FashionMNIST
type(test_dataloader) # torch.utils.data.dataloader.DataLoader
type(train_dataloader.dataset) # torchvision.datasets.mnist.FashionMNIST
type(train_dataloader.dataset[0]) # tuple: (image tensor, integer label)
# For reference:
#   len(train_dataloader)         -> number of batches (938 for batch_size=64)
#   len(train_dataloader.dataset) -> number of samples (60,000 for any batch_size)

tuple

In [5]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier mapping 28x28 inputs to 10 class scores.

    The input is flattened from dimension 1 onward and passed through two
    512-unit hidden layers with ReLU activations, followed by a linear layer
    that produces 10 raw (unnormalised) scores.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden_units = 512
        # Layer order is significant: it fixes the indices (0, 2, 4) under
        # which the linear layers appear in the state dict.
        layers = [
            nn.Linear(28 * 28, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits computed for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

# Instantiate the network, move it to the selected device, and print its
# layer summary.
model = NeuralNetwork()
model = model.to(device)
print(model)

Using cpu device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [6]:
# Cross-entropy computed on the model's raw logits is the training loss.
loss_fn = nn.CrossEntropyLoss()
# Plain stochastic gradient descent over every model parameter.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

In [7]:
def train_epoch(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that also exposes ``.dataset``.
        model: ``nn.Module`` to train; it is switched to training mode here.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        optimizer: Optimizer that updates the parameters of ``model``.

    Prints the loss of every 100th batch together with the number of samples
    consumed before that batch and the dataset size. Returns ``None``.
    """
    size = len(dataloader.dataset)
    # Take the device from the model itself rather than from a notebook-level
    # global, so batches always land where the caller placed the model.
    # A model without parameters falls back to the CPU.
    device = next(model.parameters(), torch.empty(0)).device
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            

In [8]:
def test_epoch(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

In [34]:
# Alternate one training pass and one evaluation pass per epoch.
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_epoch(
        dataloader=train_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    test_epoch(dataloader=test_dataloader, model=model, loss_fn=loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 1.969066  [    0/60000]
loss: 1.941425  [ 6400/60000]
loss: 1.827421  [12800/60000]
loss: 1.859762  [19200/60000]
loss: 1.754379  [25600/60000]
loss: 1.706143  [32000/60000]
loss: 1.733826  [38400/60000]
loss: 1.636626  [44800/60000]
loss: 1.657616  [51200/60000]
loss: 1.554481  [57600/60000]
Test Error: 
 Accuracy: 61.0%, Avg loss: 1.571355 

Epoch 2
-------------------------------
loss: 1.637494  [    0/60000]
loss: 1.602694  [ 6400/60000]
loss: 1.451625  [12800/60000]
loss: 1.510427  [19200/60000]
loss: 1.396833  [25600/60000]
loss: 1.387469  [32000/60000]
loss: 1.404303  [38400/60000]
loss: 1.331993  [44800/60000]
loss: 1.363816  [51200/60000]
loss: 1.260510  [57600/60000]
Test Error: 
 Accuracy: 62.9%, Avg loss: 1.291033 

Epoch 3
-------------------------------
loss: 1.369351  [    0/60000]
loss: 1.352154  [ 6400/60000]
loss: 1.184853  [12800/60000]
loss: 1.273540  [19200/60000]
loss: 1.156797  [25600/60000]
loss: 1.176137  [32000/600

# Wrapper function

In [9]:
def train_func(batch_size=64, lr=1e-3, epochs=5):
    """Train a fresh ``NeuralNetwork`` and evaluate it after every epoch.

    Reads the notebook-level ``training_data`` and ``test_data`` datasets and
    delegates the per-epoch work to ``train_epoch`` and ``test_epoch``.

    Args:
        batch_size: Samples per batch for both data loaders.
        lr: Learning rate passed to the SGD optimizer.
        epochs: Number of train/evaluate rounds to run.

    Calling ``train_func()`` with no arguments uses the defaults above.
    """
    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    # Get cpu or gpu device for training.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    model = NeuralNetwork().to(device)
    print(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        test_epoch(test_dataloader, model, loss_fn)

    print("Done!")

In [10]:
# Run the wrapped training loop end to end with its built-in settings.
train_func()

Using cpu device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
Epoch 1
-------------------------------
loss: 2.298182  [    0/60000]
loss: 2.286429  [ 6400/60000]
loss: 2.269938  [12800/60000]
loss: 2.266409  [19200/60000]
loss: 2.262412  [25600/60000]
loss: 2.222612  [32000/60000]
loss: 2.227608  [38400/60000]
loss: 2.193105  [44800/60000]
loss: 2.191217  [51200/60000]
loss: 2.155695  [57600/60000]
Test Error: 
 Accuracy: 42.9%, Avg loss: 2.156213 

Epoch 2
-------------------------------
loss: 2.165565  [    0/60000]
loss: 2.158086  [ 6400/60000]
loss: 2.101922  [12800/60000]
loss: 2.116251  [19200/60000]
loss: 2.090752  [25600/60000]
loss: 2.012789  [32000/60000]
loss: 2.031749  [38400/60000]
loss: 1.956159  [44800

# Convert to Ray AIR

In [11]:
import ray
import ray.train as train
from ray.air import session

# Ship the current directory to the cluster, leaving out the dataset cache
# and notebook checkpoint folders.
runtime_env = dict(
    working_dir=".",
    excludes=['/data/', '/.ipynb_checkpoints/'],
)
ray.init(runtime_env=runtime_env)

2022-12-18 18:01:19,275	INFO worker.py:1230 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS
2022-12-18 18:01:19,582	INFO worker.py:1352 -- Connecting to existing Ray cluster at address: 10.0.0.201:9031...
2022-12-18 18:01:19,623	INFO worker.py:1529 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://console.anyscale.com/api/v2/sessions/ses_buwxbm99nq8dryqg6p8sbytw/services?redirect_to=dashboard [39m[22m
2022-12-18 18:01:19,628	INFO packaging.py:546 -- Creating a file package for local directory '.'.
2022-12-18 18:01:19,630	INFO packaging.py:373 -- Pushing file package 'gcs://_ray_pkg_d2d377011b1433ea.zip' (0.07MiB) to Ray cluster...
2022-12-18 18:01:19,632	INFO packaging.py:386 -- Successfully pushed file package 'gcs://_ray_pkg_d2d377011b1433ea.zip'.


0,1
Python version:,3.9.12
Ray version:,2.2.0
Dashboard:,http://console.anyscale.com/api/v2/sessions/ses_buwxbm99nq8dryqg6p8sbytw/services?redirect_to=dashboard


In [24]:
def train_epoch(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader`` inside a Ray Train worker.

    Batches are fed to the model as-is; no explicit ``.to(device)`` call is
    made because Ray Train handles device placement for a prepared loader.
    Every 100th batch, the loss is printed next to a per-worker sample count.
    """
    # Per-worker sample count: dataset length divided by the world size.
    size = len(dataloader.dataset) // session.get_world_size()
    model.train()
    for step, (features, targets) in enumerate(dataloader):
        # Forward pass and loss.
        loss = loss_fn(model(features), targets)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Only report on every 100th batch.
        if step % 100 != 0:
            continue
        loss, current = loss.item(), step * len(features)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [25]:
def test_epoch(dataloader, model, loss_fn):
    size = len(dataloader.dataset) // session.get_world_size()  # Divide by word size
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    # print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

In [39]:
from ray.air import Checkpoint

def train_func(config: dict):
    """Per-worker training loop for Ray Train.

    Reads the notebook-level ``training_data`` and ``test_data`` datasets,
    trains a fresh ``NeuralNetwork``, and after every epoch reports the test
    loss together with a checkpoint holding the epoch index and model weights.

    Args:
        config: Dict with keys ``"batch_size"`` (global batch size, divided
            evenly among workers), ``"lr"`` (SGD learning rate) and
            ``"epochs"`` (number of epochs to run).
    """
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    # Each worker handles an equal share of the global batch.
    batch_size_per_worker = batch_size // session.get_world_size()

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size_per_worker)
    test_dataloader = DataLoader(test_data, batch_size=batch_size_per_worker)
    print(f'training # of batches is {len(train_dataloader)} with {session.get_world_size()} workers')

    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        test_loss = test_epoch(test_dataloader, model, loss_fn)

        # prepare_model may hand back the model without a
        # DistributedDataParallel wrapper (e.g. a single worker), in which
        # case there is no ``.module`` attribute to unwrap.
        if isinstance(model, nn.parallel.DistributedDataParallel):
            unwrapped_model = model.module
        else:
            unwrapped_model = model
        checkpoint = Checkpoint.from_dict(
            dict(epoch=t, model=unwrapped_model.state_dict())
        )
        session.report(dict(loss=test_loss), checkpoint=checkpoint)

    print("Done!")

In [40]:
# Batch count for the full training set at batch_size=64, for comparison with
# the count printed by the workers.
len(DataLoader(training_data, batch_size=64))

938

In [41]:
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig


# Run the training loop on two CPU-only Ray Train workers and print the
# metrics from the last report.
trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config=dict(lr=1e-3, batch_size=64, epochs=4),
    scaling_config=ScalingConfig(use_gpu=False, num_workers=2),
)
result = trainer.fit()
print(f"Last result: {result.metrics}")

0,1
Current time:,2022-12-18 19:15:57
Running for:,00:00:55.96
Memory:,3.4/30.9 GiB

Trial name,status,loc,iter,total time (s),loss,_timestamp,_time_this_iter_s
TorchTrainer_537a4_00000,TERMINATED,10.0.0.201:19159,4,46.9073,1.2252,1671419755,10.6953


(RayTrainWorker pid=19219) 2022-12-18 19:15:12,756	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=2]
(RayTrainWorker pid=19219) 2022-12-18 19:15:14,644	INFO train_loop_utils.py:270 -- Moving model to device: cpu
(RayTrainWorker pid=19219) 2022-12-18 19:15:14,644	INFO train_loop_utils.py:330 -- Wrapping provided model in DistributedDataParallel.


(RayTrainWorker pid=19219) training # of batches is 1875 with 2 workers
(RayTrainWorker pid=19219) Epoch 1
(RayTrainWorker pid=19219) -------------------------------
(RayTrainWorker pid=19219) loss: 2.314141  [    0/30000]
(RayTrainWorker pid=19220) training # of batches is 1875 with 2 workers
(RayTrainWorker pid=19220) Epoch 1
(RayTrainWorker pid=19220) -------------------------------
(RayTrainWorker pid=19220) loss: 2.295399  [    0/30000]
(RayTrainWorker pid=19219) loss: 2.300298  [ 3200/30000]
(RayTrainWorker pid=19220) loss: 2.285585  [ 3200/30000]
(RayTrainWorker pid=19219) loss: 2.267904  [ 6400/30000]
(RayTrainWorker pid=19220) loss: 2.281327  [ 6400/30000]
(RayTrainWorker pid=19219) loss: 2.261655  [ 9600/30000]
(RayTrainWorker pid=19220) loss: 2.275296  [ 9600/30000]
(RayTrainWorker pid=19219) loss: 2.254933  [12800/30000]
(RayTrainWorker pid=19220) loss: 2.237907  [12800/30000]
(RayTrainWorker pid=19219) loss: 2.203960  [16000/30000]
(RayTrainWorker pid=19220) loss: 2.216156

Trial name,_time_this_iter_s,_timestamp,_training_iteration,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
TorchTrainer_537a4_00000,10.6953,1671419755,4,2022-12-18_19-15-55,True,,3e7c1fa2a8ea49b78d04719cf428ef98,0,ip-10-0-0-201,4,1.2252,10.0.0.201,19159,True,46.9073,10.697,46.9073,1671419755,0,,4,537a4_00000,0.808959


(RayTrainWorker pid=19219) Epoch 2
(RayTrainWorker pid=19219) -------------------------------
(RayTrainWorker pid=19219) loss: 2.182901  [    0/30000]
(RayTrainWorker pid=19220) Epoch 2
(RayTrainWorker pid=19220) -------------------------------
(RayTrainWorker pid=19220) loss: 2.146998  [    0/30000]
(RayTrainWorker pid=19219) loss: 2.174196  [ 3200/30000]
(RayTrainWorker pid=19220) loss: 2.119638  [ 3200/30000]
(RayTrainWorker pid=19219) loss: 2.066749  [ 6400/30000]
(RayTrainWorker pid=19220) loss: 2.117077  [ 6400/30000]
(RayTrainWorker pid=19219) loss: 2.104596  [ 9600/30000]
(RayTrainWorker pid=19220) loss: 2.110544  [ 9600/30000]
(RayTrainWorker pid=19219) loss: 2.057695  [12800/30000]
(RayTrainWorker pid=19220) loss: 2.028703  [12800/30000]
(RayTrainWorker pid=19219) loss: 1.970710  [16000/30000]
(RayTrainWorker pid=19220) loss: 2.002645  [16000/30000]
(RayTrainWorker pid=19219) loss: 2.034284  [19200/30000]
(RayTrainWorker pid=19220) loss: 1.997188  [19200/30000]
(RayTrainWorke

2022-12-18 19:15:57,881	INFO tune.py:762 -- Total run time: 56.07 seconds (55.95 seconds for the tuning loop).


Last result: {'loss': 1.2252038834960597, '_timestamp': 1671419755, '_time_this_iter_s': 10.69528341293335, '_training_iteration': 4, 'time_this_iter_s': 10.697039604187012, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 4, 'trial_id': '537a4_00000', 'experiment_id': '3e7c1fa2a8ea49b78d04719cf428ef98', 'date': '2022-12-18_19-15-55', 'timestamp': 1671419755, 'time_total_s': 46.9072630405426, 'pid': 19159, 'hostname': 'ip-10-0-0-201', 'node_ip': '10.0.0.201', 'config': {}, 'time_since_restore': 46.9072630405426, 'timesteps_since_restore': 0, 'iterations_since_restore': 4, 'warmup_time': 0.8089592456817627, 'experiment_tag': '0'}


In [58]:
# Show the type of the result object and the checkpoint attached to it.
print(type(result))
print(f"Checkpoint: {result.checkpoint}")

<class 'ray.air.result.Result'>
Checkpoint: TorchCheckpoint(local_path=/home/ray/ray_results/TorchTrainer_2022-12-18_19-19-40/TorchTrainer_f9dcf_00000_0_2022-12-18_19-19-41/checkpoint_000003)


# Move data loading into train_func

In [43]:
from ray.air import Checkpoint

def load_data():
    """Download Fashion-MNIST into ``data`` and return ``(training_data, test_data)``.

    Both splits are converted to tensors with ``ToTensor``.
    """

    def _fetch_split(is_train):
        # One split of the dataset; the training split when is_train is True.
        return datasets.FashionMNIST(
            root="data",
            train=is_train,
            download=True,
            transform=ToTensor(),
        )

    # Training split first, then test split, as in the original order.
    return _fetch_split(True), _fetch_split(False)


def train_load_func(config: dict):
    """Per-worker training loop for Ray Train that loads its own data.

    Unlike a loop that relies on notebook-level datasets, every worker calls
    ``load_data()`` itself. After each epoch the test loss is reported together
    with a checkpoint holding the epoch index and model weights.

    Args:
        config: Dict with keys ``"batch_size"`` (global batch size, divided
            evenly among workers), ``"lr"`` (SGD learning rate) and
            ``"epochs"`` (number of epochs to run).
    """
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["epochs"]

    # Each worker handles an equal share of the global batch.
    batch_size_per_worker = batch_size // session.get_world_size()

    training_data, test_data = load_data()  # <- this is new!

    # Create data loaders.
    train_dataloader = DataLoader(training_data, batch_size=batch_size_per_worker)
    test_dataloader = DataLoader(test_data, batch_size=batch_size_per_worker)

    train_dataloader = train.torch.prepare_data_loader(train_dataloader)
    test_dataloader = train.torch.prepare_data_loader(test_dataloader)

    model = NeuralNetwork()
    model = train.torch.prepare_model(model)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for t in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        test_loss = test_epoch(test_dataloader, model, loss_fn)

        # prepare_model may hand back the model without a
        # DistributedDataParallel wrapper (e.g. a single worker), in which
        # case there is no ``.module`` attribute to unwrap.
        if isinstance(model, nn.parallel.DistributedDataParallel):
            unwrapped_model = model.module
        else:
            unwrapped_model = model
        checkpoint = Checkpoint.from_dict(
            dict(epoch=t, model=unwrapped_model.state_dict())
        )
        session.report(dict(loss=test_loss), checkpoint=checkpoint)

    print("Done!")

In [44]:
# Same two-worker CPU run as before, but with the loop that loads its own data.
trainer = TorchTrainer(
    train_loop_per_worker=train_load_func,
    train_loop_config=dict(lr=1e-3, batch_size=64, epochs=4),
    scaling_config=ScalingConfig(use_gpu=False, num_workers=2),
)
result = trainer.fit()

print(f"Last result: {result.metrics}")
print(f"Checkpoint: {result.checkpoint}")

0,1
Current time:,2022-12-18 19:20:43
Running for:,00:01:02.09
Memory:,3.2/30.9 GiB

Trial name,status,loc,iter,total time (s),loss,_timestamp,_time_this_iter_s
TorchTrainer_f9dcf_00000,TERMINATED,10.0.0.201:20312,4,53.4951,1.22565,1671420040,9.48647


(RayTrainWorker pid=20368) 2022-12-18 19:19:51,014	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=2]


(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]
  0%|          | 0/26421880 [00:00<?, ?it/s]
  0%|          | 32768/26421880 [00:00<01:51, 236843.25it/s]
  0%|          | 32768/26421880 [00:00<01:52, 235550.41it/s]
  0%|          | 65536/26421880 [00:00<01:51, 235408.13it/s]
  0%|          | 65536/26421880 [00:00<01:52, 234109.76it/s]
  0%|          | 98304/26421880 [00:00<01:51, 235134.97it/s]
  0%|          | 131072/26421880 [00:00<01:17, 340459.61it/s]
  1%|          | 229376/26421880 [00:00<00:54, 482913.50it/s]
  1%|          | 229376/26421880 [00:00<00:51, 513064.01it/s]
  2%|▏         | 458752/26421880 [00:00<00:28, 920375.09it/s]
  2%|▏         | 458752/26421880 [00:00<00:28, 898286.99it/s]
  3%|▎         | 917504/26421880 [00:00<00:14, 1724322.16it/s]
  4%|▎         | 950272/26421880 [00:00<00:14, 1783198.21it/s]
  7%|▋         | 1867776/26421880 [00:00<00:07, 3337578.77it/s]
  7%|▋         | 1835008/26421880 [00:00<00:07, 3307761.24it/s]
 13%|█▎        | 3407872/26421880 [00:01<

(RayTrainWorker pid=20368) Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw
(RayTrainWorker pid=20369) Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw
(RayTrainWorker pid=20368) 
(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
(RayTrainWorker pid=20369) 
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]
  0%|          | 0/29515 [00:00<?, ?it/s]
100%|██████████| 29515/29515 [00:00<00:00, 208105.84it/s]
100%|██████████| 29515/29515 [00:00<00:00, 206245.06it/s]


(RayTrainWorker pid=20368) Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw
(RayTrainWorker pid=20368) 
(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
(RayTrainWorker pid=20369) Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw
(RayTrainWorker pid=20369) 
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]
  0%|          | 0/4422102 [00:00<?, ?it/s]
  1%|          | 32768/4422102 [00:00<00:18, 235916.73it/s]
  1%|          | 32768/4422102 [00:00<00:18, 236193.23it/s]
  1%|▏         | 65536/4422102 [00:00<00:18, 235479.71it/s]
  1%|▏         | 65536/4422102 [00:00<00:18, 235923.50it/s]
  3%|▎         | 131072/4422102 [00:00<00:12, 342576.46it/s]
  3%|▎         | 131072/4422102 [00:00<00:12, 343373.71it/s]
  5%|▌         | 229376/4422102 [00:00<00:08, 486364.36it/s]
  5%|▌         | 229376/4422102 [00:00<00:08, 486817.03it/s]
 10%|█         | 458752/4422102 [00:00<00:04, 905047.10it/s]
 10%|█         | 458752/4422102 [00:00<00:04, 906469.28it/s]
 21%|██        | 917504/4422102 [00:00<00:02, 1717383.94it/s]
 21%|██▏       | 950272/4422102 [00:00<00:01, 1798667.18it/s]
 42%|████▏     | 1867776/4422102 [00:00<00:00, 3384279.81it/s]
 42%|████▏     | 1867776/4422102 [00:00<00:00, 3369805.44it/s]
100%|██████████| 4422102/4422102 [00:01<00:00, 3959696.9

(RayTrainWorker pid=20368) Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw


100%|██████████| 4422102/4422102 [00:01<00:00, 3963189.59it/s]


(RayTrainWorker pid=20368) 
(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
(RayTrainWorker pid=20369) Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw
(RayTrainWorker pid=20369) 
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
(RayTrainWorker pid=20368) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz
(RayTrainWorker pid=20369) Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 39691685.65it/s]
100%|██████████| 5148/5148 [00:00<00:00, 35166574.91it/s]


(RayTrainWorker pid=20368) Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw
(RayTrainWorker pid=20368) 
(RayTrainWorker pid=20369) Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw
(RayTrainWorker pid=20369) 


(RayTrainWorker pid=20368) 2022-12-18 19:20:00,028	INFO train_loop_utils.py:270 -- Moving model to device: cpu
(RayTrainWorker pid=20368) 2022-12-18 19:20:00,028	INFO train_loop_utils.py:330 -- Wrapping provided model in DistributedDataParallel.


(RayTrainWorker pid=20368) loss: 2.301111  [    0/30000]
(RayTrainWorker pid=20369) loss: 2.287547  [    0/30000]
(RayTrainWorker pid=20368) loss: 2.290906  [ 3200/30000]
(RayTrainWorker pid=20369) loss: 2.288523  [ 3200/30000]
(RayTrainWorker pid=20369) loss: 2.257152  [ 6400/30000]
(RayTrainWorker pid=20368) loss: 2.264019  [ 6400/30000]
(RayTrainWorker pid=20368) loss: 2.259223  [ 9600/30000]
(RayTrainWorker pid=20369) loss: 2.264961  [ 9600/30000]
(RayTrainWorker pid=20368) loss: 2.256486  [12800/30000]
(RayTrainWorker pid=20369) loss: 2.240544  [12800/30000]
(RayTrainWorker pid=20368) loss: 2.196257  [16000/30000]
(RayTrainWorker pid=20369) loss: 2.202147  [16000/30000]
(RayTrainWorker pid=20368) loss: 2.215132  [19200/30000]
(RayTrainWorker pid=20369) loss: 2.224874  [19200/30000]
(RayTrainWorker pid=20368) loss: 2.146052  [22400/30000]
(RayTrainWorker pid=20369) loss: 2.199692  [22400/30000]
(RayTrainWorker pid=20368) loss: 2.179719  [25600/30000]
(RayTrainWorker pid=20369) loss

Trial name,_time_this_iter_s,_timestamp,_training_iteration,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
TorchTrainer_f9dcf_00000,9.48647,1671420040,4,2022-12-18_19-20-40,True,,c0afd5f50d614603acd7d62c4ac257f1,0,ip-10-0-0-201,4,1.22565,10.0.0.201,20312,True,53.4951,9.45747,53.4951,1671420040,0,,4,f9dcf_00000,0.551347


(RayTrainWorker pid=20368) loss: 2.163013  [    0/30000]
(RayTrainWorker pid=20369) loss: 2.117441  [    0/30000]
(RayTrainWorker pid=20368) loss: 2.166237  [ 3200/30000]
(RayTrainWorker pid=20369) loss: 2.116500  [ 3200/30000]
(RayTrainWorker pid=20368) loss: 2.061838  [ 6400/30000]
(RayTrainWorker pid=20369) loss: 2.077756  [ 6400/30000]
(RayTrainWorker pid=20368) loss: 2.090693  [ 9600/30000]
(RayTrainWorker pid=20369) loss: 2.096717  [ 9600/30000]
(RayTrainWorker pid=20368) loss: 2.069101  [12800/30000]
(RayTrainWorker pid=20369) loss: 2.028642  [12800/30000]
(RayTrainWorker pid=20368) loss: 1.959103  [16000/30000]
(RayTrainWorker pid=20369) loss: 1.981701  [16000/30000]
(RayTrainWorker pid=20368) loss: 2.017103  [19200/30000]
(RayTrainWorker pid=20369) loss: 1.999470  [19200/30000]
(RayTrainWorker pid=20368) loss: 1.854572  [22400/30000]
(RayTrainWorker pid=20369) loss: 1.981408  [22400/30000]
(RayTrainWorker pid=20368) loss: 1.944194  [25600/30000]
(RayTrainWorker pid=20369) loss

2022-12-18 19:20:43,164	INFO tune.py:762 -- Total run time: 62.21 seconds (62.09 seconds for the tuning loop).


Last result: {'loss': 1.2256454950684954, '_timestamp': 1671420040, '_time_this_iter_s': 9.48647141456604, '_training_iteration': 4, 'time_this_iter_s': 9.45746922492981, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 4, 'trial_id': 'f9dcf_00000', 'experiment_id': 'c0afd5f50d614603acd7d62c4ac257f1', 'date': '2022-12-18_19-20-40', 'timestamp': 1671420040, 'time_total_s': 53.49507689476013, 'pid': 20312, 'hostname': 'ip-10-0-0-201', 'node_ip': '10.0.0.201', 'config': {}, 'time_since_restore': 53.49507689476013, 'timesteps_since_restore': 0, 'iterations_since_restore': 4, 'warmup_time': 0.5513465404510498, 'experiment_tag': '0'}
Checkpoint: TorchCheckpoint(local_path=/home/ray/ray_results/TorchTrainer_2022-12-18_19-19-40/TorchTrainer_f9dcf_00000_0_2022-12-18_19-19-41/checkpoint_000003)


# Loading the model for prediction

In [48]:
# Human-readable label names, indexed by class id.
classes = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]

def predict_from_model(model, data):
    """Print the predicted and actual class name for each ``(x, y)`` pair in ``data``.

    The model is put in eval mode and run without gradient tracking; the
    prediction is the index of the largest score in the first output row.
    """
    model.eval()
    with torch.no_grad():
        for sample, label in data:
            scores = model(sample)[0]
            predicted = classes[scores.argmax(0)]
            actual = classes[label]
            print(f'Predicted: "{predicted}", Actual: "{actual}"')


In [49]:
from ray.train.torch import TorchCheckpoint

# Restore the model from the training checkpoint into a fresh NeuralNetwork,
# then print predictions for the first ten test samples.
torch_checkpoint = TorchCheckpoint.from_checkpoint(result.checkpoint)
model = torch_checkpoint.get_model(NeuralNetwork())

predict_from_model(model, [test_data[idx] for idx in range(10)])

Predicted: "Ankle boot", Actual: "Ankle boot"
Predicted: "Pullover", Actual: "Pullover"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Pullover", Actual: "Shirt"
Predicted: "Trouser", Actual: "Trouser"
Predicted: "Coat", Actual: "Coat"
Predicted: "Coat", Actual: "Shirt"
Predicted: "Sneaker", Actual: "Sandal"
Predicted: "Sneaker", Actual: "Sneaker"


# Predictor

In [66]:
from ray.train.torch import TorchPredictor

# Build a predictor from the training checkpoint, supplying a fresh
# NeuralNetwork as the model to use.
torch_predictor = TorchPredictor.from_checkpoint(result.checkpoint, model=NeuralNetwork())

In [79]:
# Predict the class of the first test sample from its NumPy representation.
X, y = test_data[0]
pred = torch_predictor.predict(X.numpy())
# pred['predictions'] has length 1 (one input sample);
# pred['predictions'][0] holds one score per class.
# The index of the largest score is the predicted class id.
pred['predictions'][0].argmax()

9

# Batch Prediction

In [50]:
from ray.train.batch_predictor import BatchPredictor

# Wrap the same checkpoint in a BatchPredictor that uses TorchPredictor and a
# fresh NeuralNetwork for scoring.
batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor, model=NeuralNetwork())

In [51]:
import ray.data

# Build a Ray dataset from the test images as NumPy arrays (labels are
# dropped), requesting a parallelism of 8.
ds = ray.data.from_items([x.numpy() for x, y in test_data], parallelism=8)

In [52]:
# Score the whole dataset with batch_size=32 and min_scoring_workers=2.
results = batch_predictor.predict(ds, batch_size=32, min_scoring_workers=2)

Map Progress (2 actors 1 pending): 100%|██████████| 8/8 [00:05<00:00,  1.36it/s]


In [53]:
# Print a few result rows; each row carries a 'predictions' array of class scores.
results.show()

{'predictions': array([-1.2659273 , -1.9728076 , -0.47294343, -1.4067363 , -0.54722816,
        1.4815099 , -0.5460603 ,  1.9433444 ,  1.4214759 ,  2.3140082 ],
      dtype=float32)}
{'predictions': array([ 0.90613997, -2.3244176 ,  3.0169926 , -0.8443391 ,  2.6863618 ,
       -1.4803131 ,  2.2586493 , -2.4865446 ,  1.3047663 , -1.1552725 ],
      dtype=float32)}
{'predictions': array([ 1.9848795 ,  3.9836159 , -0.0767667 ,  2.9862223 ,  0.81452185,
       -2.2852166 ,  0.65543973, -2.976378  , -1.863825  , -2.6263735 ],
      dtype=float32)}
{'predictions': array([ 1.3361863 ,  3.1420417 , -0.18481702,  2.2959752 ,  0.43808815,
       -1.5627369 ,  0.33526626, -2.1126003 , -1.4807713 , -1.8568392 ],
      dtype=float32)}
{'predictions': array([ 0.8119151 , -1.0520415 ,  1.2079775 , -0.14650142,  1.0830628 ,
       -0.74526155,  1.1679947 , -1.3283877 ,  0.5627702 , -0.49579006],
      dtype=float32)}
{'predictions': array([ 2.0064857 ,  2.902035  ,  0.3147434 ,  2.3969293 ,  0.9864738

In [81]:
# Check what kind of object batch prediction returns.
type(results)

ray.data.dataset.Dataset