In [34]:
import ray
import random
import numpy as np
from ray.air import session, Checkpoint
from ray.air import DatasetConfig
from ray.data import Dataset
from ray.train.torch import TorchTrainer

In [35]:
def objective(tensor):
    """Toy objective: a scaled quadratic of ``tensor`` with random noise.

    Args:
        tensor: A number (or any object supporting ``**``, ``+``, ``/`` and
            ``*`` with floats).

    Returns:
        ``(tensor ** 2 + 2.5) / 10e5`` multiplied by a fresh draw from
        ``random.uniform(0, 1)``. The result is non-deterministic unless the
        ``random`` module has been seeded.
    """
    # Note that the literal 10e5 equals 1_000_000.
    scaled = (tensor ** 2 + 2.5) / 10e5
    noise = random.uniform(0, 1)
    return scaled * noise
    
def model(batch):
    """Dummy "model": score every element of ``batch`` and cap the total.

    Args:
        batch: An array-like that ``np.nditer`` can iterate over.

    Returns:
        The sum of ``objective(element)`` over all elements of ``batch``,
        or ``100.00`` when that sum is greater than 100.
    """
    # Elements are visited in np.nditer order, one objective() call each.
    total = sum(objective(element) for element in np.nditer(batch))
    # min() keeps `total` unless 100.00 is strictly smaller, i.e. total > 100.
    return min(total, 100.00)

In [42]:
# Our dummy training loop: it runs the dummy model over every batch of the shard

def train_loop():
    """Dummy per-worker training loop: score the "train" shard for 10 epochs.

    For each epoch the loop iterates over every batch of the dataset shard,
    passes the batch to ``model`` and reports the epoch accuracy (sum of the
    batch scores divided by ``num_batches * 100``) together with a dict
    checkpoint through ``session.report``.

    Takes no arguments and returns nothing; results are only surfaced via
    ``session.report`` and the progress ``print`` on even epochs.
    """
    # By default, bulk loading is used and returns a Dataset object.
    data_shard: Dataset = session.get_dataset_shard("train")
    # Manually iterate over the data 10 times (10 epochs).
    for epoch in range(1, 11):
        # Reset the accumulator every epoch so the previous epoch's already
        # normalised accuracy is not mixed into this epoch's raw score sum.
        epoch_score = 0.0
        num_batches = 0
        # for each epoch iterate over batches
        for batch in data_shard.iter_batches():
            num_batches += 1
            epoch_score += model(batch)
        # model() caps each batch score at 100, so this ratio is at most 1.
        # An empty shard yields no batches; report 0.0 rather than dividing
        # by zero.
        acc = epoch_score / (num_batches * 100) if num_batches else 0.0
        if epoch % 2 == 0:
            print(f"Doing some training on epoch: {epoch} for batches: {num_batches} and acc over batch: {acc:.3f}")
        session.report({"acc": acc, "epoch": epoch},
                       checkpoint=Checkpoint.from_dict({"acc": acc, "epoch": epoch}))
    # View the stats for performance debugging.
    # print(data_shard.stats())

In [43]:
# Build a single-worker TorchTrainer around our training loop and run it.
from ray.air.config import ScalingConfig

# Synthetic tensor dataset built from the integer range [0, 1000).
train_ds = ray.data.range_tensor(1000)

trainer = TorchTrainer(
    train_loop_per_worker=train_loop,
    datasets={"train": train_ds},
    scaling_config=ScalingConfig(num_workers=1),
)

# fit() runs the training loop and returns the final result object.
result = trainer.fit()

Trial name,status,loc,iter,total time (s),acc,epoch,_timestamp
TorchTrainer_001a7_00000,TERMINATED,127.0.0.1:31687,10,2.27152,0.41711,10,1659415990


[2m[36m(BaseWorkerMixin pid=31704)[0m 2022-08-01 21:53:09,449	INFO config.py:70 -- Setting up process group for: env:// [rank=0, world_size=1]


Result for TorchTrainer_001a7_00000:
  _time_this_iter_s: 0.061104774475097656
  _timestamp: 1659415990
  _training_iteration: 1
  acc: 0.40067345427681794
  date: 2022-08-01_21-53-10
  done: false
  epoch: 1
  experiment_id: b248eb6a2b1f4b1ca3a31d250d7e919c
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 31687
  should_checkpoint: true
  time_since_restore: 1.9261529445648193
  time_this_iter_s: 1.9261529445648193
  time_total_s: 1.9261529445648193
  timestamp: 1659415990
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 001a7_00000
  warmup_time: 0.0032911300659179688
  
[2m[36m(BaseWorkerMixin pid=31704)[0m Doing some training on epoch: 2 for batches: 4 and loss over batch: 0.413
[2m[36m(BaseWorkerMixin pid=31704)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.418
[2m[36m(BaseWorkerMixin pid=31704)[0m Doing some training on epoch: 6 for batches: 4 and loss over batch: 0.432
[2m[36m(BaseWo

Total run time: 3.88 seconds (3.76 seconds for the tuning loop).


In [44]:
# Print the full metrics dictionary attached to the result returned by trainer.fit().
print(result.metrics)

{'acc': 0.41710975545740375, 'epoch': 10, '_timestamp': 1659415990, '_time_this_iter_s': 0.03656911849975586, '_training_iteration': 10, 'time_this_iter_s': 0.03725695610046387, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 10, 'trial_id': '001a7_00000', 'experiment_id': 'b248eb6a2b1f4b1ca3a31d250d7e919c', 'date': '2022-08-01_21-53-10', 'timestamp': 1659415990, 'time_total_s': 2.2715229988098145, 'pid': 31687, 'hostname': 'Juless-MacBook-Pro-16', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 2.2715229988098145, 'timesteps_since_restore': 0, 'iterations_since_restore': 10, 'warmup_time': 0.0032911300659179688, 'experiment_tag': '0'}


In [45]:
# Look up just the "acc" entry of the result's metrics dictionary.
result.metrics["acc"]

0.41710975545740375

In [46]:
# Read "acc" back from the result's checkpoint (the training loop stores it via Checkpoint.from_dict).
result.checkpoint.to_dict()['acc']

0.41710975545740375