In [10]:
import ray
import numpy as np
from ray.air import session, Checkpoint
from ray.data import Dataset
from ray.train.torch import TorchTrainer
from ingest_utils import model

In [11]:
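# `model` is the dummy model function imported from ingest_utils above; the commented-out sketch below appears to show an equivalent definition.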

# def model(batch):
#     return len(batch) * 0.1 * random.uniform(0,1)

def train_loop():
    """Per-worker training loop: run 10 dummy epochs over the "train" shard.

    For every epoch, each batch of the worker's dataset shard is scored with
    ``model``; the summed scores are scaled by ``num_batches * 100`` and
    reported through ``session.report`` together with a dict checkpoint that
    holds the same ``acc`` and ``epoch`` values.

    The score accumulator is reset at the start of every epoch, so a reported
    value reflects only that epoch's batches. If the shard yields no batches,
    0.0 is reported instead of raising ``ZeroDivisionError``.
    """
    # By default, bulk loading is used and returns a Dataset object.
    data_shard: Dataset = session.get_dataset_shard("train")
    # Manually iterate over the data 10 times (10 epochs).
    for epoch in range(1, 11):
        # Fresh accumulators per epoch so earlier epochs do not leak into this one.
        epoch_total = 0.0
        num_batches = 0
        # For each epoch iterate over batches.
        for batch in data_shard.iter_batches():
            num_batches += 1
            epoch_total += model(batch)
        # Scale the summed batch scores by num_batches * 100; an empty shard
        # would make the divisor zero, so report 0.0 in that case.
        acc = epoch_total / (num_batches * 100) if num_batches else 0.0
        if epoch % 2 == 0:
            print(f"Doing some training on epoch: {epoch} for batches: {num_batches} and acc over batch: {acc:.3f}")
        metrics = {"acc": acc, "epoch": epoch}
        # The checkpoint gets its own copy of the dict so the reported metrics
        # and the checkpoint payload never share a mutable object.
        session.report(metrics, checkpoint=Checkpoint.from_dict(dict(metrics)))
    # View the stats for performance debugging.
    # print(data_shard.stats())

In [12]:
# Build the TorchTrainer that drives train_loop.
from ray.air.config import ScalingConfig

# Synthetic tensor dataset built with range_tensor(1000). It is registered
# below under the "train" key, the same key train_loop passes to
# session.get_dataset_shard.
train_ds = ray.data.range_tensor(1000)

# Run the loop on a single training worker.
trainer = TorchTrainer(
    train_loop,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": train_ds},
)

Trial name,status,loc,iter,total time (s),acc,epoch,_timestamp
TorchTrainer_5a002_00000,TERMINATED,127.0.0.1:68051,10,2.5264,0.414789,10,1659466822


[2m[36m(RayTrainWorker pid=68058)[0m 2022-08-02 12:00:21,105	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]


Result for TorchTrainer_5a002_00000:
  _time_this_iter_s: 0.06127190589904785
  _timestamp: 1659466822
  _training_iteration: 1
  acc: 0.43601966983402657
  date: 2022-08-02_12-00-22
  done: false
  epoch: 1
  experiment_id: d73907bde308438e88c6fe8f10759325
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 68051
  should_checkpoint: true
  time_since_restore: 2.1793737411499023
  time_this_iter_s: 2.1793737411499023
  time_total_s: 2.1793737411499023
  timestamp: 1659466822
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 5a002_00000
  warmup_time: 0.003217935562133789
  
[2m[36m(RayTrainWorker pid=68058)[0m Doing some training on epoch: 2 for batches: 4 and acc over batch: 0.430
[2m[36m(RayTrainWorker pid=68058)[0m Doing some training on epoch: 4 for batches: 4 and acc over batch: 0.402
[2m[36m(RayTrainWorker pid=68058)[0m Doing some training on epoch: 6 for batches: 4 and acc over batch: 0.410
[2m[36m(RayTrainWorker

2022-08-02 12:00:22,969	INFO tune.py:758 -- Total run time: 4.20 seconds (4.09 seconds for the tuning loop).


In [None]:
# Launch the training run; the returned result object exposes the `metrics`
# and `checkpoint` attributes inspected in the following cells.
result = trainer.fit()

In [13]:
# Show the full metrics dict from the run (includes the "acc" and "epoch"
# values reported by the training loop).
print(result.metrics)

{'acc': 0.41478898122963925, 'epoch': 10, '_timestamp': 1659466822, '_time_this_iter_s': 0.038574934005737305, '_training_iteration': 10, 'time_this_iter_s': 0.03709101676940918, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 10, 'trial_id': '5a002_00000', 'experiment_id': 'd73907bde308438e88c6fe8f10759325', 'date': '2022-08-02_12-00-22', 'timestamp': 1659466822, 'time_total_s': 2.5263969898223877, 'pid': 68051, 'hostname': 'Juless-MacBook-Pro-16', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 2.5263969898223877, 'timesteps_since_restore': 0, 'iterations_since_restore': 10, 'warmup_time': 0.003217935562133789, 'experiment_tag': '0'}


In [14]:
# Pull out just the reported accuracy from the metrics dict.
result.metrics["acc"]

0.41478898122963925

In [15]:
# Read "acc" back from the checkpoint; the training loop stores the same value
# in the checkpoint dict that it reports as a metric, so the two should match.
result.checkpoint.to_dict()['acc']

0.41478898122963925