In [1]:
import random
import ray
from ray.air import session, Checkpoint
from ray.data import DatasetPipeline
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig
from ingest_utils import model

In [2]:
import logging
if ray.is_initialized:
    ray.shutdown()
ray.init(logging_level=logging.ERROR)

0,1
Python version:,3.8.13
Ray version:,3.0.0.dev0
Dashboard:,http://127.0.0.1:8266


In [3]:
# def model(batch):
#     return len(batch) * 0.1 * random.uniform(0,1)

def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = session.get_dataset_shard("train")
    
    acc = 0.0
    # Manually iterate over the data 10 times (10 epochs).
    for epoch in data_shard.iter_epochs(10): 
        # for each epoch iterate over batches
        num_batches = 0
        num_epochs = 0
        for batch in epoch.iter_batches():
            num_batches += 1
            num_epochs += 1
            batch_acc = model(batch)
            acc += batch_acc
        acc /= num_batches * 100
        if num_epochs % 2 == 0:
            print(f"Doing some training on epoch: {num_epochs} for batches: {num_batches} and loss over batch: {acc:.3f}")
        session.report({"acc": acc, "epoch": num_epochs}, 
                       checkpoint=Checkpoint.from_dict({"acc": acc, "epoch": num_epochs}))
    # View the stats for performance debugging.
    # print(data_shard.stats())

In [4]:
from ray.air.config import ScalingConfig

# Set N = 200 bytes for this toy example. Typically, you'd set N >= 1GiB.
N = 200
train_ds = ray.data.range_tensor(1000)
trainer = TorchTrainer(train_loop_per_worker,
                       scaling_config= ScalingConfig(num_workers=1),
                       datasets={"train": train_ds},
                       dataset_config={"train": DatasetConfig(use_stream_api=True, 
                                                              stream_window_size=N)},
                      )

In [5]:
result = trainer.fit()

Trial name,status,loc,iter,total time (s),acc,epoch,_timestamp
TorchTrainer_78b74_00000,TERMINATED,127.0.0.1:72216,10,4.12106,0.428409,4,1659471170


[2m[36m(RayTrainWorker pid=72226)[0m 2022-08-02 13:12:47,689	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(TorchTrainer pid=72216)[0m 2022-08-02 13:12:47,908	INFO dataset.py:3233 -- Created DatasetPipeline with 20 windows: 400b min, 400b max, 400b mean
[2m[36m(TorchTrainer pid=72216)[0m 2022-08-02 13:12:47,909	INFO dataset.py:3243 -- Blocks per window: 1 min, 1 max, 1 mean
[2m[36m(TorchTrainer pid=72216)[0m 2022-08-02 13:12:47,911	INFO dataset.py:3282 -- ✔️  This pipeline's windows likely fit in object store memory without spilling.
Stage 0:   0%|          | 0/1 [00:00<?, ?it/s]=72238)[0m 
  0%|          | 0/1 [00:00<?, ?it/s][Aor pid=72238)[0m 
Stage 1:   0%|          | 0/1 [00:00<?, ?it/s][A238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
  0%|          | 0/1 [00:00<?, ?it/s][A[Apid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Result for TorchTrainer_78b74_00000:
  _time_this_iter_s: 1.9532630443572998
  _timestamp: 1659471169
  _training_iteration: 1
  acc: 0.42307759752536983
  date: 2022-08-02_13-12-49
  done: false
  epoch: 4
  experiment_id: 97a222b531324fbe8cbfb98433261bf1
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 72216
  should_checkpoint: true
  time_since_restore: 3.291666030883789
  time_this_iter_s: 3.291666030883789
  time_total_s: 3.291666030883789
  timestamp: 1659471169
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 78b74_00000
  warmup_time: 0.0033578872680664062
  
[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.423
[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.434
[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.412


[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2: : 43it [00:01, 54.90it/s][A[Aor pid=72238)[0m 
Stage 0: : 45it [00:01, 56.96it/s][Anator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2: : 63it [00:01, 79.64it/s][A[Aor pid=72238)[0m 
Stage 0: : 65it [00:01, 81.28it/s][Anator pid=72238)[0m 


[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.410
[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.417
[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.421


[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2: : 83it [00:01, 101.31it/s][A[Ar pid=72238)[0m 
Stage 0: : 85it [00:01, 102.56it/s][Aator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2: : 105it [00:01, 126.49it/s][A[A pid=72238)[0m 
Stage 0: : 107it [00:01, 127.43it/s][Ator pid=72238)[0m 


[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.413
[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.419


[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2: : 132it [00:01, 160.26it/s][A[A pid=72238)[0m 
Stage 0: : 134it [00:01, 160.95it/s][Ator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2: : 159it [00:01, 187.65it/s][A[A pid=72238)[0m 
Stage 0: : 161it [00:01, 188.14it/s][Ator pid=72238)[0m 


[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.418
[2m[36m(RayTrainWorker pid=72226)[0m Doing some training on epoch: 4 for batches: 4 and loss over batch: 0.428


[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=72238)[0m 
Stage 2: : 183it [00:01, 185.79it/s][A[A pid=72238)[0m 
Stage 0: : 185it [00:01, 186.12it/s][Ator pid=72238)[0m 


Result for TorchTrainer_78b74_00000:
  _time_this_iter_s: 0.0832369327545166
  _timestamp: 1659471170
  _training_iteration: 10
  acc: 0.42840909908721947
  date: 2022-08-02_13-12-50
  done: true
  epoch: 4
  experiment_id: 97a222b531324fbe8cbfb98433261bf1
  experiment_tag: '0'
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 10
  node_ip: 127.0.0.1
  pid: 72216
  should_checkpoint: true
  time_since_restore: 4.121056079864502
  time_this_iter_s: 0.09208011627197266
  time_total_s: 4.121056079864502
  timestamp: 1659471170
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: 78b74_00000
  warmup_time: 0.0033578872680664062
  




In [6]:
print(result)

Result(metrics={'acc': 0.42840909908721947, 'epoch': 4, '_timestamp': 1659471170, '_time_this_iter_s': 0.0832369327545166, '_training_iteration': 10, 'should_checkpoint': True, 'done': True, 'trial_id': '78b74_00000', 'experiment_tag': '0'}, error=None, log_dir=PosixPath('/Users/jules/ray_results/TorchTrainer_2022-08-02_13-12-45/TorchTrainer_78b74_00000_0_2022-08-02_13-12-45'))


In [7]:
ray.shutdown()