In [9]:
import random
import ray
from ray.air import session, Checkpoint
from ray.data import DatasetPipeline
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig

In [10]:
def model(batch):
    return len(batch) * 0.1 * random.uniform(0,1)

def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = session.get_dataset_shard("train")
    
    loss = 0.0
    # Manually iterate over the data 10 times (10 epochs).
    for epoch in data_shard.iter_epochs(10): 
        # for each epoch iterate over batches
        num_batches = 0
        num_epochs = 0
        for batch in epoch.iter_batches():
            num_batches += 1
            num_epochs += 1
            batch_loss = model(batch)
            loss += batch_loss
        loss /= num_batches * 100
        if num_epochs % 2 == 0:
            print(f"Doing some training on epoch: {num_epochs} for batches: {num_batches} and loss over batch: {loss:.3f}")
        session.report({"loss": loss, "epoch": num_epochs}, 
                       checkpoint=Checkpoint.from_dict({"loss": loss, "epoch": num_epochs}))
    # View the stats for performance debugging.
    # print(data_shard.stats())

In [11]:
# Set N = 200 bytes for this toy example. Typically, you'd set N >= 1GiB.
N = 200
train_ds = ray.data.range_tensor(1000)
trainer = TorchTrainer(train_loop_per_worker,
                       scaling_config={"num_workers": 1},
                       datasets={"train": train_ds},
                       dataset_config={"train": DatasetConfig(use_stream_api=True, 
                                                              stream_window_size=N)},
                      )

In [12]:
result = trainer.fit()

Trial name,status,loc,iter,total time (s),loss,epoch,_timestamp
TorchTrainer_518ce_00000,TERMINATED,127.0.0.1:86187,10,2.87596,0.0255333,20,1658009526


[2m[36m(TorchTrainer pid=86187)[0m 2022-07-16 15:12:04,275	INFO dataset.py:3094 -- Created DatasetPipeline with 20 windows: 400b min, 400b max, 400b mean
[2m[36m(TorchTrainer pid=86187)[0m 2022-07-16 15:12:04,276	INFO dataset.py:3103 -- Blocks per window: 1 min, 1 max, 1 mean
[2m[36m(BaseWorkerMixin pid=86194)[0m 2022-07-16 15:12:04,258	INFO config.py:70 -- Setting up process group for: env:// [rank=0, world_size=1]
Stage 0:   0%|          | 0/1 [00:00<?, ?it/s]=86197)[0m 
  0%|          | 0/1 [00:00<?, ?it/s][Aor pid=86197)[0m 
Stage 1:   0%|          | 0/1 [00:00<?, ?it/s][A197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
  0%|          | 0/1 [00:00<?, ?it/s][A[Apid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2:   0%|          | 0/1 [00:00<?, ?it/s][A[A)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 10it [00:00, 98.65it/s]      

[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.030
Result for TorchTrainer_518ce_00000:
  _time_this_iter_s: 0.8639390468597412
  _timestamp: 1658009525
  _training_iteration: 1
  date: 2022-07-16_15-12-05
  done: false
  epoch: 20
  experiment_id: 858e3fea072f4687a81dde9582592f9f
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 1
  loss: 0.029737603763258125
  node_ip: 127.0.0.1
  pid: 86187
  should_checkpoint: true
  time_since_restore: 1.8201608657836914
  time_this_iter_s: 1.8201608657836914
  time_total_s: 1.8201608657836914
  timestamp: 1658009525
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 518ce_00000
  warmup_time: 0.0030679702758789062
  
[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.029


[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 30it [00:00, 156.58it/s][A[Ar pid=86197)[0m 
Stage 0: : 32it [00:00, 164.41it/s][Aator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 46it [00:00, 126.08it/s][A[Ar pid=86197)[0m 


[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.024
[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.026


[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 49it [00:00, 110.74it/s][Aator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 60it [00:00, 122.47it/s][A[Ar pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 65it [00:00, 121.47it/s][Aator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 79it [00:00, 143.45it/s][A[Ar pid=86197)[0m 


[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.023
[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.021


[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 85it [00:00, 143.27it/s][Aator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 99it [00:00, 160.34it/s][A[Ar pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 105it [00:00, 159.33it/s][Ator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 119it [00:00, 171.18it/s][A[A pid=86197)[0m 


[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.026
[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.028


[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 125it [00:00, 170.07it/s][Ator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 139it [00:00, 178.54it/s][A[A pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 145it [00:00, 177.51it/s][Ator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 159it [00:00, 183.98it/s][A[A pid=86197)[0m 


[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.025
[2m[36m(BaseWorkerMixin pid=86194)[0m Doing some training on epoch: 20 for batches: 20 and loss over batch: 0.026
[2m[36m(BaseWorkerMixin pid=86194)[0m == Pipeline Window 198 ==
[2m[36m(BaseWorkerMixin pid=86194)[0m Stage 1 read->randomize_block_order: 1/1 blocks executed in 0s
[2m[36m(BaseWorkerMixin pid=86194)[0m * Remote wall time: 134.46us min, 134.46us max, 134.46us mean, 134.46us total
[2m[36m(BaseWorkerMixin pid=86194)[0m * Remote cpu time: 135.0us min, 135.0us max, 135.0us mean, 135.0us total
[2m[36m(BaseWorkerMixin pid=86194)[0m * Peak heap memory usage (MiB): 154894336000.0 min, 154894336000.0 max, 154894336000 mean
[2m[36m(BaseWorkerMixin pid=86194)[0m * Output num rows: 50 min, 50 max, 50 mean, 50 total
[2m[36m(BaseWorkerMixin pid=86194)[0m * Output size bytes: 600 min, 600 max, 600 mean, 600 total
[2m[36m(BaseWorkerMixin pid=86194)

[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 166it [00:01, 185.07it/s][Ator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 180it [00:01, 190.00it/s][A[A pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 0: : 186it [00:01, 188.54it/s][Ator pid=86197)[0m 
[2m[36m(PipelineSplitExecutorCoordinator pid=86197)[0m 
Stage 2: : 200it [00:01, 192.24it/s][A[A pid=86197)[0m 


Result for TorchTrainer_518ce_00000:
  _time_this_iter_s: 0.1010432243347168
  _timestamp: 1658009526
  _training_iteration: 10
  date: 2022-07-16_15-12-06
  done: true
  epoch: 20
  experiment_id: 858e3fea072f4687a81dde9582592f9f
  experiment_tag: '0'
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 10
  loss: 0.0255332714713642
  node_ip: 127.0.0.1
  pid: 86187
  should_checkpoint: true
  time_since_restore: 2.875962972640991
  time_this_iter_s: 0.09958791732788086
  time_total_s: 2.875962972640991
  timestamp: 1658009526
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: 518ce_00000
  warmup_time: 0.0030679702758789062
  


2022-07-16 15:12:07,056	INFO tune.py:737 -- Total run time: 4.87 seconds (4.75 seconds for the tuning loop).


In [19]:
print(result)

Result(metrics={'loss': 0.022335927990326774, 'epoch': 20, '_timestamp': 1657944200, '_time_this_iter_s': 0.10506796836853027, '_training_iteration': 10, 'time_this_iter_s': 0.10604286193847656, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 10, 'trial_id': '38a86_00000', 'experiment_id': 'be17d5ff543b4e6b8e9df0efbb62c498', 'date': '2022-07-15_21-03-20', 'timestamp': 1657944200, 'time_total_s': 2.79736590385437, 'pid': 64653, 'hostname': 'Juless-MacBook-Pro-16', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 2.79736590385437, 'timesteps_since_restore': 0, 'iterations_since_restore': 10, 'warmup_time': 0.002788066864013672, 'experiment_tag': '0'}, checkpoint=<ray.air.checkpoint.Checkpoint object at 0x130b2a460>, error=None, log_dir=PosixPath('/Users/jules/ray_results/TorchTrainer_2022-07-15_21-03-16/TorchTrainer_38a86_00000_0_2022-07-15_21-03-17'), metrics_dataframe=       loss  epoch  _timestamp  _time_this_i

In [20]:
ray.shutdown()