In [1]:
import subprocess, os, time
import cml.workers_v1 as workers

DASHBOARD_PORT = os.environ['CDSW_READONLY_PORT']
DASHBOARD_IP = os.environ['CDSW_IP_ADDRESS']

# use num-cpus=0 when start a head node to prevent this node from performing task/actor computation.
command = "ray start --head --block --include-dashboard=true --dashboard-port=$CDSW_READONLY_PORT --num-cpus=0 --num-gpus=0 &" 
#command = "ray start --head --block --include-dashboard=true --dashboard-port=$CDSW_READONLY_PORT --num-cpus=6 --num-gpus=1 &" 

subprocess.run(command, shell = True, executable="/bin/bash")

with open("RAY_HEAD_IP", 'w') as output_file:
    output_file.write(DASHBOARD_IP)
            
ray_head_addr = DASHBOARD_IP + ':6379'
ray_url = f"ray://{DASHBOARD_IP}:10001" 
worker_start_cmd = f"!ray start --block --address={ray_head_addr}"

time.sleep(7)
ray_workers = workers.launch_workers(
    n=1, 
    cpu=5, 
    memory=32,
    nvidia_gpu=1,
    code=worker_start_cmd,
)

In [None]:

    # DeepSpeed correctly handles device placement (CPU or GPU)
    #device = get_accelerator().device_name(model.local_rank)
    device = "cuda:0"

if __name__ == "__main__":
    # --- Conditionally set device based on CUDA availability ---
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        print("✅ CUDA is available. Training will run on GPU.")
    else:
        print("⚠️ CUDA not found. Training will run on CPU.")

    # --- DeepSpeed Configuration ---
    deepspeed_config = {
        "optimizer": {"type": "AdamW", "params": {"lr": 2e-5}},
        "scheduler": {"type": "WarmupLR", "params": {"warmup_num_steps": 100}},
        "fp16": {"enabled": use_gpu}, # Enable fp16 only if GPU is available
        "bf16": {"enabled": False},
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {"device": "none"},
            "offload_param": {"device": "none"},
        },
        "gradient_accumulation_steps": 1,
        "gradient_clipping": True,
        "steps_per_print": 10,
        "train_micro_batch_size_per_gpu": 16,
    }


In [2]:
import os
import json
from tempfile import TemporaryDirectory
import torch
import deepspeed
from deepspeed.accelerator import get_accelerator
from torchmetrics.text import ROUGEScore
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, set_seed

import ray
import ray.train
from ray.train import Checkpoint, DataConfig, ScalingConfig
from ray.train.torch import TorchTrainer

# NLTK is required for ROUGE score calculation
import nltk
nltk.download("punkt", quiet=True)


def train_func(config):
    """
    Your training function that will be launched on each Ray worker.
    """
    # Unpack training configs
    set_seed(config["seed"])
    model_id = config["model_id"]
    num_epochs = config["num_epochs"]
    train_batch_size = config["train_batch_size"]
    eval_batch_size = config["eval_batch_size"]
    generation_max_length = config["generation_max_length"]
    generation_num_beams = config["generation_num_beams"]
    deepspeed_config = config["deepspeed_config"]

    # Instantiate the Tokenizer and Model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # --- Prepare Ray Data Loaders ---
    train_ds = ray.train.get_dataset_shard("train")
    eval_ds = ray.train.get_dataset_shard("validation")

    def collate_fn(batch):
        # Ensure all data is string type, replacing None with an empty string ""
        instructions = [str(s) if s is not None else "" for s in batch["instruction"]]
        inputs_col = [str(s) if s is not None else "" for s in batch["input"]]
        targets = [str(s) if s is not None else "" for s in batch["output"]]
        
        # Combine instruction and input columns to create the full input prompt
        inputs = [
            f"Instruction: {instr}\nInput: {inp}" 
            for instr, inp in zip(instructions, inputs_col)
        ]

        model_inputs = tokenizer(
            inputs,
            max_length=256,
            padding="longest",
            truncation=True,
            return_tensors="pt"
        )

        labels = tokenizer(
            text_target=targets,
            max_length=generation_max_length,
            padding="longest",
            truncation=True,
            return_tensors="pt"
        ).input_ids

        labels[labels == tokenizer.pad_token_id] = -100
        model_inputs["labels"] = labels

        return model_inputs

    train_dataloader = train_ds.iter_torch_batches(
        batch_size=train_batch_size, collate_fn=collate_fn
    )
    eval_dataloader = eval_ds.iter_torch_batches(
        batch_size=eval_batch_size, collate_fn=collate_fn
    )

    # --- Initialize DeepSpeed Engine ---
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model,
        model_parameters=model.parameters(),
        config=deepspeed_config,
    )
    
    # Set device as requested
    device = "cuda:0"

    # --- Initialize Evaluation Metrics ---
    rouge_metric = ROUGEScore().to(device)

    # --- Training and Evaluation Loop ---
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            model.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        for batch in eval_dataloader:
            labels = batch.pop("labels").to(device)
            batch_on_device = {k: v.to(device) for k, v in batch.items()}

            with torch.no_grad():
                generated_ids = model.generate(
                    **batch_on_device,
                    max_length=generation_max_length,
                    num_beams=generation_num_beams,
                )

            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            labels[labels == -100] = tokenizer.pad_token_id
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            rouge_metric.update(decoded_preds, decoded_labels)

        eval_metrics = rouge_metric.compute()
        rouge_metric.reset()

        if model.global_rank == 0:
            print(f"Epoch {epoch}:")
            for key, value in eval_metrics.items():
                print(f"  {key}: {value.item():.4f}")

        # --- Report checkpoint and metrics to Ray Train ---
        with TemporaryDirectory() as tmpdir:
            model.save_checkpoint(tmpdir)
            if torch.distributed.is_initialized():
                torch.distributed.barrier()

            ray.train.report(
                metrics={k: v.item() for k, v in eval_metrics.items()},
                checkpoint=Checkpoint.from_directory(tmpdir)
            )

if __name__ == "__main__":
    # Check for GPU availability
    use_gpu = torch.cuda.is_available()
    print(f"Using GPU: {use_gpu}")

    deepspeed_config = {
        "optimizer": {"type": "AdamW", "params": {"lr": 2e-5}},
        "scheduler": {"type": "WarmupLR", "params": {"warmup_num_steps": 100}},
        "fp16": {"enabled": use_gpu},
        "bf16": {"enabled": False},
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {"device": "none"},
            "offload_param": {"device": "none"},
        },
        "gradient_accumulation_steps": 1,
        "gradient_clipping": True,
        "steps_per_print": 10,
        "train_micro_batch_size_per_gpu": 16,
    }

    training_config = {
        "seed": 42,
        "model_id": "t5-small",
        "num_epochs": 1,
        "train_batch_size": 32,
        "eval_batch_size": 32,
        "generation_max_length": 128,
        "generation_num_beams": 4,
        "deepspeed_config": deepspeed_config,
    }

    deepspeed_config["train_micro_batch_size_per_gpu"] = training_config["train_batch_size"]

    ray_datasets = {
        "train": ray.data.read_parquet("wikisql/data/train-00000-of-00001-36d5d5ed0289390f.parquet"),
        "validation": ray.data.read_parquet("wikisql/data/validation-00000-of-00001-3f1ecb1168a6a037.parquet"),
    }

    trainer = TorchTrainer(
        train_func,
        train_loop_config=training_config,
        scaling_config=ScalingConfig(num_workers=1, use_gpu=True),
        datasets=ray_datasets,
        dataset_config=DataConfig(datasets_to_split=["train", "validation"]),
    )

    result = trainer.fit()

    print("Training finished!")
    if result.best_checkpoints:
        best_checkpoint_path = result.best_checkpoints[0][0].path
        print(f"Best checkpoint saved at: {best_checkpoint_path}")

[2025-07-28 13:15:06,957] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cpu (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status


[2025-07-28 13:17:01,586] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
Using GPU: False


2025-07-28 13:17:47,851	INFO worker.py:1747 -- Connecting to existing Ray cluster at address: 10.254.10.56:6379...
2025-07-28 13:17:47,875	INFO worker.py:1918 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8100 [39m[22m


Parquet Files Sample 0:   0%|          | 0.00/1.00 [00:00<?, ? file/s]

Parquet Files Sample 0:   0%|          | 0.00/1.00 [00:00<?, ? file/s]

2025-07-28 13:17:59,365	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-07-28 13:17:59 (running for 00:00:00.14)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-07-28 13:18:05 (running for 00:00:05.17)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-07-28 13:18:10 (running for 00:00:10.22)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(TrainTrainable pid=551, ip=10.254.5.187)[0m Trainable.setup took 164.692 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2025-07-28 13:21:56 (running for 00:03:57.07)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=551, ip=10.254.5.187)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=551, ip=10.254.5.187)[0m - (node_id=5d4f22d0f159853292fe9754df0cccaa2a6820c3e1b5a0437c2a85ed, ip=10.254.5.187, pid=715) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:01,788] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
== Status ==
Current time: 2025-07-28 13:22:01 (running for 00:04:02.12)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:06,756] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
== Status ==
Current time: 2025-07-28 13:22:07 (running for 00:04:07.15)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/To



[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:16,738] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.17.2, git-hash=unknown, git-branch=unknown
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:16,738] [INFO] [comm.py:676:init_distributed] cdb=None
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:16,738] [INFO] [config.py:684:__init__] Config mesh_device None world_size = 1
== Status ==
Current time: 2025-07-28 13:22:17 (running for 00:04:17.23)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:21,032] [INFO] [engine.py:1339:_configure_distributed_model] ********** distributed groups summary ******

[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Using /home/cdsw/.cache/torch_extensions/py310_cu126 as PyTorch extensions root...
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Detected CUDA files, patching ldflags
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Emitting ninja build file /home/cdsw/.cache/torch_extensions/py310_cu126/fused_adam/build.ninja...
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Building extension module fused_adam...
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


== Status ==
Current time: 2025-07-28 13:22:32 (running for 00:04:32.35)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [1/1] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/home/cdsw/.local/lib/python3.10/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Time to load fused_adam op: 4.852369546890259 seconds
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:34,596] [INFO] [logging.py:107:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07

[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Loading extension module fused_adam...


[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:34,858] [INFO] [utils.py:781:see_memory_usage] Stage 3 initialize beginning
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:34,859] [INFO] [utils.py:782:see_memory_usage] MA 0.23 GB         Max_MA 0.23 GB         CA 0.23 GB         Max_CA 0 GB 
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:34,859] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 14.68 GB, percent = 2.9%
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:34,861] [INFO] [stage3.py:186:__init__] Reduce bucket size 500000000
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:34,861] [INFO] [stage3.py:187:__init__] Prefetch bucket size 50000000
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:35,095] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 



[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:36,149] [INFO] [utils.py:781:see_memory_usage] Before creating fp16 partitions
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:36,150] [INFO] [utils.py:782:see_memory_usage] MA 0.23 GB         Max_MA 0.23 GB         CA 0.29 GB         Max_CA 0 GB 
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:36,150] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 14.71 GB, percent = 2.9%
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:36,853] [INFO] [utils.py:781:see_memory_usage] After creating fp16 partitions: 1
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:36,854] [INFO] [utils.py:782:see_memory_usage] MA 0.23 GB         Max_MA 0.23 GB         CA 0.23 GB         Max_CA 0 GB 
[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m [2025-07-28 13:22:36,854] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 14.71 GB,

[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Registered dataset logger for dataset train_2_0
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Starting execution of Dataset train_2_0. Full logs are in /tmp/ray/session_2025-07-28_13-13-11_505390_320/logs/ray-data
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Execution plan of Dataset train_2_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m ⚠️  Ray's object store is configured to use only 42.9% of available memory (17.7GB out of 41.3GB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.


(pid=791, ip=10.254.5.187) Running 0: 0.00 row [00:00, ? row/s]

(pid=791, ip=10.254.5.187) - ReadParquet->SplitBlocks(80) 1: 0.00 row [00:00, ? row/s]

(pid=791, ip=10.254.5.187) - split(1, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-28 13:22:47 (running for 00:04:47.46)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:22:52 (running for 00:04:52.50)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:22:57 (running for 00:04:57.54)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m ✔️  Dataset train_2_0 execution finished in 1062.74 seconds


== Status ==
Current time: 2025-07-28 13:40:30 (running for 00:22:31.00)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:40:35 (running for 00:22:36.04)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:40:40 (running for 00:22:41.08)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Registered dataset logger for dataset validation_3_0
[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Starting execution of Dataset validation_3_0. Full logs are in /tmp/ray/session_2025-07-28_13-13-11_505390_320/logs/ray-data
[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Execution plan of Dataset validation_3_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> OutputSplitter[split(1, equal=True)]
[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m ⚠️  Ray's object store is configured to use only 42.9% of available memory (17.7GB out of 41.3GB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable.


(pid=790, ip=10.254.5.187) Running 0: 0.00 row [00:00, ? row/s]

(pid=790, ip=10.254.5.187) - ReadParquet->SplitBlocks(80) 1: 0.00 row [00:00, ? row/s]

(pid=790, ip=10.254.5.187) - split(1, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-28 13:41:41 (running for 00:23:41.54)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:41:46 (running for 00:23:46.58)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:41:51 (running for 00:23:51.62)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m ✔️  Dataset validation_3_0 execution finished in 576.36 seconds


== Status ==
Current time: 2025-07-28 13:51:21 (running for 00:33:21.22)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:51:26 (running for 00:33:26.26)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:51:31 (running for 00:33:31.30)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/cdsw/ray_results/TorchTrainer_2025-07-28_13-17-59/TorchTrainer_483dd_00000_0_2025-07-28_13-17-59/checkpoint_000000)
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Registered dataset logger for dataset train_2_1
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Starting execution of Dataset train_2_1. Full logs are in /tmp/ray/session_2025-07-28_13-13-11_505390_320/logs/ray-data
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Execution plan of Dataset train_2_1: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> OutputSplitter[split(1, equal=True)]


(pid=791, ip=10.254.5.187) Running 0: 0.00 row [00:00, ? row/s]

(pid=791, ip=10.254.5.187) - ReadParquet->SplitBlocks(80) 1: 0.00 row [00:00, ? row/s]

(pid=791, ip=10.254.5.187) - split(1, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-28 13:52:51 (running for 00:34:51.96)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:52:56 (running for 00:34:57.01)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 13:53:01 (running for 00:35:02.04)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m ✔️  Dataset train_2_1 execution finished in 1048.39 seconds


== Status ==
Current time: 2025-07-28 14:10:20 (running for 00:52:20.45)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:10:25 (running for 00:52:25.49)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:10:30 (running for 00:52:30.53)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Registered dataset logger for dataset validation_3_1
[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Starting execution of Dataset validation_3_1. Full logs are in /tmp/ray/session_2025-07-28_13-13-11_505390_320/logs/ray-data
[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Execution plan of Dataset validation_3_1: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> OutputSplitter[split(1, equal=True)]


(pid=790, ip=10.254.5.187) Running 0: 0.00 row [00:00, ? row/s]

(pid=790, ip=10.254.5.187) - ReadParquet->SplitBlocks(80) 1: 0.00 row [00:00, ? row/s]

(pid=790, ip=10.254.5.187) - split(1, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-28 14:11:30 (running for 00:53:30.99)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:11:35 (running for 00:53:36.04)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:11:40 (running for 00:53:41.07)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m ✔️  Dataset validation_3_1 execution finished in 572.55 seconds


== Status ==
Current time: 2025-07-28 14:21:05 (running for 01:03:05.65)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:21:10 (running for 01:03:10.69)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:21:15 (running for 01:03:15.73)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/cdsw/ray_results/TorchTrainer_2025-07-28_13-17-59/TorchTrainer_483dd_00000_0_2025-07-28_13-17-59/checkpoint_000001)
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Registered dataset logger for dataset train_2_2
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Starting execution of Dataset train_2_2. Full logs are in /tmp/ray/session_2025-07-28_13-13-11_505390_320/logs/ray-data
[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m Execution plan of Dataset train_2_2: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> OutputSplitter[split(1, equal=True)]


(pid=791, ip=10.254.5.187) Running 0: 0.00 row [00:00, ? row/s]

(pid=791, ip=10.254.5.187) - ReadParquet->SplitBlocks(80) 1: 0.00 row [00:00, ? row/s]

(pid=791, ip=10.254.5.187) - split(1, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-28 14:22:31 (running for 01:04:31.37)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:22:36 (running for 01:04:36.41)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:22:41 (running for 01:04:41.45)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=791, ip=10.254.5.187)[0m ✔️  Dataset train_2_2 execution finished in 1081.18 seconds


== Status ==
Current time: 2025-07-28 14:40:34 (running for 01:22:34.96)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:40:39 (running for 01:22:40.00)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:40:44 (running for 01:22:45.04)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Registered dataset logger for dataset validation_3_2
[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Starting execution of Dataset validation_3_2. Full logs are in /tmp/ray/session_2025-07-28_13-13-11_505390_320/logs/ray-data
[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m Execution plan of Dataset validation_3_2: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> OutputSplitter[split(1, equal=True)]


(pid=790, ip=10.254.5.187) Running 0: 0.00 row [00:00, ? row/s]

(pid=790, ip=10.254.5.187) - ReadParquet->SplitBlocks(80) 1: 0.00 row [00:00, ? row/s]

(pid=790, ip=10.254.5.187) - split(1, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-28 14:41:45 (running for 01:23:45.50)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:41:50 (running for 01:23:50.54)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:41:55 (running for 01:23:55.58)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(SplitCoordinator pid=790, ip=10.254.5.187)[0m ✔️  Dataset validation_3_2 execution finished in 592.86 seconds


== Status ==
Current time: 2025-07-28 14:51:40 (running for 01:33:40.20)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:51:45 (running for 01:33:45.25)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-28 14:51:50 (running for 01:33:50.28)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_

[36m(RayTrainWorker pid=715, ip=10.254.5.187)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/cdsw/ray_results/TorchTrainer_2025-07-28_13-17-59/TorchTrainer_483dd_00000_0_2025-07-28_13-17-59/checkpoint_000002)
2025-07-28 14:53:10,420	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/cdsw/ray_results/TorchTrainer_2025-07-28_13-17-59' in 1.2310s.
2025-07-28 14:53:10,426	INFO tune.py:1041 -- Total run time: 5711.06 seconds (5709.34 seconds for the tuning loop).


== Status ==
Current time: 2025-07-28 14:53:10 (running for 01:35:10.57)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/40 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
Result logdir: /tmp/ray/session_2025-07-28_13-13-11_505390_320/artifacts/2025-07-28_13-17-59/TorchTrainer_2025-07-28_13-17-59/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)


Training finished!
Best checkpoint saved at: /home/cdsw/ray_results/TorchTrainer_2025-07-28_13-17-59/TorchTrainer_483dd_00000_0_2025-07-28_13-17-59/checkpoint_000000
