In [34]:
import sys
sys.path.append("/mnt/code")

In [35]:
import os
import json
import math
import pandas as pd
import xgboost as xgb
import pyarrow as pa
import pyarrow.dataset as pds

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature

import ray
from ray import tune
from ray.air import RunConfig, ScalingConfig

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import ray
from ray import air, train
from ray.train import Checkpoint
from ray.train.torch import TorchTrainer, get_device, prepare_model, prepare_data_loader
from ray.air.config import RunConfig, ScalingConfig

try:
    from ray.tune.callback import Callback      # Ray >= 2.6
except ImportError:
    from ray.tune.callbacks import Callback     # Older Ray
from utils import ddl_cluster_scaling_client
from utils import mlflow_utils
from utils import ray_utils

from pathlib import Path

In [36]:
import os
import platform
import json
import ray

# ----- RUN CONFIG (edit as needed) -----

'''
#Pip installs do not work yet in Domino. Add the libraries to your environment or install using command line in the worker
RUN_CONFIG = {
    # Packages are installed into an isolated env for each worker by Ray
    
    "pip": [
        "ray[train]==2.49.1",
        "torch==2.3.1",
        "torchvision==0.18.1",
        "torchaudio==2.3.1"
    ],
    # Environment variables to expose inside workers
    "env_vars": {
        "MY_APP_FLAG": "enabled",
        "NCCL_IB_DISABLE": "1",     # harmless on CPU-only boxes; useful hint on GPU clusters without IB/EFA
        "TORCH_SHOW_CPP_STACKTRACES": "1"
    }
}
'''
RUN_CONFIG = {
    # Packages are installed into an isolated env for each worker by Ray
    # Environment variables to expose inside workers
    "env_vars": {
        "MY_APP_FLAG": "enabled",
        "NCCL_IB_DISABLE": "1",     # harmless on CPU-only boxes; useful hint on GPU clusters without IB/EFA
        "TORCH_SHOW_CPP_STACKTRACES": "1"
    }
}

In [37]:
#!pip install --user "torch==2.3.1" "torchvision==0.18.1" "torchaudio==2.3.1"

In [38]:
#!pip install -U "pyopenssl<24" "cryptography<42"

In [39]:
@ray.remote(num_cpus=1)
def probe_worker(env_keys):
    import os, platform, importlib
    from ray.runtime_context import get_runtime_context

    ctx = get_runtime_context()

    def _id(ctx, name):
        if not hasattr(ctx, name):
            return None
        v = getattr(ctx, name)()  # may be bytes-like w/ .hex() or already a str
        try:
            return v.hex()
        except AttributeError:
            return str(v)

    # Torch proof without leaking torch objects in the return
    torch = importlib.import_module("torch")
    x = torch.tensor([1.0, 2.0, 3.0]) * 2.0
    sample_sum = float(x.sum().item())
    torch_version = str(getattr(torch, "__version__", "unknown"))
    cuda_avail = bool(hasattr(torch, "cuda") and torch.cuda.is_available())
    del x, torch

    return {
        "node": platform.node(),
        "pid": os.getpid(),
        "python_executable": os.sys.executable,
        "torch_version": torch_version,
        "torch_cuda_available": cuda_avail,
        "env": {k: os.environ.get(k) for k in env_keys},
        "torch_sample_sum": sample_sum,
        "ids": {
            "task_id": _id(ctx, "get_task_id"),
            "actor_id": _id(ctx, "get_actor_id"),
            "node_id": _id(ctx, "get_node_id"),
            "job_id": _id(ctx, "get_job_id"),
            "namespace": ctx.get_namespace() if hasattr(ctx, "get_namespace") else None,
        },
    }


In [40]:
# file: simple_ray_runtime_env_fix.py
import os, json, platform
import ray
import torch



SHARED_DIR = "/mnt/data/ddl-end-to-end-demo"

RUNTIME_ENV = {
    "env_vars": {
        "SHARED_DIR": SHARED_DIR,
        "APP_MODE": "probe",
        "MY_APP_FLAG": "enabled",
        "NCCL_IB_DISABLE": "1",
        "TORCH_SHOW_CPP_STACKTRACES": "1",
    }
}

if "RAY_HEAD_SERVICE_HOST" in os.environ and "RAY_HEAD_SERVICE_PORT" in os.environ:
   addr = f"ray://{os.environ['RAY_HEAD_SERVICE_HOST']}:{os.environ['RAY_HEAD_SERVICE_PORT']}"
   ray.shutdown()
   ray.init(
      address=addr or "auto",
      runtime_env=RUNTIME_ENV,
      namespace="demo-ray-ns"
  )

def _norm_id(val):
    try:
        return val.hex()
    except Exception:
        return str(val)

@ray.remote(num_cpus=1)
def probe_worker():
    import os, time, socket, json, tempfile
    from ray.runtime_context import get_runtime_context
    from pathlib import Path

    ctx = get_runtime_context()
    job_id  = _norm_id(ctx.get_job_id())  if hasattr(ctx, "get_job_id") else "na"
    task_id = _norm_id(ctx.get_task_id()) if hasattr(ctx, "get_task_id") else "na"

    shared = Path(os.environ["SHARED_DIR"],job_id,task_id)  # pulled from runtime_env
    shared.mkdir(parents=True, exist_ok=True)

    payload = {
        "node": platform.node(),
        "pid": os.getpid(),
        "job_id": job_id,
        "task_id": task_id,
        "env": dict(os.environ),
        "ts": time.time(),
    }

    fname = f"probe_{job_id}_{task_id}_{os.getpid()}.json"
    dest = shared / fname
    with tempfile.NamedTemporaryFile("w", delete=False, dir=str(shared)) as tmp:
        json.dump(payload, tmp, indent=2)
        tmp.flush()
        os.fsync(tmp.fileno())
        tmp_path = tmp.name
    os.replace(tmp_path, dest)

    return {"wrote": str(dest), "size": dest.stat().st_size}

def main():
    # --------------------------------------
    if "RAY_HEAD_SERVICE_HOST" in os.environ and "RAY_HEAD_SERVICE_PORT" in os.environ:
       addr = f"ray://{os.environ['RAY_HEAD_SERVICE_HOST']}:{os.environ['RAY_HEAD_SERVICE_PORT']}"
       ray.shutdown()
       ray.init(
          address=addr or "auto",
          #runtime_env={"env_vars": RUNTIME_ENV},   # same env you used earlier
          namespace="demo-ray-ns"
      )
# Connect to Domino Ray if available; otherw
    t1 = probe_worker.options(runtime_env=RUNTIME_ENV).remote()
    t2 = probe_worker.options(runtime_env=RUNTIME_ENV).remote()
    out1, out2 = ray.get([t1, t2])
    print(json.dumps({"worker_1": out1, "worker_2": out2}, indent=2))

main()



2025-09-20 17:30:34,920	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.
2025-09-20 17:30:35,945	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.


{
  "worker_1": {
    "wrote": "/mnt/data/ddl-end-to-end-demo/1f000000/81a4966da5e2d352ffffffffffffffffffffffff1f000000/probe_1f000000_81a4966da5e2d352ffffffffffffffffffffffff1f000000_157.json",
    "size": 17165
  },
  "worker_2": {
    "wrote": "/mnt/data/ddl-end-to-end-demo/1f000000/ebd77e3d348811d7ffffffffffffffffffffffff1f000000/probe_1f000000_ebd77e3d348811d7ffffffffffffffffffffffff1f000000_156.json",
    "size": 17165
  }
}


In [41]:
def prepare_mnist(data_root: Path):
    data_root.mkdir(parents=True, exist_ok=True)
    # Uses torchvision's built-in downloader/extractor
    from torchvision import datasets, transforms
    tfm = transforms.Compose([transforms.ToTensor()])
    datasets.MNIST(str(data_root), train=True,  download=True, transform=tfm)
    datasets.MNIST(str(data_root), train=False, download=True, transform=tfm)


data_dir = Path("/mnt/data/ddl-end-to-end-demo/mnist/")
data_dir.mkdir(parents=True, exist_ok=True)
prepare_mnist(data_dir)

## Apply Domsed mutation if using this under Istio

Worker nodes need to be able to communicate over ephemeral ports for distributed training to work. When using Domino with istio 
you need to open a port (Ex. 29000) for the workers for both inbound and outbound connections. The `allow-custom-port-inbound-ray-interworker-comm` simply adds 29000 to the list of the `includeInboundPorts`

Mutation to allow inbound to port `29000` - **allow-custom-port-inbound-ray-interworker-comm**

```
apiVersion: apps.dominodatalab.com/v1alpha1
kind: Mutation
metadata:
  name: allow-custom-port-inbound-ray-interworker-comm
  namespace: domino-platform
rules:
-
  modifyAnnotation:
    key: "traffic.sidecar.istio.io/includeInboundPorts"
    value: "2384,2385,11000,11001,11002,11003,11004,11005,11006,11007,11008,11009,11010,11011,11012,11013,11014,11015,11016,11017,11018,11019,11020,11021,11022,11023,11024,11025,11026,11027,11028,11029,11030,11031,11032,11033,11034,11035,11036,11037,11038,11039,11040,11041,11042,11043,11044,11045,11046,11047,11048,11049,11050,11051,11052,11053,11054,11055,11056,11057,11058,11059,11060,11061,11062,11063,11064,11065,11066,11067,11068,11069,11070,11071,11072,11073,11074,11075,11076,11077,11078,11079,11080,11081,11082,11083,11084,11085,11086,11087,11088,11089,11090,11091,11092,11093,11094,11095,11096,11097,11098,11099,29000"

```

Mutation to allow outbound to port `29000` - **allow-custom-port-outbound-ray-interworker-comm**
```
apiVersion: apps.dominodatalab.com/v1alpha1
kind: Mutation
metadata:
  name: allow-custom-port-outbound-ray-interworker-comm
  namespace: domino-platform
rules:
-
  modifyAnnotation:
    key: "traffic.sidecar.istio.io/excludeOutboundPorts"
    value: "29000"
```

When using without Istio you do not need these mutation. You also do not have to pass the env variable `"MASTER_PORT": "29000"`.
Ray runtime will pick any ephemeral port to communicate for the purpose of distributed training

In [None]:
import os, ray
from ray.air.config import RunConfig
from ray.tune.logger import CSVLoggerCallback, JsonLoggerCallback
from ray.runtime_context import get_runtime_context
from pathlib import Path

def build_model(num_classes: int = 10) -> nn.Module:
    return nn.Sequential(
        nn.Flatten(),
        nn.Linear(28 * 28, 512), nn.ReLU(),
        nn.Linear(512, 256), nn.ReLU(),
        nn.Linear(256, num_classes),
    )
    
def train_loop_per_worker(config):
    device = get_device()
    model = prepare_model(build_model().to(device))

    data_root = os.environ["SHARED_DIR"]  # already populated
    tfm = transforms.Compose([transforms.ToTensor()])

    # No network access in workers; just read the files
    train_ds = datasets.MNIST(data_root, train=True,  download=False, transform=tfm)
    test_ds  = datasets.MNIST(data_root, train=False, download=False, transform=tfm)

    # Start conservative; you can raise num_workers/pin_memory after it works
    train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True,
                              num_workers=0, pin_memory=False)
    test_loader  = DataLoader(test_ds,  batch_size=512, shuffle=False,
                              num_workers=0, pin_memory=False)

    train_loader = prepare_data_loader(train_loader)
    test_loader  = prepare_data_loader(test_loader)

    opt = torch.optim.AdamW(model.parameters(), lr=config["lr"])
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(config["epochs"]):
        model.train()
        running = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            opt.zero_grad(set_to_none=True)
            loss = loss_fn(model(x), y)
            loss.backward()
            opt.step()
            running += loss.item()

        model.eval()
        correct = total = 0
        with torch.no_grad():
            for x, y in test_loader:
                x, y = x.to(device), y.to(device)
                pred = model(x).argmax(dim=1)
                correct += (pred == y).sum().item()
                total += y.numel()
        acc = correct / total
        train.report({"epoch": epoch, "train_loss": running, "val_acc": acc})



def main():

    RUNTIME_ENV = {
        "env_vars": {
        "GLOO_SOCKET_IFNAME": "eth0",
        "SHARED_DIR": str(data_dir),
        "TUNE_DISABLE_AUTO_CALLBACKS": "1",
        "TORCH_DISABLE_ADDR2LINE": "1",     # stop symbolizer hang
        "TORCH_SHOW_CPP_STACKTRACES": "1",
        #"NCCL_IB_DISABLE": "1",
        "NCCL_P2P_DISABLE": "1",
        "NCCL_SHM_DISABLE": "1",
        "OMP_NUM_THREADS": "2",
        # >>> Key bits for DDP rendezvous <<<
        "MASTER_PORT": "29000",           # fixed, not ephemeral
        "GLOO_SOCKET_IFNAME": "eth0",     # bind on pod interface
        "NCCL_SOCKET_IFNAME": "eth0",     # harmless even if CPU-only
            
        }
    }
    # --------------------------------------
    if "RAY_HEAD_SERVICE_HOST" in os.environ and "RAY_HEAD_SERVICE_PORT" in os.environ:
       addr = f"ray://{os.environ['RAY_HEAD_SERVICE_HOST']}:{os.environ['RAY_HEAD_SERVICE_PORT']}"
       ray.shutdown()
       ray.init(
          address=addr or "auto",
          runtime_env=RUNTIME_ENV,   # same env you used earlier
          namespace="demo-ray-ns"
      )

    ctx = get_runtime_context()
    try:
        job_id_hex = ctx.get_job_id().hex()
    except Exception:
        job_id_hex = "unknown_job"

    DATASET_FOLDER = "/mnt/data/ddl-end-to-end-demo/"
    shared = Path(DATASET_FOLDER,job_id_hex,"ray_results")  # pulled from runtime_env
    shared.mkdir(parents=True, exist_ok=True)
    STORAGE_PATH=str(shared)
    
    storage_base = Path("/mnt/data/ddl-end-to-end-demo")  # head-visible shared
    job_id_hex = getattr(ray.get_runtime_context(), "get_job_id", lambda: "unknown")()
    job_id_hex = job_id_hex.hex() if hasattr(job_id_hex, "hex") else str(job_id_hex)
    storage_path = str(storage_base / job_id_hex / "ray_results")
    
    os.environ["TUNE_DISABLE_AUTO_CALLBACKS"] = "1"
    trainer = TorchTrainer(
        train_loop_per_worker,
        train_loop_config={"lr": 1e-3, "batch_size": 256, "epochs": 5},
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,                      # keep CPU+gloo until stable
            resources_per_worker={"CPU": 2,"GPU": 1},
            trainer_resources={"CPU": 0},               # <— key change
            placement_strategy="SPREAD",     
            #placement_strategy="PACK",          # single-node to avoid networking issues
        ),
        run_config=RunConfig(
            name=f"mnist_torch_ddp_{job_id_hex}",
            storage_path=STORAGE_PATH,
            callbacks=[CSVLoggerCallback(), JsonLoggerCallback()],
        ),
    )


    result = trainer.fit()


In [43]:
main()


[36m(TunerInternal pid=11812)[0m 
[36m(TunerInternal pid=11812)[0m View detailed results here: /mnt/data/ddl-end-to-end-demo/unknown_job/ray_results/mnist_torch_ddp_20000000
[36m(TunerInternal pid=11812)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-09-20_07-15-22_475265_1/artifacts/2025-09-20_10-30-43/mnist_torch_ddp_20000000/driver_artifacts`


[36m(TunerInternal pid=11812)[0m AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.


[36m(TunerInternal pid=11812)[0m 
[36m(TunerInternal pid=11812)[0m Training started with configuration:
[36m(TunerInternal pid=11812)[0m ╭──────────────────────────────────────╮
[36m(TunerInternal pid=11812)[0m │ Training config                      │
[36m(TunerInternal pid=11812)[0m ├──────────────────────────────────────┤
[36m(TunerInternal pid=11812)[0m │ train_loop_config/batch_size     256 │
[36m(TunerInternal pid=11812)[0m │ train_loop_config/epochs           5 │
[36m(TunerInternal pid=11812)[0m │ train_loop_config/lr           0.001 │
[36m(TunerInternal pid=11812)[0m ╰──────────────────────────────────────╯


[36m(RayTrainWorker pid=325, ip=100.64.60.227)[0m Setting up process group for: env:// [rank=0, world_size=2]
[36m(TorchTrainer pid=280, ip=100.64.60.227)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=280, ip=100.64.60.227)[0m - (node_id=3b69fc3f37ca82c7ec924c97f63485d15dba64e05e8e9b299ded9f91, ip=100.64.60.227, pid=325) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=280, ip=100.64.60.227)[0m - (node_id=135baf60627c984443801299c5cae9747aa01920cc1d82b8701a2bd6, ip=100.64.21.58, pid=156) world_rank=1, local_rank=0, node_rank=1
[36m(RayTrainWorker pid=325, ip=100.64.60.227)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=325, ip=100.64.60.227)[0m Wrapping provided model in DistributedDataParallel.
[36m(RayTrainWorker pid=156, ip=100.64.21.58)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=156, ip=100.64.21.58)[0m Wrapping provided model in DistributedDataParallel.


[36m(TunerInternal pid=11812)[0m 
[36m(TunerInternal pid=11812)[0m Training finished iteration 1 at 2025-09-20 10:31:01. Total running time: 16s
[36m(TunerInternal pid=11812)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=11812)[0m │ Training result               │
[36m(TunerInternal pid=11812)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=11812)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=11812)[0m │ time_this_iter_s      11.6474 │
[36m(TunerInternal pid=11812)[0m │ time_total_s          11.6474 │
[36m(TunerInternal pid=11812)[0m │ training_iteration          1 │
[36m(TunerInternal pid=11812)[0m │ epoch                       0 │
[36m(TunerInternal pid=11812)[0m │ train_loss            55.5477 │
[36m(TunerInternal pid=11812)[0m │ val_acc                0.9386 │
[36m(TunerInternal pid=11812)[0m ╰───────────────────────────────╯
[36m(TunerInternal pid=11812)[0m 
[36m(TunerInternal pid=11812)[0m Training finished ite

[36m(TunerInternal pid=11812)[0m Wrote the latest version of all result files and experiment state to '/mnt/data/ddl-end-to-end-demo/unknown_job/ray_results/mnist_torch_ddp_20000000' in 0.0226s.


In [None]:
ray.shutdown()