In [1]:
import sys
sys.path.append("/mnt/code")

In [2]:
import os
import json
import math
import pandas as pd
import xgboost as xgb
import pyarrow as pa
import pyarrow.dataset as pds

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature

import ray
from ray import tune
from ray.air import RunConfig, ScalingConfig

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import ray
from ray import air, train
from ray.train import Checkpoint
from ray.train.torch import TorchTrainer, get_device, prepare_model, prepare_data_loader
from ray.air.config import RunConfig, ScalingConfig

try:
    from ray.tune.callback import Callback      # Ray >= 2.6
except ImportError:
    from ray.tune.callbacks import Callback     # Older Ray
from utils import ddl_cluster_scaling_client
from utils import mlflow_utils
from utils import ray_utils

2025-09-19 22:27:11,170	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.7.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-09-19 22:27:11,250	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.7.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-09-19 22:27:13,045	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.7.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Pre-requsites

Configure the following user environment variables

1. AWS_ROLE_ARN - This is the AWS role being assumed via IR
2. S3_BUCKET_NAME

In [3]:
# Download dataset and push to S3

# Load dataset
data = fetch_california_housing(as_frame=True)
df = data.frame.rename(columns={"MedHouseVal": "median_house_value"})

# Split
train, tmp = train_test_split(df, test_size=0.3, random_state=42)
val, test  = train_test_split(tmp, test_size=0.5, random_state=42)

# Save locally
train.to_parquet("/tmp/train.parquet", index=False)
val.to_parquet("/tmp/val.parquet", index=False)
test.to_parquet("/tmp/test.parquet", index=False)

# Push to S3
!aws s3 cp /tmp/train.parquet s3://${S3_BUCKET_NAME}/end-to-end/california/train/
!aws s3 cp /tmp/val.parquet   s3://${S3_BUCKET_NAME}/end-to-end/california/val/
!aws s3 cp /tmp/test.parquet  s3://${S3_BUCKET_NAME}/end-to-end/california/test/

upload: ../../../tmp/train.parquet to s3://ddl-wadkars/end-to-end/california/train/train.parquet
upload: ../../../tmp/val.parquet to s3://ddl-wadkars/end-to-end/california/val/val.parquet
upload: ../../../tmp/test.parquet to s3://ddl-wadkars/end-to-end/california/test/test.parquet


In [4]:
cluster_kind = "rayclusters"

In [5]:
j = ddl_cluster_scaling_client.scale_cluster(cluster_kind=cluster_kind,head_hw_tier_name="Medium", worker_hw_tier_name="Medium", replicas=2)
json.dumps(j, indent=2, sort_keys=True, ensure_ascii=False)
ddl_cluster_scaling_client.wait_until_scaling_complete(cluster_kind=cluster_kind)

2
Status code 200
ray-68cdcd8314e2171c9b0f2508
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/cluster/rayclusters/ray-68cdcd8314e2171c9b0f2508
Status code 200
Expected worker nodes 2
Current worker nodes ['ray-68cdcd8314e2171c9b0f2508-ray-worker-0', 'ray-68cdcd8314e2171c9b0f2508-ray-worker-1']


True

In [None]:
from utils import ddl_cluster_scaling_client
j = ddl_cluster_scaling_client.restart_head_node(cluster_kind="rayclusters")
restarts_at = j['started_at']
print(restarts_at)
ddl_cluster_scaling_client.restart_head_node_status(cluster_kind="rayclusters",restarted_since=restarts_at)
ddl_cluster_scaling_client.wait_until_node_restarted(cluster_kind="rayclusters",restarted_since=restarts_at)

In [6]:
import os
import platform
import json
import ray

# ----- RUN CONFIG (edit as needed) -----

'''
#Pip installs do not work yet in Domino. Add the libraries to your environment or install using command line in the worker
RUN_CONFIG = {
    # Packages are installed into an isolated env for each worker by Ray
    
    "pip": [
        "ray[train]==2.49.1",
        "torch==2.3.1",
        "torchvision==0.18.1",
        "torchaudio==2.3.1"
    ],
    # Environment variables to expose inside workers
    "env_vars": {
        "MY_APP_FLAG": "enabled",
        "NCCL_IB_DISABLE": "1",     # harmless on CPU-only boxes; useful hint on GPU clusters without IB/EFA
        "TORCH_SHOW_CPP_STACKTRACES": "1"
    }
}
'''
RUN_CONFIG = {
    # Packages are installed into an isolated env for each worker by Ray
    # Environment variables to expose inside workers
    "env_vars": {
        "MY_APP_FLAG": "enabled",
        "NCCL_IB_DISABLE": "1",     # harmless on CPU-only boxes; useful hint on GPU clusters without IB/EFA
        "TORCH_SHOW_CPP_STACKTRACES": "1"
    }
}

In [None]:
!pip install --user "torch==2.3.1" "torchvision==0.18.1" "torchaudio==2.3.1"

In [None]:
!pip install -U "pyopenssl<24" "cryptography<42"

In [7]:
@ray.remote(num_cpus=1)
def probe_worker(env_keys):
    import os, platform, importlib
    from ray.runtime_context import get_runtime_context

    ctx = get_runtime_context()

    def _id(ctx, name):
        if not hasattr(ctx, name):
            return None
        v = getattr(ctx, name)()  # may be bytes-like w/ .hex() or already a str
        try:
            return v.hex()
        except AttributeError:
            return str(v)

    # Torch proof without leaking torch objects in the return
    torch = importlib.import_module("torch")
    x = torch.tensor([1.0, 2.0, 3.0]) * 2.0
    sample_sum = float(x.sum().item())
    torch_version = str(getattr(torch, "__version__", "unknown"))
    cuda_avail = bool(hasattr(torch, "cuda") and torch.cuda.is_available())
    del x, torch

    return {
        "node": platform.node(),
        "pid": os.getpid(),
        "python_executable": os.sys.executable,
        "torch_version": torch_version,
        "torch_cuda_available": cuda_avail,
        "env": {k: os.environ.get(k) for k in env_keys},
        "torch_sample_sum": sample_sum,
        "ids": {
            "task_id": _id(ctx, "get_task_id"),
            "actor_id": _id(ctx, "get_actor_id"),
            "node_id": _id(ctx, "get_node_id"),
            "job_id": _id(ctx, "get_job_id"),
            "namespace": ctx.get_namespace() if hasattr(ctx, "get_namespace") else None,
        },
    }


In [8]:
# file: simple_ray_runtime_env_fix.py
import os, json, platform
import ray
import torch
# ---- hard fix for your error: avoid virtualenv, use stdlib venv ----
os.environ["RAY_RUNTIME_ENV_VENV_CREATION"] = "venv"  # must be set before ray.init

# ---- choose one path ----
USE_PREINSTALLED = False  # True = skip installs; all deps must already be on nodes

RUNTIME_ENV = {
    "env_vars": {
        "MY_APP_FLAG": "enabled",
        "NCCL_IB_DISABLE": "1",
        "TORCH_SHOW_CPP_STACKTRACES": "1",
    },
    "eager_install": True,  # fail fast during setup
}

if not USE_PREINSTALLED:
    # Ray will create a venv using stdlib venv (due to the env var above) and pip-install these
    RUNTIME_ENV["pip"] = {
        "packages": [
            "ray[default]==2.49.1",
            "torch==2.3.1",
            "torchvision==0.18.1",
            "torchaudio==2.3.1",
        ],
        # optional: pin pip itself to avoid old/broken versions on the node
        "pip_check": True,
    }
else:
    # Skip creating a new env; require packages preinstalled in the base image
    RUNTIME_ENV["pip"] = {"packages": [], "preinstalled": True}

        
# --------------------------------------
if "RAY_HEAD_SERVICE_HOST" in os.environ and "RAY_HEAD_SERVICE_PORT" in os.environ:
   addr = f"ray://{os.environ['RAY_HEAD_SERVICE_HOST']}:{os.environ['RAY_HEAD_SERVICE_PORT']}"
   ray.shutdown()
   ray.init(
      address=addr or "auto",
      #runtime_env={"env_vars": ray_envs},   # same env you used earlier
      namespace="demo-ray-ns"
  )
# Connect to Domino Ray if available; otherw
def main():
    runtime_env = {        
        "env_vars": RUN_CONFIG["env_vars"],
    }
    env_keys = list(RUN_CONFIG["env_vars"].keys())

    # Launch two workers with the same runtime_env
    t1 = probe_worker.options(runtime_env=runtime_env).remote(env_keys)
    t2 = probe_worker.options(runtime_env=runtime_env).remote(env_keys)

    out1, out2 = ray.get([t1, t2])
    print(json.dumps({"worker_1": out1, "worker_2": out2}, indent=2))



2025-09-19 22:27:29,641	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.


In [9]:
# file: simple_ray_runtime_env_fix.py
import os, json, platform
import ray
import torch



SHARED_DIR = "/mnt/data/ddl-end-to-end-demo"

RUNTIME_ENV = {
    "env_vars": {
        "SHARED_DIR": SHARED_DIR,
        "APP_MODE": "probe",
        "MY_APP_FLAG": "enabled",
        "NCCL_IB_DISABLE": "1",
        "TORCH_SHOW_CPP_STACKTRACES": "1",
    }
}

if "RAY_HEAD_SERVICE_HOST" in os.environ and "RAY_HEAD_SERVICE_PORT" in os.environ:
   addr = f"ray://{os.environ['RAY_HEAD_SERVICE_HOST']}:{os.environ['RAY_HEAD_SERVICE_PORT']}"
   ray.shutdown()
   ray.init(
      address=addr or "auto",
      runtime_env=RUNTIME_ENV,
      namespace="demo-ray-ns"
  )

def _norm_id(val):
    try:
        return val.hex()
    except Exception:
        return str(val)

@ray.remote(num_cpus=1)
def probe_worker():
    import os, time, socket, json, tempfile
    from ray.runtime_context import get_runtime_context
    from pathlib import Path

    ctx = get_runtime_context()
    job_id  = _norm_id(ctx.get_job_id())  if hasattr(ctx, "get_job_id") else "na"
    task_id = _norm_id(ctx.get_task_id()) if hasattr(ctx, "get_task_id") else "na"

    shared = Path(os.environ["SHARED_DIR"],job_id,task_id)  # pulled from runtime_env
    shared.mkdir(parents=True, exist_ok=True)

    payload = {
        "node": platform.node(),
        "pid": os.getpid(),
        "job_id": job_id,
        "task_id": task_id,
        "env": dict(os.environ),
        "ts": time.time(),
    }

    fname = f"probe_{job_id}_{task_id}_{os.getpid()}.json"
    dest = shared / fname
    with tempfile.NamedTemporaryFile("w", delete=False, dir=str(shared)) as tmp:
        json.dump(payload, tmp, indent=2)
        tmp.flush()
        os.fsync(tmp.fileno())
        tmp_path = tmp.name
    os.replace(tmp_path, dest)

    return {"wrote": str(dest), "size": dest.stat().st_size}

def main():
    # --------------------------------------
    if "RAY_HEAD_SERVICE_HOST" in os.environ and "RAY_HEAD_SERVICE_PORT" in os.environ:
       addr = f"ray://{os.environ['RAY_HEAD_SERVICE_HOST']}:{os.environ['RAY_HEAD_SERVICE_PORT']}"
       ray.shutdown()
       ray.init(
          address=addr or "auto",
          #runtime_env={"env_vars": RUNTIME_ENV},   # same env you used earlier
          namespace="demo-ray-ns"
      )
# Connect to Domino Ray if available; otherw
    t1 = probe_worker.options(runtime_env=RUNTIME_ENV).remote()
    t2 = probe_worker.options(runtime_env=RUNTIME_ENV).remote()
    out1, out2 = ray.get([t1, t2])
    print(json.dumps({"worker_1": out1, "worker_2": out2}, indent=2))

main()



2025-09-19 22:27:44,419	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.
2025-09-19 22:27:45,439	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.


{
  "worker_1": {
    "wrote": "/mnt/data/ddl-end-to-end-demo/12000000/96c5738fe28f186fffffffffffffffffffffffff12000000/probe_12000000_96c5738fe28f186fffffffffffffffffffffffff12000000_1266.json",
    "size": 17092
  },
  "worker_2": {
    "wrote": "/mnt/data/ddl-end-to-end-demo/12000000/a936e6d80473d7eeffffffffffffffffffffffff12000000/probe_12000000_a936e6d80473d7eeffffffffffffffffffffffff12000000_1265.json",
    "size": 17092
  }
}


In [None]:
import os
from typing import Dict

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import ray
from ray import air, train
from ray.tune.logger import CSVLoggerCallback, JsonLoggerCallback
from ray.train import Checkpoint
from ray.train.torch import TorchTrainer, get_device, prepare_model, prepare_data_loader
from ray.air.config import RunConfig, ScalingConfig
from pathlib import Path

# ---------- 1) Pre-download MNIST once on the driver ----------
def prepare_mnist(data_root: Path):
    data_root.mkdir(parents=True, exist_ok=True)
    # Uses torchvision's built-in downloader/extractor
    from torchvision import datasets, transforms
    tfm = transforms.Compose([transforms.ToTensor()])
    datasets.MNIST(str(data_root), train=True,  download=True, transform=tfm)
    datasets.MNIST(str(data_root), train=False, download=True, transform=tfm)


def build_model(num_classes: int = 10) -> nn.Module:
    return nn.Sequential(
        nn.Flatten(),
        nn.Linear(28 * 28, 512), nn.ReLU(),
        nn.Linear(512, 256), nn.ReLU(),
        nn.Linear(256, num_classes),
    )


def train_loop_per_worker(config: Dict):
    device = get_device()
    model = build_model().to(device)
    model = prepare_model(model)

    # Shared dataset root so rank-0 downloads once and all ranks read the same files
    data_root = os.path.join(
        os.environ.get("SHARED_DIR"),
        "mnist",
    )

    transform = transforms.Compose([transforms.ToTensor()])

    ''' Not avaiable in Ray 2.36
    # Ensure only world rank 0 performs the download, others wait
    with train.world_rank_zero_first():
        datasets.MNIST(data_root, download=True, train=True, transform=transform)
        datasets.MNIST(data_root, download=True, train=False, transform=transform)
    '''
    
    train_ds = datasets.MNIST(data_root, train=True, transform=transform)
    test_ds = datasets.MNIST(data_root, train=False, transform=transform)

    train_loader = DataLoader(
        train_ds,
        batch_size=config["batch_size"],
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    )
    test_loader = DataLoader(
        test_ds,
        batch_size=1024,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
    )

    train_loader = prepare_data_loader(train_loader)
    test_loader = prepare_data_loader(test_loader)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"])
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(config["epochs"]):
        model.train()
        running = 0.0
        for x, y in train_loader:
            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            loss = loss_fn(model(x), y)
            loss.backward()
            optimizer.step()
            running += loss.item()

        # simple eval
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in test_loader:
                x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
                pred = model(x).argmax(dim=1)
                correct += (pred == y).sum().item()
                total += y.numel()
        acc = correct / total

        train.report({"epoch": epoch, "train_loss": running, "val_acc": acc})

    # optional checkpoint (handles DDP-wrapped model)
    state = model.module.state_dict() if hasattr(model, "module") else model.state_dict()
    checkpoint = Checkpoint.from_dict({"model_state": state})
    train.report({"final_acc": acc}, checkpoint=checkpoint)

In [None]:
data_dir = Path("/mnt/data/ddl-end-to-end-demo/mnist/")
data_dir.mkdir(parents=True, exist_ok=True)
prepare_mnist(data_dir)

In [None]:
import os, ray
from ray.air.config import RunConfig
from ray.tune.logger import CSVLoggerCallback, JsonLoggerCallback
from ray.runtime_context import get_runtime_context
from pathlib import Path

def train_loop_per_worker(config):
    device = get_device()
    model = prepare_model(build_model().to(device))

    data_root = os.environ["SHARED_DIR"]  # already populated
    tfm = transforms.Compose([transforms.ToTensor()])

    # No network access in workers; just read the files
    train_ds = datasets.MNIST(data_root, train=True,  download=False, transform=tfm)
    test_ds  = datasets.MNIST(data_root, train=False, download=False, transform=tfm)

    # Start conservative; you can raise num_workers/pin_memory after it works
    train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True,
                              num_workers=0, pin_memory=False)
    test_loader  = DataLoader(test_ds,  batch_size=512, shuffle=False,
                              num_workers=0, pin_memory=False)

    train_loader = prepare_data_loader(train_loader)
    test_loader  = prepare_data_loader(test_loader)

    opt = torch.optim.AdamW(model.parameters(), lr=config["lr"])
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(config["epochs"]):
        model.train()
        running = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            opt.zero_grad(set_to_none=True)
            loss = loss_fn(model(x), y)
            loss.backward()
            opt.step()
            running += loss.item()

        model.eval()
        correct = total = 0
        with torch.no_grad():
            for x, y in test_loader:
                x, y = x.to(device), y.to(device)
                pred = model(x).argmax(dim=1)
                correct += (pred == y).sum().item()
                total += y.numel()
        acc = correct / total
        train.report({"epoch": epoch, "train_loss": running, "val_acc": acc})



def main():

    RUNTIME_ENV = {
        "env_vars": {
        "GLOO_SOCKET_IFNAME": "eth0",
        "SHARED_DIR": str(data_dir),
        "TUNE_DISABLE_AUTO_CALLBACKS": "1",
        "TORCH_DISABLE_ADDR2LINE": "1",     # stop symbolizer hang
        "TORCH_SHOW_CPP_STACKTRACES": "1",
        "NCCL_IB_DISABLE": "1",
        "NCCL_P2P_DISABLE": "1",
        "NCCL_SHM_DISABLE": "1",
        "OMP_NUM_THREADS": "2",
        # >>> Key bits for DDP rendezvous <<<
        "MASTER_PORT": "29400",           # fixed, not ephemeral
        "GLOO_SOCKET_IFNAME": "eth0",     # bind on pod interface
        "NCCL_SOCKET_IFNAME": "eth0",     # harmless even if CPU-only
            
        }
    }
    # --------------------------------------
    if "RAY_HEAD_SERVICE_HOST" in os.environ and "RAY_HEAD_SERVICE_PORT" in os.environ:
       addr = f"ray://{os.environ['RAY_HEAD_SERVICE_HOST']}:{os.environ['RAY_HEAD_SERVICE_PORT']}"
       ray.shutdown()
       ray.init(
          address=addr or "auto",
          runtime_env=RUNTIME_ENV,   # same env you used earlier
          namespace="demo-ray-ns"
      )

    ctx = get_runtime_context()
    try:
        job_id_hex = ctx.get_job_id().hex()
    except Exception:
        job_id_hex = "unknown_job"

    DATASET_FOLDER = "/mnt/data/ddl-end-to-end-demo/"
    shared = Path(DATASET_FOLDER,job_id_hex,"ray_results")  # pulled from runtime_env
    shared.mkdir(parents=True, exist_ok=True)
    STORAGE_PATH=str(shared)
    
    storage_base = Path("/mnt/data/ddl-end-to-end-demo")  # head-visible shared
    job_id_hex = getattr(ray.get_runtime_context(), "get_job_id", lambda: "unknown")()
    job_id_hex = job_id_hex.hex() if hasattr(job_id_hex, "hex") else str(job_id_hex)
    storage_path = str(storage_base / job_id_hex / "ray_results")
    
    os.environ["TUNE_DISABLE_AUTO_CALLBACKS"] = "1"
    trainer = TorchTrainer(
        train_loop_per_worker,
        train_loop_config={"lr": 1e-3, "batch_size": 256, "epochs": 5},
        scaling_config=ScalingConfig(
            num_workers=1,
            use_gpu=False,                      # keep CPU+gloo until stable
            resources_per_worker={"CPU": 2},
            placement_strategy="PACK",          # single-node to avoid networking issues
        ),
        run_config=RunConfig(
            name=f"mnist_torch_ddp_{job_id_hex}",
            storage_path=STORAGE_PATH,
            callbacks=[CSVLoggerCallback(), JsonLoggerCallback()],
        ),
    )


    result = trainer.fit()


In [None]:
main()


In [None]:
ray.shutdown()