In [1]:
import sys
sys.path.append("/mnt/code")

## Create Domino Environments

**Workspace Environment**

**Base Env** - `quay.io/domino/domino-ray-environment:ubuntu22-py3.10-r4.4-ray2.36.0-domino6.0`

**Dockerfile**
```
USER root
RUN apt update && apt install -y unixodbc unixodbc-dev
RUN pip install h11==0.16.0
RUN pip install pyarrow==14.0.2
RUN pip install hyperopt
RUN pip uninstall -y bson
RUN pip install pymongo
RUN pip install -q "xgboost>=2.0,<3"
RUN pip install hydra-core 
RUN pip install --no-cache-dir -q "torch==2.3.1" "torchvision==0.18.1" "torchaudio==2.3.1" "pyopenssl<24" "cryptography<42"
```

Make sure to add the **pluggable workspace tools**
```
jupyter:
  title: "Jupyter (Python, R, Julia)"
  iconUrl: "/assets/images/workspace-logos/Jupyter.svg"
  start: [ "/opt/domino/workspaces/jupyter/start" ]
  supportedFileExtensions: [ ".ipynb" ]
  httpProxy:
    port: 8888
    rewrite: false
    internalPath: "/{{ownerUsername}}/{{projectName}}/{{sessionPathComponent}}/{{runId}}/{{#if pathToOpen}}tree/{{pathToOpen}}{{/if}}"
    requireSubdomain: false
jupyterlab:
  title: "JupyterLab"
  iconUrl: "/assets/images/workspace-logos/jupyterlab.svg"
  start: [  "/opt/domino/workspaces/jupyterlab/start" ]
  httpProxy:
    internalPath: "/{{ownerUsername}}/{{projectName}}/{{sessionPathComponent}}/{{runId}}/{{#if pathToOpen}}tree/{{pathToOpen}}{{/if}}"
    port: 8888
    rewrite: false
    requireSubdomain: false
vscode:
  title: "vscode"
  iconUrl: "/assets/images/workspace-logos/vscode.svg"
  start: [ "/opt/domino/workspaces/vscode/start" ]
  httpProxy:
    port: 8888
    requireSubdomain: false
rstudio:
  title: "RStudio"
  iconUrl: "/assets/images/workspace-logos/Rstudio.svg"
  start: [ "/opt/domino/workspaces/rstudio/start" ]
  httpProxy:
    port: 8888
    requireSubdomain: false
```
  


## Ray cluster environment

**Base Env** - `quay.io/domino/ray-cluster-environment:ray2.36.0-py3.10-domino6.0`

**Dockerfile**

```
USER root
RUN apt update && apt install -y unixodbc unixodbc-dev
RUN /opt/conda/bin/pip install h11==0.16.0
RUN pip install pyarrow==14.0.2
RUN pip install hyperopt
RUN pip uninstall -y bson
RUN pip install pymongo
RUN pip install -q "xgboost>=2.0,<3"
RUN pip install hydra-core 
RUN pip install --no-cache-dir -q "torch==2.3.1" "torchvision==0.18.1" "torchaudio==2.3.1" "pyopenssl<24" "cryptography<42"
USER ubuntu    
```

## Start the workspace

Start a workspace with a Ray cluster with 3 medium sized worker nodes and 1 small sized head node

## Pick the configuration file

We use hydra to run various types of run (Root folder is `/mnt/code/conf`) -
1. `config.yaml` - Default values
2. `env/local.yaml` - Runs in Domino via Domino datasets as a shared location. Uses a small subset of the total data
3. `env/dev.yaml` - Runs with S3 bucket and uses a small subset of the total data
4. `env/test.yaml` - Runs with S3 bucket and uses the full dataset
5. `env/prod.yaml` - Runs with S3 bucket and uses the full dataset

There is no difference between `test` and `prod`

In [2]:
which_env="local" #Picks the appropriate hydra config file

In [3]:
import os
import math
import pandas as pd
import xgboost as xgb
import pyarrow as pa
import pyarrow.dataset as pds

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.xgboost as mlflow_xgb
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature

import ray
from ray import tune
from ray.air import RunConfig, ScalingConfig
from ray.data import read_parquet
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.train.xgboost import XGBoostTrainer
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler

try:
    from ray.tune.callback import Callback      # Ray >= 2.6
except ImportError:
    from ray.tune.callbacks import Callback     # Older Ray
from utils import ddl_cluster_scaling_client
from utils import mlflow_utils
from utils import ray_utils

2025-10-07 13:34:41,673	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.7.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-10-07 13:34:41,774	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.7.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-10-07 13:34:41,919	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.7.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Pre-requsites - When using S3 bucket

Configure the following user environment variables

1. AWS_ROLE_ARN - This is the AWS role being assumed via IR
2. S3_BUCKET_NAME

In [4]:
## Verify your role identity in AWS
import boto3
sts = boto3.client("sts")
identity = sts.get_caller_identity()
print(identity)

{'UserId': 'AROA5YW464O6S35MGC2WL:botocore-session-1759844082', 'Account': '946429944765', 'Arn': 'arn:aws:sts::946429944765:assumed-role/sameer-irsa-full-bucket-role/botocore-session-1759844082', 'ResponseMetadata': {'RequestId': 'be3a2d66-9ba1-44bf-a443-f4d80896ed58', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'be3a2d66-9ba1-44bf-a443-f4d80896ed58', 'x-amz-sts-extended-request-id': 'MTp1cy13ZXN0LTI6MTc1OTg0NDA4MjY3MjpSOkRBM3JhZ0xM', 'content-type': 'text/xml', 'content-length': '489', 'date': 'Tue, 07 Oct 2025 13:34:42 GMT'}, 'RetryAttempts': 0}}


In [5]:
import os
import shutil



# Load dataset
data = fetch_california_housing(as_frame=True)
df = data.frame.rename(columns={"MedHouseVal": "median_house_value"})

# Split
train, tmp = train_test_split(df, test_size=0.3, random_state=42)
val, test  = train_test_split(tmp, test_size=0.5, random_state=42)

# Save locally
train.to_parquet("/tmp/train.parquet", index=False)
val.to_parquet("/tmp/val.parquet", index=False)
test.to_parquet("/tmp/test.parquet", index=False)

In [6]:
# Download dataset and push to S3
# Push to S3
!aws s3 cp /tmp/train.parquet s3://${S3_BUCKET_NAME}/end-to-end/california/train/
!aws s3 cp /tmp/val.parquet   s3://${S3_BUCKET_NAME}/end-to-end/california/val/
!aws s3 cp /tmp/test.parquet  s3://${S3_BUCKET_NAME}/end-to-end/california/test/



upload: ../../../tmp/train.parquet to s3://ddl-wadkars/end-to-end/california/train/train.parquet
upload: ../../../tmp/val.parquet to s3://ddl-wadkars/end-to-end/california/val/val.parquet
upload: ../../../tmp/test.parquet to s3://ddl-wadkars/end-to-end/california/test/test.parquet


## Pre-requsites - When using a Domino dataset

If you are working on your own project there will be a dataset with the same name as the project. Configure the env variable
`DATASET_NAME` below to that dataset name. If not associate a dataset you have permissions to for your project and name the 
environment variable accordingly.

Also note that the `ROOT_DOMINO_DATASET_FOLDER` is `/mnt/data` for a git backed project. For a Domino File System backed project it is `/domino/datasets/local/`



In [7]:
ROOT_DOMINO_DATASET_FOLDER = "/mnt/data" #For DFS based project use /domino/datasets/local/
DATASET_NAME="ddl-end-to-end-demo"

In [8]:
def copy_file_with_dirs(src_file, dest_file):
    """
    Copy a file from src_file to dest_file, creating destination
    folders if they don't exist.
    """
    # Ensure destination directory exists
    dest_dir = os.path.dirname(dest_file)
    os.makedirs(dest_dir, exist_ok=True)

    # Copy the file
    shutil.copy2(src_file, dest_file)  # copy2 preserves metadata
    print(f"Copied {src_file} -> {dest_file}")


src = f"/tmp/train.parquet"
dest = f"{ROOT_DOMINO_DATASET_FOLDER}/{DATASET_NAME}/end-to-end/california/train"
copy_file_with_dirs(src, dest)

src = f"/tmp/val.parquet"
dest = f"{ROOT_DOMINO_DATASET_FOLDER}/{DATASET_NAME}/end-to-end/california/val"
copy_file_with_dirs(src, dest)

src = f"/tmp/test.parquet"
dest = f"{ROOT_DOMINO_DATASET_FOLDER}/{DATASET_NAME}/end-to-end/california/test"
copy_file_with_dirs(src, dest)

Copied /tmp/train.parquet -> /mnt/data/ddl-end-to-end-demo/end-to-end/california/train
Copied /tmp/val.parquet -> /mnt/data/ddl-end-to-end-demo/end-to-end/california/val
Copied /tmp/test.parquet -> /mnt/data/ddl-end-to-end-demo/end-to-end/california/test


In [9]:
from __future__ import annotations

import os
import math
import numbers
from typing import Any, Dict, List, Optional

import mlflow
from mlflow.tracking import MlflowClient
from ray.tune.experiment.trial import Trial
from ray.air.integrations.mlflow import MLflowLoggerCallback as _BaseMLflowLoggerCallback


def _flatten(d: Dict[str, Any], parent: str = "", sep: str = ".") -> Dict[str, Any]:
    out = {}
    for k, v in (d or {}).items():
        nk = f"{parent}{sep}{k}" if parent else str(k)
        if isinstance(v, dict):
            out.update(_flatten(v, nk, sep))
        elif isinstance(v, (list, tuple)):
            s = str(v[:64]) + ("..." if len(v) > 64 else "")
            out[nk] = s
        else:
            out[nk] = v
    return out


def _is_num(x: Any) -> bool:
    return isinstance(x, numbers.Number) and not (isinstance(x, float) and (math.isnan(x) or math.isinf(x)))


class ChildRunMLflowCallback(_BaseMLflowLoggerCallback):
    """
    A strict child-run MLflow logger for Ray Tune that preserves the public API of
    ray.air.integrations.mlflow.MLflowLoggerCallback, but:
      - Nests each trial under an optional parent_run_id.
      - Logs all numeric metrics per step.
      - Logs flattened trial config as params once at trial start.
    """

    def __init__(
        self,
        tracking_uri: Optional[str] = None,
        experiment_name: Optional[str] = None,
        save_artifact: bool = True,
        tags: Optional[Dict[str, str]] = None,
        *,
        parent_run_id: Optional[str] = None,
        run_name_prefix: str = "trial",
    ):
        self.parent_run_id = parent_run_id
        self.run_name_prefix = run_name_prefix

        merged_tags = dict(tags or {})
        if parent_run_id and "mlflow.parentRunId" not in merged_tags:
            merged_tags["mlflow.parentRunId"] = parent_run_id

        # init base (keeps compatibility with Ray’s signature/behavior)
        super().__init__(
            tracking_uri=tracking_uri,
            experiment_name=experiment_name,
            save_artifact=save_artifact,
            tags=merged_tags,
        )

        # Our own bookkeeping
        self._client: Optional[MlflowClient] = None
        self._exp_id: Optional[str] = None
        self._trial_run_ids: Dict[str, str] = {}
        self._trial_steps: Dict[str, int] = {}

        # Make workers inherit if needed
        if tracking_uri:
            os.environ["MLFLOW_TRACKING_URI"] = tracking_uri

    # ---- Ray callback hooks ----

    def setup(self, **info):
        # Mirror Ray’s built-in setup, but we keep explicit handles around.
        if self.tracking_uri:
            mlflow.set_tracking_uri(self.tracking_uri)
        if self.experiment_name:
            mlflow.set_experiment(self.experiment_name)

        self._client = MlflowClient(self.tracking_uri or mlflow.get_tracking_uri())
        if self.experiment_name:
            exp = self._client.get_experiment_by_name(self.experiment_name)
            self._exp_id = exp.experiment_id if exp else self._client.create_experiment(self.experiment_name)
        else:
            # fall back to the current active experiment
            self._exp_id = mlflow.get_experiment_by_name(mlflow.get_experiment(mlflow.active_run().info.experiment_id).name).experiment_id  # type: ignore

    def on_trial_start(self, iteration: int, trials: List[Trial], trial: Trial, **info):
        assert self._client and self._exp_id, "MLflow not initialized. Did setup() run?"
        run_name = f"{self.run_name_prefix}-{trial.trial_id[:8]}"
        tags = dict(self.tags or {})
        tags.setdefault("mlflow.runName", run_name)
        tags["source"] = "ray.tune"
        tags["ray.trial_id"] = trial.trial_id

        run = self._client.create_run(experiment_id=self._exp_id, tags=tags)
        run_id = run.info.run_id
        self._trial_run_ids[trial.trial_id] = run_id
        self._trial_steps[trial.trial_id] = 0

        # Params once
        for k, v in _flatten(trial.config).items():
            try:
                self._client.log_param(run_id, k, str(v))
            except Exception:
                pass

    def on_trial_result(self, iteration: int, trials: List[Trial], trial: Trial, result: Dict[str, Any], **info):
        run_id = self._trial_run_ids.get(trial.trial_id)
        if not run_id:
            return

        step = int(result.get("training_iteration", self._trial_steps[trial.trial_id]))
        self._trial_steps[trial.trial_id] = step + 1

        metrics = {k: float(v) for k, v in result.items() if _is_num(v)}
        if not metrics:
            return

        # Batch log for efficiency
        self._client.log_batch(
            run_id=run_id,
            metrics=[mlflow.entities.Metric(key=k, value=v, timestamp=None, step=step) for k, v in metrics.items()],
        )

    def on_trial_complete(self, iteration: int, trials: List[Trial], trial: Trial, **info):
        self._finish(trial, status="FINISHED")

    def on_trial_error(self, iteration: int, trials: List[Trial], trial: Trial, **info):
        self._finish(trial, status="FAILED")

    def on_experiment_end(self, trials: List[Trial], **info):
        for t in list(self._trial_run_ids.keys()):
            # close any dangling runs
            fake = type("T", (), {"trial_id": t})
            self._finish(fake, status="FINISHED")

    # ---- internals ----

    def _finish(self, trial: Trial, status: str):
        run_id = self._trial_run_ids.pop(trial.trial_id, None)
        if not run_id or not self._client:
            return
        try:
            self._client.set_terminated(run_id, status=status)
        except Exception:
            pass

In [10]:
from ray.air._internal.mlflow import _MLflowLoggerUtil
from ray.tune.logger import LoggerCallback
from ray.tune.result import TIMESTEPS_TOTAL
from ray.tune.experiment import Trial
from ray.util.annotations import PublicAPI
import logging
logger = logging.getLogger(__name__)
class MyMLflowLoggerCallback(LoggerCallback):
    """MLflow Logger to automatically log Tune results and config to MLflow.

    MLflow (https://mlflow.org) Tracking is an open source library for
    recording and querying experiments. This Ray Tune ``LoggerCallback``
    sends information (config parameters, training results & metrics,
    and artifacts) to MLflow for automatic experiment tracking.

    Args:
        tracking_uri: The tracking URI for where to manage experiments
            and runs. This can either be a local file path or a remote server.
            This arg gets passed directly to mlflow
            initialization. When using Tune in a multi-node setting, make sure
            to set this to a remote server and not a local file path.
        registry_uri: The registry URI that gets passed directly to
            mlflow initialization.
        experiment_name: The experiment name to use for this Tune run.
            If the experiment with the name already exists with MLflow,
            it will be reused. If not, a new experiment will be created with
            that name.
        tags: An optional dictionary of string keys and values to set
            as tags on the run
        tracking_token: Tracking token used to authenticate with MLflow.
        save_artifact: If set to True, automatically save the entire
            contents of the Tune local_dir as an artifact to the
            corresponding run in MlFlow.

    Example:

    .. code-block:: python

        from ray.air.integrations.mlflow import MLflowLoggerCallback

        tags = { "user_name" : "John",
                 "git_commit_hash" : "abc123"}

        tune.run(
            train_fn,
            config={
                # define search space here
                "parameter_1": tune.choice([1, 2, 3]),
                "parameter_2": tune.choice([4, 5, 6]),
            },
            callbacks=[MLflowLoggerCallback(
                experiment_name="experiment1",
                tags=tags,
                save_artifact=True)])

    """

    def __init__(
        self,
        tracking_uri: Optional[str] = None,
        *,
        registry_uri: Optional[str] = None,
        experiment_name: Optional[str] = None,
        tags: Optional[Dict] = None,
        tracking_token: Optional[str] = None,
        save_artifact: bool = False,
    ):

        self.tracking_uri = tracking_uri
        self.registry_uri = registry_uri
        self.experiment_name = experiment_name
        self.tags = tags
        self.tracking_token = tracking_token
        self.should_save_artifact = save_artifact

        self.mlflow_util = _MLflowLoggerUtil()
        self.parent_run_id = ''
        if ray.util.client.ray.is_connected():
            logger.warning(
                "When using MLflowLoggerCallback with Ray Client, "
                "it is recommended to use a remote tracking "
                "server. If you are using a MLflow tracking server "
                "backed by the local filesystem, then it must be "
                "setup on the server side and not on the client "
                "side."
            )
    def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
        #step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]
        #run_id = self._trial_runs[trial]
        #print(mlflow.active_run())
        #run_id = mlflow.active_run().info.run_id
        self.mlflow_util.log_metrics(metrics_to_log=result, step=0)


In [17]:
#from ray.train.callbacks import JsonLoggerCallback

def _s3p(root: str, sub: str) -> str:
    """Safe join for S3/posix URIs."""
    val = f"{root.rstrip('/')}/{sub.lstrip('/')}"
    return val 


def read_parquet_to_pandas(uri: str, columns=None, limit: int | None = None) -> pd.DataFrame:
    """
    Robust Parquet→pandas loader that bypasses Ray Data.
    Works with local paths and s3:// (PyArrow uses AWS_* env vars / IRSA).
    """
    ds = pds.dataset(uri.rstrip("/"), format="parquet")
    if limit is None:
        return ds.to_table(columns=columns).to_pandas()

    # Respect limit across files/row groups
    scanner = pds.Scanner.from_dataset(ds, columns=columns)
    batches, rows = [], 0
    for b in scanner.to_batches():
        batches.append(b)
        rows += len(b)
        if rows >= limit:
            return pa.Table.from_batches(batches)[:limit].to_pandas()
    return pa.Table.from_batches(batches).to_pandas()


def main(experiment_name:str,data_dir: str,
         model_name:str,model_desc:str,
         num_workers: int = 4, cpus_per_worker: int = 1,  DEV_FAST: bool = False):
    """
    Quick knobs:
      - num_workers * cpus_per_worker = CPUs per trial.
      - trainer_resources={"CPU":0} so the driver doesn't steal a core.
      - PACK placement to keep trials tight.
      - max_concurrent_trials caps parallel trials.
      - num_boost_round / early_stopping_rounds control trial length.
      - nthread = cpus_per_worker to avoid oversubscription.
    """
    mlflow.xgboost.autolog() 
    exp_id = mlflow_utils.ensure_mlflow_experiment(experiment_name)
    mv = mlflow_utils.ensure_registered_model(model_name)
    # Storage: local for dev, S3/your env otherwise
    RUN_STORAGE = os.environ.get("RAY_AIR_STORAGE", f"{data_dir}/air/xgb")
    TUNER_STORAGE = "/tmp/air-dev" if DEV_FAST else RUN_STORAGE
    FINAL_STORAGE = "/mnt/data/ddl-end-to-end-demo/air/final_fit" if DEV_FAST else RUN_STORAGE

    # Sanity: workers see IRSA env?
    @ray.remote
    def _peek():
        import os
        return {
            "ROLE": bool(os.environ.get("AWS_ROLE_ARN")),
            "TOKEN_FILE": os.environ.get("AWS_WEB_IDENTITY_TOKEN_FILE"),
            "REGION": os.environ.get("AWS_REGION"),
        }
    print("Worker env peek:", ray.get(_peek.remote()))

    # MLflow (experiment + parent run)
    CLUSTER_TRACKING_URI = os.environ["CLUSTER_MLFLOW_TRACKING_URI"]
    
    
    client = MlflowClient()


    parent = client.create_run(
        experiment_id=exp_id,
        tags={"mlflow.runName": "xgb_parent", "role": "tune_parent"},
    )
    parent_run_id = parent.info.run_id
    print("Parent run id:", parent_run_id)

    # Data (Ray Datasets for training/val)
    train_ds = read_parquet(_s3p(data_dir, "train"), parallelism=num_workers)
    val_ds   = read_parquet(_s3p(data_dir, "val"),   parallelism=num_workers)
    test_ds  = read_parquet(_s3p(data_dir, "test"),  parallelism=num_workers)
    print("Schema:", train_ds.schema())

    # Label + features
    label_col = "median_house_value"
    feature_cols = [c for c in train_ds.schema().names if c != label_col]
    keep = feature_cols + [label_col]
    train_ds = train_ds.select_columns(keep)
    val_ds   = val_ds.select_columns(keep)

    # DEV: trim Ray Datasets used for training; eval will bypass Ray entirely
    if DEV_FAST:
        train_ds = train_ds.limit(5_000)
        val_ds   = val_ds.limit(2_000)

    # --- Build test DataFrame without Ray (avoids 'Global node is not initialized') ---
    test_uri = _s3p(data_dir, "test")
    test_pdf = read_parquet_to_pandas(
        test_uri, columns=keep, limit=2_000 if DEV_FAST else None
    )

    # Search space
    param_space = {
        "params": {
            "objective": "reg:squarederror",
            "tree_method": "hist",
            "eval_metric": "rmse",
            "eta": tune.loguniform(1e-3, 3e-1),
            "max_depth": tune.randint(4, 12),
            "min_child_weight": tune.loguniform(1e-2, 10),
            "subsample": tune.uniform(0.6, 1.0),
            "colsample_bytree": tune.uniform(0.6, 1.0),
            "lambda": tune.loguniform(1e-3, 10),
            "alpha": tune.loguniform(1e-3, 10),
        },
        "num_boost_round": 300,
        "early_stopping_rounds": 20,
    }

    # Dev shortcuts
    if DEV_FAST:
        param_space["num_boost_round"] = 20
        param_space["early_stopping_rounds"] = 5
        #num_workers = 1
        #cpus_per_worker = 1
        NUM_SAMPLES = 5
        MAX_CONCURRENT = 3
        SAVE_ARTIFACTS = True
    else:
        NUM_SAMPLES = 30
        MAX_CONCURRENT = 3
        SAVE_ARTIFACTS = True

    # Threads per worker
    param_space["params"]["nthread"] = cpus_per_worker
    print("Per-trial CPUs =", num_workers * cpus_per_worker)

    # Scaling / placement
    scaling = ScalingConfig(
        num_workers=num_workers,
        use_gpu=False,
        resources_per_worker={"CPU": cpus_per_worker},
        trainer_resources={"CPU": 0},
        placement_strategy="PACK",
    )

    # Trainable
    trainer = XGBoostTrainer(
        label_column=label_col,
        params=param_space["params"],
        datasets={"train": train_ds, "valid": val_ds},
        num_boost_round=param_space["num_boost_round"],
        scaling_config=scaling,
    )

    # Search + scheduler
    MAX_T = int(param_space["num_boost_round"])
    GRACE = int(min(param_space.get("early_stopping_rounds", 1), MAX_T))
    algo = HyperOptSearch(metric="valid-rmse", mode="min")
    scheduler = ASHAScheduler(max_t=MAX_T, grace_period=GRACE, reduction_factor=3)

    # MLflow callback (child runs)
    
    mlflow_cb = MLflowLoggerCallback(
        tracking_uri=CLUSTER_TRACKING_URI,
        experiment_name=experiment_name,
        save_artifact=SAVE_ARTIFACTS,
        #log_params_on_trial_end=True,
        tags={"mlflow.parentRunId": parent_run_id},
    )
    '''
    mlflow_cb = ChildRunMLflowCallback(
        tracking_uri=CLUSTER_TRACKING_URI,
        experiment_name=experiment_name,
        save_artifact=True,                         # same flag name as Ray's
        #tags={"project": "xgb_from_s3_irsa"},       # optional; parent added automatically
        tags={"mlflow.parentRunId": parent_run_id},
        parent_run_id=parent_run_id,
        run_name_prefix="xgb-trial",
        
    )
    '''
    # Tuner
    tuner = tune.Tuner(
        trainer.as_trainable(),
        run_config=RunConfig(
            name="xgb_from_s3_irsa",
            storage_path=TUNER_STORAGE,
            callbacks=[mlflow_cb],
        ),
        tune_config=tune.TuneConfig(
            search_alg=algo,
            scheduler=scheduler,
            metric="valid-rmse",
            mode="min",
            num_samples=NUM_SAMPLES,
            max_concurrent_trials=MAX_CONCURRENT,
        ),
        param_space={"params": param_space["params"]},
    )

    # Tune
    results = tuner.fit()
    best = results.get_best_result(metric="valid-rmse", mode="min")
    print("Best config:", best.config)
    print("Best valid RMSE:", best.metrics.get("valid-rmse"))

    # Final fit (train + val)
    merged = train_ds.union(val_ds)
    final_trainer = XGBoostTrainer(
        label_column=label_col,
        params=best.config["params"],
        datasets={"train": merged},
        num_boost_round=param_space["num_boost_round"],
        scaling_config=scaling,
        run_config=RunConfig(name="final_fit", storage_path=FINAL_STORAGE),
    )
    final_result = final_trainer.fit()
    final_ckpt = final_result.checkpoint

    # Load Booster from checkpoint
    with final_ckpt.as_directory() as ckpt_dir:
        print("Checkpoint dir:", ckpt_dir, "files:", os.listdir(ckpt_dir))
        candidates = ["model.json", "model.ubj", "model.xgb", "xgboost_model.json", "model"]
        model_path = next(
            (os.path.join(ckpt_dir, f) for f in candidates if os.path.exists(os.path.join(ckpt_dir, f))),
            None,
        )
        if not model_path:
            raise FileNotFoundError(f"No XGBoost model file found in checkpoint dir: {ckpt_dir}")
        booster = xgb.Booster()
        print(f"MODEL PATH {model_path}")
        booster.load_model(model_path)

    # Driver-side eval (no Ray dependency)
    X_test = test_pdf.drop(columns=[label_col])
    
    dmat = xgb.DMatrix(X_test)
    y_pred = booster.predict(dmat)
    rmse = math.sqrt(((test_pdf[label_col].to_numpy() - y_pred) ** 2).mean())
    print(f"Test RMSE: {rmse:.4f}")

    
    # Log final under parent

    with mlflow.start_run(run_id=parent_run_id):
        X_example = X_test.head(5).copy()  
        y_example = booster.predict(xgb.DMatrix(X_example))
        sig = infer_signature(X_example, y_example)
        with mlflow.start_run(run_name="final_fit", nested=True) as final_run:
            mlflow.log_params(best.config.get("params", {}))
            mlflow.log_dict({"label": label_col, "features": feature_cols}, "features.json")
            mlflow.log_metric("valid_rmse_best", float(best.metrics.get("valid-rmse")))
            mlflow.log_metric("test_rmse", float(rmse))
            model_info = mlflow_xgb.log_model(booster, artifact_path="model",signature=sig,input_example=X_example)

            mv = mlflow_utils.register_model_version(model_name=model_name,model_desc=model_desc,
                                                model_info=model_info,run=final_run)
            
            print("Name: {}".format(mv.name))
            print("Version: {}".format(mv.version))
            print("Description: {}".format(mv.description))
            print("Status: {}".format(mv.status))
            print("Stage: {}".format(mv.current_stage))
            
    
    run = client.get_run(parent_run_id)
    if run.info.status == "RUNNING":
        client.set_terminated(parent_run_id, "FINISHED")

In [18]:
which_env="dev" #Picks the appropriate hydra config file

In [19]:
## Read Conf from Hydra
from hydra import compose, initialize
from omegaconf import OmegaConf
import os
# Point Hydra to your conf/ directory
with initialize(config_path="../conf"):
    cfg = compose(config_name="config", overrides=[f"env={which_env}"])
    #print(f"Running in {cfg.env} environment")
    #print(OmegaConf.to_yaml(cfg, resolve=True))
    
    app_name = cfg.app.name

    #For dev we use datasets
    data_dir = cfg.app.data_dir
    experiment_name = cfg.mlflow.experiment_name    
    model_name = cfg.mlflow.model_name    
    model_desc = cfg.mlflow.model_desc
    ray_workers = cfg.env.ray.num_workers
    cpus_per_worker = cfg.env.ray.cpus_per_worker
    dev_fast = cfg.env.ray.dev_fast
    #print(ray_workers)
    #print(dev_fast)
    
# Disable tensorboard integration
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(config_path="../conf"):


In [20]:
## All AWS Env variables are redundant when using locally

RAY_JOB_ENV = {
    "AWS_ROLE_ARN": os.environ.get("AWS_ROLE_ARN", ""),
    "AWS_WEB_IDENTITY_TOKEN_FILE": os.environ.get("AWS_WEB_IDENTITY_TOKEN_FILE", ""),
    "AWS_REGION": os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION", "us-east-1")),
    "AWS_DEFAULT_REGION": os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION", "us-east-1")),
    "TUNE_DISABLE_AUTO_CALLBACK_LOGGERS":"1",
    "TUNE_RESULT_BUFFER_LENGTH": "16",
    "TUNE_RESULT_BUFFER_FLUSH_INTERVAL_S": "3",    
    
}
ray.shutdown()
ray_utils.ensure_ray_connected(RAY_JOB_ENV,ray_ns=app_name)
mlflow.xgboost.autolog() 
main(experiment_name=experiment_name,data_dir=data_dir, 
     model_name=model_name,model_desc=model_desc,
     num_workers=ray_workers, cpus_per_worker=cpus_per_worker,DEV_FAST=dev_fast)

[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m View detailed results here: /mnt/data/ddl-end-to-end-demo/air/final_fit/final_fit


[36m(TunerInternal pid=2920)[0m AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.


[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training started without custom configuration.


[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m Started distributed worker processes: 
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m - (node_id=8d750bcc9055c1f84d9d50c044091e65cbb25753ee8284c2ec9635fe, ip=100.64.74.230, pid=751) world_rank=0, local_rank=0, node_rank=0
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m - (node_id=0e1126f0ada99ca3098b1a24f119d3e0867af419d9ec0873ac17743b, ip=100.64.80.25, pid=793) world_rank=1, local_rank=0, node_rank=1
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m - (node_id=b574e06f0b8b558aa7d02fb0c509388d27d20f522c7b7e403af587f3, ip=100.64.84.191, pid=502) world_rank=2, local_rank=0, node_rank=2
[36m(RayTrainWorker pid=502, ip=100.64.84.191)[0m [06:40:19] Task [xgboost.ray-rank=00000002]:14c8513b08e1ba32a84ef40f03000000 got rank 2
[36m(RayTrainWorker pid=751, ip=100.64.74.230)[0m [06:40:19] Task [xgboost.ray-rank=00000000]:8d74fc7fd1f0b5098a98715303000000 got rank 0
[36m(RayTrainWorker pid=793, ip=100.64.80.25)[0m [06:40:19] Task

[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 1 at 2025-10-07 06:40:23. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=2920)[0m │ Training result               │
[36m(TunerInternal pid=2920)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=2920)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=2920)[0m │ time_this_iter_s      6.81435 │
[36m(TunerInternal pid=2920)[0m │ time_total_s          6.81435 │
[36m(TunerInternal pid=2920)[0m │ training_iteration          1 │
[36m(TunerInternal pid=2920)[0m │ train-rmse            1.09091 │
[36m(TunerInternal pid=2920)[0m ╰───────────────────────────────╯
[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 2 at 2025-10-07 06:40:23. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=2920

[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:23] [0]	train-rmse:1.09091
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:23] [1]	train-rmse:1.03083
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [2]	train-rmse:0.97488
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [3]	train-rmse:0.92443
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [4]	train-rmse:0.87589


[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 3 at 2025-10-07 06:40:24. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=2920)[0m │ Training result               │
[36m(TunerInternal pid=2920)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=2920)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=2920)[0m │ time_this_iter_s      0.02954 │
[36m(TunerInternal pid=2920)[0m │ time_total_s          6.87458 │
[36m(TunerInternal pid=2920)[0m │ training_iteration          3 │
[36m(TunerInternal pid=2920)[0m │ train-rmse            0.97488 │
[36m(TunerInternal pid=2920)[0m ╰───────────────────────────────╯
[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 4 at 2025-10-07 06:40:24. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=2920

[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [5]	train-rmse:0.83132
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [6]	train-rmse:0.79232
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [7]	train-rmse:0.75313
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [8]	train-rmse:0.71597
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [9]	train-rmse:0.68238
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [10]	train-rmse:0.65267


[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 9 at 2025-10-07 06:40:24. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=2920)[0m │ Training result               │
[36m(TunerInternal pid=2920)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=2920)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=2920)[0m │ time_this_iter_s      0.03359 │
[36m(TunerInternal pid=2920)[0m │ time_total_s          7.07043 │
[36m(TunerInternal pid=2920)[0m │ training_iteration          9 │
[36m(TunerInternal pid=2920)[0m │ train-rmse            0.71597 │
[36m(TunerInternal pid=2920)[0m ╰───────────────────────────────╯
[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 10 at 2025-10-07 06:40:24. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=292

[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [11]	train-rmse:0.62230
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [12]	train-rmse:0.59716
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [13]	train-rmse:0.57419
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [14]	train-rmse:0.54923
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [15]	train-rmse:0.52705
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [16]	train-rmse:0.50635


[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 15 at 2025-10-07 06:40:24. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=2920)[0m │ Training result               │
[36m(TunerInternal pid=2920)[0m ├───────────────────────────────┤
[36m(TunerInternal pid=2920)[0m │ checkpoint_dir_name           │
[36m(TunerInternal pid=2920)[0m │ time_this_iter_s      0.02914 │
[36m(TunerInternal pid=2920)[0m │ time_total_s          7.25352 │
[36m(TunerInternal pid=2920)[0m │ training_iteration         15 │
[36m(TunerInternal pid=2920)[0m │ train-rmse            0.54923 │
[36m(TunerInternal pid=2920)[0m ╰───────────────────────────────╯
[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 16 at 2025-10-07 06:40:24. Total running time: 9s
[36m(TunerInternal pid=2920)[0m ╭───────────────────────────────╮
[36m(TunerInternal pid=29

[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [17]	train-rmse:0.48660
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [18]	train-rmse:0.47074
[36m(XGBoostTrainer pid=707, ip=100.64.74.230)[0m [06:40:24] [19]	train-rmse:0.45359
[36m(RayTrainWorker pid=751, ip=100.64.74.230)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/data/ddl-end-to-end-demo/air/final_fit/final_fit/XGBoostTrainer_27700_00000_0_2025-10-07_06-40-14/checkpoint_000000)


[36m(TunerInternal pid=2920)[0m 
[36m(TunerInternal pid=2920)[0m Training finished iteration 21 at 2025-10-07 06:40:24. Total running time: 10s
[36m(TunerInternal pid=2920)[0m ╭─────────────────────────────────────────╮
[36m(TunerInternal pid=2920)[0m │ Training result                         │
[36m(TunerInternal pid=2920)[0m ├─────────────────────────────────────────┤
[36m(TunerInternal pid=2920)[0m │ checkpoint_dir_name   checkpoint_000000 │
[36m(TunerInternal pid=2920)[0m │ time_this_iter_s                0.07184 │
[36m(TunerInternal pid=2920)[0m │ time_total_s                    7.47695 │
[36m(TunerInternal pid=2920)[0m │ training_iteration                   21 │
[36m(TunerInternal pid=2920)[0m │ train-rmse                      0.45359 │
[36m(TunerInternal pid=2920)[0m ╰─────────────────────────────────────────╯
[36m(TunerInternal pid=2920)[0m Training saved a checkpoint for iteration 21 at: (local)/mnt/data/ddl-end-to-end-demo/air/final_fit/final_fit/XGBoos

[36m(TunerInternal pid=2920)[0m Wrote the latest version of all result files and experiment state to '/mnt/data/ddl-end-to-end-demo/air/final_fit/final_fit' in 0.0263s.
[36m(TunerInternal pid=2920)[0m Failed to fetch metrics for 1 trial(s):
[36m(TunerInternal pid=2920)[0m - XGBoostTrainer_27700_00000: FileNotFoundError('Could not fetch metrics for XGBoostTrainer_27700_00000: both result.json and progress.csv were not found at /mnt/data/ddl-end-to-end-demo/air/final_fit/final_fit/XGBoostTrainer_27700_00000_0_2025-10-07_06-40-14')
2025/10/07 13:40:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ray-xgboost-dev-wadkars, version 8


Name: ray-xgboost-dev-wadkars
Version: 8
Description: ray-xgboost-dev-wadkars
Status: READY
Stage: None
🏃 View run final_fit at: http://127.0.0.1:8765/#/experiments/264/runs/87f9152b41b849a4bba4d954bd857ae7
🧪 View experiment at: http://127.0.0.1:8765/#/experiments/264
🏃 View run xgb_parent at: http://127.0.0.1:8765/#/experiments/264/runs/44235d7186724f37bdce88fa1391d60d
🧪 View experiment at: http://127.0.0.1:8765/#/experiments/264


In [None]:
from utils import mlflow_utils
import pandas as pd
import mlflow.pyfunc

my_model = mlflow_utils.load_registered_model_version(model_name,"latest")

# your split-style payload
split = {
  "columns": ["MedInc","HouseAge","AveRooms","AveBedrms","Population","AveOccup","Latitude","Longitude"],
  "data": [
    [3.1333,30.0,5.925531914893617,1.1312056737588652,966.0,3.425531914893617,36.51,-119.65],
    [2.3355,18.0,5.711722488038277,1.0598086124401913,1868.0,2.2344497607655502,33.97,-117.01],
    [3.3669,29.0,4.5898778359511345,1.0767888307155322,1071.0,1.869109947643979,34.15,-118.37],
    [3.875,46.0,4.0,1.0,59.0,4.538461538461538,33.12,-117.11],
    [4.3482,9.0,5.7924528301886795,1.1037735849056605,409.0,1.929245283018868,35.36,-119.06]
  ]
}

# make a DataFrame
X = pd.DataFrame(split["data"], columns=split["columns"])
preds = my_model.predict(X)
print(preds)

In [None]:
import ray, sys, pyarrow as pa, pandas as pd
print("DRIVER:", sys.version)
print("DRIVER pyarrow:", pa.__version__)
print("DRIVER pandas :", pd.__version__)

@ray.remote
def _env_probe():
    import sys, pyarrow as pa, pandas as pd
    return {
        "python": sys.version.split()[0],
        "pyarrow": pa.__version__,
        "pandas": pd.__version__,
    }

print("WORKER:", ray.get(_env_probe.remote()))
