In [1]:
import os
import shutil

import os
import zipfile
import urllib.request
import utilities

In [3]:
import os

# Pin the visible GPU *before* onnxruntime is imported, so the setting is
# already in place whenever the CUDA provider gets initialised.
# To expose every device instead: os.environ.pop("CUDA_VISIBLE_DEVICES", None)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import onnxruntime as ort

# Must be the GPU build:
# pip install --upgrade onnxruntime-gpu==1.18.0   # pick version matching your CUDA 12.x stack

# Fail early if this onnxruntime installation does not offer the CUDA provider.
prov = ort.get_available_providers()
print(prov)
assert "CUDAExecutionProvider" in prov, f"ORT not GPU-enabled. Providers={prov}"

['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']


In [4]:
# Both values come from the environment; a missing variable raises KeyError.
datasets_dir = os.environ['DOMINO_DATASETS_DIR']
project_ds_folder = os.environ['DOMINO_PROJECT_NAME']

# Base folder: <datasets_dir>/<project_ds_folder>
download_base_folder = "/".join((datasets_dir, project_ds_folder))
models_folder = "models"
yolo_model_name = "yolov8n"
# Checkpoint file names. NOTE: `model_names` is reassigned (without the ".pt"
# suffix) in the benchmark cell further down.
model_names = [f"{stem}.pt" for stem in ("yolov8n", "yolov5n", "yolov8m", "yolov8s")]


In [5]:
# Adds: log entire Ultralytics run dir to MLflow, then delete it.
# Writes the run under /tmp/ultra_runs/<name>, logs it, removes it.

import os, time, yaml, math, random, json, shutil, statistics as stats
from pathlib import Path
from typing import Dict, Any, List
from datetime import datetime
import mlflow
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from ultralytics import YOLO
from datetime import datetime
import matplotlib.pyplot as plt
import mlflow
from ultralytics import YOLO

# Record whether mlflow can be imported.
# NOTE(review): mlflow is already imported unconditionally a few lines above, so
# a missing package would fail there first; `_HAS_MLFLOW` is not read anywhere
# in the visible cells.
try:
    import mlflow
except Exception:
    _HAS_MLFLOW = False
else:
    _HAS_MLFLOW = True


def _ensure_dir(p: Path) -> Path:
    """Create directory ``p`` (including missing parents) and return ``p``.

    An already existing directory is left untouched.
    """
    if not p.is_dir():
        p.mkdir(parents=True, exist_ok=True)
    return p


def _latency_benchmark(model: YOLO, image_paths: List[Path], imgsz: int, device: str) -> Dict[str, Any]:
    """Time one ``model.predict`` call per image and summarise the latencies.

    Args:
        model: Object exposing ``predict(source=..., imgsz=..., device=..., verbose=...)``.
        image_paths: Images to run, one predict call each.
        imgsz: Forwarded to ``predict`` as ``imgsz``.
        device: Forwarded to ``predict`` as ``device``.

    Returns:
        Dict with ``p50_ms``, ``p90_ms``, ``p99_ms`` (nearest-rank percentiles),
        ``mean_ms``, ``count`` and ``raw_ms`` (timings in call order).
        For an empty ``image_paths`` the statistics are ``None``, ``count`` is 0
        and ``raw_ms`` is absent.
    """
    if not image_paths:
        return {"p50_ms": None, "p90_ms": None, "p99_ms": None, "mean_ms": None, "count": 0}

    def _timed_predict(src: Path) -> float:
        # Wall-clock duration of a single predict call, in milliseconds.
        started = time.perf_counter()
        model.predict(source=str(src), imgsz=imgsz, device=device, verbose=False)
        return (time.perf_counter() - started) * 1000.0

    # Untimed warmup call on the first image.
    model.predict(source=str(image_paths[0]), imgsz=imgsz, device=device, verbose=False)

    samples = [_timed_predict(src) for src in image_paths]
    ordered = sorted(samples)
    total = len(ordered)

    def _nearest_rank(q: float) -> float:
        # Nearest-rank percentile: element at ceil(q * n) - 1, clamped to the valid range.
        rank = int(math.ceil(q * total) - 1)
        return ordered[min(total - 1, max(0, rank))]

    summary: Dict[str, Any] = {}
    summary["p50_ms"] = _nearest_rank(0.50)
    summary["p90_ms"] = _nearest_rank(0.90)
    summary["p99_ms"] = _nearest_rank(0.99)
    summary["mean_ms"] = float(stats.mean(ordered))
    summary["count"] = total
    summary["raw_ms"] = samples
    return summary


def ensure_mlflow_experiment(experiment_name: str, artifact_location: str = None) -> str:
    """
    Ensure an MLflow experiment with the given name exists.
    If it does not, create it. Then set it as the current experiment.

    Args:
        experiment_name: Name of the experiment
        artifact_location: Optional path or URI where artifacts will be stored.
            Only used when the experiment has to be created; an existing
            experiment is left as it is.

    Returns:
        The experiment id as returned by MLflow (a string).

    Raises:
        RuntimeError: If any MLflow call fails; the original exception is
            attached as ``__cause__``.
    """
    try:
        exp = mlflow.get_experiment_by_name(experiment_name)
        if exp is None:
            exp_id = mlflow.create_experiment(
                experiment_name,
                artifact_location=artifact_location,
            )
        else:
            exp_id = exp.experiment_id
        mlflow.set_experiment(experiment_name)
        return exp_id
    except Exception as e:
        # Chain explicitly so the underlying MLflow error stays visible in tracebacks.
        raise RuntimeError(f"Failed to ensure experiment {experiment_name}: {e}") from e



# assumes this helper exists as defined earlier
# def load_registered_yolo_model(model_name: str, version: str = "latest") -> YOLO: ...

def evaluate_model(
    base_path: str,
    model_path: str = None,
    imgsz: int = 640,
    device: str = "cpu",
    limit_images: int = 1000,
    subset_seed: int = 0,
    experiment_name: str = None,
    registry_model_name: str = None,
    registry_model_version: str = "latest",
    parent_run_id: str = None,
) -> Dict[str, Any]:
    """Validate a YOLO model on a seeded random subset of ``<base_path>/images/val2017``.

    Steps: pick the subset, write ``artifacts/val_subset.txt`` and
    ``coco_val_subset.yaml`` under ``base_path``, run ``model.val`` into a
    temporary ``/tmp/ultra_runs/<run name>`` directory, run the per-image
    latency benchmark on at most 100 subset images, write JSON summaries (and a
    latency histogram) under ``<base_path>/artifacts``, optionally log
    everything to MLflow, and finally delete the temporary run directory.

    Args:
        base_path: Dataset root; must contain ``images/val2017``.
        model_path: Model file passed to ``YOLO(...)``; used only when
            ``registry_model_name`` is not given.
        imgsz: Forwarded to ``model.val`` and to the latency benchmark.
        device: Forwarded to ``model.val`` and to the latency benchmark.
        limit_images: Maximum number of images in the subset (must be >= 1).
        subset_seed: Seed for the shuffle that selects the subset.
        experiment_name: If set, the experiment is ensured through
            ``utilities.ensure_mlflow_experiment`` and the results are logged.
        registry_model_name: If set, the model is loaded through
            ``utilities.load_registered_yolo_model``.
        registry_model_version: Version forwarded to the registry loader.
        parent_run_id: If set, results are logged in an MLflow run started with
            ``nested=True``. The id itself is not passed to MLflow, so the run
            nests under whichever run is active at call time.

    Returns:
        Dict with ``map``, ``ap50``, ``ap75``, ``mean_precision``,
        ``mean_recall``, ``evaluated_images`` and ``model_id``.

    Raises:
        ValueError: ``limit_images`` is below 1, or neither
            ``registry_model_name`` nor ``model_path`` was given.
        FileNotFoundError: ``<base_path>/images/val2017`` does not exist.
        RuntimeError: No ``.jpg``/``.jpeg``/``.png`` files in the image dir.
    """
    # A zero or negative limit would silently yield an empty or truncated subset.
    if limit_images < 1:
        raise ValueError(f"limit_images must be >= 1, got {limit_images}")

    base = Path(base_path)
    img_dir = base / "images" / "val2017"
    if not img_dir.exists():
        raise FileNotFoundError(f"Missing image dir: {img_dir}")

    # Uses the module-level _ensure_dir helper defined earlier in this cell.
    artifacts = _ensure_dir(base / "artifacts")
    plots_dir = _ensure_dir(artifacts / "plots")
    metrics_dir = _ensure_dir(artifacts / "metrics")
    config_dir = _ensure_dir(artifacts / "config")

    # Sort before shuffling so the same seed always selects the same subset.
    all_imgs = sorted(
        p for pattern in ("*.jpg", "*.jpeg", "*.png") for p in img_dir.glob(pattern)
    )
    if not all_imgs:
        raise RuntimeError(f"No images found under {img_dir}")
    rng = random.Random(subset_seed)
    rng.shuffle(all_imgs)
    sub_imgs = all_imgs[:limit_images]

    subset_list = artifacts / "val_subset.txt"
    with open(subset_list, "w") as f:
        for p in sub_imgs:
            f.write(str(p.resolve()) + "\n")

    data_config = {
        "path": str(base),
        "train": "images/val2017",
        "val": str(subset_list),
        "names": list(range(80)),
    }
    yaml_path = base / "coco_val_subset.yaml"
    with open(yaml_path, "w") as f:
        yaml.dump(data_config, f)

    if registry_model_name:
        model = utilities.load_registered_yolo_model(registry_model_name, version=registry_model_version)
        model_id = f"{registry_model_name}:{registry_model_version}"
    else:
        if not model_path:
            raise ValueError("Provide either registry_model_name or model_path")
        model = YOLO(model_path, task="detect")
        model_id = Path(model_path).stem

    tmp_project = Path("/tmp/ultra_runs")
    run_name = f"val_{model_id}_subset{len(sub_imgs)}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    val_res = model.val(
        data=str(yaml_path),
        imgsz=imgsz,
        device=device,
        save_json=False,
        verbose=False,
        project=str(tmp_project),
        name=run_name,
        exist_ok=True,
        workers=0,
    )
    save_dir = Path(val_res.save_dir)

    # Everything after validation runs under try/finally so the temporary run
    # directory is removed even when the benchmark or the MLflow logging fails.
    try:
        metrics = {
            "map": float(val_res.box.map),
            "ap50": float(val_res.box.map50),
            "ap75": float(val_res.box.map75),
            "mean_precision": float(val_res.box.mp),
            "mean_recall": float(val_res.box.mr),
            "evaluated_images": len(sub_imgs),
            "model_id": model_id,
        }
        (metrics_dir / "headline.json").write_text(json.dumps(metrics, indent=2))

        # Best effort: skip the benchmark when the helper is not defined, without
        # hiding NameErrors raised inside the helper itself.
        benchmark = globals().get("_latency_benchmark")
        latency = benchmark(model, sub_imgs[:100], imgsz=imgsz, device=device) if benchmark else {}
        latency_summary = {k: v for k, v in latency.items() if k != "raw_ms"}
        (metrics_dir / "latency.json").write_text(json.dumps(latency_summary, indent=2))

        hist_path = plots_dir / "latency_hist.png"
        hist_written = False
        if latency.get("raw_ms"):
            fig, ax = plt.subplots()
            ax.hist(latency["raw_ms"], bins=20)
            ax.set_xlabel("Latency (ms)")
            ax.set_ylabel("Count")
            ax.set_title("Per-image latency (batch=1)")
            fig.tight_layout()
            fig.savefig(hist_path)
            plt.close(fig)
            hist_written = True

        eval_cfg = {
            "imgsz": imgsz,
            "device": device,
            "subset_seed": subset_seed,
            "limit_images": limit_images,
            "subset_list": str(subset_list),
            "model_id": model_id,
            "ultralytics_save_dir": str(save_dir),
            "source": "registry" if registry_model_name else "path",
        }
        (config_dir / "eval.json").write_text(json.dumps(eval_cfg, indent=2))

        if experiment_name:
            utilities.ensure_mlflow_experiment(experiment_name)

        if experiment_name or parent_run_id:
            # Only log latency values that were actually measured; a fake 0.0
            # would read as "instant" in the MLflow UI.
            latency_metrics = {
                f"latency_{key}": float(latency[key])
                for key in ("p50_ms", "p90_ms", "p99_ms", "mean_ms")
                if latency.get(key) is not None
            }
            artifact_files = [
                metrics_dir / "headline.json",
                metrics_dir / "latency.json",
                config_dir / "eval.json",
                subset_list,
            ]
            # Log the histogram only when this call produced it, so a stale file
            # left by an earlier evaluation is never attached to this run.
            if hist_written:
                artifact_files.append(hist_path)

            # nested=True only when the caller signalled that a parent run is active.
            with mlflow.start_run(run_name=run_name, nested=bool(parent_run_id)):
                mlflow.set_tags({"model_id": model_id, "source": eval_cfg["source"]})
                mlflow.log_params({
                    "imgsz": imgsz, "device": device,
                    "limit_images": len(sub_imgs), "subset_seed": subset_seed
                })
                mlflow.log_metrics({
                    "map": metrics["map"], "ap50": metrics["ap50"], "ap75": metrics["ap75"],
                    "mean_precision": metrics["mean_precision"], "mean_recall": metrics["mean_recall"],
                    **latency_metrics,
                })
                for artifact_file in artifact_files:
                    mlflow.log_artifact(str(artifact_file))
                if save_dir.exists():
                    mlflow.log_artifacts(str(save_dir), artifact_path="ultralytics_run")
    finally:
        shutil.rmtree(save_dir, ignore_errors=True)
        # Drop the shared temp project dir too once nothing else lives in it.
        if tmp_project.exists() and not any(tmp_project.iterdir()):
            shutil.rmtree(tmp_project, ignore_errors=True)

    return metrics


In [9]:
# I needed to increase the shared memory usage to 10GB
domino_user_name = os.environ['DOMINO_USER_NAME']
experiment_name = f"cv-benchmark-{domino_user_name}"
base_folder = f"{download_base_folder}/coco"
base_path = f"{download_base_folder}/coco"

utilities.ensure_mlflow_experiment(experiment_name)

# Registry names of the models to compare (replaces the ".pt" list defined earlier).
model_names = ["yolov8n", "yolov5n", "yolov8m", "yolov8s"]
imgsz = 640
device = "0"
limit_images = 50
subset_seed = 0

# Settings shared by the parent run's params and by every evaluate_model call.
shared_eval_kwargs = dict(
    imgsz=imgsz,
    device=device,
    limit_images=limit_images,
    subset_seed=subset_seed,
)

# One parent run; each model is evaluated inside it and logged as a nested run.
with mlflow.start_run(run_name=f"parent_benchmark_{limit_images}") as parent:
    parent_id = parent.info.run_id
    mlflow.log_params({"base_path": base_path, **shared_eval_kwargs})
    for model_name in model_names:
        print(f"Evaluating {model_name}")
        evaluate_model(
            base_path=base_path,
            registry_model_name=model_name,
            registry_model_version="latest",
            experiment_name=None,          # experiment already set
            parent_run_id=parent_id,       # pass parent
            **shared_eval_kwargs,
        )


Evaluating yolov8n


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 6/6 [00:01<00:00,  3.90it/s]

[Loaded] yolov8n:latest from /tmp/yolov8n__hou17ow/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov8n__hou17ow/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider





Setting batch=1 input of shape (1, 3, 640, 640)
[34m[1mval: [0mFast image access âœ… (ping: 3.7Â±0.1 ms, read: 108.3Â±38.1 MB/s, size: 167.0 KB)


[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 50/50 [00:01<00:00, 36.22it/s]


                   all         50        564      0.529      0.531      0.585      0.446
Speed: 0.9ms preprocess, 3.8ms inference, 0.0ms loss, 12.2ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov8n:latest_subset50_20250821_132507[0m
Loading /tmp/yolov8n__hou17ow/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Evaluating yolov5n


Downloading artifacts: 100%|██████████| 6/6 [00:01<00:00,  4.51it/s]

[Loaded] yolov5n:latest from /tmp/yolov5n_puhqva4m/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov5n_puhqva4m/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Setting batch=1 input of shape (1, 3, 640, 640)
[34m[1mval: [0mFast image access âœ… (ping: 0.0Â±0.0 ms, read: 127.7Â±32.1 MB/s, size: 187.6 KB)



[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 50/50 [00:00<00:00, 67.98it/s]


                   all         50        564       0.59      0.521       0.56      0.419
Speed: 0.3ms preprocess, 4.0ms inference, 0.0ms loss, 1.4ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov5n:latest_subset50_20250821_132523[0m
Loading /tmp/yolov5n_puhqva4m/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Evaluating yolov8m


Downloading artifacts: 100%|██████████| 6/6 [00:08<00:00,  1.36s/it]

[Loaded] yolov8m:latest from /tmp/yolov8m_diqozcwp/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov8m_diqozcwp/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Setting batch=1 input of shape (1, 3, 640, 640)
[34m[1mval: [0mFast image access âœ… (ping: 0.0Â±0.0 ms, read: 91.8Â±39.5 MB/s, size: 156.1 KB)



[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 50/50 [00:00<00:00, 50.17it/s]


                   all         50        564      0.635      0.675      0.687      0.547
Speed: 0.4ms preprocess, 9.0ms inference, 0.0ms loss, 1.2ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov8m:latest_subset50_20250821_132542[0m
Loading /tmp/yolov8m_diqozcwp/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Evaluating yolov8s


Downloading artifacts: 100%|██████████| 6/6 [00:03<00:00,  1.61it/s]

[Loaded] yolov8s:latest from /tmp/yolov8s_fbpi3czn/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov8s_fbpi3czn/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Setting batch=1 input of shape (1, 3, 640, 640)
[34m[1mval: [0mFast image access âœ… (ping: 0.0Â±0.0 ms, read: 116.5Â±50.1 MB/s, size: 168.1 KB)



[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 50/50 [00:00<00:00, 58.81it/s]


                   all         50        564       0.64       0.61      0.653      0.498
Speed: 0.6ms preprocess, 6.4ms inference, 0.0ms loss, 1.2ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov8s:latest_subset50_20250821_132608[0m
Loading /tmp/yolov8s_fbpi3czn/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider


In [None]:

#metrics = evaluate_model(f"{download_base_folder}/coco",model_path,limit_images=50,experiment_name=experiment_name,device="cpu")
