In [1]:
import os
import shutil

import os
import zipfile
import urllib.request
import utilities

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/home/domino/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


#### Purpose

Ensure that ONNX Runtime is GPU-enabled by verifying the presence of the CUDAExecutionProvider. This is essential for leveraging GPU acceleration during model inference. If CUDA is not available, the code raises an error to avoid unintentional CPU fallback.

#### Context

Use this check before running inference-heavy workloads in computer vision pipelines. It's particularly relevant when running in cloud environments or containers, where GPU access may not be guaranteed by default.

In [2]:
import os, onnxruntime as ort

# Ask ONNX Runtime which execution providers this installation exposes
# (for example CPU, CUDA, ...).
prov = ort.get_available_providers()

# Echo the list so the cell output records what the runtime offered.
print(prov)

# Fail fast when the CUDA provider is missing; the error message lists the
# providers that were actually found.
assert "CUDAExecutionProvider" in prov, (
    f"ORT not GPU-enabled. Providers={prov}"
)


['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']


#### Purpose

Define key environment-based and hardcoded paths and model names used throughout the notebook or script. These paths help organize downloaded datasets and models for consistent storage and retrieval, while model_names lists the YOLO models to be processed.

#### Context

These variables are typically set up at the start of the workflow.

- `datasets_dir` and `project_ds_folder` are pulled from Domino environment variables to ensure compatibility with the platform's file system.

- `download_base_folder` combines them to point to the appropriate working directory.

- `models_folder` specifies the subdirectory to store exported models.

- `model_names` provides a list of YOLO model variants for batch processing or export.

In [3]:
# Both values come from Domino-provided environment variables; a missing
# variable raises KeyError right here.
datasets_dir = os.environ["DOMINO_DATASETS_DIR"]
project_ds_folder = os.environ["DOMINO_PROJECT_NAME"]

# Working directory for downloads: "<datasets_dir>/<project_ds_folder>".
download_base_folder = "/".join((datasets_dir, project_ds_folder))

# Sub-directory name for exported models.
models_folder = "models"

# YOLO variants that the benchmark loop iterates over, in this order.
model_names = [
    "yolov8n",
    "yolov5n",
    "yolov8m",
    "yolov8s",
]


#### Purpose

Evaluates a YOLO-based object detection model on a COCO 2017 validation subset. It measures accuracy and latency, logs metrics to MLflow (if configured), and stores artifacts (plots, configs, and metrics) for reproducibility. This enables model-to-model comparisons on consistent datasets.

#### Context

Used in workflows where:

- Multiple computer vision models (e.g., YOLOv5, YOLOv8) are benchmarked side-by-side.

- Registered models (via MLflow Model Registry) or local model paths are evaluated.

- Validation results need to be versioned and stored for audit and experiment tracking (Domino Experiment Manager).

In [4]:
import os, time, yaml, math, random, json, shutil, statistics as stats
from pathlib import Path
from typing import Dict, Any, List
from datetime import datetime
import mlflow
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from ultralytics import YOLO

# Record whether MLflow can be imported.
# NOTE(review): mlflow is already imported unconditionally in the import list
# at the top of this cell, so the failure branch is effectively never taken,
# and nothing in the visible code reads _HAS_MLFLOW -- confirm whether MLflow
# is meant to be optional.
_HAS_MLFLOW = True
try:
    import mlflow
except Exception:
    _HAS_MLFLOW = False

def _ensure_dir(p: Path) -> Path:
    """Make sure directory ``p`` exists, creating missing parents as needed.

    Args:
        p: Directory path to create if it is not already a directory.

    Returns:
        The same ``p`` object, so the call can be used inline in an assignment.
    """
    if not p.is_dir():
        p.mkdir(exist_ok=True, parents=True)
    return p

def _latency_benchmark(model: YOLO, image_paths: List[Path], imgsz: int, device: str) -> Dict[str, Any]:
    """Time one ``model.predict`` call per image and summarise the latencies.

    A single untimed warm-up prediction is run on the first image, then every
    path in ``image_paths`` is predicted once and timed with
    ``time.perf_counter``.

    Args:
        model: Object whose ``predict(source=..., imgsz=..., device=...,
            verbose=...)`` method is called for each image.
        image_paths: Images to time; each path is passed to ``predict`` as a string.
        imgsz: Forwarded unchanged to ``predict``.
        device: Forwarded unchanged to ``predict``.

    Returns:
        Dict with the same keys in every case:
        ``p50_ms``/``p90_ms``/``p99_ms`` (nearest-rank percentiles of the timed
        calls), ``mean_ms``, ``count`` and ``raw_ms`` (per-image latencies in
        input order). For empty input the statistics are ``None``, ``count`` is
        0 and ``raw_ms`` is an empty list.
    """
    if not image_paths:
        # Keep the key set identical to the non-empty result so callers can
        # index "raw_ms" without special-casing empty input.
        return {
            "p50_ms": None,
            "p90_ms": None,
            "p99_ms": None,
            "mean_ms": None,
            "count": 0,
            "raw_ms": [],
        }

    # Warm-up call: its duration is deliberately not recorded.
    model.predict(source=str(image_paths[0]), imgsz=imgsz, device=device, verbose=False)

    lat_ms = []
    for image_path in image_paths:
        started = time.perf_counter()
        model.predict(source=str(image_path), imgsz=imgsz, device=device, verbose=False)
        lat_ms.append((time.perf_counter() - started) * 1000.0)

    lat_sorted = sorted(lat_ms)

    def pct(values, q):
        # Nearest-rank percentile: rank ceil(q * n), converted to a 0-based
        # index and clamped to the valid range.
        idx = min(len(values) - 1, max(0, int(math.ceil(q * len(values)) - 1)))
        return values[idx]

    return {
        "p50_ms": pct(lat_sorted, 0.50),
        "p90_ms": pct(lat_sorted, 0.90),
        "p99_ms": pct(lat_sorted, 0.99),
        "mean_ms": float(stats.mean(lat_sorted)),
        "count": len(lat_sorted),
        "raw_ms": lat_ms,
    }

def evaluate_model(
    base_path: str,
    model_path: str = None,
    imgsz: int = 640,
    device: str = "cpu",
    limit_images: int = 1000,
    subset_seed: int = 0,
    experiment_name: str = None,
    registry_model_name: str = None,
    registry_model_version: str = "latest",
    parent_run_id: str = None,
) -> Dict[str, Any]:
    """Validate one YOLO detection model on a seeded image subset and record the results.

    The function picks a reproducible subset of ``<base_path>/images/val2017``,
    runs Ultralytics ``model.val`` on it, times single-image inference with
    ``_latency_benchmark`` and writes JSON/PNG artifacts under
    ``<base_path>/artifacts``. When ``experiment_name`` or ``parent_run_id`` is
    given, metrics, params and artifacts are also logged to an MLflow run.

    Args:
        base_path: Dataset root; must contain ``images/val2017``.
        model_path: Local model file passed to ``YOLO(...)``. Used only when
            ``registry_model_name`` is not given.
        imgsz: Image size forwarded to ``val`` and ``predict``.
        device: Device string forwarded to ``val`` and ``predict``.
        limit_images: Maximum number of images in the subset; must be >= 1.
        subset_seed: Seed for the shuffle that selects the subset.
        experiment_name: If set, ``utilities.ensure_mlflow_experiment`` is
            called with it and an MLflow run is logged.
        registry_model_name: If set, the model is loaded through
            ``utilities.load_registered_yolo_model`` instead of ``model_path``.
        registry_model_version: Version passed to the registry loader.
        parent_run_id: If set, an MLflow run is logged with ``nested=True``.
            NOTE(review): the id itself is not passed to MLflow, so the parent
            run presumably has to be the active run in the caller -- confirm.

    Returns:
        Dict with keys ``map``, ``ap50``, ``ap75``, ``mean_precision``,
        ``mean_recall``, ``evaluated_images`` and ``model_id``.

    Raises:
        FileNotFoundError: ``<base_path>/images/val2017`` does not exist.
        ValueError: ``limit_images`` is < 1, or neither ``registry_model_name``
            nor ``model_path`` was provided.
        RuntimeError: No ``.jpg``/``.jpeg``/``.png`` files were found.

    Side effects:
        Overwrites ``artifacts/val_subset.txt``, ``artifacts/metrics/*.json``,
        ``artifacts/config/eval.json`` and ``coco_val_subset.yaml`` under
        ``base_path`` on every call (file names are fixed, not per model). The
        temporary Ultralytics run directory under ``/tmp/ultra_runs`` is
        removed before returning, including when a later step raises.
    """

    # Step 1: Validate inputs and locate the image directory
    base = Path(base_path)
    img_dir = base / "images" / "val2017"
    if not img_dir.exists():
        raise FileNotFoundError(f"Missing image dir: {img_dir}")
    if limit_images < 1:
        # Zero would validate nothing, and a negative value would slice images
        # off the END of the shuffled list instead of limiting it.
        raise ValueError(f"limit_images must be >= 1, got {limit_images}")

    # Step 2: Set up output directories for artifacts
    artifacts = _ensure_dir(base / "artifacts")
    plots_dir = _ensure_dir(artifacts / "plots")
    metrics_dir = _ensure_dir(artifacts / "metrics")
    config_dir = _ensure_dir(artifacts / "config")

    # Step 3: Collect images, shuffle deterministically, apply the subset limit.
    # Sorting first makes the seeded shuffle independent of directory order.
    all_imgs = sorted(
        p for pattern in ("*.jpg", "*.jpeg", "*.png") for p in img_dir.glob(pattern)
    )
    if not all_imgs:
        raise RuntimeError(f"No images found under {img_dir}")
    random.Random(subset_seed).shuffle(all_imgs)
    sub_imgs = all_imgs[:limit_images]

    # Step 4: Save subset to file for reproducibility
    subset_list = artifacts / "val_subset.txt"
    with open(subset_list, "w") as f:
        f.writelines(f"{p.resolve()}\n" for p in sub_imgs)

    # Step 5: Write minimal COCO config YAML for Ultralytics
    data_config = {
        "path": str(base),
        "train": "images/val2017",
        "val": str(subset_list),
        "names": list(range(80)),  # Dummy list of 80 class names
    }
    yaml_path = base / "coco_val_subset.yaml"
    with open(yaml_path, "w") as f:
        yaml.dump(data_config, f)

    # Step 6: Load YOLO model from Registry or local path
    if registry_model_name:
        model = utilities.load_registered_yolo_model(registry_model_name, version=registry_model_version)
        model_id = f"{registry_model_name}:{registry_model_version}"
    else:
        if not model_path:
            raise ValueError("Provide either registry_model_name or model_path")
        model = YOLO(model_path, task="detect")
        model_id = Path(model_path).stem

    # Step 7: Set up Ultralytics output folder for the run
    tmp_project = Path("/tmp/ultra_runs")
    run_name = f"val_{model_id}_subset{len(sub_imgs)}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    # Step 8: Run validation (metrics: mAP, precision, recall, etc.)
    val_res = model.val(
        data=str(yaml_path),
        imgsz=imgsz,
        device=device,
        save_json=False,
        verbose=False,
        project=str(tmp_project),
        name=run_name,
        exist_ok=True,
        workers=0,
    )
    save_dir = Path(val_res.save_dir)

    hist_path = plots_dir / "latency_hist.png"
    # Only a histogram produced by THIS call may be logged; a file left behind
    # by an earlier call would belong to a different model.
    hist_written = False

    try:
        # Step 9: Gather performance metrics
        metrics = {
            "map": float(val_res.box.map),
            "ap50": float(val_res.box.map50),
            "ap75": float(val_res.box.map75),
            "mean_precision": float(val_res.box.mp),
            "mean_recall": float(val_res.box.mr),
            "evaluated_images": len(sub_imgs),
            "model_id": model_id,
        }
        (metrics_dir / "headline.json").write_text(json.dumps(metrics, indent=2))

        # Step 10: Run latency benchmark (single-image inference timing) on at
        # most the first 100 subset images. The NameError fallback keeps the
        # evaluation usable when the helper has not been defined in the kernel.
        try:
            latency = _latency_benchmark(model, sub_imgs[:100], imgsz=imgsz, device=device)
        except NameError:
            latency = {}
        latency_summary = {k: v for k, v in latency.items() if k != "raw_ms"}
        (metrics_dir / "latency.json").write_text(json.dumps(latency_summary, indent=2))

        # Step 11: Plot latency histogram (if available)
        if latency.get("raw_ms"):
            plt.figure()
            plt.hist(latency["raw_ms"], bins=20)
            plt.xlabel("Latency (ms)")
            plt.ylabel("Count")
            plt.title("Per-image latency (batch=1)")
            plt.tight_layout()
            plt.savefig(hist_path)
            plt.close()
            hist_written = True

        # Step 12: Save run config for traceability
        eval_cfg = {
            "imgsz": imgsz,
            "device": device,
            "subset_seed": subset_seed,
            "limit_images": limit_images,
            "subset_list": str(subset_list),
            "model_id": model_id,
            "ultralytics_save_dir": str(save_dir),
            "source": "registry" if registry_model_name else "path",
        }
        (config_dir / "eval.json").write_text(json.dumps(eval_cfg, indent=2))

        # Step 13: Log everything to MLflow (if enabled)
        if experiment_name:
            utilities.ensure_mlflow_experiment(experiment_name)

        if experiment_name or parent_run_id:
            # Log only latency statistics that were actually measured: a None
            # value is not a valid metric, and a made-up 0.0 would read as
            # "infinitely fast" in side-by-side comparisons.
            latency_metrics = {
                f"latency_{key}": float(latency[key])
                for key in ("p50_ms", "p90_ms", "p99_ms", "mean_ms")
                if latency.get(key) is not None
            }
            accuracy_metrics = {
                key: metrics[key]
                for key in ("map", "ap50", "ap75", "mean_precision", "mean_recall")
            }
            with mlflow.start_run(run_name=run_name, nested=bool(parent_run_id)):
                mlflow.set_tags({"model_id": model_id, "source": eval_cfg["source"]})
                mlflow.log_params({
                    "imgsz": imgsz,
                    "device": device,
                    "limit_images": len(sub_imgs),
                    "subset_seed": subset_seed
                })
                mlflow.log_metrics({**accuracy_metrics, **latency_metrics})
                mlflow.log_artifact(str(metrics_dir / "headline.json"))
                mlflow.log_artifact(str(metrics_dir / "latency.json"))
                mlflow.log_artifact(str(config_dir / "eval.json"))
                mlflow.log_artifact(str(artifacts / "val_subset.txt"))
                if hist_written:
                    mlflow.log_artifact(str(hist_path))
                if save_dir.exists():
                    mlflow.log_artifacts(str(save_dir), artifact_path="ultralytics_run")
    finally:
        # Step 14: Clean up temporary run folders on every exit path, with or
        # without MLflow logging and whether or not a step above failed.
        if save_dir.exists():
            shutil.rmtree(save_dir, ignore_errors=True)
        if tmp_project.exists() and not any(tmp_project.iterdir()):
            shutil.rmtree(tmp_project, ignore_errors=True)

    # Step 15: Return headline metrics to caller
    return metrics


### Benchmarking Workflow Overview

This benchmarking flow enables consistent and reproducible evaluation of multiple YOLO models using the COCO dataset and MLflow for experiment tracking.

#### Workflow Steps

1. Start a Parent MLflow Run

- All evaluations are grouped under a single parent run.

- Helps organize metrics and artifacts across models for easy comparison.

2. Iterate Through YOLO Model Variants

- Each model (e.g., yolov8n, yolov5n, etc.) is loaded from the MLflow Model Registry.

- Evaluation is performed using a controlled subset of validation images.

3. Call evaluate_model()
   For each model:
   - Runs val() using Ultralytics YOLO.

   - Collects accuracy metrics (e.g., mAP, AP50, precision, recall).

   - Benchmarks inference latency on the selected device.

   - Logs results to MLflow as a child run.

4. Log Metrics and Artifacts

   Each child run logs:

   - Evaluation metrics (JSON)

   - Subset configuration (TXT)

   - Latency histograms (PNG)

   - Ultralytics output artifacts

5. Reproducibility by Design

   - Uses fixed random seed (subset_seed) and image limit to ensure consistent subsets.

   - Results are isolated and traceable per model while remaining linked to the parent.

### Benefits

- **Isolated runs**: Each model evaluation is self-contained and trackable.

- **Fair comparison**: All models are evaluated on the same image subset.

- **Re-runnable**: Deterministic behavior allows rerunning or adding new models easily.

- **Experiment visibility**: MLflow UI allows side-by-side metric comparisons.

In [5]:
# Environment note from the author: shared memory had to be increased to 10GB
# for this cell.
domino_user_name = os.environ["DOMINO_USER_NAME"]
experiment_name = f"cv-benchmark-{domino_user_name}"
base_folder = f"{download_base_folder}/coco"

utilities.ensure_mlflow_experiment(experiment_name)

# Settings shared by every model in the comparison.
base_path = f"{download_base_folder}/coco"
imgsz = 640
device = "0"
limit_images = 50
subset_seed = 0

# One parent run groups the per-model evaluations started inside the loop.
with mlflow.start_run(run_name=f"parent_benchmark_{limit_images}") as parent:
    parent_id = parent.info.run_id
    mlflow.log_params(
        {
            "base_path": base_path,
            "imgsz": imgsz,
            "device": device,
            "limit_images": limit_images,
            "subset_seed": subset_seed,
        }
    )

    for model_name in model_names:
        print(f"Evaluating {model_name}")
        evaluate_model(
            base_path=base_path,
            registry_model_name=model_name,
            registry_model_version="latest",
            imgsz=imgsz,
            device=device,
            limit_images=limit_images,
            subset_seed=subset_seed,
            experiment_name=None,     # experiment was already selected above
            parent_run_id=parent_id,  # makes evaluate_model log a nested run
        )


Evaluating yolov8n


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 6/6 [00:01<00:00,  3.90it/s]


[Loaded] yolov8n:latest from /tmp/yolov8n_fkrlu_ro/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov8n_fkrlu_ro/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Setting batch=1 input of shape (1, 3, 640, 640)


Downloading https://ultralytics.com/assets/Arial.ttf to '/home/domino/.config/Ultralytics/Arial.ttf': 100%|██████████| 755k/755k [00:00<00:00, 23.5MB/s]

[34m[1mval: [0mFast image access âœ… (ping: 2.3Â±0.4 ms, read: 14.6Â±3.1 MB/s, size: 145.7 KB)



[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:04<00:00, 11.09it/s]


                   all         50        564      0.529      0.531      0.585      0.446
Speed: 2.8ms preprocess, 4.0ms inference, 0.0ms loss, 28.9ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov8n:latest_subset50_20250822_150209[0m
Loading /tmp/yolov8n_fkrlu_ro/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Evaluating yolov5n


Downloading artifacts: 100%|██████████| 6/6 [00:01<00:00,  4.74it/s]

[Loaded] yolov5n:latest from /tmp/yolov5n_zrj_pzbe/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov5n_zrj_pzbe/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Setting batch=1 input of shape (1, 3, 640, 640)





[34m[1mval: [0mFast image access âœ… (ping: 0.0Â±0.0 ms, read: 96.7Â±33.5 MB/s, size: 153.8 KB)


[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<00:00, 50.39it/s]


                   all         50        564       0.59      0.521       0.56      0.419
Speed: 0.3ms preprocess, 3.9ms inference, 0.0ms loss, 6.7ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov5n:latest_subset50_20250822_150233[0m
Loading /tmp/yolov5n_zrj_pzbe/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Evaluating yolov8m


Downloading artifacts: 100%|██████████| 6/6 [00:07<00:00,  1.32s/it]

[Loaded] yolov8m:latest from /tmp/yolov8m_4xy1d64h/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov8m_4xy1d64h/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Setting batch=1 input of shape (1, 3, 640, 640)
[34m[1mval: [0mFast image access âœ… (ping: 0.6Â±0.0 ms, read: 79.1Â±12.4 MB/s, size: 111.1 KB)



[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:01<00:00, 48.85it/s]


                   all         50        564      0.635      0.675      0.687      0.547
Speed: 0.6ms preprocess, 9.7ms inference, 0.0ms loss, 1.1ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov8m:latest_subset50_20250822_150252[0m
Loading /tmp/yolov8m_4xy1d64h/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Evaluating yolov8s


Downloading artifacts: 100%|██████████| 6/6 [00:03<00:00,  1.60it/s]

[Loaded] yolov8s:latest from /tmp/yolov8s_xas6gmz6/model.onnx
Ultralytics 8.3.182 🚀 Python-3.8.10 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10G, 22724MiB)
Loading /tmp/yolov8s_xas6gmz6/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider
Setting batch=1 input of shape (1, 3, 640, 640)
[34m[1mval: [0mFast image access âœ… (ping: 0.0Â±0.0 ms, read: 117.0Â±51.6 MB/s, size: 151.3 KB)



[34m[1mval: [0mScanning /mnt/data/reference-cv-model-comparison/coco/labels/val2017.cache... 50 images, 2 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 50/50 [00:00<00:00, 58.25it/s]


                   all         50        564      0.639       0.61      0.652      0.498
Speed: 0.6ms preprocess, 6.6ms inference, 0.0ms loss, 1.2ms postprocess per image
Results saved to [1m/tmp/ultra_runs/val_yolov8s:latest_subset50_20250822_150318[0m
Loading /tmp/yolov8s_xas6gmz6/model.onnx for ONNX Runtime inference...
Using ONNX Runtime CUDAExecutionProvider


In [6]:

#metrics = evaluate_model(f"{download_base_folder}/coco",model_path,limit_images=50,experiment_name=experiment_name,device="cpu")
