# E2 Test & Evaluation

Experiment 2 (Option A): **Training time (epochs) vs performance** (no early stopping)

We sweep epoch budgets **E ∈ {5, 10, 20, 40, 80}** with a **fixed freezing configuration** (default: F2) for each model (YOLOv8m vs RT-DETR-L).


## 0. Setup


### 0.1 Clone Repo to Colab /content

In [None]:
# Check if in Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab")
    # Clone repo if not already cloned
    import os
    if not os.path.exists('Deep_Learning_Gil_Alon'):
        !git clone https://github.com/gil-attar/Deep_Learning_Project_Gil_Alon.git Deep_Learning_Gil_Alon
    %cd Deep_Learning_Gil_Alon
else:
    print("Running locally")
    import os
    from pathlib import Path
    # Navigate to project root if in notebooks/
    if os.path.basename(os.getcwd()) == 'notebooks':
        os.chdir('..')

print(f"Working directory: {os.getcwd()}")

### 0.2 Mounting Google Drive & Setting up Folder Structure for Results

In [None]:
# Mount Google Drive for saving run results
# NOTE(review): `google.colab` is presumably importable only inside Colab, so this
# cell is expected to fail on a local run — confirm before running locally.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import time
from pathlib import Path

PROJECT_NAME = "Deep_Learning_Project_Gil_Alon"

# IMPORTANT:
# - The timestamped RUN_ID below gives every execution of this cell a fresh folder.
# - To continue the SAME sweep after a disconnect, replace it with a fixed string
#   (e.g., RUN_ID="E2_fullsweep_v1") and reuse that string after reconnecting.
RUN_ID = time.strftime("E2_%Y%m%d_%H%M%S")

DRIVE_ROOT = Path("/content/drive/MyDrive/Colab_Outputs").joinpath(PROJECT_NAME, RUN_ID)

# Experiment 2 outputs and an optional cache for pretrained weights.
PERSIST_E2_RUNS = DRIVE_ROOT.joinpath("E2_runs")
PERSIST_WEIGHTS = DRIVE_ROOT.joinpath("pretrained")

for _persist_dir in (PERSIST_E2_RUNS, PERSIST_WEIGHTS):
    _persist_dir.mkdir(parents=True, exist_ok=True)

for _label, _location in (
    ("Drive root:", DRIVE_ROOT),
    ("E2 runs:", PERSIST_E2_RUNS),
    ("Weights cache:", PERSIST_WEIGHTS),
):
    print(_label, _location)


### 0.3 Symlink the repo’s runs/ to Drive

In [None]:
from pathlib import Path

# The current working directory is treated as the repository root.
# NOTE(review): assumes the clone/cd cell (0.1) already ran in this session — confirm.
REPO = Path.cwd()
print("REPO =", REPO)

# In-repo path that gets replaced by a symlink to PERSIST_E2_RUNS (defined in 0.2).
E2_RUNS_IN_REPO = REPO / "experiments" / "Experiment_2" / "runs"

# Safety check
assert REPO.exists(), f"Repo path does not exist: {REPO}"

# Remove local runs dir if it exists, then link to Drive.
# `rm -rf` deletes whatever currently sits at that path before the link is made.
!rm -rf "{E2_RUNS_IN_REPO}"
!ln -s "{PERSIST_E2_RUNS}" "{E2_RUNS_IN_REPO}"

print("Symlink created:")
!ls -la "{E2_RUNS_IN_REPO}"


### 0.4 Re-routing Model Weights to Drive

In [None]:
# In-repo weights path; replaced below by a symlink to PERSIST_WEIGHTS.
# Relies on REPO and PERSIST_WEIGHTS from the earlier setup cells.
WEIGHTS_IN_REPO = REPO / "artifacts" / "weights"

# `rm -rf` deletes whatever currently sits at that path before the link is made.
!rm -rf "{WEIGHTS_IN_REPO}"
!ln -s "{PERSIST_WEIGHTS}" "{WEIGHTS_IN_REPO}"

print("Weights dir now points to Drive:")
!ls -la "{WEIGHTS_IN_REPO}"


In [None]:
# Install dependencies
!pip install -q ultralytics roboflow pyyaml pillow numpy matplotlib pandas tqdm


In [None]:
# Report the PyTorch build and whether a CUDA device is visible
import torch

print(f"PyTorch version: {torch.__version__}")
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    # Name of device 0 only.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 1. Download Dataset

In [None]:
# Set Roboflow API key without hard-coding the secret in the notebook.
# SECURITY: a literal key used to be committed on this line; treat that key as
# leaked and rotate it in the Roboflow dashboard.
import os
from getpass import getpass

# Reuse a key already exported in the environment; otherwise prompt for it
# (getpass does not echo the input into the notebook output).
if not os.environ.get("ROBOFLOW_API_KEY"):
    _roboflow_key = getpass("Roboflow API key: ").strip()
    if not _roboflow_key:
        raise ValueError("ROBOFLOW_API_KEY is required to download the dataset")
    os.environ["ROBOFLOW_API_KEY"] = _roboflow_key

# Download dataset
!python scripts/download_dataset.py --output_dir data/raw

In [None]:
# Verify dataset downloaded
!echo "Train images: $(ls data/raw/train/images/ 2>/dev/null | wc -l)"
!echo "Valid images: $(ls data/raw/valid/images/ 2>/dev/null | wc -l)"
!echo "Test images: $(ls data/raw/test/images/ 2>/dev/null | wc -l)"

## 2. Fetch COCO-Pretrained Weights (YOLOv8m + RT-DETR-L)

The weights are stored under `artifacts/weights/`.


In [None]:
# Fetch pretrained weights (idempotent)
!bash scripts/fetch_weights.sh


## 3. Build Evaluation Indices

In [None]:
# Build train/val/test indices (with ACTUAL image dimensions - this is critical!)
# Remove old indices first to ensure fresh rebuild
import shutil
from pathlib import Path

if Path("data/processed/evaluation").exists():
    shutil.rmtree("data/processed/evaluation")
    print("✓ Removed old indices")

!python scripts/build_evaluation_indices.py \
    --dataset_root data/raw \
    --output_dir data/processed/evaluation

In [None]:
# Confirm the test index was written and print its metadata summary
import json
from pathlib import Path

test_index_path = "data/processed/evaluation/test_index.json"

if not Path(test_index_path).exists():
    print("❌ Test index not found!")
else:
    with open(test_index_path) as f:
        test_data = json.load(f)
    _meta = test_data['metadata']
    print(f"✓ Test index: {_meta['num_images']} images")
    print(f"  Total objects: {_meta['total_objects']}")
    print(f"  Classes: {_meta['num_classes']}")

### 3.1. Create data.yaml for Training

In [None]:
# Write data/processed/data.yaml with an absolute dataset root (for Colab)
import yaml
from pathlib import Path
import os

# Absolute location of the raw dataset
dataset_root = Path('data/raw').resolve()

# The class names come from the dataset's own data.yaml
with open(dataset_root / 'data.yaml', 'r') as f:
    config = yaml.safe_load(f)

_class_names = config['names']

# Key order is kept on output because the dump below uses sort_keys=False.
train_config = dict(
    path=str(dataset_root),  # Absolute base path
    train='train/images',
    val='valid/images',
    test='test/images',
    names=_class_names,
    nc=len(_class_names),
)

# Destination: data/processed/
output_path = Path('data/processed/data.yaml')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w') as f:
    yaml.dump(train_config, f, default_flow_style=False, sort_keys=False)

for _line in (
    "✓ Created data.yaml with absolute paths",
    f"  Path: {train_config['path']}",
    f"  Classes: {train_config['nc']}",
):
    print(_line)

## 4. Run Experiment 2 (Epoch Budget Sweep: YOLOv8m vs RT-DETR-L)


In [None]:
# E2 sweep configuration
DRY_RUN = True   # True: quick smoke test (E=1). False: full sweep (E in {5,10,20,40,80}).

FREEZE = "F2"            # fixed freeze preset (see experiments/Experiment_1/freezing/freeze_presets.py)

# Epoch budgets as one space-separated string (handed to the bash runner as-is).
if DRY_RUN:
    EPOCHS_LIST = "1"
else:
    EPOCHS_LIST = "5 10 20 40 80"
IMGSZ = 640
SEED = 42

# Reporting threshold used inside the E2 runner for per-class/counting metrics.
# NOTE: the bash script must pass this through to the runner if you change it.
REPORT_CONF = 0.25

print(" | ".join(
    f"{_key}={_value}"
    for _key, _value in (
        ("DRY_RUN", DRY_RUN),
        ("FREEZE", FREEZE),
        ("EPOCHS_LIST", EPOCHS_LIST),
        ("IMGSZ", IMGSZ),
        ("SEED", SEED),
        ("REPORT_CONF", REPORT_CONF),
    )
))


In [None]:
# Run Experiment 2
# This calls experiments/Experiment_2/runOneTest.py for each (model, epochs) with fixed FREEZE.

# If you later update run_experiment2.sh to forward REPORT_CONF, add: REPORT_CONF={REPORT_CONF}
!FREEZE={FREEZE} EPOCHS_LIST="{EPOCHS_LIST}" IMGSZ={IMGSZ} SEED={SEED} bash experiments/Experiment_2/run_experiment2.sh


## 5. Verify Run Outputs (E2)


In [None]:
from pathlib import Path

runs_root = Path("experiments/Experiment_2/runs")
assert runs_root.exists(), f"Missing runs root: {runs_root}"

# Artifacts every epoch-budget run directory is expected to contain
# (paths are relative to the run directory).
expected_files = [
    "run_manifest.json",
    "train_summary.json",
    "predictions/test_predictions.json",
    "eval/test/metrics.json",
    "eval/test/summary.csv",
    "run_summary.json",
]

# Any directory under runs_root whose name starts with "E" counts as a run dir.
run_dirs = sorted(p for p in runs_root.glob("**/E*") if p.is_dir())
print(f"Found {len(run_dirs)} epoch-budget run directories")

# (run_dir, relative_path) pairs for every expected artifact that is absent.
missing = [
    (str(rd), ef)
    for rd in run_dirs
    for ef in expected_files
    if not (rd / ef).exists()
]

_MAX_SHOWN = 60  # cap the listing so a badly broken sweep does not flood the output

if not run_dirs:
    # Without this branch an empty sweep would be reported as fully verified.
    print("WARNING: no E* run directories found - nothing was verified.")
elif missing:
    print("WARNING: missing expected artifacts in some runs:")
    for rd, ef in missing[:_MAX_SHOWN]:
        print(f"  - {rd} :: {ef}")
    if len(missing) > _MAX_SHOWN:
        print(f"  ... and {len(missing) - _MAX_SHOWN} more")
else:
    print("✓ All runs contain the expected artifacts.")


## 6. Aggregate Metrics Across Runs (E2)


In [None]:
import json
from pathlib import Path
import pandas as pd

runs_root = Path("experiments/Experiment_2/runs")

# Columns tried, in order, when reading the evaluator's summary.csv.
SUMMARY_SCORE_CANDIDATES = [
    "mAP50", "map50", "mAP@0.5", "mAP_0.5",
    "f1", "F1", "precision", "recall",
]
# Columns tried, in order, when falling back to the Ultralytics results.csv.
RESULTS_MAP50_CANDIDATES = ["metrics/mAP50(B)", "metrics/mAP50", "metrics/mAP_0.5"]
# Identifier-like columns that must never be picked as a score.
NON_METRIC_COLUMNS = {"class", "class_id", "name"}
# Fixed column set so the aggregate keeps its shape even when no run is found.
AGG_COLUMNS = [
    "model", "freeze_id", "epochs", "train_wall_s",
    "sec_per_epoch", "score", "score_col", "run_dir",
]


def _max_numeric(series):
    """Return the largest numeric value in *series*, or None if it has none."""
    values = pd.to_numeric(series, errors="coerce").dropna()
    return float(values.max()) if len(values) else None


rows = []
for summary_path in sorted(runs_root.glob("**/run_summary.json")):
    j = json.loads(summary_path.read_text())
    m = j.get("manifest", {}) or {}
    model = m.get("model")
    freeze_id = m.get("freeze_id")

    # The epoch budget is the x-axis of this experiment: a run without one
    # cannot be placed, so skip it with a warning instead of crashing.
    try:
        epochs = int(m.get("epochs"))
    except (TypeError, ValueError):
        print(f"WARNING: skipping {summary_path} (no valid 'epochs' in manifest)")
        continue

    timing = (j.get("train", {}) or {}).get("timing", {}) or {}
    wall = timing.get("train_wall_time_seconds")
    spe = timing.get("train_seconds_per_epoch")

    # Prefer evaluator summary.csv for a single reported score (fixed threshold).
    # Try the eval_dir recorded in the summary first, then the conventional
    # location inside the run directory (the recorded path may be absent or stale).
    recorded_eval_dir = ((j.get("eval", {}) or {}).get("test", {}) or {}).get("eval_dir")
    csv_candidates = []
    if recorded_eval_dir:
        csv_candidates.append(Path(recorded_eval_dir) / "summary.csv")
    csv_candidates.append(summary_path.parent / "eval" / "test" / "summary.csv")
    summary_csv = next((p for p in csv_candidates if p.exists()), None)

    score = None
    score_col = None
    if summary_csv is not None:
        summary_df = pd.read_csv(summary_csv)
        # Known metric columns first ...
        for c in SUMMARY_SCORE_CANDIDATES:
            if c in summary_df.columns:
                value = _max_numeric(summary_df[c])
                if value is not None:
                    score, score_col = value, c
                    break
        # ... then the first column that actually holds numbers (excluding ids).
        if score is None:
            for c in summary_df.columns:
                if str(c).lower() in NON_METRIC_COLUMNS:
                    continue
                value = _max_numeric(summary_df[c])
                if value is not None:
                    score, score_col = value, c
                    break

    # Secondary fallback: Ultralytics results.csv last-epoch val mAP50 if present
    if score is None:
        results_csv = summary_path.parent / "results.csv"
        if results_csv.exists():
            rdf = pd.read_csv(results_csv)
            rdf.columns = [c.strip() for c in rdf.columns]
            for c in RESULTS_MAP50_CANDIDATES:
                if c in rdf.columns and len(rdf) > 0:
                    score = float(rdf[c].iloc[-1])
                    score_col = c
                    break

    rows.append({
        "model": model,
        "freeze_id": freeze_id,
        "epochs": epochs,
        "train_wall_s": wall,
        "sec_per_epoch": spe,
        "score": score,
        "score_col": score_col,
        "run_dir": str(summary_path.parent),
    })

# Explicit columns keep sort_values from raising KeyError when no runs were found.
agg = pd.DataFrame(rows, columns=AGG_COLUMNS)
agg = agg.sort_values(["model", "epochs"]).reset_index(drop=True)
print("Runs found:", len(agg))
display(agg)

out_csv = Path("experiments/Experiment_2/runs/_aggregate_e2.csv")
out_csv.parent.mkdir(parents=True, exist_ok=True)
agg.to_csv(out_csv, index=False)
print("Saved:", out_csv.resolve())


In [None]:
import json
import math
from pathlib import Path
import pandas as pd
import numpy as np

# This cell summarizes each E* run directory using:
# - run_manifest.json (trainable params, settings)
# - eval/test/metrics.json (test-side evaluator metrics)
# - predictions/test_predictions.json (inference timing)
# - results.csv (per-epoch Ultralytics log, if present)

def _safe_get(d, keys, default=None):
    """Walk nested dicts along *keys* and return the value found there.

    Returns *default* as soon as the current level is not a dict or lacks
    the next key. An empty *keys* sequence returns *d* itself.
    """
    node = d
    for key in keys:
        if isinstance(node, dict) and key in node:
            node = node[key]
        else:
            return default
    return node

def auc_trapz(x, y):
    """Return the trapezoidal area under y(x) as a float.

    Samples where either coordinate is NaN/inf are dropped first, so one
    missing epoch in a training log does not turn the whole area into NaN.
    Returns NaN when fewer than two finite samples remain.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    finite = np.isfinite(x) & np.isfinite(y)
    x, y = x[finite], y[finite]
    if len(x) < 2:
        return np.nan
    # NumPy 2.0 renamed trapz -> trapezoid and deprecated the old name;
    # fall back to trapz on older NumPy releases.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    return float(trapezoid(y, x))

def get_map50_col(ep_df: pd.DataFrame):
    """Pick the column of *ep_df* that holds the per-epoch mAP50 value.

    Known column names are tried in priority order; failing that, the first
    column whose lower-cased name contains 'map50' is used. Returns None
    when nothing matches.
    """
    preferred = (
        "metrics/mAP50(B)",
        "metrics/mAP50",
        "metrics/mAP_0.5",
        "metrics/mAP@0.5",
        "metrics/mAP50-95(B)",  # fallback (not mAP50 but better than nothing)
    )
    columns = ep_df.columns

    exact = next((name for name in preferred if name in columns), None)
    if exact is not None:
        return exact

    # Last resort: substring match on the lower-cased column name.
    return next((name for name in columns if "map50" in name.lower()), None)

def get_train_loss_cols(ep_df: pd.DataFrame):
    """List the columns named 'train/...' whose lower-cased name contains 'loss'."""
    # YOLO typically: train/box_loss, train/cls_loss, train/dfl_loss
    selected = []
    for name in ep_df.columns:
        if name.startswith('train/') and 'loss' in name.lower():
            selected.append(name)
    return selected

def get_val_loss_cols(ep_df: pd.DataFrame):
    """List the columns named 'val/...' whose lower-cased name contains 'loss'."""
    # YOLO typically: val/box_loss, val/cls_loss, val/dfl_loss
    selected = []
    for name in ep_df.columns:
        if name.startswith('val/') and 'loss' in name.lower():
            selected.append(name)
    return selected

def extract_primary_metric(metrics_json: dict):
    """Return (name, value) of the first usable metric in the evaluator JSON.

    Known locations are tried in priority order, then any top-level numeric
    entry. Booleans and NaN never count as metrics. Returns (None, None)
    when nothing usable is found.
    """
    def _usable(v):
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(v, bool) or not isinstance(v, (int, float)):
            return False
        return not (isinstance(v, float) and math.isnan(v))

    candidates = [
        ("overall_f1", ["overall", "f1"]),
        ("overall_precision", ["overall", "precision"]),
        ("overall_recall", ["overall", "recall"]),
        ("f1", ["f1"]),
        ("precision", ["precision"]),
        ("recall", ["recall"]),
        ("map50", ["map50"]),
        ("map", ["map"]),
    ]
    for name, path in candidates:
        val = _safe_get(metrics_json, path)
        if _usable(val):
            return name, float(val)
    if isinstance(metrics_json, dict):
        for k, v in metrics_json.items():
            if _usable(v):
                return str(k), float(v)
    return None, None


# JSON artifacts a run directory must contain to be summarized.
_REQUIRED_JSON = (
    "run_manifest.json",
    "eval/test/metrics.json",
    "predictions/test_predictions.json",
)

rows = []
for rd in run_dirs:  # run_dirs should be E* directories (see prior cell)
    rd = Path(rd)

    # The verification cell only warns about incomplete runs; skip them here
    # as well rather than aborting the whole summary on the first one.
    absent = [rel for rel in _REQUIRED_JSON if not (rd / rel).exists()]
    if absent:
        print(f"WARNING: skipping {rd} (missing: {', '.join(absent)})")
        continue

    manifest = json.loads((rd / "run_manifest.json").read_text())
    test_metrics = json.loads((rd / "eval/test/metrics.json").read_text())
    preds_test = json.loads((rd / "predictions/test_predictions.json").read_text())

    model = manifest.get('model')
    freeze_id = manifest.get('freeze_id')
    epochs = manifest.get('epochs')

    # Per-epoch log (Ultralytics)
    ep = None
    results_csv = rd / "results.csv"
    if results_csv.exists():
        ep = pd.read_csv(results_csv)
        ep.columns = [c.strip() for c in ep.columns]
        if "epoch" not in ep.columns:
            # No epoch column: use the row number as the epoch.
            ep = ep.reset_index().rename(columns={"index": "epoch"})

    map50_col = None
    best_idx = None  # row position of the best validation mAP50
    best_val_map50 = np.nan
    best_epoch = np.nan
    epoch_to_90pct = np.nan
    auc_map50 = np.nan

    best_train_loss = np.nan
    best_val_loss = np.nan
    gen_gap_at_best = np.nan

    if ep is not None and len(ep) > 0:
        map50_col = get_map50_col(ep)
        if map50_col is not None:
            y = pd.to_numeric(ep[map50_col], errors='coerce').to_numpy(dtype=float)
            x = pd.to_numeric(ep["epoch"], errors='coerce').to_numpy(dtype=float)

            if np.isfinite(y).any():
                best_idx = int(np.nanargmax(y))
                best_val_map50 = float(y[best_idx])
                if np.isfinite(x[best_idx]):
                    best_epoch = int(x[best_idx])
                auc_map50 = auc_trapz(x, y)

                # First epoch whose mAP50 reaches 90% of the best value.
                reached = np.flatnonzero(y >= 0.9 * best_val_map50)
                if len(reached) > 0 and np.isfinite(x[reached[0]]):
                    epoch_to_90pct = int(x[reached[0]])

        # Loss summaries + generalization gap (if val losses exist)
        train_loss_cols = get_train_loss_cols(ep)
        val_loss_cols = get_val_loss_cols(ep)

        if best_idx is not None and train_loss_cols and val_loss_cols:
            train_total = ep[train_loss_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
            val_total = ep[val_loss_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
            # Index the best row by position; re-matching on the epoch value
            # would pick the wrong row if epoch values repeat.
            best_train_loss = float(train_total.iloc[best_idx])
            best_val_loss = float(val_total.iloc[best_idx])
            gen_gap_at_best = float(best_val_loss - best_train_loss)

    # Evaluator primary metric (kept; mostly for final test reporting)
    primary_metric_name, primary_metric_val = extract_primary_metric(test_metrics)

    rows.append({
        "model": model,
        "freeze_id": freeze_id,
        "epochs": epochs,

        "trainable_params": _safe_get(manifest, ["param_counts", "trainable"]),
        "total_params": _safe_get(manifest, ["param_counts", "total"]),

        # Per-epoch (Ultralytics)
        "map50_col": map50_col,
        "best_val_map50": best_val_map50,
        "best_epoch": best_epoch,
        "epoch_to_90pct_best": epoch_to_90pct,
        "auc_val_map50": auc_map50,

        "best_train_loss_at_best_epoch": best_train_loss,
        "best_val_loss_at_best_epoch": best_val_loss,
        "gen_gap_val_minus_train_at_best": gen_gap_at_best,

        # Evaluator final (test-side; depends on your evaluation contract)
        "primary_metric_name": primary_metric_name,
        "primary_metric_test": primary_metric_val,

        "avg_inference_time_ms": _safe_get(preds_test, ["timing", "avg_inference_time_ms"]),
        "num_images": _safe_get(preds_test, ["timing", "num_images"]),

        "run_dir": str(rd),
    })

df = pd.DataFrame(rows)
if rows:
    df = df.sort_values(["model", "freeze_id", "epochs"]).reset_index(drop=True)
else:
    # An empty frame has no columns to sort by; report instead of raising KeyError.
    print("WARNING: no complete runs to summarize.")
display(df)



## 7. Plots (E2)


In [None]:
import matplotlib.pyplot as plt
from pathlib import Path

# Figures are written to a `_plots` folder inside the E2 runs directory.
PLOTS_DIR = Path("experiments/Experiment_2/runs") / "_plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Reload aggregate if needed: use the CSV on disk when it exists; otherwise keep
# the in-memory `agg`, which must already be defined or this raises NameError.
import pandas as pd
agg_path = Path("experiments/Experiment_2/runs/_aggregate_e2.csv")
agg = pd.read_csv(agg_path) if agg_path.exists() else agg

def savefig(name: str):
    """Save the current matplotlib figure under PLOTS_DIR as PNG and PDF.

    Writes ``<name>.png`` (200 dpi) and ``<name>.pdf``, both with tight
    bounding boxes, then prints the two paths.
    """
    targets = {ext: PLOTS_DIR / f"{name}.{ext}" for ext in ("png", "pdf")}
    plt.savefig(targets["png"], bbox_inches="tight", dpi=200)
    plt.savefig(targets["pdf"], bbox_inches="tight")
    print(f"Saved: {targets['png']} and {targets['pdf']}")

# Both figures share one layout: one line per model, epoch budget on the x-axis.
# Each spec is (column to plot, y-axis label, title, output file stem).
_plot_specs = [
    # Plot 1: score vs epochs (per model)
    ("score", "Performance score (see score_col)",
     "E2: Performance vs Epoch Budget (fixed freeze)", "perf_vs_epochs"),
    # Plot 2: wall time vs epochs (per model)
    ("train_wall_s", "Train wall-clock time (s)",
     "E2: Training Time vs Epoch Budget", "time_vs_epochs"),
]

for _value_col, _y_label, _title, _stem in _plot_specs:
    plt.figure(figsize=(7,4))
    for _model_name, _group in agg.groupby("model"):
        _ordered = _group.sort_values("epochs")
        plt.plot(_ordered["epochs"], _ordered[_value_col], marker="o", label=_model_name)
    plt.xlabel("Epoch budget")
    plt.ylabel(_y_label)
    plt.title(_title)
    plt.grid(True)
    plt.legend()
    savefig(_stem)
    plt.show()

print("Plots saved under:", PLOTS_DIR.resolve())


## 8. Inspect One Run (Optional)


In [None]:
from pathlib import Path
from IPython.display import Image, display

runs_root = Path("experiments/Experiment_2/runs")
# Collect every evaluator plot PNG under any run directory
cands = list(runs_root.glob("**/eval/test/plots/*.png"))
if cands:
    # Show the first PNG in sorted path order.
    p = min(cands)
    print("Displaying:", p)
    display(Image(filename=str(p)))
else:
    print("No plot PNGs found under eval/test/plots (this is ok if plot generation is disabled).")


## Summary

This notebook:
1. Clones the repo, mounts Google Drive, and symlinks the repo's run and weight directories to Drive.
2. Downloads the dataset (Roboflow), fetches the COCO-pretrained weights, and builds evaluation indices.
3. Writes an absolute-path `data/processed/data.yaml` for Ultralytics.
4. Runs the **Experiment 2** epoch-budget sweep via `experiments/Experiment_2/run_experiment2.sh`.
5. Verifies expected artifacts, aggregates metrics across runs, and produces E2 plots.
