# Test Evaluation System

**Purpose:** Quick test of the new evaluation module with minimal training.

This notebook:
1. Trains a tiny model for 5 epochs (just for testing)
2. Generates predictions on the first 50 images of the test set
3. Runs all 3 evaluation metrics
4. Generates plots

**NOT for actual experiments** - just for debugging the evaluation pipeline!

## 0. Setup

In [None]:
import os
import sys
from pathlib import Path

# Resolve the project root once per kernel session. On a fresh kernel the root
# is taken to be the parent of the current working directory (presumably the
# notebook sits one level below the project root — confirm for your layout).
# Guarding on an existing `project_root` keeps this cell idempotent: without
# the guard, every re-run would chdir one directory further up the tree.
if "project_root" not in globals():
    project_root = Path.cwd().parent

# Put the project root first on sys.path so project packages are importable,
# dropping any earlier copy so re-runs do not stack duplicate entries.
root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

# The relative paths used throughout the notebook are resolved against the root.
os.chdir(project_root)

print(f"Working directory: {os.getcwd()}")

In [None]:
# Verify that the evaluation test index has been built, and summarise it
import json

test_index_path = "data/processed/evaluation/test_index.json"

if Path(test_index_path).exists():
    # Index is present: load it and report the headline numbers from its metadata.
    with open(test_index_path) as index_file:
        test_data = json.load(index_file)
    index_meta = test_data['metadata']
    print(f"✓ Found test index with {index_meta['num_images']} images")
    print(f"  Total objects: {index_meta['total_objects']}")
    print(f"  Classes: {index_meta['num_classes']}")
else:
    # Index is absent: tell the user which script generates it.
    print(f"❌ Test index not found at {test_index_path}")
    print("\nRun this first:")
    print("  python scripts/build_evaluation_indices.py")

## 1. Quick Training (5 epochs - just for testing)

Train a tiny model to generate test predictions.

In [None]:
from ultralytics import YOLO
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Confirm that the dataset config passed to training is present
data_yaml = "data/processed/data.yaml"

if Path(data_yaml).exists():
    print(f"✓ Found {data_yaml}")
else:
    # Config is absent: show the command that creates it.
    print(f"❌ data.yaml not found at {data_yaml}")
    print("\nRun this first:")
    print("  python scripts/create_data_yaml.py --dataset_root data/raw --output data/processed/data.yaml")

In [None]:
# Fit the nano YOLOv8 checkpoint for a handful of epochs — just enough to
# produce weights for exercising the evaluation pipeline.
model = YOLO('yolov8n.pt')

# All training options in one place so they are easy to scan and tweak.
train_settings = dict(
    data=data_yaml,
    epochs=5,            # deliberately tiny run
    imgsz=640,
    batch=16,
    patience=999,        # far above the epoch count, so early stopping never triggers
    save=True,
    project='runs/test_eval',
    name='quick_test',
    exist_ok=True,       # reuse the same run directory on re-runs
    verbose=True,
)

print("Training for 5 epochs (just for testing)...")
results = model.train(**train_settings)

print("\n✓ Training complete (this model is NOT for actual experiments!)")

## 2. Generate Predictions

Run inference on the first 50 test images and save the predictions to JSON.

In [None]:
# Load the checkpoint that the quick training run is expected to have written.
weights_path = "runs/test_eval/quick_test/weights/best.pt"

# Fail early with an actionable message when the checkpoint is missing
# (training was skipped, or its output was saved somewhere else), instead of
# surfacing whatever error the model loader produces for a non-existent file.
if not Path(weights_path).exists():
    raise FileNotFoundError(
        f"Trained weights not found at {weights_path}. "
        "Run the training cell in section 1 first."
    )

model_eval = YOLO(weights_path)

print(f"Loaded model from {weights_path}")

In [None]:
# Read the test index and keep only a small slice so the smoke test stays fast
test_index = json.loads(Path(test_index_path).read_text())

test_images = test_index['images'][:50]  # first 50 entries only
print(f"Running inference on {len(test_images)} test images...")

In [None]:
# Run inference image by image and collect the detections for each image
from tqdm import tqdm
import time

predictions = []

for img_data in tqdm(test_images, desc="Inference"):
    image_id = img_data['image_id']
    image_filename = img_data['image_filename']
    image_path = Path("data/raw/test/images") / image_filename

    # Images missing on disk are skipped entirely: they get no entry in
    # `predictions`, not an entry with an empty detection list.
    if not image_path.exists():
        print(f"Warning: {image_path} not found")
        continue

    # Keep nearly every candidate box (very low confidence cut-off) so that
    # confidence thresholds can be swept later at evaluation time.
    # `iou` is passed explicitly so NMS matches the iou_threshold of 0.50 that
    # the saved inference settings record; when it is omitted, predict() falls
    # back to the library default (0.7 per the Ultralytics docs — TODO confirm
    # for the installed version).
    # Named `result` (singular) so the training `results` object is not clobbered.
    result = model_eval.predict(
        source=str(image_path),
        conf=0.01,
        iou=0.50,
        imgsz=640,
        verbose=False
    )[0]

    # Convert each tensor to a Python list once instead of calling .item()
    # per box; an image with no boxes simply yields an empty list.
    boxes = result.boxes
    detections = [
        {
            "class_id": int(cls_id),
            "class_name": result.names[int(cls_id)],
            "confidence": float(score),
            "bbox": xyxy,
            "bbox_format": "xyxy"
        }
        for cls_id, score, xyxy in zip(
            boxes.cls.tolist(), boxes.conf.tolist(), boxes.xyxy.tolist()
        )
    ]

    predictions.append({
        "image_id": image_id,
        "detections": detections
    })

print(f"\n✓ Generated predictions for {len(predictions)} images")

In [None]:
# Persist the collected detections, together with run metadata, as one JSON file
pred_output_path = "evaluation/metrics/test_quick_predictions.json"
pred_output_dir = Path(pred_output_path).parent
pred_output_dir.mkdir(parents=True, exist_ok=True)

# Settings recorded alongside the predictions for later reference.
inference_settings = {
    "conf_threshold": 0.01,
    "iou_threshold": 0.50,
    "imgsz": 640
}

pred_json = {
    "run_id": "test_quick_5epochs",
    "split": "test",
    "model_family": "yolo",
    "model_name": "yolov8n",
    "inference_settings": inference_settings,
    "predictions": predictions
}

with open(pred_output_path, 'w') as pred_file:
    json.dump(pred_json, pred_file, indent=2)

print(f"✓ Saved predictions to {pred_output_path}")

## 3. Run Evaluation

Test all 3 evaluation metrics.

In [None]:
# Import the project's evaluation package: the loaders (evaluation.io), the
# three metric entry points (evaluation.metrics) and the plotting helper
# (evaluation.plots). Presumably these resolve through the project root that
# the Setup cell put on sys.path — run that cell first.
from evaluation.io import load_predictions, load_ground_truth, load_class_names
from evaluation.metrics import (
    eval_detection_prf_at_iou,
    eval_per_class_metrics_and_confusions,
    eval_counting_quality
)
from evaluation.plots import plot_all_metrics

# Reaching this line means every import above succeeded.
print("✓ Evaluation module imported successfully")

In [None]:
# Read predictions, ground truth and class names for the test split
preds = load_predictions(pred_output_path, split="test")
gts = load_ground_truth(test_index_path, split="test")
class_names = load_class_names(test_index_path)

# Inference covered only a subset of the images, so keep ground truth just for
# the image ids that actually have a prediction entry.
pred_image_ids = set()
for pred in preds:
    pred_image_ids.add(pred['image_id'])
gts = list(filter(lambda gt: gt['image_id'] in pred_image_ids, gts))

print(f"✓ Loaded {len(preds)} predictions")
print(f"✓ Loaded {len(gts)} ground truths")
print(f"✓ Loaded {len(class_names)} classes")

In [None]:
# Metric 1: precision / recall / F1 across a grid of confidence thresholds
print("1. Running detection P/R/F1 evaluation...")
conf_grid = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
threshold_sweep = eval_detection_prf_at_iou(
    preds,
    gts,
    iou_threshold=0.5,
    conf_thresholds=conf_grid,
)

print("\nResults by confidence threshold:")
for conf_thr in threshold_sweep.keys():
    metrics = threshold_sweep[conf_thr]
    print(f"  conf={conf_thr}: P={metrics['precision']:.3f}, R={metrics['recall']:.3f}, F1={metrics['f1']:.3f}")

# Pick the threshold with the highest F1 (the first one wins on ties).
best_thr, best_metrics = max(threshold_sweep.items(), key=lambda entry: entry[1]['f1'])
print(f"\n✓ Best threshold: {best_thr} (F1={best_metrics['f1']:.3f})")

In [None]:
# Metric 2: per-class scores and confusions at the best-F1 confidence threshold
print("\n2. Running per-class evaluation...")
per_class_results = eval_per_class_metrics_and_confusions(
    preds,
    gts,
    iou_threshold=0.5,
    conf_threshold=float(best_thr),
    class_names=class_names,
)

per_class = per_class_results['per_class']
print(f"\n✓ Evaluated {len(per_class)} classes")
top_confusions = per_class_results['top_confusions'][:5]  # report at most five
print(f"✓ Found {len(top_confusions)} top confusions")


def _f1_of(entry):
    """Return the F1 score from a (class_name, metrics) pair."""
    return entry[1]['f1']


# Rank classes from best to worst F1 and show the leaders.
sorted_classes = sorted(per_class.items(), key=_f1_of, reverse=True)
print("\nTop 3 classes by F1:")
for class_name, metrics in sorted_classes[:3]:
    print(f"  {class_name}: F1={metrics['f1']:.3f} (support={metrics['support']})")

In [None]:
# Metric 3: counting quality at the best-F1 confidence threshold
print("\n3. Running counting quality evaluation...")
counting_results = eval_counting_quality(
    preds,
    gts,
    iou_threshold=0.5,
    conf_threshold=float(best_thr),
    class_names=class_names,
)

# Report the global MAE for each of the two result groups returned.
matched_mae = counting_results['matched_only']['global_mae']
print(f"\n✓ Matched-only MAE: {matched_mae:.4f}")
all_pred_mae = counting_results['all_predictions']['global_mae']
print(f"✓ All-predictions MAE: {all_pred_mae:.4f}")

## 4. Generate Plots

In [None]:
# Render every plot for this run into a dedicated results directory
output_dir = "evaluation/results/test_quick/"
plots_dir = Path(output_dir)
plots_dir.mkdir(parents=True, exist_ok=True)

# The per-class table is passed on its own, and the full per-class result is
# passed again as the confusion data.
plot_inputs = dict(
    threshold_sweep=threshold_sweep,
    per_class_results=per_class_results['per_class'],
    confusion_data=per_class_results,
    counting_results=counting_results,
    output_dir=output_dir,
    run_name="Quick Test (5 epochs)",
)
plot_all_metrics(**plot_inputs)

print(f"\n✓ All plots saved to {output_dir}")

## 5. Test CLI Script

In [None]:
# Test the standalone evaluation script
!python scripts/evaluate_run.py \
    --predictions evaluation/metrics/test_quick_predictions.json \
    --ground_truth data/processed/evaluation/test_index.json \
    --output_dir evaluation/results/test_quick_cli/ \
    --run_name "Quick Test CLI" \
    --conf_thresholds 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8

## Summary

If every cell above ran without errors, the evaluation system is working!

**What was tested:**
- ✅ Prediction generation and saving
- ✅ Loading predictions and ground truth
- ✅ Detection P/R/F1 at multiple thresholds
- ✅ Per-class metrics and confusion matrix
- ✅ Counting quality (both methods)
- ✅ Plot generation
- ✅ CLI evaluation script

**Next steps:**
1. Run actual experiments with proper training
2. Use the evaluation system on train/val/test splits
3. Compare models (YOLO vs RT-DETR)