# Debug Evaluation - Per-Check Details

**Debugging only.** Runs the evaluation logic locally against the `output_test_pos` and `reference` folders of the task directory.
Each check is recorded with `log_evaluation()`, so the saved JSON includes per-check details.

Adjust the `OUTPUT_DIR` and `REFERENCE_DIR` paths in the config cell below as needed.

In [1]:
import asyncio
import json
import os
import sys
from pathlib import Path

# Locate the project root (agenthle-base) by walking up from the working
# directory to the first ancestor that contains pyproject.toml. If no ancestor
# has one, the walk stops at the filesystem root.
cwd = Path.cwd()
project_root = cwd
while project_root != project_root.parent and not (project_root / "pyproject.toml").exists():
    project_root = project_root.parent

# Put the root first on sys.path so project imports resolve from it. Any copy
# left by an earlier run of this cell is dropped first, so re-executing the
# cell does not pile up duplicate entries.
_root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

from utils.evaluation import EvaluationContext

# Load the task's main.py as a standalone module (named "task_main") so its
# NetCDF evaluation helper can be reused here.
import importlib.util

# Prefer a main.py next to the notebook; otherwise fall back to the task
# folder under the project root.
if (cwd / "main.py").exists():
    task_dir = cwd
else:
    task_dir = project_root / "tasks" / "earth_science_tasks" / "GEE_downloading_terraclimate_conus"
main_path = task_dir / "main.py"

spec = importlib.util.spec_from_file_location("task_main", main_path)
task_main = importlib.util.module_from_spec(spec)
spec.loader.exec_module(task_main)  # executes main.py's top-level code

_evaluate_netcdf_content = getattr(task_main, "_evaluate_netcdf_content")

In [2]:
# === CONFIG: Adjust paths to your local output/reference folders ===
# TASK_DIR defaults to the task folder resolved in the first cell; assign an
# explicit Path here to override it.
TASK_DIR = task_dir
# Folder under evaluation; alternatives are output_test_neg or output.
OUTPUT_DIR = TASK_DIR.joinpath("output_test_pos")
REFERENCE_DIR = TASK_DIR.joinpath("reference")
# Variable names handed to the NetCDF content check.
REQUIRED_VARS = ["pr"]
TASK_TAG = "GEE_downloading_terraclimate_conus"
# Destination folder for the per-check evaluation JSON files.
OUTPUT_JSON_DIR = project_root.joinpath("trycua", "cua-bench", "debug")

In [4]:
def _print_check_results(checks):
    """Print one ✓/✗ summary line per check dict (keys: check, passed, message)."""
    print("\nEvaluation Results:")
    for check in checks:
        status = "✓" if check["passed"] else "✗"
        print(f"  {status} {check['check']}: {check['message']}")


async def run_debug_eval():
    """Run the local debug evaluation and log every check individually.

    Compares the contents of ``OUTPUT_DIR`` against ``REFERENCE_DIR``:

    1. the ``.tif`` file names must match between the two folders;
    2. the output folder must contain at least one ``.nc`` file;
    3. the first (alphabetically) output ``.nc`` file is compared with the
       first reference ``.nc`` file via ``_evaluate_netcdf_content``.

    Each check is recorded with ``ctx.log_evaluation`` and ``ctx.add_score``,
    and the evaluation context is created with ``auto_save=True`` and
    ``output_dir=OUTPUT_JSON_DIR``. Every failure path (unreadable folders,
    missing reference NC file, exception during the NC comparison) is scored
    as 0.0 and counted as one item, so it always lowers the final score.

    Returns:
        Whatever ``ctx.get_final_score(num_items=...)`` returns, where
        ``num_items`` is the number of checks recorded
        (presumably the mean score -- confirm against EvaluationContext).
    """
    output_dir = str(OUTPUT_DIR)
    reference_dir = str(REFERENCE_DIR)

    OUTPUT_JSON_DIR.mkdir(parents=True, exist_ok=True)

    async with EvaluationContext(
        task_tag=TASK_TAG,
        mode="custom",
        split="train",
        output_dir=str(OUTPUT_JSON_DIR),
        auto_save=True,
    ) as ctx:
        checks = []

        try:
            output_files = os.listdir(output_dir)
            reference_files = os.listdir(reference_dir)
        except Exception as e:
            print(f"✗ Error listing directories: {e}")
            ctx.log_evaluation(identifier="list_dirs", score=0.0, error=str(e), message="Failed to list output/reference")
            ctx.add_score(0.0)
            return ctx.get_final_score(num_items=1)

        output_tif = sorted(f for f in output_files if f.lower().endswith(".tif"))
        reference_tif = sorted(f for f in reference_files if f.lower().endswith(".tif"))

        # 1) TIF check: same number of files and identical file names.
        tif_same_length = len(output_tif) == len(reference_tif)
        tif_same_names = set(output_tif) == set(reference_tif)
        tif_passed = tif_same_length and tif_same_names
        msg = f"TIF count {len(output_tif)} vs ref {len(reference_tif)}, names match: {tif_same_names}"
        ctx.log_evaluation(identifier="tif_files_match", score=1.0 if tif_passed else 0.0, message=msg)
        ctx.add_score(1.0 if tif_passed else 0.0)
        checks.append({"check": "tif_files_match", "passed": tif_passed, "message": msg})

        # 2) NC file exists. Sorted so the file picked below does not depend
        # on the arbitrary order returned by os.listdir.
        nc_files = sorted(f for f in output_files if f.lower().endswith(".nc"))
        nc_exists = len(nc_files) > 0
        msg = f"Found {len(nc_files)} NC file(s)" if nc_exists else "No .nc file in output"
        ctx.log_evaluation(identifier="nc_file_exists", score=1.0 if nc_exists else 0.0, message=msg)
        ctx.add_score(1.0 if nc_exists else 0.0)
        checks.append({"check": "nc_file_exists", "passed": nc_exists, "message": msg})

        if not nc_exists:
            _print_check_results(checks)
            return ctx.get_final_score(num_items=len(checks))

        ref_nc_files = sorted(f for f in reference_files if f.lower().endswith(".nc"))
        if not ref_nc_files:
            # Count the failure as a scored item; otherwise the final score
            # would be computed from the passing checks alone.
            msg = "No reference NC file found"
            ctx.log_evaluation(identifier="nc_ref", score=0.0, error=msg)
            ctx.add_score(0.0)
            checks.append({"check": "nc_ref", "passed": False, "message": msg})
            _print_check_results(checks)
            return ctx.get_final_score(num_items=len(checks))

        output_nc_path = str(OUTPUT_DIR / nc_files[0])
        reference_nc_path = str(REFERENCE_DIR / ref_nc_files[0])

        # 3) NC content checks, delegated to the task's own helper.
        try:
            nc_checks = _evaluate_netcdf_content(output_nc_path, reference_nc_path, REQUIRED_VARS)
        except Exception as e:
            # Same reasoning as above: a crashed comparison is a failed item.
            ctx.log_evaluation(identifier="nc_content", score=0.0, error=str(e))
            ctx.add_score(0.0)
            checks.append({"check": "nc_content", "passed": False, "message": str(e)})
            _print_check_results(checks)
            return ctx.get_final_score(num_items=len(checks))

        for c in nc_checks:
            ctx.log_evaluation(
                identifier=c["check"],
                score=1.0 if c["passed"] else 0.0,
                message=c["message"],
            )
            ctx.add_score(1.0 if c["passed"] else 0.0)
        checks.extend(nc_checks)

        _print_check_results(checks)
        return ctx.get_final_score(num_items=len(checks))


# In Jupyter the kernel already runs an event loop, so the coroutine is awaited
# directly at cell top level; asyncio.run() cannot be called from a running loop.
score = await run_debug_eval()
print(f"\nFinal score: {score}")


Evaluation Results:
  ✓ tif_files_match: TIF count 60 vs ref 60, names match: True
  ✓ nc_file_exists: Found 1 NC file(s)
  ✓ nc_dims_xy_time: Has x, y, time dims: {'time': 60, 'y': 292, 'x': 670}
  ✓ nc_dim_lengths_match_reference: All dim lengths match reference
  ✓ nc_variables_match_required: Variables ['pr'] match required ['pr']
  ✓ nc_output_minus_reference_zero: output - reference is zero

Final score: 1.0


  ref_dims = dict(ref_ds.dims)
  out_dims = dict(out_ds.dims)


## View saved JSON with per-check details

In [5]:
# Collect the debug evaluation JSONs, newest modification time first, and
# pretty-print the most recent one.
jsons = sorted(
    OUTPUT_JSON_DIR.glob("*evaluation*.json"),
    key=lambda path: path.stat().st_mtime,
    reverse=True,
)
if not jsons:
    print("No evaluation JSONs found. Run the cell above first.")
else:
    latest = jsons[0]
    print(f"Latest: {latest}")
    with open(latest) as f:
        report = json.load(f)
    print(json.dumps(report, indent=2))

Latest: /Users/huiqi/Documents/research/agenthle-base/trycua/cua-bench/debug/rs_001_terraclimate_conus_evaluation_20260213_083648.json
{
  "mode": "custom",
  "task_tag": "rs_001_terraclimate_conus",
  "timestamp": "2026-02-13T08:36:40.179658",
  "evaluations": [
    {
      "identifier": "tif_files_match",
      "score": 1.0,
      "message": "TIF count 60 vs ref 60, names match: True"
    },
    {
      "identifier": "nc_file_exists",
      "score": 1.0,
      "message": "Found 1 NC file(s)"
    },
    {
      "identifier": "nc_dims_xy_time",
      "score": 1.0,
      "message": "Has x, y, time dims: {'time': 60, 'y': 292, 'x': 670}"
    },
    {
      "identifier": "nc_dim_lengths_match_reference",
      "score": 1.0,
      "message": "All dim lengths match reference"
    },
    {
      "identifier": "nc_variables_match_required",
      "score": 1.0,
      "message": "Variables ['pr'] match required ['pr']"
    },
    {
      "identifier": "nc_output_minus_reference_zero",
      "sc