# Experiment 1: Run Cross-Agent Evaluations

This notebook runs MCP evaluations across three coding agents: Claude Code, Goose, and Codex.

**Each evaluation takes 2-3 hours** and tests 25 cases × 4 MCP servers = 100 evaluations per agent.

**Total time: ~2-3 hours** (if run in parallel) or ~6-9 hours (if run sequentially)

---

## Setup

### Environment Variables Required

**CRITICAL: ALL agents require `OPENAI_API_KEY`** (DeepEval uses OpenAI for evaluation scoring)

**Claude Code:**
- `OPENAI_API_KEY`: From `~/openai.key` (for DeepEval CorrectnessMetric)
- `ANTHROPIC_API_KEY`: From `~/cborg.key` (CBORG proxy)
- `ANTHROPIC_BASE_URL`: `https://api.cborg.lbl.gov`
- `PUBMED_EMAIL`, `PUBMED_API_KEY`: For MCP servers

**Goose:**
- `OPENAI_API_KEY`: From `~/openai.key` (for agent + DeepEval)
- `PUBMED_EMAIL`, `PUBMED_API_KEY`: For MCP servers

**Codex:**
- `OPENAI_API_KEY`: From `~/openai.key` (for agent + DeepEval)
- `PUBMED_EMAIL`, `PUBMED_API_KEY`: For MCP servers

In [None]:
import subprocess
import os
from pathlib import Path
from datetime import datetime
import yaml
import shutil

def find_project_root(cwd: Path) -> Path:
    """Return the project root for a session whose working directory is *cwd*.

    When the session was started inside the notebook directory, the project
    root is its parent; otherwise *cwd* is already taken to be the root.

    Args:
        cwd: The current working directory.

    Returns:
        ``cwd.parent`` if the last component of *cwd* contains "notebook",
        else *cwd* unchanged.
    """
    # Inspect only the final path component. Searching the whole path string
    # would also match an unrelated ancestor (e.g. /home/me/notebooks/proj)
    # and wrongly move the root one level above the real project.
    return cwd.parent if "notebook" in cwd.name else cwd


# Set working directory to project root
project_root = find_project_root(Path.cwd())
os.chdir(project_root)
print(f"Working directory: {os.getcwd()}")

def _build_eval_env(agent: str) -> dict:
    """Return a copy of the current environment plus the credentials the evaluation needs."""
    env = os.environ.copy()

    # Path.read_text() opens, reads and closes the key file in one step, so no
    # file handle is left dangling for the lifetime of the notebook kernel.
    env["OPENAI_API_KEY"] = (Path.home() / "openai.key.another").read_text().strip()

    # Prefer values already exported in the environment; fall back to the
    # historical defaults only when they are missing or empty.
    # NOTE(review): these fallbacks are credentials committed to source --
    # they should be moved to a key file or the environment and rotated.
    env["PUBMED_EMAIL"] = env.get("PUBMED_EMAIL") or "justinreese@lbl.gov"
    env["PUBMED_API_KEY"] = env.get("PUBMED_API_KEY") or "01eec0a16472164c6d69163bd28368311808"

    # For Claude: need ANTHROPIC_API_KEY (routed through the CBORG proxy URL)
    if agent == "claude":
        env["ANTHROPIC_API_KEY"] = (Path.home() / "cborg.key").read_text().strip()
        env["ANTHROPIC_BASE_URL"] = "https://api.cborg.lbl.gov"

    return env


def run_isolated_eval(agent: str, config_path: str, background: bool = False):
    """
    Run evaluation in isolated /tmp directory to prevent filesystem access to test_cases.yaml.

    The key is that BOTH the metacoder run location AND the --workdir must be in /tmp,
    so the agent's working directory is completely isolated from the project directory.

    Args:
        agent: Agent name (codex, claude, goose)
        config_path: Path to config file relative to project root
        background: If True, start the evaluation with subprocess.Popen and
            return immediately. The child's stdout/stderr are pipes, so the
            caller must drain them (e.g. ``process.communicate()``); a child
            that writes a lot of output can otherwise block once the OS pipe
            buffer fills.

    Returns:
        Tuple of (isolated_dir, output_file, process or None)

    Raises:
        RuntimeError: If test_cases.yaml is found inside the isolated
            directories, or (foreground mode only) if the evaluation exits
            with a non-zero return code.
        FileNotFoundError: If the config file or a required key file is missing.
    """
    # Sample the clock once so the directory stamp and the output-file date
    # cannot disagree when a run starts right at midnight.
    started = datetime.now()

    # Create isolated directory in /tmp (completely separate from project)
    isolated_dir = Path(f"/tmp/mcp_eval_isolated_{agent}_{started.strftime('%Y%m%d_%H%M%S')}")
    isolated_dir.mkdir(parents=True, exist_ok=True)

    print(f"=== Creating isolated environment: {isolated_dir} ===")

    # Copy ONLY the config file (NOT test_cases.yaml!)
    config_file = project_root / config_path
    config_basename = config_file.name
    isolated_config = isolated_dir / config_basename
    shutil.copy(config_file, isolated_config)

    print(f"‚úì Copied config: {config_basename}")

    # Create isolated workdir for agent sessions (also in /tmp)
    isolated_workdir = isolated_dir / "eval_workdir"
    isolated_workdir.mkdir(exist_ok=True)

    print(f"‚úì Created isolated workdir: {isolated_workdir}")

    # Verify test_cases.yaml is NOT accessible
    if (isolated_dir / "test_cases.yaml").exists():
        raise RuntimeError("ERROR: test_cases.yaml found in isolated_dir!")
    if (isolated_workdir / "test_cases.yaml").exists():
        raise RuntimeError("ERROR: test_cases.yaml found in isolated_workdir!")

    print("‚úì Verified test_cases.yaml is NOT accessible")

    # Set output file (in original project directory)
    output_file = project_root / f"results/compare_agents/{agent}_mcp_only_{started.strftime('%Y%m%d')}.yaml"
    # Make sure the results directory exists up front, so a multi-hour run
    # cannot be lost at the very end because the output path is missing.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print("=== Running evaluation ===")
    print(f"Agent: {agent}")
    print(f"Config: {config_basename}")
    print(f"Run dir: {isolated_dir}")
    print(f"Workdir: {isolated_workdir}")
    print(f"Output: {output_file}")
    print("")

    # Set environment variables
    env = _build_eval_env(agent)

    # Use project's venv python to run metacoder (since we're running from /tmp)
    venv_python = project_root / ".venv/bin/python"

    # Run evaluation from isolated directory with isolated workdir
    # CRITICAL: Both CWD and --workdir must be in /tmp to fully isolate the agent
    # NOTE: Codex config now has disable_shell_tool: true to enforce MCP-only mode
    cmd = [
        str(venv_python), "-m", "metacoder.metacoder", "eval",
        str(isolated_config),
        "--workdir", str(isolated_workdir),
        "-o", str(output_file),
    ]

    if background:
        print("üöÄ Starting evaluation in background...")
        process = subprocess.Popen(
            cmd,
            cwd=isolated_dir,  # Run FROM /tmp
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        return isolated_dir, output_file, process

    print("üöÄ Starting evaluation (this will take 2-3 hours)...")
    result = subprocess.run(
        cmd,
        cwd=isolated_dir,  # Run FROM /tmp
        env=env,
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        print(f"‚ùå Evaluation failed with return code {result.returncode}")
        print(f"STDERR: {result.stderr}")
        raise RuntimeError(f"Evaluation failed: {result.stderr}")

    print("‚úÖ Evaluation complete!")
    print(f"Output saved to: {output_file}")

    return isolated_dir, output_file, None

## Running Evaluations in Isolated Mode

**NEW APPROACH:** Run evaluations in isolated /tmp directories to prevent filesystem access to test_cases.yaml.

**Why?** Early evaluations revealed that Claude and Codex were accessing the project directory and reading test_cases.yaml, which contains the expected outputs. Goose did not do this. Running in isolated directories ensures a fair comparison.

### Run Codex Evaluation

In [ ]:
# Codex run: executed in the foreground inside a fresh isolated directory.
# The Codex config is expected to disable the shell tool (MCP-only mode).
isolated_dir, output_file, _ = run_isolated_eval(
    agent="codex",
    config_path="project/generated/literature_mcp_eval_config_codex.yaml",
    background=False,
)

### Run Claude Evaluation

# Claude run: executed in the foreground inside a fresh isolated directory.
# Note: point config_path at the Claude GPT-5 config once that file exists.
isolated_dir, output_file, _ = run_isolated_eval(
    agent="claude",
    config_path="project/generated/literature_mcp_eval_config_claude.yaml",  # Update if needed
    background=False,
)

## Verify Isolation

After evaluations complete, verify that agents did NOT access project files:

In [2]:
def verify_no_project_access(result_file: str):
    """
    Scan an evaluation result file for traces of project-directory access.

    The file's text is searched for path fragments that point at the
    project (test_cases.yaml, project/, results/, notebook/). A report is
    printed either way.

    Args:
        result_file: Path to evaluation result YAML file

    Returns:
        True when none of the fragments occur in the file; False when at
        least one occurs or when the result file does not exist.
    """
    banner = "=" * 80
    print(banner)
    print(f"Verifying isolation for: {result_file}")
    print(banner)

    report = Path(result_file)
    if not report.exists():
        print(f"‚ùå Result file not found: {result_file}")
        return False

    text = report.read_text()

    # Fragments whose presence suggests the agent touched project files
    markers = (
        "test_cases.yaml",
        "project/test_cases.yaml",
        f"{project_root}/project",
        f"{project_root}/results",
        f"{project_root}/notebook",
        "mcp_literature_eval/project",
        "mcp_literature_eval/results",
    )

    # Pair each fragment that occurs with its number of occurrences
    hits = [(marker, text.count(marker)) for marker in markers if marker in text]

    if not hits:
        print("\n‚úÖ NO PROJECT ACCESS DETECTED")
        print("The agent did NOT access any files in:")
        for folder in ("project/", "results/", "notebook/"):
            print(f"  - {folder}")
        print("\n‚úì Isolation was effective!")
        return True

    print("\n‚ùå PROJECT ACCESS DETECTED!")
    print(f"\nFound {len(hits)} suspicious patterns:")
    for marker, occurrences in hits:
        print(f"  - '{marker}': {occurrences} occurrences")

    print("\n‚ö†Ô∏è  The agent accessed files in the project directory!")
    print("This means the isolation was NOT effective.")
    return False

# Example usage after evaluation completes:
# verify_no_project_access("results/compare_agents/claude_gpt5_isolated_20251207.yaml")
# verify_no_project_access("results/compare_agents/codex_gpt5_isolated_20251207.yaml")