In [2]:
#!/usr/bin/env python
"""
Comprehensive Pointing Evaluation Pipeline
Evaluates zero-shot and few-shot (with/without hard negatives) pointing performance
"""

import os
import sys
import json
from pathlib import Path
from datetime import datetime

# Add src to path
ROOT_DIR = "../.."
sys.path.append(os.path.join(ROOT_DIR, 'src'))

# Load API keys
with open(f"{ROOT_DIR}/API_KEYS2.json", "r") as file:
    api_keys = json.load(file)

os.environ['OPENAI_API_KEY'] = api_keys['OPENAI_API_KEY']
os.environ['ANTHROPIC_API_KEY'] = api_keys['ANTHROPIC_API_KEY']
os.environ['GOOGLE_API_KEY'] = api_keys['GOOGLE_API_KEY']

# Import from endopoint package
from endopoint.datasets.cholecseg8k import CholecSeg8kAdapter
from endopoint.eval import PointingEvaluator

# Import dataset and few-shot utilities
from datasets import load_dataset
from few_shot_selection import (
    load_balanced_indices,
    load_fewshot_plan,
)

print("✓ Environment setup complete")

✓ Environment setup complete


In [3]:
def main(num_samples=None, models=None, use_cache=True):
    """Main evaluation function.
    
    Args:
        num_samples: Optional number of samples to evaluate (uses linspace to select subset).
                    If None, uses all test samples.
        models: Optional list of model names to evaluate. If None, uses default models.
        use_cache: Whether to use cache for model responses (default: True).
                  Set to False to bypass cache (useful for testing changes).
    """
    
    # Configuration
    DEFAULT_MODELS = [
        "gpt-4o-mini",
        "claude-3-5-sonnet-20241022",
        "gemini-2.0-flash-exp"
    ]
    
    MODELS = models if models is not None else DEFAULT_MODELS
    
    # Data directories
    data_dir = Path(ROOT_DIR) / "data_info" / "cholecseg8k"
    
    # Test indices file
    test_indices_file = str(data_dir / "balanced_indices_train_100.json")
    
    # Few-shot plan files
    fewshot_plan_files = {
        "standard": str(data_dir / "fewshot_plan_train_pos1_neg1_seed43_excl100.json"),
        "hard_negatives": str(data_dir / "fewshot_plan_train_pos1_neg1_nearmiss1_seed45_excl100.json"),
    }
    
    # Check if files exist
    if not Path(test_indices_file).exists():
        print(f"❌ Test indices file not found: {test_indices_file}")
        print("Please run prepare_fewshot_examples.py first")
        return
    
    for plan_name, plan_file in fewshot_plan_files.items():
        if not Path(plan_file).exists():
            print(f"❌ Few-shot plan file not found: {plan_file}")
            print("Please run prepare_fewshot_examples.py first")
            return
    
    print("\n" + "="*60)
    print("Starting Pointing Evaluation")
    print("="*60)
    print(f"Models to evaluate: {', '.join(MODELS)}")
    
    # Load dataset
    print("\n📊 Loading CholecSeg8k dataset...")
    dataset = load_dataset("minwoosun/CholecSeg8k")
    print(f"✓ Dataset loaded")
    
    # Load test indices
    test_indices = load_balanced_indices(test_indices_file)
    print(f"✓ Loaded {len(test_indices)} test samples")
    
    # Select subset using linspace if requested
    if num_samples is not None and num_samples < len(test_indices):
        import numpy as np
        # Use linspace to get evenly spaced indices
        selected_idx = np.linspace(0, len(test_indices) - 1, num_samples, dtype=int)
        test_indices = [test_indices[i] for i in selected_idx]
        print(f"📌 Selected {len(test_indices)} evenly spaced samples for evaluation")
    
    # Load few-shot plans
    fewshot_plans = {}
    for plan_name, plan_file in fewshot_plan_files.items():
        fewshot_plans[plan_name] = load_fewshot_plan(plan_file)
        print(f"✓ Loaded few-shot plan: {plan_name}")
    
    # Create output directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path(ROOT_DIR) / "results" / f"pointing_{timestamp}"
    
    # Initialize evaluator
    evaluator = PointingEvaluator(
        models=MODELS,
        dataset=dataset,
        dataset_adapter=CholecSeg8kAdapter(),
        canvas_width=768,
        canvas_height=768,
        output_dir=output_dir,
        use_cache=use_cache,
    )
    
    # Run evaluation
    results = evaluator.run_full_evaluation(
        test_indices=test_indices,
        fewshot_plans=fewshot_plans,
    )
    
    print("\n✨ Evaluation complete!")
    
    # Print final summary
    print("\n" + "="*60)
    print("Final Summary")
    print("="*60)
    
    for model_name in MODELS:
        print(f"\n{model_name}:")
        model_results = results[model_name]
        for eval_type in ["zero_shot", "few_shot_standard", "few_shot_hard_negatives"]:
            if eval_type in model_results:
                metrics = model_results[eval_type]["metrics"]
                print(f"  {eval_type:25} Acc: {metrics['overall_accuracy']:.3f}, F1: {metrics['avg_f1']:.3f}")


In [4]:
# Cell 2: Call the function directly
main(num_samples=5, use_cache=False)


Starting Pointing Evaluation
Models to evaluate: gpt-4o-mini, claude-3-5-sonnet-20241022, gemini-2.0-flash-exp

📊 Loading CholecSeg8k dataset...
✓ Dataset loaded
✓ Loaded 100 test samples
📌 Selected 5 evenly spaced samples for evaluation
✓ Loaded few-shot plan: standard
✓ Loaded few-shot plan: hard_negatives
Initialized evaluator:
  Models: ['gpt-4o-mini', 'claude-3-5-sonnet-20241022', 'gemini-2.0-flash-exp']
  Organs: 12
  Canvas: 768x768
  Output: ../../results/pointing_20250901_041511

📊 Evaluating 5 test samples

Evaluating: gpt-4o-mini

🔄 Running zero-shot with gpt-4o-mini...


gpt-4o-mini zero-shot: 100%|██████████| 5/5 [02:04<00:00, 24.81s/it]



🔄 Running few-shot (standard) with gpt-4o-mini...


gpt-4o-mini standard: 100%|██████████| 5/5 [01:53<00:00, 22.77s/it]



🔄 Running few-shot (hard_negatives) with gpt-4o-mini...


gpt-4o-mini hard_negatives: 100%|██████████| 5/5 [02:00<00:00, 24.09s/it]



📊 gpt-4o-mini Results:
  zero_shot:
    Overall Accuracy: 0.617
    Avg F1: 0.676
  few_shot_standard:
    Overall Accuracy: 0.583
    Avg F1: 0.615
  few_shot_hard_negatives:
    Overall Accuracy: 0.617
    Avg F1: 0.648

Evaluating: claude-3-5-sonnet-20241022

🔄 Running zero-shot with claude-3-5-sonnet-20241022...


claude-3-5-sonnet-20241022 zero-shot: 100%|██████████| 5/5 [02:27<00:00, 29.48s/it]



🔄 Running few-shot (standard) with claude-3-5-sonnet-20241022...


claude-3-5-sonnet-20241022 standard: 100%|██████████| 5/5 [02:27<00:00, 29.56s/it]



🔄 Running few-shot (hard_negatives) with claude-3-5-sonnet-20241022...


claude-3-5-sonnet-20241022 hard_negatives: 100%|██████████| 5/5 [02:31<00:00, 30.35s/it]



📊 claude-3-5-sonnet-20241022 Results:
  zero_shot:
    Overall Accuracy: 0.700
    Avg F1: 0.737
  few_shot_standard:
    Overall Accuracy: 0.700
    Avg F1: 0.737
  few_shot_hard_negatives:
    Overall Accuracy: 0.700
    Avg F1: 0.737

Evaluating: gemini-2.0-flash-exp

🔄 Running zero-shot with gemini-2.0-flash-exp...


gemini-2.0-flash-exp zero-shot: 100%|██████████| 5/5 [03:21<00:00, 40.30s/it]



🔄 Running few-shot (standard) with gemini-2.0-flash-exp...


gemini-2.0-flash-exp standard: 100%|██████████| 5/5 [03:46<00:00, 45.36s/it]



🔄 Running few-shot (hard_negatives) with gemini-2.0-flash-exp...


gemini-2.0-flash-exp hard_negatives: 100%|██████████| 5/5 [03:55<00:00, 47.17s/it]



📊 gemini-2.0-flash-exp Results:
  zero_shot:
    Overall Accuracy: 0.800
    Avg F1: 0.749
  few_shot_standard:
    Overall Accuracy: 0.583
    Avg F1: 0.577
  few_shot_hard_negatives:
    Overall Accuracy: 0.567
    Avg F1: 0.504

💾 Saving results to ../../results/pointing_20250901_041511


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.