# Experiment TextualVerifier Using Best Sample

In [1]:
import ast
import time
import json
import tiktoken
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from threading import Lock
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Optional

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.verifier import TextualVerifierExperiment

## Load Dataset

In [2]:
sample = pd.read_csv("dataset/sample/prm800k-03-algo3-clean.csv")
sample

Unnamed: 0,id,labeler,timestamp,problem,ground_truth_answer,total_steps,steps,neg_1,zero,pos_1
0,1,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T14:37:13.296218,There are an infinite number of vectors $\math...,\begin{pmatrix} -7 \\ 16 \\ 5 \end{pmatrix},34,"[{'text': ""Let's set $\\mathbf{v} = \\begin{pm...",19,6,9
1,2,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T13:26:58.414691,When rolling a certain unfair six-sided die wi...,29,35,"[{'text': ""Well, let's think about this for a ...",18,1,16
2,3,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-31T14:39:30.588403,Find all solutions to\n\[\sin \left( \tan^{-1}...,3 \pm 2 \sqrt{2},34,"[{'text': ""Let's set $y = \\tan^{-1} x$."", 'ra...",11,1,22
3,4,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-29T07:48:01.714041,The solutions of the equation $z^4+4z^3i-6z^2-...,11,40,[{'text': 'There is a formula for the area of ...,16,2,21
4,5,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-22T20:02:50.866783,A sequence $(a_n)$ is defined as follows:\n\[a...,-1,36,"[{'text': ""So we're given that $a_{i + 1} = \\...",7,3,26
...,...,...,...,...,...,...,...,...,...,...
66,440,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-28T08:12:20.344377,Find the product $CD$ of the integers $C$ and ...,-5,17,[{'text': 'I think the first step here is to f...,3,0,14
67,442,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-31T22:47:06.498122,What real values of $x$ are not in the domain ...,-4,31,[{'text': 'To find values of $x$ that are not ...,1,0,30
68,444,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-24T10:40:50.685197,How many license plates can be formed if every...,58500,14,[{'text': 'So we need to count the number of p...,2,2,10
69,445,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T11:25:46.657657,"If $f(x)=5x^2+3x+4$, what is the value of $f(-...",18,7,"[{'text': 'To find f(-2), we just need to plug...",1,0,6


## Experiment

In [3]:
engine = get_engine("gemini-1.5-pro")
tg.set_backward_engine("gemini-1.5-pro", override=True)
model_name="gemini-1.5-pro"

  from .autonotebook import tqdm as notebook_tqdm


### Experiment Tracker Class

In [4]:
@dataclass
class LLMCallMetrics:
    """Track individual LLM calls"""
    call_id: str
    stage: str  # 'variant_generation', 'voting', 'decision'
    step_index: int
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    latency_ms: float
    timestamp: float
    success: bool
    error_message: Optional[str] = None

@dataclass
class StepMetrics:
    """Track metrics for each step verification"""
    step_index: int
    original_step: str
    variants_generated: int
    variants_successful: int
    selected_variant: str
    selection_reason: str
    step_processing_time_ms: float
    llm_calls: List[LLMCallMetrics]

@dataclass
class DetailedStepResult:
    """Capture detailed step verification results"""
    original: str
    generated_variants: List[Dict[str, str]]
    verified: str

@dataclass
class DetailedExperimentResult:
    """Detailed experiment result with step-by-step breakdown"""
    steps: List[DetailedStepResult]

@dataclass
class ExperimentMetrics:
    """Comprehensive experiment tracking"""
    # Basic Info
    experiment_id: str
    problem_id: str
    problem_text: str
    timestamp: float
    
    # Input/Output
    original_solution: str
    verified_solution: str
    final_decision: str  # 'REPLACE' or 'SUFFICIENT'
    
    # Performance Metrics
    total_processing_time_ms: float
    total_llm_calls: int
    total_tokens: int
    total_prompt_tokens: int
    total_completion_tokens: int
    
    # Step-by-Step Metrics
    total_steps: int
    steps_modified: int
    step_metrics: List[StepMetrics]
    
    # Quality Metrics
    original_has_errors: bool
    verified_fixes_errors: bool
    improvement_score: float  # -1 to 1, where 1 is significant improvement
    
    # Cost Estimation (if using paid APIs)
    estimated_cost_usd: float
    
    # Detailed Logs
    stage_outputs: Dict[str, Any]
    error_log: List[str]

class ExperimentTracker:
    """Robust tracking for textual verifier experiments - supports all model types"""
    
    def __init__(self, model_name: str = "gpt-3.5-turbo", enable_token_counting: bool = True):
        self.model_name = model_name
        self.enable_token_counting = enable_token_counting
        self.current_experiment: Optional[ExperimentMetrics] = None
        self.all_experiments: List[ExperimentMetrics] = []
        
        # Initialize token counter with fallback support
        self.encoding = None
        if enable_token_counting:
            self.encoding = self._get_safe_encoding(model_name)
        
        # Enhanced token pricing with more models
        self.token_prices = {
            # OpenAI models
            "gpt-3.5-turbo": {"prompt": 0.001/1000, "completion": 0.002/1000},
            "gpt-4": {"prompt": 0.03/1000, "completion": 0.06/1000},
            "gpt-4-turbo": {"prompt": 0.01/1000, "completion": 0.03/1000},
            "gpt-4o": {"prompt": 0.005/1000, "completion": 0.015/1000},
            
            # Anthropic models
            "claude-3-haiku": {"prompt": 0.00025/1000, "completion": 0.00125/1000},
            "claude-3-sonnet": {"prompt": 0.003/1000, "completion": 0.015/1000},
            "claude-3-opus": {"prompt": 0.015/1000, "completion": 0.075/1000},
            "claude-3.5-sonnet": {"prompt": 0.003/1000, "completion": 0.015/1000},
            
            # Google models (estimated pricing)
            "gemini-1.5-pro": {"prompt": 0.0035/1000, "completion": 0.0105/1000},
            "gemini-1.5-flash": {"prompt": 0.00075/1000, "completion": 0.003/1000},
            "gemini-pro": {"prompt": 0.0005/1000, "completion": 0.0015/1000},
            
            # Default fallback
            "default": {"prompt": 0.001/1000, "completion": 0.003/1000}
        }
    
    def _get_safe_encoding(self, model_name: str):
        """Get encoding with safe fallbacks for unsupported models"""
        try:
            # Try to get model-specific encoding
            return tiktoken.encoding_for_model(model_name)
        except KeyError:
            print(f"INFO: No specific tokenizer for {model_name}, using fallback")
            try:
                # Common fallbacks for different model families
                if "gpt-4" in model_name.lower():
                    return tiktoken.encoding_for_model("gpt-4")
                elif "gpt-3.5" in model_name.lower():
                    return tiktoken.encoding_for_model("gpt-3.5-turbo")
                elif "claude" in model_name.lower():
                    # Claude uses similar tokenization to GPT-4
                    return tiktoken.get_encoding("cl100k_base")
                elif "gemini" in model_name.lower():
                    # Use general tokenizer for Gemini
                    return tiktoken.get_encoding("cl100k_base")
                else:
                    # Universal fallback
                    return tiktoken.get_encoding("cl100k_base")
            except Exception as e:
                print(f"WARN: Could not initialize tokenizer: {e}")
                return None
    
    def count_tokens(self, text: str) -> int:
        """Count tokens with robust fallback estimation"""
        if not text:
            return 0
            
        text = str(text)  # Ensure it's a string
        
        if not self.enable_token_counting or self.encoding is None:
            # Fallback: estimate based on word count
            # Different models have different token-to-word ratios
            words = len(text.split())
            if "gemini" in self.model_name.lower():
                return int(words * 1.2)  # Gemini tends to be more efficient
            elif "claude" in self.model_name.lower():
                return int(words * 1.3)  # Claude similar to GPT
            else:
                return int(words * 1.3)  # General estimation
        
        try:
            return len(self.encoding.encode(text))
        except Exception as e:
            print(f"WARN: Token counting failed: {e}")
            # Final fallback: character-based estimation
            return len(text) // 4  # Rough approximation: 4 chars per token
    
    def get_token_price(self, model_name: str) -> Dict[str, float]:
        """Get token pricing with fallback"""
        # Direct match
        if model_name in self.token_prices:
            return self.token_prices[model_name]
        
        # Fuzzy matching for model families
        model_lower = model_name.lower()
        for key in self.token_prices:
            if key.replace("-", "").replace(".", "") in model_lower.replace("-", "").replace(".", ""):
                return self.token_prices[key]
        
        # Final fallback
        print(f"INFO: Using default pricing for unknown model: {model_name}")
        return self.token_prices["default"]
    
    def start_experiment(self, problem_id: str, problem_text: str, original_solution: str) -> str:
        """Start tracking a new experiment"""
        experiment_id = f"exp_{int(time.time())}_{problem_id}"
        
        self.current_experiment = ExperimentMetrics(
            experiment_id=experiment_id,
            problem_id=problem_id,
            problem_text=problem_text,
            timestamp=time.time(),
            original_solution=original_solution,
            verified_solution="",
            final_decision="",
            total_processing_time_ms=0,
            total_llm_calls=0,
            total_tokens=0,
            total_prompt_tokens=0,
            total_completion_tokens=0,
            total_steps=0,
            steps_modified=0,
            step_metrics=[],
            original_has_errors=False,
            verified_fixes_errors=False,
            improvement_score=0.0,
            estimated_cost_usd=0.0,
            stage_outputs={},
            error_log=[]
        )
        
        return experiment_id
    
    def track_llm_call(self, stage: str, step_index: int, prompt: str, 
                      response: str, latency_ms: float, success: bool = True, 
                      error: str = None) -> LLMCallMetrics:
        """Track individual LLM call with robust token counting"""
        call_id = f"{stage}_{step_index}_{int(time.time())}"
        
        # Safe token counting
        try:
            prompt_tokens = self.count_tokens(prompt)
            completion_tokens = self.count_tokens(response) if response else 0
        except Exception as e:
            print(f"WARN: Token counting failed for call {call_id}: {e}")
            # Emergency fallback
            prompt_tokens = len(str(prompt).split()) if prompt else 0
            completion_tokens = len(str(response).split()) if response else 0
        
        total_tokens = prompt_tokens + completion_tokens
        
        call_metrics = LLMCallMetrics(
            call_id=call_id,
            stage=stage,
            step_index=step_index,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            latency_ms=latency_ms,
            timestamp=time.time(),
            success=success,
            error_message=error
        )
        
        # Update experiment totals
        if self.current_experiment:
            self.current_experiment.total_llm_calls += 1
            self.current_experiment.total_tokens += total_tokens
            self.current_experiment.total_prompt_tokens += prompt_tokens
            self.current_experiment.total_completion_tokens += completion_tokens
            
            # Update cost estimation
            try:
                prices = self.get_token_price(self.model_name)
                call_cost = (prompt_tokens * prices["prompt"] + 
                           completion_tokens * prices["completion"])
                self.current_experiment.estimated_cost_usd += call_cost
            except Exception as e:
                print(f"WARN: Cost calculation failed: {e}")
        
        return call_metrics
    
    def track_step_verification(self, step_index: int, original_step: str, 
                              variants: List[str], selected_variant: str,
                              selection_reason: str, processing_time_ms: float,
                              llm_calls: List[LLMCallMetrics]):
        """Track step-level verification metrics"""
        step_metrics = StepMetrics(
            step_index=step_index,
            original_step=original_step,
            variants_generated=len(variants),
            variants_successful=len([v for v in variants if v and v.strip()]),
            selected_variant=selected_variant,
            selection_reason=selection_reason,
            step_processing_time_ms=processing_time_ms,
            llm_calls=llm_calls
        )
        
        if self.current_experiment:
            self.current_experiment.step_metrics.append(step_metrics)
            if original_step != selected_variant:
                self.current_experiment.steps_modified += 1
    
    def track_stage_output(self, stage: str, output: Any):
        """Track outputs from different stages"""
        if self.current_experiment:
            self.current_experiment.stage_outputs[stage] = output
    
    def track_error(self, error: str):
        """Track errors during processing"""
        if self.current_experiment:
            self.current_experiment.error_log.append(f"{time.time()}: {error}")
    
    def finish_experiment(self, verified_solution: str, final_decision: str,
                         total_processing_time_ms: float, improvement_score: float = 0.0):
        """Complete experiment tracking"""
        if self.current_experiment:
            self.current_experiment.verified_solution = verified_solution
            self.current_experiment.final_decision = final_decision
            self.current_experiment.total_processing_time_ms = total_processing_time_ms
            self.current_experiment.total_steps = len(self.current_experiment.step_metrics)
            self.current_experiment.improvement_score = improvement_score
            
            # Add to completed experiments
            self.all_experiments.append(self.current_experiment)
            self.current_experiment = None
    
    def get_summary_stats(self) -> Dict[str, Any]:
        """Get summary statistics across all experiments"""
        if not self.all_experiments:
            return {}
        
        total_experiments = len(self.all_experiments)
        successful_experiments = [exp for exp in self.all_experiments if not exp.error_log]
        
        return {
            "total_experiments": total_experiments,
            "successful_experiments": len(successful_experiments),
            "success_rate": len(successful_experiments) / total_experiments,
            "total_llm_calls": sum(exp.total_llm_calls for exp in self.all_experiments),
            "total_tokens": sum(exp.total_tokens for exp in self.all_experiments),
            "total_cost_usd": sum(exp.estimated_cost_usd for exp in self.all_experiments),
            "avg_processing_time_ms": sum(exp.total_processing_time_ms for exp in self.all_experiments) / total_experiments,
            "replacement_rate": sum(1 for exp in self.all_experiments if exp.final_decision == "REPLACE") / total_experiments,
            "avg_steps_per_problem": sum(exp.total_steps for exp in self.all_experiments) / total_experiments,
            "avg_modifications_per_problem": sum(exp.steps_modified for exp in self.all_experiments) / total_experiments,
            "avg_improvement_score": sum(exp.improvement_score for exp in self.all_experiments) / total_experiments,
            "avg_tokens_per_call": sum(exp.total_tokens for exp in self.all_experiments) / sum(exp.total_llm_calls for exp in self.all_experiments) if sum(exp.total_llm_calls for exp in self.all_experiments) > 0 else 0,
            "model_name": self.model_name
        }
    
    def export_detailed_results(self, filename: str):
        """Export detailed results to JSON with error handling"""
        try:
            export_data = {
                "summary": self.get_summary_stats(),
                "experiments": [asdict(exp) for exp in self.all_experiments],
                "metadata": {
                    "export_timestamp": time.time(),
                    "model_name": self.model_name,
                    "token_counting_enabled": self.enable_token_counting,
                    "total_experiments": len(self.all_experiments)
                }
            }
            
            with open(filename, 'w') as f:
                json.dump(export_data, f, indent=2, default=str)
            
            print(f"✓ Results exported successfully to {filename}")
            
        except Exception as e:
            print(f"✗ Export failed: {e}")
            # Try simplified export
            try:
                simple_data = {
                    "summary": self.get_summary_stats(),
                    "experiment_count": len(self.all_experiments),
                    "error": str(e)
                }
                fallback_filename = f"fallback_{filename}"
                with open(fallback_filename, 'w') as f:
                    json.dump(simple_data, f, indent=2, default=str)
                print(f"✓ Fallback export saved to {fallback_filename}")
            except Exception as e2:
                print(f"✗ Even fallback export failed: {e2}")
    
    def print_experiment_summary(self, experiment_id: str = None):
        """Print summary for specific experiment or latest"""
        exp = None
        if experiment_id:
            exp = next((e for e in self.all_experiments if e.experiment_id == experiment_id), None)
        else:
            exp = self.all_experiments[-1] if self.all_experiments else None
        
        if not exp:
            print("No experiment found")
            return
        
        print(f"\n{'='*60}")
        print(f"EXPERIMENT SUMMARY: {exp.experiment_id}")
        print(f"{'='*60}")
        print(f"Model: {self.model_name}")
        print(f"Problem: {exp.problem_text[:100]}...")
        print(f"Total Steps: {exp.total_steps}")
        print(f"Steps Modified: {exp.steps_modified}")
        print(f"Final Decision: {exp.final_decision}")
        print(f"Processing Time: {exp.total_processing_time_ms:.2f}ms")
        print(f"LLM Calls: {exp.total_llm_calls}")
        print(f"Total Tokens: {exp.total_tokens:,}")
        print(f"Estimated Cost: ${exp.estimated_cost_usd:.4f}")
        print(f"Improvement Score: {exp.improvement_score:.2f}")
        
        if exp.error_log:
            print(f"\nErrors ({len(exp.error_log)}):")
            for error in exp.error_log[-3:]:  # Show last 3 errors
                print(f"  - {error}")
        
        print(f"\nStep Breakdown:")
        for i, step in enumerate(exp.step_metrics):
            modified = "✓" if step.original_step != step.selected_variant else "○"
            print(f"  Step {i+1} {modified}: {step.variants_generated} variants, "
                  f"{step.step_processing_time_ms:.1f}ms, "
                  f"{len(step.llm_calls)} LLM calls")

    def print_batch_summary(self):
        """Print comprehensive batch summary"""
        stats = self.get_summary_stats()
        if not stats:
            print("No experiments completed yet")
            return
        
        print(f"\n{'='*60}")
        print(f"BATCH EXPERIMENT SUMMARY")
        print(f"{'='*60}")
        print(f"Model: {self.model_name}")
        print(f"Total Experiments: {stats['total_experiments']}")
        print(f"Successful: {stats['successful_experiments']}")
        print(f"Success Rate: {stats['success_rate']:.1%}")
        print(f"Total LLM Calls: {stats['total_llm_calls']:,}")
        print(f"Total Tokens: {stats['total_tokens']:,}")
        print(f"Total Cost: ${stats['total_cost_usd']:.4f}")
        print(f"Avg Processing Time: {stats['avg_processing_time_ms']:.0f}ms")
        print(f"Replacement Rate: {stats['replacement_rate']:.1%}")
        print(f"Avg Steps per Problem: {stats['avg_steps_per_problem']:.1f}")
        print(f"Avg Modifications per Problem: {stats['avg_modifications_per_problem']:.1f}")
        print(f"Avg Tokens per Call: {stats['avg_tokens_per_call']:.0f}")

class EnhancedExperimentTracker(ExperimentTracker):
    """Enhanced tracker that captures detailed step information"""
    
    def __init__(self, model_name: str = "gpt-3.5-turbo", enable_token_counting: bool = True):
        super().__init__(model_name, enable_token_counting)
        self.current_detailed_result = None
    
    def start_detailed_experiment(self, problem_id: str, problem_text: str, original_solution: str):
        """Start tracking with detailed step capture"""
        experiment_id = self.start_experiment(problem_id, problem_text, original_solution)
        self.current_detailed_result = DetailedExperimentResult(steps=[])
        return experiment_id
    
    def track_detailed_step_verification(self, step_index: int, original_step: str, 
                                       variants: List[str], selected_variant: str,
                                       selection_reason: str, processing_time_ms: float,
                                       llm_calls: List[LLMCallMetrics]):
        """Track step verification with detailed capture"""
        # Call parent method
        super().track_step_verification(
            step_index, original_step, variants, selected_variant,
            selection_reason, processing_time_ms, llm_calls
        )
        
        # Add detailed tracking
        if self.current_detailed_result:
            detailed_step = DetailedStepResult(
                original=original_step,
                generated_variants=[{"text": variant} for variant in variants],
                verified=selected_variant
            )
            self.current_detailed_result.steps.append(detailed_step)
    
    def get_detailed_result(self) -> Dict:
        """Get the detailed result in the requested format"""
        if self.current_detailed_result:
            return {
                "steps": [
                    {
                        "original": step.original,
                        "generated_variants": step.generated_variants,
                        "verified": step.verified
                    }
                    for step in self.current_detailed_result.steps
                ]
            }
        return {"steps": []}

# Safe context manager for experiments
class DetailedSafeExperimentContext:
    """Context manager for detailed experiments"""
    
    def __init__(self, tracker: EnhancedExperimentTracker, problem_id: str, problem_text: str, original_solution: str):
        self.tracker = tracker
        self.problem_id = problem_id
        self.problem_text = problem_text
        self.original_solution = original_solution
        self.start_time = None
        self.experiment_id = None
    
    def __enter__(self):
        self.start_time = time.time()
        self.experiment_id = self.tracker.start_detailed_experiment(
            self.problem_id, self.problem_text, self.original_solution
        )
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        total_time = (time.time() - self.start_time) * 1000 if self.start_time else 0
        
        if exc_type is not None:
            # Error occurred
            self.tracker.track_error(f"Experiment failed: {exc_val}")
            self.tracker.finish_experiment(
                verified_solution="",
                final_decision="ERROR",
                total_processing_time_ms=total_time,
                improvement_score=-1.0
            )
        elif self.tracker.current_experiment:
            # Experiment wasn't manually finished
            self.tracker.finish_experiment(
                verified_solution="",
                final_decision="INCOMPLETE",
                total_processing_time_ms=total_time,
                improvement_score=0.0
            )

# Additional Analysis Functions (updated for robustness)
def analyze_variant_quality(tracker: ExperimentTracker) -> Dict[str, Any]:
    """Analyze variant generation quality with error handling"""
    try:
        all_steps = []
        for exp in tracker.all_experiments:
            all_steps.extend(exp.step_metrics)
        
        if not all_steps:
            return {"error": "No step metrics available"}
        
        total_variants = sum(s.variants_generated for s in all_steps)
        total_successful = sum(s.variants_successful for s in all_steps)
        
        return {
            "total_steps_analyzed": len(all_steps),
            "avg_variants_per_step": total_variants / len(all_steps),
            "variant_success_rate": total_successful / total_variants if total_variants > 0 else 0,
            "steps_with_multiple_variants": len([s for s in all_steps if s.variants_generated > 1]),
            "modification_rate": len([s for s in all_steps if s.original_step != s.selected_variant]) / len(all_steps)
        }
    except Exception as e:
        return {"error": f"Analysis failed: {e}"}

def analyze_efficiency_patterns(tracker: ExperimentTracker) -> Dict[str, Any]:
    """Analyze efficiency patterns with error handling"""
    try:
        if not tracker.all_experiments:
            return {"error": "No experiments available"}
        
        successful_experiments = [exp for exp in tracker.all_experiments if not exp.error_log]
        efficient_experiments = [exp for exp in successful_experiments 
                               if exp.total_processing_time_ms < 30000]  # < 30 seconds
        
        total_tokens = sum(exp.total_tokens for exp in successful_experiments)
        total_steps = sum(exp.total_steps for exp in successful_experiments)
        total_calls = sum(exp.total_llm_calls for exp in successful_experiments)
        
        return {
            "total_successful_experiments": len(successful_experiments),
            "efficiency_rate": len(efficient_experiments) / len(successful_experiments) if successful_experiments else 0,
            "avg_tokens_per_step": total_tokens / total_steps if total_steps > 0 else 0,
            "avg_calls_per_step": total_calls / total_steps if total_steps > 0 else 0,
            "avg_processing_time_ms": sum(exp.total_processing_time_ms for exp in successful_experiments) / len(successful_experiments) if successful_experiments else 0,
            "fastest_experiment_ms": min(exp.total_processing_time_ms for exp in successful_experiments) if successful_experiments else 0,
            "slowest_experiment_ms": max(exp.total_processing_time_ms for exp in successful_experiments) if successful_experiments else 0
        }
    except Exception as e:
        return {"error": f"Efficiency analysis failed: {e}"}

# Quick integration function for immediate use
def create_robust_tracker(model_name: str, enable_token_counting: bool = True) -> ExperimentTracker:
    """Create a tracker with robust error handling"""
    try:
        tracker = ExperimentTracker(model_name=model_name, enable_token_counting=enable_token_counting)
        print(f"✓ Tracker initialized successfully for model: {model_name}")
        return tracker
    except Exception as e:
        print(f"WARN: Tracker initialization had issues: {e}")
        print("Continuing with fallback configuration...")
        return ExperimentTracker(model_name=model_name, enable_token_counting=False)

if __name__ == "__main__":
    # Test the robust tracker
    tracker = create_robust_tracker("gemini-1.5-pro")
    
    # Test token counting
    test_text = "This is a test sentence for token counting."
    tokens = tracker.count_tokens(test_text)
    print(f"Test token count: {tokens} tokens for: '{test_text}'")
    
    print("✓ Robust tracker ready for use!")

INFO: No specific tokenizer for gemini-1.5-pro, using fallback
✓ Tracker initialized successfully for model: gemini-1.5-pro
Test token count: 9 tokens for: 'This is a test sentence for token counting.'
✓ Robust tracker ready for use!


### Expriment Iterations

In [5]:
def format_steps(steps):
    formatted_steps = ""
    for step in steps:
        new_step = f"<Step>{step['text']}</Step>\n"
        formatted_steps += new_step
    return formatted_steps

In [6]:
# Thread-safe results collection
results_lock = Lock()
results = []

def detailed_thread_safe_evaluate_sample(row_dict, engine, model_name="gemini-1.5-pro"):
    """
    Thread-safe wrapper for evaluation with detailed result capture
    """
    try:
        # Use enhanced tracker
        tracker = EnhancedExperimentTracker(model_name, enable_token_counting=True)
        
        # Extract problem data
        problem = row_dict['problem']
        steps_list = ast.literal_eval(row_dict['steps']) if isinstance(row_dict['steps'], str) else row_dict['steps']
        solution_steps = format_steps(steps_list)
        problem_id = f"problem_{hash(problem)}"
        
        # Use detailed context manager
        with DetailedSafeExperimentContext(tracker, problem_id, problem, solution_steps) as ctx:
            # Create TextGrad variables
            question = Variable(problem, requires_grad=True, role_description="math question")
            solution = Variable(solution_steps, requires_grad=True, role_description="solution")
            verification_prompt = Variable("Verify and improve this mathematical solution step by step.",
                                          requires_grad=False, role_description="verification prompt")
            
            # Create detailed verifier
            verifier = TextualVerifierExperiment(
                verifier_engine=engine,
                step_eval_iterations=3,
                logger=False,
                tracker=tracker
            )
            
            # Perform verification
            verified_result = verifier.verify(
                instance=question,
                prompt=verification_prompt,
                calculation=solution
            )
            
            # Calculate metrics
            improvement_score = 0.5 if verified_result.value != solution_steps else 0.0
            final_decision = "REPLACE" if verified_result.value != solution_steps else "SUFFICIENT"
            
            # Get detailed result
            detailed_result = tracker.get_detailed_result()
            
            # Manually finish the experiment
            if tracker.current_experiment:
                total_time = (time.time() - ctx.start_time) * 1000
                tracker.finish_experiment(
                    verified_solution=verified_result.value,
                    final_decision=final_decision,
                    total_processing_time_ms=total_time,
                    improvement_score=improvement_score
                )
            
            # Extract metrics for return
            experiment_data = {
                'problem_id': problem_id,
                'original_problem': problem,
                'original_solution': solution_steps,
                'verified_solution': verified_result.value,
                'final_decision': final_decision,
                'improvement_score': improvement_score,
                'processing_time_ms': tracker.all_experiments[-1].total_processing_time_ms if tracker.all_experiments else 0,
                'total_llm_calls': tracker.all_experiments[-1].total_llm_calls if tracker.all_experiments else 0,
                'total_tokens': tracker.all_experiments[-1].total_tokens if tracker.all_experiments else 0,
                'estimated_cost': tracker.all_experiments[-1].estimated_cost_usd if tracker.all_experiments else 0,
                'steps_processed': tracker.all_experiments[-1].total_steps if tracker.all_experiments else 0,
                'steps_modified': tracker.all_experiments[-1].steps_modified if tracker.all_experiments else 0,
                'success': True,
                'error_message': None,
                'result': detailed_result  # Add the detailed result here
            }
            
            return experiment_data
            
    except Exception as e:
        # Return error information
        return {
            'problem_id': f"problem_{hash(row_dict.get('problem', 'unknown'))}",
            'original_problem': row_dict.get('problem', ''),
            'original_solution': '',
            'verified_solution': '',
            'final_decision': 'ERROR',
            'improvement_score': -1.0,
            'processing_time_ms': 0,
            'total_llm_calls': 0,
            'total_tokens': 0,
            'estimated_cost': 0,
            'steps_processed': 0,
            'steps_modified': 0,
            'success': False,
            'error_message': str(e),
            'result': {"steps": []}  # Empty result for errors
        }

### Experiment Running Concurrently

In [7]:
# Batch tracking aggregator
class BatchTracker:
    """Aggregate results from multiple thread-local trackers"""
    
    def __init__(self):
        self.results = []
        self.start_time = time.time()
    
    def add_result(self, result):
        with results_lock:
            self.results.append(result)
    
    def get_summary(self):
        if not self.results:
            return {}
        
        successful = [r for r in self.results if r['success']]
        failed = [r for r in self.results if not r['success']]
        
        return {
            'total_experiments': len(self.results),
            'successful': len(successful),
            'failed': len(failed),
            'success_rate': len(successful) / len(self.results) if self.results else 0,
            'total_processing_time': time.time() - self.start_time,
            'total_llm_calls': sum(r['total_llm_calls'] for r in successful),
            'total_tokens': sum(r['total_tokens'] for r in successful),
            'total_cost': sum(r['estimated_cost'] for r in successful),
            'replacement_rate': sum(1 for r in successful if r['final_decision'] == 'REPLACE') / len(successful) if successful else 0,
            'avg_processing_time_ms': sum(r['processing_time_ms'] for r in successful) / len(successful) if successful else 0,
            'avg_improvement_score': sum(r['improvement_score'] for r in successful) / len(successful) if successful else 0
        }
    
    def print_summary(self):
        summary = self.get_summary()
        if not summary:
            print("No results to summarize")
            return
        
        print(f"\n{'='*60}")
        print(f"CONCURRENT BATCH EXPERIMENT SUMMARY")
        print(f"{'='*60}")
        print(f"Total Experiments: {summary['total_experiments']}")
        print(f"Successful: {summary['successful']}")
        print(f"Failed: {summary['failed']}")
        print(f"Success Rate: {summary['success_rate']:.1%}")
        print(f"Total Processing Time: {summary['total_processing_time']:.1f}s")
        print(f"Total LLM Calls: {summary['total_llm_calls']:,}")
        print(f"Total Tokens: {summary['total_tokens']:,}")
        print(f"Total Cost: ${summary['total_cost']:.4f}")
        print(f"Replacement Rate: {summary['replacement_rate']:.1%}")
        print(f"Avg Processing Time: {summary['avg_processing_time_ms']:.0f}ms")
        print(f"Avg Improvement Score: {summary['avg_improvement_score']:.2f}")
        
        if summary['failed'] > 0:
            print(f"\nError Analysis:")
            error_results = [r for r in self.results if not r['success']]
            error_types = {}
            for error_result in error_results:
                error_msg = error_result['error_message']
                error_type = error_msg.split(':')[0] if error_msg else 'Unknown'
                error_types[error_type] = error_types.get(error_type, 0) + 1
            
            for error_type, count in error_types.items():
                print(f"  {error_type}: {count} occurrences")

In [8]:
def run_experiment_with_detailed_results():
    """Example of how to run the modified experiment"""
    results = []
    start_time = time.time()
    
    batch_tracker = BatchTracker()
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(detailed_thread_safe_evaluate_sample, row.to_dict(), engine, "gemini-1.5-pro") 
            for _, row in sample[6:7].iterrows()  # Your slice
        ]
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                result = future.result(timeout=300)
                if result is not None:
                    batch_tracker.add_result(result)
                    results.append(result)
            except Exception as e:
                print(f"Task failed: {e}")
                error_result = {
                    'success': False,
                    'error_message': str(e),
                    'final_decision': 'ERROR',
                    'result': {"steps": []}
                }
                batch_tracker.add_result(error_result)
                results.append(error_result)
    
    # Create DataFrame with detailed results
    experiment_df = pd.DataFrame(results)
    
    print(f"Completed in {time.time() - start_time:.1f} seconds")
    batch_tracker.print_summary()
    
    # The 'result' column now contains the detailed step information
    # Save results with detailed information
    experiment_df.to_csv('results/detailed-prm800k-03-algo3-clean-result.csv', index=False)
    
    # Example of accessing the detailed results
    if len(experiment_df) > 0:
        first_result = experiment_df.iloc[0]['result']
        print("\nExample detailed result structure:")
        print(json.dumps(first_result, indent=2))
    
    return experiment_df

In [9]:
run_experiment_with_detailed_results()

INFO: No specific tokenizer for gemini-1.5-pro, using fallback


Processing: 100%|██████████| 1/1 [00:00<00:00,  6.12it/s]

Completed in 0.2 seconds

CONCURRENT BATCH EXPERIMENT SUMMARY
Total Experiments: 1
Successful: 1
Failed: 0
Success Rate: 100.0%
Total Processing Time: 0.2s
Total LLM Calls: 61
Total Tokens: 112,035
Total Cost: $0.4857
Replacement Rate: 100.0%
Avg Processing Time: 181ms
Avg Improvement Score: 0.50

Example detailed result structure:
{
  "steps": [
    {
      "original": "There are a total of $7$ people, so there are $7!$ ways to seat all of them with no restrictions.",
      "generated_variants": [
        {
          "text": "Since we are arranging the 7 people around a circular table, there are $(7-1)! = 6!$ ways to seat them with no restrictions."
        },
        {
          "text": "Since we are arranging the 7 people around a circular table, there are $(7-1)! = 6!$ ways to seat them with no restrictions."
        },
        {
          "text": "Since we are arranging the 7 people around a circular table, there are $(7-1)! = 6!$ ways to seat them with no restrictions."
        }




Unnamed: 0,problem_id,original_problem,original_solution,verified_solution,final_decision,improvement_score,processing_time_ms,total_llm_calls,total_tokens,estimated_cost,steps_processed,steps_modified,success,error_message,result
0,problem_3502781112986903463,In how many ways can $7$ people sit around a r...,"<Step>There are a total of $7$ people, so ther...",<Step>Since we are arranging the 7 people arou...,REPLACE,0.5,181.216955,61,112035,0.485741,15,15,True,,{'steps': [{'original': 'There are a total of ...
