# Updated Experiment TV Using Best Sample

In [1]:
import ast
import hashlib
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, asdict
from pathlib import Path
from threading import Lock
from typing import List, Dict, Any, Optional

import pandas as pd
import tiktoken
from tqdm import tqdm

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.verifier import TextualVerifierWithTracker

ImportError: cannot import name 'TextualVerifierExperiment' from 'textgrad.verifier' (/Users/eugeniusms/Development/SKRIPSI/sevet/textgrad/textgrad/verifier/__init__.py)

## Load Dataset

In [None]:
# Load the sample problems used by the experiment. The path is relative to the
# notebook's working directory. Later cells read the 'problem' and 'steps'
# columns of this frame.
sample = pd.read_csv("dataset/sample/prm800k-03-algo3-clean.csv")
# Bare last expression so the notebook renders the loaded frame.
sample

## Experiment

In [None]:
# Name the model once and reuse it, so the verifier engine and the TextGrad
# backward engine are always configured with the same model string.
model_name = "gemini-1.5-pro"
engine = get_engine(model_name)
tg.set_backward_engine(model_name, override=True)

## Experiment Tracker

## Experiment Iterations

In [None]:
def format_steps(steps):
    """Wrap each step's text in ``<STEP>`` tags, one tagged step per line.

    Args:
        steps: Iterable of mappings, each providing a ``'text'`` entry.

    Returns:
        All tagged steps concatenated into a single string, each one followed
        by a newline. An empty iterable yields an empty string.
    """
    return "".join(f"<STEP>{item['text']}</STEP>\n" for item in steps)

In [None]:
# Thread-safe results collection
# NOTE(review): neither name is referenced by the functions visible in this
# notebook: run_experiment() builds its own local `results` list and never
# acquires the lock. Confirm nothing else depends on them before removing.
results_lock = Lock()
results = []

def _stable_problem_id(problem):
    """Build a problem identifier that is identical across kernel restarts.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    identifiers derived from it cannot be used to match rows between runs.
    A SHA-256 digest of the text (truncated to 16 hex characters) is stable.
    """
    digest = hashlib.sha256(str(problem).encode("utf-8")).hexdigest()
    return f"problem_{digest[:16]}"


def evaluate_sample(row_dict, engine):
    """Run the multi-perspective textual verifier on one dataset row.

    Args:
        row_dict: Mapping for a single row. ``'problem'`` holds the question
            text; ``'steps'`` holds the solution steps, either as a list of
            mappings with a ``'text'`` entry or as the string repr of such a
            list (parsed with ``ast.literal_eval``).
        engine: Object passed to ``TextualVerifierWithTracker`` as
            ``verifier_engine``.

    Returns:
        A dict with the keys ``problem_id``, ``original_problem``,
        ``original_solution``, ``verified_solution``, ``success``,
        ``error_message``, ``processing_time_ms``, ``total_llm_calls``,
        ``total_input_tokens``, ``total_output_tokens`` and ``result``.
        On success the metrics are copied from the verifier's tracker and
        ``result`` is the full tracker payload. If anything raises, the
        exception is not propagated: ``success`` is ``False``,
        ``error_message`` carries ``str(e)``, the metrics are zero and
        ``result`` is an empty dict.
    """
    try:
        # Extract problem data
        problem = row_dict['problem']
        raw_steps = row_dict['steps']
        # CSV round-trips store the step list as its string repr.
        steps_list = ast.literal_eval(raw_steps) if isinstance(raw_steps, str) else raw_steps
        solution_steps = format_steps(steps_list)
        problem_id = _stable_problem_id(problem)

        # Variables
        question = Variable(problem,
                    requires_grad=False,
                    role_description="math question")
        instruction = Variable("""You will answering a math question. 
                                Please using step-by-step explanation. Be super concise.""",
                                requires_grad=False,
                                role_description="instruction")
        calculation = Variable(solution_steps,
                                requires_grad=True,
                                role_description="solution to the math question")

        # NOTE(review): the first prompt ends with "Please proceed output with"
        # and looks truncated. It is left untouched because changing prompt
        # text changes the experiment; confirm the intended wording.
        verification_task_prompts = [
            # Perspective 1: Rule-based verifier (objective, procedural)
            """
            Evaluate the calculation step strictly based on mathematical correctness and procedural rules. 
            If the step violates any algebraic or logical principle, replace it with the corrected version of that step only. 
            Do not proceed to solve the full problem.
            Please proceed output with 
            """,

            # Perspective 2: Teaching assistant (didactic, pedagogical)
            """
            Review the calculation step from the perspective of a teaching assistant helping a student learn. 
            If there's an error or suboptimal explanation, provide a corrected version that would best aid the student's understanding. 
            Focus only on the step in question, without solving the full problem.
            """,

            # Perspective 3: Exam grader (concise, evaluative)
            """
            Assess the calculation step as an exam grader would when evaluating a student's solution. 
            Identify whether the step is mathematically valid and aligns with standard problem-solving conventions. 
            If incorrect, rewrite only the flawed step to reflect what a student should have written to receive full credit. 
            Avoid expanding beyond the current step or solving the entire problem.
            """
        ]

        # Verifier
        verifier = TextualVerifierWithTracker(
            verifier_engine=engine,
            use_cot_generation=False,
            use_step_breakdown=True,
            verification_task_prompts=verification_task_prompts,
            enable_logging=True
        )
        # Verify
        verified_result = verifier.verify(instance=question,
                                        instruction=instruction,
                                        calculation=calculation)

        # Tracker data
        tracker_data = verifier.get_tracker()

        # Extract metrics for return
        experiment_data = {
            'problem_id': problem_id,
            'original_problem': problem,
            'original_solution': solution_steps,
            'verified_solution': verified_result.value,
            'success': True,
            'error_message': None,
            'processing_time_ms': tracker_data['processing_time_ms'],
            'total_llm_calls': tracker_data['total_llm_calls'],
            'total_input_tokens': tracker_data['total_input_tokens'],
            'total_output_tokens': tracker_data['total_output_tokens'],
            'result': tracker_data
        }

        return experiment_data

    except Exception as e:
        # Return error information with the same keys as the success record,
        # so both kinds of rows line up in the results DataFrame.
        return {
            'problem_id': _stable_problem_id(row_dict.get('problem', 'unknown')),
            'original_problem': row_dict.get('problem', ''),
            'original_solution': '',
            'verified_solution': '',
            'success': False,
            'error_message': str(e),
            'processing_time_ms': 0,
            'total_llm_calls': 0,
            'total_input_tokens': 0,
            'total_output_tokens': 0,
            'result': {}  # Empty result for errors
        }

In [None]:
def run_experiment(max_workers=8, output_path='results/prm800k-03-algo3-clean.csv'):
    """Verify every row of ``sample`` in parallel and save the results as CSV.

    Each row of the module-level ``sample`` frame is submitted to a thread
    pool as ``evaluate_sample(row.to_dict(), engine)``. Results are collected
    in completion order, so row order in the output may differ from ``sample``.

    Args:
        max_workers: Number of worker threads in the pool.
        output_path: Destination of the results CSV. The parent directory is
            created if it does not exist, so a finished run is not lost to a
            missing folder.

    Returns:
        A DataFrame with one row per collected result (``None`` results are
        skipped). A task whose future raises contributes a minimal error
        record instead.
    """
    results = []
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(evaluate_sample, row.to_dict(), engine)
            for _, row in sample.iterrows()  # Your slice
        ]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                # as_completed() only yields finished futures, so result()
                # returns immediately and a timeout here would never apply.
                result = future.result()
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f"Task failed: {e}")
                error_result = {
                    'success': False,
                    'error_message': str(e),
                    'final_decision': 'ERROR',
                    'result': {"steps": []}
                }
                results.append(error_result)

    # Create DataFrame with detailed results
    experiment_df = pd.DataFrame(results)

    print(f"Completed in {time.time() - start_time:.1f} seconds")

    # Save results; make sure the target directory exists first.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    experiment_df.to_csv(output_file, index=False)

    return experiment_df

In [None]:
# Run the full experiment. The returned DataFrame is the cell's last
# expression, so the notebook displays it.
run_experiment()