# Updated Experiment TV Using Best Sample

In [None]:
import re
import ast
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from threading import Lock

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.verifier import TextualVerifierWithTracker

## Load Dataset

In [None]:
# Load the problem sample that every experiment run below iterates over.
# Columns read later in this file: id, problem, steps, ground_truth_answer,
# ground_truth_steps, total_steps, neg_1, zero, pos_1.
# NOTE(review): presumably a PRM800K subset, judging by the file name -- confirm.
sample = pd.read_csv("datasets/sample/prm800k-03-algo3-clean.csv")
# Bare expression so the notebook cell displays the DataFrame.
sample

## Experiment

In [None]:
# Module-level engine: passed to the verifier by run_experiment and called directly
# by the helper prompts (extract_answer_llm, check_answer_correctness_llm, rating_step_llm).
engine = get_engine("gemini-1.5-pro")
# Use the same model as textgrad's backward engine.
# NOTE(review): override=True presumably replaces an engine configured earlier -- confirm.
tg.set_backward_engine("gemini-1.5-pro", override=True)
# NOTE(review): model_name is not referenced anywhere in the visible part of this file.
model_name="gemini-1.5-pro"

## Experiment Iterations

In [None]:
def format_steps(steps):
    """Render step dicts as consecutive ``<STEP>`` blocks.

    Args:
        steps: Iterable of mappings; only the ``'text'`` entry of each is used.

    Returns:
        A single string in which every step appears as ``<STEP>text</STEP>``
        followed by a newline. Empty input yields the empty string.
    """
    return "".join(f"<STEP>{entry['text']}</STEP>\n" for entry in steps)

In [None]:
def get_step_list(formatted_steps):
    """Collect the contents of every ``<VERIFIED>...</VERIFIED>`` block.

    Args:
        formatted_steps: Text that may contain ``<VERIFIED>`` blocks; a block
            may span several lines.

    Returns:
        A list of ``{"text": ...}`` dicts, one per block in order of
        appearance, with surrounding whitespace stripped from each body.
    """
    verified_pattern = r"<VERIFIED>(.*?)</VERIFIED>"
    return [
        {"text": body.strip()}
        for body in re.findall(verified_pattern, formatted_steps, re.DOTALL)
    ]

In [None]:
def get_first_integer(text):
    """Return the first integer that appears in ``text``.

    A minus sign directly in front of the digits is treated as part of the
    number, so a reply such as ``"-1"`` or ``"Rating: -1."`` yields ``-1``
    rather than ``1``. This matters because the ratings parsed with this
    helper are drawn from ``-1``, ``0`` and ``1``.

    Args:
        text: String to scan (typically a raw LLM reply).

    Returns:
        The first integer found, or ``None`` when ``text`` has no digits.
    """
    # \d only matches decimal digits, all of which int() can parse.
    match = re.search(r"-?\d+", text)
    return int(match.group()) if match else None

In [None]:
def get_transition(original_rating, verified_rating):
    """Count how step ratings changed between the original and verified solution.

    Ratings are compared position by position. When the two sequences differ
    in length, only the positions present in both are counted, so a verifier
    that returns fewer or more steps no longer raises ``IndexError``.

    Args:
        original_rating: Sequence of ratings (-1, 0 or 1) for the original steps.
        verified_rating: Sequence of ratings for the verified steps.

    Returns:
        A dict with the nine keys ``'<from>_to_<to>'`` where ``<from>`` and
        ``<to>`` are each ``neg1``, ``zero`` or ``pos1``. Each value is the
        number of positions that moved from the first rating to the second.
    """
    labels = {-1: 'neg1', 0: 'zero', 1: 'pos1'}
    transition = {
        f"{source}_to_{target}": 0
        for source in labels.values()
        for target in labels.values()
    }

    for before, after in zip(original_rating, verified_rating):
        # Pairs with a rating outside {-1, 0, 1} (e.g. None from an
        # unparsable reply) are not counted.
        if before in labels and after in labels:
            transition[f"{labels[before]}_to_{labels[after]}"] += 1

    return transition


In [None]:
def extract_answer_llm(question, solution):
    """Ask the module-level ``engine`` for the final answer of a solution.

    Args:
        question: Text of the question, interpolated into the prompt.
        solution: Text of the solution, interpolated into the prompt.

    Returns:
        Whatever ``engine.generate`` returns for the prompt.
    """
    prompt = f"""
    <Question>
    {question}
    </Question>
    
    <Solution>
    {solution}
    </Solution>

    <Task>
    Based on last step of solution, what the answer of question?
    ONLY provide final answer. Be super concise.
    </Task>
    """
    return engine.generate(prompt)

In [None]:
def check_answer_correctness_llm(ground_truth_answer, answer_to_check):
    """Ask the module-level ``engine`` whether an answer matches the ground truth.

    Args:
        ground_truth_answer: Reference answer, interpolated into the prompt.
        answer_to_check: Candidate answer, interpolated into the prompt.

    Returns:
        Whatever ``engine.generate`` returns. The prompt requests ``TRUE`` or
        ``FALSE``, but the reply is passed through without validation.
    """
    prompt = f"""
    <GroundTruthAnswer>
    {ground_truth_answer}
    </GroundTruthAnswer>
    
    <AnswerToCheck>
    {answer_to_check}
    </AnswerToCheck>

    <Task>
    Based on GroundTruthAnswer, is the AnswerToCheck is correct?
    Response MUST ONLY in "TRUE" or "FALSE" without quote.
    </Task>
    """
    return engine.generate(prompt)

In [None]:
def rating_step_llm(ground_truth_step, step_to_rate):
    """Ask the module-level ``engine`` to rate a step against its ground truth.

    The rubric in the prompt uses the PRM800K polarity: ``-1`` is a bad step,
    ``0`` a neutral one and ``1`` a good one. It previously labelled ``-1`` as
    "Great" and ``1`` as "Bad". That inverted the scale relative to the
    dataset ratings these replies are compared against in ``get_transition``.

    Args:
        ground_truth_step: Reference step, interpolated into the prompt.
        step_to_rate: Step to be rated, interpolated into the prompt.

    Returns:
        Whatever ``engine.generate`` returns. The prompt requests ``-1``, ``0``
        or ``1``; callers are expected to parse the number out of the reply.
    """
    prompt = f"""
    <GroundTruthStep>
    {ground_truth_step}
    </GroundTruthStep>
    
    <StepToRate>
    {step_to_rate}
    </StepToRate>

    <RatingInformation>
    -1: Bad: hard to verify, wrong, contains gibberish, contains off-topic text or non-sequiturs, suggests attempting something unreasonable, derails the conversation, leads the solution into dead or circles.
    0: Okay: don't contribute anything of essence, reasonable, verifiably cor- rect, and appropriate, but also redundant, stalling, or just don't add any value.
    1: Great: correct, verifiable, appropriate, and insightful.
    </RatingInformation>

    <Task>
    Based on GroundTruthStep, PLEASE rating the StepToRate.
    Grading based on RatingInformation.
    
    Response MUST ONLY in $NUMBER (-1 or 0 or 1).
    </Task>
    """
    return engine.generate(prompt)

In [None]:
def get_verification_task_prompts(variant_wanted):
    """Return the first ``variant_wanted`` verification task prompts.

    Five prompts are available, each written from a different reviewer
    perspective (rule-based verifier, teaching assistant, exam grader, peer
    reviewer, textbook editor).

    Args:
        variant_wanted: Number of prompts to return, from 0 to 5.

    Returns:
        A list with the first ``variant_wanted`` prompts, in the fixed order
        above. ``0`` yields an empty list.

    Raises:
        IndexError: If more prompts are requested than exist.
        ValueError: If ``variant_wanted`` is negative. Slicing with a negative
            bound would otherwise silently return a truncated list.
    """
    verification_task_prompts = [
        # Perspective 1: Rule-based verifier (objective, procedural)
        """
        Evaluate the calculation step strictly based on mathematical correctness and procedural rules. 
        If the step violates any algebraic or logical principle, replace it with the corrected version of that step only. 
        Do not proceed to solve the full problem.
        """,

        # Perspective 2: Teaching assistant (didactic, pedagogical)
        """
        Review the calculation step from the perspective of a teaching assistant helping a student learn. 
        If there's an error or suboptimal explanation, provide a corrected version that would best aid the student's understanding. 
        Focus only on the step in question, without solving the full problem.
        """,

        # Perspective 3: Exam grader (concise, evaluative)
        """
        Assess the calculation step as an exam grader would when evaluating a student's solution. 
        Identify whether the step is mathematically valid and aligns with standard problem-solving conventions. 
        If incorrect, rewrite only the flawed step to reflect what a student should have written to receive full credit. 
        Avoid expanding beyond the current step or solving the entire problem.
        """,

        # Perspective 4: Peer reviewer (collaborative, analytical)
        """
        Examine the calculation step as a fellow mathematician reviewing a colleague's work. 
        Check for mathematical rigor, clarity of notation, and logical consistency. 
        If the step contains errors or ambiguities, suggest an improved version that maintains mathematical precision. 
        Focus solely on refining the current step without advancing the solution.
        """,

        # Perspective 5: Textbook editor (clear, standardized)
        """
        Review the calculation step as a textbook editor ensuring mathematical accuracy and clarity for publication. 
        Verify that the step follows standard mathematical notation and conventions. 
        If corrections are needed, rewrite the step to match the quality and style expected in a high-quality mathematics textbook. 
        Limit your response to improving only the given step.
        """
    ]
    if variant_wanted < 0:
        raise ValueError(f"variant_wanted must not be negative, got {variant_wanted}")
    if variant_wanted > len(verification_task_prompts):
        raise IndexError(f"No prompt in index {variant_wanted-1}")
    return verification_task_prompts[:variant_wanted]

In [None]:
# Thread-safe results collection
# NOTE(review): neither name is referenced by the code visible in this file;
# run_experiment collects into its own local `results` list. Confirm before removing.
results_lock = Lock()
results = []

def _failed_sample_result(sample_id, error):
    """Build the placeholder row recorded when a sample could not be evaluated."""
    return {
        'id': sample_id,
        'problem': '',
        'ground_truth_answer': '',
        # Original
        'original_solution': '',
        'original_rating': None,
        'original_answer': '',
        'original_answer_correctness': None,
        'original_total_steps': None,
        'original_neg1': None,
        'original_zero': None,
        'original_pos1': None,
        # Verified
        'verified_solution': '',
        'verified_rating': None,
        'verified_answer': '',
        'verifier_answer_correctness': None,
        'verified_total_steps': None,
        'verifier_neg1': None,
        'verifier_zero': None,
        'verifier_pos1': None,
        # Stats
        'neg1_to_neg1': 0,
        'neg1_to_zero': 0,
        'neg1_to_pos1': 0,
        'zero_to_neg1': 0,
        'zero_to_zero': 0,
        'zero_to_pos1': 0,
        'pos1_to_neg1': 0,
        'pos1_to_zero': 0,
        'pos1_to_pos1': 0,
        # Other
        'success': False,
        'error_message': str(error),
        'processing_time_ms': 0,
        'total_llm_calls': 0,
        'total_input_tokens': 0,
        'total_output_tokens': 0,
        'result': {}
    }


def evaluate_sample(row_dict, engine, num_variant):
    """Verify one sample's solution and compare it with the original.

    The original steps are run through ``TextualVerifierWithTracker``. The
    verified steps are then rated against the ground-truth steps, and final
    answers are extracted and checked for both versions.

    Args:
        row_dict: One dataset row as a dict. Keys read: ``id``, ``problem``,
            ``steps`` (a list of step dicts or its string literal),
            ``ground_truth_answer``, ``ground_truth_steps`` (string literal),
            ``total_steps``, ``neg_1``, ``zero``, ``pos_1``.
        engine: Engine handed to the verifier. The answer-extraction and
            rating helpers use the module-level ``engine`` instead.
        num_variant: Number of verification task prompts to use.

    Returns:
        A flat dict of metrics with ``'success': True``. If any exception is
        raised, a dict with the same keys, placeholder values,
        ``'success': False`` and the exception text in ``'error_message'``
        is returned instead.
    """
    # Bound before the try block so the failure row can always report an id,
    # even when the row itself is malformed.
    sample_id = None
    try:
        # Extract problem data
        sample_id = row_dict['id']
        problem = row_dict['problem']
        steps_list = ast.literal_eval(row_dict['steps']) if isinstance(row_dict['steps'], str) else row_dict['steps']
        solution_steps = format_steps(steps_list)

        # Variables
        question = Variable(problem,
                    requires_grad=False,
                    role_description="math question")
        instruction = Variable("""You will answering a math question. 
                                Please using step-by-step explanation. Be super concise.""",
                                requires_grad=False,
                                role_description="instruction")
        calculation = Variable(solution_steps,
                                requires_grad=True,
                                role_description="solution to the math question")

        # Verifier
        verifier = TextualVerifierWithTracker(
            verifier_engine=engine,
            use_cot_generation=False,
            use_step_breakdown=True,
            verification_task_prompts=get_verification_task_prompts(num_variant),
            enable_logging=False
        )
        # Verify
        verified_result = verifier.verify(instance=question,
                                        instruction=instruction,
                                        calculation=calculation)

        # Tracker data
        tracker_data = verifier.get_tracker()

        # Original solution: ratings come from the dataset, the answer from the LLM
        original_rating = [step['rating'] for step in steps_list]
        original_answer = extract_answer_llm(question=problem, solution=solution_steps)
        original_answer_correctness = check_answer_correctness_llm(
            ground_truth_answer=row_dict['ground_truth_answer'],
            answer_to_check=original_answer)

        verified_step_list = get_step_list(verified_result.value)

        # Rate each verified step against the ground-truth step at the same position
        ground_truth_steps = ast.literal_eval(row_dict['ground_truth_steps'])
        verified_rating = [get_first_integer(rating_step_llm(
            ground_truth_step=ground_truth_steps[i],
            step_to_rate=step)) for i, step in enumerate(verified_step_list)]

        verified_answer = extract_answer_llm(
            question=problem,
            solution=format_steps(verified_step_list))
        verified_answer_correctness = check_answer_correctness_llm(
            ground_truth_answer=row_dict['ground_truth_answer'],
            answer_to_check=verified_answer)
        verified_total_steps = len(verified_step_list)
        verified_neg1 = verified_rating.count(-1)
        verified_zero = verified_rating.count(0)
        verified_pos1 = verified_rating.count(1)

        transition = get_transition(
            original_rating=original_rating,
            verified_rating=verified_rating)

        # Extract metrics for return
        experiment_data = {
            'id': sample_id,
            'problem': problem,
            'ground_truth_answer':row_dict['ground_truth_answer'],
            # Original
            'original_solution': solution_steps,
            'original_rating': original_rating,
            'original_answer': original_answer,
            'original_answer_correctness': original_answer_correctness,
            'original_total_steps': row_dict['total_steps'],
            'original_neg1': row_dict['neg_1'],
            'original_zero': row_dict['zero'],
            'original_pos1': row_dict['pos_1'],
            # Verified
            'verified_solution': verified_result.value,
            'verified_rating': verified_rating,
            'verified_answer': verified_answer,
            'verifier_answer_correctness': verified_answer_correctness,
            'verified_total_steps': verified_total_steps,
            'verifier_neg1': verified_neg1,
            'verifier_zero': verified_zero,
            'verifier_pos1': verified_pos1,
            # Stats
            'neg1_to_neg1': transition['neg1_to_neg1'],
            'neg1_to_zero': transition['neg1_to_zero'],
            'neg1_to_pos1': transition['neg1_to_pos1'],
            'zero_to_neg1': transition['zero_to_neg1'],
            'zero_to_zero': transition['zero_to_zero'],
            'zero_to_pos1': transition['zero_to_pos1'],
            'pos1_to_neg1': transition['pos1_to_neg1'],
            'pos1_to_zero': transition['pos1_to_zero'],
            'pos1_to_pos1': transition['pos1_to_pos1'],
            # Other
            'success': True,
            'error_message': None,
            'processing_time_ms': tracker_data['processing_time_ms'],
            'total_llm_calls': tracker_data['total_llm_calls'],
            'total_input_tokens': tracker_data['total_input_tokens'],
            'total_output_tokens': tracker_data['total_output_tokens'],
            'result': tracker_data
        }

        return experiment_data

    except Exception as e:
        # Best-effort boundary: one failing sample becomes a failure row
        # instead of aborting the whole experiment.
        return _failed_sample_result(sample_id, e)

In [None]:
def run_experiment(num_variant, max_workers=32):
    """Evaluate every row of ``sample`` concurrently and save the results as CSV.

    Each row is submitted to a thread pool as one ``evaluate_sample`` call
    using the module-level ``engine``. Results are collected in completion
    order, not in dataset order.

    Args:
        num_variant: Number of verification task prompts to use; also part of
            the output file name.
        max_workers: Size of the thread pool. Defaults to 32.

    Returns:
        A DataFrame with one row per sample. It is also written to
        ``results/textualverifier-{num_variant}v.csv``.
    """
    import os  # local import: only needed to prepare the output directory

    # Create the output directory up front so a long run cannot fail at the
    # final save because the directory is missing.
    os.makedirs('results', exist_ok=True)

    results = []
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(evaluate_sample, row.to_dict(), engine, num_variant)
            for _, row in sample.iterrows()
        ]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                result = future.result(timeout=None)
                if result is not None:
                    results.append(result)
            except Exception as e:
                # evaluate_sample already turns its own errors into failure
                # rows; this only catches failures outside that handler.
                print(f"Task failed: {e}")
                error_result = {
                    'success': False,
                    'error_message': str(e),
                    'result': {}
                }
                results.append(error_result)

    # Create DataFrame with detailed results
    experiment_df = pd.DataFrame(results)

    print(f"Completed in {time.time() - start_time:.1f} seconds")

    # Save results
    experiment_df.to_csv(f'results/textualverifier-{num_variant}v.csv', index=False)

    return experiment_df

In [None]:
# Run with only the first verification prompt; saves results/textualverifier-1v.csv.
run_experiment(1)

In [None]:
# Run with the first 2 verification prompts; saves results/textualverifier-2v.csv.
run_experiment(2)

In [None]:
# Run with the first 3 verification prompts; saves results/textualverifier-3v.csv.
run_experiment(3)

In [None]:
# Run with the first 4 verification prompts; saves results/textualverifier-4v.csv.
run_experiment(4)

In [None]:
# Run with all 5 verification prompts; saves results/textualverifier-5v.csv.
run_experiment(5)