# Updated Experiment TV Using Best Sample

In [1]:
import ast
import time
import json
import tiktoken
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from threading import Lock
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Optional

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.verifier import TextualVerifierWithTracker

## Load Dataset

In [2]:
sample = pd.read_csv("dataset/sample/prm800k-03-algo3-clean.csv")
sample

Unnamed: 0,id,labeler,timestamp,problem,ground_truth_answer,total_steps,steps,neg_1,zero,pos_1
0,1,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T14:37:13.296218,There are an infinite number of vectors $\math...,\begin{pmatrix} -7 \\ 16 \\ 5 \end{pmatrix},34,"[{'text': ""Let's set $\\mathbf{v} = \\begin{pm...",19,6,9
1,2,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T13:26:58.414691,When rolling a certain unfair six-sided die wi...,29,35,"[{'text': ""Well, let's think about this for a ...",18,1,16
2,3,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-31T14:39:30.588403,Find all solutions to\n\[\sin \left( \tan^{-1}...,3 \pm 2 \sqrt{2},34,"[{'text': ""Let's set $y = \\tan^{-1} x$."", 'ra...",11,1,22
3,4,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-29T07:48:01.714041,The solutions of the equation $z^4+4z^3i-6z^2-...,11,40,[{'text': 'There is a formula for the area of ...,16,2,21
4,5,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-22T20:02:50.866783,A sequence $(a_n)$ is defined as follows:\n\[a...,-1,36,"[{'text': ""So we're given that $a_{i + 1} = \\...",7,3,26
...,...,...,...,...,...,...,...,...,...,...
66,440,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-28T08:12:20.344377,Find the product $CD$ of the integers $C$ and ...,-5,17,[{'text': 'I think the first step here is to f...,3,0,14
67,442,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-31T22:47:06.498122,What real values of $x$ are not in the domain ...,-4,31,[{'text': 'To find values of $x$ that are not ...,1,0,30
68,444,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-24T10:40:50.685197,How many license plates can be formed if every...,58500,14,[{'text': 'So we need to count the number of p...,2,2,10
69,445,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T11:25:46.657657,"If $f(x)=5x^2+3x+4$, what is the value of $f(-...",18,7,"[{'text': 'To find f(-2), we just need to plug...",1,0,6


## Experiment

In [3]:
engine = get_engine("gemini-1.5-pro")
tg.set_backward_engine("gemini-1.5-pro", override=True)
model_name="gemini-1.5-pro"

  from .autonotebook import tqdm as notebook_tqdm


## Experiment Tracker

## Expriment Iterations

In [4]:
def format_steps(steps):
    formatted_steps = ""
    for step in steps:
        new_step = f"<STEP>{step['text']}</STEP>\n"
        formatted_steps += new_step
    return formatted_steps

In [5]:
# Thread-safe results collection
results_lock = Lock()
results = []

def evaluate_sample(row_dict, engine):
    try:
        # Extract problem data
        problem = row_dict['problem']
        steps_list = ast.literal_eval(row_dict['steps']) if isinstance(row_dict['steps'], str) else row_dict['steps']
        solution_steps = format_steps(steps_list)
        problem_id = f"problem_{hash(problem)}"
        
        # Variables
        question = Variable(problem,
                    requires_grad=False,
                    role_description="math question")
        instruction = Variable("""You will answering a math question. 
                                Please using step-by-step explanation. Be super concise.""",
                                requires_grad=False,
                                role_description="instruction")
        calculation = Variable(solution_steps,
                                requires_grad=True,
                                role_description="solution to the math question")

        verification_task_prompts = [
            # Perspective 1: Rule-based verifier (objective, procedural)
            """
            Evaluate the calculation step strictly based on mathematical correctness and procedural rules. 
            If the step violates any algebraic or logical principle, replace it with the corrected version of that step only. 
            Do not proceed to solve the full problem.
            Please proceed output with 
            """,

            # Perspective 2: Teaching assistant (didactic, pedagogical)
            """
            Review the calculation step from the perspective of a teaching assistant helping a student learn. 
            If there's an error or suboptimal explanation, provide a corrected version that would best aid the student's understanding. 
            Focus only on the step in question, without solving the full problem.
            """,

            # Perspective 3: Exam grader (concise, evaluative)
            """
            Assess the calculation step as an exam grader would when evaluating a student's solution. 
            Identify whether the step is mathematically valid and aligns with standard problem-solving conventions. 
            If incorrect, rewrite only the flawed step to reflect what a student should have written to receive full credit. 
            Avoid expanding beyond the current step or solving the entire problem.
            """
        ]

        # Verifier
        verifier = TextualVerifierWithTracker(
            verifier_engine=engine, 
            use_cot_generation=False,
            use_step_breakdown=True,
            verification_task_prompts=verification_task_prompts,
            enable_logging=True
        )
        # Verify
        verified_result = verifier.verify(instance=question,
                                        instruction=instruction,
                                        calculation=calculation)      

        # Tracker data
        tracker_data = verifier.get_tracker()      
            
        # Extract metrics for return
        experiment_data = {
            'problem_id': problem_id,
            'original_problem': problem,
            'original_solution': solution_steps,
            'verified_solution': verified_result.value,
            'success': True,
            'error_message': None,
            'processing_time_ms': tracker_data['processing_time_ms'],
            'total_llm_calls': tracker_data['total_llm_calls'],
            'total_input_tokens': tracker_data['total_input_tokens'],
            'total_output_tokens': tracker_data['total_output_tokens'],
            'result': tracker_data 
        }
        
        return experiment_data
            
    except Exception as e:
        # Return error information
        return {
            'problem_id': f"problem_{hash(row_dict.get('problem', 'unknown'))}",
            'original_problem': row_dict.get('problem', ''),
            'original_solution': '',
            'verified_solution': '',
            'success': False,
            'error_message': str(e),
            'processing_time_ms': 0,
            'total_llm_calls': 0,
            'total_input_tokens': 0,
            'total_output_tokens': 0,
            'result': {}  # Empty result for errors
        }

In [6]:
def run_experiment():
    """Example of how to run the modified experiment"""
    results = []
    start_time = time.time()
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(evaluate_sample, row.to_dict(), engine) 
            for _, row in sample[26:27].iterrows()  # Your slice
        ]
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                result = future.result(timeout=300)
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f"Task failed: {e}")
                error_result = {
                    'success': False,
                    'error_message': str(e),
                    'final_decision': 'ERROR',
                    'result': {"steps": []}
                }
                results.append(error_result)
    
    # Create DataFrame with detailed results
    experiment_df = pd.DataFrame(results)
    
    print(f"Completed in {time.time() - start_time:.1f} seconds")
    
    # Save results 
    experiment_df.to_csv('results/new-prm800k-03-algo3-clean-result.csv', index=False)
    
    return experiment_df

In [7]:
run_experiment()

INFO:textgrad:TextualVerifier: Start verification process...
INFO:textgrad:TextualVerifier: Ready to verify 5 calculation steps...
INFO:textgrad:TextualVerifier: Verifying step 1/5...
INFO:textgrad:TextualVerifier: Generating step 1 variant 1/3...


Processing:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:textgrad:TextualVerifier: Generating step 1 variant 2/3...
INFO:textgrad:TextualVerifier: Generating step 1 variant 3/3...
INFO:textgrad:TextualVerifier: Running majority voting for step 1...
INFO:textgrad:TextualVerifier: Verifying step 2/5...
INFO:textgrad:TextualVerifier: Generating step 2 variant 1/3...
INFO:textgrad:TextualVerifier: Generating step 2 variant 2/3...
INFO:textgrad:TextualVerifier: Generating step 2 variant 3/3...
INFO:textgrad:TextualVerifier: Running majority voting for step 2...
INFO:textgrad:TextualVerifier: Verifying step 3/5...
INFO:textgrad:TextualVerifier: Generating step 3 variant 1/3...
INFO:textgrad:TextualVerifier: Generating step 3 variant 2/3...
INFO:textgrad:TextualVerifier: Generating step 3 variant 3/3...
INFO:textgrad:TextualVerifier: Running majority voting for step 3...
INFO:textgrad:TextualVerifier: Verifying step 4/5...
INFO:textgrad:TextualVerifier: Generating step 4 variant 1/3...
INFO:textgrad:TextualVerifier: Generating step 4 variant 2

Processing: 100%|██████████| 1/1 [00:28<00:00, 28.25s/it]

Completed in 28.3 seconds





Unnamed: 0,problem_id,original_problem,original_solution,verified_solution,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,problem_-7991849741346505023,Compute $99^2+99+1$ in your head.,<STEP>What's the square of 99?</STEP>\n<STEP>T...,<VERIFIED>```$99^2 = (100-1)^2 = 100^2 - 2(100...,True,,28266.791992,20,2857,396,{'setup': {'verifier_engine': '<textgrad.engin...
