# Updated Experiment TV Using Best Sample

In [1]:
import re
import ast
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from threading import Lock

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.verifier import TextualVerifierWithTracker

## Load Dataset

In [2]:
# Read the experiment sample from CSV (presumably a cleaned PRM800K subset,
# judging by the file name) and display it as the cell output.
sample_csv_path = "datasets/sample/prm800k-03-algo3-clean.csv"
sample = pd.read_csv(sample_csv_path)
sample

Unnamed: 0,id,labeler,timestamp,problem,ground_truth_answer,total_steps,steps,neg_1,zero,pos_1
0,1,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T14:37:13.296218,There are an infinite number of vectors $\math...,\begin{pmatrix} -7 \\ 16 \\ 5 \end{pmatrix},34,"[{'text': ""Let's set $\\mathbf{v} = \\begin{pm...",19,6,9
1,2,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T13:26:58.414691,When rolling a certain unfair six-sided die wi...,29,35,"[{'text': ""Well, let's think about this for a ...",18,1,16
2,3,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-31T14:39:30.588403,Find all solutions to\n\[\sin \left( \tan^{-1}...,3 \pm 2 \sqrt{2},34,"[{'text': ""Let's set $y = \\tan^{-1} x$."", 'ra...",11,1,22
3,4,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-29T07:48:01.714041,The solutions of the equation $z^4+4z^3i-6z^2-...,11,40,[{'text': 'There is a formula for the area of ...,16,2,21
4,5,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-22T20:02:50.866783,A sequence $(a_n)$ is defined as follows:\n\[a...,-1,36,"[{'text': ""So we're given that $a_{i + 1} = \\...",7,3,26
...,...,...,...,...,...,...,...,...,...,...
66,440,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-28T08:12:20.344377,Find the product $CD$ of the integers $C$ and ...,-5,17,[{'text': 'I think the first step here is to f...,3,0,14
67,442,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-31T22:47:06.498122,What real values of $x$ are not in the domain ...,-4,31,[{'text': 'To find values of $x$ that are not ...,1,0,30
68,444,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-24T10:40:50.685197,How many license plates can be formed if every...,58500,14,[{'text': 'So we need to count the number of p...,2,2,10
69,445,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T11:25:46.657657,"If $f(x)=5x^2+3x+4$, what is the value of $f(-...",18,7,"[{'text': 'To find f(-2), we just need to plug...",1,0,6


## Experiment

In [3]:
# A single model identifier drives both the forward engine used by the helper
# functions below and TextGrad's backward engine.
model_name = "gemini-1.5-pro"
engine = get_engine(model_name)
tg.set_backward_engine(model_name, override=True)

  from .autonotebook import tqdm as notebook_tqdm


## Experiment Iterations

In [4]:
def format_steps(steps):
    """Serialize solution steps into newline-terminated ``<STEP>`` tags.

    Args:
        steps: Iterable of mappings, each exposing the step body under the
            ``'text'`` key.

    Returns:
        A single string with one ``<STEP>...</STEP>`` line per step, or an
        empty string when ``steps`` is empty.
    """
    return "".join(f"<STEP>{entry['text']}</STEP>\n" for entry in steps)

In [None]:
def get_step_list(formatted_steps):
    """Pull the body of every ``<VERIFIED>`` tag out of a verifier response.

    Args:
        formatted_steps: Text containing zero or more
            ``<VERIFIED>...</VERIFIED>`` sections (bodies may span lines).

    Returns:
        List of tag bodies in order of appearance, each stripped of
        surrounding whitespace. Empty when no tag is present.
    """
    verified_pattern = re.compile(r"<VERIFIED>(.*?)</VERIFIED>", re.DOTALL)
    return [body.strip() for body in verified_pattern.findall(formatted_steps)]

In [None]:
def get_transition(original_rating, verified_rating):
    """Count how step ratings moved between the original and verified solution.

    Ratings are compared position by position. Only pairs where both values
    are one of -1, 0 or 1 are counted; anything else (e.g. ``None``) is
    skipped.

    Args:
        original_rating: Sequence of ratings for the original steps.
        verified_rating: Sequence of ratings for the verified steps.

    Returns:
        Dict with the nine ``'<from>_to_<to>'`` counters, where ``<from>`` and
        ``<to>`` are ``neg1``, ``zero`` or ``pos1``.
    """
    labels = {-1: 'neg1', 0: 'zero', 1: 'pos1'}
    transition = {
        f"{source}_to_{target}": 0
        for source in labels.values()
        for target in labels.values()
    }

    # zip stops at the shorter sequence: the verifier may return fewer (or
    # more) steps than the original solution, and indexing verified_rating by
    # the original's position used to raise IndexError in the shorter case.
    for before, after in zip(original_rating, verified_rating):
        if before in labels and after in labels:
            transition[f"{labels[before]}_to_{labels[after]}"] += 1

    return transition


In [None]:
def extract_answer_llm(question, solution):
    """Ask the module-level ``engine`` for the final answer of a solution.

    Args:
        question: Problem statement inserted into the ``<Question>`` section.
        solution: Solution text inserted into the ``<Solution>`` section.

    Returns:
        Whatever ``engine.generate`` returns for the prompt, unmodified.
    """
    llm_prompt = f"""
    <Question>
    {question}
    </Question>
    
    <Solution>
    {solution}
    </Solution>

    <Task>
    Based on last step of solution, what the answer of question?
    ONLY provide final answer. Be super concise.
    </Task>
    """
    return engine.generate(llm_prompt)

In [None]:
def check_answer_correctness_llm(ground_truth_answer, answer_to_check):
    """Ask the module-level ``engine`` whether an answer matches the reference.

    Args:
        ground_truth_answer: Reference answer placed in ``<GroundTruthAnswer>``.
        answer_to_check: Candidate answer placed in ``<AnswerToCheck>``.

    Returns:
        The raw ``engine.generate`` output. The prompt requests ``TRUE`` or
        ``FALSE``, but the reply is returned as-is without validation.
    """
    llm_prompt = f"""
    <GroundTruthAnswer>
    {ground_truth_answer}
    </GroundTruthAnswer>
    
    <AnswerToCheck>
    {answer_to_check}
    </AnswerToCheck>

    <Task>
    Based on GroundTruthAnswer, is the AnswerToCheck is correct?
    Response MUST ONLY in "TRUE" or "FALSE" without quote.
    </Task>
    """
    return engine.generate(llm_prompt)

In [None]:
def rating_step_llm(ground_truth_step, step_to_rate):
    """Ask the module-level ``engine`` to rate a step as -1, 0 or 1.

    The rating legend follows the sign convention of the dataset columns
    (``neg_1`` / ``zero`` / ``pos_1``): -1 is a bad step, 0 a neutral one and
    1 a good one, so the result is comparable with the original ratings.

    Args:
        ground_truth_step: Reference step placed in ``<GroundTruthStep>``.
        step_to_rate: Step placed in ``<StepToRate>``.

    Returns:
        The rating as an ``int`` in ``{-1, 0, 1}``.

    Raises:
        ValueError: If the reply contains no standalone -1, 0 or 1.
    """
    prompt = f"""
    <GroundTruthStep>
    {ground_truth_step}
    </GroundTruthStep>
    
    <StepToRate>
    {step_to_rate}
    </StepToRate>

    <RatingInformation>
    -1: Bad: hard to verify, wrong, contains gibberish, contains off-topic text or non-sequiturs, suggests attempting something unreasonable, derails the conversation, leads the solution into dead or circles.
    0: Okay: don't contribute anything of essence, reasonable, verifiably cor- rect, and appropriate, but also redundant, stalling, or just don't add any value.
    1: Great: correct, verifiable, appropriate, and insightful.
    </RatingInformation>

    <Task>
    Based on GroundTruthStep, PLEASE rating the StepToRate.
    Grading based on RatingInformation.
    
    Response MUST ONLY in $NUMBER (-1 or 0 or 1).
    </Task>
    """
    reply = str(engine.generate(prompt))

    # The model may decorate the number ("$-1", "Rating: 0", a Unicode minus),
    # which a bare int() rejects; pick out the first standalone -1/0/1 instead.
    normalized = reply.replace("\u2212", "-")
    found = re.search(r"(?<!\d)[-+]?[01](?!\d)", normalized)
    if found is None:
        raise ValueError(f"Could not parse a rating (-1, 0 or 1) from: {reply!r}")

    return int(found.group())

In [5]:
# Thread-safe results collection
results_lock = Lock()
results = []

def evaluate_sample(row_dict, engine):
    """Verify one sample's solution and collect before/after metrics.

    The sample's steps are formatted, passed through
    ``TextualVerifierWithTracker``, and the verified steps are re-rated and
    compared with the original ratings. Any exception is caught and reported
    as a failure record with the same keys, so one bad sample does not abort
    a batch run.

    Args:
        row_dict: Mapping with at least ``'id'``, ``'problem'``, ``'steps'``
            (a list of step dicts or its string repr), ``'ground_truth_answer'``,
            ``'total_steps'``, ``'neg_1'``, ``'zero'`` and ``'pos_1'``.
        engine: Engine handed to the verifier. Note that the answer/rating
            helper functions use the module-level ``engine`` instead.

    Returns:
        Dict of metrics. ``'success'`` is ``True`` on a completed run, else
        ``False`` with ``'error_message'`` set and placeholder values.
    """
    # Bound before the try block so the failure record never falls back to the
    # builtin ``id`` when reading the row itself fails.
    sample_id = None
    try:
        # Extract problem data
        sample_id = row_dict['id']
        problem = row_dict['problem']
        raw_steps = row_dict['steps']
        # Rows loaded from CSV carry the step list as its string repr.
        steps_list = ast.literal_eval(raw_steps) if isinstance(raw_steps, str) else raw_steps
        solution_steps = format_steps(steps_list)
        
        # Variables
        question = Variable(problem,
                    requires_grad=False,
                    role_description="math question")
        instruction = Variable("""You will answering a math question. 
                                Please using step-by-step explanation. Be super concise.""",
                                requires_grad=False,
                                role_description="instruction")
        calculation = Variable(solution_steps,
                                requires_grad=True,
                                role_description="solution to the math question")

        verification_task_prompts = [
            # Perspective 1: Rule-based verifier (objective, procedural)
            # NOTE(review): the last sentence of this prompt looks truncated
            # ("Please proceed output with") — confirm the intended wording.
            """
            Evaluate the calculation step strictly based on mathematical correctness and procedural rules. 
            If the step violates any algebraic or logical principle, replace it with the corrected version of that step only. 
            Do not proceed to solve the full problem.
            Please proceed output with 
            """,

            # Perspective 2: Teaching assistant (didactic, pedagogical)
            """
            Review the calculation step from the perspective of a teaching assistant helping a student learn. 
            If there's an error or suboptimal explanation, provide a corrected version that would best aid the student's understanding. 
            Focus only on the step in question, without solving the full problem.
            """,

            # Perspective 3: Exam grader (concise, evaluative)
            """
            Assess the calculation step as an exam grader would when evaluating a student's solution. 
            Identify whether the step is mathematically valid and aligns with standard problem-solving conventions. 
            If incorrect, rewrite only the flawed step to reflect what a student should have written to receive full credit. 
            Avoid expanding beyond the current step or solving the entire problem.
            """
        ]

        # Verifier
        verifier = TextualVerifierWithTracker(
            verifier_engine=engine, 
            use_cot_generation=False,
            use_step_breakdown=True,
            verification_task_prompts=verification_task_prompts,
            enable_logging=False
        )
        # Verify
        verified_result = verifier.verify(instance=question,
                                        instruction=instruction,
                                        calculation=calculation)

        # Tracker data
        tracker_data = verifier.get_tracker()

        # Save to result
        # Ratings come from the parsed step list; row_dict['steps'] may still
        # be the unparsed string.
        original_rating = [step['rating'] for step in steps_list]
        original_answer = extract_answer_llm(question=problem, solution=solution_steps)
        original_answer_correctness = check_answer_correctness_llm(
            ground_truth_answer=row_dict['ground_truth_answer'], 
            answer_to_check=original_answer)
        
        verified_step_list = get_step_list(verified_result.value)
        # rating_step_llm needs a reference step as well as the step to rate.
        # NOTE(review): the original step at the same position is used as the
        # reference (empty when the verifier produced extra steps) — confirm
        # this is the intended "ground truth".
        verified_rating = []
        for position, verified_step in enumerate(verified_step_list):
            reference_step = steps_list[position]['text'] if position < len(steps_list) else ""
            verified_rating.append(
                rating_step_llm(ground_truth_step=reference_step, step_to_rate=verified_step))
        verified_answer = extract_answer_llm(question=problem, solution=format_steps(verified_step_list))
        verified_answer_correctness = check_answer_correctness_llm(
            ground_truth_answer=row_dict['ground_truth_answer'], 
            answer_to_check=verified_answer)
        verified_total_steps = len(verified_step_list)
        verified_neg1 = verified_rating.count(-1)
        verified_zero = verified_rating.count(0)
        verified_pos1 = verified_rating.count(1)

        # Only positions present in both lists can be compared, so trim the
        # original ratings to the number of verified ratings.
        transition = get_transition(
            original_rating=original_rating[:len(verified_rating)],
            verified_rating=verified_rating)
            
        # Extract metrics for return
        experiment_data = {
            'id': sample_id,
            'problem': problem,
            'ground_truth_answer':row_dict['ground_truth_answer'],
            # Original
            'original_solution': solution_steps,
            'original_rating': original_rating,
            'original_answer': original_answer,
            'original_answer_correctness': original_answer_correctness,
            'original_total_steps': row_dict['total_steps'],
            'original_neg1': row_dict['neg_1'],
            'original_zero': row_dict['zero'],
            'original_pos1': row_dict['pos_1'],
            # Verified
            'verified_solution': verified_result.value,
            'verified_rating': verified_rating,
            'verified_answer': verified_answer,
            'verifier_answer_correctness': verified_answer_correctness,
            'verified_total_steps': verified_total_steps,
            'verifier_neg1': verified_neg1,
            'verifier_zero': verified_zero,
            'verifier_pos1': verified_pos1,
            # Stats
            'neg1_to_neg1': transition['neg1_to_neg1'],
            'neg1_to_zero': transition['neg1_to_zero'],
            'neg1_to_pos1': transition['neg1_to_pos1'],
            'zero_to_neg1': transition['zero_to_neg1'],
            'zero_to_zero': transition['zero_to_zero'],
            'zero_to_pos1': transition['zero_to_pos1'],
            'pos1_to_neg1': transition['pos1_to_neg1'],
            'pos1_to_zero': transition['pos1_to_zero'],
            'pos1_to_pos1': transition['pos1_to_pos1'],
            # Other
            'success': True,
            'error_message': None,
            'processing_time_ms': tracker_data['processing_time_ms'],
            'total_llm_calls': tracker_data['total_llm_calls'],
            'total_input_tokens': tracker_data['total_input_tokens'],
            'total_output_tokens': tracker_data['total_output_tokens'],
            'result': tracker_data 
        }
        
        return experiment_data
            
    except Exception as e:
        # Deliberate best-effort: report the failure as a row with the same
        # columns instead of raising, so the batch keeps going.
        return {
            'id': sample_id,
            'problem': '',
            'ground_truth_answer': '',
            # Original
            'original_solution': '',
            'original_rating': None,
            'original_answer': '',
            'original_answer_correctness': None,
            'original_total_steps': None,
            'original_neg1': None,
            'original_zero': None,
            'original_pos1': None,
            # Verified
            'verified_solution': '',
            'verified_rating': None,
            'verified_answer': '',
            'verifier_answer_correctness': None,
            'verified_total_steps': None,
            'verifier_neg1': None,
            'verifier_zero': None,
            'verifier_pos1': None,
            # Stats
            'neg1_to_neg1': 0,
            'neg1_to_zero': 0,
            'neg1_to_pos1': 0,
            'zero_to_neg1': 0,
            'zero_to_zero': 0,
            'zero_to_pos1': 0,
            'pos1_to_neg1': 0,
            'pos1_to_zero': 0,
            'pos1_to_pos1': 0,
            # Other
            'success': False,
            'error_message': str(e),
            'processing_time_ms': 0,
            'total_llm_calls': 0,
            'total_input_tokens': 0,
            'total_output_tokens': 0,
            'result': {}
        }

In [6]:
def run_experiment(max_workers=32, output_path='results/textualverifier-3v.csv'):
    """Evaluate every row of ``sample`` in parallel and save the results.

    Each row is submitted to ``evaluate_sample`` on a thread pool. Results are
    collected in completion order, so the output rows are not in the order of
    ``sample``.

    Args:
        max_workers: Size of the thread pool.
        output_path: CSV file the result frame is written to. Missing parent
            directories are created.

    Returns:
        ``pandas.DataFrame`` with one row per evaluated sample.
    """
    from pathlib import Path  # local import: only needed for the output directory

    results = []
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which sample each future belongs to, so a crashed task can
        # still be attributed to its sample id.
        future_to_id = {}
        for _, row in sample.iterrows():
            row_dict = row.to_dict()
            future = executor.submit(evaluate_sample, row_dict, engine)
            future_to_id[future] = row_dict.get('id')

        for future in tqdm(as_completed(future_to_id), total=len(future_to_id), desc="Processing"):
            try:
                result = future.result()
            except Exception as e:
                print(f"Task failed: {e}")
                results.append({
                    'id': future_to_id[future],
                    'success': False,
                    'error_message': str(e),
                    'result': {}
                })
            else:
                if result is not None:
                    results.append(result)

    # Create DataFrame with detailed results
    experiment_df = pd.DataFrame(results)

    print(f"Completed in {time.time() - start_time:.1f} seconds")

    # Save results; create the directory first so a long run is not lost to a
    # missing folder.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    experiment_df.to_csv(output_file, index=False)

    return experiment_df

In [7]:
# Run the full experiment; the returned DataFrame is displayed as the cell
# output and is also written to CSV inside run_experiment.
run_experiment()

Processing: 100%|██████████| 71/71 [17:58<00:00, 15.19s/it]


Completed in 1078.5 seconds


Unnamed: 0,id,problem,original_solution,verified_solution,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,27,Compute $99^2+99+1$ in your head.,<STEP>What's the square of 99?</STEP>\n<STEP>T...,<VERIFIED>```$99^2 = (100-1)^2 = 100^2 - 2(100...,True,,2.029099e+02,20,2857,396,{'setup': {'verifier_engine': '<textgrad.engin...
1,12,Suppose that $f$ is a polynomial such that \[(...,<STEP>Let's recall what the degree of a polyno...,<VERIFIED>```\nLet $d$ be the degree of $f(x)$...,True,,5.972023e+04,28,9529,1884,{'setup': {'verifier_engine': '<textgrad.engin...
2,7,In how many ways can $7$ people sit around a r...,"<STEP>There are a total of $7$ people, so ther...",<VERIFIED>```There are a total of $(7-1)! = 6!...,True,,9.544039e+04,60,32754,3706,{'setup': {'verifier_engine': '<textgrad.engin...
3,34,"On a particular map, $3$ inches on the map equ...",<STEP>Let's call the distance between the buil...,<VERIFIED>Let 'x' be the distance between the ...,True,,1.047520e+05,48,12011,992,{'setup': {'verifier_engine': '<textgrad.engin...
4,20,Let $f$ be the function defined by $f(x) = x^3...,<STEP>First let's find the polynomial for $g(x...,"<VERIFIED>```\nLet $r_1,$ $r_2,$ and $r_3$ be ...",True,,1.327366e+05,32,17492,2485,{'setup': {'verifier_engine': '<textgrad.engin...
...,...,...,...,...,...,...,...,...,...,...,...
66,55,"Positive integers $a$, $b$, and $2009$, with $...",<STEP>Now let's think about the definition of ...,"<VERIFIED>```Let $a, b, 2009$ be a geometric s...",True,,4.698546e+05,76,99703,9956,{'setup': {'verifier_engine': '<textgrad.engin...
67,41,The smallest distance between the origin and a...,<STEP>I bet the answer is 6.</STEP>\n<STEP>Rig...,<VERIFIED>```\nLet $d$ be the distance between...,True,,6.766431e+05,120,183260,11387,{'setup': {'verifier_engine': '<textgrad.engin...
68,3,Find all solutions to\n\[\sin \left( \tan^{-1}...,<STEP>Let's set $y = \tan^{-1} x$.</STEP>\n<ST...,<VERIFIED>```Let $y = \tan^{-1} x.$ Then $\ta...,True,,8.402972e+05,136,266852,17992,{'setup': {'verifier_engine': '<textgrad.engin...
69,442,What real values of $x$ are not in the domain ...,<STEP>To find values of $x$ that are not in th...,<VERIFIED>```\nThe values of $x$ not in the do...,True,,4.617091e+05,124,80246,4613,{'setup': {'verifier_engine': '<textgrad.engin...
