# Updated Experiment TV Using Best Sample

In [1]:
import re
import ast
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from threading import Lock

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.verifier import TextualVerifierWithTracker

## Load Dataset

In [2]:
sample = pd.read_csv("datasets/sample/prm800k-03-algo3-clean.csv")
sample

Unnamed: 0,id,labeler,timestamp,problem,ground_truth_answer,total_steps,ground_truth_steps,steps,neg_1,zero,pos_1
0,1,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T14:37:13.296218,There are an infinite number of vectors $\math...,\begin{pmatrix} -7 \\ 16 \\ 5 \end{pmatrix},34,"[""Let's set $\\mathbf{v} = \\begin{pmatrix} a ...","[{'text': ""Let's set $\\mathbf{v} = \\begin{pm...",22,3,9
1,2,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-30T13:26:58.414691,When rolling a certain unfair six-sided die wi...,29,35,"[""Well, let's think about this for a moment. W...","[{'text': ""Well, let's think about this for a ...",18,1,16
2,3,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-31T14:39:30.588403,Find all solutions to\n\[\sin \left( \tan^{-1}...,3 \pm 2 \sqrt{2},34,"[""Let's set $y = \\tan^{-1} x$."", 'Then $\\cot...","[{'text': ""Let's call $\\tan^{-1}(x)=a$."", 'ra...",12,1,21
3,4,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-29T07:48:01.714041,The solutions of the equation $z^4+4z^3i-6z^2-...,11,40,['There is a formula for the area of any conve...,[{'text': 'There is a formula for the area of ...,16,2,22
4,5,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-22T20:02:50.866783,A sequence $(a_n)$ is defined as follows:\n\[a...,-1,36,"[""So we're given that $a_{i + 1} = \\frac{1}{1...","[{'text': ""So we're given that $a_{i + 1} = \\...",7,4,25
...,...,...,...,...,...,...,...,...,...,...,...
65,440,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-28T08:12:20.344377,Find the product $CD$ of the integers $C$ and ...,-5,17,['I think the first step here is to find the l...,[{'text': 'I think the first step here is to f...,3,0,14
66,442,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-31T22:47:06.498122,What real values of $x$ are not in the domain ...,-4,31,['To find values of $x$ that are not in the do...,[{'text': 'To find values of $x$ that are not ...,4,0,27
67,444,d8aa7923-b970-45e1-9734-e4a7f6c4a7db,2022-07-24T10:40:50.685197,How many license plates can be formed if every...,58500,14,"['So for the first part of the license plate, ...",[{'text': 'First we need to find the number of...,1,1,12
68,446,debabc6d-f79c-4ee5-a9db-5e284390254c,2022-07-28T07:53:49.057201,Evaluate $\log_264$.,6,5,"[""First, let's think about what this problem i...","[{'text': ""Ok, let's first break down $264$ in...",3,0,2


## Experiment

In [3]:
engine = get_engine("gemini-1.5-pro")
tg.set_backward_engine("gemini-1.5-pro", override=True)
model_name="gemini-1.5-pro"

  from .autonotebook import tqdm as notebook_tqdm


## Expriment Iterations

In [4]:
def format_steps(steps):
    formatted_steps = ""
    for step in steps:
        new_step = f"<STEP>{step['text']}</STEP>\n"
        formatted_steps += new_step
    return formatted_steps

In [5]:
def get_step_list(formatted_steps):
    steps = []
    matches = re.findall(r"<VERIFIED>(.*?)</VERIFIED>", formatted_steps, re.DOTALL)
    for match in matches:
        steps.append({"text": match.strip()})
    return steps

In [6]:
def get_first_integer(text):
    number = ''
    found = False

    for char in text:
        if char.isdigit():
            number += char
            found = True
        elif found:
            break

    return int(number) if number else None

In [7]:
def get_transition(original_rating, verified_rating):
    transition = {
        'neg1_to_neg1': 0,
        'neg1_to_zero': 0,
        'neg1_to_pos1': 0,
        'zero_to_neg1': 0,
        'zero_to_zero': 0,
        'zero_to_pos1': 0,
        'pos1_to_neg1': 0,
        'pos1_to_zero': 0,
        'pos1_to_pos1': 0,
    }

    for i, rating in enumerate(original_rating):
        verified = verified_rating[i]
        if rating == -1:
            if verified == -1:
                transition['neg1_to_neg1'] += 1
            elif verified == 0:
                transition['neg1_to_zero'] += 1
            elif verified == 1:
                transition['neg1_to_pos1'] += 1
        elif rating == 0:
            if verified == -1:
                transition['zero_to_neg1'] += 1
            elif verified == 0:
                transition['zero_to_zero'] += 1
            elif verified == 1:
                transition['zero_to_pos1'] += 1
        elif rating == 1:
            if verified == -1:
                transition['pos1_to_neg1'] += 1
            elif verified == 0:
                transition['pos1_to_zero'] += 1
            elif verified == 1:
                transition['pos1_to_pos1'] += 1

    return transition


In [8]:
def extract_answer_llm(question, solution):
    prompt = f"""
    <Question>
    {question}
    </Question>
    
    <Solution>
    {solution}
    </Solution>

    <Task>
    Based on last step of solution, what the answer of question?
    ONLY provide final answer. Be super concise.
    </Task>
    """
    answer = engine.generate(prompt)
        
    return answer

In [9]:
def check_answer_correctness_llm(ground_truth_answer, answer_to_check):
    prompt = f"""
    <GroundTruthAnswer>
    {ground_truth_answer}
    </GroundTruthAnswer>
    
    <AnswerToCheck>
    {answer_to_check}
    </AnswerToCheck>

    <Task>
    Based on GroundTruthAnswer, is the AnswerToCheck is correct?
    Response MUST ONLY in "TRUE" or "FALSE" without quote.
    </Task>
    """
    correctness = engine.generate(prompt)
        
    return correctness

In [10]:
def rating_step_llm(ground_truth_step, step_to_rate):
    prompt = f"""
    <GroundTruthStep>
    {ground_truth_step}
    </GroundTruthStep>
    
    <StepToRate>
    {step_to_rate}
    </StepToRate>

    <RatingInformation>
    -1: Great: correct, verifiable, appropriate, and insightful.
    0: Okay: don't contribute anything of essence, reasonable, verifiably cor- rect, and appropriate, but also redundant, stalling, or just don't add any value.
    1: Bad: hard to verify, wrong, contains gibberish, contains off-topic text or non-sequiturs, suggests attempting something unreasonable, derails the conversation, leads the solution into dead or circles.
    </RatingInformation>

    <Task>
    Based on GroundTruthStep, PLEASE rating the StepToRate.
    Grading based on RatingInformation.
    
    Response MUST ONLY in $NUMBER (-1 or 0 or 1).
    </Task>
    """
    rating = engine.generate(prompt)
        
    return rating

In [11]:
def get_verification_task_prompts(variant_wanted):
    verification_task_prompts = [
        # Perspective 1: Rule-based verifier (objective, procedural)
        """
        Evaluate the calculation step strictly based on mathematical correctness and procedural rules. 
        If the step violates any algebraic or logical principle, replace it with the corrected version of that step only. 
        Do not proceed to solve the full problem.
        Please proceed output with 
        """,

        # Perspective 2: Teaching assistant (didactic, pedagogical)
        """
        Review the calculation step from the perspective of a teaching assistant helping a student learn. 
        If there's an error or suboptimal explanation, provide a corrected version that would best aid the student's understanding. 
        Focus only on the step in question, without solving the full problem.
        """,

        # Perspective 3: Exam grader (concise, evaluative)
        """
        Assess the calculation step as an exam grader would when evaluating a student's solution. 
        Identify whether the step is mathematically valid and aligns with standard problem-solving conventions. 
        If incorrect, rewrite only the flawed step to reflect what a student should have written to receive full credit. 
        Avoid expanding beyond the current step or solving the entire problem.
        """,

        # Perspective 4: Peer reviewer (collaborative, analytical)
        """
        Examine the calculation step as a fellow mathematician reviewing a colleague's work. 
        Check for mathematical rigor, clarity of notation, and logical consistency. 
        If the step contains errors or ambiguities, suggest an improved version that maintains mathematical precision. 
        Focus solely on refining the current step without advancing the solution.
        """,

        # Perspective 5: Textbook editor (clear, standardized)
        """
        Review the calculation step as a textbook editor ensuring mathematical accuracy and clarity for publication. 
        Verify that the step follows standard mathematical notation and conventions. 
        If corrections are needed, rewrite the step to match the quality and style expected in a high-quality mathematics textbook. 
        Limit your response to improving only the given step.
        """
    ]
    if variant_wanted > len(verification_task_prompts):
        raise IndexError(f"No prompt in index {variant_wanted-1}")
    return verification_task_prompts[0:variant_wanted]

In [12]:
# Thread-safe results collection
results_lock = Lock()
results = []

def evaluate_sample(row_dict, engine, num_variant):
    try:
        # Extract problem data
        id = row_dict['id']
        problem = row_dict['problem']
        steps_list = ast.literal_eval(row_dict['steps']) if isinstance(row_dict['steps'], str) else row_dict['steps']
        solution_steps = format_steps(steps_list)
        
        # Variables
        question = Variable(problem,
                    requires_grad=False,
                    role_description="math question")
        instruction = Variable("""You will answering a math question. 
                                Please using step-by-step explanation. Be super concise.""",
                                requires_grad=False,
                                role_description="instruction")
        calculation = Variable(solution_steps,
                                requires_grad=True,
                                role_description="solution to the math question")

        # Verifier
        verifier = TextualVerifierWithTracker(
            verifier_engine=engine, 
            use_cot_generation=False,
            use_step_breakdown=True,
            verification_task_prompts=get_verification_task_prompts(num_variant),
            enable_logging=False
        )
        # Verify
        verified_result = verifier.verify(instance=question,
                                        instruction=instruction,
                                        calculation=calculation)      

        # Tracker data
        tracker_data = verifier.get_tracker()

        # Save to result
        original_rating = [step['rating'] for step in steps_list]
        original_answer = extract_answer_llm(question=problem, solution=solution_steps)
        original_answer_correctness = check_answer_correctness_llm(
            ground_truth_answer=row_dict['ground_truth_answer'], 
            answer_to_check=original_answer)

        verified_step_list = get_step_list(verified_result.value)
        
        ground_truth_steps = ast.literal_eval(row_dict['ground_truth_steps'])
        verified_rating = [get_first_integer(rating_step_llm(
            ground_truth_step=ground_truth_steps[i],
            step_to_rate=step)) for i, step in enumerate(verified_step_list)]

        verified_answer = extract_answer_llm(
            question=problem, 
            solution=format_steps(verified_step_list))
        verified_answer_correctness = check_answer_correctness_llm(
            ground_truth_answer=row_dict['ground_truth_answer'], 
            answer_to_check=verified_answer)
        verified_total_steps = len(verified_step_list)
        verified_neg1 = verified_rating.count(-1)
        verified_zero = verified_rating.count(0)
        verified_pos1 = verified_rating.count(1)

        transition = get_transition(
            original_rating=original_rating,
            verified_rating=verified_rating)
            
        # Extract metrics for return
        experiment_data = {
            'id': id,
            'problem': problem,
            'ground_truth_answer':row_dict['ground_truth_answer'],
            # Original
            'original_solution': solution_steps,
            'original_rating': original_rating,
            'original_answer': original_answer,
            'original_answer_correctness': original_answer_correctness,
            'original_total_steps': row_dict['total_steps'],
            'original_neg1': row_dict['neg_1'],
            'original_zero': row_dict['zero'],
            'original_pos1': row_dict['pos_1'],
            # Verified
            'verified_solution': verified_result.value,
            'verified_rating': verified_rating,
            'verified_answer': verified_answer,
            'verifier_answer_correctness': verified_answer_correctness,
            'verified_total_steps': verified_total_steps,
            'verifier_neg1': verified_neg1,
            'verifier_zero': verified_zero,
            'verifier_pos1': verified_pos1,
            # Stats
            'neg1_to_neg1': transition['neg1_to_neg1'],
            'neg1_to_zero': transition['neg1_to_zero'],
            'neg1_to_pos1': transition['neg1_to_pos1'],
            'zero_to_neg1': transition['zero_to_neg1'],
            'zero_to_zero': transition['zero_to_zero'],
            'zero_to_pos1': transition['zero_to_pos1'],
            'pos1_to_neg1': transition['pos1_to_neg1'],
            'pos1_to_zero': transition['pos1_to_zero'],
            'pos1_to_pos1': transition['pos1_to_pos1'],
            # Other
            'success': True,
            'error_message': None,
            'processing_time_ms': tracker_data['processing_time_ms'],
            'total_llm_calls': tracker_data['total_llm_calls'],
            'total_input_tokens': tracker_data['total_input_tokens'],
            'total_output_tokens': tracker_data['total_output_tokens'],
            'result': tracker_data 
        }
        
        return experiment_data
            
    except Exception as e:
        return {
            'id': id,
            'problem': '',
            'ground_truth_answer': '',
            # Original
            'original_solution': '',
            'original_rating': None,
            'original_answer': '',
            'original_answer_correctness': None,
            'original_total_steps': None,
            'original_neg1': None,
            'original_zero': None,
            'original_pos1': None,
            # Verified
            'verified_solution': '',
            'verified_rating': None,
            'verified_answer': '',
            'verifier_answer_correctness': None,
            'verified_total_steps': None,
            'verifier_neg1': None,
            'verifier_zero': None,
            'verifier_pos1': None,
            # Stats
            'neg1_to_neg1': 0,
            'neg1_to_zero': 0,
            'neg1_to_pos1': 0,
            'zero_to_neg1': 0,
            'zero_to_zero': 0,
            'zero_to_pos1': 0,
            'pos1_to_neg1': 0,
            'pos1_to_zero': 0,
            'pos1_to_pos1': 0,
            # Other
            'success': False,
            'error_message': str(e),
            'processing_time_ms': 0,
            'total_llm_calls': 0,
            'total_input_tokens': 0,
            'total_output_tokens': 0,
            'result': {}
        }

In [13]:
def run_experiment(num_variant):
    """Example of how to run the modified experiment"""
    results = []
    start_time = time.time()
    
    with ThreadPoolExecutor(max_workers=32) as executor:
        futures = [
            executor.submit(evaluate_sample, row.to_dict(), engine, num_variant) 
            for _, row in sample.iterrows()  # Your slice
        ]
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                result = future.result(timeout=None)
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f"Task failed: {e}")
                error_result = {
                    'success': False,
                    'error_message': str(e),
                    'result': {}
                }
                results.append(error_result)
    
    # Create DataFrame with detailed results
    experiment_df = pd.DataFrame(results)
    
    print(f"Completed in {time.time() - start_time:.1f} seconds")
    
    # Save results 
    experiment_df.to_csv(f'results/textualverifier-{num_variant}v.csv', index=False)
    
    return experiment_df

In [14]:
run_experiment(1)

I0000 00:00:1750099976.676050 33724945 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
Processing: 100%|██████████| 70/70 [04:16<00:00,  3.66s/it]


Completed in 256.3 seconds


Unnamed: 0,id,problem,ground_truth_answer,original_solution,original_rating,original_answer,original_answer_correctness,original_total_steps,original_neg1,original_zero,...,pos1_to_neg1,pos1_to_zero,pos1_to_pos1,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,27,Compute $99^2+99+1$ in your head.,9901,<STEP>Let's call 99 x. So we want to calculate...,"[0, 1, 1, 1, 1]",9901\n,TRUE\n,5,0,1,...,0,0,4,True,,26.707764,5,1047,199,{'setup': {'verifier_engine': '<textgrad.engin...
1,12,Suppose that $f$ is a polynomial such that \[(...,3,<STEP>Well if $(x-1)\cdot f(x)=3x^4+x^3-25x^2+...,"[-1, -1, -1, 1, -1, 1, -1]",3\n,TRUE\n,7,5,0,...,0,1,1,True,,17627.018066,7,1871,247,{'setup': {'verifier_engine': '<textgrad.engin...
2,20,Let $f$ be the function defined by $f(x) = x^3...,34,<STEP>First let's find the polynomial for $g(x...,"[1, 1, 1, -1, 1, -1, -1, -1]",-34\n,FALSE\n,8,4,0,...,0,0,4,True,,25202.554932,8,3674,450,{'setup': {'verifier_engine': '<textgrad.engin...
3,34,"On a particular map, $3$ inches on the map equ...",\frac{639}{40},<STEP>Let's call the distance between the buil...,"[1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1]",639/40\n,TRUE\n,12,2,0,...,0,1,9,True,,22628.519531,12,3598,189,{'setup': {'verifier_engine': '<textgrad.engin...
4,26,"Let $a,$ $b,$ $c,$ $d$ be positive real number...",16,"<STEP>Hey, this looks like a job for the AM-HM...","[1, 1, 1, 1, 1, -1, 1, 1, 0, 1, 1]",16\n,TRUE\n,11,1,1,...,0,0,9,True,,28741.846191,11,4727,515,{'setup': {'verifier_engine': '<textgrad.engin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,32,"Let $a,$ $b,$ $c,$ and $d$ be positive real nu...",27648,<STEP>Let's use AM-GM inequality.</STEP>\n<STE...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1...",3456\n,FALSE\n,24,6,0,...,0,0,18,True,,198556.968994,24,59556,4620,{'setup': {'verifier_engine': '<textgrad.engin...
66,4,The solutions of the equation $z^4+4z^3i-6z^2-...,11,<STEP>There is a formula for the area of any c...,"[1, 1, 0, 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, 1...",11\n,TRUE\n,40,16,2,...,0,1,21,True,,184353.641846,40,96134,3670,{'setup': {'verifier_engine': '<textgrad.engin...
67,28,Let $f$ be defined by \[f(x) = \left\{\n\begi...,0,<STEP>Let's calculate $f^{-1}(0)$.</STEP>\n<ST...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0\n,TRUE\n,53,7,3,...,0,0,43,True,,183179.092285,53,90884,2532,{'setup': {'verifier_engine': '<textgrad.engin...
68,2,When rolling a certain unfair six-sided die wi...,29,"<STEP>Well, let's think about this for a momen...","[1, -1, -1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1...",29\n,TRUE\n,35,18,1,...,0,0,16,True,,217327.651855,35,101179,4390,{'setup': {'verifier_engine': '<textgrad.engin...


In [15]:
run_experiment(2)

Processing: 100%|██████████| 70/70 [13:20<00:00, 11.44s/it]


Completed in 801.0 seconds


Unnamed: 0,id,problem,ground_truth_answer,original_solution,original_rating,original_answer,original_answer_correctness,original_total_steps,original_neg1,original_zero,...,pos1_to_neg1,pos1_to_zero,pos1_to_pos1,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,27,Compute $99^2+99+1$ in your head.,9901,<STEP>Let's call 99 x. So we want to calculate...,"[0, 1, 1, 1, 1]",9901\n,TRUE\n,5,0,1,...,0,0,4,True,,110.998291,15,2630,548,{'setup': {'verifier_engine': '<textgrad.engin...
1,12,Suppose that $f$ is a polynomial such that \[(...,3,<STEP>Well if $(x-1)\cdot f(x)=3x^4+x^3-25x^2+...,"[-1, -1, -1, 1, -1, 1, -1]",3\n,TRUE\n,7,5,0,...,0,1,1,True,,58103.796875,21,6563,1547,{'setup': {'verifier_engine': '<textgrad.engin...
2,20,Let $f$ be the function defined by $f(x) = x^3...,34,<STEP>First let's find the polynomial for $g(x...,"[1, 1, 1, -1, 1, -1, -1, -1]",-34\n,FALSE\n,8,4,0,...,0,0,4,True,,65895.946045,24,9931,1862,{'setup': {'verifier_engine': '<textgrad.engin...
3,34,"On a particular map, $3$ inches on the map equ...",\frac{639}{40},<STEP>Let's call the distance between the buil...,"[1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1]",639/40\n,TRUE\n,12,2,0,...,0,0,10,True,,62901.198242,36,10441,1018,{'setup': {'verifier_engine': '<textgrad.engin...
4,25,"Consider the rectangle with vertices at $(5,4)...",63,<STEP>To figure out the number of integer coor...,"[1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1,...",48\n,FALSE\n,17,7,0,...,0,1,9,True,,69803.834717,51,21210,1680,{'setup': {'verifier_engine': '<textgrad.engin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,32,"Let $a,$ $b,$ $c,$ and $d$ be positive real nu...",27648,<STEP>Let's use AM-GM inequality.</STEP>\n<STE...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1...",3456\n,FALSE\n,24,6,0,...,0,0,18,True,,535258.415771,72,118032,12147,{'setup': {'verifier_engine': '<textgrad.engin...
66,442,What real values of $x$ are not in the domain ...,-4,<STEP>To find values of $x$ that are not in th...,"[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",-4\n,TRUE\n,31,4,0,...,0,0,27,True,,235371.562256,93,58344,4176,{'setup': {'verifier_engine': '<textgrad.engin...
67,28,Let $f$ be defined by \[f(x) = \left\{\n\begi...,0,<STEP>Let's calculate $f^{-1}(0)$.</STEP>\n<ST...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0\n,TRUE\n,53,7,3,...,0,0,43,True,,544292.128906,159,255664,11725,{'setup': {'verifier_engine': '<textgrad.engin...
68,1,There are an infinite number of vectors $\math...,\begin{pmatrix} -7 \\ 16 \\ 5 \end{pmatrix},<STEP>Let's set $\mathbf{v} = \begin{pmatrix} ...,"[1, -1, -1, 0, 1, -1, 0, -1, -1, -1, -1, -1, -...",$\sqrt{330}$\n,FALSE\n,34,22,3,...,0,0,9,True,,628507.835693,102,216742,16992,{'setup': {'verifier_engine': '<textgrad.engin...


In [16]:
run_experiment(3)

Processing: 100%|██████████| 70/70 [14:17<00:00, 12.25s/it]


Completed in 857.4 seconds


Unnamed: 0,id,problem,ground_truth_answer,original_solution,original_rating,original_answer,original_answer_correctness,original_total_steps,original_neg1,original_zero,...,pos1_to_neg1,pos1_to_zero,pos1_to_pos1,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,27,Compute $99^2+99+1$ in your head.,9901,<STEP>Let's call 99 x. So we want to calculate...,"[0, 1, 1, 1, 1]",9901\n,TRUE\n,5,0,1,...,0,0,4,True,,94.368896,20,3292,620,{'setup': {'verifier_engine': '<textgrad.engin...
1,20,Let $f$ be the function defined by $f(x) = x^3...,34,<STEP>First let's find the polynomial for $g(x...,"[1, 1, 1, -1, 1, -1, -1, -1]",-34\n,FALSE\n,8,4,0,...,0,0,4,True,,69566.233887,32,12524,1882,{'setup': {'verifier_engine': '<textgrad.engin...
2,12,Suppose that $f$ is a polynomial such that \[(...,3,<STEP>Well if $(x-1)\cdot f(x)=3x^4+x^3-25x^2+...,"[-1, -1, -1, 1, -1, 1, -1]",3\n,TRUE\n,7,5,0,...,0,0,2,True,,71173.647949,28,7591,1500,{'setup': {'verifier_engine': '<textgrad.engin...
3,34,"On a particular map, $3$ inches on the map equ...",\frac{639}{40},<STEP>Let's call the distance between the buil...,"[1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1]",639/40\n,TRUE\n,12,2,0,...,0,1,9,True,,81361.197266,48,13914,993,{'setup': {'verifier_engine': '<textgrad.engin...
4,19,Compute $17^{-1}\pmod{83}$. Express your answe...,44,<STEP>Let's use the fact that $17\cdot 5=85$ t...,"[1, 1, 0, 0, -1, 0, -1, 1, 1]",44\n,TRUE\n,9,2,3,...,0,0,4,True,,97723.160156,36,15211,2853,{'setup': {'verifier_engine': '<textgrad.engin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,2,When rolling a certain unfair six-sided die wi...,29,"<STEP>Well, let's think about this for a momen...","[1, -1, -1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1...",29\n,TRUE\n,35,18,1,...,0,0,16,True,,616018.083984,140,251646,12847,{'setup': {'verifier_engine': '<textgrad.engin...
66,442,What real values of $x$ are not in the domain ...,-4,<STEP>To find values of $x$ that are not in th...,"[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",-4\n,TRUE\n,31,4,0,...,0,0,27,True,,263532.189941,124,80678,5552,{'setup': {'verifier_engine': '<textgrad.engin...
67,3,Find all solutions to\n\[\sin \left( \tan^{-1}...,3 \pm 2 \sqrt{2},<STEP>Let's call $\tan^{-1}(x)=a$.</STEP>\n<ST...,"[1, -1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1...","$3-2\sqrt{2}, 3+2\sqrt{2}$\n",TRUE\n,34,12,1,...,0,1,20,True,,643731.822754,136,257466,17194,{'setup': {'verifier_engine': '<textgrad.engin...
68,9,"For every positive integer $n$, let $\text{mod...",1,"<STEP>So $f(i,j)$ takes two numbers and gives ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",1\n,TRUE\n,52,7,5,...,0,1,39,True,,721600.396484,208,437078,13189,{'setup': {'verifier_engine': '<textgrad.engin...


In [17]:
run_experiment(4)

Processing: 100%|██████████| 70/70 [15:00<00:00, 12.86s/it]


Completed in 900.3 seconds


Unnamed: 0,id,problem,ground_truth_answer,original_solution,original_rating,original_answer,original_answer_correctness,original_total_steps,original_neg1,original_zero,...,pos1_to_neg1,pos1_to_zero,pos1_to_pos1,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,27,Compute $99^2+99+1$ in your head.,9901,<STEP>Let's call 99 x. So we want to calculate...,"[0, 1, 1, 1, 1]",9901\n,TRUE\n,5,0,1,...,0,0,4,True,,119.780029,25,3845,649,{'setup': {'verifier_engine': '<textgrad.engin...
1,12,Suppose that $f$ is a polynomial such that \[(...,3,<STEP>Well if $(x-1)\cdot f(x)=3x^4+x^3-25x^2+...,"[-1, -1, -1, 1, -1, 1, -1]",3\n,TRUE\n,7,5,0,...,0,1,1,True,,61247.205811,35,9160,1529,{'setup': {'verifier_engine': '<textgrad.engin...
2,20,Let $f$ be the function defined by $f(x) = x^3...,34,<STEP>First let's find the polynomial for $g(x...,"[1, 1, 1, -1, 1, -1, -1, -1]",-34\n,FALSE\n,8,4,0,...,0,0,4,True,,80667.852295,40,17320,2738,{'setup': {'verifier_engine': '<textgrad.engin...
3,34,"On a particular map, $3$ inches on the map equ...",\frac{639}{40},<STEP>Let's call the distance between the buil...,"[1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1]",639/40\n,TRUE\n,12,2,0,...,0,0,10,True,,100990.344971,60,17715,1404,{'setup': {'verifier_engine': '<textgrad.engin...
4,36,If a snack-size tin of peaches has $40$ calori...,2000,<STEP>Let's call the number of calories needed...,"[1, 1, 1, 1, 1, 1, 0]",2000\n,TRUE\n,7,0,1,...,0,0,6,True,,48645.894287,35,6630,594,{'setup': {'verifier_engine': '<textgrad.engin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,442,What real values of $x$ are not in the domain ...,-4,<STEP>To find values of $x$ that are not in th...,"[1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...",-4\n,TRUE\n,31,4,0,...,0,0,27,True,,344243.803955,155,113647,6633,{'setup': {'verifier_engine': '<textgrad.engin...
66,28,Let $f$ be defined by \[f(x) = \left\{\n\begi...,0,<STEP>Let's calculate $f^{-1}(0)$.</STEP>\n<ST...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0\n,TRUE\n,53,7,3,...,0,0,43,True,,703620.266846,265,348346,14037,{'setup': {'verifier_engine': '<textgrad.engin...
67,41,The smallest distance between the origin and a...,12,"<STEP>Notice that if we complete the square, w...","[0, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 0, -1,...",12\n,TRUE\n,30,10,2,...,0,0,18,True,,672611.215332,150,251101,14823,{'setup': {'verifier_engine': '<textgrad.engin...
68,9,"For every positive integer $n$, let $\text{mod...",1,"<STEP>So $f(i,j)$ takes two numbers and gives ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",1\n,TRUE\n,52,7,5,...,0,0,40,True,,794454.052979,260,459542,13342,{'setup': {'verifier_engine': '<textgrad.engin...


In [18]:
run_experiment(5)

Processing: 100%|██████████| 70/70 [19:24<00:00, 16.64s/it] 


Completed in 1165.0 seconds


Unnamed: 0,id,problem,ground_truth_answer,original_solution,original_rating,original_answer,original_answer_correctness,original_total_steps,original_neg1,original_zero,...,pos1_to_neg1,pos1_to_zero,pos1_to_pos1,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,27,Compute $99^2+99+1$ in your head.,9901,<STEP>Let's call 99 x. So we want to calculate...,"[0, 1, 1, 1, 1]",9901\n,TRUE\n,5,0,1,...,0,0,4,True,,1.535850e+02,30,4618,750,{'setup': {'verifier_engine': '<textgrad.engin...
1,12,Suppose that $f$ is a polynomial such that \[(...,3,<STEP>Well if $(x-1)\cdot f(x)=3x^4+x^3-25x^2+...,"[-1, -1, -1, 1, -1, 1, -1]",3\n,TRUE\n,7,5,0,...,0,0,2,True,,8.526904e+04,42,12350,2349,{'setup': {'verifier_engine': '<textgrad.engin...
2,34,"On a particular map, $3$ inches on the map equ...",\frac{639}{40},<STEP>Let's call the distance between the buil...,"[1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1]",639/40\n,TRUE\n,12,2,0,...,0,0,10,True,,9.753301e+04,72,20128,1614,{'setup': {'verifier_engine': '<textgrad.engin...
3,19,Compute $17^{-1}\pmod{83}$. Express your answe...,44,<STEP>Let's use the fact that $17\cdot 5=85$ t...,"[1, 1, 0, 0, -1, 0, -1, 1, 1]",44\n,TRUE\n,9,2,3,...,0,0,4,True,,1.073150e+05,54,29675,4873,{'setup': {'verifier_engine': '<textgrad.engin...
4,20,Let $f$ be the function defined by $f(x) = x^3...,34,<STEP>First let's find the polynomial for $g(x...,"[1, 1, 1, -1, 1, -1, -1, -1]",-34\n,FALSE\n,8,4,0,...,0,0,4,True,,1.067961e+05,48,21814,3368,{'setup': {'verifier_engine': '<textgrad.engin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,21,What is the largest integer less than $2010$ t...,1440,<STEP>Let's call the number x and then we can ...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 0, 0, 1, 1, 0,...",1440\n,TRUE\n,52,7,8,...,0,1,36,True,,8.749967e+05,312,479318,18125,{'setup': {'verifier_engine': '<textgrad.engin...
66,4,The solutions of the equation $z^4+4z^3i-6z^2-...,11,<STEP>There is a formula for the area of any c...,"[1, 1, 0, 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, 1...",11\n,TRUE\n,40,16,2,...,0,0,22,True,,8.955300e+05,240,463396,20184,{'setup': {'verifier_engine': '<textgrad.engin...
67,28,Let $f$ be defined by \[f(x) = \left\{\n\begi...,0,<STEP>Let's calculate $f^{-1}(0)$.</STEP>\n<ST...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0\n,TRUE\n,53,7,3,...,0,0,43,True,,1.079226e+06,318,626411,23291,{'setup': {'verifier_engine': '<textgrad.engin...
68,5,A sequence $(a_n)$ is defined as follows:\n\[a...,-1,<STEP>So we're given that $a_{i + 1} = \frac{1...,"[1, 1, 1, 1, 0, 0, 1, 1, 0, -1, -1, -1, -1, -1...",-1\n,TRUE\n,36,7,4,...,0,0,25,True,,1.118249e+06,216,496426,28325,{'setup': {'verifier_engine': '<textgrad.engin...
