# TextualVerifier Experiment Using the Best Sample

In [None]:
import pandas as pd
import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.optimizer import TextualGradientDescent
from textgrad.verifier import TextualVerifierExperiment
from textgrad.loss import TextLoss

## Load Dataset

In [None]:
# Read the sample CSV into a DataFrame. The path is relative, so it resolves
# against the current working directory of the notebook kernel.
sample = pd.read_csv("dataset/sample/prm800k-03-algo3-clean.csv")
# Bare expression as the cell's last statement: shows the DataFrame when run
# in a notebook.
sample

## Experiment

In [None]:
# Engine object later passed to the verifier as `verifier_engine`.
engine = get_engine("gemini-1.5-pro")
# Configure textgrad's backward engine with the same model name.
# NOTE(review): override=True presumably replaces a previously configured
# backward engine -- confirm against the textgrad documentation.
tg.set_backward_engine("gemini-1.5-pro", override=True)

In [None]:
def format_steps(steps) -> str:
    """Render solution steps as newline-terminated ``<Step>`` elements.

    Args:
        steps: Iterable of mappings, each providing the step content under
            the ``'text'`` key. Other keys are ignored.

    Returns:
        One ``<Step>...</Step>`` line per step, in input order, each followed
        by a newline. An empty iterable yields the empty string.

    Raises:
        KeyError: If a step mapping has no ``'text'`` key.
    """
    # Build the output with a single join instead of growing a string with
    # ``+=`` on every iteration.
    return "".join(f"<Step>{step['text']}</Step>\n" for step in steps)

In [None]:
import ast

def evaluate_sample_with_textgrad_textual_verifier(row_data):
    """Run the textual verifier on one dataset row and print what it produces.

    Args:
        row_data: Mapping for a single row. ``row_data['problem']`` is the
            question text; ``row_data['steps']`` is a string containing a
            Python literal that is parsed with ``ast.literal_eval`` and then
            passed to ``format_steps``.

    Returns:
        None. The result dictionary sketched at the end of the body is still
        commented out, so nothing is returned to the caller.
    """
    question = row_data['problem']
    parsed_steps = ast.literal_eval(row_data['steps'])
    steps_markup = format_steps(parsed_steps)

    # Echo the inputs so they appear alongside the verifier output.
    print(question)
    print(steps_markup)

    solution = Variable(
        steps_markup,
        requires_grad=True,
        role_description=f"Solution to the math question: {question}",
    )
    verification_prompt = Variable(
        "You will evaluate the solution to a math question.",
        requires_grad=False,
        role_description="system prompt",
    )

    # The same `solution` variable is supplied as both the instance and the
    # calculation to verify.
    verifier = TextualVerifierExperiment(
        verifier_engine=engine,
        step_eval_iterations=3,
        logger=True,
    )
    outcome = verifier.verify(
        instance=solution,
        prompt=verification_prompt,
        calculation=solution,
    )

    print(outcome.value)

    # Result record, not enabled yet:
    # result = {
    #     "id": row_data["id"],
    #     "raw_solution": row_data["raw_solution"],
    #     "correct_answer": row_data["correct_answer"],
    #     "source": row_data["source"],
    #     "subject": row_data["subject"]
    # }

    # return result

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# Accumulates every non-None value returned by the worker function.
results = []
# Wall-clock start; only read by the commented-out timing print at the bottom.
start_time = time.time()

# Fan the selected rows out to a thread pool. The `with` block waits for all
# submitted tasks to finish before exiting.
with ThreadPoolExecutor(max_workers=128) as executor:
    # Submit one task per row of the slice `sample[26:27]` (at most one row),
    # passing each row to the worker as a plain dict.
    futures = [
        executor.submit(evaluate_sample_with_textgrad_textual_verifier, row.to_dict()) 
        for _, row in sample[26:27].iterrows()
    ]
    
    # Consume futures in completion order, with tqdm showing progress.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        # future.result() re-raises any exception raised inside the worker.
        result = future.result()
        # Workers that return None contribute nothing to `results`.
        if result is not None:
            results.append(result)

# Persisting the collected results is currently disabled:
# experiment_df = pd.DataFrame(results)

# print(f"Completed in {time.time() - start_time:.1f} seconds")
# experiment_df.to_csv('results/prm800k-03-algo3-clean-result.csv', index=False)