# Deep Experiment Evaluaton TV2 TextGrad

## Setup

In [40]:
# Setup and Imports
import random
import pandas as pd
import numpy as np
import textgrad as tg
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from collections import Counter
# TextGrad
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.verifier import TextualVerifierV2Analysis
from textgrad.loss import TextLoss

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

## Load Datasets

In [41]:
# Dataset Loading Function
def load_prm800k_safely():
    """Load PRM800K with error handling"""
    
    try:
        print("Attempting to load PRM800K dataset...")
        
        # Loading with streaming first (safer for large datasets)
        dataset = load_dataset("tasksource/PRM800K", streaming=True)
        print("[V] Successfully loaded PRM800K in streaming mode")
        return dataset, "streaming"
        
    except Exception as e:
        print(f"[X] Streaming failed: {e}")

In [42]:
# Load dataset
dataset_stream, load_method = load_prm800k_safely()

Attempting to load PRM800K dataset...
[V] Successfully loaded PRM800K in streaming mode


In [43]:
dataset_stream

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        num_shards: 2
    })
    test: IterableDataset({
        features: Unknown,
        num_shards: 2
    })
})

In [44]:
samples = []
for example in dataset_stream["train"].take(3):
    samples.append(example)

df_samples = pd.DataFrame(samples)
df_samples

Unnamed: 0,labeler,timestamp,generation,is_quality_control_question,is_initial_screening_question,question,label
0,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-13T18:55:54.496450,,False,False,{'problem': 'How many seconds are in 7.8 minut...,{'steps': [{'completions': [{'text': '7.8 minu...
1,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-17T16:56:51.323252,,False,False,{'problem': 'How many positive two-digit integ...,"{'steps': [{'completions': [{'text': ""Let's ca..."
2,e90a38f3-3135-4465-87af-3e6322e3d772,2022-07-02T18:33:27.255302,,False,False,{'problem': 'The fifth and eighth terms of a g...,{'steps': [{'completions': [{'text': 'So we ha...


In [45]:
df_samples.to_csv('dataset/prm800k.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'csv'

## Setup Experiment

In [None]:
# Setup Engine
engine = get_engine("gemini-1.5-pro")
tg.set_backward_engine("gemini-1.5-pro", override=True)

In [None]:
# Evaluate with TextualVerifierV2
def evaluate_with_tv2(row_data):
    match = initial_solution[initial_solution["id"] == row_data["id"]]
    if match.empty:
        return None  # or raise error
    formatted_question = match.iloc[0]["formatted_question"]
    result = {
        "id": row_data["id"],
        "raw_solution": row_data["raw_solution"],
        "correct_answer": row_data["correct_answer"],
        "source": row_data["source"],
        "subject": row_data["subject"]
    }
    
    solution = Variable(row_data["raw_solution"],
                    requires_grad=True,
                    role_description=f"Solution to the math question: {formatted_question}")
    loss_system_prompt = Variable("""You will evaluate a solution to a math question. 
                                    Do not attempt to solve it yourself, do not give a solution, 
                                    only identify errors. Be super concise.""",
                                    requires_grad=False,
                                    role_description="system prompt")
    loss = TextLoss(loss_system_prompt, engine=engine)

    # TextualVerifierV2
    verifier = TextualVerifierV2Analysis(verifier_engine=engine, step_eval_iterations=3, logger=False)
    loss_result = loss(solution)
    verified_result = verifier.verify(instance=solution, 
                                    prompt=loss_system_prompt,
                                    calculation=loss_result)

    print(verified_result)
    result[f"verified_solution"] = solution.value
    return result

## Evaluate

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
import os

results = []
start_time = time.time()
optimal_workers = min(32, os.cpu_count() * 4)

with ThreadPoolExecutor(max_workers=optimal_workers) as executor:
    # Submit all tasks
    futures = [
        executor.submit(evaluate_with_tv2, row.to_dict()) 
        for _, row in initial_solution[:1].iterrows()
    ]
    
    # Use tqdm for progress tracking
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        result = future.result()
        if result is not None:
            results.append(result)

raw_textgrad = pd.DataFrame(results)

print(f"Completed in {time.time() - start_time:.1f} seconds")
raw_textgrad.to_csv('results/tv2_textgrad.csv', index=False)

Processing: 100%|██████████| 1/1 [00:00<00:00,  5.65it/s]

The primary error lies in inconsistently accounting for carbon atoms contributed by the Grignard reagent. In Step 1, only the phenyl group's carbon is considered, neglecting the carbon directly attached to the magnesium bromide.  This oversight then propagates through subsequent steps.  To improve the process, meticulously account for *all* carbon atoms from *each* reactant involved in forming the products.  Specifically, when considering Grignard reactions, remember that the alkyl/aryl group acts as a nucleophile, contributing its entire carbon framework to the product.  Apply this rigorous atom-tracking approach consistently throughout the synthesis pathway to accurately determine the total carbon count in each product.
Completed in 0.2 seconds



