# Evaluating TextGrad with TextualVerifier in the Loss

In [None]:
import re
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.loss import TextLoss
from textgrad.optimizer import TextualGradientDescent
from textgrad.verifier import TextualVerifierWithTracker


## Load Dataset

In [None]:
# Load the initial solutions. `evaluate` and `run_evaluation` read this frame
# as a module-level global, so it must be loaded before they are called.
initial_solution = pd.read_csv("datasets/initial-solution.csv")
# Bare expression: as the last line of a notebook cell it displays the frame.
initial_solution

## Setup Engine

In [None]:
# Engine that `run_evaluation` passes to `evaluate`, where it is used for both
# the TextLoss and the verifier.
engine = get_engine("gemini-1.5-pro")
# Configure the same model as TextGrad's backward engine.
# NOTE(review): override=True presumably replaces an already-configured
# backward engine when the cell is re-run -- confirm against textgrad docs.
tg.set_backward_engine("gemini-1.5-pro", override=True)

## Experiment Iterations

In [None]:
def extract_answer(text):
    """Extract the multiple-choice letter from an ``Answer: X`` marker.

    Args:
        text: Solution text to scan. Non-string values (e.g. ``None``) are
            treated as containing no answer.

    Returns:
        The upper-cased letter ``"A"``-``"D"`` from the first ``Answer: X``
        marker found (matched case-insensitively), or ``"Z"`` when there is
        none. ``"Z"`` is the sentinel for "no answer", which is treated as an
        incorrect answer.
    """
    if not isinstance(text, str):
        return "Z"
    # The trailing \b requires the letter to stand alone, so prose such as
    # "Answer: Definitely ..." is not misread as choice "D".
    match = re.search(r"Answer\s*:\s*([A-D])\b", text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    return "Z"

In [None]:
def evaluate(row_data, engine):
    """Run one verified TextGrad optimization step on a single solution.

    The solution text is scored with a ``TextLoss``, the loss is checked by a
    ``TextualVerifierWithTracker``, the loss value is replaced with the
    verifier's output, and one backward pass plus one optimizer step is then
    applied to the solution.

    Args:
        row_data: Mapping for one dataset row. Reads ``id``, ``raw_solution``,
            ``source``, ``subject`` and ``correct_answer``.
        engine: Engine passed to both ``TextLoss`` and the verifier.

    Returns:
        ``None`` when ``row_data["id"]`` has no matching row in the
        module-level ``initial_solution`` frame. Otherwise a dict describing
        the run: on success ``success`` is ``True`` and the loss, solution and
        tracker fields are filled in; on any exception ``success`` is
        ``False``, ``error_message`` holds ``str(exception)`` and the other
        fields are blank/zero placeholders.
    """
    try:
        match = initial_solution[initial_solution["id"] == row_data["id"]]
        if match.empty:
            return None  # unknown id: the caller skips ``None`` results
        formatted_question = match.iloc[0]["formatted_question"]

        # The solution is the only variable being optimized.
        solution = Variable(row_data["raw_solution"],
                            requires_grad=True,
                            role_description=f"Solution to the math question: {formatted_question}")
        loss_system_prompt = Variable("""You will evaluate a solution to a math question. 
                                        Do not attempt to solve it yourself, do not give a solution, 
                                        only identify errors. Be super concise.""",
                                        requires_grad=False,
                                        role_description="system prompt")
        # Prompt template handed to the verifier; the $-placeholders match the
        # keyword names passed to ``verifier.verify`` below.
        verification_task_prompts = [
            """
            1. When using $instruction to $instance, I got the loss $calculation.
            2. Evaluate the loss $calculation value correctly reflects the performance of the instance.
            3. If the loss $calculation is incorrect or inconsistent, provide the corrected version of the loss $calculation. 
            4. Do NOT calculate the solution/instance, evaluate $calculation ONLY.
            """
        ]

        optimizer = TextualGradientDescent([solution])
        loss = TextLoss(loss_system_prompt, engine=engine)
        loss_value = loss(solution)  # forward pass of the loss
        # Snapshot the loss text now: it is overwritten with the verified
        # text further down.
        original_loss = loss_value.value

        verifier = TextualVerifierWithTracker(
            verifier_engine=engine,
            use_cot_generation=False,
            use_step_breakdown=False,
            verification_task_prompts=verification_task_prompts,
            enable_logging=False,
        )

        verified_result = verifier.verify(instance=solution,
                                          instruction=loss_system_prompt,
                                          calculation=loss_value)

        # Read the tracker right after verification, before the backward pass
        # and optimizer step run.
        tracker_data = verifier.get_tracker()

        # Back-propagate from the verified loss rather than the raw one.
        loss_value.set_value(verified_result.value)

        loss_value.backward()
        optimizer.step()

        return {
            "id": row_data["id"],
            "source": row_data["source"],
            "subject": row_data["subject"],
            "correct_answer": row_data["correct_answer"],
            "original_solution": row_data["raw_solution"],
            "original_loss": original_loss,
            # Store the text, mirroring ``original_loss``, instead of the
            # verifier's result object.
            "verified_loss": verified_result.value,
            "optimized_solution": solution.value,
            "verified_optimized_solution": "",
            "final_solution": solution.value,
            "final_solution_answer": extract_answer(solution.value),
            "success": True,
            "error_message": None,
            "processing_time_ms": tracker_data['processing_time_ms'],
            "total_llm_calls": tracker_data['total_llm_calls'],
            "total_input_tokens": tracker_data['total_input_tokens'],
            "total_output_tokens": tracker_data['total_output_tokens'],
            "result": tracker_data
        }

    except Exception as e:
        # Best-effort failure record so one bad row does not abort the batch.
        # ``.get`` keeps the handler itself from raising when the row has no
        # "id" key (which may be the very reason we ended up here).
        return {
            "id": row_data.get("id"),
            "source": "",
            "subject": "",
            "correct_answer": "",
            "original_solution": "",
            "original_loss": "",
            "verified_loss": "",
            "optimized_solution": "",
            "verified_optimized_solution": "",
            "final_solution": "",
            "final_solution_answer": "",
            "success": False,
            "error_message": str(e),
            "processing_time_ms": 0,
            "total_llm_calls": 0,
            "total_input_tokens": 0,
            "total_output_tokens": 0,
            "result": {}
        }

In [None]:
def run_evaluation(max_workers=32, output_path='results/result-03-textgrad-tv-l.csv'):
    """Evaluate every row of ``initial_solution`` in parallel and save a CSV.

    Each row is submitted to ``evaluate`` on a thread pool together with the
    module-level ``engine``. Results are collected in completion order, so the
    output rows are not guaranteed to follow the dataset order.

    Args:
        max_workers: Size of the thread pool.
        output_path: Destination CSV file. Missing parent directories are
            created so a finished run is not lost to a missing folder.

    Returns:
        A ``pandas.DataFrame`` with one row per collected result. Rows for
        which ``evaluate`` returned ``None`` are omitted; tasks that raised
        are recorded with ``success=False`` and the row's ``id``.
    """
    from pathlib import Path

    results = []
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to its row id so an unexpected failure can still be
        # attributed to a specific row.
        futures = {}
        for _, row in initial_solution.iterrows():
            row_dict = row.to_dict()
            futures[executor.submit(evaluate, row_dict, engine)] = row_dict.get("id")

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                result = future.result()
            except Exception as e:
                # Reached only when the task raised something that
                # ``evaluate`` did not convert into a failure record itself.
                print(f"Task failed: {e}")
                results.append({
                    'id': futures[future],
                    'success': False,
                    'error_message': str(e),
                    'result': {}
                })
            else:
                if result is not None:
                    results.append(result)

    experiment_df = pd.DataFrame(results)
    print(f"Completed in {time.time() - start_time:.1f} seconds")

    # Create the target directory first: failing here would discard the
    # results of every LLM call made above.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    experiment_df.to_csv(destination, index=False)

    return experiment_df

In [None]:
# Run the full experiment. As the last expression in a notebook cell, the
# returned DataFrame is also displayed as the cell output.
run_evaluation()