# Evaluation TextGrad with TextualVerifier in Loss

In [1]:
import re
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.loss import TextLoss
from textgrad.optimizer import TextualGradientDescent
from textgrad.verifier import TextualVerifierWithTracker


## Load Dataset

In [2]:
initial_solution = pd.read_csv("datasets/initial-solution.csv")
initial_solution

Unnamed: 0,id,formatted_question,raw_solution,correct_answer,source,subject
0,2,Answer the following multiple choice question....,Here's how we can determine the number of carb...,A,GPQA-Diamond,-
1,4,Answer the following multiple choice question....,Maxwell's equations in our universe are:\n\n1....,A,GPQA-Diamond,-
2,8,Answer the following multiple choice question....,Here's how we can analyze the results and dete...,B,GPQA-Diamond,-
3,1,Answer the following multiple choice question....,The energy-time uncertainty principle states t...,A,GPQA-Diamond,-
4,22,Answer the following multiple choice question....,The question asks about the oxidizing power of...,D,GPQA-Diamond,-
...,...,...,...,...,...,...
407,394,Answer the following multiple choice question....,The police car is moving towards the wall. Le...,B,MMLU-CP,college_physics
408,384,Answer the following multiple choice question....,Here's how we can solve this problem:\n\n1. **...,A,MMLU-CP,college_physics
409,404,Answer the following multiple choice question....,The diffraction of electrons by a crystal latt...,A,MMLU-CP,college_physics
410,390,Answer the following multiple choice question....,Here's how we can solve this problem:\n\n1. **...,D,MMLU-CP,college_physics


## Setup Engine

In [3]:
engine = get_engine("gemini-1.5-pro")
tg.set_backward_engine("gemini-1.5-pro", override=True)

  from .autonotebook import tqdm as notebook_tqdm


## Experiment Iterations

In [4]:
def extract_answer(text):
    # Now extract correctly
    match = re.search(r"(?i)Answer\s*:\s*([A-D])", text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    return "Z" # Z -> means for None (Incorrect Answer)

In [5]:
def extract_answer_llm(question, final_solution):
    prompt = f"""
    <Question>
    {question}
    </Question>
    
    <Solution>
    {final_solution}
    </Solution>

    <Task>
    Based on last step of solution, which letter (ABCD) selected in question options?
    Response MUST ONLY in 1 letter where LETTER is one of ABCD!
    </Task>
    """
    answer = engine.generate(prompt)
        
    return answer

In [6]:
def evaluate(row_data, engine):
    try:
        match = initial_solution[initial_solution["id"] == row_data["id"]]
        if match.empty:
            return None  # or raise error
        formatted_question = match.iloc[0]["formatted_question"]
        
        solution = Variable(row_data["raw_solution"],
                            requires_grad=True,
                            role_description=f"Solution to the math question: {formatted_question}")
        loss_system_prompt = Variable("""You will evaluate a solution to a math question. 
                                        Do not attempt to solve it yourself, do not give a solution, 
                                        only identify errors. Be super concise.""",
                                        requires_grad=False,
                                        role_description="system prompt")
        verification_task_prompts = [
            """
            1. When using $instruction to $instance, I got the loss $calculation.
            2. Evaluate the loss $calculation value correctly reflects the performance of the instance.
            3. If the loss $calculation is incorrect or inconsistent, provide the corrected version of the loss $calculation. 
            4. Do NOT calculate the solution/instance, evaluate $calculation ONLY.
            """
        ]

        optimizer = TextualGradientDescent([solution])
        loss = TextLoss(loss_system_prompt, engine=engine)
        loss_value = loss(solution) # Forward method in Loss Function
        original_loss = loss_value.value

        # TextualVerifier
        verifier = TextualVerifierWithTracker(
            verifier_engine=engine, 
            use_cot_generation=False,
            use_step_breakdown=False,
            verification_task_prompts=verification_task_prompts,
            enable_logging=False,
        )

        verified_result = verifier.verify(instance=solution,
                                        instruction=loss_system_prompt,
                                        calculation=loss_value)

        tracker_data = verifier.get_tracker() # Tracker data     

        loss_value.set_value(verified_result.value) 
        
        loss_value.backward()
        optimizer.step()

        final_solution_answer = extract_answer(solution.value)
        if final_solution_answer == "Z":
            final_solution_answer = extract_answer_llm(
                question=formatted_question, 
                final_solution=solution.value)

        return {
            "id": row_data["id"],
            "source": row_data["source"],
            "subject": row_data["subject"],
            "question": formatted_question,
            "correct_answer": row_data["correct_answer"],
            "original_solution": row_data["raw_solution"],
            "original_loss": original_loss,
            "verified_loss": verified_result,
            "optimized_solution": solution.value,
            "verified_optimized_solution": "",
            "final_solution": solution.value,
            "final_solution_answer": final_solution_answer,
            "success": True,
            "error_message": None,
            "processing_time_ms": tracker_data['processing_time_ms'],
            "total_llm_calls": tracker_data['total_llm_calls'],
            "total_input_tokens": tracker_data['total_input_tokens'],
            "total_output_tokens": tracker_data['total_output_tokens'],
            "result": tracker_data
        }
        
    except Exception as e:
        # Return error information
        return {
            "id": row_data["id"],
            "source": "",
            "subject": "",
            "question": "",
            "correct_answer": "",
            "original_solution": "",
            "original_loss": "",
            "verified_loss": "",
            "optimized_solution": "",
            "verified_optimized_solution": "",
            "final_solution": "",
            "final_solution_answer": "", 
            "success": False,
            "error_message": str(e),
            "processing_time_ms": 0,
            "total_llm_calls": 0,
            "total_input_tokens": 0,
            "total_output_tokens": 0,
            "result": {}
        }

In [7]:
def run_evaluation():
    results = []
    start_time = time.time()
    
    with ThreadPoolExecutor(max_workers=32) as executor:
        futures = [
            executor.submit(evaluate, row.to_dict(), engine) 
            for _, row in initial_solution.iterrows()  
        ]
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                result = future.result(timeout=None)
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f"Task failed: {e}")
                error_result = {
                    'success': False,
                    'error_message': str(e),
                    'result': {}
                }
                results.append(error_result)

    experiment_df = pd.DataFrame(results)
    print(f"Completed in {time.time() - start_time:.1f} seconds")
    experiment_df.to_csv('results/textgrad-tv-l.csv', index=False)
    
    return experiment_df

In [8]:
run_evaluation()

Processing:   0%|          | 1/412 [00:00<01:26,  4.77it/s]

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                        Do not attempt to solve it yourself, do not give a solution, \n                                        only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> Here's how to solve this problem:\n\n1. **Understand the concept:** EDTA is a chelating agent, meaning it binds tightly to metal ions like calcium. The given Kf (formation constant) value is very large, indicating that the complex formation is highly favored.  This means most of the calcium will be in the Ca-EDTA complex form.\n\n2. **Write the equilibrium expression:** The equilibrium for the formation of the Ca-EDTA complex can be written as:\n   Ca¬≤‚Å∫ + EDTA‚Å¥‚Åª  <=> [Ca-EDTA]¬≤‚Åª\n   Kf = [Ca-EDTA]¬≤‚Åª / ([Ca¬≤‚Å∫][EDTA‚Å¥‚Åª])\n\n3. **Set up the variables:** Let 'x' be the concentration of free Ca¬≤‚Å∫ ions. Since the complex is 1:1, the concentration

Processing:  16%|‚ñà‚ñå        | 66/412 [00:00<00:01, 195.96it/s]

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                        Do not attempt to solve it yourself, do not give a solution, \n                                        only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> Here's how we can approach this question:\n\n1. **Identify the reactants and products:** We have A (solid), B (gas), C (bright red product), D (gas), E (hazardous product), F (strong acid), G (weak acid), and H (solvent).\n\n2. **Analyze the reactions:**\n   - A + 8B ‚Üí C\n   - C + 2D ‚Üí E\n   - C + H‚ÇÇO ‚Üí A + F + G\n   - D + B ‚Üí H\n\n3. **Deduce possible identities:** Since we're dealing with symmetry, the question likely involves simple molecules. Let's consider some possibilities:\n   - **B is likely oxygen (O‚ÇÇ):**  Reacting with 8 equivalents suggests a high oxidation state.\n   - **A could be sulfur (S):** Sulfur reacts with oxygen to form various 

Processing:  28%|‚ñà‚ñà‚ñä       | 115/412 [00:00<00:01, 218.10it/s]

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                        Do not attempt to solve it yourself, do not give a solution, \n                                        only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> The target molecule is methyl 2-propyl-1,2,4a,5,6,7,8,8a-octahydronaphthalene-1-carboxylate. This is a bicyclic structure with a propyl group and a methyl carboxylate group on the same carbon (C1).  The rest of the ring system suggests a Diels-Alder reaction is involved in its synthesis.\n\nLet's analyze the options:\n\n* **A) methyl (2E,8E,10E)-tetradeca-2,8,10-trienoate:** This linear triene could potentially cyclize, but it would form a different ring system and wouldn't directly give the desired substitution pattern.\n\n* **B) 1-vinylcyclohex-1-ene and methyl hex-2-ynoate:** 1-vinylcyclohex-1-ene is a diene, and methyl hex-2-ynoate is a dienophile (alkyne). 

Processing:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 196/412 [00:00<00:00, 302.99it/s]

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                        Do not attempt to solve it yourself, do not give a solution, \n                                        only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> Here's how we can solve this problem:\n\n1. **Find the eigenvalues and eigenvectors of the operator P.**  The eigenvalues of P are the possible outcomes of a measurement.  We need to solve the characteristic equation det(P - ŒªI) = 0, where I is the identity matrix.\n\n2. **Express the state vector as a linear combination of the eigenvectors.** This will allow us to determine the probability of each measurement outcome.\n\n3. **Calculate the probability of measuring 0.** The probability of measuring a particular eigenvalue is the square of the absolute value of the coefficient of the corresponding eigenvector in the linear combination.\n\nLet's perform the calcu

Processing:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 291/412 [00:01<00:00, 393.17it/s]

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                        Do not attempt to solve it yourself, do not give a solution, \n                                        only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> Linear logistic regression can be used with linearly separable and non-linearly separable data. So A is incorrect.\n\nLinear soft margin SVM can be used with linearly separable and non-linearly separable data. So B is incorrect.\n\nThe centroid method can be used with linearly separable and non-linearly separable data. So C is incorrect.\n\nLinear hard-margin SVM can only be used with linearly separable data. If the data are not linearly separable, the constraints in the hard-margin SVM optimization problem are not feasible, and no solution can be found.\n\nAnswer: D\n </LM_INPUT>\n\n<LM_OUTPUT> <VERIFIED>The analysis of linear logistic regression, linear soft m

Processing:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 393/412 [00:01<00:00, 442.27it/s]

['Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                        Do not attempt to solve it yourself, do not give a solution, \n                                        only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> The proton is initially accelerated through a potential difference V. This gives it a kinetic energy equal to the change in potential energy, which is qV, where q is the charge of the proton.  So, initially, 1/2 * m * v^2 = qV, where m is the mass of the proton and v is its velocity in the z-direction.\n\nWhen the proton enters the region with the electric and magnetic fields, the electric field exerts a force qE in the +x-direction, and the magnetic field exerts a force qvB in the -x-direction (using the right-hand rule for the cross product v x B, where v is in the +z-direction and B is in the +y-direction). Since the proton\'s trajectory is not affected, thes

Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 412/412 [00:19<00:00, 21.34it/s] 


Completed in 19.5 seconds


Unnamed: 0,id,source,subject,question,correct_answer,original_solution,original_loss,verified_loss,optimized_solution,verified_optimized_solution,final_solution,final_solution_answer,success,error_message,processing_time_ms,total_llm_calls,total_input_tokens,total_output_tokens,result
0,30,GPQA-Diamond,-,Answer the following multiple choice question....,B,The energy of the emitted light is given as 2....,The final answer is incorrect. The complementa...,<VERIFIED>The complementary color of green is ...,The energy of the emitted light is given as 2....,,The energy of the emitted light is given as 2....,D,True,,41.522949,1,308,31,{'setup': {'verifier_engine': '<textgrad.engin...
1,10,GPQA-Diamond,-,Answer the following multiple choice question....,D,We need to determine which planet has the high...,* **Incorrect comparison:** The answer should ...,<VERIFIED>* **Incorrect comparison:** The answ...,We need to determine which planet has the high...,,We need to determine which planet has the high...,C,True,,38.276855,1,471,101,{'setup': {'verifier_engine': '<textgrad.engin...
2,11,GPQA-Diamond,-,Answer the following multiple choice question....,D,Let's analyze each statement:\n\nA. This state...,No errors identified.\n,<VERIFIED>D\n</VERIFIED>,Let's analyze each statement:\n\nA. This state...,,Let's analyze each statement:\n\nA. This state...,D,True,,37.229736,1,298,1,{'setup': {'verifier_engine': '<textgrad.engin...
3,22,GPQA-Diamond,-,Answer the following multiple choice question....,D,The question asks about the oxidizing power of...,The analysis of thermodynamic influence (first...,<VERIFIED>The calculation correctly identifies...,The question asks about the oxidizing power of...,,The question asks about the oxidizing power of...,D,True,,44.541016,1,261,21,{'setup': {'verifier_engine': '<textgrad.engin...
4,33,GPQA-Diamond,-,Answer the following multiple choice question....,C,"* The reaction is a Diels-Alder reaction, a [4...",The analysis incorrectly assumes the methyl gr...,<VERIFIED>The analysis correctly identifies th...,"* The reaction is a Diels-Alder reaction, a [4...",,"* The reaction is a Diels-Alder reaction, a [4...",A,True,,32.378174,1,312,45,{'setup': {'verifier_engine': '<textgrad.engin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,34,GPQA-Diamond,-,Answer the following multiple choice question....,B,Here's how we can determine the products of th...,* **A:** The major product would likely arise ...,<VERIFIED>* **A:** The major product would lik...,Here's how we can determine the products of th...,,Here's how we can determine the products of th...,B\n,True,,43.729980,1,434,74,{'setup': {'verifier_engine': '<textgrad.engin...
408,118,GPQA-Diamond,-,Answer the following multiple choice question....,C,"The reaction between 4,4-dimethylcyclopent-1-e...",1. **Incorrect regiochemistry:** The hydroxyl ...,<VERIFIED>1. **Incorrect regiochemistry:** The...,"The reaction between 4,4-dimethylcyclopent-1-e...",,"The reaction between 4,4-dimethylcyclopent-1-e...",None of the provided options are correct. The...,True,,14.614990,1,560,127,{'setup': {'verifier_engine': '<textgrad.engin...
409,62,GPQA-Diamond,-,Answer the following multiple choice question....,C,Here's how we can determine the correct sequen...,* **Incorrect analysis of Modified Option C:**...,<VERIFIED>* **Incorrect analysis of Modified O...,Here's how we can determine the correct sequen...,,Here's how we can determine the correct sequen...,C\n,True,,2.991211,1,661,72,{'setup': {'verifier_engine': '<textgrad.engin...
410,358,MMLU-CP,college_physics,Answer the following multiple choice question....,C,The particle's lifetime in its rest frame is 2...,No errors.\n,<VERIFIED>$d = vt = (0.60c)(2.5 \times 10^{-3}...,The particle's lifetime in its rest frame is 2...,,The particle's lifetime in its rest frame is 2...,C\n,True,,6.323242,1,257,25,{'setup': {'verifier_engine': '<textgrad.engin...
