# Solution Optimization Evaluation TV4 TextGrad

In [7]:
import pandas as pd
import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.optimizer import TextualGradientDescent
from textgrad.verifier import TextualVerifierV4
from textgrad.loss import TextLoss

## Load Datasets

In [8]:
# Location of the CSV holding the initial (unoptimised) solutions.
INITIAL_SOLUTION_CSV = "csv/initial_solution.csv"

# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; `index_col=0` would drop it —
# confirm nothing relies on that column before changing the read call.
initial_solution = pd.read_csv(INITIAL_SOLUTION_CSV)

# Bare expression so the notebook renders the frame as this cell's output.
initial_solution

Unnamed: 0,id,formatted_question,raw_solution,correct_answer,source,subject
0,2,Answer the following multiple choice question....,Here's how we can determine the number of carb...,A,GPQA-Diamond,-
1,4,Answer the following multiple choice question....,Maxwell's equations in our universe are:\n\n1....,A,GPQA-Diamond,-
2,8,Answer the following multiple choice question....,Here's how we can analyze the results and dete...,B,GPQA-Diamond,-
3,1,Answer the following multiple choice question....,The energy-time uncertainty principle states t...,A,GPQA-Diamond,-
4,22,Answer the following multiple choice question....,The question asks about the oxidizing power of...,D,GPQA-Diamond,-
...,...,...,...,...,...,...
407,394,Answer the following multiple choice question....,The police car is moving towards the wall. Le...,B,MMLU-CP,college_physics
408,384,Answer the following multiple choice question....,Here's how we can solve this problem:\n\n1. **...,A,MMLU-CP,college_physics
409,404,Answer the following multiple choice question....,The diffraction of electrons by a crystal latt...,A,MMLU-CP,college_physics
410,390,Answer the following multiple choice question....,Here's how we can solve this problem:\n\n1. **...,D,MMLU-CP,college_physics


In [9]:
# Build the test set: at most the first 50 rows of each source dataset
# (so at most 150 rows in total; a source with fewer rows contributes fewer).
source_is = initial_solution['source'].eq

df_gpqa = initial_solution.loc[source_is('GPQA-Diamond')].head(50)
df_mmlu_ml = initial_solution.loc[source_is('MMLU-ML')].head(50)
df_mmlu_cp = initial_solution.loc[source_is('MMLU-CP')].head(50)

# Stack the three subsets and renumber the rows from 0.
df_test = pd.concat((df_gpqa, df_mmlu_ml, df_mmlu_cp), ignore_index=True)

df_test

Unnamed: 0,id,formatted_question,raw_solution,correct_answer,source,subject
0,2,Answer the following multiple choice question....,Here's how we can determine the number of carb...,A,GPQA-Diamond,-
1,4,Answer the following multiple choice question....,Maxwell's equations in our universe are:\n\n1....,A,GPQA-Diamond,-
2,8,Answer the following multiple choice question....,Here's how we can analyze the results and dete...,B,GPQA-Diamond,-
3,1,Answer the following multiple choice question....,The energy-time uncertainty principle states t...,A,GPQA-Diamond,-
4,22,Answer the following multiple choice question....,The question asks about the oxidizing power of...,D,GPQA-Diamond,-
...,...,...,...,...,...,...
145,339,Answer the following multiple choice question....,The proton is initially accelerated through a ...,D,MMLU-CP,college_physics
146,388,Answer the following multiple choice question....,Einstein's theory of the photoelectric effect ...,D,MMLU-CP,college_physics
147,364,Answer the following multiple choice question....,We are given that the mass of object B is twic...,C,MMLU-CP,college_physics
148,380,Answer the following multiple choice question....,"The electric displacement current, denoted by ...",A,MMLU-CP,college_physics


## Experiment

In [10]:
# Single model name shared by the forward engine and the backward
# (gradient) engine, so the two cannot drift apart.
MODEL_NAME = "gemini-1.5-pro"

engine = get_engine(MODEL_NAME)
tg.set_backward_engine(MODEL_NAME, override=True)

In [11]:
def evaluate_with_raw_textgrad(row_data, num_iterations=5):
    """Refine one solution with TextGrad, passing each loss through TextualVerifierV4.

    Args:
        row_data: Mapping with the keys "id", "raw_solution", "correct_answer",
            "source" and "subject" (for example one row of `initial_solution`
            converted with `to_dict()`).
        num_iterations: Number of optimisation steps to run. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        A dict with the identifying fields copied from `row_data` plus one
        "solution_<i>" entry for i = 1..num_iterations holding the solution
        text after each optimiser step. Returns None when `row_data["id"]`
        has no matching row in the module-level `initial_solution` frame.
    """
    # The question text is looked up in the global frame by id.
    match = initial_solution[initial_solution["id"] == row_data["id"]]
    if match.empty:
        return None  # caller is expected to skip missing ids
    formatted_question = match.iloc[0]["formatted_question"]

    result = {
        "id": row_data["id"],
        "raw_solution": row_data["raw_solution"],
        "correct_answer": row_data["correct_answer"],
        "source": row_data["source"],
        "subject": row_data["subject"],
    }

    # The variable being optimised: its value is rewritten by optimizer.step().
    solution = Variable(
        row_data["raw_solution"],
        requires_grad=True,
        role_description=f"Solution to the math question: {formatted_question}",
    )

    # Same text as the original triple-quoted literal, including the trailing
    # spaces and the 36-space continuation indent, so prompts match earlier
    # runs; spelled out explicitly so editors cannot strip the whitespace.
    prompt_indent = " " * 36
    loss_system_prompt = Variable(
        "You will evaluate a solution to a math question. \n"
        + prompt_indent
        + "Do not attempt to solve it yourself, do not give a solution, \n"
        + prompt_indent
        + "only identify errors. Be super concise.",
        requires_grad=False,
        role_description="system prompt",
    )

    optimizer = TextualGradientDescent([solution])
    loss = TextLoss(loss_system_prompt, engine=engine)

    # TextualVerifierV4
    verifier = TextualVerifierV4(verifier_engine=engine, step_eval_iterations=3, logger=False)

    for i in range(1, num_iterations + 1):
        optimizer.zero_grad()  # Clean gradients
        loss_result = loss(solution)

        # Replace the raw loss text with the verifier's output before
        # back-propagating, so the gradient is computed from the verified text.
        verified_result = verifier.verify(instance=solution,
                                          prompt=loss_system_prompt,
                                          calculation=loss_result)
        loss_result.set_value(verified_result.value)

        loss_result.backward()
        optimizer.step()
        # Snapshot the solution after this step (1-based key suffix).
        result[f"solution_{i}"] = solution.value

    return result

## Running Evaluation

### TV TextGrad

In [12]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from tqdm import tqdm
import time

# Where the per-row optimisation results are written.
OUTPUT_CSV = Path('results/tv4_textgrad.csv')

results = []
failures = []  # (row id, exception) for every task that raised
start_time = time.time()

# NOTE(review): only the first 20 rows of `initial_solution` are evaluated
# here; the `df_test` frame built earlier is not used — confirm this is the
# intended sample.
with ThreadPoolExecutor(max_workers=128) as executor:
    # Submit all tasks, remembering which row id each future belongs to so a
    # failure can be attributed to its row.
    futures = {
        executor.submit(evaluate_with_raw_textgrad, row.to_dict()): row["id"]
        for _, row in initial_solution[:20].iterrows()
    }

    # Use tqdm for progress tracking
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        try:
            result = future.result()
        except Exception as exc:
            # One failed task must not discard the results of the others:
            # record it and keep collecting.
            failures.append((futures[future], exc))
            continue
        if result is not None:
            results.append(result)

raw_textgrad = pd.DataFrame(results)

print(f"Completed in {time.time() - start_time:.1f} seconds")
if failures:
    print(f"{len(failures)} of {len(futures)} tasks failed:")
    for row_id, exc in failures:
        print(f"  id={row_id}: {type(exc).__name__}: {exc}")

# Create the output directory first so a long run is not lost to a missing folder.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
raw_textgrad.to_csv(OUTPUT_CSV, index=False)

Processing:   5%|▌         | 1/20 [00:00<00:03,  4.80it/s]

CALCULATION Dimethyl fumarate *does* have a chiral center when part of a polymer chain.  The prompt does not specify that the molecules are monomers.

CALCULATION The final answer is incorrect. The complementary color of green is magenta, not red.
CALCULATION The Corey-Chaykovsky reaction adds a methylene group (CH2) across the carbonyl, not just a single carbon.  Therefore, the final product has 12 carbons, not 11.

CALCULATION The analysis of thermodynamic influence is correct, but the kinetic analysis is flawed.  Proton transfer *is* involved, but higher proton concentration (acidic conditions) doesn't necessarily mean *faster* proton transfer in the specific context of oxygen reduction.  The reasoning given is insufficient to conclude "slower" kinetics in base.

CALCULATION The analysis incorrectly assumes the methyl groups will be cis.  They will be trans. The *syn* relationship between the oxygen bridge and sulfur is also incorrect.  They will be *anti*.

CALCULATION No errors id

Processing:  55%|█████▌    | 11/20 [00:50<00:42,  4.68s/it]

CALCULATION No errors.

['Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> The process described is pair production from the collision of two photons, one of which is a high-energy gamma-ray and the other is a CMB photon.  The threshold energy for this process is given by the equation:\n\n$E_{\\gamma} E_{CMB} (1 - \\cos\\theta) = 2(m_e c^2)^2$\n\nwhere $E_{\\gamma}$ is the energy of the gamma-ray, $E_{CMB}$ is the energy of the CMB photon, $\\theta$ is the angle between the two photons, and $m_e$ is the rest mass of the electron.\n\nFor head-on collisions ($\\theta = 180^\\circ$), the equation simplifies to:\n\n$E_{\\gamma} E_{CMB} (1 - (-1)) = 2(m_e c^2)^2$\n$2 E_{\\gamma} E_{CMB} = 2(m_e c^2)^2$\n$E_{\\gamma} E_{CMB} 

Processing:  60%|██████    | 12/20 [01:09<00:51,  6.45s/it]

CALCULATION The reasoning and answer are correct. There are no errors.

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> ChIP-seq identifies the binding sites of DNA-associated proteins.  The addition of DSG to PFA fixation results in more extensive protein-protein crosslinking, which can disrupt protein complexes.  IKAROS, a transcription factor, often functions within protein complexes at active promoters and enhancers.  Therefore, the disappearing peaks observed upon DSG addition are most likely to be found at these locations where IKAROS-containing protein complexes are disrupted by the crosslinking.  While DSG may cause some non-specific crosslinking at other genomic locations (A, C, and D), the most pronounced e

Processing:  65%|██████▌   | 13/20 [01:39<01:08,  9.73s/it]

CALCULATION No errors.  Your analysis of each statement and the overall answer are correct.

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> A. Correct.  Ribosomal frameshifting is a mechanism used by viruses like SARS-CoV-2 to produce multiple proteins from overlapping open reading frames (ORFs). This -1 frameshift, facilitated by slippery sequences and a pseudoknot structure in the mRNA, is essential for generating two key polyproteins pp1a and pp1ab.  The statement correctly points out that the SARS-CoV-2 frameshifting mechanism is largely similar to that of SARS-CoV.\n\nB. Correct. The efficiency of ribosomal frameshifting is influenced by the structural dynamics of the pseudoknot.  The statement accurately refle

Processing:  70%|███████   | 14/20 [02:29<01:39, 16.60s/it]

CALCULATION The analysis of condition 2 is flawed.  It's not a *consequence* of short interaction time.  "Transparency" is not the important factor; it's the *short interaction time* itself that matters.  The explanation mixes up the underlying principle (short interaction time) with a consequence (negligible influence of other nucleons *during* the short interaction).

['Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> The first reaction, A + H2SO4 ---> 2,8-dimethylspiro[4.5]decan-6-one, appears to be an acid-catalyzed dehydration of an alcohol to a ketone.  Therefore, A is likely 2,8-dimethylspiro[4.5]decan-6-ol.\n\nThe second reaction, B + BuLi + H+ ---> 4-methyl-1-phenylpent-3-en-1-ol, involves BuLi, which acts as a

Processing:  75%|███████▌  | 15/20 [02:40<01:18, 15.63s/it]

CALCULATION * **Compound 6 analysis is incorrect.**  (1s,3s,5s)-cyclohexane-1,3,5-triol has a plane of symmetry and is therefore achiral.  It would *not* exhibit optical activity.
* **The lowercase 's' vs uppercase 'S' is incorrectly interpreted.** Lowercase s is not a valid stereochemical descriptor.  The use of lowercase likely represents a typo, and the intent was almost certainly uppercase S.  Even with all 'S' configurations, the molecule is still achiral.
* **The final count of optically active compounds is incorrect** due to the errors described above.


Processing:  80%|████████  | 16/20 [02:45<00:53, 13.26s/it]

CALCULATION The charge of the Ca-EDTA complex is incorrectly given as 2-.  It should be neutral.  This affects the equilibrium expression. Also, the problem states the *initial* concentration of Ca-EDTA is 0.02 M, not the equilibrium concentration.

CALCULATION Reaction B analysis is incorrect. The proposed nucleophile, after deprotonation, would attack the *alkene* portion of methyl 2-cyclopentylidene-2-phenylacetate, not the ester carbonyl.  The analysis also incorrectly identifies the major product of a Claisen condensation, not a Michael addition, for the reaction between the enolate of ethyl 2-ethylbutanoate and ethyl 2-ethylbutanoate.

['Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> 1. **(Z)-1-chloro-2-methylbu

Processing:  85%|████████▌ | 17/20 [03:04<00:43, 14.59s/it]

CALCULATION Butyllithium (BuLi) is a strong base, but it's primarily used as a **nucleophile**.  It will not deprotonate the alpha-hydrogen of a ketone as easily as described.  Instead, it will add to the carbonyl carbon.  The analysis of the second reaction completely misinterprets the role of BuLi.

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> The Michael reaction involves the addition of a nucleophile to an α,β-unsaturated carbonyl compound. Let's analyze each reaction:\n\n**Reaction A:**\n* **Nucleophile:** methyl 2-oxocyclohexane-1-carboxylate (specifically, the enolate formed by deprotonation at the alpha position to the ester group using NaOEt)\n* **Michael acceptor:** 2,4-dimethyl-1-(vinylsulfinyl)benzene 

Processing:  95%|█████████▌| 19/20 [03:13<00:09,  9.80s/it]

["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_INPUT> The first reaction, A + H2SO4 ---> 2,8-dimethylspiro[4.5]decan-6-one, involves sulfuric acid, a strong dehydrating agent.  This suggests the reaction is a dehydration of an alcohol to form a ketone.  Therefore, reactant A is 2,8-dimethylspiro[4.5]decan-6-ol.\n\nThe second reaction, B + BuLi + H+ ---> 4-methyl-1-phenylpent-3-en-1-ol, is more complex.  BuLi (butyllithium) acts as a strong base.  Considering the product and the available reactants, BuLi will deprotonate the most acidic hydrogen in reactant B.\n\nIf reactant B is 4-methyl-1-phenylpent-3-en-1-one (option A), BuLi would deprotonate the alpha-hydrogen (the hydrogen on the carbon next to the carbonyl group).  T

Processing: 100%|██████████| 20/20 [03:21<00:00, 10.10s/it]

Completed in 201.9 seconds



