# Solution Optimization Evaluation TV3 TextGrad

In [1]:
import pandas as pd
import textgrad as tg
from textgrad.engine import get_engine
from textgrad.variable import Variable
from textgrad.optimizer import TextualGradientDescent
from textgrad.verifier import TextualVerifierV4
from textgrad.loss import TextLoss

## Load Datasets

In [2]:
# Read the table of initial (unoptimized) solutions from disk; the bare
# expression on the last line makes the notebook render the frame.
dataset_path = "csv/initial_solution.csv"
initial_solution = pd.read_csv(dataset_path)
initial_solution

Unnamed: 0,id,formatted_question,raw_solution,correct_answer,source,subject
0,2,Answer the following multiple choice question....,Here's how we can determine the number of carb...,A,GPQA-Diamond,-
1,4,Answer the following multiple choice question....,Maxwell's equations in our universe are:\n\n1....,A,GPQA-Diamond,-
2,8,Answer the following multiple choice question....,Here's how we can analyze the results and dete...,B,GPQA-Diamond,-
3,1,Answer the following multiple choice question....,The energy-time uncertainty principle states t...,A,GPQA-Diamond,-
4,22,Answer the following multiple choice question....,The question asks about the oxidizing power of...,D,GPQA-Diamond,-
...,...,...,...,...,...,...
407,394,Answer the following multiple choice question....,The police car is moving towards the wall. Le...,B,MMLU-CP,college_physics
408,384,Answer the following multiple choice question....,Here's how we can solve this problem:\n\n1. **...,A,MMLU-CP,college_physics
409,404,Answer the following multiple choice question....,The diffraction of electrons by a crystal latt...,A,MMLU-CP,college_physics
410,390,Answer the following multiple choice question....,Here's how we can solve this problem:\n\n1. **...,D,MMLU-CP,college_physics


In [3]:
# Evaluation subset: keep at most the first 50 rows of each source dataset
# (up to 150 rows in total; fewer when a source has less than 50 rows).

per_source = {
    source_name: initial_solution.loc[initial_solution["source"] == source_name].head(50)
    for source_name in ("GPQA-Diamond", "MMLU-ML", "MMLU-CP")
}
df_gpqa = per_source["GPQA-Diamond"]
df_mmlu_ml = per_source["MMLU-ML"]
df_mmlu_cp = per_source["MMLU-CP"]

# Stack the three subsets in a fixed order and renumber the rows from 0.
df_test = pd.concat(list(per_source.values()), ignore_index=True)

df_test

Unnamed: 0,id,formatted_question,raw_solution,correct_answer,source,subject
0,2,Answer the following multiple choice question....,Here's how we can determine the number of carb...,A,GPQA-Diamond,-
1,4,Answer the following multiple choice question....,Maxwell's equations in our universe are:\n\n1....,A,GPQA-Diamond,-
2,8,Answer the following multiple choice question....,Here's how we can analyze the results and dete...,B,GPQA-Diamond,-
3,1,Answer the following multiple choice question....,The energy-time uncertainty principle states t...,A,GPQA-Diamond,-
4,22,Answer the following multiple choice question....,The question asks about the oxidizing power of...,D,GPQA-Diamond,-
...,...,...,...,...,...,...
145,339,Answer the following multiple choice question....,The proton is initially accelerated through a ...,D,MMLU-CP,college_physics
146,388,Answer the following multiple choice question....,Einstein's theory of the photoelectric effect ...,D,MMLU-CP,college_physics
147,364,Answer the following multiple choice question....,We are given that the mass of object B is twic...,C,MMLU-CP,college_physics
148,380,Answer the following multiple choice question....,"The electric displacement current, denoted by ...",A,MMLU-CP,college_physics


## Experiment

In [4]:
# Use one model for both the forward engine (loss and verifier calls) and the
# backward engine that TextGrad uses to produce textual gradients.
model_name = "gemini-1.5-pro"
engine = get_engine(model_name)
tg.set_backward_engine(model_name, override=True)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def evaluate_with_raw_textgrad(row_data, iterations=5, step_eval_iterations=3):
    """Refine one raw solution with TextGrad, verifying each loss with TextualVerifierV4.

    On every round the current solution is scored by a ``TextLoss``, the loss
    text is replaced by the value returned from ``verifier.verify`` and the
    result is backpropagated so the optimizer can rewrite the solution.

    Relies on the module-level ``engine`` and, as a fallback, on the
    module-level ``initial_solution`` frame.

    Args:
        row_data: Mapping for one dataset row. Must provide ``id``,
            ``raw_solution``, ``correct_answer``, ``source`` and ``subject``;
            ``formatted_question`` is used when present.
        iterations: Number of optimization rounds to run (default 5).
        step_eval_iterations: Value forwarded to ``TextualVerifierV4`` as
            ``step_eval_iterations`` (default 3).

    Returns:
        A dict with the row's ``id``, ``raw_solution``, ``correct_answer``,
        ``source`` and ``subject`` plus one ``solution_<i>`` entry per round
        (1-based), or ``None`` when no question text can be found for the row.
    """
    # Prefer the question carried by the row itself. Looking it up by ``id``
    # alone takes the first hit, which pairs the solution with the wrong
    # question whenever two sources reuse the same id.
    formatted_question = row_data.get("formatted_question")
    if not isinstance(formatted_question, str):
        # Fallback for rows without a usable question (missing key, None, NaN).
        match = initial_solution[initial_solution["id"] == row_data["id"]]
        if match.empty:
            return None  # or raise error
        formatted_question = match.iloc[0]["formatted_question"]

    result = {
        "id": row_data["id"],
        "raw_solution": row_data["raw_solution"],
        "correct_answer": row_data["correct_answer"],
        "source": row_data["source"],
        "subject": row_data["subject"]
    }

    # NOTE(review): the prompts below say "math question" although the loaded
    # sources look like multiple-choice science/ML sets — confirm this wording
    # is intended before changing it, since it alters the experiment.
    solution = Variable(row_data["raw_solution"],
                    requires_grad=True,
                    role_description=f"Solution to the math question: {formatted_question}")
    loss_system_prompt = Variable("""You will evaluate a solution to a math question. 
                                    Do not attempt to solve it yourself, do not give a solution, 
                                    only identify errors. Be super concise.""",
                                    requires_grad=False,
                                    role_description="system prompt")
    optimizer = TextualGradientDescent([solution])
    loss = TextLoss(loss_system_prompt, engine=engine)

    # TextualVerifierV4
    verifier = TextualVerifierV4(verifier_engine=engine,
                                 step_eval_iterations=step_eval_iterations,
                                 logger=False)

    for i in range(1, iterations + 1):
        optimizer.zero_grad()  # Clean gradients
        loss_result = loss(solution)

        # Replace the raw loss text with the verifier's output so that the
        # backward pass is driven by the verified feedback.
        verified_result = verifier.verify(instance=solution,
                                    prompt=loss_system_prompt,
                                    calculation=loss_result)
        loss_result.set_value(verified_result.value)

        loss_result.backward()
        optimizer.step()
        # Snapshot the solution after this round's update.
        result[f"solution_{i}"] = solution.value

    return result

## Running Evaluation

### TV TextGrad

In [6]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from tqdm import tqdm
import time

# Create the output directory before the expensive run, so a missing folder
# cannot discard finished work when the CSV is written at the end.
output_path = Path('results/tv4_textgrad.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

results = []
failures = []  # (submission position, error repr) for rows that raised
start_time = time.time()

# NOTE(review): this runs the first 10 rows of `initial_solution`, not the
# `df_test` subset built earlier — confirm which one is intended.
with ThreadPoolExecutor(max_workers=128) as executor:
    # Submit all tasks
    futures = [
        executor.submit(evaluate_with_raw_textgrad, row.to_dict()) 
        for _, row in initial_solution[:10].iterrows()
    ]
    # Remember each future's submission position so rows can be put back in
    # input order; as_completed yields them in completion order.
    positions = {future: position for position, future in enumerate(futures)}
    completed = {}

    # Use tqdm for progress tracking
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        position = positions[future]
        try:
            result = future.result()
        except Exception as exc:
            # One failing row must not throw away every other finished row;
            # record it and report it after the run instead.
            failures.append((position, repr(exc)))
            continue
        if result is not None:
            completed[position] = result

# Deterministic row order, independent of thread scheduling.
results = [completed[position] for position in sorted(completed)]
raw_textgrad = pd.DataFrame(results)

print(f"Completed in {time.time() - start_time:.1f} seconds")
if failures:
    print(f"{len(failures)} row(s) failed: {failures}")
raw_textgrad.to_csv(output_path, index=False)

I0000 00:00:1748866820.000306 2553761 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
Processing:   0%|          | 0/10 [00:00<?, ?it/s]

CALCULATION No errors identified.

VERIFIER ```xml
<!-- No output as all steps have majority "No error" -->
```
CALCULATION The answer overlooks required changes to Ampère-Maxwell's law.  A magnetic current term analogous to the electric current term would also need to be added.

VERIFIER Since all feedback for each step is "No error" (or a variant of it), no output is generated.
CALCULATION The analysis incorrectly assumes the methyl groups will be cis.  They will be trans. The *syn* relationship between the oxygen bridge and sulfur is also incorrect.  They will be *anti*.

VERIFIER ```xml
```

Since all variants for each step have the feedback "No error", the majority vote for each step is "No error".  Therefore, no steps are outputted.
CALCULATION The analysis misinterprets the concept of epistasis.  Epistasis describes a situation where the phenotype of a double mutant resembles the phenotype of *one* of the single mutants.  While the analysis correctly identifies the g1g2 double m

Processing:   0%|          | 0/10 [00:30<?, ?it/s]


CALCULATION * Dimethyl fumarate does *not* exhibit optical isomerism, even in its polymeric form.  The polymerization occurs in a way that maintains symmetry, preventing chirality.
* Cyclohexane-1,3,5-trione *does* exhibit tautomerism. It exists predominantly in its enol form. 

VERIFIER ```xml
<Step6>Incomplete.  Dimethyl fumarate *as a polymer* can exhibit optical isomerism. This possibility is acknowledged earlier but disregarded without proper justification in this step.</Step6>
<Step7>Potentially incorrect due to the oversight in Step 6. If polymeric dimethyl fumarate is considered, then B could also be dimethyl fumarate, leading to different answer choices.</Step7>
```
["Here is a conversation:\n\n<CONVERSATION><LM_SYSTEM_PROMPT> You will evaluate a solution to a math question. \n                                    Do not attempt to solve it yourself, do not give a solution, \n                                    only identify errors. Be super concise. </LM_SYSTEM_PROMPT>\n\n<LM_I