# Solution Optimization Analysis

In [61]:
import re
import pandas as pd
from collections import Counter

## Load Datasets

In [62]:
# Read the results CSV into a DataFrame. Later cells read the columns
# "correct_answer", "raw_solution" and "solution_1" .. "solution_5" from it.
raw_textgrad = pd.read_csv("results/raw_textgrad.csv")
# Bare expression on the last line so the notebook renders the frame.
raw_textgrad

Unnamed: 0,id,raw_solution,correct_answer,source,subject,solution_1,solution_2,solution_3,solution_4,solution_5
0,2,<Solution>\n1. **Reaction 1:** trans-cinnamald...,A,GPQA-Diamond,-,<Solution>\n1. **Reaction 1:** trans-Cinnamald...,<Solution>\n1. **Reaction 1:** trans-Cinnamald...,<Solution>\n1. **Reaction 1:** trans-Cinnamald...,<Solution>\n1. **Reaction 1:** trans-Cinnamald...,<Solution>\n1. **Reaction 1:** trans-Cinnamald...
1,1,<Solution>\nThe energy-time uncertainty princi...,B,GPQA-Diamond,-,<Solution>\nThe energy-time uncertainty princi...,<Solution>\nThe energy-time uncertainty princi...,<Solution>\nThe energy-time uncertainty princi...,<Solution>\nThe energy-time uncertainty princi...,<Solution>\nThe energy-time uncertainty princi...
2,39,<Solution>\nThe probability density function i...,B,GPQA-Diamond,-,<Solution>\nThe probability density function i...,<Solution>\nThe probability density function i...,<Solution>\nThe probability density function i...,<Solution>\nThe probability density function i...,<Solution>\nThe probability density function i...
3,4,"<Solution>\nIn our universe, Maxwell's equatio...",B,GPQA-Diamond,-,"<Solution>\nIn our universe, Maxwell's equatio...","<Solution>\nIn our universe, Maxwell's equatio...","<Solution>\nIn our universe, Maxwell's equatio...","<Solution>\nIn our universe, Maxwell's equatio...","<Solution>\nIn our universe, Maxwell's equatio..."
4,8,<Solution>\nHere's how we can analyze the resu...,B,GPQA-Diamond,-,<Solution>\nHere's how we can analyze the resu...,<Solution>\nHere's how we can analyze the resu...,<Solution>\nHere's how we can analyze the resu...,<Solution>\nHere's how we can analyze the resu...,<Solution>\nHere's how we can analyze the resu...
...,...,...,...,...,...,...,...,...,...,...
145,348,<Solution>\nA muon (μ) is a fundamental subato...,B,MMLU-CP,college_physics,<Solution>\nA muon (μ) is a fundamental subato...,<Solution>\nThe negative muon (μ-) and the ele...,<Solution>\nThe negative muon (μ-) and the ele...,<Solution>\nThe question implies a comparison ...,<Solution>\nThe question implies a comparison ...
146,364,<Solution>\nLet m_A be the mass of object A an...,B,MMLU-CP,college_physics,<Solution>\nLet m_A be the mass of object A an...,<Solution>\nAssuming an inertial frame of refe...,<Solution>\nLet m_A and m_B represent the mass...,<Solution>\nLet m_A and m_B represent the mass...,<Solution>\nLet m_A and m_B represent the mass...
147,385,<Solution>\nThe potential energy of the elevat...,C,MMLU-CP,college_physics,<Solution>\nThe potential energy of the elevat...,<Solution>\nAssuming negligible air resistance...,<Solution>\nAccording to the principle of cons...,<Solution>\nAccording to the principle of cons...,<Solution>\nThe potential energy (PE) converte...
148,339,<Solution>\nHere's how to solve this problem:\...,B,MMLU-CP,college_physics,<Solution>\nHere's how to solve this problem:\...,"<Solution>\nHere's how to solve this problem, ...","<Solution>\nHere's how to solve this problem, ...",<Solution>\nHere's how to solve this problem:\...,<Solution>\nHere's a revised solution that acc...


## Functions

In [63]:
def extract_answer(text):
    """Extract the multiple-choice letter (A-D) wrapped in an ``<Answer>`` tag.

    Args:
        text: Solution text expected to contain ``<Answer>X</Answer>``, where
            ``X`` is one of A-D, optionally preceded by ``$`` and surrounded
            by whitespace. Non-string values (``None``, or the float NaN that
            pandas yields for an empty CSV cell) are treated as "no answer".

    Returns:
        The upper-cased letter of the first well-formed tag, or ``None`` when
        the text contains no such tag.
    """
    # re.sub / re.search raise TypeError on non-string input, so bail out early.
    if not isinstance(text, str):
        return None

    # Fix malformed </Answer>X</Answer> into proper <Answer>X</Answer>.
    # Case-insensitive so the repair accepts the same spellings as the
    # extraction step below.
    text = re.sub(
        r"</Answer>\s*\$?([A-D])\s*</Answer>",
        r"<Answer>\1</Answer>",
        text,
        flags=re.IGNORECASE,
    )

    # Now extract correctly: the first tag wins, the letter is normalised to upper case.
    match = re.search(r"<Answer>\s*\$?([A-D])\s*</Answer>", text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    return None


In [64]:
def process_answer(row_data):
    """Turn one result row into extracted answers plus correctness flags.

    Args:
        row_data: Mapping-like row providing "correct_answer", "raw_solution"
            and "solution_1" through "solution_5".

    Returns:
        dict with the correct answer, the answer extracted from each solution
        text, and three comparisons against the correct answer:
        "zero_shot_result" (initial answer), "final_result" (answer from
        solution_5) and "majority_result" (most common answer among
        solution_1..solution_5, ignoring texts with no extractable answer).
    """
    # (output key, input column) pairs, in the order the keys must appear.
    answer_sources = (
        ("initial_answer", "raw_solution"),
        ("solution_answer_1", "solution_1"),
        ("solution_answer_2", "solution_2"),
        ("solution_answer_3", "solution_3"),
        ("solution_answer_4", "solution_4"),
        ("final_answer", "solution_5"),
    )

    data = {"correct_answer": row_data["correct_answer"]}
    for answer_key, column in answer_sources:
        data[answer_key] = extract_answer(row_data[column])

    expected = data["correct_answer"]

    # Zero-shot and final correctness.
    data["zero_shot_result"] = expected == data["initial_answer"]
    data["final_result"] = expected == data["final_answer"]

    # Ballots for the vote: every optimisation step (the initial answer is
    # excluded), skipping steps where no answer could be extracted.
    ballot_keys = (
        "solution_answer_1",
        "solution_answer_2",
        "solution_answer_3",
        "solution_answer_4",
        "final_answer",
    )
    ballots = [data[key] for key in ballot_keys if data[key] is not None]

    # most_common keeps first-seen order among equal counts, so ties go to
    # the earliest answer in the ballot list.
    majority_vote = Counter(ballots).most_common(1)[0][0] if ballots else None

    data["majority_answer"] = majority_vote
    data["majority_result"] = expected == majority_vote

    return data


In [65]:
def process_result_data(result_df):
    """Apply ``process_answer`` to every row of ``result_df``.

    Args:
        result_df: DataFrame whose rows are passed one by one to
            ``process_answer``.

    Returns:
        list of the dicts returned by ``process_answer``, in row order.
    """
    return [process_answer(record) for _, record in result_df.iterrows()]

In [69]:
def calculate_true_percentages(df, columns):
    """Compute, per column, the percentage of rows whose value is True.

    Args:
        df: DataFrame containing the boolean columns to summarise.
        columns: Iterable of column names in ``df``.

    Returns:
        dict mapping each column name to its percentage of True values,
        rounded to two decimals. When ``df`` has no rows every percentage is
        NaN, because a share of zero rows is undefined.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    result = {}
    total = len(df)
    for col in columns:
        true_count = df[col].sum()  # Since True == 1 and False == 0
        if total == 0:
            # Guard the 0 / 0 case, which otherwise raises ZeroDivisionError
            # or emits a RuntimeWarning depending on the column dtype.
            result[col] = float("nan")
        else:
            result[col] = round((true_count / total) * 100, 2)
    return result

## 1. Analysis of Raw TextGrad

In [70]:
# Boolean result columns (keys set by process_answer) whose share of True
# values is reported by calculate_true_percentages.
columns_to_check = ["zero_shot_result", "final_result", "majority_result"]

In [67]:
# Raw TextGrad
# Extract the answers and correctness flags for every row (one dict per row) ...
raw_textgrad_processed_answer_list = process_result_data(raw_textgrad)
# ... and collect them into a DataFrame with one column per dict key.
raw_textgrad_processed_answer = pd.DataFrame(raw_textgrad_processed_answer_list)
# Bare expression on the last line so the notebook renders the frame.
raw_textgrad_processed_answer

Unnamed: 0,correct_answer,initial_answer,solution_answer_1,solution_answer_2,solution_answer_3,solution_answer_4,final_answer,zero_shot_result,final_result,majority_answer,majority_result
0,A,B,B,A,B,A,B,False,False,B,False
1,B,B,B,B,B,B,B,True,True,B,True
2,B,B,B,B,B,B,B,True,True,B,True
3,B,C,C,C,C,C,C,False,False,C,False
4,B,B,B,A,B,B,B,True,True,B,True
...,...,...,...,...,...,...,...,...,...,...,...
145,B,B,B,B,B,B,B,True,True,B,True
146,B,B,B,B,B,B,B,True,True,B,True
147,C,C,C,C,C,C,C,True,True,C,True
148,B,B,B,B,,,,True,False,B,True


In [68]:
# Drop None Rows
# dropna() with default arguments removes every row that has a missing value
# in ANY column, i.e. rows where at least one answer could not be extracted.
# The result is bound to a new name; the unfiltered frame is left untouched.
clean_raw_textgrad_processed_answer = raw_textgrad_processed_answer.dropna()
clean_raw_textgrad_processed_answer

Unnamed: 0,correct_answer,initial_answer,solution_answer_1,solution_answer_2,solution_answer_3,solution_answer_4,final_answer,zero_shot_result,final_result,majority_answer,majority_result
0,A,B,B,A,B,A,B,False,False,B,False
1,B,B,B,B,B,B,B,True,True,B,True
2,B,B,B,B,B,B,B,True,True,B,True
3,B,C,C,C,C,C,C,False,False,C,False
4,B,B,B,A,B,B,B,True,True,B,True
...,...,...,...,...,...,...,...,...,...,...,...
144,A,A,A,A,A,A,A,True,True,A,True
145,B,B,B,B,B,B,B,True,True,B,True
146,B,B,B,B,B,B,B,True,True,B,True
147,C,C,C,C,C,C,C,True,True,C,True


In [71]:

# Percentage of True values in each result column, computed on the frame
# from which rows with missing answers were dropped.
percentages = calculate_true_percentages(clean_raw_textgrad_processed_answer, columns_to_check)
percentages

{'zero_shot_result': np.float64(84.44),
 'final_result': np.float64(81.48),
 'majority_result': np.float64(83.7)}