In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain_core.prompts import ChatPromptTemplate

In [3]:
os.chdir("..")

In [4]:
from src.tools import python_script_sync, problem_solver_sync

## Read GSM8k dataset

In [5]:
set_llm_cache(SQLiteCache(database_path=".langchain.db"))

In [6]:
splits = {'train': 'main/train-00000-of-00001.parquet', 'test': 'main/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/openai/gsm8k/" + splits["train"])
df["numeric_response"] = df["answer"].map(lambda x: float(x[x.find("### ") + len("### "):].replace(",", "").strip()))

  from .autonotebook import tqdm as notebook_tqdm


## Evaluation on GSM8K
### LLM alone

In [7]:
system = """
Given a response from a problem you must split it into response and justification and return only the response.\
Response should not include units or the word Response, just numbers.\

Example:\
Problem: John went to the supermarket to buy 2 apples, 3 oranges and one bannana. Price per unit is the following:\
Bannana: 3$, apple 5$, oranges 7$.\

Problem solution:\
John will pay 34$ for 2 apples, 3 oranges and one bannana.\

Response:\
34
"""

SOL_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Problem:\n{problem}\n\nProblem solution:\n{sol}"),
    ]
)

llm = ChatOpenAI(model="gpt-4o-mini")
chain = SOL_PROMPT | llm

In [8]:
responses = []
failures = {}
for i in tqdm(range(len(df))):
    question = df.question.iloc[i]
    res = llm.invoke(question).content
    
    try:
        response = chain.invoke({"problem":question, "sol":res}).content
        if "," in response:
            response = sum([float(v.strip()) for v in response.split(",")])
        else:
            response = float(response)
    except Exception as e:
        failures[i] = {"error":e, "response":response}
        response = None
    responses.append(response)
    if i%1000 == 0:
        aux = df.iloc[0:len(responses)].copy()
        aux["llm_response"] = responses
        aux["check"] = aux.numeric_response == aux.llm_response
        print(aux.check.sum()/len(responses))
df["llm_response"] = responses

  0%|          | 15/7473 [00:00<02:41, 46.23it/s]

1.0


 14%|█▍        | 1030/7473 [00:05<00:26, 243.65it/s]

0.9170829170829171


 27%|██▋       | 2049/7473 [00:08<00:16, 329.46it/s]

0.9240379810094953


 41%|████      | 3047/7473 [00:12<00:15, 288.46it/s]

0.9253582139286904


 54%|█████▍    | 4031/7473 [00:15<00:14, 242.77it/s]

0.9280179955011247


 67%|██████▋   | 5031/7473 [00:19<00:09, 256.23it/s]

0.9254149170165967


 81%|████████  | 6023/7473 [00:23<00:05, 282.71it/s]

0.9246792201299784


 94%|█████████▍| 7034/7473 [00:26<00:01, 256.80it/s]

0.9244393658048851


100%|██████████| 7473/7473 [00:28<00:00, 264.41it/s]


In [9]:
errors = []
error_responses = []
for iteration, error_dict in failures.items():
    errors.append(error_dict["error"])
    error_responses.append(error_dict["response"])
len(set(errors)), len(set(error_responses))

(25, 25)

In [10]:
set(errors)

{ValueError("could not convert string to float: '07:30'"),
 ValueError("could not convert string to float: '10:00'"),
 ValueError("could not convert string to float: '120 200'"),
 ValueError("could not convert string to float: '15 + x'"),
 ValueError("could not convert string to float: '15n'"),
 ValueError("could not convert string to float: '2/3'"),
 ValueError("could not convert string to float: '20l + 32'"),
 ValueError("could not convert string to float: '23e'"),
 ValueError("could not convert string to float: '2:00'"),
 ValueError("could not convert string to float: '2√11'"),
 ValueError("could not convert string to float: '3 to 47'"),
 ValueError("could not convert string to float: '4 18'"),
 ValueError("could not convert string to float: '4/3'"),
 ValueError("could not convert string to float: '40 - 2S'"),
 ValueError("could not convert string to float: '5:00'"),
 ValueError("could not convert string to float: '5:45'"),
 ValueError("could not convert string to float: '6/19'"),
 

In [11]:
100 * len(set(errors)) / len(df)

0.3345376689415228

### Solving task by task

In [12]:
responses = []
failures = {}
for i in tqdm(range(len(df))):
    question = df.question.iloc[i]
    res = problem_solver_sync(question)
    
    try:
        response = chain.invoke({"problem":question, "sol":res}).content
        if "," in response:
            response = sum([float(v.strip()) for v in response.split(",")])
        elif ":" in response:
            response = float(response.split(":")[-1].strip())
        elif "=" in response:
            response = float(response.split("=")[-1].strip())
        else:
            response = float(response)
    except Exception as e:
        failures[i] = {"error":e, "response":response}
        response = None
    responses.append(response)
    if i%1000 == 0:
        aux = df.iloc[0:len(responses)].copy()
        aux["CoT_response"] = responses
        aux["check"] = aux.numeric_response == aux.CoT_response
        print(aux.check.sum()/len(responses))
df["CoT_response"] = responses

  0%|          | 4/7473 [00:00<07:51, 15.85it/s]

1.0


 13%|█▎        | 1003/7473 [00:57<07:43, 13.97it/s]

0.7902097902097902


 27%|██▋       | 2005/7473 [01:45<03:44, 24.39it/s]

0.7981009495252374


 40%|████      | 3005/7473 [02:35<04:17, 17.38it/s]

0.8010663112295902


 54%|█████▎    | 4005/7473 [03:22<02:34, 22.49it/s]

0.8070482379405148


 67%|██████▋   | 5003/7473 [04:25<02:31, 16.30it/s]

0.8032393521295741


 80%|████████  | 6004/7473 [05:27<01:12, 20.26it/s]

0.8051991334777537


 94%|█████████▎| 7004/7473 [06:24<00:32, 14.45it/s]

0.8024567918868734


100%|██████████| 7473/7473 [06:48<00:00, 18.29it/s]


In [13]:
errors = []
error_responses = []
for iteration, error_dict in failures.items():
    errors.append(error_dict["error"])
    error_responses.append(error_dict["response"])
len(set(errors)), len(set(error_responses))

(16, 16)

In [14]:
set(errors)

{ValueError("could not convert string to float: '1/4'"),
 ValueError("could not convert string to float: '10 13'"),
 ValueError("could not convert string to float: '12 8 6'"),
 ValueError("could not convert string to float: '120 200'"),
 ValueError("could not convert string to float: '1800 3000'"),
 ValueError("could not convert string to float: '186 + 4x'"),
 ValueError("could not convert string to float: '25 25 25 25'"),
 ValueError("could not convert string to float: '4 18'"),
 ValueError("could not convert string to float: '95\\n\\n95'"),
 ValueError("could not convert string to float: '9x + 120'"),
 ValueError("could not convert string to float: 'K = U + 26  \\nK + (U - 17) + U = 411  \\nK + 2U - 17 = 411  \\nK + 2U = 428  \\nSubstituting K = U + 26:  \\n(U + 26) + 2U = 428  \\n3U + 26 = 428  \\n3U = 402  \\nU = 134  \\nNow'"),
 ValueError("could not convert string to float: 'Please provide the number of shoes Melissa is repairing.'"),
 ValueError("could not convert string to floa

In [15]:
100 * len(set(errors)) / len(df)

0.2141041081225746

### Python script

In [16]:
responses = []
failures = {}
for i in tqdm(range(len(df))):
    question = df.question.iloc[i]
    try:
        _, response = python_script_sync(question)
        if type(response) is tuple:
            response = sum(response)
    except Exception as e:
        failures[i] = {"error":e, "response":response}
        response = None
    responses.append(response)
    if i % 1000 == 0:
        aux = df.iloc[0:len(responses)].copy()
        aux["python_response"] = responses
        aux["check"] = aux.numeric_response == aux.python_response
        print(aux.check.sum()/len(responses))
df["python_response"] = responses

  0%|          | 4/7473 [00:00<03:14, 38.32it/s]

1.0


  8%|▊         | 611/7473 [00:14<02:26, 46.86it/s]

# Number of brothers
total_brothers = 7

# Birthdays in the first half of the year
march_birthday = 3
# April and May have no birthdays mentioned
june_birthday = 0

# Total presents to buy in the first half of the year
first_half_presents = (march_birthday + april_birthday + june_birthday) * 2  # 2 presents for each brother

# Birthdays in the second half of the year
october_birthday = 1
november_birthday = 1
december_birthday = 2

# Total presents to buy in the second half of the year
second_half_presents = (october_birthday + november_birthday + december_birthday) * 2  # 2 presents for each brother

# Calculate the difference
result = second_half_presents - first_half_presents


 13%|█▎        | 1006/7473 [00:23<02:20, 45.99it/s]

0.8771228771228772


 15%|█▌        | 1126/7473 [00:25<02:18, 45.83it/s]

# Given information
phil_wins = 12
phil_wins = charlie_wins + 3
charlie_wins = dana_wins - 2
perry_wins = dana_wins + 5

# Calculating the number of games won by each player
charlie_wins = phil_wins - 3
dana_wins = charlie_wins + 2
perry_wins = dana_wins + 5

# Calculate how many more games Perry won than Phil
result = perry_wins - phil_wins


 23%|██▎       | 1702/7473 [00:38<02:08, 44.75it/s]

# Given data
time_ny_sf = None  # Time taken from New York to San Francisco (unknown)
time_no_ny = (3/4) * time_ny_sf  # Time taken from New Orleans to New York
time_sf_landing = 24  # Time taken from New York to land in San Francisco
time_after_no_landing = 16  # Time taken after landing in New York from New Orleans

# We know:
# Total time from New Orleans to San Francisco = time_no_ny + time_ny_sf + time_sf_landing
# time_no_ny = (3/4) * time_ny_sf
# Thus: 
# total_time = (3/4) * time_ny_sf + time_ny_sf + time_sf_landing

# Setting up the equation:
# total_time = (3/4) * time_ny_sf + time_ny_sf + 24
# total_time = (1 + 3/4) * time_ny_sf + 24
# total_time = (7/4) * time_ny_sf + 24

# We also know that she lands in San Francisco 24 hours after departing from New York, which includes the time taken from NY to SF.
# Therefore:
# total_time - 16 = 24

# Let's solve for time_ny_sf
from sympy import symbols, solve

time_ny_sf = symbols('time_ny_sf')
total_time_expr = (7/4) * time_ny_sf + 2

 27%|██▋       | 2008/7473 [00:45<02:00, 45.36it/s]

0.8860569715142429


 38%|███▊      | 2845/7473 [01:03<01:52, 41.24it/s]

graham_crackers = 48
marshmallows = 6

s'mores_possible_with_crackers = graham_crackers // 2
s'mores_possible_with_marshmallows = marshmallows

s'mores_possible = min(s'mores_possible_with_crackers, s'mores_possible_with_marshmallows)

total_marshmallows_needed = s'mores_possible + (s'mores_possible * 1)
marshmallows_needed = total_marshmallows_needed - marshmallows

result = max(0, marshmallows_needed)


 40%|████      | 3006/7473 [01:07<01:39, 44.69it/s]

0.8820393202265912


 41%|████      | 3071/7473 [01:09<01:44, 42.10it/s]

# Given information
roger_experience = 50  # Roger's total experience when he retires
years_until_retirement = roger_experience

# Coworker information
peter_experience = 19 - 7  # Peter's experience based on daughter's age
mike_experience = None  # Mike's experience to be determined
robert_experience = peter_experience - 4  # Robert's experience
tom_experience = 2 * robert_experience  # Tom's experience

# Roger's experience in relation to others
total_experience_of_others = peter_experience + tom_experience + robert_experience + mike_experience
roger_experience = total_experience_of_others  # Roger's experience equals the sum of the others

# We can solve for Mike's experience now
mike_experience = robert_experience - 2

# Updating total experience of others
total_experience_of_others = peter_experience + tom_experience + robert_experience + mike_experience

# Recalculate Roger's experience based on others
roger_experience = total_experience_of_others
years_until_retirement = 50 - ro

 47%|████▋     | 3483/7473 [01:18<01:30, 43.98it/s]

# Given data
number_of_people = 8
s'mores_per_person = 3
cost_for_4_s'mores = 3

# Total S'mores needed
total_s'mores = number_of_people * s'mores_per_person

# Cost calculation
cost_per_s'more = cost_for_4_s'mores / 4
total_cost = total_s'mores * cost_per_s'more

result = total_cost


 48%|████▊     | 3576/7473 [01:20<01:35, 40.62it/s]

# Time taken for each piece of clothing
time_b blouse = 15  # minutes
time_dress = 20    # minutes

# Total time spent on each type of clothing
time_spent_blouses = 2 * 60  # converting hours to minutes
time_spent_dresses = 3 * 60   # converting hours to minutes

# Number of pieces ironed
num_blouses = time_spent_blouses // time_blouse
num_dresses = time_spent_dresses // time_dress

# Total pieces of clothing ironed
result = num_blouses + num_dresses


 53%|█████▎    | 3977/7473 [01:30<01:13, 47.43it/s]

# Let b be the number of brown eggs and w be the number of white eggs.
# According to the problem, w = 3 * b
# After dropping the basket, she has 12 eggs left and 5 of the brown eggs survived.
# Therefore, the number of eggs broken is the total before the accident minus the remaining eggs.

# Let b be the number of brown eggs
# After the accident:
remaining_eggs = 12
survived_brown_eggs = 5

# Total eggs before the accident
total_eggs_before = remaining_eggs + (b - survived_brown_eggs)

# Total eggs before the accident can be represented as:
# Total eggs = w + b
# w = 3 * b
# So, total_eggs_before = 3 * b + b = 4 * b

# Hence, we can write:
# 4 * b = remaining_eggs + (b - survived_brown_eggs)
# 4 * b = 12 + (b - 5)

# Solving for b
from sympy import symbols, Eq, solve

b = symbols('b')
equation = Eq(4 * b, 12 + (b - 5))
solution = solve(equation, b)[0]

# Calculate the number of white eggs
w = 3 * solution

# Total eggs before the accident
total_eggs_before = w + solution

# Number of 

 54%|█████▎    | 4010/7473 [01:30<01:09, 49.95it/s]

0.884028992751812


 67%|██████▋   | 5009/7473 [01:51<00:53, 45.69it/s]

0.8810237952409518


 79%|███████▉  | 5922/7473 [02:12<00:32, 47.53it/s]

# Given data
total_good_oranges = 55

# Tree A details
tree_a_percentage = 0.5
oranges_per_tree_a = 10
good_oranges_percentage_a = 0.6

# Tree B details
tree_b_percentage = 0.5
oranges_per_tree_b = 15
good_oranges_percentage_b = 1/3

# Let x be the total number of trees
# Therefore, trees of type A = 0.5 * x and trees of type B = 0.5 * x

# Good oranges from Tree A
good_oranges_a = (oranges_per_tree_a * good_oranges_percentage_a) * (tree_a_percentage * x)

# Good oranges from Tree B
good_oranges_b = (oranges_per_tree_b * good_oranges_percentage_b) * (tree_b_percentage * x)

# Total good oranges from both trees
total_good_oranges_calculated = good_oranges_a + good_oranges_b

# Set up the equation
# total_good_oranges_calculated = 55
# (10 * 0.6 * 0.5 * x) + (15 * (1/3) * 0.5 * x) = 55

from sympy import symbols, Eq, solve

x = symbols('x')
equation = Eq((10 * 0.6 * 0.5 * x) + (15 * (1/3) * 0.5 * x), total_good_oranges)
solution = solve(equation)

# Total number of trees
total_trees = so

 80%|████████  | 6009/7473 [02:14<00:31, 46.48it/s]

0.8818530244959173


 88%|████████▊ | 6552/7473 [02:26<00:21, 43.50it/s]

# Let x be George's monthly income
# He donates half of his income: x / 2
# He spends $20 from the other half: (x / 2) - 20
# He has $100 left: ((x / 2) - 20) + 100 = (x / 2)

from sympy import symbols, Eq, solve

x = symbols('x')
equation = Eq((x / 2) - 20 + 100, x / 2)
solution = solve(equation, x)

result = solution[0]


 91%|█████████▏| 6825/7473 [02:32<00:14, 45.20it/s]

# Constants
cost_per_chocolate_bar = 1.50
s'mores_per_chocolate_bar = 3
scouts = 15
s'mores_per_scout = 2

# Total s'mores needed
total_s'mores_needed = scouts * s'mores_per_scout

# Total chocolate bars needed
total_chocolate_bars_needed = total_s'mores_needed / s'mores_per_chocolate_bar

# Total cost
total_cost = total_chocolate_bars_needed * cost_per_chocolate_bar

result = total_cost


 94%|█████████▍| 7009/7473 [02:36<00:10, 43.77it/s]

0.8827310384230824


 97%|█████████▋| 7241/7473 [02:42<00:05, 45.93it/s]

# Define the age differences
jenny_age_difference = 5  # Jenny is 5 years older than Charlie
charlie_age_difference = 3  # Charlie is 3 years older than Bobby

# Let's denote Bobby's current age as b
b = 0  # Assume Bobby's current age is 0 for simplicity

# Calculate current ages
charlie_age = b + charlie_age_difference
jenny_age = charlie_age + jenny_age_difference

# We need to find the age at which Jenny is twice Bobby's age
# Let x be the number of years until that point
# At that time, Jenny's age will be jenny_age + x
# Bobby's age will be b + x
# We need to solve the equation: jenny_age + x = 2 * (b + x)

# Set up the equation
from sympy import symbols, Eq, solve

x = symbols('x')
equation = Eq(jenny_age + x, 2 * (b + x))

# Solve for x
years_until_jenny_twice_bobby = solve(equation, x)[0]

# Calculate Charlie's age at that time
charlie_age_when_jenny_twice = charlie_age + years_until_jenny_twice

result = charlie_age_when_jenny_twice


 98%|█████████▊| 7321/7473 [02:43<00:03, 38.69it/s]

potatoes = 15
fries_per_potato = 25
total_fries_needed = 200

total_fries_available = potatoes * fries_per_potato
fries_leftover = total_fries_available - total_fries_needed

potatoes_used = fries_needed // fries_per_potato + (fries_needed % fries_per_potato > 0)
potatoes_leftover = potatoes - potatoes_used

result = potatoes_leftover


100%|██████████| 7473/7473 [02:47<00:00, 44.60it/s]


In [17]:
errors = []
error_responses = []
for iteration, error_dict in failures.items():
    errors.append(error_dict["error"])
    error_responses.append(error_dict["response"])
len(set(errors)), len(set(error_responses))

(13, 11)

In [18]:
set(errors)

{SyntaxError('invalid syntax',
             ('<string>', 2, 8, 'time_b blouse = 15  # minutes\n', 2, 14)),
 IndexError('list index out of range'),
 NameError("name 'april_birthday' is not defined"),
 NameError("name 'b' is not defined"),
 NameError("name 'charlie_wins' is not defined"),
 NameError("name 'fries_needed' is not defined"),
 NameError("name 'x' is not defined"),
 NameError("name 'years_until_jenny_twice' is not defined"),
 TypeError("unsupported operand type(s) for *: 'float' and 'NoneType'"),
 TypeError("unsupported operand type(s) for +: 'int' and 'NoneType'"),
 SyntaxError('unterminated string literal (detected at line 3)',
             ('<string>', 3, 2, "s'mores_per_person = 3", 3, 2)),
 SyntaxError('unterminated string literal (detected at line 3)',
             ('<string>', 3, 2, "s'mores_per_chocolate_bar = 3", 3, 2)),
 SyntaxError('unterminated string literal (detected at line 4)',
             ('<string>',
              4,
              2,
              "s'mores_p

In [19]:
100 * len(set(errors)) / len(df)

0.17395958784959187

# Benchmark

In [20]:
for col in ["llm_response", "CoT_response", "python_response"]:
    df[f"check_{col}"] = df[col] == df["numeric_response"]
    acc = round(100 * df[f'check_{col}'].sum() / len(df), 2)
    print(f"{col.split('_')[0]} accuracy: {acc}")

llm accuracy: 92.49
CoT accuracy: 80.28
python accuracy: 88.21
