In [1]:
from datasets import load_dataset


# Dataset identifier of the MATH-500 benchmark; the recorded output below
# shows a single 500-example "test" split being generated.
dataset_name = "HuggingFaceH4/MATH-500"
dataset = load_dataset(path=dataset_name)

README.md:   0%|          | 0.00/412 [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/447k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Show the first record of the "test" split to inspect its fields.
dataset["test"][0]

{'problem': 'Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$',
 'solution': 'We have that $r = \\sqrt{0^2 + 3^2} = 3.$  Also, if we draw the line connecting the origin and $(0,3),$ this line makes an angle of $\\frac{\\pi}{2}$ with the positive $x$-axis.\n\n[asy]\nunitsize(0.8 cm);\n\ndraw((-0.5,0)--(3.5,0));\ndraw((0,-0.5)--(0,3.5));\ndraw(arc((0,0),3,0,90),red,Arrow(6));\n\ndot((0,3), red);\nlabel("$(0,3)$", (0,3), W);\ndot((3,0), red);\n[/asy]\n\nTherefore, the polar coordinates are $\\boxed{\\left( 3, \\frac{\\pi}{2} \\right)}.$',
 'answer': '\\left( 3, \\frac{\\pi}{2} \\right)',
 'subject': 'Precalculus',
 'level': 2,
 'unique_id': 'test/precalculus/807.json'}

In [4]:
import openai

# OpenAI-compatible client for the model served at the address below.
# The api_key is the literal string "None" (a placeholder value).
client = openai.Client(
    base_url="http://172.17.0.1:1337/v1",
    api_key="None",
)

# Index of the test example used for this single-request check.
test_num = 0

test_question = dataset["test"][test_num]["problem"]

# Read the prompt template inside a context manager so the file handle is
# closed deterministically, then substitute the problem text into it.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
) as prompt_file:
    prompt = prompt_file.read().format(question=test_question)

# Sampling settings follow the DeepSeek-R1 evaluation notes:
# https://github.com/deepseek-ai/DeepSeek-R1?tab=readme-ov-file#deepseek-r1-evaluation
# "For all our models, the maximum generation length is set to 32,768 tokens.
# For benchmarks requiring sampling, we use a temperature of 0.6, a top-p
# value of 0.95, and generate 64 responses per query to estimate pass@1."
# This cell issues a single request only.
response = client.chat.completions.create(
    model="test",
    messages=[
        {
            "role": "user",
            "content": prompt,
        },
    ],
    temperature=0.6,
    max_tokens=32768,
    top_p=0.95,
)

# NOTE: the variable name keeps its original spelling because later cells
# read `model_responce`.
model_responce = response.choices[0].message.content
print(model_responce)

Okay, so I have this problem here where I need to convert the rectangular coordinate (0,3) to polar coordinates. Hmm, polar coordinates, right? I remember that polar coordinates are represented as (r, θ), where r is the distance from the origin to the point, and θ is the angle made with the positive x-axis. 

Alright, let me think. In rectangular coordinates, we have x and y values, and in polar, it's r and θ. The formula to convert from rectangular to polar is r = sqrt(x² + y²) and θ = arctan(y/x). Wait, but I need to be careful here because arctan can give me angles in different quadrants depending on the signs of x and y. 

Looking at the point (0,3), it's on the y-axis. So, x is 0 and y is 3. That should be straightforward because it's on the positive y-axis. So, if x is 0, the angle θ would be π/2 radians, which is 90 degrees, right? Because starting from the positive x-axis, going up π/2 radians points straight up along the y-axis.

But wait, let me verify this with the formulas.

In [5]:
# Display the raw response string (repr form, so newlines show up as "\n").
model_responce

"Alright, so I have this problem where I need to convert the rectangular coordinate (0, 3) to polar coordinates. Hmm, okay. I remember that polar coordinates are represented as (r, θ), where r is the distance from the origin to the point, and θ is the angle made with the positive x-axis. \n\nFirst, I should probably recall the formulas for converting from rectangular to polar coordinates. I think they are:\n\nr = √(x² + y²)\n\nand\n\nθ = arctan(y/x)\n\nYeah, that sounds right. So, let me plug in the values from the rectangular coordinate (0, 3) into these equations. \n\nStarting with r. Since x is 0 and y is 3, substituting into the formula gives:\n\nr = √(0² + 3²) = √(0 + 9) = √9 = 3\n\nOkay, so r is 3. That seems straightforward.\n\nNow, for θ. The formula is θ = arctan(y/x). Plugging in the values:\n\nθ = arctan(3/0)\n\nHmm, wait a second. Dividing by zero can be tricky. I remember that when x is 0, the arctan function can sometimes be undefined or result in an angle where the tange

In [11]:
from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv


def dataset_answer_filter(answer):
    """Normalize an answer string so gold and model answers can be compared.

    The text is first passed through ``strip_string`` (imported from
    ``lm_eval.tasks.hendrycks_math.utils``); afterwards every space
    character is deleted.

    Args:
        answer: Answer text to normalize.

    Returns:
        The normalized answer, containing no space characters.
    """
    normalized = strip_string(answer)

    # Extra substrings to delete after strip_string; currently there are none.
    extra_substrings = []
    for substring in extra_substrings:
        normalized = normalized.replace(substring, "")

    # Same effect as splitting on " " and joining the pieces with "".
    return normalized.replace(" ", "")


import re


def _last_boxed_span(text):
    r"""Return the last brace-balanced ``\boxed{...}`` span found in ``text``.

    Returns None when ``text`` contains no ``\boxed{`` or when the braces of
    the last occurrence never close.
    """
    marker = "\\boxed{"
    start = text.rfind(marker)
    if start == -1:
        return None

    depth = 0
    # Start scanning at the marker's own opening brace so depth begins at 1.
    for pos in range(start + len(marker) - 1, len(text)):
        char = text[pos]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return text[start : pos + 1]
    return None


def model_answer_filter(answer):
    r"""Extract and normalize the final boxed answer from a model response.

    Only the text after the first ``</think>`` tag is searched. The last
    ``\boxed{...}`` group is located with brace matching, so nested braces
    (``\boxed{\frac{1}{2}}``), several boxed groups in one response, and
    boxed content spanning multiple lines are all handled. A greedy
    ``\boxed\{.*\}`` regex would instead run from the first ``\boxed{`` to
    the last ``}`` on the line.

    Args:
        answer: Full text generated by the model.

    Returns:
        The boxed content passed through ``remove_boxed`` and
        ``dataset_answer_filter``, or the string ``"error"`` when there is
        no ``</think>`` tag, no complete boxed group, or processing raises.
    """
    try:
        if "</think>" not in answer:
            return "error"

        final_part = answer.split("</think>", 1)[1]
        boxed = _last_boxed_span(final_part)
        if boxed is None:
            # No complete boxed answer: an expected outcome, not an exception.
            return "error"

        return dataset_answer_filter(remove_boxed(boxed))
    except Exception as e:
        # Best-effort scoring: report the problem and mark the item as failed.
        print(e)
        return "error"


# Compare the normalized gold answer of the selected example with the
# normalized answer extracted from the model response.
gold_record = dataset["test"][test_num]
gold_answer = dataset_answer_filter(gold_record["answer"])
model_answer = model_answer_filter(model_responce)
answers_match = is_equiv(gold_answer, model_answer)
print(gold_answer, model_answer, answers_match)

(3,\frac{\pi}{2}) (3,\frac{\pi}{2}) True


### Generate Answers

In [5]:
import concurrent
import openai
import logging

# Raise the "openai" logger threshold to ERROR so that lower-severity
# records from that logger are not emitted.
logging.getLogger("openai").setLevel(logging.ERROR)


# https://docs.together.ai/docs/prompting-deepseek-r1
def sglang_generate(
    prompt: str,
    *,
    base_url: str = "http://172.17.0.1:1337/v1",
    model: str = "sglang",
    temperature: float = 0.6,
    top_p: float = 0.95,
    max_tokens: int = 32768,
):
    """Send one user prompt to an OpenAI-compatible chat endpoint.

    A new client is created on every call; the function is mapped over a
    thread pool by ``batch_generation_sglang``.

    Args:
        prompt: Text sent as the single ``user`` message.
        base_url: Base URL of the OpenAI-compatible server.
        model: Model name passed in the request.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The ``message.content`` of the first returned choice.
    """
    client = openai.Client(
        base_url=base_url,
        # Literal placeholder string, unchanged from the original call.
        api_key="None",
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
    )

    return response.choices[0].message.content


def batch_generation_sglang(prompts, max_workers=None):
    """Generate responses for many prompts concurrently.

    Each prompt is passed to ``sglang_generate`` on a worker thread.

    Args:
        prompts: Prompts to send.
        max_workers: Upper bound on concurrent threads. ``None`` (default)
            uses one thread per prompt, as before.

    Returns:
        A list of responses in the same order as ``prompts``; an empty list
        when ``prompts`` is empty.
    """
    # Imported here because a bare `import concurrent` does not load the
    # `futures` submodule; it would only work if another package had
    # already imported it.
    from concurrent.futures import ThreadPoolExecutor

    prompts = list(prompts)
    if not prompts:
        # ThreadPoolExecutor rejects max_workers=0, so return early.
        return []

    if max_workers is None:
        worker_count = len(prompts)
    else:
        worker_count = max(1, min(max_workers, len(prompts)))

    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        # Executor.map yields results in input order.
        return list(executor.map(sglang_generate, prompts))


# Load the prompt template; the context manager closes the file handle.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
) as prompt_file:
    base_prompt = prompt_file.read()

# Smoke test: format the first three test problems and generate in parallel.
smoke_questions = [dataset["test"][i]["problem"] for i in range(3)]
smoke_prompts = [base_prompt.format(question=item) for item in smoke_questions]
# The responses stay under the name `test_problems`, which the next cell prints.
test_problems = batch_generation_sglang(smoke_prompts)

In [6]:
# Print the third generated response from the smoke-test batch.
print(test_problems[2])

Alright, so I've got this function f(x) = (3x - 2)/(x - 2), and I need to find the value of f(-2) + f(-1) + f(0). Hmm, okay. Let me break this down step by step because I want to make sure I don't make any mistakes.

First off, let me recall what f(x) represents. It's a rational function, meaning it's a fraction where both the numerator and the denominator are polynomials. The numerator is 3x - 2, and the denominator is x - 2. So, f(x) is defined for all x except where the denominator is zero, which would be x = 2. That's something to note because if I plug in x = 2, I'll get division by zero, which is undefined. So, I need to keep that in mind if I'm calculating f(2), but since the problem isn't asking for f(2), I don't have to worry about it here.

Now, the problem is asking for the sum of f(-2), f(-1), and f(0). So, I need to compute each of these three function values separately and then add them together. Let me tackle each one at a time.

Starting with f(-2). That means I substit

### Generate Answers on full dataset

In [None]:
from more_itertools import chunked
from tqdm.notebook import tqdm

# Load the prompt template; the context manager closes the file handle.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
) as prompt_file:
    base_prompt = prompt_file.read()

# Number of prompts sent concurrently per batch (1024).
batch_size = 128 * 2 * 4
new_dataset = dataset["test"].to_list()
dataset_with_answers = []
# Timings left by the author (presumably wall-clock time of earlier runs):
# 16 min 19 sec
# 5 min 14 sec
for batch in tqdm(list(chunked(new_dataset, batch_size))):
    batch_prompts = [base_prompt.format(question=item["problem"]) for item in batch]
    batch_responses = batch_generation_sglang(batch_prompts)
    # Attach each raw response to its record (records are updated in place).
    for answer, item in zip(batch_responses, batch):
        item["model_answer"] = answer

    dataset_with_answers.extend(batch)

  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Number of collected records and the first record with its "model_answer".
len(dataset_with_answers), dataset_with_answers[0]

(500,
 {'problem': 'Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$',
  'solution': 'We have that $r = \\sqrt{0^2 + 3^2} = 3.$  Also, if we draw the line connecting the origin and $(0,3),$ this line makes an angle of $\\frac{\\pi}{2}$ with the positive $x$-axis.\n\n[asy]\nunitsize(0.8 cm);\n\ndraw((-0.5,0)--(3.5,0));\ndraw((0,-0.5)--(0,3.5));\ndraw(arc((0,0),3,0,90),red,Arrow(6));\n\ndot((0,3), red);\nlabel("$(0,3)$", (0,3), W);\ndot((3,0), red);\n[/asy]\n\nTherefore, the polar coordinates are $\\boxed{\\left( 3, \\frac{\\pi}{2} \\right)}.$',
  'answer': '\\left( 3, \\frac{\\pi}{2} \\right)',
  'subject': 'Precalculus',
  'level': 2,
  'unique_id': 'test/precalculus/807.json',
  'model_answer': "Okay, so I have this problem here where I need to convert the rectangular coordinates (0, 3) to polar coordinates. Hmm, polar coordinates... I remember they involve a radius, r,

In [17]:
from transformers import AutoTokenizer

# Tokenizer used by the next cell to measure the token length of responses.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

In [18]:
# Responses without a closing </think> tag never reached a final answer.
# Their token counts are printed; in the recorded output all of them sit at
# about 32768 tokens, the max_tokens value used for generation.
unfinished_items = [
    item for item in dataset_with_answers if "</think>" not in item["model_answer"]
]
for item in unfinished_items:
    tokens = tokenizer.encode(item["model_answer"])
    print(len(tokens))

short_answers = len(unfinished_items)
short_answers

Token indices sequence length is longer than the specified maximum sequence length for this model (32767 > 16384). Running this sequence through the model will result in indexing errors


32767
32769
32769
32769
32769
32769
32769
32769
32769
32769


10

In [20]:
import os
import json

# Create the output directory (and any missing parents) with the standard
# library instead of shelling out to `mkdir -p`; exist_ok makes re-runs safe.
evals_dir = "hidden_capacity_reasoning/evaluation/math_500/evals"
os.makedirs(evals_dir, exist_ok=True)

# Persist the records together with the raw model responses.
with open(os.path.join(evals_dir, "deepseek_1.5b.json"), "w") as f:
    json.dump(dataset_with_answers, f)

### Run evaluation

In [1]:
import json

# Reload the saved generations so evaluation can run without regenerating.
evals_path = "hidden_capacity_reasoning/evaluation/math_500/evals/deepseek_1.5b.json"
with open(evals_path, "r") as evals_file:
    dataset_with_answers = json.load(evals_file)

In [2]:
# Inspect the first reloaded record.
dataset_with_answers[0]

{'problem': 'Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$',
 'solution': 'We have that $r = \\sqrt{0^2 + 3^2} = 3.$  Also, if we draw the line connecting the origin and $(0,3),$ this line makes an angle of $\\frac{\\pi}{2}$ with the positive $x$-axis.\n\n[asy]\nunitsize(0.8 cm);\n\ndraw((-0.5,0)--(3.5,0));\ndraw((0,-0.5)--(0,3.5));\ndraw(arc((0,0),3,0,90),red,Arrow(6));\n\ndot((0,3), red);\nlabel("$(0,3)$", (0,3), W);\ndot((3,0), red);\n[/asy]\n\nTherefore, the polar coordinates are $\\boxed{\\left( 3, \\frac{\\pi}{2} \\right)}.$',
 'answer': '\\left( 3, \\frac{\\pi}{2} \\right)',
 'subject': 'Precalculus',
 'level': 2,
 'unique_id': 'test/precalculus/807.json',
 'model_answer': "Alright, so I need to convert the rectangular coordinates (0, 3) to polar coordinates. Hmm, okay. I remember that polar coordinates are represented as (r, θ), where r is the distance from th

In [2]:
from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)

# print(gold_answer, model_answer, is_equiv(gold_answer, model_answer))
# Count the records whose normalized model answer is judged equivalent to
# the normalized gold answer by is_equiv; every mismatch is printed with its
# position for manual inspection.
correct_items = 0
for pos, item in enumerate(dataset_with_answers):
    answer = dataset_answer_filter(item["answer"])
    model_answer = model_answer_filter(item["model_answer"])
    if is_equiv(answer, model_answer):
        correct_items += 1
        continue

    print("dataset answer=", answer)
    print("model answer=", model_answer)
    print(pos)
    print("===")

INFO 04-17 22:17:44 __init__.py:194] No platform detected, vLLM is running on UnspecifiedPlatform
dataset answer= \text{Evelyn}
model answer= Evelyn
4
===
dataset answer= 4
model answer= error
9
===
dataset answer= \frac{3}{56}
model answer= \frac{1}{9}
11
===
dataset answer= 28
model answer= 120
18
===
dataset answer= 1,-2
model answer= -2}\)and\(\boxed{1
25
===
dataset answer= 144
model answer= 132
26
===
dataset answer= 3,5,7
model answer= 3}\),\(\boxed{5}\),and\(\boxed{7
36
===
dataset answer= 70\sqrt{2}
model answer= 10\sqrt{74}
43
===
dataset answer= 203
model answer= 1603
50
===
dataset answer= 9901
model answer= error
58
===
dataset answer= (6,31,-1)
model answer= (\frac{75}{53},-\frac{1624}{53},\frac{52}{53})
60
===
dataset answer= 501
model answer= 993
80
===
dataset answer= \frac{3}{2}
model answer= 15
90
===
dataset answer= 80
model answer= 110
94
===
dataset answer= 1\pm\sqrt{19}
model answer= 2
96
===
dataset answer= \text{east}
model answer= E
97
===
dataset answer= \beg

In [18]:
# Scratch arithmetic; the result is not referenced by any other visible cell.
74 / 2

37.0

In [None]:
# Print the full model response stored at index 451.
print(dataset_with_answers[451]["model_answer"])

In [3]:
# Fraction of records counted as correct by the evaluation loop.
correct_items / len(dataset_with_answers)

0.81

In [None]:
# NOTE(review): bare literal with no effect; presumably an accuracy noted
# from another run. Confirm, then move to a markdown cell or remove.
0.808

In [None]:
import json

# Read the raw JSONL lines; the context manager closes the file handle.
# NOTE: this rebinds `dataset` to a list of JSON strings.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/datasets/test.jsonl"
) as jsonl_file:
    dataset = jsonl_file.readlines()
# Parse and display the second line as a sanity check.
json.loads(dataset[1])

{'problem': 'Define\n\\[p = \\sum_{k = 1}^\\infty \\frac{1}{k^2} \\quad \\text{and} \\quad q = \\sum_{k = 1}^\\infty \\frac{1}{k^3}.\\]Find a way to write\n\\[\\sum_{j = 1}^\\infty \\sum_{k = 1}^\\infty \\frac{1}{(j + k)^3}\\]in terms of $p$ and $q.$',
 'solution': 'We count the number of times $\\frac{1}{n^3}$ appears in the sum\n\\[\\sum_{j = 1}^\\infty \\sum_{k = 1}^\\infty \\frac{1}{(j + k)^3},\\]where $n$ is a fixed positive integer.  (In other words, we are conditioning the sum on $j + k$.)  We get a term of $\\frac{1}{n^3}$ each time $j + k = n.$  The pairs $(j,k)$ that work are $(1,n - 1),$ $(2,n - 2),$ $\\dots,$ $(n - 1,1),$ for a total of $n - 1$ pairs.  Therefore,\n\\begin{align*}\n\\sum_{j = 1}^\\infty \\sum_{k = 1}^\\infty \\frac{1}{(j + k)^3} &= \\sum_{n = 1}^\\infty \\frac{n - 1}{n^3} \\\\\n&= \\sum_{n = 1}^\\infty \\left( \\frac{n}{n^3} - \\frac{1}{n^3} \\right) \\\\\n&= \\sum_{n = 1}^\\infty \\left( \\frac{1}{n^2} - \\frac{1}{n^3} \\right) \\\\\n&= \\sum_{n = 1}^\\inft

In [1]:
import json

# Read the raw JSONL lines; the context manager closes the file handle.
# NOTE: this rebinds `dataset` to a list of JSON strings.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/datasets/train.jsonl"
) as jsonl_file:
    dataset = jsonl_file.readlines()
# Parse and display the second line as a sanity check.
json.loads(dataset[1])

{'problem': 'If $5x - 3 = 12$, what is the value of $5x + 3$?',
 'solution': 'Adding 6 to both sides of $5x - 3 =12$ gives $5x -3 + 6 = 12 + 6$.  Simplifying both sides gives $5x + 3 = \\boxed{18}$.',
 'answer': '18',
 'subject': 'Prealgebra',
 'level': 2,
 'unique_id': 'test/prealgebra/2002.json'}

In [12]:
# Number of entries currently held in `dataset`.
len(dataset)

12000

### Generate Traces

In [2]:
# Parse each JSONL line into a dict. Entries that are not strings are kept
# unchanged, so re-running this cell after parsing does not raise.
dataset = [json.loads(item) if isinstance(item, str) else item for item in dataset]

In [None]:
from datasets import Dataset

# Convert the list of dict records into a `datasets.Dataset`.
dataset = Dataset.from_list(dataset)

In [5]:
# Inspect the first record.
dataset[0]

{'problem': 'How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?',
 'solution': 'The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$.  Therefore, the graph has $\\boxed{2}$ vertical asymptotes.',
 'answer': '2',
 'subject': 'Algebra',
 'level': 3,
 'unique_id': 'test/algebra/1.json'}

In [12]:
# Record count together with the first record.
len(dataset), dataset[0]

(12000,
 {'problem': 'How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?',
  'solution': 'The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$.  Therefore, the graph has $\\boxed{2}$ vertical asymptotes.',
  'answer': '2',
  'subject': 'Algebra',
  'level': 3,
  'unique_id': 'test/algebra/1.json'})

In [None]:
from more_itertools import chunked
from tqdm.notebook import tqdm

# Load the prompt template; the context manager closes the file handle.
with open(
    "hidden_capacity_reasoning/evaluation/math_500/math_500_prompt"
) as prompt_file:
    base_prompt = prompt_file.read()

# Number of prompts sent concurrently per batch (1024).
batch_size = 128 * 2 * 4
new_dataset = dataset
dataset_with_answers = []
# Timings left by the author (presumably wall-clock time of earlier runs):
# 16 min 19 sec
# 5 min 14 sec
for batch in tqdm(list(chunked(new_dataset, batch_size))):
    batch_prompts = [base_prompt.format(question=item["problem"]) for item in batch]
    batch_responses = batch_generation_sglang(batch_prompts)
    # Attach each raw response to its record (records are updated in place).
    for answer, item in zip(batch_responses, batch):
        item["model_answer"] = answer

    dataset_with_answers.extend(batch)

  0%|          | 0/12 [00:00<?, ?it/s]

### Check train dataset

In [1]:
from datasets import load_dataset

# Load the generated traces, carve out a fixed 250-example hold-out
# (seed 42), and keep only the hold-out rows whose response contains a
# closing </think> tag.
traces = load_dataset("dim/hendrycks_math_train_12k_DeepSeek-R1-Distill-Qwen-1.5B_max_len_4096")

holdout_split = traces["train"].train_test_split(test_size=250, seed=42)
dataset = holdout_split["test"].filter(lambda x: "</think>" in x["model_answer"])
dataset

Dataset({
    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id', 'model_answer'],
    num_rows: 155
})

In [2]:
from lm_eval.tasks.hendrycks_math.utils import strip_string, remove_boxed, is_equiv
from hidden_capacity_reasoning.evaluation.math_500.utils import (
    dataset_answer_filter,
    model_answer_filter,
)
from datasets import load_dataset

# total dataset accuracy 0.813 with parsing errors
# print(gold_answer, model_answer, is_equiv(gold_answer, model_answer))
# Score every record. A failure on one record is reported and counted
# instead of being swallowed silently; the record then counts as incorrect.
correct_items = 0
failed_positions = []
for pos, item in enumerate(dataset):
    try:
        answer = dataset_answer_filter(item["answer"])
        model_answer = model_answer_filter(item["model_answer"])
        is_correct = is_equiv(answer, model_answer)
    except Exception as exc:
        # `Exception` rather than a bare except, so KeyboardInterrupt still
        # stops the loop.
        failed_positions.append(pos)
        print(f"scoring failed at position {pos}: {exc!r}")
        continue

    if is_correct:
        correct_items += 1

if failed_positions:
    print(f"{len(failed_positions)} record(s) could not be scored: {failed_positions}")

# Accuracy over the records in `dataset`.
correct_items / len(dataset)

INFO 04-20 21:01:40 __init__.py:190] Automatically detected platform cuda.
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'group'


0.8774193548387097