In [1]:
from datasets import load_dataset
import torch

# Load the dataset.
# MATH-500 only ships a "test" split, so that split is re-split below for
# demonstration purposes.
ds = load_dataset("HuggingFaceH4/MATH-500")

# Hold out 10% of the examples for validation. The fixed seed makes the shuffle
# deterministic, so a kernel restart reproduces the same train/eval membership.
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

In [2]:
import pandas as pd
# Show the first five training examples as a table (last expression is displayed).
pd.DataFrame(data=train_dataset[0:5])

Unnamed: 0,problem,solution,answer,subject,level,unique_id
0,Determine if the graph of the equation below i...,"This looks like the equation of a circle, but ...",\text{ellipse},Intermediate Algebra,2,test/intermediate_algebra/860.json
1,A class of 30 students recently took a test. ...,"From the given information, the total amount o...",84,Prealgebra,3,test/prealgebra/846.json
2,Compute $\arcsin \left( -\frac{1}{2} \right).$...,Since $\sin \left( -\frac{\pi}{6} \right) = -\...,-\frac{\pi}{6},Precalculus,2,test/precalculus/1105.json
3,What's the largest eight-digit base 2 integer?...,The largest eight-digit base 2 integer is 1 le...,255,Number Theory,3,test/number_theory/691.json
4,Add 313.9 to 12.6. Express the result as a dec...,We have \[\n\begin{array}{@{}c@{}c@{}c@{}c@{}c...,326.5,Prealgebra,2,test/prealgebra/1784.json


In [7]:
import textwrap
import random

# Shared 80-column wrapper; later cells reuse it when printing long text.
wrapper = textwrap.TextWrapper(width=80)

# Pick one training example uniformly at random.
sample = train_dataset[random.randrange(len(train_dataset))]

# Wrap both text fields so the LaTeX-heavy strings stay readable.
problem, solution = (wrapper.fill(sample[field]) for field in ('problem', 'solution'))

print('Problem:\n', problem, '\n\n')
print('Solution:\n', solution)

Problem:
 Evaluate \[\sin (\arcsin 0.4 + \arcsin 0.5) \cdot \sin (\arcsin 0.5 - \arcsin
0.4).\] 


Solution:
 From the angle addition and subtraction formulas, \begin{align*} \sin (x + y) &=
\sin x \cos y + \cos x \sin y, \\ \sin (x - y) &= \sin x \cos y - \cos x \sin y,
\end{align*}so \begin{align*} \sin (x + y) \sin (x - y) &= (\sin x \cos y + \cos
x \sin y)(\sin x \cos y - \cos x \sin y) \\ &= \sin^2 x \cos^2 y + \sin x \cos x
\sin y \cos y - \sin x \cos x \sin y \cos y - \cos^2 x \sin^2 y \\ &= \sin^2 x
(1 - \sin^2 y) - (1 - \sin^2 x) \sin^2 y \\ &= \sin^2 x - \sin^2 x \sin^2 y -
\sin^2 y + \sin^2 x \sin^2 y \\ &= \sin^2 x - \sin^2 y. \end{align*}Taking $x =
\arcsin 0.5$ and $y = \arcsin 0.4,$ we get \begin{align*} \sin (\arcsin 0.5 +
\arcsin 0.4) \cdot \sin (\arcsin 0.5 - \arcsin 0.4) &= \sin^2 (\arcsin 0.5) -
\sin^2 (\arcsin 0.4) \\ &= 0.5^2 - 0.4^2 \\ &= 0.09 = \boxed{\frac{9}{100}}.
\end{align*}


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# One checkpoint name is used for both the model and its tokenizer.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batching pads sequences, which needs a pad token; if the tokenizer defines
# none, fall back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [9]:
# Set a system prompt to be prepended to the user prompt.
# This is a simple example, but you could use a more complex system prompt.
system_prompt = "Solve the following math problem."

# Define a function to format the prompt and apply loss masking.
# The function assumes that each example has a `problem` and a `solution`, which is true for the MATH-500 dataset.
def tokenize_and_mask(example, tokenizer, max_length=1024, system_prompt=system_prompt):
    """Render one example as a chat, tokenize it, and mask the prompt in the labels.

    The conversation is system -> user (problem) -> assistant (solution). Only
    the assistant's reply should contribute to the loss, so every label position
    that belongs to the prompt is set to -100 (the value the collator also uses
    for padding, and the default ignore index of PyTorch's cross-entropy loss).

    Args:
        example: Mapping with a `problem` and a `solution` string.
        tokenizer: Tokenizer exposing `apply_chat_template`.
        max_length: Maximum number of tokens kept for the full conversation;
            longer conversations are truncated.
        system_prompt: Content of the system message.

    Returns:
        dict of 1-D tensors `input_ids`, `labels` and `attention_mask`, all of
        the same length. If truncation leaves no room for the reply, every
        label is -100.
    """
    prompt_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example['problem']},
    ]
    messages = prompt_messages + [
        {"role": "assistant", "content": example['solution']},
    ]

    # Tokenize the prompt alone to learn how many leading tokens to mask.
    # add_generation_prompt=True appends the assistant header, so the header
    # tokens count as prompt and are masked along with the system/user turns.
    prompt_ids = tokenizer.apply_chat_template(
        prompt_messages,
        add_generation_prompt=True,
        return_tensors='pt',
        return_dict=True,
    )["input_ids"][0]  # Remove batch dimension

    # Tokenize the full conversation, truncated to max_length. The chat template
    # already inserts its own special tokens, so no extra flag is passed.
    tokenized = tokenizer.apply_chat_template(
        messages,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
        return_dict=True,
    )
    input_ids = tokenized["input_ids"][0]  # Remove batch dimension
    attention_mask = tokenized["attention_mask"][0]  # Remove batch dimension

    # Labels start as a copy of input_ids; the prompt part is then masked.
    # The prompt is not truncated, so clamp its length to what survived
    # truncation -- otherwise the slice assignment would not fit.
    labels = input_ids.clone()
    prompt_length = min(len(prompt_ids), len(labels))
    labels[:prompt_length] = -100

    return {"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask}

def tokenize_for_generation(example, tokenizer):
    """Tokenize a problem as a chat prompt that is ready for `model.generate`.

    Builds the system + user messages only (no solution) and lets the chat
    template append the assistant header, so the model continues with its own
    answer.

    Args:
        example: Mapping with a `problem` string.
        tokenizer: Tokenizer exposing `apply_chat_template`.

    Returns:
        dict of 1-D tensors `input_ids` and `attention_mask`.
    """
    # Uses the notebook-level `system_prompt`.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example['problem']}
    ]

    # `add_generation_prompt` is the argument that makes the template end with
    # the assistant header; there is no need to tokenize and concatenate the
    # header by hand. The prompt is not truncated: cutting its right end would
    # remove the header that generation relies on.
    tokenized = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    return {
        "input_ids": tokenized["input_ids"][0],  # Remove batch dimension
        "attention_mask": tokenized["attention_mask"][0],  # Remove batch dimension
    }



In [10]:
# Tokenize the train and eval splits for training. `map` calls tokenize_and_mask on
# one example at a time (batched=False; batched=True may be worth it for large
# datasets), forwarding the tokenizer through fn_kwargs. Each math problem becomes
# the prompt and its solution the response the model should generate.
# set_format then exposes only the listed columns, as torch tensors.
train_dataset_tokenized = train_dataset.map(
    tokenize_and_mask,
    batched=False,
    fn_kwargs={"tokenizer": tokenizer},
)
train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])

eval_dataset_tokenized = eval_dataset.map(
    tokenize_and_mask,
    batched=False,
    fn_kwargs={"tokenizer": tokenizer},
)
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])

# Keep the first three eval examples, tokenized as generation prompts (no labels),
# so model generations can be compared before and after training.
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(
    tokenize_for_generation,
    batched=False,
    fn_kwargs={"tokenizer": tokenizer},
)
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [11]:
def generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer, max_new_tokens=512):
    """Generate an answer for each tokenized sample and print it next to the reference.

    Args:
        sample_dataset: Un-tokenized examples; each provides `problem` and
            `solution` and is looked up by position.
        sample_dataset_tokenized: The same examples as generation prompts, each
            with 1-D `input_ids` and `attention_mask` tensors.
        model: Model exposing `device` and `generate`.
        tokenizer: Tokenizer used to decode the generated token ids.
        max_new_tokens: Upper bound on generated tokens per sample; longer
            answers are cut off.

    Returns:
        None. The problem, true solution and model answer are printed, wrapped
        with the notebook-level `wrapper`.
    """
    outputs = []
    for sample in sample_dataset_tokenized:
        # Add a batch dimension and move the prompt to the model's device.
        input_ids_batch = sample['input_ids'].unsqueeze(0).to(model.device)
        attention_mask_batch = sample['attention_mask'].unsqueeze(0).to(model.device)

        generated_ids = model.generate(
            input_ids=input_ids_batch,
            attention_mask=attention_mask_batch,
            max_new_tokens=max_new_tokens,
        )

        # The returned sequence starts with the prompt tokens. Drop them by
        # position rather than splitting the decoded text on "assistant\n",
        # which would cut an answer that happens to contain that string.
        prompt_length = input_ids_batch.shape[1]
        completion_ids = generated_ids[0][prompt_length:]
        generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True)
        outputs.append(generated_text.strip())

    for i, model_output in enumerate(outputs):
        # Retrieve the original problem and solution from the un-tokenized dataset
        original = sample_dataset[i]
        print("Problem:")
        print(wrapper.fill(original["problem"]))
        print("\nTrue Solution:")
        print(wrapper.fill(original["solution"]))
        print("\nModel's Solution:")
        print(wrapper.fill(model_output))
        print("\n" + "-" * 80 + "\n")

# Baseline: print the model's answers for the sample problems before any training.
generate_and_print(
    sample_dataset=sample_dataset,
    sample_dataset_tokenized=sample_dataset_tokenized,
    model=model,
    tokenizer=tokenizer,
)

Problem:
Find the remainder when $(5x + 9)^{611} + (x + 5)^{11} + (x - 1)^{11} + 3x^2 +
1$ is divided by $x + 2.$

True Solution:
By the Remainder Theorem, to find the remainder, we set $x = -2.$  This gives us
\[(-1)^{611} + 3^{11} + (-3)^{11} + 3(-2)^2 + 1 = \boxed{12}.\]

Model's Solution:
To solve the problem of finding the remainder when \((5x + 9)^{611} + (x +
5)^{11} + (x - 1)^{11} + 3x^2 + 1\) is divided by \(x + 2\), we can use
polynomial long division or the Remainder Theorem. However, a more efficient
approach involves recognizing that the given expression can be simplified using
the fact that \(x = -2\) is a root of the divisor.  First, let's substitute \(x
= -2\) into the expression:  \[ (5(-2) + 9)^{611} + ((-2) + 5)^{11} + ((-2) -
1)^{11} + 3(-2)^2 + 1 \]  Calculating each term separately:  1. \((5(-2) +
9)^{611} = (-10 + 9)^{611} = (-1)^{611} = -1\) 2. \((-2 + 5)^{11} = (3)^{11}\)
3. \((( -2 ) - 1)^{11} = (-3)^{11}\) 4. \(3(-2)^2 = 3 \cdot 4 = 12\)  So,
substituting the

In [12]:
# Create a simple data collator
def data_collator(features):
    """Pad a list of tokenized examples into one right-padded batch.

    Args:
        features: Sequence of dicts holding 1-D tensors under `input_ids`,
            `labels` and `attention_mask`.

    Returns:
        dict with the same three keys, each a (batch, longest_sequence) tensor.
        Padding uses the notebook-level tokenizer's pad token id for
        `input_ids`, -100 for `labels` and 0 for `attention_mask`.
    """
    def _pad_column(key, fill_value):
        # Copy each tensor before padding so the source examples are left untouched.
        sequences = [example[key].clone().detach() for example in features]
        return torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=fill_value,
        )

    return {
        "input_ids": _pad_column("input_ids", tokenizer.pad_token_id),
        "labels": _pad_column("labels", -100),
        "attention_mask": _pad_column("attention_mask", 0),
    }