In [None]:
import nest_asyncio
from dotenv import load_dotenv

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a loop — confirm.
nest_asyncio.apply()
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the model-provider API key — verify.
load_dotenv()

In [None]:
from datasets import load_dataset, Dataset
from pydantic_ai import Agent, RunContext
from transformers import AutoTokenizer
import asyncio
from itertools import chain
from collections import Counter
import json


class MathDataset:
    """Math questions and reference answers, with helpers to measure and rewrite the questions.

    On construction the ``problem`` and ``answer`` columns of the dataset's ``train``
    split are read into lists of strings, and an LLM agent and a tokenizer are created.
    ``questions`` is mutable state: ``replace`` and ``llm_transform`` overwrite it,
    while ``answers`` is never modified by this class.
    """

    def __init__(
        self,
        data_path: str = "AI-MO/aimo-validation-aime",
        model_name: str = "openai:gpt-5-mini",
        tokenizer_name: str = "Qwen/Qwen3-0.6B",
    ) -> None:
        """Load the dataset and create the agent and tokenizer.

        Args:
            data_path: Dataset identifier passed to ``load_dataset``.
            model_name: Model identifier passed to ``Agent``.
            tokenizer_name: Tokenizer identifier passed to ``AutoTokenizer.from_pretrained``.
        """
        ds = load_dataset(data_path, split="train")
        # Coerce to str so later string operations (replace, concatenation) always work.
        self.questions = [str(question) for question in ds["problem"]]
        self.answers = [str(answer) for answer in ds["answer"]]
        self.agent = Agent(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def _token_id_counts(self) -> Counter:
        """Tokenize every question and count how often each token id occurs."""
        input_ids = self.tokenizer(self.questions)["input_ids"]
        return Counter(chain.from_iterable(input_ids))

    @property
    def num_unique_tokens(self) -> int:
        """Number of distinct token ids used across all current questions."""
        return len(self._token_id_counts())

    @property
    def token_counts(self) -> dict[str, int]:
        """Frequency of each decoded token across all current questions.

        Counts are keyed by decoded text. Different token ids can decode to the same
        string; their counts are summed rather than overwriting one another, so the
        values always add up to the total number of tokens. As a consequence this
        mapping may have fewer entries than ``num_unique_tokens``.
        """
        decoded_counts: Counter = Counter()
        for token_id, count in self._token_id_counts().items():
            # Accumulate: a plain dict comprehension would keep only the last id's count
            # whenever two ids share a decoded string.
            decoded_counts[self.tokenizer.decode(token_id)] += count
        return dict(decoded_counts)

    def replace(self, old: str, new: str) -> None:
        """Replace every literal occurrence of ``old`` with ``new`` in each question (in place)."""
        self.questions = [question.replace(old, new) for question in self.questions]

    async def llm_transform(self, transform_instruction: str) -> None:
        """Rewrite every question with the LLM agent according to ``transform_instruction``.

        One prompt is built per question and all agent runs are awaited together.
        ``questions`` is replaced only after every run has returned, so it is left
        untouched if ``asyncio.gather`` raises.

        Args:
            transform_instruction: Free-text directive inserted into each prompt.
        """
        prompts = [
            (
                "Transform the question according to the instruction.\n\n"
                "**The transformed question must preserve the exact mathematical formulation "
                "and all constraints of the original question. Do not alter numbers, variables, "
                "equations, or assumptions.**\n\n"
                f"# Question\n\n{question}\n\n"
                f"# Instruction\n\n{transform_instruction}\n\n"
                "Respond with the transformed question only."
            )
            for question in self.questions
        ]

        futures = [self.agent.run(prompt) for prompt in prompts]
        # gather() returns results in the order the awaitables were passed, so outputs
        # stay aligned with ``answers``.
        results = await asyncio.gather(*futures)
        self.questions = [result.output for result in results]




In [None]:
# Hand-written example pair: a question in its original wording (`before`) and a
# compact line-based rewrite of the same mathematics (`after`).
before = (
    "Quadratic polynomials $P(x)$ and $Q(x)$ have leading coefficients $2$ and $-2,$ respectively. "
    "The graphs of both polynomials pass through the two points $(16,54)$ and $(20,53).$ Find $P(0) + Q(0).$"
)

# P and Q get separate linear/constant coefficients: the question fixes only their
# leading coefficients, and sharing b, c would make the point constraints contradictory.
after = (
    "P(x) = 2x^2 + bx + c\n"
    "Q(x) = -2x^2 + dx + e\n"
    "P(16) = Q(16) = 54\n"
    "P(20) = Q(20) = 53\n"
    "find P(0) + Q(0)"
)

In [None]:
# Second hand-written example pair (geometry): original wording (`before`) and a
# compact line-based rewrite of the same mathematics (`after`).
before = (
    # The trailing space must be inside the literal; adjacent literals are joined verbatim.
    "Let $ABCD$ be a parallelogram with $\\angle BAD < 90^\\circ.$ A circle tangent to sides $\\overline{DA},$ $\\overline{AB},$ "
    "and $\\overline{BC}$ intersects diagonal $\\overline{AC}$ at points $P$ and $Q$ with $AP < AQ,$ as shown. "
    "Suppose that $AP=3,$ $PQ=9,$ and $QC=16.$ Then the area of $ABCD$ can be expressed in the form $m\\sqrt{n},$ "
    "where $m$ and $n$ are positive integers, and $n$ is not divisible by the square of any prime. Find $m+n.$"
)

after = (
    "shape ABCD\n"
    "len(AB) = len(CD)\n"
    "len(BC) = len(DA)\n"
    "angle BAD < 90 degrees\n"
    "circle E\n"
    "E is tangent to DA, AB, BC\n"
    "E intersects AC at points P, Q\n"
    "AP < AQ\n"
    "AP = 3\n"
    "PQ = 9\n"
    "QC = 16\n"
    "area(ABCD) = m * sqrt(n)\n"
    # The question says "positive integers"; keep integrality, not just positivity.
    "m is integer\n"
    "n is integer\n"
    "m > 0\n"
    "n > 0\n"
    # Each line needs its own "\n", otherwise this constraint fuses with the goal line.
    "n is not divisible by the square of any prime\n"
    "find m + n"
)

In [None]:
# Agent that drives the vocabulary-reduction loop. It receives a MathDataset as its
# dependency and acts on it through the tool functions registered on this agent.
dataevolve_agent = Agent(
    "openai:gpt-5-mini",
    deps_type=MathDataset,
    # This text is sent to the model, so wording errors here are functional defects.
    instructions=(
        "You must optimize math questions to minimize the number of unique tokens while strictly preserving their "
        "mathematical content. Your goal is to reduce the vocab size to under 512 unique tokens. You must ensure "
        "the mathematical content of all questions is unchanged, and that they still have the same answer."
    ),
)


@dataevolve_agent.tool
def get_questions(ctx: RunContext[MathDataset]) -> str:
    """
    Fetch every question currently held by the dataset, rendered as a single string.

    What it does:
      - Reads the in-memory questions list from the dataset dependency.
      - Converts that list to text with ``str()`` (useful for inspection/logging).

    Returns:
      str: The string form of the questions list.
    """
    dataset = ctx.deps
    current_questions = dataset.questions
    return str(current_questions)


@dataevolve_agent.tool
def get_num_unique_tokens(ctx: RunContext[MathDataset]) -> int:
    """
    Report how many distinct tokenizer tokens the current questions use.

    What it does:
      - Delegates to the dataset's ``num_unique_tokens`` property, which tokenizes
        all questions and counts the distinct token ids.

    Returns:
      int: Number of unique tokens across the current questions.
    """
    dataset = ctx.deps
    unique_token_total = dataset.num_unique_tokens
    return unique_token_total


@dataevolve_agent.tool
def get_token_counts(ctx: RunContext[MathDataset]) -> str:
    """
    Produce a pretty-printed JSON frequency table of decoded tokens over all questions.

    What it does:
      - Reads the dataset's ``token_counts`` mapping (decoded token text → count).
      - Serializes it with two-space indentation and keys in sorted order.

    Returns:
      str: JSON object text mapping token text to its frequency, sorted by token text.
    """
    frequencies = ctx.deps.token_counts
    return json.dumps(frequencies, sort_keys=True, indent=2)


@dataevolve_agent.tool
def replace(ctx: RunContext[MathDataset], old: str, new: str) -> None:
    """
    Perform a literal string replacement across all questions.

    What it does:
      - Replaces every occurrence of `old` with `new` in each question.
      - Rejects an empty `old` and asks the model to retry with a real substring.

    Args:
      old (str): Substring to be replaced (literal match). Must not be empty.
      new (str): Replacement text.

    Raises:
      ModelRetry: If `old` is the empty string.

    Side effects:
      - Mutates the in-memory questions list.
    """
    # Local import: ModelRetry is not among the names imported at the top of the notebook.
    from pydantic_ai import ModelRetry

    if not old:
        # str.replace("", new) inserts `new` around every character, which would
        # silently corrupt every question; surface the mistake to the model instead.
        raise ModelRetry("`old` must be a non-empty substring to replace.")
    ctx.deps.replace(old, new)


@dataevolve_agent.tool
async def llm_transform(ctx: RunContext[MathDataset], transform_instruction: str) -> None:
    """
    Rewrite every question with an LLM while holding the mathematics fixed.

    What it does:
      - Hands `transform_instruction` to the dataset's own ``llm_transform`` coroutine,
        which prompts the LLM once per question and demands that numbers, variables,
        equations, and assumptions stay unaltered.
      - The dataset then stores the transformed outputs as its questions.

    Args:
      transform_instruction (str): A concise directive, e.g.,
        - "Shorten wording; replace phrases with symbols where safe."
        - "Remove filler; keep all constraints; keep LaTeX intact."

    Side effects:
      - Mutates the in-memory questions list with LLM outputs.
    """
    dataset = ctx.deps
    await dataset.llm_transform(transform_instruction)

In [9]:
# Prompt prefix for converting one question into a compact line-based DSL; the question
# text is appended directly after the final "NOW CONVERT THIS QUESTION" line.
# The few-shot examples must themselves obey the fidelity rules stated in the prompt:
# P and Q carry separate coefficients, and "positive integers" keeps its integrality.
prompt = """\
Transform the following math question into a compact, line-based DSL that preserves the exact mathematics.

STRICT FIDELITY REQUIREMENTS
- Do NOT change numbers, variables, relations, equations, or assumptions.
- Preserve all names (points, variables, functions) exactly as given.
- Keep LaTeX/math semantics intact; only re-express wording.
- Output ONLY the DSL lines. No commentary, no markdown fences, no bullets.

DSL STYLE
- One fact/constraint per line.
- Prefer short tokens/symbols to reduce vocabulary.
- Allowed tokens (aim to stay within): shape, point, line, circle, angle, len, area, perimeter, radius, diameter, tangent, secant, intersects, parallel, equals, <, >, <=, >=, congruent, similar, midpoint, slope, product, sum, difference, ratio, gcd, lcm, prime, composite, integer, real, find.
- Use concise math forms: e.g., P(x)=2x^2+b*x+c, P(16)=54, area(ABCD)=m*sqrt(n).
- End with a single goal line starting with: "find ..."

FEW-SHOT EXAMPLES
before:
Quadratic polynomials $P(x)$ and $Q(x)$ have leading coefficients $2$ and $-2,$ respectively. The graphs of both polynomials pass through the two points $(16,54)$ and $(20,53).$ Find $P(0) + Q(0).$
after:
P(x) = 2x^2 + b*x + c
Q(x) = -2x^2 + d*x + e
P(16) = 54
Q(16) = 54
P(20) = 53
Q(20) = 53
find P(0) + Q(0)

before:
Let $ABCD$ be a parallelogram with $\\angle BAD < 90^\\circ.$ A circle tangent to sides $\\overline{DA},$ $\\overline{AB},$ and $\\overline{BC}$ intersects diagonal $\\overline{AC}$ at points $P$ and $Q$ with $AP < AQ,$ as shown. Suppose that $AP=3,$ $PQ=9,$ and $QC=16.$ Then the area of $ABCD$ can be expressed in the form $m\\sqrt{n},$ where $m$ and $n$ are positive integers, and $n$ is not divisible by the square of any prime. Find $m+n.$
after:
shape ABCD
len(AB) = len(CD)
len(BC) = len(DA)
angle BAD < 90 degrees
circle E
E is tangent to DA
E is tangent to AB
E is tangent to BC
E intersects AC at P
E intersects AC at Q
AP < AQ
AP = 3
PQ = 9
QC = 16
area(ABCD) = m * sqrt(n)
m is integer
n is integer
m > 0
n > 0
n is squarefree
find m + n

NOW CONVERT THIS QUESTION
"""

agent = Agent("openai:gpt-5")
ds = MathDataset()
prompts = [prompt + question for question in ds.questions]
futures = [agent.run(prompt) for prompt in prompts]
results = await asyncio.gather(*futures)
ds.questions = [result.output for result in results]

In [None]:
# import json
# with open("data/v1.json", "w") as f:
#     json.dump({"questions": ds.questions, "answers": ds.answers}, f)

In [None]:
prompt = "Solve the following question and provide your answer in \\boxed{}\n\n"""
prompts = [prompt + question for question in ds.questions]
futures = [agent.run(prompt) for prompt in prompts]
results = await asyncio.gather(*futures)
llm_answers = [result.output for result in results]

In [None]:
from math_verify import parse, verify

# verify() is order-sensitive: the reference (gold) answer goes first, the candidate second.
grades = [verify(parse(gt), parse(a)) for a, gt in zip(llm_answers, ds.answers)]

# Fraction of questions whose model answer matches the reference answer.
sum(grades) / len(grades)

In [None]:
prompt = "Solve the following question and provide your answer in \\boxed{}\n\n"""
prompts = [prompt + question for question in MathDataset().questions]
futures = [agent.run(prompt) for prompt in prompts]
results = await asyncio.gather(*futures)
llm_answers_control = [result.output for result in results]

In [None]:
# verify() is order-sensitive: the reference (gold) answer goes first, the candidate second.
grades_control = [verify(parse(gt), parse(a)) for a, gt in zip(llm_answers_control, ds.answers)]

# Report the control accuracy (computed from grades_control, not the earlier `grades`).
sum(grades_control) / len(grades_control)