In [1]:
import nest_asyncio

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs a loop and the
# synchronous agent entry point (`run_sync`, used further down) would otherwise fail — confirm.
nest_asyncio.apply()

In [7]:
import asyncio
import json
from collections import Counter
from itertools import chain

from pydantic_ai import Agent, RunContext
from transformers import AutoTokenizer


class MathDataset:
    """Math questions with their answers, plus tokenizer-level vocabulary statistics.

    Attributes:
        questions: Question texts. Rebound to a new list by `replace` and `llm_transform`.
        answers: Answers supplied by the caller; stored as given and never modified here.
        agent: `Agent` built from `model_name`, used by `llm_transform`.
        tokenizer: Tokenizer loaded with `AutoTokenizer.from_pretrained(tokenizer_name)`,
            used for all token statistics.
    """

    def __init__(
        self,
        questions: list[str],
        answers: list[str],
        model_name: str = "openai:gpt-5",
        tokenizer_name: str = "Qwen/Qwen3-0.6B",
    ) -> None:
        """Store the data and create the rewriting agent and the tokenizer.

        Args:
            questions: Question texts.
            answers: Answers, one list alongside `questions` (lengths are not checked).
            model_name: Model identifier passed to `Agent`.
            tokenizer_name: Name passed to `AutoTokenizer.from_pretrained`.
        """
        self.questions = questions
        self.answers = answers
        self.agent = Agent(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def _question_token_ids(self) -> list[list[int]]:
        """Tokenize the current questions and return one list of token ids per question."""
        # Nothing to tokenize for an empty dataset. NOTE(review): some batch tokenizers may
        # not accept an empty list at all — unverified, so the call is simply skipped.
        if not self.questions:
            return []
        return self.tokenizer(self.questions)["input_ids"]

    @property
    def num_unique_tokens(self) -> int:
        """Number of distinct token ids used across all questions.

        Side effect: also prints the count, so every access leaves a line in the cell output.
        """
        n = len(set(chain.from_iterable(self._question_token_ids())))
        print(f"There are {n} unique tokens.")
        return n

    @property
    def token_counts(self) -> dict[str, int]:
        """Frequency of each decoded token string across all questions.

        Counts are summed per decoded string: if several token ids decode to the same text,
        their occurrences are added together rather than the last id overwriting the others.
        """
        id_counts = Counter(chain.from_iterable(self._question_token_ids()))
        text_counts: Counter[str] = Counter()
        for token_id, count in id_counts.items():
            text_counts[self.tokenizer.decode(token_id)] += count
        return dict(text_counts)

    def replace(self, old: str, new: str) -> None:
        """Replace every literal occurrence of `old` with `new` in every question."""
        self.questions = [question.replace(old, new) for question in self.questions]

    async def llm_transform(self, transform_instruction: str) -> None:
        """Rewrite every question with the agent according to `transform_instruction`.

        One prompt is built per question and all agent runs are awaited together.
        `self.questions` is only rebound after every run has returned, so an exception
        from any run leaves the current questions untouched.

        Args:
            transform_instruction: Instruction inserted verbatim into each prompt.
        """
        prompts = [
            (
                "Transform the question according to the instruction.\n\n"
                "**The transformed question must preserve the exact mathematical formulation "
                "and all constraints of the original question. Do not alter numbers, variables, "
                "equations, or assumptions.**\n\n"
                f"# Question\n\n{question}\n\n"
                f"# Instruction\n\n{transform_instruction}\n\n"
                "Respond with the transformed question only."
            )
            for question in self.questions
        ]

        futures = [self.agent.run(prompt) for prompt in prompts]
        # gather() returns results in the order of `futures`, i.e. question order.
        results = await asyncio.gather(*futures)
        self.questions = [result.output for result in results]


In [3]:
# Example pair: a question in its original wording and a compact line-per-fact rewrite.
# NOTE(review): not referenced by any other visible cell — presumably an illustration of
# the target format; confirm before relying on it elsewhere.
before = (
    "Quadratic polynomials $P(x)$ and $Q(x)$ have leading coefficients $2$ and $-2,$ respectively. "
    "The graphs of both polynomials pass through the two points $(16,54)$ and $(20,53).$ Find $P(0) + Q(0).$"
)

# P and Q only have their leading coefficients fixed, so each gets its own linear and
# constant coefficients. Sharing b and c would make P(16) = Q(16) impossible
# (512 + 16b + c = -512 + 16b + c), i.e. the rewrite would no longer match `before`.
after = "P(x) = 2x^2 + bx + c\nQ(x) = -2x^2 + dx + e\nP(16) = Q(16) = 54\nP(20) = Q(20) = 53\nfind P(0) + Q(0)"

In [6]:
# Agent whose tools (registered below) inspect and rewrite a dataset; `deps_type` declares
# that each run is given a MathDataset, which the tools reach through `ctx.deps`.
dataevolve_agent = Agent("openai:gpt-5-mini", deps_type=MathDataset)


@dataevolve_agent.tool
def get_questions(ctx: RunContext[MathDataset]) -> str:
    """Return the current list of questions."""
    # The docstring above is the tool description shown to the model; keep its wording stable.
    current_questions = ctx.deps.questions
    # Hand the list back in its Python string form.
    return str(current_questions)


@dataevolve_agent.tool
def get_num_unique_tokens(ctx: RunContext[MathDataset]) -> str:
    """Compute the number of unique tokenizer tokens across all questions."""
    # The docstring above is the tool description shown to the model; keep its wording stable.
    # Reading the property also prints the same sentence to the cell output.
    unique_count = ctx.deps.num_unique_tokens
    return f"There are {unique_count} unique tokens."


@dataevolve_agent.tool
def get_token_counts(ctx: RunContext[MathDataset]) -> str:
    """Return a frequency table of decoded tokens across all questions as JSON."""
    # The docstring above is the tool description shown to the model; keep its wording stable.
    frequency_table = ctx.deps.token_counts
    # Sorted keys and a 2-space indent give a stable, readable table.
    return json.dumps(frequency_table, indent=2, sort_keys=True)


@dataevolve_agent.tool
def replace(ctx: RunContext[MathDataset], old: str, new: str) -> str:
    """Perform a literal string replacement across all questions."""
    # The docstring above is the tool description shown to the model; keep its wording stable.
    #
    # `old` is chosen by the model. An empty `old` makes str.replace insert `new` before
    # every character and at the end of every question ("ab".replace("", "-") == "-a-b-"),
    # which would garble the whole dataset with no way back. Refuse it and tell the model why.
    if not old:
        return "No replacement performed: `old` must be a non-empty string."
    ctx.deps.replace(old, new)
    # Reading the property re-tokenizes the updated questions (and prints the count).
    return f"There are {ctx.deps.num_unique_tokens} unique tokens after replacement."


@dataevolve_agent.tool
async def llm_transform(ctx: RunContext[MathDataset], transform_instruction: str) -> str:
    """
    Apply an LLM-guided transformation to every question, under strict fidelity constraints.

    What it does:
      - For each question, prompts the LLM to transform text while preserving the exact
        mathematical formulation (numbers, variables, relations, and constraints).
      - Updates the questions list with the transformed outputs.

    Args:
      transform_instruction (str): A concise directive, e.g.,
        - "Shorten wording; replace phrases with symbols where safe."
        - "Remove filler; keep all constraints; keep LaTeX intact."

    Side effects:
      - Mutates the in-memory questions list with LLM outputs.
    """
    # The docstring above is the tool description shown to the model; keep its wording stable.
    dataset = ctx.deps
    # Rewrites every question in place on the dataset before the new count is taken.
    await dataset.llm_transform(transform_instruction)
    return f"There are {dataset.num_unique_tokens} unique tokens after transformation."

In [11]:
# Decode the file as UTF-8 explicitly: without `encoding`, open() uses the platform's
# locale encoding, which can mis-decode non-ASCII characters in the questions.
with open("data/v1.json", encoding="utf-8") as f:
    data = json.load(f)

# Each record is read for the keys "question", "transformed_question" and "answer".
# Two datasets over the same answers: the original wording and the transformed wording.
ds_orig = MathDataset(questions=[item["question"] for item in data], answers=[item["answer"] for item in data])
ds_v1 = MathDataset(questions=[item["transformed_question"] for item in data], answers=[item["answer"] for item in data])

# Cell output: vocabulary size of each version (each property access also prints its count).
ds_orig.num_unique_tokens, ds_v1.num_unique_tokens

There are 1149 unique tokens.
There are 797 unique tokens.


(1149, 797)

472

In [None]:
# prompt = """\
# Reformat the math questions to use as few unique tokens as possible. Your goal is to minimize the vocab size.
# You can check the current vocab size using the `get_num_unique_tokens` tool. You can modify the questions using
# the `replace` and `llm_transform` tools. Your goal is to get the questions to contain under 512 unique tokens.

# STRICT FIDELITY REQUIREMENTS
# - Do NOT change numbers, variables, relations, equations, or assumptions.
# - Keep math semantics intact.
# - Output ONLY the DSL lines. No commentary, no markdown fences, no bullets.

# DSL STYLE
# - One fact/constraint per line.
# - Prefer short tokens/symbols to reduce vocabulary.
# - Allowed tokens (aim to stay within): shape, point, line, circle, angle, len, area, perimeter, radius, diameter, tangent, secant, intersects, parallel, equals, <, >, <=, >=, congruent, similar, midpoint, slope, product, sum, difference, ratio, gcd, lcm, prime, composite, integer, real, find.
# - Use concise math forms: e.g., P(x)=2x^2+b*x+c, P(16)=54, area(ABCD)=m*sqrt(n).
# - End with a single goal line starting with: "find ..."
# """

# dataevolve_agent.run_sync(user_prompt=prompt, deps=ds_v1)