数学文本题（Math Word Problem, MWP）是自然语言处理与推理结合的典型任务，需要模型理解自然语言并进行数学运算。

大多数题目可以直接通过“关键词 + 数值”映射解决（例如“John 有 5 个苹果，又买了 3 个” → “加法”）。对大语言模型而言，这类题目几乎没有推理难度，但可以用来测试模型在简单数值推理上的稳定性。整体难度相当于小学中高年级（按中国教育体系约为 3~6 年级水平）。


In [18]:

import math, re, os
from dataclasses import dataclass
from typing import Optional, Dict, Any

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd

from transformers.utils import logging
logging.set_verbosity_error()

# ---- Choose your model here ----
# Model identifier passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained below.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# Defaults picked up by GenConfig (max_new_tokens and temperature respectively).
DEFAULT_MAX_NEW_TOKENS = 128
GEN_TEMP = 0.0
# Seed applied to the torch RNGs right below.
SEED = 42

# Seed the CPU generator, and every CUDA device when CUDA is available.
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

def get_device():
    """Return the preferred torch device name: "cuda", then "mps", then "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    # MPS is only probed when CUDA is unavailable, same as a chained if/return.
    return "mps" if torch.backends.mps.is_available() else "cpu"

def to_device(batch, device):
    """Return a new dict with every value of *batch* moved via ``.to(device)``.

    Keys and their order are preserved; the input mapping is not modified.
    """
    moved = {}
    for name, tensor in batch.items():
        moved[name] = tensor.to(device)
    return moved

# 解析数据

In [19]:
def extract_numeric_answer(s: str) -> Optional[float]:
    """Pull the numeric answer out of a model response.

    The first ``Answer: <number>`` occurrence wins (case-insensitive, ASCII or
    full-width colon, optional ``$`` before the number). If no such marker is
    present, the last number appearing anywhere in the text is used instead.

    Numbers may carry a sign, a decimal part, and comma thousands separators
    (``1,234`` is read as 1234, not 1).

    Args:
        s: Raw generated text, or None.

    Returns:
        The parsed value as a float, or None when *s* is None or contains no
        number.
    """
    if s is None:
        return None

    # Comma groups must be exactly three digits and must not be followed by
    # another digit, so "1,2345" or "3,4" are still read as separate numbers.
    number = r'[\-+]?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?'

    tagged = re.search(r'Answer\s*[:：]\s*\$?\s*(' + number + r')', s, flags=re.I)
    if tagged:
        return float(tagged.group(1).replace(",", ""))

    # No explicit marker: fall back to the last number in the text.
    candidates = re.findall(number, s)
    if not candidates:
        return None
    return float(candidates[-1].replace(",", ""))

def feq(a: float, b: float, atol: float = 1e-2, rtol: float = 1e-4) -> bool:
    """Tell whether *a* and *b* are equal within the given tolerances.

    Delegates to ``math.isclose`` with *atol* as the absolute tolerance and
    *rtol* as the relative tolerance.
    """
    within_tolerance = math.isclose(a, b, rel_tol=rtol, abs_tol=atol)
    return within_tolerance

def normalize_answer(raw):
    """Convert a gold answer of unknown type into a float.

    Args:
        raw: None, an int/float, or anything whose ``str()`` contains a number.

    Returns:
        ``float(raw)`` for ints/floats; otherwise the first number found in
        ``str(raw)`` (sign, decimal part and comma thousands separators are
        understood, so ``"1,234"`` yields 1234.0); None when *raw* is None or
        no number is present.
    """
    if raw is None:
        return None
    if isinstance(raw, (int, float)):
        return float(raw)
    # Comma groups must be exactly three digits and not be followed by another
    # digit; anything else is treated as a plain integer/decimal.
    m = re.search(
        r'[\-+]?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?',
        str(raw),
    )
    return float(m.group(0).replace(",", "")) if m else None

# Prompt 模板

In [20]:

# System message used for every prompt (the text is English despite the _CN suffix).
SYSTEM_CN = "You are a helpful math assistant. Solve the problem and provide only the final numeric answer in the format: Answer: <number>."

# User message for the "direct" strategy; filled with .format(problem=...).
USER_DIRECT = """Solve the following math word problem directly and output only the final numeric answer on the last line as: Answer: <number>.
Problem:
{problem}
"""

# User message for the "cot" strategy; filled with .format(problem=...).
USER_COT = """Solve the following math word problem step-by-step. Show key quantities, relationships, and equations. On the last line, output only: Answer: <number>.
Problem:
{problem}
"""

# Worked examples that build_chat_prompt inserts as user ("q") / assistant ("a")
# turns for the "cot" strategy. Each "a" ends with an "Answer: <number>" line.
FEW_SHOTS = [
    {
        "q": "Tom had 7 red balloons and bought 5 more. How many red balloons does he have now?",
        "a": "Step 1: Total = 7 + 5 = 12\nAnswer: 12"
    },
    {
        "q": "A car travels at 50 miles per hour for 3 hours. How far does it travel?",
        "a": "Step 1: distance = rate x time = 50 x 3 = 150\nAnswer: 150"
    },
    {
        "q": "Sara had 12 apples. She gave 4 to her friend and then bought 3 more. How many apples now?",
        "a": "Step 1: 12 - 4 = 8\nStep 2: 8 + 3 = 11\nAnswer: 11"
    }
]

def supports_chat_template(tokenizer) -> bool:
    """Report whether *tokenizer* exposes a non-None ``chat_template``.

    Any exception raised while reading the attribute counts as "no template".
    """
    try:
        has_template = tokenizer.chat_template is not None
    except Exception:
        has_template = False
    return has_template

def build_chat_prompt(tokenizer, problem: str, strategy: str):
    """Build the full prompt string for one problem.

    Args:
        tokenizer: Tokenizer used to render the chat template when it has one.
        problem: Problem text substituted into the user template.
        strategy: "cot" selects USER_COT plus the FEW_SHOTS examples; any
            other value selects USER_DIRECT with no examples.

    Returns:
        The prompt text. With a chat template this is the output of
        ``tokenizer.apply_chat_template(..., tokenize=False,
        add_generation_prompt=True)``; otherwise a plain
        ``[SYSTEM]/[USER]/[ASSISTANT]`` transcript ending in an open
        ``[ASSISTANT]`` tag.
    """
    use_cot = strategy == "cot"
    user = (USER_COT if use_cot else USER_DIRECT).format(problem=problem)
    shots = FEW_SHOTS if use_cot else []

    if supports_chat_template(tokenizer):
        messages = [{"role": "system", "content": SYSTEM_CN}]
        for ex in shots:
            messages.append({"role": "user", "content": ex["q"]})
            messages.append({"role": "assistant", "content": ex["a"]})
        messages.append({"role": "user", "content": user})
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Fallback when a tokenizer has no chat_template. The few-shot turns are
    # included here as well, so "cot" carries the same examples on both paths.
    parts = ["[SYSTEM]\n" + SYSTEM_CN]
    for ex in shots:
        parts.append("[USER]\n" + ex["q"])
        parts.append("[ASSISTANT]\n" + ex["a"])
    parts.append("[USER]\n" + user)
    parts.append("[ASSISTANT]")
    return "\n".join(parts)

# 模型推理

In [21]:
from dataclasses import dataclass

@dataclass
class GenConfig:
    """Generation settings that generate_one forwards to ``model.generate``."""

    # Upper bound on the number of newly generated tokens.
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS
    # Sampling temperature; defaults to GEN_TEMP.
    temperature: float = GEN_TEMP
    # Value forwarded as `top_p`.
    top_p: float = 1.0
    # Value forwarded as `do_sample`; off by default.
    do_sample: bool = False

def generate_one(model, tokenizer, prompt_text: str, device: str, gen_cfg: GenConfig) -> str:
    """Generate a completion for a single prompt and return only the new text.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer matching *model*.
        prompt_text: Fully rendered prompt string.
        device: Device the encoded inputs are moved to.
        gen_cfg: Generation settings.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped, surrounding whitespace stripped).
    """
    inputs = tokenizer([prompt_text], return_tensors="pt")
    inputs = to_device(inputs, device)

    # Sampling only makes sense with a positive temperature; otherwise decode
    # greedily and do not forward the sampling-only knobs at all.
    sampling = bool(gen_cfg.do_sample) and gen_cfg.temperature > 0
    gen_kwargs = {
        "max_new_tokens": gen_cfg.max_new_tokens,
        "do_sample": sampling,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if sampling:
        gen_kwargs["temperature"] = gen_cfg.temperature
        gen_kwargs["top_p"] = gen_cfg.top_p

    # Give generate() an explicit pad id, reusing EOS when the tokenizer
    # defines no dedicated pad token.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    if pad_id is not None:
        gen_kwargs["pad_token_id"] = pad_id

    with torch.no_grad():
        output_ids = model.generate(**inputs, **gen_kwargs)

    # The returned sequence starts with the prompt tokens; keep only what follows.
    prompt_len = inputs["input_ids"].shape[1]
    gen_ids = output_ids[0, prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()



# 数据读取（SVAMP）

In [22]:

def load_svamp(split="test"):
    """Load the requested split of the "MU-NLPC/Calc-svamp" dataset."""
    dataset = load_dataset("MU-NLPC/Calc-svamp", split=split)
    return dataset

def example_to_problem(ex: Dict[str, Any]) -> str:
    """Return the problem statement stored under the record's "question" key."""
    question = ex["question"]
    return question

# Load the test split once and show the dataset summary plus the first record.
ds = load_svamp("test")
print(ds)
print(ds[0])


Dataset({
    features: ['id', 'question', 'chain', 'result', 'result_float', 'equation', 'problem_type'],
    num_rows: 1000
})
{'id': 'svamp__chal-1', 'question': 'Each pack of dvds costs 76 dollars. If there is a discount of 25 dollars on each pack, how much do you have to pay to buy each pack?', 'chain': '<gadget id="calculator">76 - 25</gadget>\n<output>51</output>\n\n<result>51</result>', 'result': '51', 'result_float': 51.0, 'equation': '( 76.0 - 25.0 )', 'problem_type': 'Subtraction'}


# 加载模型

In [23]:
# Pick the device once; evaluate_svamp reads `device`, `tokenizer` and `model`
# as module-level globals.
device = get_device()
print("Device:", device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# device_map=None: no automatic placement is requested; the model is moved to
# `device` explicitly and then switched to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map=None
).to(device).eval()

print("Loaded:", MODEL_NAME)


Device: mps
Loaded: Qwen/Qwen2.5-0.5B-Instruct


# 评测主流程

In [24]:

def evaluate_svamp(strategy: str = "direct",
                   limit: Optional[int] = None,
                   save_path: str = "svamp_predictions.csv",
                   split: str = "test"):
    """Run the model over a Calc-SVAMP split and report accuracy.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        strategy: Prompting strategy handed to build_chat_prompt.
        limit: Evaluate only the first *limit* examples; None means all.
        save_path: CSV file that receives one row per example (problem, gold,
            prediction, raw_output).
        split: Dataset split to load.

    Returns:
        ``(accuracy, save_path)`` where accuracy is in [0, 1]. Examples whose
        gold answer or prediction cannot be parsed count as incorrect.
    """
    ds = load_svamp(split=split)
    if limit is not None:
        ds = ds.select(range(min(limit, len(ds))))
    print(f"Loaded samples: {len(ds)} (split={split})")
    gen_cfg = GenConfig()

    n_correct = 0
    rows = []

    for ex in tqdm(ds, desc=f"Evaluating ({strategy})"):
        problem = example_to_problem(ex)

        # Prefer the numeric field, but fall back to the string field when the
        # numeric one is absent, null or NaN. A plain dict.get default would
        # only cover the "absent" case.
        gold = normalize_answer(ex.get("result_float"))
        if gold is None or math.isnan(gold):
            gold = normalize_answer(ex.get("result"))

        prompt_text = build_chat_prompt(tokenizer, problem, strategy)
        out = generate_one(model, tokenizer, prompt_text, device, gen_cfg)
        pred = extract_numeric_answer(out)

        ok = (gold is not None and pred is not None and feq(pred, gold))
        if ok:
            n_correct += 1

        rows.append({
            "problem": problem,
            "gold": gold,
            "prediction": pred,
            "raw_output": out
        })

    acc = n_correct / len(ds) if len(ds) > 0 else 0.0
    print(f"=== Calc-SVAMP ({strategy}) Accuracy: {acc*100:.2f}% ===")

    df = pd.DataFrame(rows)
    df.to_csv(save_path, index=False)
    print(f"Predictions saved to {save_path}")
    return acc, save_path


In [None]:

strategy = "direct"  # or "cot"
# Set `limit` to a small integer for a quick debug run; limit=None evaluates
# every example in the split.
acc_small, csv_small = evaluate_svamp(strategy=strategy, limit=None, save_path=f"svamp_{strategy}_debug.csv")
# Bare expression: in a notebook cell this displays the (accuracy, path) tuple.
acc_small, csv_small


Loaded samples: 5 (split=test)


Evaluating (direct): 100%|██████████| 5/5 [00:01<00:00,  3.92it/s]

=== Calc-SVAMP (direct) Accuracy: 40.00% ===
Predictions saved to svamp_direct_debug.csv





(0.4, 'svamp_direct_debug.csv')