In [1]:
%pip install -q -U openai

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import re
import json
import datetime
import time
import random
import glob

# Paths and run configuration
DATA_DIR = "./data"
SRC_MD = DATA_DIR + "/blackwukong.md"
# One timestamp per execution of this cell; it is embedded in both output file names.
TS = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
OUT_BASE_JSONL = "{}/wukong_base_{}.jsonl".format(DATA_DIR, TS)
OUT_JSONL = "{}/wukong_dataset_{}.jsonl".format(DATA_DIR, TS)

from openai import OpenAI

# Endpoint configuration for the OpenAI-compatible API.
# The API key is read from the SILICONFLOW_API_KEY environment variable so that a real
# secret never has to be saved inside the notebook. The "sk-xxx" placeholder is kept
# only as a fallback for demonstration; requests made with it are expected to be rejected.
BASE_URL = "https://api.siliconflow.cn/v1"
MODEL_ID = "Qwen/Qwen3-235B-A22B-Instruct-2507"
API_KEY = os.environ.get("SILICONFLOW_API_KEY") or "sk-xxx"
if API_KEY == "sk-xxx":
    print("WARNING: SILICONFLOW_API_KEY is not set; using the placeholder API key")

client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
print(f"Using model: {MODEL_ID} @ {BASE_URL}")


Using model: Qwen/Qwen3-235B-A22B-Instruct-2507 @ https://api.siliconflow.cn/v1


In [None]:
# Load the source markdown and split it into sections

with open(SRC_MD, "r", encoding="utf-8") as f:
    raw_markdown = f.read()

md_src = raw_markdown

# Split at level-2/3 headings; when there are none, fall back to blank-line-separated paragraphs.
matches = list(re.finditer(r"(?m)^(#{2,3})\s+(.+)$", md_src))
if matches:
    # Each section runs from its heading to the next heading (or to the end of the text).
    starts = [m.start() for m in matches]
    stops = starts[1:] + [len(md_src)]
    candidates = [md_src[lo:hi].strip() for lo, hi in zip(starts, stops)]
else:
    candidates = [p.strip() for p in re.split(r"\n\s*\n", md_src)]

# Keep only chunks of at least 100 characters.
sections = [chunk for chunk in candidates if len(chunk) >= 100]

# Order-preserving de-duplication, keyed on the first 240 characters of the
# whitespace-collapsed, lower-cased text.
seen = set()
uniq = []
for text in sections:
    key = re.sub(r"\s+", " ", text).lower()[:240]
    if key not in seen:
        seen.add(key)
        uniq.append(text)
sections = uniq

print(f"sections={len(sections)}")

sections=14


In [4]:
# Teacher-model call and response parsing

# System prompt sent with every teacher request. It casts the model as a curator of
# "Black Myth: Wukong" reference material and asks it to turn the given source passage
# into exactly one training sample, emitted as strict JSON with the keys "instruction"
# (a natural-language question) and "output" (an answer based only on the passage),
# with no extra commentary or code fences.
SYS_PROMPT = (
    "你是《黑神话：悟空》的资深资料整理者。"
    "将给定原文片段转写为一条训练样本，严格输出JSON："
    '{"instruction":"用户问题","output":"权威完整答案"}。'
    "要求："
    "1. instruction 是自然语言问题；"
    "2. output 仅依据原文，不要臆测；"
    "3. 禁止任何额外说明或代码块。"
)

def ask_teacher(block: str) -> str:
    """Send one source passage to the teacher model and return its raw reply.

    Args:
        block: Text of a single section of the source document. It is sent as
            the user message, with ``SYS_PROMPT`` as the system message.

    Returns:
        The ``content`` of the first choice in the chat-completion response.
    """
    conversation = [
        dict(role="system", content=SYS_PROMPT),
        dict(role="user", content=block),
    ]
    request = dict(
        model=MODEL_ID,
        messages=conversation,
        temperature=0.2,
        max_tokens=600,
        # Request a JSON-object reply from the endpoint.
        response_format={"type": "json_object"},
    )
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

def parse_json_pair(text: str):
    """Extract an (instruction, output) pair from a teacher-model reply.

    The reply is scanned for the first position at which a complete JSON object
    can be decoded, so leading or trailing chatter (including stray ``}``
    characters after the object) does not break parsing.

    Args:
        text: Raw reply text. A non-string value (e.g. ``None``) is treated as
            a reply that contains no JSON.

    Returns:
        A ``(instruction, output)`` tuple of stripped, non-empty strings.

    Raises:
        ValueError: If no JSON object can be decoded from ``text``, or if
            either field is missing, empty, or not a string.
    """
    if not isinstance(text, str):
        raise ValueError("教师模型返回非JSON")

    decoder = json.JSONDecoder()
    obj = None
    first_err = None
    pos = text.find("{")
    while pos != -1:
        try:
            # raw_decode stops at the end of the first complete value, so
            # whatever follows the object is ignored.
            candidate, _end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError as err:
            if first_err is None:
                first_err = err
        else:
            if isinstance(candidate, dict):
                obj = candidate
                break
        pos = text.find("{", pos + 1)
    if obj is None:
        raise ValueError("教师模型返回非JSON") from first_err

    ins = obj.get("instruction")
    out = obj.get("output")
    # Reject non-string values explicitly instead of failing on .strip().
    if not isinstance(ins, str) or not isinstance(out, str):
        raise ValueError("缺少必要字段")
    ins = ins.strip()
    out = out.strip()
    if not ins or not out:
        raise ValueError("缺少必要字段")
    return ins, out


In [None]:
# Generate the base instruction/output pairs and write them to OUT_BASE_JSONL.
# Each section gets up to three attempts. A failed API call and an unparsable reply are
# both retried, and a section that still fails is reported and skipped so that one bad
# reply cannot abort the whole run.

os.makedirs(os.path.dirname(OUT_BASE_JSONL), exist_ok=True)
base_written = 0
base_skipped = 0

with open(OUT_BASE_JSONL, "w", encoding="utf-8") as fbase:
    for seg_idx, seg in enumerate(sections):
        pair = None
        last_err = None
        for _attempt in range(3):
            try:
                # Reuse the helpers defined above instead of repeating the request inline.
                pair = parse_json_pair(ask_teacher(seg))
                break
            except Exception as exc:
                last_err = exc
                if _attempt < 2:
                    # Exponential back-off with a little jitter before the next attempt.
                    time.sleep(1.5 ** _attempt + random.random() * 0.3)
        if pair is None:
            base_skipped += 1
            print(f"skipped section {seg_idx}: {type(last_err).__name__}: {last_err}")
            continue
        ins, out = pair
        fbase.write(json.dumps({"instruction": ins, "output": out}, ensure_ascii=False) + "\n")
        base_written += 1

if base_skipped:
    print(f"base skipped: {base_skipped}")
print(f"base saved: {base_written} -> {OUT_BASE_JSONL}")

base saved: 14 -> ./data/wukong_base_20251109_201117.jsonl


In [None]:
# Read the base set, paraphrase every question, and write the final set to OUT_JSONL
# (runs sequentially; each paraphrase is paired with the base answer).

NUM_VARIANTS = 14

# System prompt for the paraphrasing request. The JSON template is shown with plain
# double quotes so that the model sees a valid JSON example.
PARAPHRASE_SYS_PROMPT = (
    '严格输出 JSON 对象：{"paraphrases": ["..."]}；禁止任何额外文本/代码块/前后缀。'
    '若需引号请用中文引号「」或在 JSON 中转义为 \\"。'
    "每项必须是可直接回答的等价问法，不改变边界与条件。"
)

os.makedirs(os.path.dirname(OUT_JSONL), exist_ok=True)
written = 0
skipped = 0
seen_q = set()

# Pick the most recently modified base file.
base_files = sorted(glob.glob(f"{DATA_DIR}/wukong_base_*.jsonl"), key=os.path.getmtime, reverse=True)
if not base_files:
    raise FileNotFoundError(
        f"no wukong_base_*.jsonl file found in {DATA_DIR}; run the base-generation cell first"
    )
IN_BASE_JSONL = base_files[0]

with open(IN_BASE_JSONL, "r", encoding="utf-8") as fr, open(OUT_JSONL, "w", encoding="utf-8") as fw:
    for line in fr:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        base_q = (obj.get("instruction") or "").strip()
        answer = (obj.get("output") or "").strip()
        if not base_q or not answer:
            continue

        arr = None
        last_err = None
        for _attempt in range(3):
            try:
                r2 = client.chat.completions.create(
                    model=MODEL_ID,
                    messages=[
                        {"role": "system", "content": PARAPHRASE_SYS_PROMPT},
                        {"role": "user", "content": f"基础问题：{base_q}\n数量：{NUM_VARIANTS}\n输出键：paraphrases"},
                    ],
                    temperature=0.6,
                    max_tokens=800,
                    response_format={"type": "json_object"},
                )
                # Parse inside the retry loop so a truncated or malformed reply is retried
                # instead of crashing the run.
                obj2 = json.loads(r2.choices[0].message.content or "")
                candidates = obj2.get("paraphrases") if isinstance(obj2, dict) else None
                if not isinstance(candidates, list):
                    raise ValueError("reply has no 'paraphrases' list")
                arr = [x.strip() for x in candidates if isinstance(x, str) and x.strip()]
                break
            except Exception as exc:
                last_err = exc
                if _attempt < 2:
                    # Exponential back-off with a little jitter before the next attempt.
                    time.sleep(1.5 ** _attempt + random.random() * 0.3)
        if arr is None:
            skipped += 1
            print(f"skipped question {base_q!r}: {type(last_err).__name__}: {last_err}")
            continue
        if not arr:
            skipped += 1
            continue

        # Normalize: make every question end with a question mark, then de-duplicate globally.
        for s in arr:
            if not s.endswith(("?", "？")):
                s += "？"
            if s in seen_q:
                continue
            seen_q.add(s)
            fw.write(json.dumps({"instruction": s, "output": answer}, ensure_ascii=False) + "\n")
            written += 1

if skipped:
    print(f"skipped base questions: {skipped}")
print(f"saved: {written} -> {OUT_JSONL}")

saved: 183 -> ./data/wukong_dataset_20251109_215706.jsonl
