# 一、批量 ASR 转写

In [14]:
import pandas as pd, whisper, tqdm, os
from opencc import OpenCC
import json

In [8]:
# Whisper "base" checkpoint used for every transcription in this notebook.
model = whisper.load_model("base")
# OpenCC converter using the 't2s' config (Traditional -> Simplified Chinese).
cc = OpenCC("t2s")
# Mapping table; later cells read its "audio_path" and "json_path" columns.
df = pd.read_csv("../../data_map.csv")

100%|███████████████████████████████████████| 139M/139M [00:37<00:00, 3.86MiB/s]


In [17]:
def load_title(path: str) -> str:
    """Return the ``title`` field of the JSON file at *path*.

    Used to fill the ``real_text`` column from the ``json_path`` column,
    whose entries are paths relative to this notebook.

    The file is read as UTF-8. When the parsed object has no ``title`` key
    the literal string ``"null"`` is returned. No error handling is done
    here: a missing or malformed file propagates the exception raised by
    ``open`` / ``json``; add handling at the call site if needed.
    """
    with open(path, "r", encoding="utf-8") as fh:
        record = json.loads(fh.read())
    title = record.get("title", "null")
    return title

In [18]:
def transcribe_to_simplified(path):
    """Transcribe the audio file at *path* and return Simplified Chinese text.

    Calls the module-level Whisper ``model`` with ``language="zh"`` and
    ``task="transcribe"``, strips surrounding whitespace from the returned
    ``"text"`` field, then runs it through the module-level OpenCC
    converter ``cc``.
    """
    result = model.transcribe(path, task="transcribe", language="zh")
    return cc.convert(result["text"].strip())

In [19]:
# Transcribe every file in the "audio_path" column, in row order, behind a
# tqdm progress bar; each transcript is stripped of surrounding whitespace.
asr_results = [
    transcribe_to_simplified(audio_path).strip()
    for audio_path in tqdm.tqdm(df["audio_path"])
]

100%|██████████| 1000/1000 [02:19<00:00,  7.16it/s]


In [20]:
# Attach the transcripts; asr_results is in df's row order because it was
# built by iterating df["audio_path"].
df["asr_text"] = asr_results
# Reference text: the "title" field of each row's JSON file.
df["real_text"] = df["json_path"].map(load_title)
# Save the enriched table without the index column.
df.to_csv(path_or_buf="../../map_with_asr.csv", index=False)

# 二、划分数据集

In [21]:
import os
import pandas as pd
import numpy as np

In [24]:
# Input table to be split.
INPUT_CSV = "../../map_with_asr.csv"
# Output: the full table with the added "split" column.
OUTPUT_CSV = "../../map_with_asr_split.csv"
# Outputs: one CSV per split.
TRAIN_CSV, VAL_CSV, TEST_CSV = (
    "../../train.csv",
    "../../val.csv",
    "../../test.csv",
)

# Train / validation / test fractions.
RATIOS = (0.7, 0.15, 0.15)
# Seed for the NumPy random generator that shuffles the rows.
SEED = 4022510316


In [25]:
# Validate the configuration with an explicit exception: unlike `assert`,
# this check still runs when Python is started with -O.
if not abs(sum(RATIOS) - 1.0) < 1e-6:
    raise ValueError("RATIOS 之和必须为 1")

df = pd.read_csv(INPUT_CSV)

n = len(df)
rng = np.random.default_rng(SEED)
perm = rng.permutation(n)  # shuffled row positions, reproducible via SEED

# Sizes are truncated toward zero, so any rows left over by rounding end
# up in the test split.
train_end = int(RATIOS[0] * n)  # end (exclusive) of the train slice of perm
val_end = train_end + int(RATIOS[1] * n)  # end (exclusive) of the val slice

# Label every row "train", then overwrite the val/test positions.
# dtype=object keeps the labels as ordinary Python strings: a fixed-width
# unicode array would silently truncate a longer label, and an array built
# from an empty list would be float64 and reject string assignment (n == 0).
split = np.full(n, "train", dtype=object)
split[perm[train_end:val_end]] = "val"
split[perm[val_end:]] = "test"

df["split"] = split
# Full table with the split column; "utf-8-sig" writes a UTF-8 BOM.
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"Saved: {OUTPUT_CSV} (train={np.sum(split=='train')}, val={np.sum(split=='val')}, test={np.sum(split=='test')})")

# Additionally write one file per split.
for split_name, split_csv in (("train", TRAIN_CSV), ("val", VAL_CSV), ("test", TEST_CSV)):
    df[df["split"] == split_name].to_csv(split_csv, index=False, encoding="utf-8-sig")
print(f"Saved: {TRAIN_CSV}, {VAL_CSV}, {TEST_CSV}")

Saved: ../../map_with_asr_split.csv (train=700, val=150, test=150)
Saved: ../../train.csv, ../../val.csv, ../../test.csv
