# 一、拆分data_part

In [1]:
import json
import os
from pathlib import Path

# Split a JSON file into one output file per top-level key.
def split_json_by_keys(input_path: str, output_dir: str, max_files: int = 1000) -> None:
    """Write each top-level entry of a JSON object to its own ``<key>.json`` file.

    Args:
        input_path: Path of the JSON file to split. Its top-level value must
            be an object whose keys parse as integers.
        output_dir: Directory that receives the per-key files; created
            (including parents) if it does not exist.
        max_files: Maximum number of files to write, handy for quick tests.
            Zero or a negative value writes nothing.

    Raises:
        TypeError: If the top-level JSON value is not an object.
        ValueError: If a key that is about to be written is not an integer
            string.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    with input_path.open('r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, dict):
        raise TypeError(
            f"Expected a JSON object at the top level of {input_path}, "
            f"got {type(data).__name__}"
        )

    written = 0
    for key_str, value in data.items():
        # Check the limit before writing so that max_files <= 0 produces no files.
        if written >= max_files:
            break
        # int() normalizes the key for the file name (e.g. "007" -> "7.json");
        # keys that normalize to the same integer overwrite each other.
        key_int = int(key_str)
        out_path = output_dir / f"{key_int}.json"
        with out_path.open('w', encoding='utf-8') as wf:
            json.dump(value, wf, ensure_ascii=False, indent=2)
        written += 1

    print(f"Done. Requested: {max_files}, written: {written} → {output_dir}")

In [3]:
# Source JSON to split, and the directory that receives one file per key.
json_data_input_path = "../../datas/origin_datas/data_part/data_part.json"
json_data_output_dir = "../../datas/origin_datas/new_data_by_keys"
# Upper bound on the number of per-key files written in this run.
json_data_max_files = 1000
split_json_by_keys(
    input_path=json_data_input_path,
    output_dir=json_data_output_dir,
    max_files=json_data_max_files,
)

Done. Requested: 1000, written: 1000 → ../../datas/origin_datas/new_data_by_keys


# 二、制作映射表

In [4]:
import os
import json
import csv

# Input directories, one per modality; each is scanned for files named "<id>.<ext>".
audio_dir = "../../datas/origin_datas/audio_part"
image_dir = "../../datas/origin_datas/images_part"
# Directory holding the per-key JSON files.
json_dir  = "../../datas/origin_datas/new_data_by_keys"
# Destination of the CSV mapping table (id -> audio / image / json paths).
output_map_csv_path = "../../data_map.csv"

In [5]:
# Collect the IDs (names without extension) of the entries in a folder that
# carry one of the given extensions.
def list_ids_by_ext(folder_path: str, format_list) -> set:
    """Return the base names of directory entries matching the given extensions.

    Args:
        folder_path: Directory to scan (top level only, not recursive).
        format_list: Iterable of extensions such as ``[".mp3"]``. Matching is
            case-insensitive on both the entry name and the extension.

    Returns:
        A set of entry names with their final extension removed (as computed
        by ``os.path.splitext``). The set is empty when ``folder_path`` is
        not an existing directory or nothing matches.
    """
    if not os.path.isdir(folder_path):
        return set()

    # Materialize once so any iterable (including a generator) works for every
    # entry, and lower-case the extensions so ".MP3" behaves like ".mp3".
    suffixes = tuple(ext.lower() for ext in format_list)

    return {
        os.path.splitext(file_name)[0]
        for file_name in os.listdir(folder_path)
        if file_name.lower().endswith(suffixes)
    }

In [6]:
def _id_sort_key(sample_id):
    """Order numeric IDs by integer value, ahead of any non-numeric IDs.

    Returning a tuple keeps numeric and non-numeric IDs comparable; a bare
    int-or-str key raises TypeError as soon as both kinds are present.
    """
    if sample_id.isdecimal():
        return (0, int(sample_id), "")
    return (1, 0, sample_id)


audio_ids = list_ids_by_ext(audio_dir, [".mp3"])
image_ids = list_ids_by_ext(image_dir, [".jpg"])
json_ids  = list_ids_by_ext(json_dir,  [".json"])
# Keep only IDs present in all three sets so every row has audio, image and JSON.
common_ids = audio_ids & image_ids & json_ids
print("长度：", len(common_ids))

# NOTE(review): list_ids_by_ext matches extensions case-insensitively, while the
# paths below always use lower-case extensions; confirm the files on disk do too.
rows = []
for _id in sorted(common_ids, key=_id_sort_key):
    audio_path = os.path.join(audio_dir, f"{_id}.mp3")
    image_path = os.path.join(image_dir, f"{_id}.jpg")
    json_path = os.path.join(json_dir, f"{_id}.json")

    rows.append({
        "id": _id,
        "audio_path": audio_path,
        "image_path": image_path,
        "json_path": json_path,
    })

# Write the CSV; create the parent directory first when the path has one.
output_map_dir = os.path.dirname(output_map_csv_path)
if output_map_dir:
    os.makedirs(output_map_dir, exist_ok=True)
with open(output_map_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "audio_path", "image_path", "json_path"])
    writer.writeheader()
    writer.writerows(rows)

print(f"Done. {len(rows)} entries written to {output_map_csv_path}")

长度： 1000
Done. 1000 entries written to ../../data_map.csv
