# In [None]:
# pip install pydub pandas librosa tqdm

import os
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm

# ==============================
# 1. Configure data paths
# ==============================
AUDIO_DIR = "path_to_audio"  # directory holding the long DAIC-WOZ recordings
TEXT_DIR = "path_to_transcripts"  # directory holding the matching timestamped transcripts
OUTPUT_AUDIO_DIR = "path_to_output"  # directory where the split short clips are saved

# Create the output directory up front; exist_ok avoids an error if it already exists.
os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)

# ==============================
# 2. Process each audio file
# ==============================
def split_audio_by_transcript(audio_file, transcript_file, output_dir):
    """Split a long WAV recording into one clip per transcript row.

    For every row of the transcript, the audio between ``start_time`` and
    ``end_time`` is exported as ``<audio stem>_<row index>.wav`` inside
    *output_dir*, and the row's ``text`` is written next to it as a ``.txt``
    file with the same stem, so audio clips and text stay paired.

    Args:
        audio_file: Path to the source WAV file.
        transcript_file: Path to a CSV file with ``start_time``,
            ``end_time`` and ``text`` columns. Timestamps are multiplied by
            1000 before slicing, i.e. they are assumed to be in seconds.
        output_dir: Directory that receives the clips and text files; this
            function does not create it.
    """
    audio = AudioSegment.from_wav(audio_file)
    df = pd.read_csv(transcript_file)

    # Strip only the final extension: splitting on the first "." would map
    # "a.b.wav" and "a.c.wav" to the same stem and overwrite each other.
    stem = os.path.splitext(os.path.basename(audio_file))[0]

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Slice offsets are expressed in milliseconds.
        start_ms = int(row["start_time"] * 1000)
        end_ms = int(row["end_time"] * 1000)
        text = row["text"]

        segment = audio[start_ms:end_ms]

        # Build both output names from one extension-less base instead of
        # str.replace(".wav", ".txt"), which would also rewrite any ".wav"
        # occurring in the directory part of the path.
        clip_base = os.path.join(output_dir, f"{stem}_{idx}")
        segment.export(clip_base + ".wav", format="wav")

        # Explicit UTF-8 keeps non-ASCII transcripts intact regardless of
        # the platform default encoding. Empty cells are read as NaN and
        # numeric-looking cells as numbers; both would make write() raise.
        with open(clip_base + ".txt", "w", encoding="utf-8") as f:
            f.write("" if pd.isna(text) else str(text))

# ==============================
# 3. Run batch processing
# ==============================
# Sorted so the processing order is reproducible; os.listdir order is arbitrary.
audio_files = sorted(f for f in os.listdir(AUDIO_DIR) if f.endswith(".wav"))

for audio_file in tqdm(audio_files):
    # Swap only the final extension; str.replace(".wav", ".csv") would rewrite
    # every ".wav" occurrence inside the file name.
    transcript_name = os.path.splitext(audio_file)[0] + ".csv"
    transcript_file = os.path.join(TEXT_DIR, transcript_name)

    if not os.path.exists(transcript_file):
        # Report the skip instead of dropping the recording silently;
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Skipping {audio_file}: no transcript at {transcript_file}")
        continue

    split_audio_by_transcript(
        os.path.join(AUDIO_DIR, audio_file), transcript_file, OUTPUT_AUDIO_DIR
    )