In [None]:
import os
import whisper

# Load the Whisper "large-v3" model once at module level; the code further down
# in this notebook reuses this `model` object for every transcription call.
model = whisper.load_model("large-v3")

def transcribe_directory(directory_path, language="cantonese"):
    """Transcribe every ``.flac`` file directly inside a directory and print the text.

    Files are handled in sorted filename order so the printed output is the
    same on every run (``os.listdir`` makes no ordering guarantee).
    Sub-directories are not searched.

    Args:
        directory_path: Directory containing the ``.flac`` audio files.
        language: Language passed through to ``model.transcribe``. Defaults to
            ``"cantonese"``, the value that used to be hard-coded.

    Returns:
        None. One ``Transcription of <filename>: <text>`` line is printed per file.
    """
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".flac"):
            continue

        file_path = os.path.join(directory_path, filename)

        # Uses the module-level `model` object defined outside this function.
        result = model.transcribe(file_path, language=language)
        print(f"Transcription of {filename}: {result['text']}")


# NOTE(review): this looks like a placeholder path — point it at the real
# directory of .flac files before running this cell.
directory_path = "path/to/your/flac/files"
transcribe_directory(directory_path)


In [9]:
import os
import json

folder_path = "/home/pachen/dataset/Audio2Caption_Demo/alice's group/Nov 23, 2021 1026 AM/cut_audio_segments_flac"

# Collect every .flac file under folder_path (recursively) together with the
# integer parsed from its name (the part before the first "."). Parsing all
# names up front means a non-numeric file name raises ValueError before any
# slow transcription work has been done.
numbered_files = []
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.endswith(".flac"):
            numbered_files.append((int(file.split(".")[0]), os.path.join(root, file)))

# os.walk gives no ordering guarantee; sort so output.json is written in the
# same (numeric) order on every run.
numbered_files.sort()

# Maps file number -> full result returned by model.transcribe.
# Uses the `model` object loaded by the first cell of this notebook.
results = {}
for file_number, file_path in numbered_files:
    results[file_number] = model.transcribe(file_path, language="cantonese")

# Write the mapping as JSON; json.dump turns the integer keys into strings.
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

In [10]:
import os
import json
from datetime import timedelta

folder_path = "/home/pachen/dataset/Audio2Caption_Demo/alice's group/Nov 23, 2021 1026 AM"

# Read the original subtitle file once, up front, instead of re-opening and
# re-parsing it for every audio segment.
original_json_path = os.path.join(folder_path, "GMT20211123-022650_Recording.json")
with open(original_json_path, "r", encoding="utf-8") as f:
    original_data = json.load(f)

# Index the subtitle entries by integer id for constant-time lookup.
# setdefault keeps the FIRST entry when an id is duplicated, which is what a
# first-match linear search would return.
entries_by_id = {}
for entry in original_data:
    entries_by_id.setdefault(int(entry["id"]), entry)

# Gather (segment number, path) for every .flac segment. The number is the part
# of the file name before the first ".".
segment_files = []
for root, dirs, files in os.walk(os.path.join(folder_path, "cut_audio_segments_flac")):
    for file in files:
        if file.endswith(".flac"):
            segment_files.append((int(file.split(".")[0]), os.path.join(root, file)))

# os.walk order is unspecified; process segments in numeric order so the output
# list has a stable, id-ascending order.
segment_files.sort()

results = []
for file_number, file_path in segment_files:
    original_entry = entries_by_id.get(file_number)
    if original_entry is None:
        # No subtitle entry for this segment: nothing would be written for it,
        # so skip the (slow) transcription as well.
        continue

    # Uses the `model` object loaded by the first cell of this notebook.
    result = model.transcribe(file_path, language="cantonese")

    # Copy the timing/speaker metadata and replace the sentence with the ASR
    # transcript. Only result["text"] is stored: compute_cer_wer in this
    # notebook treats "sentence" as a string (len / split), so storing the
    # whole result dict would break it.
    results.append({
        "id": str(file_number),
        "start_time": original_entry["start_time"],
        "end_time": original_entry["end_time"],
        "duration": original_entry["duration"],
        "speaker": original_entry["speaker"],
        "sentence": result["text"],
    })

# Write the ASR subtitle entries next to the original subtitle file.
output_json_path = os.path.join(folder_path, "GMT20211123-022650_Recording_whisper.json")
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

KeyboardInterrupt: 

In [None]:
import json
import editdistance

def _sentence_text(sentence):
    """Return the transcript string for a ``sentence`` field.

    A plain string is returned unchanged; a dict (for example a full
    ``model.transcribe`` result) contributes its ``"text"`` value.
    """
    if isinstance(sentence, dict):
        return sentence["text"]
    return sentence


def _normalized_id(entry):
    """Return ``entry["id"]`` as an int when possible so "7" and 7 compare equal."""
    raw_id = entry["id"]
    try:
        return int(raw_id)
    except (TypeError, ValueError):
        return raw_id


def _pair_entries(original_data, asr_data):
    """Pair reference and ASR entries by ``id`` when every entry has one, else by position."""
    every_entry_has_id = all("id" in entry for entry in original_data) and all(
        "id" in entry for entry in asr_data
    )
    if not every_entry_has_id:
        return list(zip(original_data, asr_data))

    asr_by_id = {}
    for entry in asr_data:
        asr_by_id.setdefault(_normalized_id(entry), entry)

    pairs = []
    for entry in original_data:
        match = asr_by_id.get(_normalized_id(entry))
        if match is not None:
            pairs.append((entry, match))
    return pairs


def compute_cer_wer(original_json_path, asr_json_path):
    """Compute character- and word-level error rates between two subtitle JSON files.

    Both files must contain a list of entries with a ``"sentence"`` field. When
    every entry in both files also has an ``"id"``, entries are paired by id,
    so a different ordering or missing ASR entries do not cause unrelated
    sentences to be compared; reference entries without an ASR counterpart are
    skipped. Otherwise entries are paired by position.

    A ``"sentence"`` value may be a plain string or a dict with a ``"text"``
    key.

    Words are whitespace-separated tokens (``str.split()``), so a sentence
    written without spaces counts as a single word.

    Args:
        original_json_path: Path to the reference subtitle JSON file.
        asr_json_path: Path to the ASR subtitle JSON file.

    Returns:
        A tuple ``(cer, wer, char_weighted_cer, word_weighted_wer)``:
        ``cer`` and ``wer`` are total edit distance divided by total reference
        length; the weighted variants multiply each sentence's edit distance by
        that sentence's reference length before dividing by the total reference
        length. Each value is 0 when its denominator is 0.
    """
    with open(original_json_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)
    with open(asr_json_path, 'r', encoding='utf-8') as f:
        asr_data = json.load(f)

    total_chars = 0
    total_words = 0
    char_errors = 0
    word_errors = 0
    char_weighted_errors = 0
    word_weighted_errors = 0

    for original_entry, asr_entry in _pair_entries(original_data, asr_data):
        original_sentence = _sentence_text(original_entry['sentence'])
        asr_sentence = _sentence_text(asr_entry['sentence'])

        original_word_list = original_sentence.split()
        asr_word_list = asr_sentence.split()

        original_chars = len(original_sentence)
        original_words = len(original_word_list)
        total_chars += original_chars
        total_words += original_words

        # Character-level edit distance between the two strings.
        char_distance = editdistance.eval(original_sentence, asr_sentence)
        char_errors += char_distance
        char_weighted_errors += char_distance * original_chars

        # Word-level edit distance between the two token lists.
        word_distance = editdistance.eval(original_word_list, asr_word_list)
        word_errors += word_distance
        word_weighted_errors += word_distance * original_words

    cer = char_errors / total_chars if total_chars > 0 else 0
    wer = word_errors / total_words if total_words > 0 else 0
    char_weighted_cer = char_weighted_errors / total_chars if total_chars > 0 else 0
    word_weighted_wer = word_weighted_errors / total_words if total_words > 0 else 0

    return cer, wer, char_weighted_cer, word_weighted_wer

# Reference subtitle file and the Whisper output file to score against it.
original_json_path = "/home/pachen/dataset/Audio2Caption_Demo/alice's group/Nov 23, 2021 1026 AM/GMT20211123-022650_Recording.json"
asr_json_path = "/home/pachen/dataset/Audio2Caption_Demo/alice's group/Nov 23, 2021 1026 AM/GMT20211123-022650_Recording_whisper.json"

# Evaluates to the tuple (cer, wer, char_weighted_cer, word_weighted_wer).
compute_cer_wer(original_json_path, asr_json_path)

将 Whisper 的预测结果追加到原始字幕条目之后:在每个原始条目中新增 `whisper_sentence` 字段,并将结果写入新的 JSON 文件。

In [11]:
import os
import json
from datetime import timedelta

# Folder that holds the original subtitle JSON and the cut audio segments.
folder_path = "/home/pachen/dataset/Audio2Caption_Demo/alice's group/Nov 23, 2021 1026 AM"

# Load the original subtitles ONCE, before the loop. Re-loading the file for
# every segment would replace `original_data` each time and throw away the
# "whisper_sentence" values added on earlier iterations, so only the last
# matched segment would survive in the output. Loading here also guarantees
# `original_data` exists when no .flac file is found.
original_json_path = os.path.join(folder_path, "GMT20211123-022650_Recording.json")
with open(original_json_path, "r", encoding="utf-8") as f:
    original_data = json.load(f)

# Index the subtitle entries by integer id. The dict values are the same
# objects as in `original_data`, so updating one updates the list that is
# written out below. setdefault keeps the first entry for a duplicated id.
entries_by_id = {}
for entry in original_data:
    entries_by_id.setdefault(int(entry["id"]), entry)

# Walk the segment folder and attach a Whisper result to each matching entry.
for root, dirs, files in os.walk(os.path.join(folder_path, "cut_audio_segments_flac")):
    for file in files:
        if not file.endswith(".flac"):
            continue

        file_path = os.path.join(root, file)
        file_number = int(file.split(".")[0])  # segment number = name before the first "."

        original_entry = entries_by_id.get(file_number)
        if original_entry is None:
            # No subtitle entry for this segment; skip the slow transcription.
            continue

        # Uses the `model` object loaded by the first cell of this notebook.
        result = model.transcribe(file_path, language="cantonese")

        # NOTE(review): this stores the full result returned by model.transcribe,
        # not just the transcript; use result["text"] if only the text is wanted.
        original_entry["whisper_sentence"] = result

# Write the augmented subtitle entries to a new JSON file.
# NOTE(review): an earlier cell in this notebook writes a different structure
# to this same path; running both overwrites one with the other.
output_json_path = os.path.join(folder_path, "GMT20211123-022650_Recording_whisper.json")
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(original_data, f, ensure_ascii=False, indent=4)