In [3]:
import re
import json
import os

# Header of one SRT cue: numeric index line followed by the timing line.
_SRT_CUE_HEADER = re.compile(
    r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})(?:\n|\Z)'
)
# Any of these characters causes the whole cue to be skipped.
_SRT_SKIP_CHARS = re.compile(r'[\[\]【】]')
# Zero-width split point: right after a sentence-ending mark, when a
# full-width colon appears later before the next sentence-ending mark.
_SRT_SPEAKER_BOUNDARY = re.compile(r'(?<=[。！？])(?=[^。！？]*：)')
# First separator between a speaker label and the spoken sentence.
_SRT_SPEAKER_SEPARATOR = re.compile(r':|：|;|；')


def parse_srt(srt_text):
    """Parse SRT subtitle text into a flat list of speaker/sentence entries.

    Each cue's text is everything between its timing line and the next cue
    header (or the end of the input), so multi-line cues keep all of their
    lines and a final cue without a trailing newline is still parsed.

    Cues whose text contains ``[``, ``]``, ``【`` or ``】`` are skipped
    entirely. Every remaining non-blank line is split into one piece per
    speaker, and each piece is split once on the first ``:``, ``：``, ``;``
    or ``；`` into a speaker and a sentence. A piece without any of those
    separators gets an empty speaker.

    Args:
        srt_text: The full contents of an SRT file as a string.

    Returns:
        A list of dicts with the keys ``id``, ``start_time``, ``end_time``,
        ``speaker`` and ``sentence`` (all strings), in input order.
    """
    # Normalise CRLF / CR so the "\n"-based patterns also work on text that
    # was not read through universal-newline translation.
    text = srt_text.replace('\r\n', '\n').replace('\r', '\n')

    headers = list(_SRT_CUE_HEADER.finditer(text))
    # A cue's content ends where the next header starts, or at end of text.
    content_ends = [header.start() for header in headers[1:]] + [len(text)]

    data = []
    for header, content_end in zip(headers, content_ends):
        id_, start, end = header.groups()
        content = text[header.end():content_end]

        if _SRT_SKIP_CHARS.search(content):
            continue  # cue contains a skip character; drop it entirely

        for speaker_line in content.split('\n'):
            if not speaker_line.strip():
                continue  # ignore blank lines
            # One line may hold several speakers; split it per speaker.
            for sub_line in _SRT_SPEAKER_BOUNDARY.split(speaker_line):
                # NOTE: the split is on the first separator of any kind, so
                # a sentence with no speaker label that contains a colon is
                # still split at that colon.
                parts = _SRT_SPEAKER_SEPARATOR.split(sub_line, maxsplit=1)
                if len(parts) == 2:
                    speaker, sentence = parts
                else:
                    speaker, sentence = '', parts[0]
                data.append({
                    'id': id_,
                    'start_time': start,
                    'end_time': end,
                    'speaker': speaker.strip(),
                    'sentence': sentence.strip(),
                })
    return data

def srt_to_json(srt_file_path):
    """Return the parsed contents of an SRT file as a JSON string.

    The file is read as UTF-8 and handed to ``parse_srt``. The result is
    serialised with a 4-space indent and with non-ASCII characters written
    as-is rather than escaped.

    Args:
        srt_file_path: Path of the ``.srt`` file to read.

    Returns:
        The JSON text for the list returned by ``parse_srt``.
    """
    with open(srt_file_path, mode='r', encoding='utf-8') as srt_handle:
        entries = parse_srt(srt_handle.read())
    return json.dumps(entries, indent=4, ensure_ascii=False)

def convert_srt_to_json_in_directory(directory):
    """Convert every ``.srt`` file under ``directory`` to a sibling ``.json``.

    The tree is walked recursively with ``os.walk``. For each file whose name
    ends with ``.srt`` the JSON produced by ``srt_to_json`` is written, as
    UTF-8, next to the source file under the same base name, and a
    confirmation line is printed.

    Args:
        directory: Root directory to search.
    """
    for folder, _subfolders, filenames in os.walk(directory):
        srt_names = (name for name in filenames if name.endswith(".srt"))
        for srt_name in srt_names:
            source_path = os.path.join(folder, srt_name)
            # Convert first so that no output file is created if it fails.
            converted = srt_to_json(source_path)
            target_path = os.path.splitext(source_path)[0] + '.json'
            with open(target_path, 'w', encoding='utf-8') as out:
                out.write(converted)
            print(f'Converted {source_path} to {target_path}')

# Run the conversion for every `.srt` file under the `Subtitle` directory
# (the path is relative to the current working directory).
convert_srt_to_json_in_directory("Subtitle")


Converted Subtitle/alice's group/Nov 22, 2021 1027 AM/GMT20211122-022751/GMT20211122-022751_Recording_gvo_1280x720_with_audio.srt to Subtitle/alice's group/Nov 22, 2021 1027 AM/GMT20211122-022751/GMT20211122-022751_Recording_gvo_1280x720_with_audio.json
Converted Subtitle/alice's group/Jan 10, 2022 0351 PM/GMT20220110-075157_Recording_gvo_1280x720_with_audio.srt to Subtitle/alice's group/Jan 10, 2022 0351 PM/GMT20220110-075157_Recording_gvo_1280x720_with_audio.json
Converted Subtitle/alice's group/Dec 7, 2021 0926 AM/GMT20211207-012644/GMT20211207-012644_Recording_gvo_1280x720_with_audio.srt to Subtitle/alice's group/Dec 7, 2021 0926 AM/GMT20211207-012644/GMT20211207-012644_Recording_gvo_1280x720_with_audio.json
Converted Subtitle/alice's group/Mar 8, 2022 1102 AM/GMT20220308-030238/GMT20220308-030238_Recording_gvo_1280x720_with_audio.srt to Subtitle/alice's group/Mar 8, 2022 1102 AM/GMT20220308-030238/GMT20220308-030238_Recording_gvo_1280x720_with_audio.json
Converted Subtitle/alice's