In [1]:
import re
import json

def parse_srt(srt_file):
    # 正则表达式匹配时间和文本
    pattern = re.compile(r'(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)\n\n', re.DOTALL)

    with open(srt_file, 'r', encoding='utf-8') as file:
        content = file.read()
        matches = pattern.findall(content)

    results = []
    for start, end, text in matches:
        # 将时间转换为秒
        start_seconds = sum(x * float(t) for x, t in zip([3600, 60, 1, .001], start.replace(',', ':').split(':')))
        end_seconds = sum(x * float(t) for x, t in zip([3600, 60, 1, .001], end.replace(',', ':').split(':')))

        results.append({
            "start": start_seconds,
            "end": end_seconds,
            "text": text.replace('\n', ' ')
        })

    return results

def srt_to_jsonl(srt_file, output_file, audio_path, language):
    sentences = parse_srt(srt_file)
    sentence_text = ' '.join([sentence['text'] for sentence in sentences])
    duration = sentences[-1]['end'] - sentences[0]['start'] if sentences else 0

    data = {
        "audio": {
            "path": audio_path
        },
        "sentence": sentence_text,
        "language": language,
        "sentences": sentences,
        "duration": duration
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"File '{output_file}' has been created with the JSONL format.")




In [3]:
# 使用示例
srt_file = "Subtitle/alice's group/Jan 10, 2022 0351 PM/GMT20220110-075157_Recording_gvo_1280x720_with_audio.srt" # 这里修改为SRT文件的路径
output_file = "output.jsonl" # 输出文件名
audio_path = "dataset/0.wav" # 音频文件路径
language = "Cantonese" # 语言设置为粤语

# 将SRT转换为JSONL格式
srt_to_jsonl(srt_file, output_file, audio_path, language)

File 'output.jsonl' has been created with the JSONL format.


In [9]:
import re
import json

def format_srt_to_jsonl(srt_file_path, jsonl_file_path, audio_path):
    # 用于检测中文句末是否有标点符号的正则表达式
    punctuations_re = re.compile(r"[。？！，；：”’]$")

    # 用于移除人物名称和后面的标记（如“婆婆：”）
    speaker_re = re.compile(r"\s*\w+：")

    def ensure_punctuation(sentence):
        """确保每个句子结尾有标点符号"""
        if not punctuations_re.search(sentence):
            return sentence + "。"
        return sentence

    with open(srt_file_path, 'r', encoding='utf-8') as srt_file:
        srt_content = srt_file.read()

    entries = re.split(r'\n\n+', srt_content.strip())

    with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
        for entry in entries:
            lines = entry.split('\n')
            if len(lines) < 3: continue

            start_end_timestamps = lines[1].split(' --> ')
            start_timestamp, end_timestamp = start_end_timestamps[0], start_end_timestamps[1]

            start_seconds = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], start_timestamp.replace(',', ':').split(':')))
            end_seconds = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], end_timestamp.replace(',', ':').split(':')))

            text = ' '.join(lines[2:])
            text = speaker_re.sub("", text)  # 删除人物名称和后面的标记
            text = ensure_punctuation(text)  # 确保句子结束有标点

            sentence = {
                "audio": {
                    "path": audio_path
                },
                "sentence": text,
                "language": "Cantonese",
                "sentences": [
                    {
                        "start": start_seconds,
                        "end": end_seconds,
                        "text": text
                    }
                ],
                "duration": end_seconds - start_seconds
            }

            jsonl_file.write(json.dumps(sentence, ensure_ascii=False) + '\n')





In [10]:
# 使用示例：
srt_file_path = "Subtitle/alice's group/Jan 10, 2022 0351 PM/GMT20220110-075157_Recording_gvo_1280x720_with_audio.srt" # 这里修改为SRT文件的路径
jsonl_file_path = 'output_file.jsonl'  # JSONL输出文件路径
audio_path = 'dataset/0.wav'  # 音频文件路径

# 执行函数
format_srt_to_jsonl(srt_file_path, jsonl_file_path, audio_path)

In [11]:
from datetime import datetime

# Function to parse the date and calculate days abroad
def calculate_days(dates):
    total_days_abroad = 0

    for departure, arrival in dates:
        departure_date = datetime.strptime(departure, "%y.%m.%d")
        arrival_date = datetime.strptime(arrival, "%y.%m.%d")
        # Add one to include the departure day
        total_days_abroad += (arrival_date - departure_date).days + 1

    return total_days_abroad

# Dates of departures and arrivals from the image
dates = [
    ("23.8.10", "23.8.10"),
    ("23.8.17", "23.8.17"),
    ("23.8.23", "23.8.23"),
    ("23.8.27", "23.8.31"),
    ("23.9.4", "23.9.23"),
    ("23.9.25", "23.9.29"),
    ("23.10.2", "23.11.30"),
    ("23.11.30", "23.12.22"),
    ("23.12.26", "24.1.6"),
    ("24.1.6", "24.2.3"),
    ("24.2.4", "24.2.8"),
    ("24.2.14", "24.2.25"),
]

# Calculate the total days spent abroad
total_days_abroad = calculate_days(dates)
total_days_abroad


174

In [18]:
import re
from datasets import Dataset, DatasetDict

subtitles = """
579
00:39:30,000 --> 00:39:30,900
婆婆：唔係啊，用購物袋都睇唔到啊，佢睇到你啲膠袋咧，就走過嚟噶啦，睇到啲膠袋

580
00:39:30,900 --> 00:39:35,375
婆婆：唔係啊，用購物袋都睇唔到啊，佢睇到你啲膠袋咧，就走過嚟噶啦，睇到啲膠袋
李太：呢兩晚冇行啦，琴晚睇到就冇行啦

581
00:39:35,375 --> 00:39:38,025
婆婆：唔係啊，用購物袋都睇唔到啊，佢睇到你啲膠袋咧，就走過嚟噶啦，睇到啲膠袋

582
00:39:38,625 --> 00:39:39,375
婆婆：佢搵食嘅，嚟呢度搵食啫嘛 

583
00:39:39,375 --> 00:39:41,263
婆婆：佢搵食嘅，嚟呢度搵食啫嘛 
李太：我個膠袋係白色嘅唔系紅色嘅。婆婆：白膠袋都系噶
"""

# 解析字幕
pattern = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)\n', re.DOTALL)
matches = pattern.findall(subtitles)

# 构建数据集
data = {'id': [], 'start_time': [], 'end_time': [], 'speaker': [], 'sentence': []}
for match in matches:
    id_, start, end, content = match
    # 分离说话人和对白
    lines = content.split('\n')
    for line in lines:
        if line.strip():  # 确保行不为空
            speaker, sentence = line.split('：', 1) if '：' in line else ('', line)
            data['id'].append(id_)
            data['start_time'].append(start)
            data['end_time'].append(end)
            data['speaker'].append(speaker)
            data['sentence'].append(sentence)

# 创建数据集
subtitles_dataset = Dataset.from_dict(data)

# 如果需要DatasetDict结构
subtitles_dataset_dict = DatasetDict({
    'subtitles': subtitles_dataset
})

print(subtitles_dataset_dict)

print(subtitles_dataset_dict['subtitles'][1])

DatasetDict({
    subtitles: Dataset({
        features: ['id', 'start_time', 'end_time', 'speaker', 'sentence'],
        num_rows: 5
    })
})
{'id': '580', 'start_time': '00:39:30,900', 'end_time': '00:39:35,375', 'speaker': '婆婆', 'sentence': '唔係啊，用購物袋都睇唔到啊，佢睇到你啲膠袋咧，就走過嚟噶啦，睇到啲膠袋'}


In [26]:
import re
import json
import os

def parse_srt(srt_text):
    pattern = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)\n', re.DOTALL)
    matches = pattern.findall(srt_text)

    data = []
    skip_phrases = ['[',']','【','】']  # 定义需要跳过的短语列表

    for match in matches:
        id_, start, end, content = match
        # 检查内容是否包含任何跳过短语
        if any(skip_phrase in content for skip_phrase in skip_phrases):
            continue  # 如果包含，跳过这部分

        speaker_lines = content.strip().split('\n')
        for speaker_line in speaker_lines:
            if speaker_line.strip():  # 忽略空行
                parts = re.split(r':|：|;', speaker_line, maxsplit=1)
                if len(parts) == 2:
                    speaker, sentence = parts
                else:
                    speaker = ''
                    sentence = parts[0]
                entry = {
                    'id': id_,
                    'start_time': start,
                    'end_time': end,
                    'speaker': speaker.strip(),
                    'sentence': sentence.strip()
                }
                data.append(entry)
    return data

def srt_to_json(srt_file_path):
    with open(srt_file_path, 'r', encoding='utf-8') as file:
        srt_text = file.read()
    parsed_data = parse_srt(srt_text)
    return json.dumps(parsed_data, ensure_ascii=False, indent=4)

def process_srt_file(srt_file_path):
    # 解析SRT文件并转换为JSON
    json_data = srt_to_json(srt_file_path)
    # 构造JSON文件的名称
    json_file_name = get_json_file_name(srt_file_path)
    # 保存JSON数据到文件
    with open(json_file_name, 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)
    print(f'JSON data has been written to {json_file_name}')

def get_json_file_name(srt_file_path):
    # 提取文件名（无扩展名）并添加.json扩展名
    directory, file_name = os.path.split(srt_file_path)
    base_name = os.path.splitext(file_name)[0]
    return os.path.join(directory, base_name + '.json')

# 指定的SRT文件路径
srt_file_path = "Subtitle/alice's group/Dec 7, 2021 0926 AM/GMT20211207-012644/GMT20211207-012644_Recording_gvo_1280x720_with_audio.srt"

# 处理SRT文件，解析内容并保存为JSON文件
process_srt_file(srt_file_path)


JSON data has been written to Subtitle/alice's group/Dec 7, 2021 0926 AM/GMT20211207-012644/GMT20211207-012644_Recording_gvo_1280x720_with_audio.json
