In [35]:
# Translation table built once, so each call is a single pass over the text
# instead of one str.replace() pass per mapped character.
_PUNCT_TRANSLATION = str.maketrans({
    # CJK / typographic punctuation -> ASCII
    '\u2026': '...',  # Horizontal ellipsis
    '\u3001': ',',    # Ideographic comma
    '\u3002': '.',    # Ideographic full stop
    # Unicode space variants (including zero-width ones) -> plain space
    '\u00A0': ' ',    # No-break space
    '\u1680': ' ',    # Ogham space mark
    '\u180E': ' ',    # Mongolian vowel separator
    '\u2000': ' ',    # En quad
    '\u2001': ' ',    # Em quad
    '\u2002': ' ',    # En space
    '\u2003': ' ',    # Em space
    '\u2004': ' ',    # Three-per-em space
    '\u2005': ' ',    # Four-per-em space
    '\u2006': ' ',    # Six-per-em space
    '\u2007': ' ',    # Figure space
    '\u2008': ' ',    # Punctuation space
    '\u2009': ' ',    # Thin space
    '\u200A': ' ',    # Hair space
    '\u200B': ' ',    # Zero width space
    '\u202F': ' ',    # Narrow no-break space
    '\u205F': ' ',    # Medium mathematical space
    '\u3000': ' ',    # Ideographic space
    '\uFEFF': ' ',    # Zero width no-break space (BOM)
    # Quotation marks and apostrophe look-alikes -> ASCII quotes
    '\u201C': '"',    # Left double quotation mark
    '\u201D': '"',    # Right double quotation mark
    '\u2018': "'",    # Left single quotation mark
    '\u2019': "'",    # Right single quotation mark
    '\u201A': "'",    # Single low-9 quotation mark
    '\u201B': "'",    # Single high-reversed-9 quotation mark
    '\u2032': "'",    # Prime
    '\u2035': "'",    # Reversed prime
    '\u02BC': "'",    # Modifier letter apostrophe
    '\u02BB': "'",    # Modifier letter turned comma
    '\uFF07': "'",    # Fullwidth apostrophe
})


def corr_punc(text):
    """Normalize Unicode punctuation and spacing in ``text`` to ASCII.

    Ideographic commas/full stops, the horizontal ellipsis, typographic
    quotes and the Unicode space variants listed in the translation table are
    replaced by their ASCII counterparts; afterwards every run of ASCII
    spaces is collapsed into a single space.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Leading/trailing spaces are collapsed but not
        stripped, and whitespace other than ' ' (tabs, newlines) is left
        untouched.
    """
    # Imported locally so the function does not depend on an `import re`
    # executed in a later notebook cell.
    import re

    normalized = text.translate(_PUNCT_TRANSLATION)
    return re.sub(r' +', ' ', normalized)

In [36]:
from pypinyin import lazy_pinyin, Style

def to_pinyin(sentence):
    """Transliterate a space-delimited Chinese sentence into tone-marked pinyin.

    The sentence is split on single spaces and every chunk is passed to
    ``lazy_pinyin`` as a whole word. Converting one character at a time hides
    the neighbouring characters from pypinyin, so polyphonic characters would
    always receive their default reading; converting whole words lets the
    library use the word to choose the reading.

    Args:
        sentence: Text whose words are separated by ' ' (for example the
            output of ``' '.join(jieba.cut(...))``).

    Returns:
        A string in which the syllables of each word are concatenated
        without a separator and the words are joined by the same spaces that
        were present in ``sentence`` (consecutive spaces are preserved).
    """
    def convert_word(word):
        # Empty chunks come from leading/trailing/consecutive spaces; keep
        # them empty so the spacing of the input is reproduced exactly.
        if not word:
            return ''
        # NOTE(review): per the pypinyin docs `neutral_tone_with_five` applies
        # to the numeric tone styles; kept as in the original call - confirm
        # whether it is needed with Style.TONE.
        syllables = lazy_pinyin(word, style=Style.TONE, neutral_tone_with_five=True)
        return ''.join(syllables)

    return ' '.join(convert_word(word) for word in sentence.split(' '))

# Demo: transliterate a short sentence whose words are already separated by spaces.
chinese_sentence = "庞加莱 是 高斯 和 黎曼 之后"
pinyin_sentence = to_pinyin(chinese_sentence)

# Show the input and its transliteration, one labelled line each.
for label, shown_text in (("Original:", chinese_sentence), ("Pinyin:", pinyin_sentence)):
    print(label, shown_text)



Original: 庞加莱 是 高斯 和 黎曼 之后
Pinyin: pángjiālái shì gāosī hé límàn zhīhòu


In [38]:
import re
import os
import regex
import json
import jieba

# Video IDs whose captions were written to a JSON file; the conversion loop
# below appends one entry per processed .vtt file.
succ_ids = []
# Directory that is scanned for .vtt caption files.
folder_path = "laoshi/"

def process_caption_line(line):
    """Remove closing ``</c>`` tags from a caption line and tidy its whitespace.

    Each ``</c>`` together with the single character that follows it (any
    character except a newline) becomes one space. Every run of whitespace is
    then squeezed to a single space and the ends of the line are trimmed.

    Args:
        line: One raw line of caption text.

    Returns:
        The cleaned line.
    """
    tags_removed = re.sub(r'</c>.', ' ', line)
    squeezed = re.sub(r'\s+', ' ', tags_removed)
    return squeezed.strip()

# Convert every .vtt caption file in `folder_path` into a JSON file holding
# the segmented caption text interleaved with its pinyin transliteration.
output_dir = 'zh_captions'
# Create the output directory up front; otherwise the first write fails with
# FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort so repeated runs
# process files (and fill succ_ids) in the same order.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if not (os.path.isfile(file_path) and file_path.endswith(".vtt")):
        continue

    # The video ID is everything before the first '.' in the file name.
    video_id = os.path.basename(file_path).split('.')[0]

    # Read the caption file
    with open(file_path, 'r', encoding='utf-8') as vtt_file:
        lines = vtt_file.readlines()

    captions = []
    # Pair every line with its successor. A timing line ('-->') that is the
    # very last line of the file has no successor and is skipped instead of
    # raising IndexError.
    for timing_line, text_line in zip(lines, lines[1:]):
        if '-->' not in timing_line:
            continue
        # Only the first text line after each timing line is used.
        segmented = ' '.join(jieba.cut(text_line.strip()))
        segmented = corr_punc(segmented)
        captions.append(segmented)
        captions.append(to_pinyin(segmented))

    # Create a JSON object
    caption_json = {
        'video_id': video_id,
        'captions': captions
    }

    # Save the JSON object; ensure_ascii=False keeps the Chinese text readable.
    json_path = os.path.join(output_dir, f'{video_id}.json')
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(caption_json, json_file, ensure_ascii=False, indent=2)

    # Record the ID only after the JSON file has been written and closed.
    succ_ids.append(video_id)


In [39]:
# Number of video IDs recorded in succ_ids by the conversion cell.
len(succ_ids)

20