In [1]:
import eng_to_ipa as ipa
import re
import glob
import os
import numpy as np
from deep_translator import GoogleTranslator

In [2]:
def list_txt_files(directory):
    """Collect the paths of every ``.txt`` file under ``directory``, recursively.

    Args:
        directory: Root directory to search; nested subdirectories are
            searched as well.

    Returns:
        list[str]: The matching paths, in the order ``glob.glob`` yields them.
    """
    # '**' together with recursive=True matches the root directory itself
    # and every directory nested beneath it.
    pattern = os.path.join(directory, '**', '*.txt')
    return glob.glob(pattern, recursive=True)

def convert_word_to_phoneme(word):
    """Transcribe ``word`` to IPA with ``eng_to_ipa``.

    Args:
        word: The text to transcribe.

    Returns:
        Whatever ``ipa.convert(word)`` produces when that result is truthy;
        otherwise ``word`` itself, unchanged.
    """
    # `or` supplies the fallback: an empty/falsy conversion result yields the
    # original word instead.
    return ipa.convert(word) or word

def convert_word_to_46(word, CET4_dict, CET6_dict):
    """Classify ``word`` by membership in the CET-4 / CET-6 vocabularies.

    The lookup is an exact (case-sensitive) membership test, and CET-4 takes
    precedence when the word appears in both containers.

    Args:
        word: The word to look up.
        CET4_dict: Container supporting ``in`` that holds the CET-4 words.
        CET6_dict: Container supporting ``in`` that holds the CET-6 words.

    Returns:
        str: ``'4'`` if ``word`` is in ``CET4_dict``, else ``'6'`` if it is in
        ``CET6_dict``, else ``'-'``.
    """
    # Checked in order, so the first vocabulary containing the word wins.
    for level, vocabulary in (('4', CET4_dict), ('6', CET6_dict)):
        if word in vocabulary:
            return level
    return '-'

def get_song_phoneme(input_file):
    """Transcribe every word of a lyrics file to IPA in one conversion call.

    Each line is tokenized into ``[...]`` groups, ``(...)`` groups, word runs
    and single punctuation characters. Only the word runs are kept; they are
    joined with spaces and passed to ``ipa.convert`` as one string.

    Args:
        input_file: Path of the lyrics file to read. It is decoded as UTF-8,
            matching the encoding used for the output files written elsewhere
            in this file.

    Returns:
        The value returned by ``ipa.convert`` for the space-separated words.
    """
    token_pattern = re.compile(r'\[.*?\]|\(.*?\)|\w+|[^\s\w]')
    words = []
    # Explicit encoding: without it, open() falls back to the locale default,
    # which can mis-decode or reject non-ASCII lyrics on non-UTF-8 systems.
    with open(input_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Bracketed/parenthesized groups and punctuation never start with
            # a word character, so this keeps only the word tokens.
            words.extend(
                part for part in token_pattern.findall(line)
                if re.match(r'\w+', part)
            )
    # Every word is followed by one space (including the last), so the string
    # handed to ipa.convert has the same shape as before.
    song = ''.join(word + ' ' for word in words)
    return ipa.convert(song)

def process_lyrics(input_file, output_file):
    """Write a copy of a lyrics file in which each word is replaced by its phonemes.

    Each input line is tokenized into ``[...]`` groups, ``(...)`` groups, word
    runs and single punctuation characters. Word runs are replaced by the
    result of ``convert_word_to_phoneme``; every other token is copied as is.
    Whitespace between tokens is not captured by the tokenizer and therefore
    is not written; one newline is written after each input line.

    Args:
        input_file: Path of the lyrics file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8).
    """
    token_pattern = re.compile(r'\[.*?\]|\(.*?\)|\w+|[^\s\w]')
    # Read with the same explicit encoding used for writing; relying on the
    # locale default can mis-decode non-ASCII lyrics on non-UTF-8 systems.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as yrcy:
        for line in infile:
            for part in token_pattern.findall(line):
                if re.match(r'\w+', part):
                    # Word token: write its phoneme transcription.
                    yrcy.write(convert_word_to_phoneme(part))
                else:
                    # Bracketed group or punctuation: copy through unchanged.
                    yrcy.write(part)
            yrcy.write('\n')
    print("处理完成，音标已写入输出文件。")

def process_foursix(input_file, output_file):
    """Write a copy of a lyrics file in which each word is replaced by its CET level.

    Each input line is tokenized into ``[...]`` groups, ``(...)`` groups, word
    runs and single punctuation characters. Word runs are replaced by the
    marker returned by ``convert_word_to_46`` (``'4'``, ``'6'`` or ``'-'``);
    every other token is copied as is. Whitespace between tokens is not
    captured by the tokenizer and therefore is not written; one newline is
    written after each input line.

    The vocabularies are loaded from ``CET4_dict.npy`` and ``CET6_dict.npy``
    in the current working directory.

    Args:
        input_file: Path of the lyrics file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8).
    """
    # NOTE(security): allow_pickle=True unpickles the file contents, which can
    # execute arbitrary code. Only load .npy files from a trusted source.
    # .item() unwraps the 0-d object array that np.save produces for a dict.
    CET4_dict = np.load('CET4_dict.npy', allow_pickle=True).item()
    CET6_dict = np.load('CET6_dict.npy', allow_pickle=True).item()

    token_pattern = re.compile(r'\[.*?\]|\(.*?\)|\w+|[^\s\w]')
    # Read with the same explicit encoding used for writing; relying on the
    # locale default can mis-decode non-ASCII lyrics on non-UTF-8 systems.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as yrcf:
        for line in infile:
            for part in token_pattern.findall(line):
                if re.match(r'\w+', part):
                    # Word token: write its CET-4/CET-6 marker.
                    yrcf.write(convert_word_to_46(part, CET4_dict, CET6_dict))
                else:
                    # Bracketed group or punctuation: copy through unchanged.
                    yrcf.write(part)
            yrcf.write('\n')
    print("处理完成，四六级信息已写入输出文件。")

def process_cn(input_file, output_file):
    """Translate a lyrics file to Simplified Chinese and write the result.

    The file is translated with ``GoogleTranslator.translate_file`` (source
    language auto-detected, target ``zh-CN``), every space character is
    removed from the translated text, and the result is written as UTF-8.

    Args:
        input_file: Path of the file to translate.
        output_file: Path of the file that receives the translated text.
    """
    # Translate the file the caller asked for, not a fixed path.
    translator = GoogleTranslator(source='auto', target='zh-CN')
    translated = translator.translate_file(input_file)
    # Remove all spaces from the translated string before writing it out.
    translated = translated.replace(' ', '')
    with open(output_file, 'w', encoding='utf-8') as yrccn:
        yrccn.write(translated)
    print("处理完成，中文翻译已写入输出文件。")


In [3]:
# Write the timestamps and phonemes to the output files.
# Input and output file paths.
input_file = 'yrc/3852042.yrc.txt'
phoneme_file = 'output_phonemes.txt'
four_six_file = 'output_four_six.txt'
cn_file = 'output_cn.txt'

# Processing steps. Only the Chinese-translation step runs in this cell; the
# phoneme and CET-4/6 steps below are commented out.
# phoneme_list = get_song_phoneme(input_file)
# process_lyrics(input_file, phoneme_file)
# process_foursix(input_file, four_six_file)
process_cn(input_file, cn_file)


处理完成，中文翻译已写入输出文件。
