In [20]:
from pycantonese import characters_to_jyutping as ctj, segment
import re
from pathlib import Path

# **Backend functions for CantoneseTokenize**

In [21]:
# One or more characters in the range U+4E00-U+9FFF (CJK Unified Ideographs),
# running up to the end of the string.
can_pat = r'[\u4e00-\u9fff]+$'
# A single digit, captured as group 1; in jyutping the tone digit closes a syllable.
num_pat = r'(\d)'
# Replacement for num_pat: the captured digit followed by one space.
rep_num_pat = r'\1 '

def is_cantonese(str):
    """Match ``str`` against ``can_pat`` from the start of the string.

    Returns the ``re.Match`` object when the pattern matches and ``None``
    otherwise, so the result is meant to be used for its truthiness.
    Note that ``$`` in a Python regex also matches just before a trailing
    newline.
    """
    return re.compile(can_pat).match(str)

def split_jyut(str):
    """Break a run-together jyutping string into a list of syllables.

    Every match of ``num_pat`` is rewritten with ``rep_num_pat`` and the
    result is split on whitespace.
    e.g. 'gwong2dung1waa2' -> ['gwong2', 'dung1', 'waa2']
    """
    spaced = re.compile(num_pat).sub(rep_num_pat, str)
    return spaced.split()

def flat_list(nested):
    """Flatten arbitrarily nested lists into one flat list.

    Only ``list`` instances are descended into; any other element
    (strings and tuples included) is kept as a single item.
    e.g. [['a','b'],['c','d'],['e']] -> ['a','b','c','d','e']
    """
    flattened = []
    for element in nested:
        if isinstance(element, list):
            # recurse so that lists nested at any depth are unpacked
            flattened.extend(flat_list(element))
        else:
            flattened.append(element)
    return flattened


# **CantoneseTokenize Class**

In [22]:
class CantoneseTokenize:
    """Tokenize a piece of text and expose parallel word / jyutping rows.

    The text is passed to pycantonese's ``characters_to_jyutping`` (imported
    as ``ctj``). Each raw token is treated as a ``(text, jyutping)`` pair:
    when ``jyutping`` is ``None`` the text is kept whole and is reused as its
    own entry on the jyutping row; otherwise the text is split into single
    characters and the jyutping is split with ``split_jyut``.
    """

    def __init__(self, source_text):
        """Tokenize ``source_text`` and build the derived lists and text rows.

        Args:
            source_text: the text to tokenize.
        """
        # raw tokens from pycantonese.characters_to_jyutping ([] if tokenization failed)
        self.tokens = self.tokenize(source_text)
        # single characters for tokens that carry jyutping, the whole text otherwise
        self.word_tokens = flat_list(
            [text if jyut is None else list(text) for text, jyut in self.tokens]
        )
        # split jyutping for tokens that carry it, the token text itself otherwise
        self.jyut_tokens = flat_list(
            [text if jyut is None else split_jyut(jyut) for text, jyut in self.tokens]
        )
        # widest entry over the paired rows (zip stops at the shorter list);
        # default=0 keeps empty input from raising ValueError
        self.max_word_width = max(
            (max(len(w), len(j)) for w, j in zip(self.word_tokens, self.jyut_tokens)),
            default=0,
        )
        # text rows: every entry is prefixed with one space and entries are
        # joined with tabs; max_word_width is not applied to the rows
        self.word_tokens_txt = '\t'.join(f" {w}" for w in self.word_tokens)
        self.jyut_tokens_txt = '\t'.join(f" {j}" for j in self.jyut_tokens)

    def tokenize(self, source_text):
        """Tokenize the source_text using pycantonese.

        Returns:
            The tokens returned by ``ctj``, or ``[]`` if it raised; in that
            case the error is printed instead of propagated.
        """
        try:
            return ctj(source_text)
        except Exception as e:
            # bind the exception so the message can be formatted; the previous
            # bare ``except`` left ``e`` undefined and the handler itself failed
            print(f"error during tokenization {e}")
            return []


# **Read the SRT file as a list of lines**

In [23]:
# Location of the source subtitle file, kept in three parts
# (directory, base name, extension) so each can be changed on its own.
file_dir = r'g:/aud_cap'
srt_file = r'edit2a_fhtml_part1_can_sub'
file_ext = r'srt'

src_input_file = f"{file_dir}/{srt_file}.{file_ext}"

# Whole file decoded as UTF-8, then cut into one list entry per line.
srt_src_list = (
    Path(src_input_file)
    .read_text(encoding='utf-8')
    .split('\n')
)

# **Annotate each subtitle text line (every 4th line, starting from the 3rd) with jyutping**

In [24]:
def _annotate_subtitle_line(idx, line):
    """Return ``line`` unchanged, or its word row and jyutping row if it is a text line.

    A line counts as subtitle text when ``idx % 4 == 2`` (0-based index), i.e.
    the third line of every group of four.
    NOTE(review): this presumes every SRT entry spans exactly four lines
    (index, timestamp, one text line, blank); entries with multi-line text
    would shift the pattern -- confirm against the input file.
    """
    if idx % 4 != 2:
        return line
    # tokenize once and reuse the result for both rows
    tokenized = CantoneseTokenize(line)
    return f"{tokenized.word_tokens_txt}\n{tokenized.jyut_tokens_txt}"


output_list_txt = '\n'.join(
    _annotate_subtitle_line(idx, line) for idx, line in enumerate(srt_src_list)
)

In [25]:
#print(output_list_txt[:700])

In [26]:
# Sample that mixes Cantonese characters with an English word.
i_txt = '開咗player'
r = CantoneseTokenize(i_txt).jyut_tokens_txt
w = CantoneseTokenize(i_txt).word_tokens_txt
# print() separates its arguments with a space, so every line after the
# first starts with one extra space in the output.
print(
    r, '\n',
    w, '\n',
    f'Length of r = {len(r)}', '\n',
    f'Length of w = {len(w)}',
)


 hoi1	 zo2	 player 
  開	 咗	 player 
 Length of r = 18 
 Length of w = 13


In [27]:
# Destination of the jyutping-annotated subtitle file.
output_dir = r"g:"
output_name = r'jyutping_output'
output_file = f"{output_dir}/{output_name}.srt"
# Writing is left disabled; uncomment the next line to save output_list_txt to output_file.
#Path(output_file).write_text(output_list_txt,encoding='utf-8')

In [28]:
# Lengths of the jyutping row and the word row, separated by one space.
print(f"{len(r)} {len(w)}")

18 13
