In [1]:
from pycantonese import characters_to_jyutping as ctj, segment
import re

In [2]:
# Input text: the Cantonese sentence passed to CantoneseTokenize in the preview cell.
src_txt = r'咁我就裝咗'

In [3]:
# One or more characters in U+4E00-U+9FFF (the CJK Unified Ideographs block)
# running up to the end of the string; is_cantonese applies it with re.match,
# which anchors it at the start as well.
can_pat = r'[\u4e00-\u9fff]+$'
# Captures a single digit. In a jyutping string such as 'gwong2dung1waa2' the
# tone digit closes each syllable, so it doubles as the syllable delimiter.
num_pat = r'(\d)'
# Replacement used with num_pat: the captured digit followed by one space.
rep_num_pat = r'\1 '

def is_cantonese(str):
    """Match ``str`` against ``can_pat``, anchored at the start of the string.

    Returns the ``re.Match`` object when the pattern matches and ``None``
    otherwise, so callers should rely on the truthiness of the result.
    """
    cjk_matcher = re.compile(can_pat)
    return cjk_matcher.match(str)

def split_jyut(str):
    """Break a run-together jyutping string into its syllables.

    A space is inserted after every digit (the tone number that ends a
    syllable) and the result is then split on whitespace.
    e.g. 'gwong2dung1waa2'  -> ['gwong2','dung1','waa2']
    """
    spaced = re.sub(num_pat, rep_num_pat, str)
    syllables = spaced.split()
    return syllables

def flat_list(nested):
    """Recursively flatten a list whose items may themselves be lists.

    Only ``list`` instances are descended into; every other item (strings,
    tuples, ...) is copied to the result unchanged, in order.
    e.g. [['a','b'],['c','d'],['e']] -> ['a','b','c','d','e']
    """
    flattened = []
    for element in nested:
        if isinstance(element, list):
            # Nested list: splice in its own flattened contents.
            flattened.extend(flat_list(element))
        else:
            flattened.append(element)
    return flattened

In [4]:
class CantoneseTokenize:
    """Tokenize text into single characters aligned with their jyutping.

    Attributes:
        tokens: raw (word, jyutping) pairs returned by
            pycantonese.characters_to_jyutping, or [] if tokenization failed.
        word_tokens: one entry per character of every word that has a
            jyutping; a word whose jyutping is None is kept whole.
        jyut_tokens: the jyutping syllables of every word that has a
            jyutping; a word whose jyutping is None is repeated unchanged.
        max_word_width: largest len() among the paired entries of
            word_tokens and jyut_tokens, or 0 when there are no tokens.
        word_tokens_txt: word_tokens centred to max_word_width, tab-joined.
        jyut_tokens_txt: jyut_tokens centred to max_word_width, tab-joined.
    """

    def __init__(self, source_text):
        # raw tokens from pycantonese.characters_to_jyutping
        self.tokens = self.tokenize(source_text)
        # words with a jyutping are exploded into single characters; words
        # without one (jyutping is None) stay as a single token
        self.word_tokens = flat_list(
            [word if jyut is None else list(word) for word, jyut in self.tokens]
        )
        # same layout as word_tokens, but holding the jyutping syllables
        self.jyut_tokens = flat_list(
            [word if jyut is None else split_jyut(jyut) for word, jyut in self.tokens]
        )
        # widest entry across both lists; default=0 keeps construction working
        # when there is nothing to measure (empty input or failed tokenization)
        self.max_word_width = max(
            (max(len(w), len(j)) for w, j in zip(self.word_tokens, self.jyut_tokens)),
            default=0,
        )
        # both lines are padded to the same width so the columns pair up
        self.word_tokens_txt = '\t'.join(w.center(self.max_word_width) for w in self.word_tokens)
        self.jyut_tokens_txt = '\t'.join(j.center(self.max_word_width) for j in self.jyut_tokens)

    def tokenize(self, source_text):
        """Tokenize source_text using pycantonese.

        Returns the list produced by characters_to_jyutping. If that call
        raises, the error is printed and an empty list is returned instead.
        """
        try:
            return ctj(source_text)
        except Exception as e:
            # the exception must be bound to a name for the message below;
            # without the binding the handler itself raised NameError
            print(f"error during tokenization {e}")
            return []


# **preview**

In [5]:
# Tokenize the sample sentence and expose its three token lists as notebook-level names.
sent_tok = CantoneseTokenize(source_text=src_txt)
tokens = sent_tok.tokens
jyut_tokens = sent_tok.jyut_tokens
word_tokens = sent_tok.word_tokens

# Show the raw (word, jyutping) pairs next to the two flattened lists derived from them.
print(f"""Ths is the tokens:\n{tokens}\n
This is the word_tokens:\n{word_tokens}\n
This is the jyut_tokens:\n{jyut_tokens}
""")
# Index 3 is hard-coded, so this line needs an input that yields at least four word tokens.
print(f"The index 3 is {word_tokens[3]}")
print(f"The length of the longest word is {sent_tok.max_word_width}")
# Each *_txt string holds the tokens centred to max_word_width and joined by tabs.
print(f"This is the word tokens txt:\n{sent_tok.word_tokens_txt}")
print(f"This is the jyut tokens txt:\n{sent_tok.jyut_tokens_txt}")
# Final output: the characters on one line with their jyutping on the line below.
print(f"This is the output text:\n{sent_tok.word_tokens_txt}\n{sent_tok.jyut_tokens_txt}")

Ths is the tokens:
[('咁', 'gam3'), ('我', 'ngo5'), ('就', 'zau6'), ('裝', 'zong1'), ('咗', 'zo2')]

This is the word_tokens:
['咁', '我', '就', '裝', '咗']

This is the jyut_tokens:
['gam3', 'ngo5', 'zau6', 'zong1', 'zo2']

The index 3 is 裝
The length of the longest word is 5
This is the word tokens txt:
  咁  	  我  	  就  	  裝  	  咗  
This is the jyut tokens txt:
 gam3	 ngo5	 zau6	zong1	 zo2 
This is the output text:
  咁  	  我  	  就  	  裝  	  咗  
 gam3	 ngo5	 zau6	zong1	 zo2 


# **Explaining how to flatten a list of sublists with for-loops**