In [1]:
from pycantonese import characters_to_jyutping as ctj, segment
import re

In [2]:
src_txt = r'『Delay-no-more』 唔係 English嚟㗎?' #input text

In [3]:
# Matches a string made up solely of CJK ideographs. Besides the basic block
# (U+4E00-U+9FFF) the class also covers Extension A (U+3400-U+4DBF) and
# Extension B (U+20000-U+2A6DF), because Cantonese-specific characters such as
# U+35CE (the final particle used in the sample sentence) live outside the
# basic block and were previously rejected.
can_pat = r'[\u3400-\u4dbf\u4e00-\u9fff\U00020000-\U0002a6df]+$'
# Captures a single digit; in a jyutping string the tone digit ends each syllable.
num_pat = r'(\d)'
# Replacement for num_pat: the captured tone digit followed by one space.
rep_num_pat = r'\1 '

def is_cantonese(str):
    """Match ``str`` against the module-level ``can_pat`` pattern.

    The match is anchored at the start of the string. Returns the resulting
    ``re.Match`` object when the pattern matches and ``None`` otherwise, so
    callers are expected to use the result for its truthiness.
    """
    # The parameter shadows the built-in ``str`` inside this function only;
    # the name is kept so keyword callers keep working.
    matcher = re.compile(can_pat)
    return matcher.match(str)

def split_jyut(str):
    """Break a run-together jyutping string into its syllables.

    A space is inserted after every match of ``num_pat`` (using the
    ``rep_num_pat`` replacement) and the result is split on whitespace.

    e.g. 'gwong2dung1waa2' -> ['gwong2', 'dung1', 'waa2']
    """
    spaced = re.compile(num_pat).sub(rep_num_pat, str)
    syllables = spaced.split()
    return syllables

def flat_list(nested):
    """Flatten one level of nesting.

    Elements that are lists are spliced into the result; every other element
    (strings, tuples, ...) is kept as a single item.

    e.g. [['a','b'],['c','d'],['e']] -> ['a','b','c','d','e']
         [['a','b'],'c']             -> ['a','b','c']
    """
    flattened = []
    for element in nested:
        if isinstance(element, list):
            flattened.extend(element)
        else:
            flattened.append(element)
    return flattened

In [4]:
class CantoneseTokenize:
    """Tokenize mixed Cantonese/English text into aligned word and jyutping lists.

    Attributes set by ``__init__``:
        tokens      -- raw ``(text, jyutping)`` pairs as returned by ``tokenize``;
                       the second item is ``None`` for segments without jyutping
                       (English words, punctuation).
        word_tokens -- flat list in which every segment that has jyutping is
                       split into single characters and every other segment is
                       kept whole.
        jyut_tokens -- flat list parallel to ``word_tokens`` holding one jyutping
                       syllable per character, or the original text for
                       segments without jyutping.
    """

    def __init__(self, source_text):
        """Tokenize ``source_text`` and derive the per-character views."""
        # raw tokens from pycantonese.characters_to_jyutping
        self.tokens = self.tokenize(source_text)
        # Segments without jyutping stay whole; the others are exploded into
        # single characters before flattening.
        self.word_tokens = flat_list(
            [tok[0] if tok[1] is None else list(tok[0]) for tok in self.tokens]
        )
        # Same walk, but emitting one jyutping syllable per character.
        self.jyut_tokens = flat_list(
            [tok[0] if tok[1] is None else split_jyut(tok[1]) for tok in self.tokens]
        )

    def tokenize(self, source_text):
        """Tokenize ``source_text`` using pycantonese.

        Returns the list produced by ``characters_to_jyutping``. On failure the
        error is printed and an empty list is returned (best effort), so the
        derived token lists end up empty instead of the constructor raising.
        """
        try:
            return ctj(source_text)
        except Exception as e:
            # Bind the exception so the message can be formatted; catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate.
            print(f"error during tokenization {e}")
            return []


# **preview**

In [5]:
# Tokenize the sample sentence once, then expose the three views under
# notebook-level names for display.
sent_tok = CantoneseTokenize(src_txt)
tokens, word_tokens, jyut_tokens = (
    sent_tok.tokens,
    sent_tok.word_tokens,
    sent_tok.jyut_tokens,
)

print(f"""Ths is the tokens:\n{tokens}\n
This is the word_tokens:\n{word_tokens}\n
This is the jyut_tokens:\n{jyut_tokens}
""")
# word_tokens is indexed per character/word, so index 3 is the first Cantonese character here.
print(f"The index 3 is {word_tokens[3]}")

Ths is the tokens:
[('『', None), ('Delay-no-more', None), ('』', None), ('唔係', 'm4hai6'), ('English', None), ('嚟㗎', 'lai4gaa3'), ('?', None)]

This is the word_tokens:
['『', 'Delay-no-more', '』', '唔', '係', 'English', '嚟', '㗎', '?']

This is the jyut_tokens:
['『', 'Delay-no-more', '』', 'm4', 'hai6', 'English', 'lai4', 'gaa3', '?']

The index 3 is 唔


# **Explaining how to flatten a list of sublists with for-loops**