In [1]:
from pycantonese import characters_to_jyutping as ctj, segment
import re

In [2]:
src_txt = r'你都short short哋嘅。' #input text

In [3]:
can_pat = r'[\u4e00-\u9fff]+$' #pattern recognizing Chinese characters
num_pat = r'(\d)' #pattern that recognizing jyutping with the tone as the stop word 
rep_num_pat = r'\1 '# replacing the num_pat with the tone and a space that follows

def is_cantonese(str):
    """ check whether a text string is Cantonese """
    return re.match(can_pat, str)

def split_jyut(str):
    """ convert a Cantonese term's jyutping into a list of jyutping 
    e.g. 'gwong2dung1waa2'  -> ['gwong2','dung1','waa2']
    """
    return re.sub(num_pat, rep_num_pat,str).split()

def flat_list(nested):
    """ flatten a nested list containing sublists 
    e.g. [['a','b'],['c','d'],['e']] -> ['a','b','c','d','e']
    """
    #return [item for sublist in nested for item in sublist]
    return [item for sublist in nested for item in (sublist if isinstance(sublist, list) else [sublist])]

In [4]:
class CantoneseTokenize:
    def __init__(self, source_text):
        # raw tokens from pycantonese.characters_to_jyutping
        self.tokens = self.tokenize(source_text) 
        # derive the list of single Cantonese characters and English words & pronunciations from the raw tokens
        self.word_tokens = flat_list([i[0] if i[1]==None else list(i[0]) for i in self.tokens])
        # derive the list of jyutping or English for single Cantonese characters and English words & pronunciations from the raw tokens
        self.jyut_tokens = flat_list([i[0] if i[1]==None else split_jyut(i[1]) for i in self.tokens])
        
    def tokenize(self, source_text):
        """ Tokenize the source_text using pycantonese """
        try: 
            tokens = ctj(source_text)
            return tokens
        except:
            print(f"error during tokenization {e}")
            return []


# **preview**

In [5]:
sent_tok = CantoneseTokenize(source_text=src_txt)
tokens = sent_tok.tokens
jyut_tokens = sent_tok.jyut_tokens
word_tokens = sent_tok.word_tokens

print(f"""Ths is the tokens:\n{tokens}\n
This is the word_tokens:\n{word_tokens}\n
This is the jyut_tokens:\n{jyut_tokens}
""")
print(f"The index 3 is {word_tokens[3]}")

Ths is the tokens:
[('你', 'nei5'), ('都', 'dou1'), ('shortshort', None), ('哋', 'dei2'), ('嘅', 'ge3'), ('。', None)]

This is the word_tokens:
['你', '都', 'shortshort', '哋', '嘅', '。']

This is the jyut_tokens:
['nei5', 'dou1', 'shortshort', 'dei2', 'ge3', '。']

The index 3 is 哋


# **Test Styling For WordTokens & JyutTokens**

In [6]:
"""Return the number of characters for the longest word or jyutping"""
width = max([max(len(x),len(y)) for (x,y) in zip(word_tokens, jyut_tokens)])
width


10

In [7]:
w_txt = ''.join([i.center(width) if is_cantonese(i) else i.center(width)  for i in word_tokens])
j_txt = ''.join([i.center(width+1) if is_cantonese(word_tokens[idx-1]) else i.center(width) for idx, i in enumerate(jyut_tokens)])

o_txt = f"{w_txt}\n{j_txt}"
print(o_txt)

    你         都     shortshort    哋         嘅         。     
   nei5       dou1    shortshort   dei2       ge3         。     
