In [1]:
from pycantonese import characters_to_jyutping as ctj, segment
import re

In [2]:
src_txt = r'廣東話 e 唔 easy呀?'  # sample input: Chinese characters mixed with Latin words
# One or more characters from U+4E00-U+9FFF running to the end of the string;
# is_cantonese applies it with re.match, so the whole string must be in range.
can_pat = r'[\u4e00-\u9fff]+$'
# A single digit: the tone number that closes every jyutping syllable.
num_pat = r'\d'
# Replacement template for num_pat: the matched digit followed by a space.
# \g<0> is the whole match; the previous r'\1 ' referenced a capture group that
# num_pat does not define, which made re.sub raise "invalid group reference".
rep_num_pat = r'\g<0> '

def is_cantonese(str):
    """Test ``str`` against ``can_pat``, anchored at its first character.

    Returns the ``re.Match`` object when the pattern matches and ``None``
    otherwise, so callers are expected to use the result for its truthiness.
    """
    matcher = re.compile(can_pat)
    return matcher.match(str)

def split_jyut(str):
    """Break a run-together jyutping string into one entry per syllable.

    Every match of ``num_pat`` is rewritten using ``rep_num_pat`` and the
    result is split on whitespace.
    Intended result, e.g. 'gwong2dung1waa2'  -> ['gwong2','dung1','waa2']

    NOTE(review): this relies on ``rep_num_pat`` being a valid replacement
    template for ``num_pat`` (any group it references must exist in the
    pattern); otherwise ``re.sub`` raises ``re.error``.
    """
    spaced = re.sub(num_pat, rep_num_pat, str)
    return spaced.split()

def flat_list(nested):
    """Flatten ``nested`` by exactly one level.

    Elements that are lists are expanded in place; every other element
    (strings, tuples, None, ...) is kept as a single item.
    e.g. [['a','b'],'c',['d','e']] -> ['a','b','c','d','e']
    """
    flattened = []
    for element in nested:
        if isinstance(element, list):
            flattened.extend(element)
        else:
            flattened.append(element)
    return flattened

In [3]:
class CantoneseTokenize:
    """Tokenize a text and expose three views of it.

    Attributes set by ``__init__``:
        tokens: the result of ``tokenize(source_text)``, i.e. whatever
            ``ctj`` returns for the whole text, or ``[]`` if it raised.
        word_tokens: the items produced by ``segment(source_text)``, where
            every item accepted by ``is_cantonese`` is split into single
            characters.
        jyut_tokens: one entry per item of ``word_tokens``: the jyutping
            looked up for that single character when ``is_cantonese`` accepts
            it, otherwise the item itself.
    """

    def __init__(self, source_text):
        self.tokens = self.tokenize(source_text)
        self.word_tokens = flat_list([list(i) if is_cantonese(i) else i for i in segment(source_text)])
        # NOTE(review): each character is looked up on its own, so the reading
        # can differ from the word-level one in self.tokens (the notebook's
        # recorded output shows 'waa6' here against 'waa2' inside
        # 'gwong2dung1waa2'). Confirm that this is the intended behaviour.
        self.jyut_tokens = [ctj(i)[0][1] if is_cantonese(i) else i for i in self.word_tokens]

    def tokenize(self, source_text):
        """Tokenize the source_text using pycantonese.

        Returns the result of ``ctj(source_text)``. If that call raises, the
        error is printed and an empty list is returned instead.
        """
        try:
            return ctj(source_text)
        except Exception as e:
            # The exception has to be bound to a name: the message below
            # formats it, and an unbound ``e`` would raise NameError from
            # inside the handler instead of returning [].
            print(f"error during tokenization {e}")
            return []
            

# **preview**

In [4]:
# Tokenize the sample sentence once and keep each view under its own name;
# word_tokens is used again by a later cell.
sent_tok = CantoneseTokenize(source_text=src_txt)
tokens, jyut_tokens, word_tokens = (
    sent_tok.tokens,
    sent_tok.jyut_tokens,
    sent_tok.word_tokens,
)

# Show the three views, then spot-check a single position of word_tokens.
print(
    f"""Ths is the tokens:\n{tokens}
This is the jyut_tokens:\n{jyut_tokens}
This is the word_tokens:\n{word_tokens}"""
)
print(f"The index 3 is {word_tokens[3]}")

Ths is the tokens:
[('廣東話', 'gwong2dung1waa2'), ('e', 'e1'), ('唔', 'm4'), ('easy', None), ('呀', 'aa4'), ('?', None)]
This is the jyut_tokens:
['gwong2', 'dung1', 'waa6', 'e', 'm4', 'easy', 'aa4', '?']
This is the word_tokens:
['廣', '東', '話', 'e', '唔', 'easy', '呀', '?']
The index 3 is e


# **Explaining how to flatten a list of sublists with a for-loop**

In [5]:
# Sample input: sublists of characters mixed with bare strings.
tword_tokens = [
    ['廣', '東', '話'],
    'e',
    ['唔'],
    'easy',
    ['呀'],
    '?',
]

flattened_word_tokens = []
for item in tword_tokens:
    if not isinstance(item, list):
        # a bare string goes in as one element
        flattened_word_tokens.append(item)
    else:
        # a sublist contributes each of its members
        flattened_word_tokens.extend(item)
print(flattened_word_tokens)


['廣', '東', '話', 'e', '唔', 'easy', '呀', '?']


# **Explaining how to flatten a list of sublists with a list comprehension**

In [6]:
# Sample input: sublists of characters mixed with bare strings.
tword_tokens = [['廣', '東', '話'], 'e', ['唔'], 'easy', ['呀'], '?']

# Outer loop walks the entries; a bare string is wrapped in a one-element
# list so the inner loop can treat every entry the same way.
flattened_word_tokens = [
    member
    for entry in tword_tokens
    for member in (entry if isinstance(entry, list) else [entry])
]
print(flattened_word_tokens)


['廣', '東', '話', 'e', '唔', 'easy', '呀', '?']


In [7]:
# For every token accepted by is_cantonese, show the text form of the full
# ctj result; all other tokens pass through unchanged.
r = [
    str(ctj(tok)) if is_cantonese(tok) else tok
    for tok in word_tokens
]
r

["[('廣', 'gwong2')]",
 "[('東', 'dung1')]",
 "[('話', 'waa6')]",
 'e',
 "[('唔', 'm4')]",
 'easy',
 "[('呀', 'aa4')]",
 '?']

In [8]:
# Segment src_txt, break every segment accepted by is_cantonese into single
# characters, then flatten the result into one list.
w_list = flat_list(
    [list(seg) if is_cantonese(seg) else seg for seg in segment(src_txt)]
)
w_list

['廣', '東', '話', 'e', '唔', 'easy', '呀', '?']

In [9]:
# Look every token up on its own and keep element [0][1] of each result
# (the jyutping of the first pair). The recorded output shows None for
# 'easy' and '?'.
j_list = [pairs[0][1] for pairs in map(ctj, w_list)]
j_list

['gwong2', 'dung1', 'waa6', 'e1', 'm4', None, 'aa4', None]