In [4]:
# Characters treated as delimiters between syllables (the space, the tshek
# and other Tibetan marks); prepareTib splits the raw text on them.
punctuation = [
    " ", "༄", "༅", "࿓", "࿔", "༇", "༆",
    "༈", "།", "༎", "༏", "༐", "༑", "༔",
    "་", "༌", "༼", "༽", "༒", "༓", "ཿ",
]

# Particles that are written inside the preceding syllable; segment looks
# for them as syllable endings.
merged_aa_cases = ["འི", "འིས", "འང", "འམ", "འོ"]
# Merged forms that segment does not support yet.
other_merged = ["ས", "ར"]


# Particles that form a syllable of their own, one list per family.
dreldra = ["གི", "ཀྱི", "གྱི", "ཡི"]
jedra = ["གིས", "ཀྱིས", "གྱིས", "ཡིས"]
ladon = ["སུ", "ཏུ", "དུ", "རུ"]
lhagche = ["སྟེ", "ཏེ", "དེ"]
gyendu = ["གམ", "ངམ", "དམ", "ནམ", "བམ", "མམ", "རམ", "ལམ", "སམ", "ཏམ"]
dagdra = ["པ", "པོ", "བ", "བོ"]
lardu = ["གོ", "ངོ", "དོ", "ནོ", "བོ", "མོ", "འོ", "རོ", "ལོ", "སོ", "ཏོ"]
# Flat list of every separate particle, in family order (duplicates kept).
separate_particles = [
    *dreldra,
    *jedra,
    *ladon,
    *lhagche,
    *gyendu,
    *dagdra,
    *lardu,
]

In [5]:
# Shared store of annotation layers, keyed by layer name: prepareTib writes
# 'linebreaks' and 'syllables' into it and segment adds 'particles'.
current_layers: dict = {}

In [16]:
class prepareTib:
    '''Dealing with the punctuation of Tibetan.

    Splits a raw string into syllables, each paired with the punctuation
    characters that follow it, and records the result in the module-level
    ``current_layers`` dictionary under the keys ``'syllables'`` and
    ``'linebreaks'``.

    NOTE: every instance shares that single dictionary, so building a new
    instance overwrites the layers written by the previous one.
    '''
    # The class body runs when the class is defined, so this rebinds the
    # module-level name to a fresh, empty dictionary.
    global current_layers
    current_layers = {} # initiate the layers' dictionary for the current file

    def __init__(self, tibstring):
        '''Parse ``tibstring`` and store its layers in ``current_layers``.

        tibstring -- raw text; the characters listed in ``punctuation`` act
                     as delimiters, everything else is syllable content.
        '''
        self.raw = tibstring
        self.punct = punctuation
        self.layers = current_layers

        # TODO: do not forget to (fully) implement the linebreaks layer.
        # Possible regex-based alternative, currently unused:
        #   re.split('([<punctuation characters>]+)', text)[1:-1]
        syllables_layer, linebreaks_layer = self._split_syllables(self.raw, self.punct)
        self.layers['linebreaks'] = linebreaks_layer
        self.layers['syllables'] = syllables_layer

    @staticmethod
    def _split_syllables(raw, punct):
        '''Split ``raw`` into syllables and a parallel list of linebreak marks.

        raw   -- the text to split.
        punct -- iterable of the characters that count as punctuation.

        Returns ``(syllables, linebreaks)``. ``syllables`` is a list of
        ``(syllable, [punctuation chars])`` tuples; ``linebreaks`` has one
        entry per syllable, ``'\\n'`` when the syllable text contained a
        linebreak (which is then removed from it) and ``''`` otherwise.
        Punctuation found before the first syllable, or in a text without
        any syllable, is attached to an empty syllable ``''``.
        '''
        delimiters = frozenset(punct)  # O(1) membership tests in the loop
        syllables = []
        linebreaks = []

        def emit(text, marks):
            # Linebreaks are not punctuation, so they end up inside the
            # syllable text; strip both '\r\n' and bare '\n' and flag it.
            has_break = '\n' in text
            if has_break:
                text = text.replace('\r\n', '').replace('\n', '')
            syllables.append((text, marks))
            linebreaks.append('\n' if has_break else '')

        syl = ''
        puncts = []
        previous_was_punct = False
        for char in raw:
            is_punct = char in delimiters
            if is_punct:
                puncts.append(char)
            elif previous_was_punct:
                # First character after a run of punctuation: the pending
                # syllable is complete, store it and start a new one.
                emit(syl, puncts)
                puncts = []
                syl = char
            else:
                syl += char
            previous_was_punct = is_punct

        # Store the last syllable and its punctuation; also keep punctuation
        # that was never followed by a syllable instead of dropping it.
        if syl != '' or puncts:
            emit(syl, puncts)

        return syllables, linebreaks

    def syl_tuples(self):
        '''Return ``(syllable, punctuation)`` pairs, punctuation joined into one string.'''
        return [(syl, ''.join(marks)) for syl, marks in self.layers['syllables']]

    def all_punct(self):
        '''Return each syllable followed by all of its original punctuation.'''
        return [syl + marks for syl, marks in self.syl_tuples()]

    def syls_only(self):
        '''Return the bare syllables, without any punctuation.'''
        return [entry[0] for entry in self.layers['syllables']]

    def tsheks_only(self):
        '''Return each syllable followed by a single tshek, whatever its punctuation was.'''
        return [syl + '་' for syl, _ in self.syl_tuples()]

    def no_tshek(self):
        '''Return each syllable with its punctuation, tsheks removed.'''
        return [syl + marks.replace('་', '') for syl, marks in self.syl_tuples()]

In [18]:
class segment:
    '''segment a given string of Tibetan

    Instantiating the class reads the ``'syllables'`` layer from the shared
    ``current_layers`` dictionary and adds a parallel ``'particles'`` layer
    to it, with one entry per syllable:

    * the syllable itself when it is one of the separate particles,
    * ``(stem, particle)`` when it ends with a merged particle,
    * ``''`` when no particle was found.
    '''
    def __init__(self):
        self.m_particles = merged_aa_cases # ས and ར are not yet supported
        self.s_particles = separate_particles
        self.layers = current_layers

        # pre-segmentation : finding all particles in the text
        separate = set(self.s_particles)  # built once, O(1) lookups below
        self.layers['particles'] = [
            self._find_particle(entry[0], self.m_particles, separate)
            for entry in self.layers['syllables']
        ]

    @staticmethod
    def _find_particle(syllable, m_particles, s_particles):
        '''Classify one syllable against the two particle collections.

        syllable    -- syllable text without punctuation.
        m_particles -- sequence of merged particles, matched as endings.
        s_particles -- container of separate particles, matched as a whole.

        Returns the syllable, a ``(stem, particle)`` tuple or ``''`` as
        described in the class docstring.
        '''
        # a full-syllable particle takes precedence over a merged ending
        if syllable in s_particles:
            return syllable

        # find whether the syllable ends with a merged particle; when several
        # entries match, the one listed last wins
        for aa in reversed(m_particles):
            if aa and syllable.endswith(aa):
                # cut off the ending only: str.replace would also remove
                # earlier occurrences of the same letters in the syllable
                return (syllable[:-len(aa)], aa)
        return ''

In [17]:
# Sample text used to try out the classes: it starts with punctuation,
# spans two lines and contains a space before one of the tsheks.
string = '''༄༅།། །།བཅོམ་ལྡན་འདས་དེའི་ཚེ་ན་ཚེ་དང་ལྡན་པ་
ཀུན་དགང་།ཀུན ་ཀུན་'''
# Instantiating prepareTib fills current_layers as a side effect; the list
# returned by syl_tuples() is not used here.
prepareTib(string).syl_tuples()
print(current_layers)

{'linebreaks': ['', '', '', '', '', '', '', '', '', '', '', '\n', '', '', ''], 'syllables': [('', ['༄', '༅', '།', '།', ' ', '།', '།']), ('བཅོམ', ['་']), ('ལྡན', ['་']), ('འདས', ['་']), ('དེའི', ['་']), ('ཚེ', ['་']), ('ན', ['་']), ('ཚེ', ['་']), ('དང', ['་']), ('ལྡན', ['་']), ('པ', ['་']), ('ཀུན', ['་']), ('དགང', ['་', '།']), ('ཀུན', [' ', '་']), ('ཀུན', ['་'])]}


In [19]:
segment()
print(current_layers)

{'linebreaks': ['', '', '', '', '', '', '', '', '', '', '', '\n', '', '', ''], 'particles': ['', '', '', '', ('དེ', 'འི'), '', '', '', '', '', 'པ', '', '', '', ''], 'syllables': [('', ['༄', '༅', '།', '།', ' ', '།', '།']), ('བཅོམ', ['་']), ('ལྡན', ['་']), ('འདས', ['་']), ('དེའི', ['་']), ('ཚེ', ['་']), ('ན', ['་']), ('ཚེ', ['་']), ('དང', ['་']), ('ལྡན', ['་']), ('པ', ['་']), ('ཀུན', ['་']), ('དགང', ['་', '།']), ('ཀུན', [' ', '་']), ('ཀུན', ['་'])]}


In [24]:

# Print the text syllable by syllable: a syllable without a detected particle
# is printed followed by a tshek, a detected particle is set off by '+' signs.
for position, particle in enumerate(current_layers['particles']):
    if particle != '':
        print(' +' + str(particle) + '་+ ', end='')
        continue
    plain = current_layers['syllables'][position][0]
    print(plain, end='་')

་བཅོམ་ལྡན་འདས་ +('དེ', 'འི')་+ ཚེ་ན་ཚེ་དང་ལྡན་ +པ་+ ཀུན་དགང་ཀུན་ཀུན་

In [None]:
# An empty string in a list stands for syllables that have no suffix

# "merging-particles" : [list-of-preceding-suffixes]
{
"འི" : ["འ", ""],
"འིས" : ["འ", ""],
"ས" : ["འ", ""],
"ར" : ["འ", ""],
"འང" : ["འ", ""],
"འམ" : ["འ", ""],
"འོ" : ["འ", ""]
}

# "separate-particles" : [list-of-preceding-suffixes]
{
"གི" : ["ག", "ང"],
"ཀྱི" : ["ད", "བ", "ས"],
"གྱི" : ["ན", "མ", "ར", "ལ"],
"ཡི" : ["འ", ""],
"གིས" : ["ག", "ང"],
"ཀྱིས" : ["ད", "བ", "ས"],
"གྱིས" : ["ན", "མ", "ར", "ལ"],
"ཡིས" : ["འ", ""],
"སུ" : ["ས"],
"ཏུ" : ["ག", "བ", "ད་དྲག"],
"དུ" : ["ང", "ད", "ན", "མ", "ར", "ལ"],
"རུ" : ["འ", ""],
"སྟེ" : ["ག", "ང", "བ", "མ", "འ", ""],
"ཏེ" : ["ན", "ར", "ལ", "ས"],
"དེ" : ["ད"],
"ཀྱང" : ["ག", "ད", "བ", "ས"],
"ཡང" : ["ང", "ན", "མ", "འ", "ར", "ལ", ""],
"འང" : ["འ", ""],
"གམ" : ["ག"],
"ངམ" : ["ང"],
"དམ" : ["ད"],
"ནམ" : ["ན"],
"བམ" : ["བ"],
"མམ" : ["མ"],
"འམ" : ["འ"],
"རམ" : ["ར"],
"ལམ" : ["ལ"],
"སམ" : ["ས"],
"ཏམ" : ["ད་དྲག"],
"པ" : ["ག", "ད", "བ", "ས", "ན", "མ"],
"པོ" : ["ག", "ད", "བ", "ས", "ན", "མ"],
"བ" : ["ང", "འ", "ར", "ལ", ""],
"བོ" : ["ང", "འ", "ར", "ལ", ""],
"གོ" : ["ག"],
"ངོ" : ["ང"],
"དོ" : ["ད"],
"ནོ" : ["ན"],
"བོ" : ["བ"],
"མོ" : ["མ"],
"འོ" : ["འ"],
"རོ" : ["ར"],
"ལོ" : ["ལ"],
"སོ" : ["ས"],
"ཏོ" : ["ད་དྲག"],
"ཅིང" : ["ག", "ད", "བ", "ད་དྲག"],
"ཅེས" : ["ག", "ད", "བ", "ད་དྲག"],
"ཅེའོ" : ["ག", "ད", "བ", "ད་དྲག"],
"ཅེ་ན" : ["ག", "ད", "བ", "ད་དྲག"],
"ཅིག" : ["ག", "ད", "བ", "ད་དྲག"],
"ཞིང" : ["ང", "ན", "མ", "འ", "ར", "ལ", ""],
"ཞེས" : ["ང", "ན", "མ", "འ", "ར", "ལ", "ས", ""],
"ཞེའོ" : ["ང", "ན", "མ", "འ", "ར", "ལ", ""],
"ཞེ་ན" : ["ང", "ན", "མ", "འ", "ར", "ལ", ""],
"ཞིག" : ["ང", "ན", "མ", "འ", "ར", "ལ", ""],
"ཤིང" : ["ས"],
"ཤེའོ" : ["ས"],
"ཤེ་ན" : ["ས"],
"ཤིག" : ["ས"]
}

//"གི" "ཀྱི" "གྱི" "ཡི" "གིས" "ཀྱིས" "གྱིས" "ཡིས" "སུ" "ཏུ" "དུ" "རུ" "སྟེ" "ཏེ" "དེ" "གམ" "ངམ" "དམ" "ནམ" "བམ" "མམ" "རམ" "ལམ" "སམ" "ཏམ" "པ" "པོ" "བ" "བོ" "གོ" "ངོ" "དོ" "ནོ" "བོ" "མོ" "འོ" "རོ" "ལོ" "སོ" "ཏོ" 
{"གི" : true, "ཀྱི" : true, "གྱི" : true, "ཡི" : true, "གིས" : true, "ཀྱིས" : true, "གྱིས" : true, "ཡིས" : true, "སུ" : true, "ཏུ" : true, "དུ" : true, "རུ" : true, "སྟེ" : true, "ཏེ" : true, "དེ" : true, "གམ" : true, "ངམ" : true, "དམ" : true, "ནམ" : true, "བམ" : true, "མམ" : true, "རམ" : true, "ལམ" : true, "སམ" : true, "ཏམ" : true, "པ" : true, "པོ" : true, "བ" : true, "བོ" : true, "གོ" : true, "ངོ" : true, "དོ" : true, "ནོ" : true, "བོ" : true, "མོ" : true, "འོ" : true, "རོ" : true, "ལོ" : true, "སོ" : true, "ཏོ" : true}

//"ཅི" "ཇི" "ཡིན" "མིན" "ཡོད" "མེད" "དང" "ལ" "ལས" "ན" "ནི" 
//"ཀྱང" "ཡང" "ཅིང" "ཅེས" "ཅེའོ" "ཅིག" "ཞིང" "ཞེས" "ཞེའོ" "ཞིག" "ཤིང" "ཤེའོ" "ཤིག" "གིན" "གྱིན"
{"ཅི" : true, "ཇི" : true, "ཡིན" : true, "མིན" : true, "ཡོད" : true, "མེད" : true, "དང" : true, "ལ" : true, "ལས" : true, "ན" : true, "ནི" : true, "ཀྱང" : true, "ཡང" : true, "ཅིང" : true, "ཅེས" : true, "ཅེའོ" : true, "ཅིག" : true, "ཞིང" : true, "ཞེས" : true, "ཞེའོ" : true, "ཞིག" : true, "ཤིང" : true, "ཤེའོ" : true, "ཤིག" : true, "གིན" : true, "གྱིན" : true, 

//"ཤེ་ན" "ཞེ་ན" "ཅེ་ན" 
{"ཤེ་ན" : true, "ཞེ་ན" : true, "ཅེ་ན" : true, 

ལྡན་