In [1]:
import re
from bisect import bisect_left

In [2]:
# Marks that prepareTib tags as opening punctuation (pu_type 0).
punct_beginning = [
    "༄", "༅", "࿓", "࿔", "༇", "༆", "༈",
]
# Marks that separate stretches of text; the plain space is one of them.
punct_separators = [
    " ", "།", "༎", "༏", "༐", "༑", "༔",
]
# Remaining text-level marks.
punct_other = [
    "༼", "༽", "༒", "༓",
]
# Every text-level mark, in the order beginning / separators / other.
text_punct = [*punct_beginning, *punct_separators, *punct_other]
# Marks that delimit syllables inside a stretch of text.
syl_punct = [
    "་", "༌", "ཿ",
]

In [3]:
# Shared token list of (text, attributes) tuples: prepareTib rebinds and fills
# it, and segment reads it and tags its attribute dicts in place.
current_text = []

In [4]:
class prepareTib:
    '''Tokenise a Tibetan string into syllables and punctuation runs.

    Constructing an instance fills the module-level ``current_text`` list with
    ``(text, attributes)`` tuples:

    * a syllable gets an empty attribute dict;
    * a punctuation run gets ``{'punct': 1, 'pu_type': n}`` where ``n`` is 0
      when the run contains a mark from ``punct_beginning``, otherwise 2 when
      it contains the mark tested in ``__init__``, otherwise 1;
    * a token that contained a line break has the break removed and gets
      ``'lbreak': 1`` added to its attributes.
    '''

    def __init__(self, tibstring):
        '''Tokenise ``tibstring`` into the global ``current_text``.

        tibstring -- the raw text to split (kept unchanged in ``self.raw``).
        '''
        global current_text
        # Rebind instead of clearing, so a list captured from an earlier run
        # keeps its content.
        if current_text != []:
            current_text = []

        self.raw = tibstring

        self.b_punct = punct_beginning
        self.t_punct = text_punct
        self.s_punct = syl_punct

        # Character classes matching one or more marks of each family.
        self.any_s_punct = '[' + ''.join(self.s_punct) + ']+'
        self.any_t_punct = '[' + ''.join(self.t_punct) + ']+'
        self.any_b_punct = '[' + ''.join(self.b_punct) + ']+'

        #####################################################
        # Prepare the text representation
        # The capturing group keeps the punctuation runs in the split result.
        for piece in re.split('(' + self.any_t_punct + ')', self.raw):
            if piece == '':
                continue  # re.split yields empty strings at the edges
            if re.search(self.any_s_punct, piece):
                # A stretch of text: one (syl, {}) tuple per syllable.
                for syl in re.split(self.any_s_punct, piece):
                    if syl != '':
                        current_text.append((syl, {}))
            elif re.search(self.any_b_punct, piece):
                current_text.append((piece, {'punct': 1, 'pu_type': 0}))
            elif '༎' in piece:
                current_text.append((piece, {'punct': 1, 'pu_type': 2}))
            elif re.search(self.any_t_punct, piece):
                current_text.append((piece, {'punct': 1, 'pu_type': 1}))
            else:
                # A stretch of text without any syllable mark.
                current_text.append((piece, {}))

        # Tag the linebreaks. '\r\n' has to be removed before '\n', otherwise
        # the '\r' of a Windows line ending is left behind in the token.
        for pos, (token, attribs) in enumerate(current_text):
            if '\n' in token:
                attribs['lbreak'] = 1
                cleaned = token.replace('\r\n', '').replace('\n', '')
                current_text[pos] = (cleaned, attribs)
        #
        ################################################################

    def get_all_punct(self):
        '''Rebuild the text from ``current_text`` and return it as one string.

        Every non-punctuation token gets a trailing tsheg. That tsheg is then
        removed again when the token is directly followed by a token starting
        with a text punctuation mark, unless the character before the tsheg is
        'ང'. Empty tokens (what is left of a token that only held a line
        break) contribute nothing.
        '''
        global current_text

        out = []
        for token, attribs in current_text:
            if 'punct' in attribs or token == '':
                out.append(token)
            else:
                out.append(token + '་')

        # Extra tshek deletion. The last element has no follower, so it is
        # never trimmed (and never indexed past the end of the list).
        for num in range(len(out) - 1):
            syl = out[num]
            follower = out[num + 1]
            if not (syl.endswith('་') and len(syl) > 1 and syl[-2] != 'ང'):
                continue
            if follower and follower[0] in self.t_punct:
                out[num] = syl[:-1]
        return ''.join(out)

In [11]:
# Tokenise a sample sentence. The string rebuilt by get_all_punct() is not
# stored or displayed here; only the constructor's side effect is used.
prepareTib("༄༅།། །།བཅོམ་ལྡན་འདས་དེའི་ཚེ་ན་ཚེ་དང་ལྡན་པ་ཀུན་དགང་། ཀུན་ཀུན༎").get_all_punct()
# Show the token list the constructor stored in the global current_text.
print(current_text)

[('༄༅།། །།', {'pu_type': 0, 'punct': 1}), ('བཅོམ', {}), ('ལྡན', {}), ('འདས', {}), ('དེའི', {}), ('ཚེ', {}), ('ན', {}), ('ཚེ', {}), ('དང', {}), ('ལྡན', {}), ('པ', {}), ('ཀུན', {}), ('དགང', {}), ('། ', {'pu_type': 1, 'punct': 1}), ('ཀུན', {}), ('ཀུན', {}), ('༎', {'pu_type': 2, 'punct': 1})]


In [5]:
# (suffix, category label) pairs; segment.__init__ keeps only the suffix strings.
merged_cases = [("འི", 'dreldra'), ("འང", 'gyendu'), ("འམ", 'jedu'), ("འོ", 'lardu')]
# Same (suffix, category label) shape; not referenced by the code visible in this notebook.
other = [("ས", 'jedra'), ("ར", 'ladon')]

# Particle lists, one per category label.
dreldra = ['གི', 'ཀྱི', 'གྱི', 'ཡི']
jedra = ['གིས', 'ཀྱིས', 'གྱིས', 'ཡིས']
ladon = ['སུ', 'ཏུ', 'དུ', 'རུ']
lhagche = ['སྟེ', 'ཏེ', 'དེ']
gyendu = ["ཀྱང", "ཡང"]
jedu = ['གམ', 'ངམ', 'དམ', 'ནམ', 'བམ', 'མམ', 'རམ', 'ལམ', 'སམ', 'ཏམ']
dagdra = ['པ', 'པོ', 'བ', 'བོ']
lardu = ['གོ', 'ངོ', 'དོ', 'ནོ', 'བོ', 'མོ', 'འོ', 'རོ', 'ལོ', 'སོ', 'ཏོ']
# NOTE(review): jedu is the only category list above that is not concatenated
# here -- confirm whether that omission is intended.
separate_particles = dreldra + jedra + ladon + lhagche + gyendu + dagdra + lardu

non_flex_particles = ['ནི', 'དང', 'ཅི', 'ཇི', 'སུ'] # TODO: find the particles that are often in composed lexicon entries

In [8]:
class segment:
    '''Tag particles in, and find syllable clusters of, the global ``current_text``.

    ``current_text`` is expected to hold ``(text, attributes)`` tuples as
    produced by ``prepareTib``. ``tag_particles`` and ``syl_clusters`` work on
    the global list as it is when they are called.
    '''

    def __init__(self):
        self.m_particles = [c[0] for c in merged_cases] # ས and ར are not yet supported
        self.s_particles = separate_particles
        self.n_particles = non_flex_particles
        # Reference to the list bound to the global at construction time.
        self.text = current_text
        self.nonamb_part = [
            'གི', 'ཀྱི', 'གྱི', 'གིས', 'ཀྱིས', 'ཡིས', 'ཏུ', 'རུ', 'སྟེ', 'ཏེ',
            'ཀྱང', 'ཡང', 'འང', 'མམ', 'འམ', 'སམ', 'ཏམ', 'ནོ', 'ཏོ', 'ཅིང',
            'ཅེས', 'ཅེའོ', 'ཅིག', 'ཞེས', 'ཞེའོ', 'ཞིག', 'ཤིང', 'ཤེའོ', 'ཤིག',
        ]
        # Not used by the methods below.
        self.amb_part = [
            'ཡི', 'གྱིས', 'སུ', 'དུ', 'ལམ', 'གམ', 'ངམ', 'དམ', 'ནམ', 'བམ',
            'རམ', 'པ', 'པོ', 'བ', 'བོ', 'གོ', 'ངོ', 'དོ', 'མོ', 'འོ',
            'རོ', 'ལོ', 'སོ', 'དེ', 'ཞིང',
        ]

    @staticmethod
    def is_punct(elt):
        '''Return True when the token tuple ``elt`` carries a 'punct' attribute.'''
        return 'punct' in elt[1]

    @staticmethod
    def is_part(elt):
        '''Return True when the token tuple ``elt`` carries a 'part' attribute.'''
        return 'part' in elt[1]

    def tag_particles(self):
        '''Add 'part' and 'pa_type' attributes to the particles of ``current_text``.

        Punctuation tokens are skipped. The first matching rule wins:
        'nonamb' for a token listed in ``self.nonamb_part``, 'nonflex' for one
        listed in ``self.n_particles``, 'merged' for one whose last two
        characters are listed in ``self.m_particles``. The attribute dicts are
        modified in place; nothing is returned.
        '''
        global current_text

        for elt in current_text:
            if segment.is_punct(elt): # leave out all punctuation
                continue
            token, attribs = elt
            if token in self.nonamb_part:
                # unfusioned flexional particles
                pa_type = 'nonamb'
            elif token in self.n_particles:
                # non-flexional particles
                pa_type = 'nonflex'
            elif token[-2:] in self.m_particles:
                # particles merged onto the end of a syllable
                pa_type = 'merged'
            else:
                continue
            attribs['part'] = 1
            attribs['pa_type'] = pa_type

    def syl_clusters(self):
        '''Return the runs of plain syllables in ``current_text``.

        A cluster is a maximal run of consecutive tokens that are neither
        punctuation nor particles. Each cluster is an inclusive
        ``(first_index, last_index)`` tuple; an empty text gives ``[]``.
        The list is also printed before it is returned.
        '''
        global current_text

        clusters = []
        start = None  # index where the open cluster began, None when none is open
        for index, elt in enumerate(current_text):
            if segment.is_punct(elt) or segment.is_part(elt):
                # A separator closes the cluster that was open, if any.
                if start is not None:
                    clusters.append((start, index - 1))
                    start = None
            elif start is None:
                start = index

        # A cluster still open at the end of the text runs up to the last token.
        if start is not None:
            clusters.append((start, len(current_text) - 1))
        print(clusters)
        return clusters
                        
                             
string = '''༄༅།། །།བཅོམ་ལྡན་འདས་ཀྱི་
དེའི་ཚེ་ན་
ཚེ་དང་ལྡན་པ་
ཀུན་དགང་།ཀུན ་ཀུན་'''
# Tokenise the sample and tag its particles.
prepareTib(string)
segment().tag_particles()
# Print every syllable cluster on its own line, syllables separated by spaces.
for first, last in segment().syl_clusters():
    for token in current_text[first:last + 1]:
        print(token[0], end = ' ')
    print()
# Finally show the tagged token list itself.
print(current_text)

[(1, 3), (6, 8), (10, 13), (15, 15), (17, 17)]
བཅོམ ལྡན འདས 
ཚེ ན ཚེ 
ལྡན པ ཀུན དགང 
ཀུན 
ཀུན 
[('༄༅།། །།', {'pu_type': 0, 'punct': 1}), ('བཅོམ', {}), ('ལྡན', {}), ('འདས', {}), ('ཀྱི', {'part': 1, 'pa_type': 'nonamb'}), ('དེའི', {'part': 1, 'lbreak': 1, 'pa_type': 'merged'}), ('ཚེ', {}), ('ན', {}), ('ཚེ', {'lbreak': 1}), ('དང', {'part': 1, 'pa_type': 'nonflex'}), ('ལྡན', {}), ('པ', {}), ('ཀུན', {'lbreak': 1}), ('དགང', {}), ('།', {'pu_type': 1, 'punct': 1}), ('ཀུན', {}), (' ', {'pu_type': 1, 'punct': 1}), ('ཀུན', {})]


In [43]:
current_text[15]

('ཀུན', {})

In [23]:
# NOTE(review): `current_layers` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. It presumably
# belongs to an earlier data model -- TODO port it to current_text or delete it.

# everything, with the particles between + signs
for num, syl in enumerate(current_layers['particles']):
    if syl == '':
        # no particle at this position: print the syllable followed by a tsheg
        syl = current_layers['syllables'][num][0]
        print(syl, end = '་')
    else:
        print(' +'+str(syl)+'་+ ', end = '')
print()

# just the blocks of syllables
for num, syl in enumerate(current_layers['particles']):
    if syl == '':
        syl = current_layers['syllables'][num][0]
        print(syl, end = '་')
    else:
        # a particle is replaced by a single space
        print(end = ' ')

NameError: name 'current_layers' is not defined

In [None]:
# Returns False, or a dict describing the match (including the length of the
# suffix that had to be removed to find the string).
def lookupStr(str):
    """Look ``str`` up in the 'particles' and 'words' lists.

    The string is first searched as it is ('particles' before 'words'). When
    that fails and ``getSuffixLength`` reports a suffix, the string without
    that suffix is searched: with 'འ' appended in 'words', then as it is in
    'words', then as it is in 'particles'.

    Returns False when nothing matches (also for an empty string, and when
    removing the suffix leaves nothing). Otherwise returns a dict with:
      'suffixLength' -- number of characters removed (0 for a direct match),
      'type'         -- 'particle' or 'word',
      'str'          -- the string that was passed in,
      'ashung'       -- True, present only for the match with 'འ' appended.
    """
    if not str:
        return False
    if search(str, 'particles'):
        return {'suffixLength': 0, 'type': 'particle', 'str': str}
    if search(str, 'words'):
        return {'suffixLength': 0, 'type': 'word', 'str': str}
    suffixLength = getSuffixLength(str)
    if suffixLength == 0:
        return False
    strNoSuffix = str[0: -suffixLength]
    # The string was nothing but the suffix: there is no stem left to look up.
    if not strNoSuffix:
        return False
    if search(strNoSuffix+'འ', 'words'):
        return {'suffixLength': suffixLength, 'type': 'word', 'ashung': True, 'str': str}
    if search(strNoSuffix, 'words'):
        return {'suffixLength': suffixLength, 'type': 'word', 'str': str}
    if search(strNoSuffix, 'particles'):
        return {'suffixLength': suffixLength, 'type': 'particle', 'str': str}
    return False

In [None]:
def search(entry, listName):
    """Return True when ``entry`` is an element of the sorted ``lists[listName]``.

    entry    -- the string to look for.
    listName -- key of the list in the global ``lists`` / ``lists_len`` dicts.

    The list must be sorted for the result to be meaningful (addList keeps it
    sorted).
    """
    global lists, lists_len
    haystack = lists[listName]
    # The search window is bounded by the length cached in lists_len.
    upper = lists_len[listName]
    # using bisect here divides computation time by a huge amount
    pos = bisect_left(haystack, entry, 0, upper) # leftmost index where entry could be inserted
    # entry is present only if that index is inside the list and holds entry
    return pos != upper and haystack[pos] == entry

In [None]:
# Length of the suffix at the end of the string, or 0 when there is none.
# The lookups rely on the content of suffixes and second_suffixes, whose
# meaning is not the usual grammatical one.
def getSuffixLength(str):
    """Return 2, 1 or 0: the length of the suffix found at the end of ``str``.

    0 is returned straight away when the last two characters are listed in
    ``second_suffixes``, the string is longer than three characters and its
    third character from the end is not '་'. Otherwise the two-character
    ending is tried against ``suffixes`` before the one-character ending.
    """
    global suffixes, second_suffixes
    ending = str[-2:]
    if ending in second_suffixes and len(str) > 3 and str[-3] != '་':
        return 0
    for width in (2, 1):
        if str[-width:] in suffixes:
            return width
    return 0

In [None]:
# Sorted lookup lists, keyed by list name, and the length of each of them.
lists = dict()
lists_len = dict()

def addList(listName, fileName):
    """Add the entries of the file ``fileName`` to ``lists[listName]``.

    The file is read as UTF-8, one entry per line. Surrounding whitespace and
    then surrounding '་' characters are removed from each line; lines that end
    up empty are skipped. The list is created when it does not exist yet, is
    stored sorted, and its length is recorded in ``lists_len[listName]``.
    """
    global lists, lists_len
    entries = lists.setdefault(listName, [])
    with open(fileName, encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip().strip('་')
            if cleaned:
                entries.append(cleaned)
    ordered = sorted(entries)
    lists[listName] = ordered
    lists_len[listName] = len(ordered)

# Load the lookup lists. 'words' is filled from two files: addList appends to
# an existing list and re-sorts it, so the second call extends the first.
addList('particles', 'src/particles.txt')
addList('words', 'src/verbs.txt')
addList('words', 'src/TDC.txt')