In [1]:
import re

In [2]:
class pseudoWylie:
    '''Convert a string to and from "pseudo Wylie".

    The instance wraps one input string.  ``to_p_w`` rewrites a Wylie string
    (with ``[#]`` word separators) into the compact pseudo-Wylie form, and
    ``from_p_w`` expands a pseudo-Wylie string back into Wylie.

    Pseudo-Wylie conventions, as defined by the attributes below:

    * ``X``  joins the syllables of one word, a plain space separates words
    * ``v``  stands for the Wylie apostrophe ``'``
    * ``x``  stands for the Wylie ``+``
    * ``*``  beginning punctuation, ``;`` middle punctuation,
      ``-``  other punctuation, ``.`` ending punctuation

    The round trip is not lossless: ``from_p_w`` always emits the canonical
    ``new_*`` forms, whatever the original punctuation was.
    '''

    def __init__(self, string):
        '''Store ``string`` and set up the conversion tables.

        string -- the text that ``to_p_w`` / ``from_p_w`` will convert.
        '''
        self.string = string

        # characters inserted by converting unicode into wylie
        # (regex fragment; raw string so the backslashes are not treated as
        # invalid string escapes)
        self.nonwylie_separator = r'\[\#\]'
        self.space = ' '
        self.new_nonwylie = ' [#]'

        self.punct_beginning = r'([\@\u0fd3][\#\u0fd4]+[\/_]+|[\%\$\!]+)' #["༄", "༅", "࿓", "࿔", "༇", "༆", "༈"]
        self.p_b_punct = '*'
        self.new_beginning = '@#/_/'

        self.punct_separators = r'(\/_\/(?!\/)|\/\/?\_?(?!\/\/)|\;\_\;|\u0f10\_\u0f10|\|\_\||\:\_)' # [" ", "།", "༎", "༏", "༐", "༑", "༔"]
        self.p_m_punct = ';'
        self.new_middle = '/_/'

        self.punct_other = r'[\(\)\u0f12\u0f13]+' # ["༼", "༽", "༒", "༓"]
        self.p_o_punct = '-'
        self.new_other = ''

        self.w_e_punct = '//_//'
        self.p_e_punct = '.'
        self.new_end = '//_//'

        self.syl_punct = r'[\u0f7f \*]+' # ["་", "༌", "ཿ"]
        self.w_space = 'X'

        self.w_aa = "'"
        self.p_aa = 'v'

        self.w_plus = '+'
        self.p_plus = 'x'

    def from_p_w(self):
        '''Return ``self.string`` expanded from pseudo Wylie to Wylie.

        Only literal ``str.replace`` substitutions are used.  The order
        matters: word spaces are expanded to `` [#]`` before ``X`` is turned
        back into a space, otherwise the syllable spaces would get a
        ``[#]`` marker as well.
        '''
        substitutions = (
            (self.space, self.new_nonwylie),      # word boundary > ' [#]'
            (self.w_space, self.space),           # X > syllable space
            (self.p_aa, self.w_aa),               # v > '
            (self.p_b_punct, self.new_beginning), # * > @#/_/
            (self.p_m_punct, self.new_middle),    # ; > /_/
            (self.p_o_punct, self.new_other),     # - > removed
            (self.p_e_punct, self.new_end),       # . > //_//
            (self.p_plus, self.w_plus),           # x > +
        )
        text = self.string
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    def to_p_w(self):
        '''Return ``self.string`` converted from Wylie to pseudo Wylie.'''
        joiner = re.escape(self.w_space)

        text = re.sub(self.syl_punct, self.w_space, self.string) # syllable separators > X
        # 'X[#]' (or a bare '[#]') marks a word boundary > plain space
        text = re.sub(joiner + '*' + self.nonwylie_separator, self.space, text)
        text = text.replace(self.w_aa, self.p_aa) # ' > v
        text = re.sub(self.punct_beginning, self.p_b_punct, text) # beginning punctuation > *
        text = re.sub(self.punct_separators, self.p_m_punct, text) # middle punctuation > ;
        text = re.sub(self.punct_other, self.p_o_punct, text) # other punct > -
        # NOTE(review): with the default patterns, punct_separators has
        # already rewritten every '/' run by this point, so this replacement
        # appears never to fire ('//_//' ends up as ';_;').  Kept as is to
        # preserve the current output -- confirm the intended order.
        text = text.replace(self.w_e_punct, self.p_e_punct) # ending punctuation > .
        text = text.replace(self.w_plus, self.p_plus) # + > x

        # remove the joiner (X) in front of punctuation
        punct_class = ''.join(
            re.escape(mark)
            for mark in (self.p_m_punct, self.p_e_punct, self.p_o_punct, self.p_b_punct)
        )
        return re.sub(joiner + '([' + punct_class + '])', r'\1', text)

In [3]:
# Read the Wylie source file (BOM-tolerant) into a list of stripped lines.
with open('A1_wylie.txt', mode='r', buffering=-1, encoding='utf-8-sig') as f:
    content = [raw_line.strip() for raw_line in f]

In [18]:
# For each input line print: its pseudo-Wylie form, the original line,
# the round trip back to Wylie, and a blank separator line.
for line in content:
    li = pseudoWylie(line).to_p_w()
    print(li, line, pseudoWylie(li).from_p_w(), '', sep='\n')

oM; snangXsrid rnamXdag rangXbzhin lhun grubXpavi;
oM/[#]snang srid [#]rnam dag [#]rang bzhin [#]lhun [#]grub pa'i//
oM/_/ [#]snang srid [#]rnam dag [#]rang bzhin [#]lhun [#]grub pa'i/_/

bkraXshis phyogsXbcuvi zhing na bzhugsXpa yi;
bkra shis [#]phyogs bcu'i [#]zhing [#]na [#]bzhugs pa [#]yi//
bkra shis [#]phyogs bcu'i [#]zhing [#]na [#]bzhugs pa [#]yi/_/

sangsXrgyas chos dang dgeXvdun vphagsXpavi tshogs;
sangs rgyas [#]chos [#]dang [#]dge 'dun [#]'phags pa'i [#]tshogs//
sangs rgyas [#]chos [#]dang [#]dge 'dun [#]'phags pa'i [#]tshogs/_/

kun la phyag vtshal bdagXcag bkraXshis shog;
kun [#]la [#]phyag [#]'tshal [#]bdag cag [#]bkra shis [#]shog/
kun [#]la [#]phyag [#]'tshal [#]bdag cag [#]bkra shis [#]shog/_/

sgronXmevi rgyalXpo rtsal brtan donXgrub dgongs;
sgron me'i [#]rgyal po [#]rtsal [#]brtan [#]don grub [#]dgongs//
sgron me'i [#]rgyal po [#]rtsal [#]brtan [#]don grub [#]dgongs/_/

byamsXpavi rgyan dpal dgeXgrags dpal damXpa;
byams pa'i [#]rgyan [#]dpal [#]dge grags [#]dpal [#]d

In [17]:
import re
# l2 collects the segmented output; l1 (the syllable queue) is built below.
l2 = []

text = "རྗེ་བཙུན་བླ་མ་དམིགས་པ་མེད་པའི་ཐུགས་རྗེ་ཆེན་པོ་དང་ལྡན་པ་རྣམས་ལ་ཕྱག་འཚལ་ལོ། །རིག་འཛིན"
# Each lexicon entry is a word given as its list of syllables.
lexicon = [["རྗེ", "བཙུན"], ["བླ", "མ"], ["དམིགས", "པ"], ["མེད", "པ"], ["ཐུགས", "རྗེ"], ["ཆེན", "པོ"], ["དང"], ["ལྡན", "པ"], ["རྣམས"], ["།", " "], ["།", " ", "།"], ["།"], ["ལ"], ["ཕྱག"], ["འཚལ"]]

# Surround every run of punctuation/whitespace with tsek so that the run
# becomes a token of its own when the text is split on tsek.
# - The character class lists the marks directly: a '|' inside [...] is a
#   literal pipe, not an alternation.
# - The replacement is a raw string so that \g<1> is not an invalid string
#   escape.
# NOTE(review): a whole run such as "། །" becomes ONE token, so the
# multi-syllable punctuation entries in lexicon (e.g. ["།", " "]) appear
# unable to match unless the text has a tsek between the marks -- confirm.
text1 = re.sub(r"([།༎༏༐༑༔\s]+)", r"་\g<1>་", text)
print(text1)
l1 = re.split(r"་+", text1)

def process(list1, list2, num):
    """Move the first ``num`` syllables of ``list1`` to ``list2`` as one word.

    The syllables are joined with a tsek, a trailing tsek is added, and the
    resulting string is appended to ``list2``.  The consumed syllables are
    then deleted from ``list1`` in place.

    list1 -- list of remaining syllables (mutated)
    list2 -- list receiving the joined words (mutated)
    num   -- number of leading syllables to consume
    """
    # Read from the list1 argument, not from a module-level list, so the
    # function works on whatever list the caller passes.
    list2.append("་".join(list1[:num]) + "་")
    del list1[:num]

# Greedy longest-match segmentation: try the 4-, 3-, 2- and 1-syllable
# prefixes of the queue against the lexicon, longest first.
while l1:
    for size in (4, 3, 2, 1):
        if l1[:size] in lexicon:
            process(l1, l2, size)
            break
    else:
        # No prefix is in the lexicon: emit the single syllable flagged with '*'.
        l2.append("་".join(l1[:1]) + "་*")
        del l1[:1]

yo = " ".join(l2)
print(yo)

རྗེ་བཙུན་བླ་མ་དམིགས་པ་མེད་པའི་ཐུགས་རྗེ་ཆེན་པོ་དང་ལྡན་པ་རྣམས་ལ་ཕྱག་འཚལ་ལོ་། །་རིག་འཛིན
རྗེ་བཙུན་ བླ་མ་ དམིགས་པ་ མེད་* པའི་* ཐུགས་རྗེ་ ཆེན་པོ་ དང་ ལྡན་པ་ རྣམས་ ལ་ ཕྱག་ འཚལ་ ལོ་* ། །་* རིག་* འཛིན་*
