# Latin ↔ Lao transformation

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enabling-languages/libraries-transformation/blob/main/notebooks/ALA_LC_transformations.ipynb)

## Setup

In [1]:
#@title
#%%capture
#!pip install -q unicodedataplus
#!pip install -q regex
#!pip install -q grapheme
#!pip install -q laonlp
#!pip install -q pythainlp
#!pip install -q python-crfsuite
#!pip install -q pyicu
#!wget -q https://raw.githubusercontent.com/enabling-languages/libraries-transformation/main/el_transliteration.py -O /content/el_transliteration.py
#!wget -q https://raw.githubusercontent.com/enabling-languages/libraries-transformation/main/el_utils.py -O /content/el_utils.py

# pip install -U git+https://github.com/enabling-languages/el_internationalisation.git#egg=el_internationalisation
# pip install -U git+https://github.com/enabling-languages/el_utilities.git#egg=el_utilities

import el_utilities as elu

## Latin to Lao transformation

In [4]:
# Test:     Vīangchan : Hǭ Samut hæng Sāt, 2006.
# Prompt for a romanised string and run the Lao transliterator in its
# "reverse" direction (Latin -> Lao, per this section's heading).
latn_input_str = input("String to transform: ")
laoo_output_str = elu.el_transliterate(
    latn_input_str,
    lang="lo",
    dir="reverse",
    nf="NFM",
)
# Spacer, then the input and its transliteration on separate lines.
print("\n")
print(latn_input_str, laoo_output_str, sep="\n")



Vīangchan : Hǭ Samut Hæng Sāt, 2006.
ວຽງຈັນ : ຫໍ ສະມຸດ hæng ຊາດ, 2006.


## Lao to Latin transformation

In [5]:
# Language of the text being romanised; the laonlp tokenizer is only
# imported when that language is Lao ("lo").
lang = "lo"
if lang == "lo":
    from laonlp.tokenize import word_tokenize
import el_internationalisation as eli
# NOTE: `re` is bound to the third-party `regex` module here, not the stdlib
# `re`; a later cell relies on it for the `\p{Lao}` script property.
import regex as re

# Test:  ວຽງຈັນ : ຫໍສະມຸດແຫ່ງຊາດ, 2006.
# The untouched input is kept in `laoo_input_str_orig` so that a later cell
# can segment the same text again with a different tool.
laoo_input_str_orig = input("String to transform: ")
laoo_input_str = laoo_input_str_orig
# laoo_input_str = word_tokenize(laoo_input_str)

In [17]:
# Show the string before any word segmentation is applied.
print(laoo_input_str)
# pattern = r'(\p{Lao}+)'
# Language code that selects both the regex below and the tokenizer branch
# taken by the `replacement` callback.
LANG = "lo"
# Per-language regex; for "lo" it captures each run of one or more
# Lao-script characters.
pattern = {
    'lo': r'(\p{Lao}+)'
}
# replacement = " ".join(word_tokenize(r'\1'))

# https://stackoverflow.com/questions/12597370/python-replace-string-pattern-with-output-of-function
def replacement(match):
    """Return the matched text with spaces inserted between its words.

    Intended as the callable ``repl`` argument of ``re.sub``.

    Args:
        match: Match object for one run of text to segment.

    Returns:
        When the global ``LANG`` is ``"lo"`` (or a tag whose primary subtag
        is ``lo``, such as ``"lo-LA"``), the tokens produced by
        ``word_tokenize`` joined with single spaces. For any other language
        the matched text is returned unchanged.
    """
    matched = match.group()
    # Compare on the primary language subtag only: a later cell rebinds LANG
    # to "lo-LA", which would otherwise skip the Lao branch.
    if LANG.split("-")[0] == "lo":
        return " ".join(word_tokenize(matched))
    # Previously the function fell off the end here and returned None, which
    # is not a usable replacement string, so the matched text was not
    # preserved. Pass it through untouched instead.
    return matched

# Rewrite every run of Lao text matched by the pattern with the output of
# `replacement`; text outside those runs is left as it is.
laoo_input_str = re.sub(pattern[LANG], replacement, laoo_input_str)
print(laoo_input_str)

ວຽງຈັນ : ຫໍສະມຸດ ແຫ່ງຊາດ, 2006.
ວຽງຈັນ : ຫໍສະມຸດ ແຫ່ງຊາດ, 2006.


In [18]:
# Run the Lao transliterator in its "forward" direction on the segmented
# string, then show the segmented input followed by the result.
latn_output_str = elu.el_transliterate(
    laoo_input_str,
    lang="lo",
    dir="forward",
    nf="nfd",
)
print("\n")
print(laoo_input_str, latn_output_str, sep="\n")



ວຽງຈັນ : ຫໍສະມຸດ ແຫ່ງຊາດ, 2006.
Vīangchan : HǭSamut hǣngSāt, 2006.


In [9]:
from icu import BreakIterator, Locale, RuleBasedBreakIterator
# NOTE(review): this rebinds the global LANG, which an earlier cell sets to
# "lo" and which the `replacement` callback compares against "lo". Here it
# becomes a language-region tag used to build the ICU locale.
LANG = "lo-LA"
LOC = Locale.createCanonical(LANG)

def iterate_breaks(text, break_iterator):
    """Yield the consecutive segments of *text* found by *break_iterator*.

    Args:
        text: The string to segment.
        break_iterator: Object exposing ``setText(text)`` and
            ``nextBoundary()``, the latter returning ``-1`` once no
            boundaries remain (e.g. an ICU ``BreakIterator``).

    Yields:
        Each segment between two successive boundaries, in order.
    """
    break_iterator.setText(text)
    # ICU reports boundaries as UTF-16 code-unit offsets, whereas Python
    # indexes ``str`` by code point. The two only agree while every character
    # is in the BMP, so slice the UTF-16 encoding (2 bytes per code unit)
    # rather than the str itself. "surrogatepass" keeps lone surrogates
    # round-trippable instead of raising.
    is_str = isinstance(text, str)
    utf16 = text.encode("utf-16-le", "surrogatepass") if is_str else None
    lastpos = 0
    while True:
        next_boundary = break_iterator.nextBoundary()
        if next_boundary == -1:
            return
        if is_str:
            chunk = utf16[2 * lastpos:2 * next_boundary]
            yield chunk.decode("utf-16-le", "surrogatepass")
        else:
            # Non-str inputs are sliced directly, exactly as before.
            yield text[lastpos:next_boundary]
        lastpos = next_boundary

# Separator that `lao_replace_icu` places between the segments it joins.
# The commented-out alternative is a middle dot flanked by thin spaces
# (U+2009).
# SEP = "\u2009·\u2009"
SEP = " "
def results(l, sep="|", pymyan=False):
    """Print a token count and the tokens joined by *sep*.

    Args:
        l: Sequence of tokens. With ``pymyan=True`` each item is instead
            indexed with the key ``'syllable'`` to obtain the token text.
        sep: String placed between tokens in the printed line.
        pymyan: Select the ``'syllable'`` lookup described above.
    """
    print("Number of tokens: ", len(l))
    if pymyan:
        pieces = [entry['syllable'] for entry in l]
    else:
        pieces = l
    print("Segmentation boundaries: " + sep.join(pieces))

In [11]:
# Start again from the unsegmented input so the ICU segmentation can be
# compared with the laonlp result from the earlier cells.
laoo_input_str2 = laoo_input_str_orig

# Word-level break iterator for the locale built from LANG; it is reused by
# `lao_replace_icu` for every match.
wbi = BreakIterator.createWordInstance(LOC)
# words_icu = list(iterate_breaks(laoo_input_str2, wbi))
# results(words_icu, sep=" ")

def lao_replace_icu(match):
    """Return the matched text with ``SEP`` inserted at ICU word boundaries.

    Intended as the callable ``repl`` argument of ``re.sub``; it segments the
    match with the shared ``wbi`` break iterator via ``iterate_breaks``.
    """
    segments = iterate_breaks(match.group(), wbi)
    return SEP.join(segments)

# Rewrite every run of Lao text with its ICU-segmented form; text outside
# those runs is left as it is.
laoo_input_str2 = re.sub(pattern['lo'], lao_replace_icu, laoo_input_str2)
print(laoo_input_str2)

ວຽງຈັນ : ຫໍ ສະມຸດ ແຫ່ງຊາດ, 2006.


In [12]:
# Transliterate the ICU-segmented string ("forward" direction) and print the
# original input, the segmented input and the result on separate lines.
latn_output_str2 = elu.el_transliterate(
    laoo_input_str2,
    lang="lo",
    dir="forward",
    nf="nfd",
)
print(laoo_input_str_orig, laoo_input_str2, latn_output_str2, sep="\n")

ວຽງຈັນ : ຫໍສະມຸດແຫ່ງຊາດ, 2006.
ວຽງຈັນ : ຫໍ ສະມຸດ ແຫ່ງຊາດ, 2006.
Vīangchan : Hǭ Samut hǣngSāt, 2006.


### Trankit

```zsh
pip install trankit
```

In [13]:
# Print the languages trankit lists as supported (presumably to check
# whether Lao is among them). `Pipeline` is imported but not used in this
# cell.
from trankit import Pipeline, supported_langs
print(supported_langs)


['afrikaans', 'ancient-greek-perseus', 'ancient-greek', 'arabic', 'armenian', 'basque', 'belarusian', 'bulgarian', 'catalan', 'chinese', 'traditional-chinese', 'classical-chinese', 'croatian', 'czech-cac', 'czech-cltt', 'czech-fictree', 'czech', 'danish', 'dutch', 'dutch-lassysmall', 'english', 'english-gum', 'english-lines', 'english-partut', 'estonian', 'estonian-ewt', 'finnish-ftb', 'finnish', 'french', 'french-partut', 'french-sequoia', 'french-spoken', 'galician', 'galician-treegal', 'german', 'german-hdt', 'greek', 'hebrew', 'hindi', 'hungarian', 'indonesian', 'irish', 'italian', 'italian-partut', 'italian-postwita', 'italian-twittiro', 'italian-vit', 'japanese', 'kazakh', 'korean', 'korean-kaist', 'kurmanji', 'latin', 'latin-perseus', 'latin-proiel', 'latvian', 'lithuanian', 'lithuanian-hse', 'marathi', 'norwegian-nynorsk', 'norwegian-nynorsklia', 'norwegian-bokmaal', 'old-french', 'old-russian', 'persian', 'polish-lfg', 'polish', 'portuguese', 'portuguese-gsd', 'romanian-nonsta

---

In [14]:
from collections import OrderedDict
from icu import Collator, Locale

lang = "lo"
translit_table = elu.SUPPORTED_TRANSLITERATORS[lang]

# Collators for the working language and for the ICU root locale.
lang_collator = Collator.createInstance(Locale(lang))
root_collator = Collator.createInstance(Locale.getRoot())

# Forward mapping of the first transliteration table registered for `lang`.
word_dict = elu.TRANSLIT_DATA[translit_table[0]]['translit_dict']['forward']
# Order the entries by the collation key of their source string, in reverse
# collation order.
word_dict = sorted(
    word_dict.items(),
    key=lambda pair: lang_collator.getSortKey(pair[0]),
    reverse=True,
)
# Normalise keys to NFD and values to NFC; the sorted order is kept by the
# dict's insertion order.
word_dict = {
    elu.normalise("nfd", source): elu.normalise("nfc", target)
    for source, target in word_dict
}
word_dict

{'ໂຮມ': 'hōm',
 'ໂຮງຮຽນ': 'Hōnghīan',
 'ໂຮງພິມ': 'Hōngphim',
 'ແຮງງານ': 'hǣnngān',
 'ແຮກ': 'hǣk',
 'ເຮົາ': 'hao',
 'ຮຽບຮຽງ': 'līaplīang',
 'ຮູບພາບ': 'Hūppāp',
 'ຮູບປັ້ນ': 'hūppan',
 'ຮູບ': 'hūp',
 'ຮຸ່ງ': 'Hung',
 'ຮືອນ': 'hūʼan',
 'ຮິບໂຮມ': 'hiphōm',
 'ຮ່ຳຮຽນ': 'hamhīan',
 'ຮ່າໂນ້ຍ': 'Hānōi',
 'ຮ້ານນາຍໄຊຍ໌': 'Rānnāisai',
 'ຮັບ': 'hap',
 'ຮັນສ໌': 'Han',
 'ຮັກສາ': 'Haksā',
 'ຮັກ': 'hak',
 'ຮອບ': 'hǭp',
 'ຮອດບຸນ': 'Hǭtbun',
 'ຮ້ອຍ': 'Hǭi',
 'ຮ່ວມສະໄໜ': 'hūamsamai',
 'ຮວບຮວມໂດຍ': 'Hūaphūam',
 'ໂອກາດ': 'ʻōkāt',
 'ເອົາ': 'ʻao',
 'ເອກະສານ': 'ʻēkasān',
 'ອົບຮົມ': 'ʻophom',
 'ອົງການ': 'ʻOngkān',
 'ອຸໄທ': 'ʻUthai',
 'ອຸດົມການ': 'ʻudomkān',
 'ອຸດສາຫະກຳ': 'ʻUtsāhakam',
 'ອຶນ': 'U̕n',
 'ອີງໃສ່': 'ʻīngsai',
 'ອິນທະວົງສ໌': 'ʻInthavong',
 'ອິນສີຊຽງໃໝ່': 'ʻInsīsīangmai',
 'ອຳນາດ': 'ʻamnāt',
 'ອຳທິລອ': 'ʻAmthilǭ',
 'ອາຣຸນ': 'ʻĀrun',
 'ອາຣ': 'ʻĀr',
 'ອານາຈັກ': 'ʻānāchak',
 'ອານັນໂທ': 'ʻĀnanthō',
 'ອາຈານໃຫຍ່': 'ʻĀchānnyai',
 'ອາຈານ': 'ʹĀchān',
 'ອ່າ': 'ʻān',
 'ອັພຍຍສັພທ໌': 'ʻapphayasap',
 'ອັນ': 'ʻAn',
 

In [15]:
# Inspect the names exposed by the el_utilities module; the captured output
# below groups them into properties, classes and functions.
import pdir
pdir(elu)

[0;33mproperty:[0m
    [0;36mDEFAULT_NF[0m[1;30m, [0m[0;36mSUPPORTED_TRANSLITERATORS[0m[1;30m, [0m[0;36mTRANSLIT_DATA[0m[1;30m, [0m[0;36m__author__[0m[1;30m, [0m[0;36m__builtins__[0m[1;30m, [0m[0;36m__credits__[0m[1;30m, [0m[0;36m__version__[0m[1;30m, [0m[0;36mre[0m
[0;33mmodule attribute:[0m
    [0;36m__cached__[0m[1;30m, [0m[0;36m__file__[0m[1;30m, [0m[0;36m__loader__[0m[1;30m, [0m[0;36m__name__[0m[1;30m, [0m[0;36m__package__[0m[1;30m, [0m[0;36m__path__[0m[1;30m, [0m[0;36m__spec__[0m
[0;33mspecial attribute:[0m
    [0;36m__doc__[0m
[0;33mclass:[0m
    [0;36mCollator[0m[0;36m: [0m[1;30mCollator objects[0m
    [0;36mLocale[0m[0;36m: [0m[1;30mLocale objects[0m
    [0;36mOrderedDict[0m[0;36m: [0m[1;30mDictionary that remembers insertion order[0m
[0;33mfunction:[0m
    [0;36mel_transliterate[0m[0;36m: [0m[1;30m[0m
    [0;36mnormalise[0m[0;36m: [0m[1;30m[0m
    [0;36mprep_string[0m[0;36m: 