In [1]:
from datasets import load_dataset
import itertools

# Load Dataset

## Wikipedia Dataset

In [2]:
ds = load_dataset("chuuhtetnaing/myanmar-wikipedia-dataset")

In [3]:
wiki_words = ds['train']['unique_mm_words']
wiki_phrases = ds['train']['unique_phrases']

In [4]:
my_wiki_word_dictionary = list(itertools.chain.from_iterable(wiki_words))
my_wiki_phrase_dictionary = list(itertools.chain.from_iterable(wiki_phrases))
my_wiki_dictionary = my_wiki_word_dictionary + my_wiki_phrase_dictionary

## Facebook flores dataset

In [5]:
flores_ds = load_dataset("facebook/flores", "mya_Mymr", trust_remote_code=True)

In [6]:
flores_ds

DatasetDict({
    dev: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence'],
        num_rows: 997
    })
    devtest: Dataset({
        features: ['id', 'URL', 'domain', 'topic', 'has_image', 'has_hyperlink', 'sentence'],
        num_rows: 1012
    })
})

In [7]:
dev_sentences = flores_ds['dev']['sentence']
devtest_sentences = flores_ds['devtest']['sentence']

sentences = dev_sentences + devtest_sentences

In [8]:
len(sentences)

2009

In [9]:
import re


# Reference: https://github.com/ye-kyaw-thu/sylbreak/blob/master/python/sylbreak.py

def create_break_pattern():
    """Creates and returns the regular expression pattern for Myanmar syllable breaking."""
    my_consonant = r"က-အ"
    # en_char = r"a-zA-Z0-9"

    other_char = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။(){}"
    subscript_symbol = r'္'
    a_that = r'်'

    # Regular expression pattern for Myanmar syllable breaking
    return re.compile(
        r"((?<!" + subscript_symbol + r")[" + my_consonant + r"]"
                                                             r"(?!["
        + a_that + subscript_symbol + r"])"
        + r"|[" + other_char + r"])"
    )


def break_syllables(line, break_pattern, separator):
    """Applie
    s syllable breaking rules to a line."""
    line = re.sub(r'\s+', ' ', line.strip())  # Normalize space
    segmented_line = break_pattern.sub(separator + r"\1", line)

    # Remove the leading delimiter if it exists
    if segmented_line.startswith(separator):
        segmented_line = segmented_line[len(separator):]

    # Replace delimiter+space+delimiter with a single space
    double_delimiter = separator + " " + separator
    segmented_line = segmented_line.replace(double_delimiter, " ")

    return segmented_line


break_pattern = create_break_pattern()


def split_words(text):
    seperator = "|X|"

    words = break_syllables(text, break_pattern, seperator)
    words = words.split(seperator)

    return words


def extract_myanmar_chars(text):
    result = ""

    for char in text:
        code_point = ord(char)

        # Check if the character is Myanmar Unicode or a common punctuation/space
        if (0x1000 <= code_point <= 0x109F or  # Myanmar
                0xAA60 <= code_point <= 0xAA7F or  # Myanmar Extended-A
                0xA9E0 <= code_point <= 0xA9FF or  # Myanmar Extended-B
                char in "()[]{} "):  # Keep common punctuation and spaces
            result += char

    return result

In [10]:
my_flores_word_dictionary = []

for sentence in sentences:
    words = split_words(extract_myanmar_chars(sentence))
    my_flores_word_dictionary.extend(words)

In [11]:
my_flores_phrase_dictionary = []
for sentence in sentences:
    phrases = sentence.split()
    my_flores_phrase_dictionary.extend(phrases)

In [12]:
my_flores_phrase_dictionary

['တနင်္လာနေ့တွင်',
 'စတန်းဖို့ဒ်တက္ကသိုလ်',
 'ဆေးကျောင်းမှ',
 'သိပ္ပံပညာရှင်များသည်',
 'ဆဲလ်များကို',
 'အမျိုးအစားအလိုက်',
 'စီစဉ်နိုင်သော',
 'ရောဂါခွဲခြားစစ်ဆေးမှု',
 'ကိရိယာအသစ်',
 'တီထွင်မှုအကြောင်းကို',
 'ကြေညာခဲ့သည်-',
 '၎င်းသည်',
 'စံနှုန်းမီ',
 'မင်စက်ကလေးများဖြင့်',
 'ပုံဖော်သည့်',
 'ပရင်တာများကို',
 'သုံးကာ',
 'ထုတ်လုပ်နိုင်သော',
 'အလွန်သေးငယ်သည့်',
 'တစ်ခုလျှင်',
 'U.S.',
 'ဆင့်တစ်ပြားသာသာရှိသော',
 'ပရင့်လုပ်၍ရသော',
 'အပြားလေးဖြစ်ပါသည်။',
 '၎င်းသည်',
 'ချမ်းသာသော',
 'နိုင်ငံများတွင်',
 'ရင်သားကင်ဆာ',
 'ကဲ့သို့',
 'ရောဂါဖြစ်၍',
 'ရှင်သန်နှုန်းအောက်',
 'တစ်ဝက်သာရှိသည့်',
 'ဝင်ငွေနည်းသော',
 'နိုင်ငံများရှိ',
 'လူနာများတွင်',
 'ကင်ဆာ၊',
 'အဆုတ်ရောဂါ၊',
 'အိတ်အိုင်ဗွီနှင့်',
 'ငှက်ဖျားရောဂါတို့ကို',
 'ဆောလျင်စွာ',
 'သိရှိနိုင်စေမည်ဟု',
 'ထိပ်တန်းသုတေသီများက',
 'ဆိုပါသည်။',
 'JAS',
 '၃၉C',
 'Gripen',
 'သည်',
 'ဒေသစံတော်ချိန်',
 'မနက်',
 '၉:၃၀',
 'နာရီ',
 '(၀၂၃၀',
 'UTC)',
 'တွင်',
 'ပြေးလမ်းပေါ်တွင်',
 'တိုက်မိကာ',
 'ပေါက်ကွဲသွားခဲ့ပြီး',
 'လေဆိပ်မှ',
 'ပျံသန်းမှုများ',
 'ပိတ်သွားစ

# Get Single Myanmar Char

In [13]:
def list_myanmar_characters():
    # Myanmar Unicode block ranges from U+1000 to U+109F
    myanmar_chars = []

    # Main Myanmar block (U+1000 to U+109F)
    for code_point in range(0x1000, 0x109F + 1):
        char = chr(code_point)
        myanmar_chars.append((hex(code_point), char))

    # Myanmar Extended-A (U+AA60 to U+AA7F)
    for code_point in range(0xAA60, 0xAA7F + 1):
        char = chr(code_point)
        myanmar_chars.append((hex(code_point), char))

    # Myanmar Extended-B (U+A9E0 to U+A9FF)
    for code_point in range(0xA9E0, 0xA9FF + 1):
        char = chr(code_point)
        myanmar_chars.append((hex(code_point), char))

    return myanmar_chars

def display_myanmar_characters():
    characters = list_myanmar_characters()

    print("Myanmar Unicode Characters:")
    print("=========================")
    print("Code Point | Character")
    print("------------------------")

    for code_point, char in characters:
        print(f"{code_point:10} | {char}")

display_myanmar_characters()

Myanmar Unicode Characters:
Code Point | Character
------------------------
0x1000     | က
0x1001     | ခ
0x1002     | ဂ
0x1003     | ဃ
0x1004     | င
0x1005     | စ
0x1006     | ဆ
0x1007     | ဇ
0x1008     | ဈ
0x1009     | ဉ
0x100a     | ည
0x100b     | ဋ
0x100c     | ဌ
0x100d     | ဍ
0x100e     | ဎ
0x100f     | ဏ
0x1010     | တ
0x1011     | ထ
0x1012     | ဒ
0x1013     | ဓ
0x1014     | န
0x1015     | ပ
0x1016     | ဖ
0x1017     | ဗ
0x1018     | ဘ
0x1019     | မ
0x101a     | ယ
0x101b     | ရ
0x101c     | လ
0x101d     | ဝ
0x101e     | သ
0x101f     | ဟ
0x1020     | ဠ
0x1021     | အ
0x1022     | ဢ
0x1023     | ဣ
0x1024     | ဤ
0x1025     | ဥ
0x1026     | ဦ
0x1027     | ဧ
0x1028     | ဨ
0x1029     | ဩ
0x102a     | ဪ
0x102b     | ါ
0x102c     | ာ
0x102d     | ိ
0x102e     | ီ
0x102f     | ု
0x1030     | ူ
0x1031     | ေ
0x1032     | ဲ
0x1033     | ဳ
0x1034     | ဴ
0x1035     | ဵ
0x1036     | ံ
0x1037     | ့
0x1038     | း
0x1039     | ္
0x103a     | ်
0x103b     | ျ
0x103c     | ြ
0x103d   

In [14]:
characters = list_myanmar_characters()
my_chars = [char for _, char in characters]

# Trim Phrase Length

In [15]:
CORPUS_LENGTH = 100

In [16]:
print("Wiki Phrases:", len(my_wiki_phrase_dictionary))
print("Flores Phrases:", len(my_flores_phrase_dictionary))

Wiki Phrases: 7166372
Flores Phrases: 20678


In [17]:
new_phrases = []
for phrase in my_wiki_phrase_dictionary + my_flores_phrase_dictionary:
    phrase = phrase.strip()
    if len(phrase) <= CORPUS_LENGTH:
        if phrase != "":
            new_phrases.append(phrase)
    else:
        phrase = phrase.split("။")
        phrase = list(itertools.chain.from_iterable([p.split("၊") for p in phrase]))
        phrase = [p.strip() for p in phrase if p.strip() != ""]
        phrase = [p for p in phrase if len(p) <= CORPUS_LENGTH]
        new_phrases.extend(phrase)

print("Total Phrases:", len(new_phrases))

Total Phrases: 7187152


In [18]:
max_length = 0
longest_phrase = ""
for phrase in new_phrases:
    if len(phrase) > max_length:
        max_length = len(phrase)
        longest_phrase = phrase

print("Max Length:", max_length)
print("Longest Phrase:", longest_phrase)

Max Length: 100
Longest Phrase: မြေကြီးပေါ်မှာရှိသောအုပျခြုပျမယျ့သူတိုင်းမင်းကြီးသည်များ၏အမည်များအပါအဝင်အတိတ်နှင့်အနာဂတ်ဖြစ်ရပ်များ,


# Export Chars and Phrases

In [19]:
# my_words = []
#
# for word in my_chars + my_wiki_word_dictionary + my_flores_word_dictionary:
#     if word not in my_words:
#         my_words.append(word)
#
# print("Before Removing Duplicates:", len(my_chars + my_wiki_word_dictionary + my_flores_word_dictionary))
# print("After Removing Duplicates:", len(my_words))

In [20]:
import os
import requests

def download_file(url, filename):
    # Check if file already exists
    if os.path.exists(filename):
        print(f"File {filename} already exists")
        return

    # Convert GitHub URL to raw content URL
    raw_url = url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

    # Download file
    print(f"Downloading {filename}...")
    response = requests.get(raw_url)

    # Save file
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print("Download complete")
    else:
        print(f"Download failed with status code {response.status_code}")

# File to download
url = "https://github.com/PaddlePaddle/PaddleOCR/blob/33fa33e4a990bdef095a4a74cf849781312c2b67/ppocr/utils/en_dict.txt"
filename = "data/en_dict.txt"

# Download if not exists
download_file(url, filename)

Downloading data/en_dict.txt...
Download complete


In [21]:
with open("data/en_dict.txt") as file:
    en_dict = file.read()

In [22]:
en_dict = en_dict.split("\n")
my_dict = en_dict + my_chars

In [23]:
with open("data/my_dict.txt", "w") as file:
    file.write("\n".join(my_dict))

In [24]:
my_phrases = list(set(new_phrases))

In [25]:
print("Before Removing Duplicates:", len(new_phrases))
print("After Removing Duplicates:", len(my_phrases))

Before Removing Duplicates: 7187152
After Removing Duplicates: 2737935


In [26]:
my_phrases[10000:10010]

['(အကျိုးကို)၊',
 'ပါဝင်ပတ်သက်နိုင်သည်။',
 'အတည်ပြုထားပြီး',
 'ထိုနေရာတွင်ရုပ်ပုံပိတ်ပင်တားဆီးမှုတစ်ခုစီတွင်ကိုယ်ပိုင်',
 'မောင်နှမချင်း',
 'ချက်ပေးရသည့်',
 'အိမ်သို့ပြန်ခဲ့ရ၏။',
 'လျှပ်စီးများများစီးပါက',
 'ပုံစံပြောင်းလဲခြင်းပြု',
 'ြမင်းခင်းသဘင်ပွဲ၌']

In [27]:
with open("data/my_corpus.txt", "w") as file:
    file.write("\n".join(my_phrases))