In [1]:
from datasets import load_dataset
from icu import Transliterator
from datasets import concatenate_datasets
from myanmartools import ZawgyiDetector

# Load the Original CulturaX Dataset

In [2]:
# Load the "my" configuration of the uonlp/CulturaX dataset.
# NOTE(review): "my" is presumably the Myanmar (Burmese) language subset — confirm against the dataset card.
dataset = load_dataset("uonlp/CulturaX", "my")

In [3]:
# Display the loaded object's summary (splits, column names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'timestamp', 'url', 'source'],
        num_rows: 865575
    })
})

# Example Conversion from Zawgyi to Unicode

In [4]:
# Single-document demo: detect whether one text is Zawgyi-encoded and, if so,
# show its Unicode conversion.
detector = ZawgyiDetector()
# ICU transliterator created for the 'Zawgyi-my' transform ID.
converter = Transliterator.createInstance('Zawgyi-my')

# Use the text of the first row of the 'train' split as the sample.
original_text = dataset['train'][0]['text']
score = detector.get_zawgyi_probability(original_text)

print("Original Text:", original_text)

# Convert only when the detector's probability exceeds 0.5. Otherwise nothing
# further is printed and `converted_text` is not defined by this cell.
if score > 0.5:
    converted_text = converter.transliterate(original_text)
    print()
    print("Score:", score)
    print()
    print("Converted Text:", converted_text)

Original Text: လွည္းကေလးနဲ႔ ေက်ာင္းဆရာ
၂၀၁၀ ခုႏွစ္က အီဖရန္ပီနာဖလိုရီဒါတစ္ေယာက္ ေဟာလိဝုဒ္အေက်ာ္အေမာ္ေတြတက္တဲ့ CNN Heroes ညစာစားပြဲကိုသြားတက္ေတာ့ ဖိလစ္ပိုင္က လူအေတာ္မ်ားမ်ား သူ႔ကိုမသိၾကေပ။ အဆိုပါႏွစ္၏ CNN သတင္းဌာနကေပးေသာ သူရဲေကာင္းဆုကို ရခဲ့ၿပီးေနာက္တြင္ေတာ့ ေလဆိပ္အထိ ျပည္သူရာေက်ာ္လာႀကိဳၾကသည္။ "ကိုယ့္ႏိုင္ငံလည္း ျပန္ေရာက္ေရာ ကၽြန္ေတာ္က ဆူပါစတားျဖစ္ေနပါေလေရာ"ဟု ၎က ေျပာၾကားသည္။ ပီနာဖလိုရီဒါသည္ လက္တြန္းလွည္းစာသင္ခန္းမ်ားကိုဖန္တီးကာ စာအုပ္မ်ား၊ သင္ေထာက္ကူမ်ားတင္ၿပီး လွည့္လည္သြားလာကူညီေပးခဲ့သျဖင့္ ထိုဆုကိုရရွိခဲ့ျခင္း ျဖစ္သည္။ ၁၉၉၇ ခုႏွစ္မွစတင္ကာ ပီနာဖလိုရီဒါႏွင့္ ေစတနာ့ဝန္ထမ္း ဆယ္ေက်ာ္သက္မ်ားစြာတို႔က အဆိုပါ လက္တြန္းလွည္းမ်ားျဖင့္ လွည့္လည္ကာ လမ္းေပၚကေလးမ်ားစြာကို စာေရး၊ စာဖတ္တတ္ေအာင္ လိုက္လံစာသင္ၾကားေပးခဲ့ေပသည္။
"ဒီဆုရရွိလိုက္တာ အက်ိဳးရွိပါတယ္။ ကၽြန္ေတာ္နဲ႔ ကၽြန္ေတာ့္ေစတနာ့ဝန္ထမ္း လုပ္ေဖာ္ကိုင္ဖက္ေတြအတြက္ လုပ္ရက်ိဳးနပ္တယ္လို႔ ခံစားရေစပါတယ္"ဟု ယခုအခါ အသက္ ၃၇ ထဲ ေရာက္ေနၿပီျဖစ္ေသာ ပီနာဖလိုရီဒါကဆိုသည္။ ဖိလစ္ပိုင္အစိုးရကလည္း အရပ္သားမ်ားကိုေပးသည့္ အျမင့္ဆံုးဆုျဖစ္ေသာ လာကန္ဒူလာဆုကို ေပးအပ္ခဲ့သည္။ ထိ

# Convert the Whole Dataset from Zawgyi to Unicode

In [5]:
def zawgyi_to_unicode_converter(examples, threshold=0.5):
    """Convert the Zawgyi-encoded entries of a batch's 'text' column to Unicode.

    Written to be passed to ``map(..., batched=True)``: it receives a batch
    and returns it with only the 'text' column rewritten.

    Args:
        examples: Batch mapping whose 'text' key holds a list of strings.
        threshold: A text is transliterated only when its Zawgyi probability
            is strictly greater than this value; otherwise it is kept as is.
            Defaults to 0.5, the cutoff that was previously hard-coded.
            Override it through ``map(..., fn_kwargs={"threshold": ...})``.

    Returns:
        The same ``examples`` object, with 'text' replaced by a list of the
        (possibly converted) strings in their original order.
    """
    # Both helpers are built on every call (i.e. once per batch) instead of
    # being shared at module level.
    # NOTE(review): presumably this keeps the function self-contained when it
    # is sent to map() worker processes — confirm before hoisting them out.
    detector = ZawgyiDetector()
    converter = Transliterator.createInstance('Zawgyi-my')

    examples['text'] = [
        converter.transliterate(text)
        if detector.get_zawgyi_probability(text) > threshold
        else text
        for text in examples['text']
    ]
    return examples

In [6]:
# Run the Zawgyi -> Unicode converter over the loaded dataset.
# The converter is called with batches of 100 rows (batched=True, batch_size=100)
# and the work is spread over 10 processes (num_proc=10).
# The result is bound to a separate name, so `dataset` still refers to the
# data as originally loaded.
culturax_my_dataset_unicode = dataset.map(
    zawgyi_to_unicode_converter,
    batched=True,
    batch_size=100,
    num_proc=10
)

Map (num_proc=10):   0%|          | 0/865575 [00:00<?, ? examples/s]

# Split into Train/Test and Upload to Hugging Face

In [8]:
# Hold out 10% of the converted data as a 'test' split, with a fixed seed so the
# split is reproducible.
# The guard makes this cell safe to re-run: the name is rebound to the split
# result, so without it a second run would split the already-reduced 'train'
# split again and drop the existing 'test' split.
if 'test' not in culturax_my_dataset_unicode:
    culturax_my_dataset_unicode = culturax_my_dataset_unicode['train'].train_test_split(test_size=0.1, seed=42)

In [9]:
# Display the split result (split names, column names, row counts).
culturax_my_dataset_unicode

DatasetDict({
    train: Dataset({
        features: ['text', 'timestamp', 'url', 'source'],
        num_rows: 779017
    })
    test: Dataset({
        features: ['text', 'timestamp', 'url', 'source'],
        num_rows: 86558
    })
})

In [10]:
# Upload the converted, split dataset to the "chuuhtetnaing/myanmar-culturax-dataset" repo.
# NOTE(review): presumably requires being authenticated with write access to that repo — confirm.
culturax_my_dataset_unicode.push_to_hub("chuuhtetnaing/myanmar-culturax-dataset")

Uploading the dataset shards:   0%|          | 0/14 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-culturax-dataset/commit/ce7b813b11d22dd1350c129e827326bf35686b09', commit_message='Upload dataset', commit_description='', oid='ce7b813b11d22dd1350c129e827326bf35686b09', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-culturax-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-culturax-dataset'), pr_revision=None, pr_num=None)