In [1]:
from datasets import load_dataset
from icu import Transliterator
from datasets import concatenate_datasets
from myanmartools import ZawgyiDetector

In [2]:
# ICU transliterator built from the 'Zawgyi-my' transform id; it performs the
# Zawgyi -> Unicode rewriting used in the conversion cells below.
converter = Transliterator.createInstance('Zawgyi-my')
# Detector whose get_zawgyi_probability() score decides which texts get converted.
detector = ZawgyiDetector()

# Load the Original C4 Dataset

In [3]:
# Load the "my" configuration of the allenai/c4 corpus (all available splits).
dataset = load_dataset(path="allenai/c4", name="my")

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

In [9]:
# Display the loaded splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'timestamp', 'url'],
        num_rows: 813530
    })
    validation: Dataset({
        features: ['text', 'timestamp', 'url'],
        num_rows: 858
    })
})

# Combine the Train and Validation Splits

In [4]:
# Merge both splits into one corpus; a fresh train/test split is created later,
# just before upload.
splits_to_merge = [dataset[split_name] for split_name in ('train', 'validation')]
c4_my_dataset = concatenate_datasets(splits_to_merge)
print(f"Combined dataset has {len(c4_my_dataset)} examples")

Combined dataset has 814388 examples


In [5]:
# Confirm the combined dataset kept the original columns.
c4_my_dataset.column_names

['text', 'timestamp', 'url']

# Example Conversion from Zawgyi to Unicode

In [6]:
# Walk through detection + conversion on the first document of the corpus.
original_text = c4_my_dataset[0]['text']
score = detector.get_zawgyi_probability(original_text)

print("Original Text:", original_text)

# The text is only converted (and the score/result printed) when the detector
# score is strictly above 0.5.
if score > 0.5:
    converted_text = converter.transliterate(original_text)
    print()
    # end="\n\n" leaves one blank line between the score and the converted text.
    print("Score:", score, end="\n\n")
    print("Converted Text:", converted_text)

Original Text: ၂၀၁၀ ခုႏွစ္က အီဖရန္ပီနာဖလိုရီဒါတစ္ေယာက္ ေဟာလိဝုဒ္အေက်ာ္အေမာ္ေတြတက္တဲ့ CNN Heroes ညစာစားပြဲကိုသြားတက္ေတာ့ ဖိလစ္ပိုင္က လူအေတာ္မ်ားမ်ား သူ႔ကိုမသိၾကေပ။ အဆိုပါႏွစ္၏ CNN သတင္းဌာနကေပးေသာ သူရဲေကာင္းဆုကို ရခဲ့ၿပီးေနာက္တြင္ေတာ့ ေလဆိပ္အထိ ျပည္သူရာေက်ာ္လာႀကိဳၾကသည္။ “ကိုယ့္ႏိုင္ငံလည္း ျပန္ေရာက္ေရာ ကၽြန္ေတာ္က ဆူပါစတားျဖစ္ေနပါေလေရာ”ဟု ၎က ေျပာၾကားသည္။ ပီနာဖလိုရီဒါသည္ လက္တြန္းလွည္းစာသင္ခန္းမ်ားကိုဖန္တီးကာ စာအုပ္မ်ား၊ သင္ေထာက္ကူမ်ားတင္ၿပီး လွည့္လည္သြားလာကူညီေပးခဲ့သျဖင့္ ထိုဆုကိုရရွိခဲ့ျခင္း ျဖစ္သည္။ ၁၉၉၇ ခုႏွစ္မွစတင္ကာ ပီနာဖလိုရီဒါႏွင့္ ေစတနာ့ဝန္ထမ္း ဆယ္ေက်ာ္သက္မ်ားစြာတို႔က အဆိုပါ လက္တြန္းလွည္းမ်ားျဖင့္ လွည့္လည္ကာ လမ္းေပၚကေလးမ်ားစြာကို စာေရး၊ စာဖတ္တတ္ေအာင္ လိုက္လံစာသင္ၾကားေပးခဲ့ေပသည္။
“ဒီဆုရရွိလိုက္တာ အက်ိဳးရွိပါတယ္။ ကၽြန္ေတာ္နဲ႔ ကၽြန္ေတာ့္ေစတနာ့ဝန္ထမ္း လုပ္ေဖာ္ကိုင္ဖက္ေတြအတြက္ လုပ္ရက်ိဳးနပ္တယ္လို႔ ခံစားရေစပါတယ္”ဟု ယခုအခါ အသက္ ၃၇ ထဲ ေရာက္ေနၿပီျဖစ္ေသာ ပီနာဖလိုရီဒါကဆိုသည္။ ဖိလစ္ပိုင္အစိုးရကလည္း အရပ္သားမ်ားကိုေပးသည့္ အျမင့္ဆံုးဆုျဖစ္ေသာ လာကန္ဒူလာဆုကို ေပးအပ္ခဲ့သည္။ ထိုဆုက ျပည္သူမ်ားအတြက္ အသက

# Convert the Whole Dataset from Zawgyi to Unicode

In [7]:
def zawgyi_to_unicode_converter(examples, threshold=0.5):
    """Convert the Zawgyi-encoded entries of a batch to Unicode.

    Every string in ``examples['text']`` is scored with the notebook-level
    ``detector``. Strings whose score is strictly greater than ``threshold``
    are rewritten with the notebook-level ``converter``; all other strings
    are kept unchanged.

    Args:
        examples: Batch mapping with a ``'text'`` entry holding an iterable
            of strings (the form passed by ``Dataset.map(..., batched=True)``).
        threshold: Exclusive lower bound on the detector score for a text to
            be converted. Defaults to 0.5, the cut-off used throughout this
            notebook. Override via ``fn_kwargs`` when calling ``Dataset.map``.

    Returns:
        The same ``examples`` object, with ``examples['text']`` replaced by a
        list of the (possibly converted) texts in their original order.
    """
    # The right-hand side is fully evaluated before the assignment, so the
    # original texts are read before the batch entry is overwritten.
    examples['text'] = [
        converter.transliterate(text)
        if detector.get_zawgyi_probability(text) > threshold
        else text
        for text in examples['text']
    ]
    return examples

In [8]:
# Apply the converter to the whole corpus: batches of 100 examples, spread
# over 10 worker processes.
c4_my_dataset_unicode = c4_my_dataset.map(
    function=zawgyi_to_unicode_converter,
    num_proc=10,
    batch_size=100,
    batched=True,
)



Map (num_proc=10):   0%|          | 0/814388 [00:00<?, ? examples/s]

# Split into Train/Test and Upload to Hugging Face

In [11]:
# Hold out 10% of the converted corpus as a test split (fixed seed so the split
# is reproducible).
# This cell rebinds the name from a Dataset to the DatasetDict returned by
# train_test_split. A DatasetDict has no train_test_split method, so without the
# guard a second run of this cell would raise AttributeError; with it, a re-run
# leaves the existing split untouched.
if hasattr(c4_my_dataset_unicode, "train_test_split"):
    c4_my_dataset_unicode = c4_my_dataset_unicode.train_test_split(test_size=0.1, seed=42)

In [13]:
# Publish the dataset to the Hub repository; the returned commit info is shown
# as the cell output.
c4_my_dataset_unicode.push_to_hub(repo_id="chuuhtetnaing/myanmar-c4-dataset")

Uploading the dataset shards:   0%|          | 0/15 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-c4-dataset/commit/2cdcb1a3e241ea70aff6308e0cc23ab6d0e84716', commit_message='Upload dataset', commit_description='', oid='2cdcb1a3e241ea70aff6308e0cc23ab6d0e84716', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-c4-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-c4-dataset'), pr_revision=None, pr_num=None)