In [1]:
from datasets import load_dataset
from icu import Transliterator
from datasets import concatenate_datasets
from myanmartools import ZawgyiDetector

# Load the Original CC100 Dataset

In [2]:
# Load two configurations of the statmt/cc100 corpus: "my" and "my_zaw".
# Judging by the first records shown below, "my" holds standard-Unicode Burmese
# text and "my_zaw" holds Zawgyi-encoded Burmese text.
# NOTE(review): trust_remote_code=True lets the dataset's loading script from the
# Hub run locally; keep it only for repositories you trust.
my_dataset = load_dataset("statmt/cc100", "my", trust_remote_code=True)
my_zaw_dataset = load_dataset("statmt/cc100", "my_zaw", trust_remote_code=True)

Downloading data:   0%|          | 0.00/48.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Show the split layout, column names and row count of the "my" subset.
my_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 2207994
    })
})

In [6]:
# Peek at the first record of the "my" subset.
my_dataset['train'][0]

{'id': '0', 'text': 'အပိုင်း ၁ သင်ခန်းစာ ၃ ကုဒ်လျှော့ရေးပါ\n'}

In [4]:
# Show the split layout, column names and row count of the "my_zaw" subset.
my_zaw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 11516761
    })
})

In [7]:
# Peek at the first record of the "my_zaw" subset.
my_zaw_dataset['train'][0]

{'id': '0', 'text': 'မည္သည္႔အေၾကာင္းအရာကိုမဆို ၀င္ေရာက္ ေျပာဆို ႏုိင္ပါသည္။\n'}

# Example Conversion from Zawgyi to Unicode

In [9]:
# One-off demo on a single record: score it with the Zawgyi detector and, if the
# score is above 0.5, run it through the ICU 'Zawgyi-my' transliterator.
detector = ZawgyiDetector()
converter = Transliterator.createInstance('Zawgyi-my')

# First record of the "my_zaw" training split.
original_text = my_zaw_dataset['train'][0]['text']
score = detector.get_zawgyi_probability(original_text)

print("Original Text:", original_text)

# Nothing further is printed when the score is 0.5 or lower.
if score > 0.5:
    converted_text = converter.transliterate(original_text)
    print()
    print("Score:", score)
    print()
    print("Converted Text:", converted_text)

Original Text: မည္သည္႔အေၾကာင္းအရာကိုမဆို ၀င္ေရာက္ ေျပာဆို ႏုိင္ပါသည္။


Score: 1.0

Converted Text: မည်သည့်အကြောင်းအရာကိုမဆို ၀င်ရောက် ပြောဆို နိုင်ပါသည်။



# Convert the Whole Dataset from Zawgyi to Unicode

In [10]:
def zawgyi_to_unicode_converter(examples, threshold=0.5):
    """Convert Zawgyi-encoded entries of a batch's ``text`` column to Unicode.

    Written for ``Dataset.map(..., batched=True)``: every string in
    ``examples['text']`` is scored with ``ZawgyiDetector`` and, when the score
    is strictly greater than ``threshold``, passed through the ICU
    ``Zawgyi-my`` transliterator. Strings at or below the threshold are kept
    unchanged.

    Args:
        examples: Batch mapping whose ``'text'`` entry is a list of strings.
        threshold: Zawgyi probability above which a string is converted.
            Defaults to 0.5, the cut-off used throughout this notebook; it can
            be overridden from ``map`` with ``fn_kwargs={"threshold": ...}``.

    Returns:
        The same ``examples`` mapping, with ``'text'`` replaced by the list of
        (possibly converted) strings in their original order.
    """
    # Built inside the function so that every call -- and therefore every
    # worker process when ``map`` runs with ``num_proc`` -- uses its own
    # detector and transliterator instances.
    detector = ZawgyiDetector()
    converter = Transliterator.createInstance('Zawgyi-my')

    examples['text'] = [
        converter.transliterate(text)
        if detector.get_zawgyi_probability(text) > threshold
        else text
        for text in examples['text']
    ]
    return examples

In [12]:
# Run the converter over every row of the "my_zaw" training split. Rows are
# passed to the function in batches of 100, with the work split across 10
# worker processes.
my_unicode_dataset = my_zaw_dataset['train'].map(
    zawgyi_to_unicode_converter,
    batched=True,
    batch_size=100,
    num_proc=10
)

Map (num_proc=10):   0%|          | 0/11516761 [00:00<?, ? examples/s]

# Combine Original Unicode Dataset and Converted Unicode Dataset

In [23]:
# Stack the "my" rows and the converted "my_zaw" rows into one dataset.
# Expected size: 2,207,994 + 11,516,761 = 13,724,755 rows.
cc100_dataset = concatenate_datasets([my_dataset['train'], my_unicode_dataset])
print(f"Combined dataset has {len(cc100_dataset)} examples")

Combined dataset has 13724755 examples


In [27]:
# Both source subsets number their rows from '0', so 'id' is no longer unique
# after concatenation; drop the column.
# The membership check makes this cell safe to re-run: once the column is gone
# the cell is a no-op instead of raising.
if "id" in cc100_dataset.column_names:
    cc100_dataset = cc100_dataset.remove_columns("id")

# Upload to Hugging Face

In [32]:
# Hold out 10% of the rows as a test split; seed=42 fixes the split so it can be reproduced.
# NOTE(review): this rebinds cc100_dataset to the split result, so running the
# cell a second time without re-running the cells above will likely fail -- confirm or rename.
cc100_dataset = cc100_dataset.train_test_split(test_size=0.1, seed=42)

In [33]:
# Publish the train/test splits to the Hub repository named below.
# NOTE(review): presumably needs a prior Hub login with write access to this repo -- confirm.
cc100_dataset.push_to_hub("chuuhtetnaing/myanmar-cc100-dataset")

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2471 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2471 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2471 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2471 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2471 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1373 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-cc100-dataset/commit/50e9383b0bcfec0847df1ae921b0254680bea452', commit_message='Upload dataset', commit_description='', oid='50e9383b0bcfec0847df1ae921b0254680bea452', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-cc100-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-cc100-dataset'), pr_revision=None, pr_num=None)