In [1]:
import os
import re

from datasets import load_dataset
from transformers import pipeline

# Load the Segmentation Model and Define the Segmentation Function

In [2]:
# Token-classification pipeline used by the segmentation helper.
# aggregation_strategy="simple" is the documented replacement for the
# deprecated grouped_entities=True flag: consecutive tokens with the same
# label are merged into one entry exposing "entity_group" and "word".
nlp = pipeline(
    "token-classification",
    model="chuuhtetnaing/myanmar-text-segmentation-model",
    aggregation_strategy="simple",
)

Device set to use mps:0


In [3]:
def segment(text):
    """Insert spaces between the segments predicted by the ``nlp`` pipeline.

    Every entry returned by ``nlp(text)`` is read through its ``entity_group``
    and ``word`` keys. A group of ``"B"`` opens a new segment; any other group
    is treated as a continuation and is glued onto the segment before it.

    Args:
        text: The unsegmented string handed to the pipeline.

    Returns:
        The segments joined by single spaces, or an empty string when the
        pipeline returns no entries.
    """
    pieces = []
    for entity in nlp(text):
        # A continuation with nothing before it (the prediction opens with a
        # non-"B" group) has no segment to extend, so it starts one instead of
        # failing with IndexError on the empty list.
        if entity["entity_group"] == "B" or not pieces:
            pieces.append(entity["word"])
        else:
            pieces[-1] += entity["word"]

    return " ".join(pieces)

In [4]:
# Smoke-test segment() on one unsegmented sentence; the displayed result has
# spaces inserted at the predicted segment boundaries.
text ="အချစ်ဆိုတာလူတွေရှင်သန်ဖို့သဘာဝကပေးတဲ့လက်နက်လား၊"

segment(text)

'အချစ်ဆိုတာ လူတွေရှင်သန်ဖို့ သဘာဝကပေးတဲ့ လက်နက်လား၊'

In [5]:
def is_english(char):
    """Tell whether ``char`` starts with an ASCII letter (a-z or A-Z).

    ``re.match`` anchors at the beginning of the string, so only the first
    character is examined; an empty string yields ``False``.
    """
    first_is_letter = re.match(r'[a-zA-Z]', char)
    return first_is_letter is not None

def process(example):
    """Rebuild the unsegmented sentence from ``burmese_raw`` and segment it.

    ``burmese_raw`` is split on whitespace and the tokens are concatenated
    again, keeping a single space only between two neighbouring tokens where
    the first ends with an English letter and the second starts with one.
    The joined string is passed to ``segment`` and stored under ``burmese``.

    Args:
        example: Mapping with a ``burmese_raw`` string; it is updated in place.

    Returns:
        The same ``example`` with the added ``burmese`` key.
    """
    tokens = example["burmese_raw"].split()

    # Keep the spaces that separate English words, and only those.
    # Raw input looks like "မင်း သ မီး က The White Tiger ရုပ် ရှင် ကို ...",
    # while the model should receive "မင်းသမီးကThe White Tigerရုပ်ရှင်ကို...",
    # for which (per the original author's note) it is expected to produce
    # "မင်းသမီးက The White Tigerရုပ်ရှင်ကို...".
    # Dropping every space would instead feed "မင်းသမီးကTheWhiteTigerရုပ်ရှင်ကို..."
    # and the model may answer "မင်းသမီးက TheWhiteTigerရုပ်ရှင်ကို...".
    #
    # Start at index 1 so each token is compared with its real predecessor:
    # starting at 0 would pair the first token with the last one (index -1)
    # and never examine the final token.
    for i in range(1, len(tokens)):
        if is_english(tokens[i - 1][-1]) and is_english(tokens[i][0]):
            # The space is prepended, so tokens[i][-1] is still the token's
            # own last character when the next iteration inspects it.
            tokens[i] = " " + tokens[i]

    example["burmese"] = segment("".join(tokens))
    return example

# Segment Raw Myanmar Text

In [6]:
# Load the parallel news corpus and rename its "burmese" column to
# "burmese_raw", which frees the "burmese" name for the segmented text
# that process() writes.
ds = (
    load_dataset("Francis-Phone/mm_eng_news_translation")
    .rename_column("burmese", "burmese_raw")
)

In [7]:
# Run process() over the train split, then select the three columns
# "burmese", "english" and "burmese_raw".
ds['train'] = (
    ds['train']
    .map(process)
    .select_columns(["burmese", "english", "burmese_raw"])
)

In [8]:
# Run process() over the validation split, then select the three columns
# "burmese", "english" and "burmese_raw".
ds['validation'] = (
    ds['validation']
    .map(process)
    .select_columns(["burmese", "english", "burmese_raw"])
)

Map:   0%|          | 0/9629 [00:00<?, ? examples/s]

In [9]:
# Run process() over the test split, then select the three columns
# "burmese", "english" and "burmese_raw".
ds['test'] = (
    ds['test']
    .map(process)
    .select_columns(["burmese", "english", "burmese_raw"])
)

Map:   0%|          | 0/9629 [00:00<?, ? examples/s]

# Upload to the Hugging Face Hub

In [10]:
# Publish all splits as a private dataset repository.
# The access token is read from the HF_TOKEN environment variable so that it
# is never stored in the notebook. If the variable is unset, token=None is
# passed; per the datasets documentation this falls back to the token saved
# by a local Hugging Face login.
ds.push_to_hub(
    "chuuhtetnaing/myanmar-english-news-translation-dataset",
    private=True,
    token=os.environ.get("HF_TOKEN"),
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-english-news-translation-dataset/commit/f19cb656155aea81fc3bf93b82dd9ff6da1cb4f9', commit_message='Upload dataset', commit_description='', oid='f19cb656155aea81fc3bf93b82dd9ff6da1cb4f9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-english-news-translation-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-english-news-translation-dataset'), pr_revision=None, pr_num=None)