In [15]:
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value, load_dataset
from itertools import chain
import fasttext
from huggingface_hub import hf_hub_download
import re
import pandas as pd
from sklearn.model_selection import train_test_split

# Load Dataset and Language Classifcation Model

In [3]:
ds = load_dataset("chuuhtetnaing/myanmar-wikipedia-dataset")

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/92.1M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/116340 [00:00<?, ? examples/s]

model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

In [4]:
model.predict(["Hello, world!"])[0][0][0]

'__label__eng_Latn'

# Preprocess Functions

In [5]:
def create_break_pattern():
    """Creates and returns the regular expression pattern for Myanmar syllable breaking."""
    my_consonant = r"က-အ"
    en_char = r"a-zA-Z0-9"
    symbols = r"!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~"

    other_char = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။"
    subscript_symbol = r'္'
    a_that = r'်'

    # Regular expression pattern for Myanmar syllable breaking
    return re.compile(
        r"((?<!" + subscript_symbol + r")[" + my_consonant + r"]"
        r"(?!["
        + a_that + subscript_symbol + r"])"
        + r"|[" + other_char + r"]"
        + r"|[" + en_char + r"]"
        + r"|[" + symbols + r"])"
    )

def break_syllables(line, break_pattern, separator):
    """Applie
    s syllable breaking rules to a line."""
    line = re.sub(r'\s+', ' ', line.strip()) # Normalize space
    segmented_line = break_pattern.sub(separator + r"\1", line)

    # Remove the leading delimiter if it exists
    if segmented_line.startswith(separator):
        segmented_line = segmented_line[len(separator):]

    # Replace delimiter+space+delimiter with a single space
    double_delimiter = separator + " " + separator
    segmented_line = segmented_line.replace(double_delimiter, " ")

    return segmented_line

break_pattern = create_break_pattern()

def mm_split(text):
    text = text.strip()
    seperator = "|X|"

    result = break_syllables(text, break_pattern, seperator)
    result = result.split(seperator)

    return result

def generate_data(lines):
    labels = []
    inputs = []
    
    for line in lines:
        language = model.predict([line])[0][0][0]
        if language == "__label__mya_Mymr":
            segments = line.split()
            line_label = []
            for segment in segments:
                chars = mm_split(segment)
                segment_label = ['I'] * len(chars)
                segment_label[0] = 'B'
                line_label.append(segment_label)
    
            line_input = mm_split(re.sub(r'\s+', '', line))
            line_label = list(chain.from_iterable(line_label))
    
            inputs.append(line_input)
            labels.append(line_label)

    return inputs, labels

def process_and_expand(examples):
    """
    Input: batch of N contents
    Output: batch of M rows (where M > N, since each content has multiple lines)
    """
    all_tokens = []
    all_labels = []
    
    for content in examples['content']:
        lines = [line.strip() for line in re.split(r'\n+', content) if line.strip()]
        
        if lines:
            input_data, labels = generate_data(lines)
            # input_data: [['tok1', 'tok2'], ['tok3', 'tok4', 'tok5'], ...]  <- 5 lines
            # labels:     [['B', 'I'],       ['B', 'I', 'I'],          ...]  <- 5 lines
            
            # Each line becomes a new row
            for tokens, label in zip(input_data, labels):
                all_tokens.append(tokens)
                all_labels.append(label)
    
    return {
        "tokens": all_tokens,  # M items (expanded)
        "labels": all_labels   # M items (expanded)
    }


In [6]:
content = ds['train'][0]['content']

lines = [line.strip() for line in re.split(r'\n+', content) if line.strip()]

generate_data(lines)

([['(', 'ဒု', ')', 'ချီး', 'ယား', 'တန်း', '(', 'တောင်', ')', 'ရွာ'],
  ['ကိုး', 'ကား']],
 [['B', 'I', 'I', 'B', 'I', 'I', 'B', 'I', 'I', 'I'], ['B', 'I']])

# Create the Segmentation Dataset

In [7]:
# This expands the dataset!
new_ds = ds['train'].map(
    process_and_expand,
    batched=True,
    batch_size=100,
    remove_columns=ds['train'].column_names,
)

print(f"Original rows: {len(ds['train'])}")
print(f"New rows: {len(new_ds)}")



Map:   0%|          | 0/116340 [00:00<?, ? examples/s]

Original rows: 116340
New rows: 753639


In [8]:
label_names = ["B", "I"]
label2id = {label: i for i, label in enumerate(label_names)}

In [9]:
features = Features({
    "tokens": Sequence(Value("string")),
    "segment_tags": Sequence(ClassLabel(names=label_names)),
})

In [10]:
def convert_labels_to_ids(example):
    example["segment_tags"] = [label2id[label] for label in example["labels"]]
    return example

In [11]:
new_ds = new_ds.map(convert_labels_to_ids, remove_columns=["labels"])
new_ds = new_ds.cast(features)


Map:   0%|          | 0/753639 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/753639 [00:00<?, ? examples/s]

In [12]:
print(new_ds.features["segment_tags"].feature.names)

['B', 'I']


# Remove Duplications

In [13]:
df = new_ds.to_pandas()

# Convert list to string for deduplication
df['tokens_str'] = df['tokens'].apply(lambda x: '|'.join(x))
df = df.drop_duplicates(subset='tokens_str')
df = df.drop(columns='tokens_str')

# Simple split
train_df, test_df = train_test_split(df, test_size=0.05, random_state=42)

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")

Train: 544120
Test: 28638


In [16]:
# Convert back to dataset
deduped_ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

In [17]:
deduped_ds = deduped_ds.cast(features)

Casting the dataset:   0%|          | 0/544120 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/28638 [00:00<?, ? examples/s]

# Upload to HuggingFace

In [None]:
deduped_ds.push_to_hub("chuuhtetnaing/myanmar-text-segmentation-dataset", private=True)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-text-segmentation-dataset/commit/90cd35ad1702c1cfd3c3cd221c0dc6a1434dcdb1', commit_message='Upload dataset', commit_description='', oid='90cd35ad1702c1cfd3c3cd221c0dc6a1434dcdb1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-text-segmentation-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-text-segmentation-dataset'), pr_revision=None, pr_num=None)

# Example Reconstruction Function

In [19]:
def reconstruct(tokens, labels):
    """
    Combine tokens based on B/I labels.
    Add space before 'B' tokens (except the first one).
    """
    result = []
    for token, label in zip(tokens, labels):
        if label == 0 and result:
            result.append(" ")
        result.append(token)
    return "".join(result)

In [20]:
reconstruct(new_ds[700]['tokens'], new_ds[700]['segment_tags'])

'သူနာပြုဌာန (FKP)'