In [None]:
import os
import re
from itertools import chain

import fasttext
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value, load_dataset
from huggingface_hub import hf_hub_download
from huggingface_hub import login
from sklearn.model_selection import train_test_split

# Read the Hugging Face access token from the environment rather than embedding
# it in the notebook. When HF_TOKEN is unset, token=None lets login() fall back
# to its own prompt instead of failing on an empty string.
login(token=os.environ.get("HF_TOKEN"))

# Load Dataset and Language Classification Model

In [2]:
# Source corpus: the `chuuhtetnaing/myanmar-wikipedia-dataset` repo; later cells
# read its 'train' split and the 'content' column of each row.
ds = load_dataset("chuuhtetnaing/myanmar-wikipedia-dataset")

# Fetch the fastText language-identification weights and load them. `model` is
# used by generate_data() to keep only lines labelled `__label__mya_Mymr`.
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/92.1M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/116340 [00:00<?, ? examples/s]

model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

In [3]:
# Smoke test of the classifier: index into the prediction to pull out the first
# label returned for the single input string.
model.predict(["Hello, world!"])[0][0][0]

'__label__eng_Latn'

# Preprocess Functions

In [9]:
import re

def create_break_pattern():
    """Build the compiled regex that marks where a Myanmar syllable begins.

    The pattern consists of one capture group that matches a single character
    opening a new syllable: a consonant that neither follows the subscript
    sign nor is followed by asat / the subscript sign, one of the standalone
    letters, digits or punctuation marks, or an ASCII symbol.

    Returns:
        A compiled ``re.Pattern`` with exactly one capture group.
    """
    consonants = r"က-အ"
    standalone = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။"
    ascii_symbols = r"!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~"
    subscript_sign = r'္'
    asat = r'်'

    # A consonant only opens a syllable when it is not preceded by the subscript
    # sign and is not immediately followed by asat or the subscript sign.
    syllable_initial = rf"(?<!{subscript_sign})[{consonants}](?![{asat}{subscript_sign}])"
    alternatives = "|".join(
        (syllable_initial, rf"[{standalone}]", rf"[{ascii_symbols}]")
    )
    return re.compile("(" + alternatives + ")")

# Compile once when the cell runs; break_syllables_myanmar() reuses this
# module-level pattern on every call.
break_pattern = create_break_pattern()

def break_syllables_myanmar(text, separator="|X|"):
    """Split ``text`` into syllables using the module-level ``break_pattern``.

    Args:
        text: String to segment.
        separator: Marker inserted in front of every character the pattern
            matches; the marked string is then split on it.

    Returns:
        The non-empty pieces, in order.
    """
    marked = break_pattern.sub(separator + r"\1", text)
    # A match at position 0 produces a leading marker and therefore a leading
    # empty piece; dropping empty pieces takes care of it.
    return list(filter(None, marked.split(separator)))

def is_myanmar(text):
    """Return True when ``text`` has at least one character in U+1000–U+109F."""
    first_hit = re.search(r'[\u1000-\u109F]', text)
    return first_hit is not None

def tokenize_chunk(chunk):
    """
    Turn one whitespace-free chunk into a list of tokens.

    - Empty chunk: no tokens.
    - Chunk without Myanmar characters: the chunk itself is the only token.
    - Otherwise: the chunk is cut into alternating Myanmar / non-Myanmar runs;
      Myanmar runs are syllable-broken, other runs are kept whole.
    """
    if not chunk:
        return []
    if not is_myanmar(chunk):
        return [chunk]

    # Group 1 captures a Myanmar run, group 2 a run of anything else.
    script_runs = re.compile(
        r'([\u1000-\u109F]+)'
        r'|([^\u1000-\u109F]+)'
    )

    pieces = []
    for run in script_runs.finditer(chunk):
        myanmar_run = run.group(1)
        if myanmar_run:
            pieces += break_syllables_myanmar(myanmar_run)
        else:
            # Exactly one group takes part in each match and both require at
            # least one character, so group 2 is non-empty here.
            pieces.append(run.group(2))
    return pieces


def create_labels(line):
    """
    Tokenize a word-segmented line and tag every token with B or I.

    The line is split on whitespace, which is taken as the ground-truth word
    boundary. Each resulting word is tokenized with ``tokenize_chunk``; its
    first token is tagged 'B' and every following token 'I'.

    Returns:
        dict with parallel lists under "tokens" and "labels".
    """
    tokens_out, labels_out = [], []

    for word in line.split():
        pieces = tokenize_chunk(word)
        if not pieces:
            continue
        tokens_out += pieces
        labels_out += ['B'] + ['I'] * (len(pieces) - 1)

    return {"tokens": tokens_out, "labels": labels_out}


def reconstruct(tokens, labels):
    """Join tokens back into text, putting a space before each 'B' token
    except the very first token."""
    return "".join(
        " " + token if position and label == "B" else token
        for position, (token, label) in enumerate(zip(tokens, labels))
    )

def generate_data(lines):
    """Generate tokens and labels for the lines classified as Burmese.

    Each line is stripped; blank lines are dropped. The remaining lines are
    sent to the module-level fastText ``model`` in one batched ``predict``
    call, and only those whose first predicted label is ``__label__mya_Mymr``
    are tokenized with ``create_labels``.

    Args:
        lines: Iterable of strings, one text line each.

    Returns:
        ``(inputs, labels)``: two parallel lists, one entry per kept line,
        holding that line's token list and its B/I label list.
    """
    candidates = [stripped for stripped in (line.strip() for line in lines) if stripped]
    if not candidates:
        # Nothing to classify; skip the model call.
        return [], []

    # One batched prediction instead of one call per line: given a list,
    # predict() returns per-input label lists as its first element.
    predicted = model.predict(candidates)[0]

    inputs = []
    labels = []
    for line, line_labels in zip(candidates, predicted):
        if line_labels[0] == "__label__mya_Mymr":
            result = create_labels(line)
            inputs.append(result["tokens"])
            labels.append(result["labels"])

    return inputs, labels

def process_and_expand(examples):
    """
    Batched mapper: turn N documents into M line-level rows.

    Every entry of ``examples['content']`` is split on runs of newlines; the
    stripped, non-blank lines go through ``generate_data`` and each line it
    returns becomes one output row. M is therefore the number of lines kept
    across the batch, not the number of documents.

    Returns:
        dict with parallel lists "tokens" and "labels", one item per row.
    """
    token_rows, label_rows = [], []

    for document in examples['content']:
        stripped = (piece.strip() for piece in re.split(r'\n+', document))
        non_blank = [piece for piece in stripped if piece]
        if not non_blank:
            continue

        doc_tokens, doc_labels = generate_data(non_blank)
        # One output row per line returned by generate_data.
        for row_tokens, row_labels in zip(doc_tokens, doc_labels):
            token_rows.append(row_tokens)
            label_rows.append(row_labels)

    return {"tokens": token_rows, "labels": label_rows}


In [11]:

# Test cases
# Mixed-script samples: Myanmar only, Latin only, digits, and Myanmar glued to
# Latin text without a space in between.
test_cases = [
    "နယူးဇီလန် ရဲများသည် Black Beauty1 45 and 678 ၏",
    "နယူးဇီလန် ရဲများသည်Black Beauty ၏",
    "နယူးဇီလန် ရဲများသည် Black Beauty၏",
    "Hello World",
    "မြန်မာ English mixed စာ",
    "test123 hello",
    "Beauty1 is 45kg",
]

# Round-trip check: label each sample, rebuild the text from tokens + labels,
# and mark whether the rebuilt text equals the input.
for text in test_cases:
    result = create_labels(text)
    reconstructed = reconstruct(result["tokens"], result["labels"])
    match = "✅" if text == reconstructed else "❌"
    print(f"Input:   {text}")
    print(f"Tokens:  {result['tokens']}")
    print(f"Labels:  {result['labels']}")
    print(f"Output:  {reconstructed} {match}")
    print()

Input:   နယူးဇီလန် ရဲများသည် Black Beauty1 45 and 678 ၏
Tokens:  ['န', 'ယူး', 'ဇီ', 'လန်', 'ရဲ', 'များ', 'သည်', 'Black', 'Beauty1', '45', 'and', '678', '၏']
Labels:  ['B', 'I', 'I', 'I', 'B', 'I', 'I', 'B', 'B', 'B', 'B', 'B', 'B']
Output:  နယူးဇီလန် ရဲများသည် Black Beauty1 45 and 678 ၏ ✅

Input:   နယူးဇီလန် ရဲများသည်Black Beauty ၏
Tokens:  ['န', 'ယူး', 'ဇီ', 'လန်', 'ရဲ', 'များ', 'သည်', 'Black', 'Beauty', '၏']
Labels:  ['B', 'I', 'I', 'I', 'B', 'I', 'I', 'I', 'B', 'B']
Output:  နယူးဇီလန် ရဲများသည်Black Beauty ၏ ✅

Input:   နယူးဇီလန် ရဲများသည် Black Beauty၏
Tokens:  ['န', 'ယူး', 'ဇီ', 'လန်', 'ရဲ', 'များ', 'သည်', 'Black', 'Beauty', '၏']
Labels:  ['B', 'I', 'I', 'I', 'B', 'I', 'I', 'B', 'B', 'I']
Output:  နယူးဇီလန် ရဲများသည် Black Beauty၏ ✅

Input:   Hello World
Tokens:  ['Hello', 'World']
Labels:  ['B', 'B']
Output:  Hello World ✅

Input:   မြန်မာ English mixed စာ
Tokens:  ['မြန်', 'မာ', 'English', 'mixed', 'စာ']
Labels:  ['B', 'I', 'B', 'B', 'B']
Output:  မြန်မာ English mixed စာ ✅

Inpu

In [12]:
# Try the pipeline on the first article before mapping over the whole corpus.
content = ds['train'][0]['content']

# Same line splitting as process_and_expand: break on newline runs, drop blanks.
lines = [line.strip() for line in re.split(r'\n+', content) if line.strip()]

generate_data(lines)

([['(', 'ဒု', ')', 'ချီး', 'ယား', 'တန်း', '(', 'တောင်', ')', 'ရွာ'],
  ['ကိုး', 'ကား']],
 [['B', 'I', 'I', 'B', 'I', 'I', 'B', 'I', 'I', 'I'], ['B', 'I']])

# Create the Segmentation Dataset

In [14]:
# This expands the dataset!
# process_and_expand returns one row per kept line, so with batched=True the
# output has a different row count than the input. The original columns are
# removed, leaving only the "tokens" and "labels" the function returns.
new_ds = ds['train'].map(
    process_and_expand,
    batched=True,
    remove_columns=ds['train'].column_names,
)

print(f"Original rows: {len(ds['train'])}")
print(f"New rows: {len(new_ds)}")

Map:   0%|          | 0/116340 [00:00<?, ? examples/s]

Original rows: 116340
New rows: 753639


In [15]:
# Tag vocabulary: "B" opens a word, "I" continues it. Ids follow list order.
label_names = ["B", "I"]
label2id = dict(zip(label_names, range(len(label_names))))

In [16]:
# Target schema for the published dataset: string tokens plus a parallel
# sequence of class labels whose names come from label_names.
features = Features({
    "tokens": Sequence(Value("string")),
    "segment_tags": Sequence(ClassLabel(names=label_names)),
})

In [17]:
def convert_labels_to_ids(example):
    """Add a "segment_tags" list holding the ``label2id`` id of every entry in
    ``example["labels"]``; the same (mutated) example is returned."""
    tag_ids = []
    for tag in example["labels"]:
        tag_ids.append(label2id[tag])
    example["segment_tags"] = tag_ids
    return example

In [18]:
# Guarded so the cell can be re-run: new_ds is rebound here, and once "labels"
# has been replaced by "segment_tags" a second pass would look for a column
# that no longer exists.
if "labels" in new_ds.column_names:
    new_ds = new_ds.map(convert_labels_to_ids, remove_columns=["labels"])

Map:   0%|          | 0/753639 [00:00<?, ? examples/s]

# Remove Duplicates and Split into Train/Test

In [20]:
df = new_ds.to_pandas()

# Deduplicate on the exact token sequence. A tuple key cannot collide the way a
# '|'-joined string can: '|' is one of the symbols the syllable breaker splits
# on, and non-Myanmar tokens may contain it. duplicated() flags every
# occurrence after the first, so the earliest copy of each sequence is kept.
is_repeat = df['tokens'].apply(tuple).duplicated()
df = df[~is_repeat]

# Simple split
train_df, test_df = train_test_split(df, test_size=0.05, random_state=42)

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")

Train: 544133
Test: 28639


In [21]:
# Convert back to dataset
# Convert back to dataset
# preserve_index=False: the pandas row index is not wanted as a dataset column.
deduped_ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

In [22]:
# Apply the declared schema so segment_tags is typed as a ClassLabel sequence
# (names from label_names) rather than plain integers.
deduped_ds = deduped_ds.cast(features)

Casting the dataset:   0%|          | 0/544133 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/28639 [00:00<?, ? examples/s]

# Upload to HuggingFace

In [None]:
# Publish both splits to the Hub as a private dataset repository.
deduped_ds.push_to_hub("chuuhtetnaing/myanmar-text-segmentation-dataset", private=True, commit_message="fixed english word tokens")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-text-segmentation-dataset/commit/affed528ef5a66c6cb65e50671ea0cb78ed7002d', commit_message='fixed english word tokens', commit_description='', oid='affed528ef5a66c6cb65e50671ea0cb78ed7002d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-text-segmentation-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-text-segmentation-dataset'), pr_revision=None, pr_num=None)

# Example Reconstruction Function

In [24]:
def reconstruct(tokens, labels):
    """
    Combine tokens based on B/I labels.
    Add space before 'B' tokens (except the first one).

    A token counts as 'B' when its label is either the string "B" or the
    integer id 0 that ``label2id`` assigns to it, so this definition also works
    for callers that pass raw string labels.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of labels parallel to ``tokens``.

    Returns:
        The reconstructed text as a single string.
    """
    # This definition replaces the earlier string-label reconstruct(); accepting
    # both encodings keeps cells that pass "B"/"I" working after this cell runs.
    begin_markers = ("B", 0)
    result = []
    for token, label in zip(tokens, labels):
        if label in begin_markers and result:
            result.append(" ")
        result.append(token)
    return "".join(result)

In [25]:
# Spot-check one row: rebuild its text from the tokens and integer segment tags.
reconstruct(new_ds[700]['tokens'], new_ds[700]['segment_tags'])

'သူနာပြုဌာန (FKP)'