In [1]:
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value, load_dataset
from sklearn.model_selection import train_test_split
import re

# Preprocessing Functions

In [2]:
def create_break_pattern():
    """Compile the regular expression that marks Myanmar syllable starts.

    The pattern consists of a single capture group that matches either:

    * a consonant in the range က-အ that is NOT preceded by the subscript
      (stacking) sign and NOT followed by the a-that or subscript sign, or
    * one of a fixed set of other characters, which includes the Myanmar
      digits ၀-၉ and the punctuation marks ၊ and ။.

    Latin letters and digits are not part of the pattern.

    Returns:
        A compiled ``re.Pattern`` whose group 1 is the character that begins
        a new syllable.
    """
    consonants = r"က-အ"
    standalone_chars = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။"
    stacking_sign = r'္'
    asat = r'်'

    # A consonant that is stacked under the previous one, or that is "killed"
    # by a following a-that / stacking sign, belongs to the current syllable
    # and therefore must not start a new one.
    syllable_start = (
        rf"((?<!{stacking_sign})[{consonants}](?![{asat}{stacking_sign}])"
        rf"|[{standalone_chars}])"
    )
    return re.compile(syllable_start)

def break_syllables(line, break_pattern, separator):
    """Apply syllable breaking rules to a line.

    Args:
        line: Text to segment. Leading/trailing whitespace is removed and
            every run of whitespace is collapsed to a single space first.
        break_pattern: Compiled pattern whose capture group 1 matches the
            character that starts a syllable.
        separator: String inserted in front of every match. It is inserted
            literally, so it may safely contain backslashes.

    Returns:
        The line with ``separator`` placed before each syllable start, without
        a leading separator.
    """
    line = re.sub(r'\s+', ' ', line.strip())  # Normalize space

    # A callable replacement keeps the separator literal. Building a template
    # string (separator + r"\1") would let a backslash in the separator be
    # parsed as an escape or group reference and corrupt the output.
    segmented_line = break_pattern.sub(
        lambda match: separator + (match.group(1) or ""), line
    )

    # Remove the leading delimiter if it exists
    if segmented_line.startswith(separator):
        segmented_line = segmented_line[len(separator):]

    # Replace delimiter+space+delimiter with a single space
    double_delimiter = separator + " " + separator
    segmented_line = segmented_line.replace(double_delimiter, " ")

    return segmented_line

# Compile the syllable pattern once; mm_split reuses it on every call.
break_pattern = create_break_pattern()

def mm_split(text):
    """Split ``text`` into a list of Myanmar syllable strings.

    The text is segmented with ``break_syllables`` using the internal marker
    ``|X|`` and then split on that marker, so input that already contains the
    literal ``|X|`` is split there as well. The result always has at least one
    element (an empty input yields ``['']``).
    """
    delimiter = "|X|"
    segmented = break_syllables(text.strip(), break_pattern, delimiter)
    return segmented.split(delimiter)

In [3]:
def bioes_to_bio(tag):
    """Map a BIOES tag onto the BIO scheme.

    ``E-X`` becomes ``I-X`` and ``S-X`` becomes ``B-X``; any other tag
    (``B-X``, ``I-X``, ``O``, ...) is returned unchanged.
    """
    bio_prefix = {'E-': 'I-', 'S-': 'B-'}.get(tag[:2])
    return tag if bio_prefix is None else bio_prefix + tag[2:]

def expand_tag_for_syllables(tag, count):
    """Expand a single tag for multiple syllables with correct BIO format.

    Args:
        tag: BIO tag of the whole word (``B-X``, ``I-X`` or ``O``).
        count: Number of syllables the word was split into.

    Returns:
        A list of exactly ``count`` tags. For a ``B-X`` tag only the first
        syllable keeps ``B-X`` and the rest become ``I-X``; any other tag is
        simply repeated. A non-positive ``count`` yields an empty list.
    """
    if count <= 0:
        # Nothing to label. Without this guard a B- tag would still produce
        # one element, breaking the "one tag per syllable" invariant.
        return []

    if tag.startswith('B-'):
        # B-X followed by I-X for remaining syllables
        return [tag] + ['I-' + tag[2:]] * (count - 1)

    # I-X and O tags are repeated unchanged
    return [tag] * count

def load_conll(filepath):
    """Read a tab-separated CoNLL-style file into syllable-level sentences.

    Blank lines end the current sentence and lines starting with ``#`` are
    skipped. Every other line is split on tabs and its first three columns are
    read as token, POS tag and NER tag. The token is broken into syllables with
    ``mm_split``; the POS tag is repeated for each syllable, and the NER tag is
    converted from BIOES to BIO and expanded so that only the first syllable of
    a word can carry a ``B-`` prefix.

    Args:
        filepath: Path of the UTF-8 encoded input file.

    Returns:
        A list with one dict per sentence holding the parallel lists
        ``'tokens'``, ``'pos'`` and ``'ner'``.
    """
    sentences = []
    tokens, pos_tags, ner_tags = [], [], []

    def flush():
        """Store the sentence collected so far (if any) and start a new one."""
        nonlocal tokens, pos_tags, ner_tags
        if tokens:
            sentences.append({'tokens': tokens, 'pos': pos_tags, 'ner': ner_tags})
            tokens, pos_tags, ner_tags = [], [], []

    with open(filepath, 'r', encoding='utf-8') as conll_file:
        for raw_line in conll_file:
            stripped = raw_line.strip()

            if not stripped:
                # Blank line marks a sentence boundary
                flush()
                continue
            if stripped.startswith('#'):
                continue

            columns = stripped.split('\t')
            syllables = mm_split(columns[0])
            syllable_count = len(syllables)

            # Every syllable inherits the POS tag of the word it came from
            syllable_pos = [columns[1]] * syllable_count
            # Convert BIOES to BIO, then expand correctly
            syllable_ner = expand_tag_for_syllables(
                bioes_to_bio(columns[2]), syllable_count
            )

            tokens.extend(syllables)
            pos_tags.extend(syllable_pos)
            ner_tags.extend(syllable_ner)

    # The file may not end with a blank line
    flush()

    return sentences

# Create the Hugging Face Dataset

In [4]:
# pandas backs the de-duplication / split cell further down.
import pandas as pd

# Parse the CoNLL file (path is relative to the working directory) into a list
# of per-sentence dicts with syllable-level 'tokens', 'pos' and 'ner' lists.
data = load_conll('myNER-7tags_ver.1.0.conll')


In [5]:
# Collect every distinct NER tag that occurs anywhere in the corpus.
all_tags = {tag for sentence in data for tag in sentence["ner"]}

In [6]:
def sort_tags(tags):
    """Return the NER tags in a deterministic label order.

    Tags are grouped by entity name in ascending string order. Within one
    entity ``B-`` comes before ``I-`` and any other prefix comes after both
    (ordered by the prefix itself). The outside tag ``O`` is always last.

    Args:
        tags: Iterable of tag strings such as ``'B-PER'``, ``'I-LOC'``, ``'O'``.

    Returns:
        A new sorted list; the input is left untouched.

    Raises:
        ValueError: If a tag other than ``'O'`` contains no hyphen.
    """
    prefix_order = {'B': 0, 'I': 1}

    def tag_key(tag):
        if tag == 'O':
            # The leading 1 places 'O' after every entity tag regardless of how
            # the entity name is spelled (a string sentinel such as 'ZZZ' would
            # sort before lowercase entity names).
            return (1, '', 0, '')
        # Split on the first hyphen only so entity names may contain hyphens.
        prefix, entity = tag.split('-', 1)
        # The prefix itself is the final tie-breaker, so unknown prefixes do
        # not depend on the (arbitrary) input order.
        return (0, entity, prefix_order.get(prefix, 2), prefix)

    return sorted(tags, key=tag_key)

# Fix the label order once, then derive both lookup directions from it:
# a tag's position in tag_list is its integer id.
tag_list = sort_tags(list(all_tags))
tag2id = dict(zip(tag_list, range(len(tag_list))))
id2tag = {tag_id: tag_name for tag_name, tag_id in tag2id.items()}

In [7]:
# Show the final label order; a tag's list position is its id in tag2id.
tag_list

['B-DATE',
 'I-DATE',
 'B-LOC',
 'I-LOC',
 'B-NUM',
 'I-NUM',
 'B-ORG',
 'I-ORG',
 'B-PER',
 'I-PER',
 'B-TIME',
 'I-TIME',
 'O']

In [8]:
# Encode each sentence's BIO tag strings as integer ids. This adds a
# "ner_tags" key to every sentence dict in place.
for row in data:
    row["ner_tags"] = [tag2id[label] for label in row['ner']]

In [9]:
# Schema of the published dataset: a sequence of syllable strings plus a
# parallel sequence of NER labels declared as ClassLabel over tag_list, so the
# integer ids written into "ner_tags" line up with the label names.
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=tag_list)),
})

In [10]:
df = pd.DataFrame(data)
before_size = len(df)

# Two sentences are duplicates when their syllables concatenate to the same
# string; the first occurrence is kept. The string-valued 'pos' and 'ner'
# columns are dropped here, leaving only 'tokens' and 'ner_tags'.
df = (
    df.loc[~df['tokens'].map(''.join).duplicated(keep='first')]
    .drop(columns=['pos', 'ner'])
)
after_size = len(df)

# Simple 80/20 split with a fixed seed
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")
print("Total row removed:", before_size - after_size)

Train: 12825
Test: 3207
Total row removed: 572


In [11]:
# Convert back to dataset
deduped_ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

In [12]:
# Cast both splits to the declared `features` schema so "ner_tags" is typed as
# ClassLabel instead of plain integers.
deduped_ds = deduped_ds.cast(features)

Casting the dataset:   0%|          | 0/12825 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3207 [00:00<?, ? examples/s]

In [13]:
# Inspect the splits: column names and row counts.
deduped_ds

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 12825
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3207
    })
})

In [14]:
# Inspect the training split on its own.
deduped_ds['train']

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 12825
})

# Upload to the Hugging Face Hub

In [15]:
# Publish both splits to the Hub repo as a private dataset.
# NOTE(review): presumably requires being logged in with a Hugging Face token
# that has write access to this repo — confirm before re-running.
deduped_ds.push_to_hub("chuuhtetnaing/myanmar-ner-dataset", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-ner-dataset/commit/1cd21f97fd84932b0860c018542f75d3ec755245', commit_message='Upload dataset', commit_description='', oid='1cd21f97fd84932b0860c018542f75d3ec755245', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-ner-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-ner-dataset'), pr_revision=None, pr_num=None)