In [1]:
import os
import re

from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value, load_dataset
from huggingface_hub import login
from sklearn.model_selection import train_test_split


# Never hard-code the access token in the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset, token=None makes login()
# fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Load Dataset

In [2]:
# Download the "LULab/myPOS" dataset from the Hugging Face Hub; it is indexed by split name below.
ds = load_dataset("LULab/myPOS")

In [3]:
# Peek at the first training example to see the raw 'word/tag' annotation format.
ds['train'][0]['text']

'၁၉၆၂/num ခုနှစ်/n ခန့်မှန်း/v သန်းခေါင်စာရင်း/n အရ/ppm လူဦးရေ/n ၁၁၅၉၃၁/num ယောက်/part ရှိ/v သည်/ppm ။/punc'

# Cleanup Dataset

In [4]:
def parse_pos_text(text):
    """
    Parse 'word/tag word/tag ...' format into tokens and tags.

    Args:
        text: Whitespace-separated ``word/tag`` items. An item that contains
            more than one ``/`` is treated as a compound joined with ``|``
            (e.g. ``"w1/n|w2/n"``) and each part is parsed separately.

    Returns:
        A ``(tokens, pos_tags)`` tuple of two lists of equal length.
        Items without any ``/`` are kept as tokens with the tag ``"UNK"``.

    Raises:
        ValueError: If an item contains more than one ``/`` but no ``|``,
            i.e. it cannot be split into unambiguous ``word/tag`` pairs.
    """
    import warnings

    tokens = []
    pos_tags = []

    for item in text.strip().split():
        if '/' not in item:
            # No tag at all: keep the token but make the problem visible.
            warnings.warn(f"Token without a POS tag, labelled 'UNK': {item!r}")
            tokens.append(item)
            pos_tags.append("UNK")
            continue

        if item.count("/") > 1:
            if '|' not in item:
                # More than one '/' and no compound separator: ambiguous annotation.
                raise ValueError(
                    f"Cannot parse item with more than one '/': {item!r}"
                )
            # Compound item: parse every '|'-separated part on its own.
            for sub_item in item.split("|"):
                sub_tokens, sub_tags = parse_pos_text(sub_item)
                tokens.extend(sub_tokens)
                pos_tags.extend(sub_tags)
            continue

        # Exactly one '/': everything before it is the word, the rest is the tag.
        word, _, tag = item.rpartition('/')
        tokens.append(word)
        pos_tags.append(tag)

    return tokens, pos_tags



In [5]:
# Sanity-check the parser on the first training example.
text = ds['train'][0]['text']
tokens, tags = parse_pos_text(text)

print("Text:", text)
print("Tokens:", tokens)
print("Tags:", tags)

Text: ၁၉၆၂/num ခုနှစ်/n ခန့်မှန်း/v သန်းခေါင်စာရင်း/n အရ/ppm လူဦးရေ/n ၁၁၅၉၃၁/num ယောက်/part ရှိ/v သည်/ppm ။/punc
Tokens: ['၁၉၆၂', 'ခုနှစ်', 'ခန့်မှန်း', 'သန်းခေါင်စာရင်း', 'အရ', 'လူဦးရေ', '၁၁၅၉၃၁', 'ယောက်', 'ရှိ', 'သည်', '။']
Tags: ['num', 'n', 'v', 'n', 'ppm', 'n', 'num', 'part', 'v', 'ppm', 'punc']


In [6]:
# Second example: its text contains a compound item joined with '|',
# which the parser expands into two separate tokens.
text = ds['train'][1]['text']
tokens, tags = parse_pos_text(text)

print("Text:", text)
print("Tokens:", tokens)
print("Tags:", tags)

Text: လူ/n တိုင်း/part တွင်/ppm သင့်မြတ်/v လျော်ကန်/v စွာ/part ကန့်သတ်/v ထား/part သည့်/part အလုပ်/n လုပ်/v ချိန်/n အပြင်/conj ၊/punc လစာ/n နှင့်တကွ/conj အခါ/n ကာလ/n အားလျော်စွာ/ppm သတ်မှတ်/v ထား/part သည့်/part အလုပ်/n|အားလပ်ရက်/n များ/part ပါဝင်/v သည့်/part အနားယူခွင့်/n နှင့်/conj အားလပ်ခွင့်/n ခံစားပိုင်ခွင့်/n ရှိ/v သည်/ppm ။/punc
Tokens: ['လူ', 'တိုင်း', 'တွင်', 'သင့်မြတ်', 'လျော်ကန်', 'စွာ', 'ကန့်သတ်', 'ထား', 'သည့်', 'အလုပ်', 'လုပ်', 'ချိန်', 'အပြင်', '၊', 'လစာ', 'နှင့်တကွ', 'အခါ', 'ကာလ', 'အားလျော်စွာ', 'သတ်မှတ်', 'ထား', 'သည့်', 'အလုပ်', 'အားလပ်ရက်', 'များ', 'ပါဝင်', 'သည့်', 'အနားယူခွင့်', 'နှင့်', 'အားလပ်ခွင့်', 'ခံစားပိုင်ခွင့်', 'ရှိ', 'သည်', '။']
Tags: ['n', 'part', 'ppm', 'v', 'v', 'part', 'v', 'part', 'part', 'n', 'v', 'n', 'conj', 'punc', 'n', 'conj', 'n', 'n', 'ppm', 'v', 'part', 'part', 'n', 'n', 'part', 'v', 'part', 'n', 'conj', 'n', 'n', 'v', 'ppm', 'punc']


In [7]:
# Collect the indices of training rows the parser rejects so they can be
# inspected and corrected by hand.
error_rows = []

for i, example in enumerate(ds["train"]):
    try:
        parse_pos_text(example['text'])
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate.
        error_rows.append(i)

In [8]:
# Show the indices of the rows that failed to parse.
error_rows

[7120, 11832, 19935, 28042, 33593, 38921, 39267]

In [9]:
# One placeholder per failing row; each slot is filled in by hand below.
corrected_rows = [''] * len(error_rows)

In [10]:
# Inspect failing row 0.
text = ds['train'][error_rows[0]]['text']
text

'ဒီ/adj ဟာ/pron တွေ/part က/ppm တော့/part/ppm မ/part မှည့်/v သေး/part ပါ/part ဘူး/part ။/punc'

In [11]:
# Manual correction for failing row 0: the item tagged 'part/ppm' keeps only 'part'.
corrected_rows[0] = 'ဒီ/adj ဟာ/pron တွေ/part က/ppm တော့/part မ/part မှည့်/v သေး/part ပါ/part ဘူး/part ။/punc'
corrected_rows[0]

'ဒီ/adj ဟာ/pron တွေ/part က/ppm တော့/part မ/part မှည့်/v သေး/part ပါ/part ဘူး/part ။/punc'

In [12]:
# Inspect failing row 1.
text = ds['train'][error_rows[1]]['text']
text

'အခု/n|ခေတ်/n မှာ/ppm တော့/part ၊/punc နက္ခတ်တာရာ/n စုံ/v တဲ့/part အချိန်/n မှာ/ppm သစ်မြစ်/n ၊/punc/သစ်ရွက်/n အားလုံး/adj ဆေးခေါင်းခ/v တယ်/ppm လို့/part အယူအဆ/n ရှိ/v ကြ/part လို့/part ၊/punc မယ်ဇလီဖူး/n|သုပ်/n စား/v ကြ/part တာ/part လည်း/part ရှိ/v တယ်/ppm လေ/part ။/punc'

In [13]:
# Manual correction for failing row 1: the '|'-joined compounds are written as
# separate items, and the item that fused two annotations with '/' is split in two.
corrected_rows[1] = 'အခု/n ခေတ်/n မှာ/ppm တော့/part ၊/punc နက္ခတ်တာရာ/n စုံ/v တဲ့/part အချိန်/n မှာ/ppm သစ်မြစ်/n ၊/punc သစ်ရွက်/n အားလုံး/adj ဆေးခေါင်းခ/v တယ်/ppm လို့/part အယူအဆ/n ရှိ/v ကြ/part လို့/part ၊/punc မယ်ဇလီဖူး/n သုပ်/n စား/v ကြ/part တာ/part လည်း/part ရှိ/v တယ်/ppm လေ/part ။/punc'
corrected_rows[1]

'အခု/n ခေတ်/n မှာ/ppm တော့/part ၊/punc နက္ခတ်တာရာ/n စုံ/v တဲ့/part အချိန်/n မှာ/ppm သစ်မြစ်/n ၊/punc သစ်ရွက်/n အားလုံး/adj ဆေးခေါင်းခ/v တယ်/ppm လို့/part အယူအဆ/n ရှိ/v ကြ/part လို့/part ၊/punc မယ်ဇလီဖူး/n သုပ်/n စား/v ကြ/part တာ/part လည်း/part ရှိ/v တယ်/ppm လေ/part ။/punc'

In [14]:
# Inspect failing row 2.
text = ds['train'][error_rows[2]]['text']
text

'ဒီ/adj နေ့/n/pron ရုပ်ရှင်/n ဘာ/adj ကား/n ပြ/v နေ/part လဲ/part ။/punc'

In [15]:
# Manual correction for failing row 2: the item tagged 'n/pron' keeps only 'n'.
corrected_rows[2] = 'ဒီ/adj နေ့/n ရုပ်ရှင်/n ဘာ/adj ကား/n ပြ/v နေ/part လဲ/part ။/punc'
corrected_rows[2]

'ဒီ/adj နေ့/n ရုပ်ရှင်/n ဘာ/adj ကား/n ပြ/v နေ/part လဲ/part ။/punc'

In [16]:
# Inspect failing row 3.
text = ds['train'][error_rows[3]]['text']
text

'ကျွန်တော်/pron က/ppm တော့/part/ppm မြန်မာ/n လူမျိုး/n ဖြစ်/v ပါ/part တယ်/ppm ။/punc'

In [17]:
# Manual correction for failing row 3: the item tagged 'part/ppm' keeps only 'part'.
corrected_rows[3] = 'ကျွန်တော်/pron က/ppm တော့/part မြန်မာ/n လူမျိုး/n ဖြစ်/v ပါ/part တယ်/ppm ။/punc'
corrected_rows[3]

'ကျွန်တော်/pron က/ppm တော့/part မြန်မာ/n လူမျိုး/n ဖြစ်/v ပါ/part တယ်/ppm ။/punc'

In [18]:
# Inspect failing row 4.
text = ds['train'][error_rows[4]]['text']
text

'မ/part/part ဟုတ်/v ဘူး/part မေမေ/n ၊/punc ကိုဝင်းထွန်း/n နဲ့/ppm ဘာ/pron မှ/ppm မ/part ဆိုင်/v ပါ/part ဘူး/part ၊/punc အပေါ်ထပ်/n က/ppm အိမ်ဖော်/n မိလှ/n ကြောင့်/ppm ပါ/part ။/punc'

In [19]:
# Manual correction for failing row 4: the duplicated tag 'part/part' is reduced to 'part'.
corrected_rows[4] = 'မ/part ဟုတ်/v ဘူး/part မေမေ/n ၊/punc ကိုဝင်းထွန်း/n နဲ့/ppm ဘာ/pron မှ/ppm မ/part ဆိုင်/v ပါ/part ဘူး/part ၊/punc အပေါ်ထပ်/n က/ppm အိမ်ဖော်/n မိလှ/n ကြောင့်/ppm ပါ/part ။/punc'
corrected_rows[4]

'မ/part ဟုတ်/v ဘူး/part မေမေ/n ၊/punc ကိုဝင်းထွန်း/n နဲ့/ppm ဘာ/pron မှ/ppm မ/part ဆိုင်/v ပါ/part ဘူး/part ၊/punc အပေါ်ထပ်/n က/ppm အိမ်ဖော်/n မိလှ/n ကြောင့်/ppm ပါ/part ။/punc'

In [20]:
# Inspect failing row 5.
text = ds['train'][error_rows[5]]['text']
text

'ဆရာဝန်/n|များ/part/နေ့/n ရောက်/v တော့/part မယ်/ppm ။/punc'

In [21]:
# Manual correction for failing row 5: the first item, which fused three
# annotations with '|' and '/', is written as three separate items.
corrected_rows[5] = 'ဆရာဝန်/n များ/part နေ့/n ရောက်/v တော့/part မယ်/ppm ။/punc'
corrected_rows[5]

'ဆရာဝန်/n များ/part နေ့/n ရောက်/v တော့/part မယ်/ppm ။/punc'

In [22]:
# Inspect failing row 6.
text = ds['train'][error_rows[6]]['text']
text

'ဒီ/adj သူငယ်ချင်း/n က/ppm တော့/part/ppm မြန်မာပြည်/n က/ppm လာ/v ခဲ့/part ပါ/part တယ်/ppm ။/punc'

In [23]:
# Manual correction for failing row 6: the item tagged 'part/ppm' keeps only 'part'.
corrected_rows[6] = 'ဒီ/adj သူငယ်ချင်း/n က/ppm တော့/part မြန်မာပြည်/n က/ppm လာ/v ခဲ့/part ပါ/part တယ်/ppm ။/punc'
corrected_rows[6]

'ဒီ/adj သူငယ်ချင်း/n က/ppm တော့/part မြန်မာပြည်/n က/ppm လာ/v ခဲ့/part ပါ/part တယ်/ppm ။/punc'

In [24]:
def fix_annotations(example, idx):
    """Fix known bad annotations.

    Args:
        example: A dataset row (dict-like) with a ``'text'`` field.
        idx: Position of the row in the dataset.

    Returns:
        The same ``example``; its ``'text'`` is replaced by the matching entry
        of ``corrected_rows`` when ``idx`` is listed in ``error_rows``, and is
        left untouched otherwise.
    """
    # error_rows[k] is the row index whose replacement text is corrected_rows[k].
    # Pairing the two lists handles any number of corrections, including none.
    for bad_idx, fixed_text in zip(error_rows, corrected_rows):
        if idx == bad_idx:
            example['text'] = fixed_text
            break
    return example

# Apply the manual corrections; with_indices=True passes each row's index as
# the second argument of fix_annotations.
ds['train'] = ds['train'].map(fix_annotations, with_indices=True)

In [25]:
# Re-run the parser over the corrected training split to confirm that no
# unparsable rows remain.
error_rows = []

for i, example in enumerate(ds["train"]):
    try:
        parse_pos_text(example['text'])
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate.
        error_rows.append(i)

In [26]:
# Show the rows that still fail to parse after the corrections were applied.
error_rows

[]

In [27]:
# Build the POS tag vocabulary from every split.
all_tags = set()
for split in ds:
    for i, example in enumerate(ds[split]):
        try:
            _, tags = parse_pos_text(example['text'])
        except Exception as exc:
            # Report which row failed and why, then keep scanning.
            print("### ERROR ###", split, i, repr(exc))
            continue
        all_tags.update(tags)

# Sorted so that the tag <-> id mapping is deterministic across runs.
tag_list = sorted(all_tags)
tag2id = {tag: idx for idx, tag in enumerate(tag_list)}
id2tag = dict(enumerate(tag_list))

print(f"Number of POS tags: {len(tag_list)}")
print(f"Tags: {tag_list}")
print(f"tag2id: {tag2id}")

Number of POS tags: 16
Tags: ['abb', 'adj', 'adv', 'conj', 'fw', 'int', 'n', 'num', 'part', 'ppm', 'pron', 'punc', 'sb', 'tn', 'v', 'v|']
tag2id: {'abb': 0, 'adj': 1, 'adv': 2, 'conj': 3, 'fw': 4, 'int': 5, 'n': 6, 'num': 7, 'part': 8, 'ppm': 9, 'pron': 10, 'punc': 11, 'sb': 12, 'tn': 13, 'v': 14, 'v|': 15}


In [28]:
def parse_pos_text(text):
    """
    Parse 'word/tag word/tag ...' format into tokens and tags.

    Same contract as the earlier definition in this notebook, with one
    addition: the malformed tag ``"v|"`` is normalised to ``"v"``.

    Args:
        text: Whitespace-separated ``word/tag`` items. An item that contains
            more than one ``/`` is treated as a compound joined with ``|``
            (e.g. ``"w1/n|w2/n"``) and each part is parsed separately.

    Returns:
        A ``(tokens, pos_tags)`` tuple of two lists of equal length.
        Items without any ``/`` are kept as tokens with the tag ``"UNK"``.

    Raises:
        ValueError: If an item contains more than one ``/`` but no ``|``,
            i.e. it cannot be split into unambiguous ``word/tag`` pairs.
    """
    import warnings

    tokens = []
    pos_tags = []

    for item in text.strip().split():
        if '/' not in item:
            # No tag at all: keep the token but make the problem visible.
            warnings.warn(f"Token without a POS tag, labelled 'UNK': {item!r}")
            tokens.append(item)
            pos_tags.append("UNK")
            continue

        if item.count("/") > 1:
            if '|' not in item:
                # More than one '/' and no compound separator: ambiguous annotation.
                raise ValueError(
                    f"Cannot parse item with more than one '/': {item!r}"
                )
            # Compound item: parse every '|'-separated part on its own.
            for sub_item in item.split("|"):
                sub_tokens, sub_tags = parse_pos_text(sub_item)
                tokens.extend(sub_tokens)
                pos_tags.extend(sub_tags)
            continue

        # Exactly one '/': everything before it is the word, the rest is the tag.
        word, _, tag = item.rpartition('/')
        if tag == "v|":
            # A dangling '|' after the tag is an annotation artifact.
            tag = "v"
        tokens.append(word)
        pos_tags.append(tag)

    return tokens, pos_tags



In [29]:
# Rebuild the POS tag vocabulary with the updated parser.
all_tags = set()
for split in ds:
    for i, example in enumerate(ds[split]):
        try:
            _, tags = parse_pos_text(example['text'])
        except Exception as exc:
            # Report which row failed and why, then keep scanning.
            print("### ERROR ###", split, i, repr(exc))
            continue
        all_tags.update(tags)

# Sorted so that the tag <-> id mapping is deterministic across runs.
tag_list = sorted(all_tags)
tag2id = {tag: idx for idx, tag in enumerate(tag_list)}
id2tag = dict(enumerate(tag_list))

print(f"Number of POS tags: {len(tag_list)}")
print(f"Tags: {tag_list}")
print(f"tag2id: {tag2id}")

Number of POS tags: 15
Tags: ['abb', 'adj', 'adv', 'conj', 'fw', 'int', 'n', 'num', 'part', 'ppm', 'pron', 'punc', 'sb', 'tn', 'v']
tag2id: {'abb': 0, 'adj': 1, 'adv': 2, 'conj': 3, 'fw': 4, 'int': 5, 'n': 6, 'num': 7, 'part': 8, 'ppm': 9, 'pron': 10, 'punc': 11, 'sb': 12, 'tn': 13, 'v': 14}


# Expand the Text to Tokens and Tags

In [30]:
def expand_text(example):
    """Split the annotated ``'text'`` field of one row into parallel columns.

    Returns a dict with the words (``"tokens"``), their tag strings
    (``"pos_tags"``) and the ids those tags map to in the module-level
    ``tag2id`` (``"pos_tag_ids"``).
    """
    words, labels = parse_pos_text(example['text'])

    expanded = {"tokens": words, "pos_tags": labels}
    expanded["pos_tag_ids"] = [tag2id[label] for label in labels]
    return expanded

# Apply expand_text row by row to the dataset, adding the
# 'tokens', 'pos_tags' and 'pos_tag_ids' columns.
expanded_ds = ds.map(expand_text)

In [31]:
# Inspect the resulting splits and columns.
expanded_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'pos_tags', 'pos_tag_ids'],
        num_rows: 42196
    })
})

In [32]:
# Inspect the train split on its own.
expanded_ds["train"]

Dataset({
    features: ['text', 'tokens', 'pos_tags', 'pos_tag_ids'],
    num_rows: 42196
})

# Preprocess for the HuggingFace Token Classification Fine-Tuning API

In [33]:
def create_break_pattern():
    """Build the compiled regex whose matches mark where a new syllable starts.

    The single capture group matches any of:
      * a consonant that is neither preceded by the subscript sign nor
        followed by asat or the subscript sign,
      * one of the listed stand-alone characters (independent vowels, signs,
        digits, Myanmar punctuation),
      * one ASCII punctuation symbol.
    """
    virama = r'္'
    asat = r'်'
    consonants = r"က-အ"
    standalone = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။"
    ascii_symbols = r"!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~"

    alternatives = (
        r"(?<!" + virama + r")[" + consonants + r"](?![" + asat + virama + r"])",
        r"[" + standalone + r"]",
        r"[" + ascii_symbols + r"]",
    )
    return re.compile("(" + "|".join(alternatives) + ")")


# Compiled once and shared by break_syllables_myanmar.
break_pattern = create_break_pattern()


def break_syllables_myanmar(text, separator="|X|"):
    """Split ``text`` into syllables.

    ``separator`` is inserted in front of every ``break_pattern`` match and the
    text is then split on it; empty pieces are dropped.
    """
    marked = break_pattern.sub(separator + r"\1", text)
    if marked.startswith(separator):
        # Drop the separator inserted before the very first syllable.
        marked = marked[len(separator):]
    return list(filter(None, marked.split(separator)))

def is_myanmar(text):
    """Return True when ``text`` has at least one character in U+1000..U+109F."""
    return re.search(r'[\u1000-\u109F]', text) is not None

def tokenize_chunk(chunk):
    """
    Tokenize one whitespace-free chunk.

    - Empty chunk: no tokens.
    - No Myanmar characters: the chunk is returned as a single token.
    - Otherwise the chunk is cut into alternating Myanmar / non-Myanmar runs;
      Myanmar runs are syllable-broken, other runs stay whole.
    """
    if not chunk:
        return []
    if not is_myanmar(chunk):
        return [chunk]

    pieces = []
    # Each findall row is (myanmar_run, other_run); exactly one of them is non-empty.
    script_runs = re.findall(r'([\u1000-\u109F]+)|([^\u1000-\u109F]+)', chunk)
    for myanmar_run, other_run in script_runs:
        if myanmar_run:
            pieces.extend(break_syllables_myanmar(myanmar_run))
        elif other_run:
            pieces.append(other_run)
    return pieces

In [34]:
def annotate_for_ner(words, ner_labels, split_fn):
    """
    Split every word into sub-tokens and give each sub-token a BIO-style tag.

    Args:
        words: List of words, e.g. ['၁၉၆၂', 'ခုနှစ်', ...]
        ner_labels: One label per word. The value 0 means "no entity"; any
            other value (this notebook passes POS tag strings such as "n")
            is used as the tag suffix.
        split_fn: Callable that turns a word into a list of sub-tokens.

    Returns:
        tokens: Flattened list of sub-tokens.
        tags: One tag per sub-token: "O" for label 0, otherwise "B-<label>"
              for a word's first sub-token and "I-<label>" for the rest.
              Words for which split_fn returns nothing are skipped.
    """
    tokens, tags = [], []

    for word, label in zip(words, ner_labels):
        pieces = split_fn(word)
        if not pieces:
            continue

        tokens.extend(pieces)
        if label == 0:
            piece_tags = ["O"] * len(pieces)
        else:
            piece_tags = [f"B-{label}"] + [f"I-{label}"] * (len(pieces) - 1)
        tags.extend(piece_tags)

    return tokens, tags

# Quick manual check on the first sentence. The per-word labels are POS tag
# strings, so every sub-token receives a B-<tag> or I-<tag> label.
words = ['၁၉၆၂', 'ခုနှစ်', 'ခန့်မှန်း', 'သန်းခေါင်စာရင်း', 'အရ', 'လူဦးရေ', '၁၁၅၉၃၁', 'ယောက်', 'ရှိ', 'သည်', '။']
ner_labels = ["num", "n", "v", "n", "ppm", "n", "num", "part", "v", "ppm", "punc"]

tokens, tags = annotate_for_ner(words, ner_labels, tokenize_chunk)

In [35]:
def process_and_expand(examples):
    """Batched map function: expand each row's words into sub-tokens with B-/I- tags.

    Reads the ``'tokens'`` and ``'pos_tags'`` columns of the batch and returns
    the new ``"tokens"`` / ``"ner_tags"`` columns, one list per input row.
    """
    expanded_rows = [
        annotate_for_ner(row_words, row_tags, tokenize_chunk)
        for row_words, row_tags in zip(examples['tokens'], examples['pos_tags'])
    ]

    return {
        "tokens": [sub_tokens for sub_tokens, _ in expanded_rows],
        "ner_tags": [sub_tags for _, sub_tags in expanded_rows],
    }

In [36]:
# Replace the word-level columns with the sub-token level 'tokens' / 'ner_tags'.
# NOTE(review): this rebinds expanded_ds from the DatasetDict to its train
# Dataset, so the cell is not safe to run twice without re-running the cell
# that created expanded_ds.
expanded_ds = expanded_ds["train"].map(
    process_and_expand,
    batched=True,
    remove_columns=expanded_ds["train"].column_names,
)

In [37]:
# Confirm that only the 'tokens' and 'ner_tags' columns remain.
expanded_ds

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 42196
})

In [38]:
# Rebuild the label vocabulary, this time over the B-/I- prefixed tags.
# This overwrites the word-level tag_list / tag2id / id2tag built earlier.
all_tags = set()
for example in expanded_ds:
    all_tags.update(example['ner_tags'])

# Sorted so that the tag <-> id mapping is deterministic across runs.
tag_list = sorted(all_tags)
tag2id = {tag: idx for idx, tag in enumerate(tag_list)}
id2tag = dict(enumerate(tag_list))

print(f"Number of POS tags: {len(tag_list)}")
print(f"Tags: {tag_list}")
print(f"tag2id: {tag2id}")

Number of POS tags: 29
Tags: ['B-abb', 'B-adj', 'B-adv', 'B-conj', 'B-fw', 'B-int', 'B-n', 'B-num', 'B-part', 'B-ppm', 'B-pron', 'B-punc', 'B-sb', 'B-tn', 'B-v', 'I-abb', 'I-adj', 'I-adv', 'I-conj', 'I-fw', 'I-int', 'I-n', 'I-num', 'I-part', 'I-ppm', 'I-pron', 'I-punc', 'I-tn', 'I-v']
tag2id: {'B-abb': 0, 'B-adj': 1, 'B-adv': 2, 'B-conj': 3, 'B-fw': 4, 'B-int': 5, 'B-n': 6, 'B-num': 7, 'B-part': 8, 'B-ppm': 9, 'B-pron': 10, 'B-punc': 11, 'B-sb': 12, 'B-tn': 13, 'B-v': 14, 'I-abb': 15, 'I-adj': 16, 'I-adv': 17, 'I-conj': 18, 'I-fw': 19, 'I-int': 20, 'I-n': 21, 'I-num': 22, 'I-part': 23, 'I-ppm': 24, 'I-pron': 25, 'I-punc': 26, 'I-tn': 27, 'I-v': 28}


In [39]:
# Target schema for the published dataset. The ClassLabel names follow
# tag_list, the same order tag2id was enumerated from, so the integer ids
# written by convert_labels_to_ids line up with the class names.
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=tag_list)),
})

In [40]:
def convert_labels_to_ids(example):
    """Replace the string labels in ``example["ner_tags"]`` with their ids from ``tag2id``."""
    label_ids = []
    for label in example["ner_tags"]:
        label_ids.append(tag2id[label])
    example["ner_tags"] = label_ids
    return example

In [41]:
# Convert the string labels to integer ids.
# NOTE(review): expanded_ds is rebound in place, so running this cell a second
# time would look up integer ids in tag2id, whose keys are tag strings.
expanded_ds = expanded_ds.map(convert_labels_to_ids)

In [42]:
# Inspect the dataset after the label conversion.
expanded_ds

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 42196
})

# Remove Duplicates and Split into Train/Test

In [43]:
df = expanded_ds.to_pandas()
before_size = df.shape[0]

# Deduplicate on the exact token sequence, keeping the first occurrence.
# A tuple key is used rather than a '|'-joined string because a token could
# itself contain '|', in which case two different sequences could produce the
# same joined string and one of them would be dropped by mistake.
is_duplicate = df['tokens'].apply(tuple).duplicated()
df = df[~is_duplicate]
after_size = df.shape[0]

# Simple split (fixed random_state for reproducibility)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Train: {len(train_df)}")
print(f"Test: {len(test_df)}")
print("Total row removed:", before_size - after_size)

Train: 32777
Test: 8195
Total row removed: 1224


In [44]:
# Convert the pandas frames back into a DatasetDict with train/test splits.
# preserve_index=False keeps the pandas index out of the dataset columns.
deduped_ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

In [45]:
# Cast to the declared schema so 'ner_tags' is stored as ClassLabel ids.
deduped_ds = deduped_ds.cast(features)

Casting the dataset:   0%|          | 0/32777 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8195 [00:00<?, ? examples/s]

# Upload to HuggingFace

In [46]:
# Upload both splits to the Hub as a private dataset repository.
deduped_ds.push_to_hub("chuuhtetnaing/myanmar-pos-dataset", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-pos-dataset/commit/b1e24868e34851393306581bf3ef7f38102219fd', commit_message='Upload dataset', commit_description='', oid='b1e24868e34851393306581bf3ef7f38102219fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-pos-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-pos-dataset'), pr_revision=None, pr_num=None)

# Rename column

In [1]:
from datasets import load_dataset

# Reload the published dataset (the execution count restarts at 1 here).
ds = load_dataset("chuuhtetnaing/myanmar-pos-dataset")

# The labels are POS tags, so give the column a matching name.
ds = ds.rename_column("ner_tags", "pos_tags")
print(ds["train"].column_names)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.49M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/373k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32777 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8195 [00:00<?, ? examples/s]

['tokens', 'pos_tags']


In [2]:
# Push the renamed dataset back to the same Hub repository with an explicit commit message.
ds.push_to_hub("chuuhtetnaing/myanmar-pos-dataset", commit_message="rename the column from ner_tags to pos_tags")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/myanmar-pos-dataset/commit/e1c470ef21e10bc300ce42dca919d36e144a1784', commit_message='rename the column from ner_tags to pos_tags', commit_description='', oid='e1c470ef21e10bc300ce42dca919d36e144a1784', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/myanmar-pos-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/myanmar-pos-dataset'), pr_revision=None, pr_num=None)