In [1]:
!pip install evaluate seqeval -qqq

In [2]:
import os

import evaluate
import numpy as np
import wandb
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Authenticate from the environment so that no secret is ever stored in the
# notebook. If a variable is unset, `None` is passed, which leaves credential
# discovery to the library (stored login or interactive prompt — confirm for
# your environment).
wandb.login(key=os.environ.get("WANDB_API_KEY"))
login(token=os.environ.get("HF_TOKEN"))

# POS-tagged corpus from the Hugging Face Hub.
ds = load_dataset("chuuhtetnaing/myanmar-pos-dataset")

# Metric object used later by `compute_metrics`.
seqeval = evaluate.load("seqeval")

# Tokenizer of the checkpoint that is fine-tuned below, plus the matching
# collator for token-classification batches.
tokenizer = AutoTokenizer.from_pretrained("chuuhtetnaing/myanmar-text-segmentation-model")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchuu[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Build both directions of the tag mapping from the dataset's `pos_tags` names:
# position in the name list -> tag string, and the inverse.
id2label = dict(enumerate(ds["train"].features["pos_tags"].feature.names))
label2id = {tag: idx for idx, tag in id2label.items()}

In [4]:
# Display the integer-id -> tag-name mapping built from the `pos_tags` feature.
id2label

{0: 'B-abb',
 1: 'B-adj',
 2: 'B-adv',
 3: 'B-conj',
 4: 'B-fw',
 5: 'B-int',
 6: 'B-n',
 7: 'B-num',
 8: 'B-part',
 9: 'B-ppm',
 10: 'B-pron',
 11: 'B-punc',
 12: 'B-sb',
 13: 'B-tn',
 14: 'B-v',
 15: 'I-abb',
 16: 'I-adj',
 17: 'I-adv',
 18: 'I-conj',
 19: 'I-fw',
 20: 'I-int',
 21: 'I-n',
 22: 'I-num',
 23: 'I-part',
 24: 'I-ppm',
 25: 'I-pron',
 26: 'I-punc',
 27: 'I-tn',
 28: 'I-v'}

In [5]:
# Display the inverse mapping (tag name -> integer id).
label2id

{'B-abb': 0,
 'B-adj': 1,
 'B-adv': 2,
 'B-conj': 3,
 'B-fw': 4,
 'B-int': 5,
 'B-n': 6,
 'B-num': 7,
 'B-part': 8,
 'B-ppm': 9,
 'B-pron': 10,
 'B-punc': 11,
 'B-sb': 12,
 'B-tn': 13,
 'B-v': 14,
 'I-abb': 15,
 'I-adj': 16,
 'I-adv': 17,
 'I-conj': 18,
 'I-fw': 19,
 'I-int': 20,
 'I-n': 21,
 'I-num': 22,
 'I-part': 23,
 'I-ppm': 24,
 'I-pron': 25,
 'I-punc': 26,
 'I-tn': 27,
 'I-v': 28}

In [6]:
# Count the tag names of the `pos_tags` feature; this value is passed to the
# model as `num_labels` when the checkpoint is loaded.
num_labels = len(
    ds["train"].features["pos_tags"].feature.names
)
num_labels

29

In [7]:
# Start from the text-segmentation checkpoint and attach a classification head
# sized for the POS tag set.
model = AutoModelForTokenClassification.from_pretrained(
    "chuuhtetnaing/myanmar-text-segmentation-model",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # The checkpoint's classifier has a different output size than
    # `num_labels`, so the head has to be re-initialised instead of loaded.
    ignore_mismatched_sizes=True,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at chuuhtetnaing/myanmar-text-segmentation-model and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([29]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([29, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tag names indexed by class id; used to turn integer ids back into the tag
# strings that seqeval expects.
label_list = ds["train"].features["pos_tags"].feature.names


def compute_metrics(p):
    """Compute the overall seqeval scores for one evaluation pass.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2, so it is presumably shaped ``(batch, seq_len, num_labels)``
            (TODO confirm against the Trainer). ``labels`` holds class ids,
            with ``-100`` marking positions that must not be scored.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from seqeval's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Filter once per sequence: keep only the positions that carry a real
        # label (-100 marks the ignored ones), then split into the two lists.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    # seqeval's default ("warn") also substitutes 0 for undefined scores but
    # emits a warning each time; passing 0 keeps the numbers and drops the noise.
    results = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0,
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [9]:
def tokenize_and_align_labels(examples, token_column="tokens", label_column="pos_tags"):
    """Tokenize pre-split words and align the word-level tags to the sub-tokens.

    Only the first sub-token of each word receives the word's tag. Special
    tokens and the remaining sub-tokens of a word get ``-100``.

    Args:
        examples: A batch (column name -> list of rows) holding the word lists
            and the per-word tag ids.
        token_column: Name of the column with the pre-split words.
        label_column: Name of the column with the per-word tag ids.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        that contains one aligned label list per example.
    """
    tokenized_inputs = tokenizer(examples[token_column], truncation=True, is_split_into_words=True)

    ignore_index = -100
    labels = []
    for batch_index, word_labels in enumerate(examples[label_column]):
        # Maps every produced token to the index of the word it came from
        # (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the current word.
                label_ids.append(ignore_index)
            else:
                # First piece of a new word carries the word's tag.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
# Run the tokenize-and-align step over the whole dataset in batches; the
# result carries the added `input_ids`, `attention_mask` and `labels` columns.
tokenized_ds = ds.map(
    tokenize_and_align_labels,
    batched=True,
)

In [11]:
# Inspect the splits and columns of the tokenized dataset.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 32777
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8195
    })
})

In [12]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="myanmar_pos_model",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=8,  # 64 * 8 = 512 sequences per optimizer step and device
    # --- evaluation / checkpointing: once per epoch, best checkpoint chosen by F1 ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_safetensors=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # --- logging ---
    logging_strategy="steps",
    logging_steps=10,
    report_to=["wandb", "tensorboard"],
    # --- Hugging Face Hub ---
    push_to_hub=True,
    hub_private_repo=True,
)

# Wire the model, data splits, collator and metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
)

In [13]:
# Open a W&B run in the "myanmar-pos-fine-tuning" project before training starts.
# NOTE(review): presumably the Trainer's "wandb" reporter logs into this run — confirm.
wandb.init(project="myanmar-pos-fine-tuning")

In [14]:
# Run the fine-tuning loop configured in `training_args`; evaluation runs once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.7611,0.541684,0.799976,0.842209,0.820549,0.85566
2,0.3736,0.317003,0.887862,0.904016,0.895866,0.912292
3,0.3015,0.276361,0.899971,0.914323,0.90709,0.921923
4,0.2589,0.256176,0.906666,0.918871,0.912728,0.92648
5,0.2504,0.24734,0.910443,0.921897,0.916134,0.92846
6,0.2209,0.240286,0.914146,0.923686,0.918891,0.931057
7,0.2253,0.234137,0.916762,0.925648,0.921184,0.932788
8,0.2361,0.231913,0.918273,0.926382,0.922309,0.933686
9,0.214,0.230518,0.918021,0.926776,0.922378,0.93361
10,0.2199,0.231061,0.918407,0.926501,0.922436,0.93368


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=650, training_loss=0.39428256841806264, metrics={'train_runtime': 1927.8091, 'train_samples_per_second': 170.022, 'train_steps_per_second': 0.337, 'total_flos': 2.392540791016854e+16, 'train_loss': 0.39428256841806264, 'epoch': 10.0})

# Add the training and evaluation results manually

In [1]:
# import json
# from huggingface_hub import HfApi
#
# # Get log history directly from trainer
# log_history = trainer.state.log_history
#
# def fmt(val):
#     """Format value, handle None"""
#     return f"{val:.4f}" if val is not None else "N/A"
#
# # Get eval logs with epoch info
# eval_logs = [log for log in log_history if "eval_loss" in log]
# train_logs = {log["step"]: log for log in log_history if "loss" in log and "eval_loss" not in log}
#
# table_rows = []
# table_rows.append("| Epoch | Training Loss | Validation Loss | Precision | Recall | F1 | Accuracy |")
# table_rows.append("|-------|---------------|-----------------|-----------|--------|------|----------|")
#
# for e in eval_logs:
#     epoch = e.get("epoch")
#     step = e.get("step")
#
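#     # Find the closest logged training loss at or before this evaluation step.
#     # NOTE(review): the fallback below only probes step, step-100, step-200, ...;
#     # with logging every 10 steps it cannot match when the evaluation step itself
#     # is not a logged step (e.g. 65), and the row then shows "N/A".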
#     t = train_logs.get(step, {}).get("loss") or next(
#         (train_logs[s]["loss"] for s in range(step, step-1000, -100) if s in train_logs),
#         None
#     )
#
#     table_rows.append(
#         f"| {epoch:.0f} | {fmt(t)} | {fmt(e.get('eval_loss'))} | {fmt(e.get('eval_precision'))} | {fmt(e.get('eval_recall'))} | {fmt(e.get('eval_f1'))} | {fmt(e.get('eval_accuracy'))} |"
#     )
#
# readme = f"""---
# license: apache-2.0
# base_model: chuuhtetnaing/myanmar-text-segmentation-model
# tags:
#   - token-classification
#   - myanmar
#   - pos-tagging
# language:
#   - my
# datasets:
#   - chuuhtetnaing/myanmar-pos-dataset
# metrics:
#   - f1
# ---
#
# # Myanmar POS Tagging Model
#
# Fine-tuned [myanmar-text-segmentation-model](https://huggingface.co/chuuhtetnaing/myanmar-text-segmentation-model) for Myanmar Part-of-Speech tagging.
#
# ## Training Results
#
# {chr(10).join(table_rows)}
#
# ## Training Details
#
# | Parameter | Value |
# |-----------|-------|
# | Base Model | chuuhtetnaing/myanmar-text-segmentation-model |
# | Total Epochs | {trainer.state.epoch:.0f} |
# | Total Steps | {trainer.state.global_step} |
# | Best F1 | {fmt(trainer.state.best_metric)} |
#
# ## Usage
# ```python
# from transformers import pipeline
#
# nlp = pipeline("token-classification", model="chuuhtetnaing/myanmar_pos_model", aggregation_strategy="simple")
# result = nlp("သူသည်ကျောင်းသို့သွားသည်။")
# ```
#
# ## Labels
#
# | Tag | Description |
# |-----|-------------|
# | n | Noun |
# | v | Verb |
# | adj | Adjective |
# | adv | Adverb |
# | pron | Pronoun |
# | num | Number |
# | punc | Punctuation |
# | part | Particle |
# | conj | Conjunction |
# | ppm | Postpositional Marker |
# | fw | Foreign Word |
# | abb | Abbreviation |
# | int | Interjection |
# | sb | Symbol |
# | tn | Text Number |
# """
#
# # Upload README
# api = HfApi()
# api.upload_file(
#     path_or_fileobj=readme.encode(),
#     path_in_repo="README.md",
#     repo_id="chuuhtetnaing/myanmar_pos_model",
#     commit_message="Add training results"
# )
# print("✅ Done!")