In [1]:
from collections import Counter
from typing import Literal, List, Tuple, cast, Dict

import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    BatchEncoding,
    BertTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

In [2]:
# Show whether a CUDA device is visible to PyTorch before any training starts;
# create_trainer further down requests fp16 mixed precision.
torch.cuda.is_available()

True

In [3]:
# A parsed corpus: one (words, part-of-speech tags) pair of parallel lists per sentence.
PrimitiveDataset = List[Tuple[List[str], List[str]]]

# Tag inventory. A tag's position in this listing is its integer class id, so
# the reverse mapping and the tag count below are derived instead of repeated.
TAG2IDX = {
    tag: idx
    for idx, tag in enumerate(
        [
            "ADP",
            "NOUN",
            "PUNCT",
            "VERB",
            "AUX",
            "PRON",
            "ADJ",
            "PART",
            "ADV",
            "INTJ",
            "DET",
            "PROPN",
            "CCONJ",
            "NUM",
            "SCONJ",
            "X",
        ]
    )
}
IDX2TAG = {idx: tag for tag, idx in TAG2IDX.items()}

NUM_TAGS = len(TAG2IDX)

In [4]:
def parse_dataset(
    dataset: Literal["train", "dev", "test"],
) -> "PrimitiveDataset":
    """Read one split of the corpus from ``./corpus/bg_btb-ud-{dataset}.conllu``.

    The file is split into sentence blocks on blank lines. Within a block,
    rows starting with ``#`` are comments; every other row is tab-separated
    and contributes its 2nd column (word form) and 4th column (POS tag).

    Args:
        dataset: Which split to load: ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list with one ``(words, pos_tags)`` pair of parallel lists per
        sentence. Blocks that contain no token rows are left out.

    Raises:
        AssertionError: If ``dataset`` is not one of the three split names.
        FileNotFoundError: If the corpus file for the split does not exist.
        ValueError: If a token row has fewer than four tab-separated columns.
    """
    assert dataset in ["train", "dev", "test"]

    sentences = []

    # The corpus is non-ASCII text, so pin the encoding instead of relying on
    # the platform default (which is not UTF-8 everywhere).
    with open(f"./corpus/bg_btb-ud-{dataset}.conllu", encoding="utf-8") as file:
        for sent in file.read().split("\n" * 2):
            sent_words = []
            sent_pos_types = []

            for row in sent.split("\n"):
                # Blank rows (e.g. surplus newlines at the end of the file)
                # and "#" comment rows carry no token.
                if not row or row[0] == "#":
                    continue
                token_id, word, _, pos_type, *_ = row.split("\t")
                # CoNLL-U multiword-token ranges ("1-2") and empty nodes
                # ("1.1") are not syntactic words, so they get no label.
                if "-" in token_id or "." in token_id:
                    continue
                sent_words.append(word)
                sent_pos_types.append(pos_type)

            # A block made only of comments/blank rows is not a sentence.
            if sent_words:
                sentences.append((sent_words, sent_pos_types))

    return sentences

In [5]:
def count_tokens(dataset: PrimitiveDataset) -> Counter:
    """Tally the entries found in the second list of every sentence pair.

    For data produced by ``parse_dataset`` the second list holds the
    part-of-speech tags, so the result maps each tag to its frequency.
    """
    tally = Counter()
    for _, sent_tags in dataset:
        tally.update(sent_tags)
    return tally

In [6]:
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: Mapping with ``"tokens"`` (one list of words per sentence)
            and ``"ner_tags"`` (one list of per-word labels per sentence).
        tokenizer: Callable invoked with ``truncation=True`` and
            ``is_split_into_words=True``; its result must expose
            ``word_ids(batch_index=...)`` and support item assignment.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For each
        sentence, the first sub-token of every word carries that word's
        label; special tokens and continuation sub-tokens carry ``-100``.
    """
    encoding = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    def align(batch_idx, word_labels):
        # Walk the sub-tokens of one sentence, tracking which word the
        # previous sub-token belonged to.
        aligned = []
        last_word = None
        for word in encoding.word_ids(batch_index=batch_idx):
            # Only the first sub-token of a real word gets a label.
            starts_word = word is not None and word != last_word
            aligned.append(word_labels[word] if starts_word else -100)
            last_word = word
        return aligned

    encoding["labels"] = [
        align(batch_idx, word_labels)
        for batch_idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoding

In [7]:
class POSDataset(Dataset):
    """Map-style dataset of tokenized sentences with sub-token-aligned POS labels.

    Every sentence is encoded once, eagerly, in ``__init__``. Each item is a
    dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """

    def __init__(self, pds: PrimitiveDataset, tokenizer):
        """Encode every ``(words, tags)`` sentence of ``pds``.

        Args:
            pds: Parsed corpus, one ``(words, tags)`` pair per sentence.
            tokenizer: Tokenizer forwarded to ``tokenize_and_align_labels``,
                which calls ``word_ids()`` on its output.
                NOTE(review): in transformers 4.x only fast tokenizers
                provide ``word_ids()`` - confirm the tokenizer passed in.

        Raises:
            KeyError: If a sentence contains a tag missing from ``TAG2IDX``.
        """
        self.tokenizer = tokenizer
        self.pds = pds
        self.tds = []
        for sent_words, sent_tags in self.pds:
            # tokenize_and_align_labels works on batches and indexes labels
            # per word, so wrap the sentence as a batch of one and turn the
            # string tags into the integer class ids the model is trained on.
            pbe = tokenize_and_align_labels(
                {
                    "tokens": [sent_words],
                    "ner_tags": [[TAG2IDX[tag] for tag in sent_tags]],
                },
                self.tokenizer,
            )
            self.tds.append(
                {
                    # [0] unwraps the single-sentence batch again.
                    "input_ids": torch.tensor(pbe["input_ids"][0]),
                    "attention_mask": torch.tensor(pbe["attention_mask"][0]),
                    "labels": torch.tensor(pbe["labels"][0]),
                }
            )

    def __len__(self):
        """Return the number of encoded sentences."""
        return len(self.tds)

    def __getitem__(self, idx):  # type: ignore (the LSP complains)
        """Return the encoded sentence stored at position ``idx``."""
        return self.tds[idx]

In [8]:
def compute_metrics(eval_prediction):
    """Compute token-level accuracy over the positions that carry a real label.

    Args:
        eval_prediction: Unpackable as ``(predictions, label_ids)``.
            ``predictions`` holds per-class scores with the class axis last;
            ``label_ids`` holds the gold class ids, with ``-100`` marking
            positions to ignore (special tokens, continuation sub-tokens,
            padding).

    Returns:
        ``{"accuracy": float}``: the fraction of non-ignored positions whose
        highest-scoring class equals the gold label, or ``0.0`` when no
        position is scored.
    """
    (predictions, label_ids) = eval_prediction
    predicted_ids = np.argmax(predictions, axis=-1)
    label_ids = np.asarray(label_ids)

    # The ignore marker lives in the labels; argmax output is never -100.
    scored = label_ids != -100
    total = int(scored.sum())
    correct = int((predicted_ids[scored] == label_ids[scored]).sum())

    # Guard against an evaluation set in which every position is ignored.
    return {"accuracy": correct / total if total else 0.0}

In [9]:
def create_trainer(model, train_dataset, test_dataset, tokenzier):
    """Build a Hugging Face ``Trainer`` for fine-tuning ``model``.

    Args:
        model: Model to train.
        train_dataset: Dataset used for training.
        test_dataset: Dataset evaluated after every epoch
            (``eval_strategy="epoch"``).
        tokenzier: Tokenizer given to the trainer as ``processing_class`` and
            to the padding data collator. The misspelt parameter name is kept
            so that existing keyword callers keep working.

    Returns:
        The configured ``Trainer``; training is not started here.
    """
    # Bind the argument locally so the function uses the tokenizer it was
    # given instead of depending on a notebook-global named `tokenizer`.
    tokenizer = tokenzier

    # https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
    training_args = TrainingArguments(
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=2e-5,
        weight_decay=0.01,
        # Evaluate and checkpoint on the same schedule so the best checkpoint
        # can be restored when training ends.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        warmup_ratio=0.1,
        fp16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=compute_metrics,
    )

    return trainer

In [10]:
# parse_dataset reads the corpus from ./corpus/bg_btb-ud-{split}.conllu, so
# those files must be in place before this cell is run.
MODEL_NAME = "bert-base-multilingual-cased"

# tokenize_and_align_labels calls word_ids() on the tokenizer output, which is
# only available with fast tokenizers (in transformers 4.x the plain
# BertTokenizer is the slow implementation). AutoTokenizer loads the fast
# variant by default.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_dataset = POSDataset(parse_dataset("train"), tokenizer)
# NOTE(review): create_trainer restores the best checkpoint according to this
# evaluation set; consider passing the "dev" split here and keeping "test"
# for a final, untouched evaluation.
test_dataset = POSDataset(parse_dataset("test"), tokenizer)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_TAGS,
    id2label=IDX2TAG,
    label2id=TAG2IDX,
)
print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")
trainer = create_trainer(model, train_dataset, test_dataset, tokenizer)
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: './corpus/bg_btb-ud-train.conllu'