Встановлення необхідних бібліотек

In [None]:
!pip install transformers datasets scikit-learn --quiet

Донавчання та тестування моделі

In [None]:
import json
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
import torch

# Path / identifier of the pre-trained model checkpoint to fine-tune
model_checkpoint = "ukr-models/xlm-roberta-base-uk"

# Load the tokenizer that belongs to the checkpoint. The fast implementation is
# requested because the label-alignment step below calls word_ids() on its output.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Read a JSON Lines file (one JSON object per line) into a Dataset
def load_json_dataset(path):
    """Load a JSON Lines file into a ``datasets.Dataset``.

    Every non-blank line of the file is parsed as a JSON object that must
    contain the keys ``"text"`` and ``"labels"``. The text is split on
    whitespace into ``"tokens"``; ``"labels"`` is passed through unchanged.
    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Path of the UTF-8 encoded JSON Lines file.

    Returns:
        A Dataset with the columns ``"tokens"`` and ``"labels"``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"text"`` or ``"labels"``.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines; json.loads("") would raise.
                continue
            item = json.loads(line)
            records.append({
                "tokens": item["text"].split(),
                "labels": item["labels"]
            })
    return Dataset.from_list(records)

# Load the train and test splits; despite the .json extension, load_json_dataset
# reads each file line by line (one JSON object per line).
train_dataset = load_json_dataset("train.json")
test_dataset  = load_json_dataset("test.json")

# Tokenize pre-split words and align word-level labels with sub-word tokens
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align labels to tokens.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            example) and ``"labels"`` (one list of per-word labels per
            example), as supplied by ``Dataset.map(..., batched=True)``.
        label_all_tokens: If True (the default, which matches the previous
            behaviour) every sub-word token receives the label of the word
            it belongs to. If False only the first sub-word token of each
            word is labelled and the remaining pieces receive -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per example. Positions that do not map to a word
        (``word_ids`` yields None for them) are labelled -100.

    Raises:
        IndexError: If an example has fewer labels than words.
    """
    # -100 is the value compute_metrics filters out; it is also the default
    # ignore_index of PyTorch's cross-entropy loss.
    ignore_index = -100

    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    all_labels = []
    for i in range(len(tokenized["input_ids"])):
        word_ids = tokenized.word_ids(batch_index=i)
        labels = examples["labels"][i]
        aligned = []
        prev_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Position with no source word (word_ids gives None).
                aligned.append(ignore_index)
            elif label_all_tokens or word_idx != prev_word_idx:
                aligned.append(labels[word_idx])
            else:
                # Continuation piece of a word that was already labelled.
                aligned.append(ignore_index)
            prev_word_idx = word_idx
        all_labels.append(aligned)

    tokenized["labels"] = all_labels
    return tokenized

# Tokenize both splits in batches and align the labels. The raw "tokens" column
# and the word-level "labels" column are listed in remove_columns; the "labels"
# returned by tokenize_and_align_labels (token-level) is what remains afterwards.
train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "labels"]
)
test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "labels"]
)

# Initialize the token-classification model from the checkpoint with 3 output
# labels. The classifier head is newly initialized (see the warning in the cell
# output) and is trained below.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=3)

# Data collator used by the Trainer to build (and pad) batches.
# NOTE(review): tokenize_and_align_labels already pads every example to
# max_length=128, so presumably no extra padding is added here — confirm.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Evaluation metrics passed to the Trainer
def compute_metrics(p):
    """Compute macro-averaged precision, recall and F1 over labelled tokens.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores whose last axis is the class axis; ``labels`` holds
            the reference label ids, with -100 marking positions to ignore.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    from sklearn.metrics import precision_recall_fscore_support

    scores, labels = p
    # as_tensor avoids an extra copy when the inputs are already arrays.
    pred_ids = torch.as_tensor(scores).argmax(dim=-1)
    gold = torch.as_tensor(labels)

    # Keep only positions that carry a real label; boolean masking preserves
    # row-major order, so predictions and labels stay aligned.
    keep = gold != -100
    true_labels = gold[keep].numpy()
    true_preds = pred_ids[keep].numpy()

    # zero_division=0 yields the same scores as the default while suppressing
    # the warning raised when a class is never predicted or never present.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average="macro", zero_division=0
    )
    return {"precision": precision, "recall": recall, "f1": f1}

# Training hyper-parameters
training_args = TrainingArguments(
    # Intermediate checkpoints get their own directory. The final model is
    # written to "./comma_error_detector" by trainer.save_model and that
    # directory is zipped afterwards; keeping checkpoints (with optimizer
    # state) out of it keeps the archive limited to the final model files.
    output_dir="./comma_error_detector_checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    # Training loss is reported every 100 steps (see the table in the output).
    logging_steps=100,
    # Keep at most 2 checkpoints on disk.
    save_total_limit=2,
    do_eval=True,
    report_to="none"
)

# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement and takes the same object.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Run fine-tuning
trainer.train()

# Evaluate on the test set (the same dataset that was given to the Trainer as
# eval_dataset) and print the resulting metrics dict
metrics = trainer.evaluate(test_dataset)
print("Test set metrics:", metrics)

# Save the fine-tuned model and the tokenizer into the same directory
trainer.save_model("./comma_error_detector")
tokenizer.save_pretrained("./comma_error_detector")

Map:   0%|          | 0/12464 [00:00<?, ? examples/s]

Map:   0%|          | 0/2311 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at ukr-models/xlm-roberta-base-uk and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.088
200,0.0539
300,0.0431
400,0.0334
500,0.0349
600,0.033
700,0.0267
800,0.0311
900,0.0303
1000,0.0243


Test set metrics: {'eval_loss': 0.029461093246936798, 'eval_precision': 0.7749920547501312, 'eval_recall': 0.6751486428798289, 'eval_f1': 0.7176371490230108, 'eval_runtime': 15.7941, 'eval_samples_per_second': 146.32, 'eval_steps_per_second': 18.298, 'epoch': 4.0}


('./comma_error_detector/tokenizer_config.json',
 './comma_error_detector/special_tokens_map.json',
 './comma_error_detector/sentencepiece.bpe.model',
 './comma_error_detector/added_tokens.json',
 './comma_error_detector/tokenizer.json')

Завантаження моделі шляхом архівування у .zip

In [None]:
import shutil

# Pack the saved model directory into comma_error_detector.zip; make_archive
# returns the path of the created archive, which is shown as the cell output.
shutil.make_archive("comma_error_detector", 'zip', "./comma_error_detector")

'/content/comma_error_detector.zip'