In [68]:
%%capture
!pip install corus
!pip install datasets
!pip install seqeval
!pip install evaluate


import evaluate
import numpy as np
import matplotlib.pyplot as plt

from nltk.tokenize import WordPunctTokenizer
from nltk.translate.bleu_score import corpus_bleu

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils


from IPython.display import clear_output
from tqdm import tqdm, trange
from corus import load_ne5
import corus.sources.ne5 as ne5

from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer,
                          AutoModelForTokenClassification,
                          AutoModelForMaskedLM,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForTokenClassification,
                          DataCollatorForLanguageModeling)
from datasets import Dataset, DatasetDict, ClassLabel
from seqeval.metrics import classification_report, f1_score

%matplotlib inline

np.random.seed(998)

import warnings
warnings.filterwarnings('ignore')

### Загрузка данных

In [6]:
!wget http://www.labinform.ru/pub/named_entities/collection5.zip

--2025-04-15 19:16:27--  http://www.labinform.ru/pub/named_entities/collection5.zip
Resolving www.labinform.ru (www.labinform.ru)... 95.181.230.181
Connecting to www.labinform.ru (www.labinform.ru)|95.181.230.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1899530 (1.8M) [application/zip]
Saving to: ‘collection5.zip’


2025-04-15 19:16:29 (1.13 MB/s) - ‘collection5.zip’ saved [1899530/1899530]



In [8]:
%%capture
!unzip collection5.zip
!rm collection5.zip

In [9]:
def load_text_utf8(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

ne5.load_text = load_text_utf8

data_iter = load_ne5("Collection5/")
data = list(data_iter)

## Предобработка данных и разделение на трейн/тест

In [38]:
examples = []
for doc_id, text, spans in data:
    examples.append({
        "id": doc_id,
        "text": text,
        "entities": spans
    })


labels_list = sorted(set(span.type for example in examples for span in example['entities']))

In [39]:
labels_list

['GEOPOLIT', 'LOC', 'MEDIA', 'ORG', 'PER']

In [37]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2", use_fast=True)

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [40]:
label2id = {'O': 0}
for label in labels_list:
    label2id[f'B-{label}'] = len(label2id)
    label2id[f'I-{label}'] = len(label2id)
id2label = {v: k for k, v in label2id.items()}

# переводим ne5 в список словарей + создаем bio-теги
def span_to_dict(span):
    return {'start': span.start, 'end': span.stop, 'label': span.type}

def create_bio_tags(text, spans, tokens, offsets):
    tags = ['O'] * len(tokens)
    for span in spans:
        span_start = span['start']
        span_end = span['end']
        label = span['label']

        for i, (token_start, token_end) in enumerate(offsets):
            if token_start >= span_end or token_end <= span_start:
                continue  # не пересекается
            if span_start <= token_start < span_end:
                if tags[i] == 'O':
                    tags[i] = f'B-{label}'
                else:
                    tags[i] = f'I-{label}'
            elif span_start < token_end <= span_end:
                tags[i] = f'I-{label}'

    return [label2id[tag] for tag in tags]


In [41]:
# true examples и токенизация с сохранением смещений
examples = []
for record in data:
    text = record.text
    spans = [span_to_dict(span) for span in record.spans]

    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenized['input_ids'])
    offsets = tokenized['offset_mapping']

    if tokens[0] in tokenizer.special_tokens_map.values():
        tokens = tokens[1:-1]
        offsets = offsets[1:-1]

    tags = create_bio_tags(text, spans, tokens, offsets)
    examples.append({
        'tokens': tokens,
        'ner_tags': tags
    })

dataset = Dataset.from_list(examples)

In [43]:
print("label list:")
print(list(id2label.values()))

label list:
['O', 'B-GEOPOLIT', 'I-GEOPOLIT', 'B-LOC', 'I-LOC', 'B-MEDIA', 'I-MEDIA', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']


In [45]:
train_test = dataset.train_test_split(test_size=0.2, seed=42)
dataset2 = DatasetDict({
    "train": train_test["train"],
    "test": train_test["test"]
})

## Дообучение модели

In [46]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)

    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_datasets = dataset2.map(tokenize_and_align_labels, batched=False)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [47]:
label_list = list(id2label.values())

In [59]:
model_name = "cointegrated/rubert-tiny2"
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label2id
)

In [60]:
# метрики
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [61]:
training_args = TrainingArguments(
    output_dir="./result",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=3000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1",
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=2,
)


In [62]:
# дообучение
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

dummy_metrics = trainer.evaluate(tokenized_datasets["test"])


In [63]:
print("Метрики до дообучения:")
for i, j in dummy_metrics.items():
  print(i, j)




Метрики до дообучения:
eval_loss 2.5015883445739746
eval_model_preparation_time 0.0008
eval_precision 0.03963084713232742
eval_recall 0.12314427103159498
eval_f1 0.05996385486337447
eval_accuracy 0.05031909671084929
eval_runtime 2.5214
eval_samples_per_second 79.32
eval_steps_per_second 5.156


In [64]:
trainer.train()
metrics = trainer.evaluate(tokenized_datasets["test"])

Step,Training Loss,Validation Loss,Model Preparation Time,Precision,Recall,F1,Accuracy
100,1.266,0.879546,0.0008,0.633126,0.187222,0.288988,0.763214
200,0.7931,0.675722,0.0008,0.616581,0.404834,0.48876,0.799705
300,0.6536,0.577374,0.0008,0.66199,0.526837,0.586731,0.824644
400,0.5698,0.522005,0.0008,0.670916,0.611027,0.639572,0.837342
500,0.5108,0.478951,0.0008,0.694771,0.656643,0.675169,0.848454
600,0.4635,0.447757,0.0008,0.712896,0.67409,0.69295,0.855457
700,0.4282,0.431328,0.0008,0.709839,0.700292,0.705033,0.857748
800,0.4006,0.41278,0.0008,0.723966,0.708349,0.716072,0.863099
900,0.375,0.401829,0.0008,0.726395,0.720404,0.723387,0.865243
1000,0.3558,0.393306,0.0008,0.735324,0.724781,0.730015,0.868336


In [65]:
print("Метрики после дообучения:")
for i, j in metrics.items():
  print(i, j)

Метрики после дообучения:
eval_loss 0.3703927993774414
eval_model_preparation_time 0.0008
eval_precision 0.7371127709073099
eval_recall 0.7638624540032991
eval_f1 0.7502492522432702
eval_accuracy 0.8740631647848143
eval_runtime 2.4097
eval_samples_per_second 82.998
eval_steps_per_second 5.395
epoch 60.0


После 1500 эпох точность и f1 выходят на плато и до 10000 эпох почти ничего не меняется в лучшую сторону даже при настройке других параметров, оставила 3000, дальше попробуем добавить MLM веса

## Дообучение модели с MLM

In [71]:
# датасет без тегов и коллейтор для mlm
train, test = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

mlm_texts = [i.text for i in train]
mlm_dataset = Dataset.from_dict({"text": mlm_texts})

def mlm_tokenize(texts):
    return tokenizer(
        texts["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_special_tokens_mask=True
    )

mlm_dataset = mlm_dataset.map(mlm_tokenize, batched=True, remove_columns=["text"])

# базовая модель та же что и для ner
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)
mlm_data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [75]:
training_args = TrainingArguments(
    output_dir="./mlm_result",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=3000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1",
    report_to="none",
    save_total_limit=2)

trainer = Trainer(
    model=mlm_model,
    args=training_args,
    train_dataset=mlm_dataset,
    data_collator=mlm_data_collator
)


In [76]:
# обучение модели c млм весами
trainer.train()


mlm_model.save_pretrained("./mlm_result")
tokenizer.save_pretrained("./mlm_result")

Step,Training Loss
100,3.2451
200,3.0893
300,3.0096
400,2.9161
500,2.8783
600,2.81
700,2.7856
800,2.7355
900,2.6864
1000,2.6833


('./mlm_result/tokenizer_config.json',
 './mlm_result/special_tokens_map.json',
 './mlm_result/vocab.txt',
 './mlm_result/added_tokens.json',
 './mlm_result/tokenizer.json')

In [77]:
# дообучение подготовленной модели для ner
ner_mlm_model = AutoModelForTokenClassification.from_pretrained(
    "./mlm_result",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

ner_mlm_training_args = TrainingArguments(
    output_dir="./mlm_result2",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=3000,
    logging_steps=100,
    weight_decay=0.01,
    metric_for_best_model="f1",
    report_to="none",
    load_best_model_at_end=True,
    save_total_limit=2,
)

ner_mlm_trainer = Trainer(
    model=ner_mlm_model,
    args=ner_mlm_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

In [78]:
ner_mlm_trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,1.1793,0.81972,0.60414,0.270334,0.373526,0.770921
200,0.7522,0.641956,0.657685,0.435161,0.523768,0.806938
300,0.6239,0.550391,0.679903,0.569217,0.619656,0.828621
400,0.5431,0.495463,0.688107,0.647507,0.66719,0.841548
500,0.4862,0.458539,0.700826,0.678594,0.689531,0.850499
600,0.4426,0.434981,0.706574,0.701688,0.704122,0.855261
700,0.4107,0.417721,0.715605,0.713107,0.714354,0.85999
800,0.3854,0.402506,0.725794,0.721926,0.723855,0.86413
900,0.3616,0.392436,0.728282,0.728651,0.728466,0.866438
1000,0.344,0.38562,0.729265,0.736899,0.733062,0.868369


TrainOutput(global_step=3000, training_loss=0.357888427734375, metrics={'train_runtime': 1296.2094, 'train_samples_per_second': 37.031, 'train_steps_per_second': 2.314, 'total_flos': 680590618693152.0, 'train_loss': 0.357888427734375, 'epoch': 60.0})

In [79]:
ner_mlm_metrics = ner_mlm_trainer.evaluate(tokenized_datasets["test"])

## Дообучение по синтетическй разметке

Его не успела осмыслить и добавить, постараюсь доделать позже :(

## Выводы:

Метрики до обучения с услово дамми-моделью:

In [80]:
for i, j in dummy_metrics.items():
  print(i, j)

eval_loss 2.5015883445739746
eval_model_preparation_time 0.0008
eval_precision 0.03963084713232742
eval_recall 0.12314427103159498
eval_f1 0.05996385486337447
eval_accuracy 0.05031909671084929
eval_runtime 2.5214
eval_samples_per_second 79.32
eval_steps_per_second 5.156


Метрики после обучения для NER:

In [81]:
for i, j in metrics.items():
  print(i, j)

eval_loss 0.3703927993774414
eval_model_preparation_time 0.0008
eval_precision 0.7371127709073099
eval_recall 0.7638624540032991
eval_f1 0.7502492522432702
eval_accuracy 0.8740631647848143
eval_runtime 2.4097
eval_samples_per_second 82.998
eval_steps_per_second 5.395
epoch 60.0


Метрики после подготовки модели с MLM весами:

In [82]:
for i, j in ner_mlm_metrics.items():
  print(i, j)

eval_loss 0.3657907247543335
eval_precision 0.7381678458160527
eval_recall 0.7678594087044791
eval_f1 0.7527209403569873
eval_accuracy 0.8752577319587629
eval_runtime 2.6307
eval_samples_per_second 76.025
eval_steps_per_second 4.942
epoch 60.0


В итоге можно увидеть что подход с MLM дал довольно незначительный прирост в f1 (75 vs 75.2) при прочих равных параметрах обучения по сравнению с простым обучением небольшой модели под задачу классификации. Вероятно дообучение на размеченном корпусе русскоязычных похожих текстов помогло бы поднять метрики сильнее..