# Дмитрий Ильин. ДЗ №2 - Задача NER.

Базовая модель bert-base-multilingual-cased.\
F1 = 0.94366

In [None]:
import os
import time
import json
import pandas as pd
import numpy as np

In [None]:
import torch

In [None]:
#!pip install simpletransformers

In [None]:
from simpletransformers.ner import NERModel, NERArgs

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
root_path = "/content/drive/My Drive/MIPT/NLP/HW2/"
output_path = "/content/drive/My Drive/MIPT/NLP/HW2/output/"
logs_path = "/content/drive/My Drive/MIPT/NLP/HW2/logs/"
models_path = "/content/drive/My Drive/MIPT/NLP/HW2/models/"

# 1. Загрузка данных

Для загрузки данных была написана простая функция для парсинга, которая на выходе дает датафрейм с тремя колонками: sentence_id (порядковый номер предложения), words и labels - https://simpletransformers.ai/docs/ner-data-formats/.

In [None]:
def load_data_to_dataframe(file_name):
    with open(file_name, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    sentence_id = 0
    data = []

    for line in lines:
        line = line.strip()
        if not line:
            sentence_id += 1
            continue
        word, label = line.split(" ", 1)
        data.append([sentence_id, word, label])

    df = pd.DataFrame(data, columns=["sentence_id", "words", "labels"])
    return df

In [None]:
train_df = load_data_to_dataframe(os.path.join(root_path, "train.txt"))
train_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,"""",O
1,0,Если,O
2,0,Миронов,B-PER
3,0,занял,O
4,0,столь,O


In [None]:
dev_df = load_data_to_dataframe(os.path.join(root_path, "dev.txt"))
dev_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,как,O
1,0,акционерный,O
2,0,коммерческий,O
3,0,Московский,B-ORG
4,0,муниципальный,I-ORG


In [None]:
test_df = load_data_to_dataframe(os.path.join(root_path, "test.txt"))
test_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Тогда,O
1,0,замешанные,O
2,0,в,O
3,0,скандале,O
4,0,прокуроры,O


# 2. Проверка данных

Так как simpletransformers не требует создания класса датасетов и дополнительной обработки датафреймов (помимо приведения к стандартному для библиотеки виду), то на данном этапе мы просто проверяем, какие лейблы используются и каково максимальное количество слов в предложении.

In [None]:
labels = list(train_df["labels"].unique())

In [None]:
labels

['O', 'B-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'I-PER']

In [None]:
sentence_lengths = pd.concat([train_df, dev_df, test_df], ignore_index=True).groupby("sentence_id").size()
max_sentence_length = sentence_lengths.max()
max_sentence_length

252

# 3. Подбор гиперпараметров и обучение



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name()}")
else:
    print("CPU")

GPU: Tesla V100-SXM2-16GB


Примечание: изменение гиперпараметров train_batch_size и некоторых других в итоге негативно влияло на итоговый скор, поэтому решено их не подбирать.

In [None]:
def get_default_model_args():
    model_args = NERArgs()
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_verbose = True
    model_args.num_train_epochs = 1
    model_args.learning_rate = 3e-5
    model_args.train_batch_size = 8
    model_args.eval_batch_size = 8
    model_args.overwrite_output_dir = True
    model_args.reprocess_input_data = True
    model_args.output_dir = output_path
    model_args.best_model_dir = models_path
    model_args.tensorboard_dir = logs_path
    model_args.max_seq_length = 256
    model_args.manual_seed = 42

    return model_args

In [None]:
def get_model(labels, model_args = None):
    if model_args is None:
        model_args = get_default_model_args()
    return NERModel(
        "bert",
        "bert-base-multilingual-cased",
        labels=labels,
        args=model_args
    )

## 3.1. Подбор гиперпараметров
На данном этапе перебираются значения следующих гиперпараметров: learning_rate, num_train_epoch и scheduler. В результате мы получаем словарь с лучшими гиперпараметрами на основе метрики F1, который сохраняется на диск.

In [None]:
learning_rates = [3e-5, 1e-5, 5e-6]
num_train_epochs = [2, 3]
schedulers = ["constant_schedule", "linear_schedule_with_warmup"]

best_f1 = 0
best_params = {}

for lr in learning_rates:
    for epoch in num_train_epochs:
        for scheduler in schedulers:
            print(f"Обучаем на: learning_rate={lr}, num_train_epochs={epoch}, scheduler={scheduler}")

            current_params = {
                "learning_rate": lr,
                "num_train_epochs": epoch,
                "scheduler": scheduler
            }

            model_args = get_default_model_args()
            model_args.best_model_dir = None
            for key, value in current_params.items():
                setattr(model_args, key, value)

            model = get_model(labels, model_args)

            _, result = model.train_model(train_df, eval_data=dev_df)

            f1_score = result['f1_score'][0]
            print(f"F1: {f1_score}")
            if f1_score > best_f1:
                best_f1 = f1_score
                best_params = current_params

print(f"Лучшее значение F1: {best_f1} с параметрами: {best_params}")

Обучаем на: learning_rate=3e-05, num_train_epochs=2, scheduler=constant_schedule


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9555470396627707
Обучаем на: learning_rate=3e-05, num_train_epochs=2, scheduler=linear_schedule_with_warmup


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9444391499094635
Обучаем на: learning_rate=3e-05, num_train_epochs=3, scheduler=constant_schedule


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9555470396627707
Обучаем на: learning_rate=3e-05, num_train_epochs=3, scheduler=linear_schedule_with_warmup


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9382716049382716
Обучаем на: learning_rate=1e-05, num_train_epochs=2, scheduler=constant_schedule


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9271586154431344
Обучаем на: learning_rate=1e-05, num_train_epochs=2, scheduler=linear_schedule_with_warmup


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9358253075236006
Обучаем на: learning_rate=1e-05, num_train_epochs=3, scheduler=constant_schedule


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9271586154431344
Обучаем на: learning_rate=1e-05, num_train_epochs=3, scheduler=linear_schedule_with_warmup


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9234563217300579
Обучаем на: learning_rate=5e-06, num_train_epochs=2, scheduler=constant_schedule


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9325960530079132
Обучаем на: learning_rate=5e-06, num_train_epochs=2, scheduler=linear_schedule_with_warmup


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.931762022429196
Обучаем на: learning_rate=5e-06, num_train_epochs=3, scheduler=constant_schedule


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9325960530079132
Обучаем на: learning_rate=5e-06, num_train_epochs=3, scheduler=linear_schedule_with_warmup


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/969 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.9318484675423568
Лучшее значение F1: 0.9555470396627707 с параметрами: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'scheduler': 'constant_schedule'}


In [None]:
with open(os.path.join(root_path, "best_params.json"), "w") as file:
    json.dump(best_params, file)

## 3.2. Обучение

Да данном этапе происходит следующее: train_df и dev_df объединяются в full_df, а затем обучаются три модели (с использованием ранее подобранных гиперпараметров) на новом обучающем датасете и валидационном датасете, которые мы получаем путем случайного разбиения full_df.

In [None]:
with open(os.path.join(root_path, "best_params.json"), "r") as file:
    best_params = json.load(file)

print(best_params)

{'learning_rate': 3e-05, 'num_train_epochs': 2, 'scheduler': 'constant_schedule'}


In [None]:
full_df = pd.concat([train_df, dev_df])
test_size=0.2

models = []

for i in range(3):
    print(f"Обучение моделей, итерация №{i+1}")
    new_train_df, new_dev_df = train_test_split(full_df, test_size=test_size, random_state=int(time.time()))

    model_args = get_default_model_args()
    model_args.best_model_dir = os.path.join(models_path, f"model_{i}")
    model_args.output_dir = os.path.join(output_path, f"model_{i}")
    model_args.tensorboard_dir = os.path.join(logs_path, f"model_{i}")
    for key, value in best_params.items():
        setattr(model_args, key, value)

    model = get_model(labels, model_args)
    _, result = model.train_model(new_train_df, eval_data=new_dev_df)
    f1_score = result['f1_score'][0]
    print(f"F1 для модели №{i+1}: {f1_score}")

    models.append(model)

Обучение моделей, итерация №1


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/968 [00:00<?, ?it/s]

  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/932 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/968 [00:00<?, ?it/s]

  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/932 [00:00<?, ?it/s]

F1 для модели №1: 0.8502994011976048
Обучение моделей, итерация №2


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/968 [00:00<?, ?it/s]

  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/932 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/968 [00:00<?, ?it/s]

  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/932 [00:00<?, ?it/s]

F1 для модели №2: 0.8492206726825267
Обучение моделей, итерация №3


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/968 [00:00<?, ?it/s]

  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/931 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/968 [00:00<?, ?it/s]

  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/931 [00:00<?, ?it/s]

F1 для модели №3: 0.8532079828467756


# 4. Предсказание
На этой стадии берется набор из трех ранее обученных моделей, после чего с помощью ensemble prediction делается предсказание на тестовом датасете, вычисляется метрика F1, а предсказанные метки добавляются отдельной колонкой к test_df и сохраняются на диск.

In [None]:
predictions = []

# Получим предсказания для каждой модели
for model in models:
    result, _, preds_list = model.eval_model(test_df)
    predictions.append(preds_list)
    print(f"F1: {result['f1_score']:.4f}")

  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.8938


  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.8710


  return [


  0%|          | 0/6 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/323 [00:00<?, ?it/s]

F1: 0.8853


In [None]:
ensemble_predictions = []

for sentence_preds in zip(*predictions):
    sentence_ensemble = []
    for labels in zip(*sentence_preds):
        # Голосование по большинству
        most_common = max(set(labels), key=labels.count)
        sentence_ensemble.append(most_common)
    ensemble_predictions.append(sentence_ensemble)

In [None]:
true_labels = list(test_df["labels"])
flat_ensemble_predictions = [label for sublist in ensemble_predictions for label in sublist]

Сначала посчитаем F1 с учетом O-меток.

In [None]:
f1 = f1_score(true_labels, flat_ensemble_predictions, average='weighted')
print(f"F1 для всех токенов: {f1}")

F1 для всех токенов: 0.9796409214074258


А теперь посчитаем F1 без учета О-меток.

In [None]:
filtered_true_labels = []
filtered_ensemble_predictions = []

for true_label, pred in zip(true_labels, flat_ensemble_predictions):
    if true_label != "O" and pred != "O":
        filtered_true_labels.append(true_label)
        filtered_ensemble_predictions.append(pred)

f1 = f1_score(filtered_true_labels, filtered_ensemble_predictions, average='weighted')
print(f"F1 итоговый: {f1}")

F1 итоговый: 0.9436612047852708


In [None]:
test_with_preds_df = test_df.copy()
test_with_preds_df["predictions"] = flat_ensemble_predictions
test_with_preds_df.sample(5)

Unnamed: 0,sentence_id,words,labels,predictions
38341,1872,ОАО,B-ORG,B-ORG
47714,2332,Между,O,O
2568,123,'',O,O
47098,2303,.,O,O
8746,422,мэра,O,O


In [None]:
test_with_preds_df.to_csv(os.path.join(root_path, "test_with_preds.csv"))

In [None]:
mismatches = test_with_preds_df[test_with_preds_df["labels"] != test_with_preds_df["predictions"]]
mismatches.sample(5)

Unnamed: 0,sentence_id,words,labels,predictions
17468,857,Хабурдзания,B-PER,I-PER
38480,1879,Биостэна,B-ORG,B-PER
34525,1690,линейного,I-ORG,O
28963,1399,Вильданов,B-PER,I-PER
51986,2532,Интер,I-ORG,O


In [None]:
print(f"{mismatches.shape[0]}/{test_with_preds_df.shape[0]}")

1076/53038


# 5. Выводы
Модель bert-base-multilingual-cased весьма неплохо показала себя для задачи NER для текстов на русском языке (изначально я думал использовать DeepPavlov).\
Хотел был также отметить, что библиотека simpletransformers здорово упростила мне задачу.\
Сначала я попробовал обучить только одну модель без перебора гиперпараметров на трех эпохах и получил метрику F1 ~ 0.9, что уже было хорошим результатом, однако подбор гиперпараметров и использование ансамбля и трех моделей позволило поднять итоговую метрику F1 до 0.94366.\
Вероятнее всего, итоговый скор можно было бы поднять еще выше, если бы я подбирал большее количество гиперпараметров, а также обучил бы больше моделей для ансамбля.\
Также в будущем было бы интересно попробовать другие техники, например блендинг из разных базовых моделей.