In [45]:
%%writefile requirements.txt
torch
numpy==1.23
pandas
scikit-learn
razdel
ipymarkup

pytorch-crf
natasha
deeppavlov
transformers

Overwriting requirements.txt


In [46]:
!pip install --upgrade -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas
  Using cached pandas-2.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)


Новый день - новый датасет!

In [1]:
!wget http://ai-center.botik.ru/Airec/ai-resources/Persons-1000.zip
!unzip Persons-1000.zip

--2023-04-04 11:29:20--  http://ai-center.botik.ru/Airec/ai-resources/Persons-1000.zip
Resolving ai-center.botik.ru (ai-center.botik.ru)... 95.129.138.2
Connecting to ai-center.botik.ru (ai-center.botik.ru)|95.129.138.2|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3363777 (3.2M) [application/zip]
Saving to: ‘Persons-1000.zip.1’


2023-04-04 11:29:21 (4.91 MB/s) - ‘Persons-1000.zip.1’ saved [3363777/3363777]

Archive:  Persons-1000.zip
replace Persons-1000/collection/001/anno.markup.xml? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [2]:
!cat Persons-1000/collection/001/anno.markup.xml  

﻿<markup>
<entry>
<id>1</id>
<offset>308</offset>
<length>16</length>
<class>AAA_Estimate_Person</class>
<attribute>
<name>Canonical</name>
<value>ГРИГОРИЙ КАРАСИН</value>
</attribute>
</entry>
<entry>
<id>2</id>
<offset>387</offset>
<length>15</length>
<class>AAA_Estimate_Person</class>
<attribute>
<name>Canonical</name>
<value>ДЭНИЭЛ ФРИД</value>
</attribute>
</entry>
</markup>


# NER

## Датасет

Named Entity Recognition - распознавание именных сущностей. Выделяем в тексте спаны PER, LOC, ORG.

В случае с Persons-1000 только PER. 

In [3]:
import os
import xml.etree.ElementTree as ET
from ipymarkup import show_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN

directory = "Persons-1000/collection/"

def read_text_with_markup(directory):
    markup_file_name = os.path.join(directory, "anno.markup.xml")
    text_file_name = os.path.join(directory, "text.txt")
    with open(text_file_name, "r", encoding="windows-1251") as r:
        text = r.read()
    text = text.replace("\n", "\r\n")
    root = ET.parse(markup_file_name).getroot()
    spans = []
    for entry in root.findall("entry"):
        start_pos = int(entry.find("offset").text)
        end_pos = start_pos + int(entry.find("length").text)
        tag = entry.find("class").text
        spans.append((start_pos, end_pos, "PER"))
    return text, spans

data = []
for sample_name in os.listdir(directory):
    sample_path = os.path.join(directory, sample_name)
    data.append(read_text_with_markup(sample_path))

ipymarkup - модуль для вывода NER разметки в ipynb

In [None]:
show_box_markup(data[0][0], data[0][1], palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))

## BIO

BIO разметка: B - begin, I - inner, O - outer. Преобразуем задачу разметки спанов в задачу классификации каждого слова.

In [4]:
from razdel import tokenize
from collections import namedtuple

Sample = namedtuple("Sample", "text,tokens,spans,labels")

samples = []
for text, spans in data:
    labels = []
    tokens = list(tokenize(text))
    for token in tokens:
        label = 0
        for span in spans:
            if token.start == span[0]:
                label = 1
            elif token.start > span[0] and token.stop <= span[1]:
                label = 2
        labels.append(label)
    sample = Sample(text, tokens, spans, labels)
    samples.append(sample)

show_box_markup(samples[0].text, samples[0].spans, palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))
print(samples[0].labels)
print(len(samples))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 2, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 

Бьём на выборки

In [5]:
import random
random.shuffle(samples)

train = samples[:700]
val = samples[700:850]
test = samples[850:]

In [6]:
char_set = ["<pad>", "<unk>"] + list({ch for sample in samples for token in sample.tokens for ch in token.text})
print(char_set)

['<pad>', '<unk>', 'х', '?', '+', 'K', '#', 'P', 'ш', '"', '5', '3', 'N', '!', 'h', 'f', 'Х', 'И', ';', 'ф', 'r', 's', 'W', 'b', '[', '—', 'у', 'Ъ', 'е', 'Й', 'Ж', '©', 'г', 'Z', 'Ш', 'н', '6', 'c', 'Q', '«', 'Г', 'p', 'n', 'F', '”', '%', 'U', 'A', 'р', '>', ':', 'і', 'ь', 'У', 'ю', 'Ь', 'Ю', 'M', '8', 'Ф', 'Ц', 'a', 'л', '7', 'й', 'З', ',', 'g', 'u', 'G', '№', 'Н', 'Ы', 'O', 'к', '$', 'X', '.', 'o', 'B', 'Я', '_', 'v', '•', 'x', 'Y', 'Б', 'ъ', 'ч', 'Э', '“', 'Ч', '€', '2', 'т', 'А', '»', 'R', 'D', 'E', 'и', 'q', '9', 'в', 'д', 'e', '=', 'б', 'Е', 'О', 'с', '\xad', ']', 'з', 'я', 'i', 'd', '-', 'Ё', '–', '|', '/', 'S', 'К', 'щ', 'V', '(', '…', 'п', '<', 'ц', 'L', 'z', '0', "'", 'Д', 'о', 'П', 'М', '&', 'Щ', 'k', 'ж', 't', 'y', 'м', 'а', 'В', 'ы', 'l', 'Л', 'э', 'm', 'Т', '1', 'C', 'w', '*', 'ё', 'Р', 'T', ')', 'H', 'I', 'J', '4', 'j', 'С']


Для каждого слова сохраняем его символьный состав, а в остальном старый добрый пайплайн

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np

def get_next_gen_batch(samples, max_seq_len=500, max_char_seq_len=40, batch_size=32):
    indices = np.arange(len(samples))
    np.random.shuffle(indices)
    batch_begin = 0
    while batch_begin < len(samples):
        batch_indices = indices[batch_begin: batch_begin + batch_size]
        batch = []
        batch_labels = []
        batch_max_len = 0
        for data_ind in batch_indices:
            sample = samples[data_ind] #беру одно предложение
            inputs = []
            for token in sample.tokens[:max_seq_len]: #цикл по токенам предложения, обрезанного до max_seq_len
                chars = [char_set.index(ch) if ch in char_set else char_set.index("<unk>") for ch in token.text][:max_char_seq_len] #максимальная символьная длина токена - max_char_seq_len
                chars += [0] * (max_char_seq_len - len(chars)) #каждый токен должен быть представлен max_char_seq_len символами
                inputs.append(chars)
            #inputs - list of chars' list
            batch_max_len = max(batch_max_len, len(inputs)) #кол-во токенов в предложении
            inputs += [[0]*max_char_seq_len] * (max_seq_len - len(inputs)) #приводим предложение к длине max_seq_len
            batch.append(inputs)
            labels = sample.labels[:max_seq_len]         #аналогично с labels
            labels += [0] * (max_seq_len - len(labels))  #аналогично с labels
            batch_labels.append(labels)
        batch_begin += batch_size
        batch = torch.cuda.LongTensor(batch)#[:, :batch_max_len]
        labels = torch.cuda.LongTensor(batch_labels)#[:, :batch_max_len]
        yield batch_indices, batch, labels


def train_gen_model(model, train_samples, val_samples, epochs_count=10, 
                    loss_every_nsteps=1000, lr=0.01, save_path="model.pt", device_name="cuda",
                    early_stopping=True):
    params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {}".format(params_count))
    device = torch.device(device_name)
    model = model.to(device)
    total_loss = 0
    start_time = time.time()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss().cuda()
    prev_avg_val_loss = None
    for epoch in range(epochs_count):
        model.train()
        for step, (_, batch, batch_labels) in enumerate(get_next_gen_batch(train)):
            logits = model(batch) # Прямой проход
            logits = logits.transpose(1, 2)
            loss = loss_function(logits, batch_labels) # Подсчёт ошибки
            loss.backward() # Подсчёт градиентов dL/dw
            optimizer.step() # Градиентный спуск или его модификации (в данном случае Adam)
            optimizer.zero_grad() # Зануление градиентов, чтобы их спокойно менять на следующей итерации
            total_loss += loss.item()
        val_total_loss = 0
        val_batch_count = 0
        model.eval()
        for _, (_, batch, batch_labels) in enumerate(get_next_gen_batch(val)):
            logits = model(batch) # Прямой проход
            logits = logits.transpose(1, 2)
            val_total_loss += loss_function(logits, batch_labels) # Подсчёт ошибки
            val_batch_count += 1
        avg_val_loss = val_total_loss/val_batch_count
        print("Epoch = {}, Avg Train Loss = {:.4f}, Avg val loss = {:.4f}, Time = {:.2f}s".format(epoch, total_loss / loss_every_nsteps, avg_val_loss, time.time() - start_time))
        total_loss = 0
        start_time = time.time()

        if early_stopping and prev_avg_val_loss is not None and avg_val_loss > prev_avg_val_loss:
            model.load_state_dict(torch.load(save_path))
            model.eval()
            break
        prev_avg_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)

## Бесконтекстная модель

In [None]:
import torch
from torch import nn

class SuperSimpleModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=16, classes_count=3, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.out_layer = nn.Linear(char_max_seq_len * char_embedding_dim, classes_count)

    def forward(self, inputs):
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        output = self.out_layer.forward(projections)
        return output


model = SuperSimpleModel(len(char_set))
train_gen_model(model, train, val, epochs_count=30, early_stopping=False, lr=0.02)

Trainable params: 4611
Epoch = 0, Avg Train Loss = 0.0332, Avg val loss = 0.7698, Time = 7.46s
Epoch = 1, Avg Train Loss = 0.0082, Avg val loss = 0.1617, Time = 4.65s
Epoch = 2, Avg Train Loss = 0.0028, Avg val loss = 0.1163, Time = 4.22s
Epoch = 3, Avg Train Loss = 0.0021, Avg val loss = 0.0961, Time = 5.69s
Epoch = 4, Avg Train Loss = 0.0019, Avg val loss = 0.0900, Time = 4.80s
Epoch = 5, Avg Train Loss = 0.0018, Avg val loss = 0.0861, Time = 4.22s
Epoch = 6, Avg Train Loss = 0.0017, Avg val loss = 0.0841, Time = 5.42s
Epoch = 7, Avg Train Loss = 0.0017, Avg val loss = 0.0815, Time = 4.98s
Epoch = 8, Avg Train Loss = 0.0016, Avg val loss = 0.0800, Time = 4.20s
Epoch = 9, Avg Train Loss = 0.0016, Avg val loss = 0.0787, Time = 5.59s
Epoch = 10, Avg Train Loss = 0.0016, Avg val loss = 0.0792, Time = 5.53s
Epoch = 11, Avg Train Loss = 0.0016, Avg val loss = 0.0767, Time = 4.28s
Epoch = 12, Avg Train Loss = 0.0015, Avg val loss = 0.0763, Time = 5.69s
Epoch = 13, Avg Train Loss = 0.0015, A

## Метрики

Можно использовать как классические мультиклассификационнные метрики, так и метрики специально для NER.

Например, число точных и частичных совпадений спанов, пропущшенных и лишних спанов.

In [11]:
def get_spans(labels, tokens):
    spans = []
    for i, label in enumerate(labels[:len(tokens)]): #enumerate(labels)
        if label == 1:
            spans.append((tokens[i].start, tokens[i].stop, "PER"))
        elif label == 2:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
    return spans


def calc_metrics(true_labels, predicted_labels, samples):
    one_tp = 0
    one_fp = 0
    one_fn = 0
    for true, predicted in zip(true_labels, predicted_labels):
        for l1, l2 in zip(true, predicted):
            if l1 == 1 and l2 == 1:
                one_tp += 1
            elif l1 != 1 and l2 == 1:
                one_fp += 1
            elif l1 == 1 and l2 !=1:
                one_fn += 1
    if one_tp + one_fp == 0:
        print("No positives!")
    else:
        print("1 Precision: {}, 1 Recall: {}".format(float(one_tp)/(one_tp + one_fp), float(one_tp)/(one_tp + one_fn)))

    exact = 0
    partial = 0
    missing = 0
    spurius = 0
    for (true, predicted), sample in zip(zip(true_labels, predicted_labels), samples):
        true_spans = get_spans(true, sample.tokens)
        predicted_spans = get_spans(predicted, sample.tokens)
        for true_span in true_spans:
            is_missing = True
            for predicted_span in predicted_spans:
                if true_span == predicted_span:
                    exact += 1
                    is_missing = False
                    break
                ts = true_span[0]
                te = true_span[1]
                ps = predicted_span[0]
                pe = predicted_span[1]
                # ts te ps pe
                # ps pe ts te
                if ts <= te <= ps <= pe or ps <= pe <= ts <= te:
                    continue
                is_missing = False
                partial += 1
                break
            if is_missing:
                missing += 1
        for predicted_span in predicted_spans:
            is_missing = True
            for true_span in true_spans:
                if true_span == predicted_span:
                    is_missing = False
                    break
                ts = true_span[0]
                te = true_span[1]
                ps = predicted_span[0]
                pe = predicted_span[1]
                if ts <= te <= ps <= pe or ps <= pe <= ts <= te:
                    continue
                is_missing = False
                break
            if is_missing:
                spurius += 1
    print("Exact: {}, partial: {}, missing: {}, spurius: {}".format(exact, partial, missing, spurius))
            


def predict(model, samples):
    model.eval()
    true_labels = []
    predicted_labels = []
    all_indices = []
    for _, (indices, batch, batch_labels) in enumerate(get_next_gen_batch(samples)):
        logits = model(batch)
        plabels = logits.max(dim=2)[1]
        # Убираем неконсистентность
        for sample_num, sample in enumerate(plabels):
            for word_num, label in enumerate(sample):
                if label != 2:
                    continue
                if word_num == 0:
                    plabels[sample_num][word_num] = 0
                    continue
                if sample[word_num - 1] != 1:
                    plabels[sample_num][word_num] = 0
        true_labels.extend(batch_labels)
        predicted_labels.extend(plabels)
        all_indices.extend(indices)
    samples = [samples[index] for index in all_indices]
    calc_metrics(true_labels, predicted_labels, samples)
    show_box_markup(samples[0].text, get_spans(predicted_labels[0], samples[0].tokens), palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))

In [None]:
predict(model, test)

1 Precision: 0.6658841940532081, 1 Recall: 0.5873015873015873
Exact: 481, partial: 417, missing: 551, spurius: 297


## Контекстная модель: LSTM над конкатенацией

In [None]:
import torch
from torch import nn

class LstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=3, lstm_embedding_dim=8, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.dropout = nn.Dropout(0.4)
        self.lstm_layer = nn.LSTM(char_embedding_dim * char_max_seq_len, lstm_embedding_dim // 2, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    def forward(self, inputs):
        batch_size = inputs.size(0)
        seq_len = inputs.size(1)
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        output, _= self.lstm_layer(projections)
        output = self.dropout(output)
        output = self.out_layer.forward(output)
        return output

model = LstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.02)

Trainable params: 6011
Epoch = 0, Avg Train Loss = 0.0088, Avg val loss = 0.2144, Time = 4.40s
Epoch = 1, Avg Train Loss = 0.0054, Avg val loss = 0.2083, Time = 6.35s
Epoch = 2, Avg Train Loss = 0.0052, Avg val loss = 0.2072, Time = 4.25s
Epoch = 3, Avg Train Loss = 0.0051, Avg val loss = 0.2059, Time = 4.08s
Epoch = 4, Avg Train Loss = 0.0050, Avg val loss = 0.2073, Time = 6.35s
Epoch = 5, Avg Train Loss = 0.0049, Avg val loss = 0.2046, Time = 4.23s
Epoch = 6, Avg Train Loss = 0.0048, Avg val loss = 0.2066, Time = 4.23s
Epoch = 7, Avg Train Loss = 0.0047, Avg val loss = 0.2075, Time = 6.26s
Epoch = 8, Avg Train Loss = 0.0047, Avg val loss = 0.2032, Time = 4.13s
Epoch = 9, Avg Train Loss = 0.0046, Avg val loss = 0.2067, Time = 4.23s
Epoch = 10, Avg Train Loss = 0.0046, Avg val loss = 0.2054, Time = 6.31s
Epoch = 11, Avg Train Loss = 0.0045, Avg val loss = 0.2067, Time = 4.27s
Epoch = 12, Avg Train Loss = 0.0045, Avg val loss = 0.2049, Time = 4.25s
Epoch = 13, Avg Train Loss = 0.0045, A

In [None]:
predict(model, test)

No positives!
Exact: 0, partial: 0, missing: 1449, spurius: 0


## Контекстная модель: LSTM над CharFF

## Задание 0
Сделайте полносвзяный слой с активацией над конкатенацией символьных эмбедов

In [None]:
from torch import nn

class CharFFLstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=3, word_embedding_dim=16, lstm_embedding_dim=16, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.dropout = nn.Dropout(0.4)
        # YOUR CODE HERE                    
        self.linear = nn.Linear(char_max_seq_len * char_embedding_dim, lstm_embedding_dim)  #<-
        self.relu = nn.ReLU()                                                               #<-
        self.lstm_layer = nn.LSTM(word_embedding_dim, lstm_embedding_dim, batch_first=True) #<-
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    def forward(self, inputs):
        projections = self.embeddings_layer.forward(inputs)
        # YOUR CODE HERE
        projections = projections.reshape(projections.size(0), projections.size(1), -1) #<-
        charff = self.linear(projections)   #<-
        charff = self.relu(charff)          #<-
        output, _ = self.lstm_layer(charff) #<-                
        output = self.dropout(output)
        output = self.out_layer(output)
        return output

model = CharFFLstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.02)

Trainable params: 5475
Epoch = 0, Avg Train Loss = 0.0067, Avg val loss = 0.2171, Time = 4.87s
Epoch = 1, Avg Train Loss = 0.0049, Avg val loss = 0.2089, Time = 4.14s
Epoch = 2, Avg Train Loss = 0.0047, Avg val loss = 0.2046, Time = 6.03s
Epoch = 3, Avg Train Loss = 0.0044, Avg val loss = 0.1865, Time = 4.86s
Epoch = 4, Avg Train Loss = 0.0039, Avg val loss = 0.1443, Time = 4.38s
Epoch = 5, Avg Train Loss = 0.0032, Avg val loss = 0.1197, Time = 6.07s
Epoch = 6, Avg Train Loss = 0.0029, Avg val loss = 0.1123, Time = 4.54s
Epoch = 7, Avg Train Loss = 0.0028, Avg val loss = 0.1049, Time = 4.29s
Epoch = 8, Avg Train Loss = 0.0027, Avg val loss = 0.0999, Time = 5.96s
Epoch = 9, Avg Train Loss = 0.0027, Avg val loss = 0.0941, Time = 4.67s
Epoch = 10, Avg Train Loss = 0.0026, Avg val loss = 0.0912, Time = 4.23s
Epoch = 11, Avg Train Loss = 0.0025, Avg val loss = 0.0886, Time = 5.94s
Epoch = 12, Avg Train Loss = 0.0023, Avg val loss = 0.0781, Time = 4.74s
Epoch = 13, Avg Train Loss = 0.0020, A

In [None]:
predict(model, test)

1 Precision: 0.8104265402843602, 1 Recall: 0.8260869565217391
Exact: 957, partial: 273, missing: 219, spurius: 243


## Задание 1.1
Сделайте то же самое, но с bidirectional LSTM на уровне символов

In [None]:
import torch
from torch import nn

class BiLstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=3, word_embedding_dim=16, lstm_embedding_dim=16, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.lstm_layer = nn.LSTM(char_embedding_dim * char_max_seq_len, lstm_embedding_dim // 2, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.4)    
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    def forward(self, inputs):      
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1) #<-
        output, _ = self.lstm_layer(projections)
        output = self.dropout(output)
        output = self.out_layer.forward(output)
        return output

model = BiLstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.02)

Trainable params: 11603
Epoch = 0, Avg Train Loss = 0.0068, Avg val loss = 0.2213, Time = 6.17s
Epoch = 1, Avg Train Loss = 0.0052, Avg val loss = 0.2073, Time = 4.42s
Epoch = 2, Avg Train Loss = 0.0050, Avg val loss = 0.2035, Time = 4.31s
Epoch = 3, Avg Train Loss = 0.0049, Avg val loss = 0.2058, Time = 6.00s
Epoch = 4, Avg Train Loss = 0.0048, Avg val loss = 0.2067, Time = 4.64s
Epoch = 5, Avg Train Loss = 0.0048, Avg val loss = 0.2070, Time = 4.23s
Epoch = 6, Avg Train Loss = 0.0047, Avg val loss = 0.2039, Time = 5.95s
Epoch = 7, Avg Train Loss = 0.0047, Avg val loss = 0.2043, Time = 4.73s
Epoch = 8, Avg Train Loss = 0.0046, Avg val loss = 0.2051, Time = 4.32s
Epoch = 9, Avg Train Loss = 0.0046, Avg val loss = 0.2056, Time = 6.04s
Epoch = 10, Avg Train Loss = 0.0045, Avg val loss = 0.2049, Time = 4.67s
Epoch = 11, Avg Train Loss = 0.0045, Avg val loss = 0.2074, Time = 4.20s
Epoch = 12, Avg Train Loss = 0.0045, Avg val loss = 0.2048, Time = 6.05s
Epoch = 13, Avg Train Loss = 0.0045, 

In [None]:
predict(model, test)

No positives!
Exact: 0, partial: 0, missing: 1449, spurius: 0


## Задание 1.2
Сделайте то же самое, но со свёртками на уровне символов

In [None]:
import torch
from torch import nn

class CnnModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=3, word_embedding_dim=16, lstm_embedding_dim=16, char_max_seq_len=40,
                 kernel_size=5, channels=500 ): # 500 - max_seq_len
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.char_cnn = nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=char_embedding_dim*kernel_size)
        self.dropout = nn.Dropout(0.4)    
        self.out_layer = nn.Linear(char_embedding_dim*char_max_seq_len-char_embedding_dim*kernel_size+1, classes_count)

    def forward(self, inputs):      
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        output = self.char_cnn(projections)    
        output = self.dropout(output)
        output = self.out_layer.forward(output)
        return output

model = CnnModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.02)

Trainable params: 5001598
Epoch = 0, Avg Train Loss = 0.6754, Avg val loss = 13.1119, Time = 8.53s
Epoch = 1, Avg Train Loss = 0.1731, Avg val loss = 2.7987, Time = 5.38s
Epoch = 2, Avg Train Loss = 0.0399, Avg val loss = 1.1306, Time = 5.86s
Epoch = 3, Avg Train Loss = 0.0153, Avg val loss = 0.5798, Time = 6.26s
Epoch = 4, Avg Train Loss = 0.0086, Avg val loss = 0.5370, Time = 5.32s
Epoch = 5, Avg Train Loss = 0.0058, Avg val loss = 0.5022, Time = 6.70s
Epoch = 6, Avg Train Loss = 0.0045, Avg val loss = 0.5125, Time = 5.33s
Epoch = 7, Avg Train Loss = 0.0040, Avg val loss = 0.4927, Time = 6.44s
Epoch = 8, Avg Train Loss = 0.0035, Avg val loss = 0.5333, Time = 5.66s
Epoch = 9, Avg Train Loss = 0.0032, Avg val loss = 0.5490, Time = 5.27s
Epoch = 10, Avg Train Loss = 0.0030, Avg val loss = 0.5257, Time = 6.73s
Epoch = 11, Avg Train Loss = 0.0030, Avg val loss = 0.6274, Time = 5.32s
Epoch = 12, Avg Train Loss = 0.0047, Avg val loss = 0.5894, Time = 6.75s
Epoch = 13, Avg Train Loss = 0.002

In [None]:
predict(model, test)

1 Precision: 0.09251101321585903, 1 Recall: 0.043478260869565216
Exact: 19, partial: 80, missing: 1350, spurius: 574


## Задание 1.3
Сделайте то же самое, но с CRF над головой

In [None]:
from torch import nn
from torchcrf import CRF

def train_gen_model(model, train_samples, val_samples, epochs_count=10, 
                    loss_every_nsteps=1000, lr=0.01, save_path="model.pt", device_name="cuda",
                    early_stopping=True):
    params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {}".format(params_count))
    device = torch.device(device_name)
    model = model.to(device)
    total_loss = 0
    start_time = time.time()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = CRF(3, batch_first=True).cuda()
    prev_avg_val_loss = None
    for epoch in range(epochs_count):
        model.train()
        for step, (_, batch, batch_labels) in enumerate(get_next_gen_batch(train)):
            logits = model(batch) # Прямой проход
            loss = -loss_function(logits, batch_labels) # Подсчёт ошибки
            loss.backward() # Подсчёт градиентов dL/dw
            optimizer.step() # Градиентный спуск или его модификации (в данном случае Adam)
            optimizer.zero_grad() # Зануление градиентов, чтобы их спокойно менять на следующей итерации
            total_loss += loss.item()
        val_total_loss = 0
        val_batch_count = 0
        model.eval()
        for _, (_, batch, batch_labels) in enumerate(get_next_gen_batch(val)):
            logits = model(batch) # Прямой проход
            val_total_loss += -loss_function(logits, batch_labels) # Подсчёт ошибки
            val_batch_count += 1
        avg_val_loss = val_total_loss/val_batch_count
        print("Epoch = {}, Avg Train Loss = {:.4f}, Avg val loss = {:.4f}, Time = {:.2f}s".format(epoch, total_loss / loss_every_nsteps, avg_val_loss, time.time() - start_time))
        total_loss = 0
        start_time = time.time()

        if early_stopping and prev_avg_val_loss is not None and avg_val_loss > prev_avg_val_loss:
            model.load_state_dict(torch.load(save_path))
            model.eval()
            break
        prev_avg_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)

In [None]:
model = CharFFLstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=50, early_stopping=False, lr=0.02)

Trainable params: 5475
Epoch = 0, Avg Train Loss = 131.6959, Avg val loss = 2902.1526, Time = 17.92s
Epoch = 1, Avg Train Loss = 80.6541, Avg val loss = 2769.5361, Time = 16.77s
Epoch = 2, Avg Train Loss = 78.5845, Avg val loss = 2767.2102, Time = 16.98s
Epoch = 3, Avg Train Loss = 76.9702, Avg val loss = 2765.8787, Time = 16.07s
Epoch = 4, Avg Train Loss = 76.6547, Avg val loss = 2762.6365, Time = 16.26s
Epoch = 5, Avg Train Loss = 76.2028, Avg val loss = 2763.0867, Time = 16.11s
Epoch = 6, Avg Train Loss = 75.8180, Avg val loss = 2764.0076, Time = 16.24s
Epoch = 7, Avg Train Loss = 75.2496, Avg val loss = 2765.8635, Time = 16.02s
Epoch = 8, Avg Train Loss = 75.1527, Avg val loss = 2764.2402, Time = 16.28s
Epoch = 9, Avg Train Loss = 74.7368, Avg val loss = 2764.0552, Time = 16.61s
Epoch = 10, Avg Train Loss = 74.2612, Avg val loss = 2764.3230, Time = 17.39s
Epoch = 11, Avg Train Loss = 73.5884, Avg val loss = 2761.1218, Time = 16.93s
Epoch = 12, Avg Train Loss = 73.0469, Avg val loss

In [None]:
predict(model, test)

No positives!
Exact: 0, partial: 0, missing: 1522, spurius: 0


# NER из коробки

https://github.com/natasha/natasha

https://github.com/deepmipt/DeepPavlov

https://pypi.org/project/polyglot/

http://www.pullenti.ru/

## Задание 2
Оцените готовые модели из natasha и deeppavlov-ner на нашем тестовом датасете

### natasha

In [20]:
from natasha import Segmenter, NewsEmbedding, NewsNERTagger, Doc

segmenter = Segmenter()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)

text = train[0].text

doc = Doc(text)
doc.segment(segmenter)
doc.tag_ner(ner_tagger)

print(doc.spans[:5])
print('-'*100)
doc.ner.print()

[DocSpan(start=36, stop=39, type='ORG', text='МВД', tokens=[...]), DocSpan(start=54, stop=60, type='LOC', text='России', tokens=[...]), DocSpan(start=61, stop=77, type='PER', text='Дмитрий Медведев', tokens=[...]), DocSpan(start=191, stop=197, type='LOC', text='Кремля', tokens=[...]), DocSpan(start=252, stop=285, type='ORG', text='Санкт-Петербургского университета', tokens=[...])]
----------------------------------------------------------------------------------------------------
Д.Медведев уволил четырех генералов МВД.
                                    ORG 
Президент России Дмитрий Медведев подписал указ об освобождении от 
          LOC─── PER─────────────                                  
должностей ряда сотрудников органов внутренних дел. Как сообщили в 
пресс-службе Кремля, своих должностей лишились два заместителя 
             LOC───                                            
начальника Санкт-Петербургского университета МВД РФ - генерал-майор 
           ORG──────────────────

In [25]:
def compare_span_sets(left_spans, right_spans):
    exact, partial, missing = 0, 0, 0
    for left_span in left_spans:
        is_missing = True
        for right_span in right_spans:
            if left_span == right_span:
                exact += 1
                is_missing = False
                break
            ls, le, _ = left_span
            rs, re, _ = right_span
            # [ls le] [rs re]
            # [rs re] [ls le]
            if not (ls <= le <= rs <= re or rs <= re <= ls <= le):
                is_missing = False
                partial += 1
                break            
        if is_missing:
            missing += 1
    return exact, partial, missing


e, p, m, s = 0, 0, 0, 0
for sample in test:
    true_spans = sample.spans

    doc = Doc(sample.text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    predicted_spans = [(span.start, span.stop, span.type) for span in doc.spans if span.type == "PER"]

    exact, partial, missing = compare_span_sets(true_spans, predicted_spans)
    _, _, spurius = compare_span_sets(predicted_spans, true_spans)

    e += exact
    p += partial
    m += missing
    s += spurius
print("Exact: {}, partial: {}, missing: {}, spurius: {}".format(e, p, m, s))

Exact: 1498, partial: 18, missing: 39, spurius: 16


### deeppavlov

In [18]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class NerCharsDataset(Dataset):
    def __init__(self, samples, char_set, max_seq_len=500, max_char_seq_len=50):
        assert len(samples) != 0
        self.samples = []
        self.tokens = []
        self.texts = []
        for sample in samples:
            inputs = torch.zeros((max_seq_len, max_char_seq_len), dtype=torch.long)
            for token_num, token in enumerate(sample.tokens[:max_seq_len]):
                for ch_num, ch in enumerate(token.text[:max_char_seq_len]):
                    char_index = char_set.index(ch) if ch in char_set else char_set.index("<unk>")
                    inputs[token_num][ch_num] = char_index
            labels = torch.zeros((max_seq_len,), dtype=torch.long)
            input_labels = [int(i) for i in sample.labels[:max_seq_len]]
            labels[:len(input_labels)] = torch.LongTensor(input_labels)
            self.samples.append((torch.LongTensor(inputs), torch.LongTensor(labels)))
            self.tokens.append(sample.tokens[:max_seq_len])
            self.texts.append(sample.text)
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, index):
        return self.samples[index]

    def get_tokens(self, index):
        return self.tokens[index]
    
    def get_text(self, index):
        return self.texts[index]


BATCH_SIZE = 32

test_data = NerCharsDataset(test, char_set)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

def predict(model, test_loader, show_sample_index=51):
    all_true_labels, all_predicted_labels, all_tokens, all_texts = [], [], [], []
    for batch_index, batch in enumerate(test_loader):
        inputs, true_labels = batch
        batch_size = inputs.size(0)
        _, logits = model(inputs.to("cuda"), true_labels.to("cuda"))
        predicted_labels = logits.max(dim=1)[1].detach().cpu()

        # Убираем неконсистентность BIO
        for sample_num, sample in enumerate(predicted_labels):
            for token_num, label in enumerate(sample):
                if token_num == 0 and label == 2:
                    predicted_labels[sample_num][0] = 1
                    continue
                prev_label = sample[token_num - 1]
                if label == 2 and prev_label == 0:
                    predicted_labels[sample_num][token_num] = 1

        all_true_labels.extend(true_labels)
        all_predicted_labels.extend(predicted_labels)
        for i in range(batch_size):
            all_tokens.append(test_data.get_tokens(batch_index * batch_size + i))
            all_texts.append(test_data.get_text(batch_index * batch_size + i))

    calc_metrics(all_true_labels, all_predicted_labels, all_tokens)
    print("PREDICTED:")
    show_box_markup(all_texts[show_sample_index],
                    get_spans(all_predicted_labels[show_sample_index], all_tokens[show_sample_index]),
                    palette=palette(PER=BLUE, ORG=RED, LOC=GREEN))

In [32]:
from deeppavlov import configs, build_model
# configs.ner.*?

text = train[0].text

model = build_model(config=configs.ner.ner_rus_bert, download=True)
model([text])

2023-04-04 12:35:18.263 INFO in 'deeppavlov.core.data.utils'['utils'] at line 95: Downloading from http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz to /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz
INFO:deeppavlov.core.data.utils:Downloading from http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch_new.tar.gz to /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz
100%|██████████| 1.44G/1.44G [00:24<00:00, 59.2MB/s]
2023-04-04 12:35:42.775 INFO in 'deeppavlov.core.data.utils'['utils'] at line 276: Extracting /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz archive into /root/.deeppavlov/models/ner_rus_bert_torch
INFO:deeppavlov.core.data.utils:Extracting /root/.deeppavlov/models/ner_rus_bert_torch_new.tar.gz archive into /root/.deeppavlov/models/ner_rus_bert_torch
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNor

[[['Д',
   '.',
   'Медведев',
   'уволил',
   'четырех',
   'генералов',
   'МВД',
   '.',
   '\r',
   '\n',
   '\r',
   '\n',
   'Президент',
   'России',
   'Дмитрий',
   'Медведев',
   'подписал',
   'указ',
   'об',
   'освобождении',
   'от',
   'должностей',
   'ряда',
   'сотрудников',
   'органов',
   'внутренних',
   'дел',
   '.',
   'Как',
   'сообщили',
   'в',
   'пресс',
   '-',
   'службе',
   'Кремля',
   ',',
   'своих',
   'должностей',
   'лишились',
   'два',
   'заместителя',
   'начальника',
   'Санкт',
   '-',
   'Петербургского',
   'университета',
   'МВД',
   'РФ',
   '-',
   'генерал',
   '-',
   'майор',
   'милиции',
   'Виктор',
   'Берекет',
   'и',
   'генерал',
   '-',
   'майор',
   'милиции',
   'Леонид',
   'Бородавко',
   '.',
   '\r',
   '\n',
   '\r',
   '\n',
   'Кроме',
   'этого',
   ',',
   'освобожден',
   'от',
   'должности',
   'начальник',
   'Уральского',
   'юридического',
   'института',
   'МВД',
   'РФ',
   'генерал',
   '-',
   'ма