### Libraries

In [1]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt
from matplotlib import gridspec
from tqdm.notebook import tqdm
import numpy as np

from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import FashionMNIST
from torchvision import datasets, transforms
import torchvision
import torch

import warnings
warnings.filterwarnings("ignore")

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Dataset & general parameters

In [3]:
# Uncomment string below to download dataset
# !wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz

In [4]:
from nerus import load_nerus
docs = load_nerus('nerus_lenta.conllu.gz')
special_tokens = ['<PAD>']
special_tags = ['<PAD>']

sentences = []
tags = []

cnt = 0
n_docs_max = 5000
for doc in tqdm(docs):
    cnt += 1
    
    for sent in doc.sents:
        sent_ = []
        tag_ = []

        for word in sent.tokens:
            tag_.append(word.pos)
            sent_.append(word.text)

        sentences.append(sent_)
        tags.append(tag_)

    if cnt > n_docs_max:
        break

0it [00:00, ?it/s]

In [5]:
print(sentences[0])
print(sentences[1])

['Вице-премьер', 'по', 'социальным', 'вопросам', 'Татьяна', 'Голикова', 'рассказала', ',', 'в', 'каких', 'регионах', 'России', 'зафиксирована', 'наиболее', 'высокая', 'смертность', 'от', 'рака', ',', 'сообщает', 'РИА', 'Новости', '.']
['По', 'словам', 'Голиковой', ',', 'чаще', 'всего', 'онкологические', 'заболевания', 'становились', 'причиной', 'смерти', 'в', 'Псковской', ',', 'Тверской', ',', 'Тульской', 'и', 'Орловской', 'областях', ',', 'а', 'также', 'в', 'Севастополе', '.']


In [6]:
set_tokens = {word for sent in sentences for word in sent}
set_tokens.difference_update(special_tokens)
list_tokens = special_tokens + list(set_tokens)

set_tags = {tag for t in tags for tag in t}
set_tags.difference_update(special_tags)
list_tags = special_tags + list(set_tags)

token_to_idx = dict(zip(list_tokens, np.arange(len(list_tokens))))
tag_to_idx = dict(zip(list_tags, np.arange(len(list_tags))))

In [7]:
train_test_boundary = int(len(sentences) * 0.9)

train_sentences = sentences[:train_test_boundary]
train_tags = tags[:train_test_boundary]
test_sentences = sentences[train_test_boundary:]
test_tags = tags[train_test_boundary:]

print(len(train_sentences), len(test_sentences))

53112 5902


In [8]:
class TaggingDataset(torch.utils.data.Dataset):
    def __init__(self, sentences, tags, token_to_idx, tag_to_idx):
        super().__init__()

        self.sentences = sentences
        self.tags = tags
        self.token_to_idx = token_to_idx
        self.tag_to_idx = tag_to_idx

        sent_index = []
        tags_index = []

        for sent in sentences:
            sequence = []

            for token in sent:
                if token in self.token_to_idx:
                    sequence.append(token_to_idx[token])
                else:
                    sequence.append(0)

            sent_index.append(sequence)

        for sent_tags in tags:
            tgs = []

            for tag in sent_tags:
                tgs.append(tag_to_idx[tag])

            tags_index.append(tgs)

        self.sent_index = sent_index
        self.tags_index = tags_index

    def __getitem__(self, idx):
        return torch.tensor(self.sent_index[idx]), torch.tensor(self.tags_index[idx])

    def __len__(self):
        return len(self.sent_index)

In [9]:
train_dataset = TaggingDataset(train_sentences, train_tags, token_to_idx, tag_to_idx)
test_dataset = TaggingDataset(test_sentences, test_tags, token_to_idx, tag_to_idx)

In [10]:
# [sentence][token/tag][word]
print(train_dataset[0])
print(train_dataset[0][0].size())
print(f"Первое слово первого предложения:                   {list_tokens[int(train_dataset[0][0][0])]}")
print(f"Второе слово первого предложения:                   {list_tokens[int(train_dataset[0][0][1])]}")
print(f"Первое слово второго предложения:                   {list_tokens[int(train_dataset[1][0][0])]}")
print(f"Часть речи (PoS) первого слова первого предложения: {list_tags[int(train_dataset[0][1][0])]}")

(tensor([58888, 71854,  6652, 62548, 64147, 88873, 42599, 96671, 50119, 49013,
        63223, 75583, 15058, 40003, 43491, 97324, 34959, 32389, 96671, 43713,
        37086, 30485, 87631]), tensor([ 5, 14,  9,  5,  2,  2, 15, 11, 14, 12,  5,  2, 15,  3,  9,  5, 14,  5,
        11, 15,  2,  2, 11]))
torch.Size([23])
Первое слово первого предложения:                   Вице-премьер
Второе слово первого предложения:                   по
Первое слово второго предложения:                   По
Часть речи (PoS) первого слова первого предложения: NOUN


In [11]:
class PaddingCollator:
    """
    Реализует padding всех "предложений" в batch-e до размера самого большого предложения.
    """
    def __init__(self, pad_token_id, pad_tag_id):
        self.pad_token_idx = pad_token_id
        self.pad_tag_id = pad_tag_id
        
    def __call__(self, batch):
        # elem[0] -- тензор из токенов-слов
        # elem[1] -- тензор из тэгов-POS
        max_len = 0
        for elem in batch:
            if (len(elem[0]) > max_len):
                max_len = len(elem[0])

        new_sentences = []
        new_tags = []

        for elem in batch:
            new_sentences.append(torch.nn.functional.pad(elem[0], (0, max_len - len(elem[0])), "constant", self.pad_token_idx))
            new_tags.append(torch.nn.functional.pad(elem[1], (0, max_len - len(elem[1])), "constant", self.pad_tag_id))

        return torch.stack(new_sentences), torch.stack(new_tags)

In [12]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=False,
    drop_last=False,
    collate_fn=PaddingCollator(
        pad_token_id=token_to_idx['<PAD>'],
        pad_tag_id=tag_to_idx['<PAD>'],
    )
)

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    drop_last=False,
    collate_fn=PaddingCollator(
        pad_token_id=token_to_idx['<PAD>'],
        pad_tag_id=tag_to_idx['<PAD>'],
    )
)

### General training code

Код для обучения, формат которого взят с семинаров.

In [13]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    global print_once
    
    model.train()
    model.zero_grad()

    x_batch = x_batch.to(device)
    y_batch = y_batch.to(device)

    output = model(x_batch)
    # (N, L, T) -> (N, T, L)
    output = torch.transpose(output, 1, 2)
    # Мы используем CrossEntropyLoass, которая ожидает input=(N, T, L) и target=(N, L)
    # NOTE: В документации T соответствует C, L соответствует d1
    loss = loss_function(output, y_batch)

    loss.backward()
    optimizer.step()

    return loss.cpu().item()

In [14]:
def train_epoch(train_generator, model, loss_function, optimizer, callback):
    epoch_loss = 0
    total = 0

    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        batch_loss = train_on_batch(
            model, batch_of_x.to(device), batch_of_y.to(device), optimizer, loss_function)

        if callback is not None:
            callback(model, batch_loss)

        epoch_loss += batch_loss * len(batch_of_x)
        total += len(batch_of_x)

    return epoch_loss / total

In [15]:
def trainer(count_of_epoch,
            model,
            dataset_loader,
            loss_function,
            optimizer,
            lr=0.001,
            callback=None):
    optima = optimizer(model.parameters(), lr=lr, weight_decay=1e-5)

    iterations = tqdm(range(count_of_epoch))

    for it in iterations:
        epoch_loss = train_epoch(
            train_generator=dataset_loader, model=model,
            loss_function=loss_function,
            optimizer=optima,
            callback=callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})

In [16]:
def quality_of_train(dataset_loader,
                     model,
                     loss_function):
    pred = []
    real = []
    test_loss = 0
    total = 0

    for it, (sentences, tags) in enumerate(dataset_loader):
        sentences = sentences.to(device)
        tags = tags.to(device)

        output = model(sentences)

        pred.extend(torch.argmax(output, dim=2).cpu().numpy().flatten().tolist())
        real.extend(tags.cpu().numpy().flatten().tolist())

        output = torch.transpose(output, 1, 2)
        test_loss += loss_function(output, tags).cpu().item() * len(sentences)
        total += len(sentences)

    test_loss /= total

    return test_loss, pred, real

### Модель LSTM

In [17]:
class LSTMTagger(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout, num_layers):
        super(LSTMTagger, self).__init__()

        # Идентичен nn.Linear(vocab_size, embedding_dim) где
        # каждое слово из словаря является one-hot вектором, len = vocab_size
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)

        # Учитывая batch_first=True, ожидает на вход тензор размерами (N, L, H_in), где
        # N -- количество элементов в batch-e
        # L -- длина последовательности (количество токенов в "предложении")
        # H_in -- embedding_dim
        #
        # Учитывая batch_first=True, на выходе тензор (N,L,D*H_out), где
        # L -- см. выше
        # D -- directions (1 в случае bidirectional=False)
        # H_out -- hidden_dim
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=dropout, num_layers=num_layers)

        # После него на выходе (N,L,T), где
        # T -- количество возможных тэгов
        self.linear = torch.nn.Linear(hidden_dim, tagset_size)

    def forward(self, x_batch):
        embeddings = self.word_embeddings(x_batch)

        d_n, (h_n, c_n) = self.lstm(embeddings)
        return self.linear(d_n)

### Tensorboard training tracking

In [18]:
class callback():
    def __init__(self, writer, dataset_loader, loss_function, delimeter=100):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function

        self.dataset_loader = dataset_loader

    def forward(self, model, loss):
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)
        model.eval()

        if self.step % self.delimeter == 0:
            test_loss, pred, real = quality_of_train(dataset_loader=self.dataset_loader,
                                                     model=model, loss_function=self.loss_function)
            self.writer.add_scalar('LOSS/test', test_loss, self.step)

            indices = (real != tag_to_idx['<PAD>']).nonzero()

            real = np.array(real)[indices]
            pred = np.array(pred)[indices]

            self.writer.add_scalar('Valid/acc', accuracy_score(real, pred), self.step)
            self.writer.add_text('Valid/report', str(classification_report(real, pred)), self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)

### LSTM training

In [19]:
loss_function = torch.nn.CrossEntropyLoss(ignore_index=tag_to_idx['<PAD>'])
optimizer = torch.optim.Adam

In [20]:
grid = ParameterGrid({
    'num_layers' : [1, 2, 3],
    'embedding_dim': [150, 300, 450],
    'hidden_dim': [150, 300, 450],
    'dropout': [0.0, 0.3]
})

for item in tqdm(grid):
    print(str(item))

    model = LSTMTagger(
        vocab_size=len(token_to_idx),
        tagset_size=len(tag_to_idx),
        **item
    )

    writer = SummaryWriter('experiments/' + str(item))

    model.float().to(device)

    call = callback(writer, test_dataloader, loss_function, delimeter=300)

    trainer(count_of_epoch=3,
        dataset_loader=train_dataloader,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.001,
        callback=call)

  0%|          | 0/54 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 150, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 150, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 150, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 300, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 450, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 450, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 150, 'hidden_dim': 450, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 150, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 150, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 150, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 450, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 450, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 450, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 150, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 150, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 150, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 300, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 450, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 450, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.0, 'embedding_dim': 450, 'hidden_dim': 450, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 150, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 150, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 150, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 300, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 450, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 450, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 150, 'hidden_dim': 450, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 150, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 150, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 150, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 450, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 450, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 300, 'hidden_dim': 450, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 150, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 150, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 150, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 300, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 450, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 450, 'num_layers': 2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'dropout': 0.3, 'embedding_dim': 450, 'hidden_dim': 450, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

### Conclusions

Написана обвязка для dataset-a, реализована модель LSTM, проведен grid-search по гиперпараметрам, выполнена оценка loss и accuracy.

Результаты:
- Все модели имеют очень близкое качество
- Количество слоев почти не влияет на качество модели
- Dropout почти не влияет на качество модели
- Самое значительное влияние оказывает размерность входа и выхода LSTM
- Самая простая модель (1, 150, 150) обучилась в 5 раз быстрее самой сложной (3, 450, 450)

Модели показали высокую точность (0.93-0.94), причем усложнение моделей не дало сильного прироста. Значения на train и test отличаются не критично, то есть модели не переобучились.