In [1]:
import torch
from torch.utils.tensorboard import SummaryWriter

from torch.utils.data import Dataset

import torchvision
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

from tqdm import tqdm

from tqdm.notebook import tqdm
import numpy as np

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

device

'cuda'

## Фиксируем seed

In [4]:
# One shared seed for PyTorch (default and CUDA generators) and for NumPy's
# global generator. The calls are independent of each other.
random_seed = 48

torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
np.random.seed(random_seed)

In [5]:
# Open the Nerus corpus file; `docs` is iterated by the extraction loop that follows.
# NOTE(review): the path is relative, so the file is expected in the notebook's
# working directory — confirm before running on another machine.
from nerus import load_nerus
docs = load_nerus('nerus_lenta.conllu.gz')

In [6]:
# Collect, sentence by sentence, the token texts and their POS tags from the
# first `n_docs` documents of the corpus.
n_docs = 3000

sents = []
tags  = []

for doc_number, doc in enumerate(docs):
    # Stop as soon as `n_docs` documents have been processed.
    if doc_number >= n_docs:
        break

    for sent in doc.sents:
        # Single pass over the tokens; text and tag stay aligned by position.
        pairs = [(token.text, token.pos) for token in sent.tokens]

        sents.append([text for text, _ in pairs])
        tags.append([pos for _, pos in pairs])

In [7]:
# Padding symbols for the token and the tag vocabulary (two separate lists).
padding_tokens, padding_tags = ['<PAD>'], ['<PAD>']

In [8]:
# Vocabularies: the padding symbol comes first, followed by every distinct
# tag / token. The distinct values are gathered with a set comprehension
# (linear, unlike `sum(list_of_lists, [])`, which re-copies the accumulator on
# every step) and sorted, so the order does not depend on the per-process
# string hash seed and is the same on every run.
unique_tags   = padding_tags   + sorted({tag for sent_tags in tags for tag in sent_tags})
unique_tokens = padding_tokens + sorted({token for sent in sents for token in sent})

In [9]:
# Map every vocabulary entry to its position in the list. The ids come from
# np.arange, so the dictionary values are NumPy integer scalars.
token_to_idx = {token: idx for token, idx in zip(unique_tokens, np.arange(len(unique_tokens)))}
tag_to_idx   = {tag: idx for tag, idx in zip(unique_tags, np.arange(len(unique_tags)))}

In [10]:
# Sequential 80/20 split: the first 80% of the sentences train, the rest test.
train_test_bnd = int(len(sents) * 0.8)

train_sentences, test_sentences = sents[:train_test_bnd], sents[train_test_bnd:]
train_tags, test_tags = tags[:train_test_bnd], tags[train_test_bnd:]

print(len(train_sentences), len(test_sentences))

28312 7079


In [11]:
#
# Creating a Custom Dataset for your files
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files
#
class TagsDataset(torch.utils.data.Dataset):
    """Dataset of (token ids, tag ids) tensor pairs, one pair per sentence.

    Args:
        sentences: sequence of sentences, each a sequence of tokens.
        tags: sequence of tag sequences, one per sentence.
        token_to_idx: mapping from token to integer id.
        tag_to_idx: mapping from tag to integer id.

    Raises:
        KeyError: if a token or a tag is missing from its mapping.
    """

    def __init__(self, sentences, tags, token_to_idx, tag_to_idx):
        super().__init__()

        self.sentences    = sentences
        self.tags         = tags
        self.token_to_idx = token_to_idx
        self.tag_to_idx   = tag_to_idx

        # Encode everything once up front; __getitem__ only wraps the id lists.
        self.sent_index = [self._encode(sent, self.token_to_idx, 'token') for sent in sentences]
        self.tags_index = [self._encode(sent_tags, self.tag_to_idx, 'tag') for sent_tags in tags]

    @staticmethod
    def _encode(items, mapping, kind):
        """Translate `items` to ids via `mapping`; `kind` only labels the error message."""
        encoded = []
        for item in items:
            # Explicit check instead of `assert`, which is stripped under `python -O`.
            if item not in mapping:
                raise KeyError(f'unknown {kind}: {item!r}')
            encoded.append(mapping[item])
        return encoded

    def __getitem__(self, idx):
        """Return the `idx`-th sentence as a pair of int64 tensors (token ids, tag ids)."""
        # dtype is explicit because torch.tensor([]) would otherwise be float32,
        # so an empty sentence would produce a non-integer id tensor.
        return (torch.tensor(self.sent_index[idx], dtype=torch.long),
                torch.tensor(self.tags_index[idx], dtype=torch.long))

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sent_index)

In [12]:
#
# Provide padding for DataLoader
#
class Padding:
    """Collate callable: right-pads each (tokens, tags) pair of a batch to the
    length of the batch's longest token sequence and stacks the results."""

    def __init__(self, pad_token_id, pad_tag_id):
        self.pad_token_id = pad_token_id
        self.pad_tag_id = pad_tag_id

    def __call__(self, batch):
        # Target width: the longest token sequence in this batch (0 if there is none).
        width = max((len(tokens) for tokens, _ in batch), default=0)
        pad = torch.nn.functional.pad

        padded_tokens = [
            pad(tokens, (0, width - len(tokens)), "constant", self.pad_token_id)
            for tokens, _ in batch
        ]
        padded_tags = [
            pad(tags, (0, width - len(tags)), "constant", self.pad_tag_id)
            for _, tags in batch
        ]

        return torch.stack(padded_tokens), torch.stack(padded_tags)

## Обучение модели (код с семинара)

In [13]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Run one optimisation step on a single batch and return its loss as a float.

    The model is put into training mode, its gradients are cleared, and the
    batch is moved to the module-level `device` before the forward pass.
    """
    model.train()
    model.zero_grad()

    inputs, targets = x_batch.to(device), y_batch.to(device)

    # The loss is called with dims 1 and 2 swapped: (N, L, T) -> (N, T, L).
    scores = torch.transpose(model(inputs), 1, 2)
    batch_loss = loss_function(scores, targets)

    batch_loss.backward()
    optimizer.step()

    return batch_loss.cpu().item()

In [14]:
def train_epoch(train_generator, model, loss_function, optimizer, callback):
    """Train `model` for one pass over `train_generator`.

    Args:
        train_generator: iterable of (x_batch, y_batch) pairs.
        model, loss_function, optimizer: forwarded to `train_on_batch`.
        callback: optional callable invoked as `callback(model, batch_loss)`
            after every batch; pass None to disable.

    Returns:
        The mean batch loss weighted by batch size, or NaN when the generator
        yields no samples (previously this raised ZeroDivisionError).
    """
    epoch_loss = 0
    total = 0

    for batch_of_x, batch_of_y in train_generator:
        # No device transfer here: train_on_batch moves both tensors itself.
        batch_loss = train_on_batch(
            model, batch_of_x, batch_of_y, optimizer, loss_function)

        if callback is not None:
            callback(model, batch_loss)

        batch_size = len(batch_of_x)
        epoch_loss += batch_loss * batch_size
        total += batch_size

    if total == 0:
        return float('nan')

    return epoch_loss / total

In [15]:
def trainer(count_of_epoch,
            model,
            dataset_loader,
            loss_function,
            optimizer,
            lr=0.001,
            callback=None,
            weight_decay=1e-5):
    """Train `model` for `count_of_epoch` epochs with a progress bar.

    Args:
        count_of_epoch: number of passes over `dataset_loader`.
        model: module whose `parameters()` are optimised.
        dataset_loader: iterable of (x_batch, y_batch) pairs.
        loss_function: forwarded to `train_epoch`.
        optimizer: optimizer factory (not an instance); it is called as
            `optimizer(model.parameters(), lr=..., weight_decay=...)`.
        lr: learning rate passed to the factory.
        callback: optional per-batch callable forwarded to `train_epoch`.
        weight_decay: weight decay passed to the factory; the default keeps
            the value that used to be hard-coded.
    """
    optima = optimizer(model.parameters(), lr=lr, weight_decay=weight_decay)

    iterations = tqdm(range(count_of_epoch))

    for _ in iterations:
        epoch_loss = train_epoch(
            train_generator=dataset_loader, model=model,
            loss_function=loss_function,
            optimizer=optima,
            callback=callback)

        # Show the latest epoch loss next to the progress bar.
        iterations.set_postfix({'train epoch loss': epoch_loss})

In [16]:
def quality_of_train(dataset_loader,
                     model,
                     loss_function):
    """Evaluate `model` on every batch of `dataset_loader`.

    The forward passes run under `torch.no_grad()`, so no autograd graph is
    built during evaluation. The function does not change the model's
    train/eval mode; the caller is expected to set it.

    Returns:
        (loss, pred, real): the mean loss weighted by batch size (NaN when the
        loader yields no samples), the flattened argmax predictions, and the
        flattened target ids. Padding positions are included in both lists.
    """
    pred = []
    real = []
    test_loss = 0
    total = 0

    with torch.no_grad():
        for sentences, tags in dataset_loader:
            sentences = sentences.to(device)
            tags = tags.to(device)

            output = model(sentences)

            # Highest-scoring class along the last dim of the (N, L, T) output.
            pred.extend(torch.argmax(output, dim=2).cpu().numpy().flatten().tolist())
            real.extend(tags.cpu().numpy().flatten().tolist())

            # The loss is called with dims 1 and 2 swapped: (N, L, T) -> (N, T, L).
            output = torch.transpose(output, 1, 2)
            test_loss += loss_function(output, tags).cpu().item() * len(sentences)
            total += len(sentences)

    if total == 0:
        return float('nan'), pred, real

    test_loss /= total

    return test_loss, pred, real

## Модель LSTM

In [17]:
class LSTM(torch.nn.Module):
    """Sequence tagger: embedding -> (multi-layer) LSTM -> per-token linear scores.

    Args:
        embedding_dim: size of the token embeddings (LSTM input size).
        hidden_dim: LSTM hidden size.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of output scores per token.
        dropout: dropout between stacked LSTM layers; only used when
            `num_layers > 1`.
        num_layers: number of stacked LSTM layers.
        batchnorm_threshold: despite its name, this value is passed to the
            embedding layer as `max_norm`.
    """

    @property
    def device(self):
        """Device of the first model parameter."""
        return next(self.parameters()).device

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout, num_layers, batchnorm_threshold):
        super().__init__()

        # Sequence [1, 2, 3, 123, 33, PAD ...] --> [1.23, 0.23, 0.11] (size == dim)
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim, max_norm = batchnorm_threshold)

        #
        # Input: (N, L, H_in)
        # N - batch size
        # L - sequence length
        # H_in - input size
        #
        # Output: (N, L, H_out)
        # N - batch size
        # L - sequence length
        # H_out - hidden size
        #
        # See: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        #
        # torch.nn.LSTM applies dropout only between stacked layers and warns
        # when a non-zero value is combined with a single layer, so the value
        # is passed through only when it can take effect.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=lstm_dropout, num_layers=num_layers)
        self.linear = torch.nn.Linear(hidden_dim, tagset_size)

    def forward(self, x_batch):
        """Map a batch of token ids (N, L) to per-token scores (N, L, tagset_size)."""
        embeddings = self.word_embeddings(x_batch)
        # Only the per-step outputs are used; the final hidden/cell states are dropped.
        d_n, _ = self.lstm(embeddings)
        return self.linear(d_n)

## Tensorboard

In [18]:
class callback():
    """Per-batch training hook that logs scalars to a TensorBoard-style writer.

    Every call logs the training loss. Every `delimeter`-th call additionally
    evaluates the model on `dataset_loader` and logs the test loss and the
    accuracy over non-padding positions.

    Args:
        writer: object with `add_scalar(tag, value, step)`.
        dataset_loader: loader passed to `quality_of_train` for evaluation.
        loss_function: loss passed to `quality_of_train`.
        delimeter: evaluation period, in callback calls.
        pad_tag_id: tag id to exclude from the accuracy; when None, the id is
            looked up as `tag_to_idx['<PAD>']` in the module namespace at
            evaluation time.
    """

    def __init__(self, writer, dataset_loader, loss_function, delimeter=100, pad_tag_id=None):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.pad_tag_id = pad_tag_id

        self.dataset_loader = dataset_loader

    def forward(self, model, loss):
        """Log `loss` for this step and, periodically, the evaluation metrics."""
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)
        model.eval()

        if self.step % self.delimeter == 0:
            test_loss, pred, real = quality_of_train(dataset_loader=self.dataset_loader,
                                                     model=model, loss_function=self.loss_function)
            self.writer.add_scalar('LOSS/test', test_loss, self.step)

            pad_id = tag_to_idx['<PAD>'] if self.pad_tag_id is None else self.pad_tag_id

            # Convert to arrays BEFORE comparing: `list != pad_id` is elementwise
            # only when pad_id happens to be a NumPy scalar; with a plain int it
            # evaluates to a single bool.
            real = np.asarray(real)
            pred = np.asarray(pred)
            keep = real != pad_id

            self.writer.add_scalar('VALID/acc', accuracy_score(real[keep], pred[keep]), self.step)

    def __call__(self, model, loss):
        return self.forward(model, loss)

In [19]:
# Both splits are encoded with the same token and tag vocabularies.
train_dataset = TagsDataset(
    sentences=train_sentences,
    tags=train_tags,
    token_to_idx=token_to_idx,
    tag_to_idx=tag_to_idx,
)
test_dataset = TagsDataset(
    sentences=test_sentences,
    tags=test_tags,
    token_to_idx=token_to_idx,
    tag_to_idx=tag_to_idx,
)

In [20]:
# The train and test loaders share one configuration: batches of 4, original
# order, no dropped remainder, and a fresh Padding collate function each.
# NOTE(review): the training loader is not shuffled — confirm this is intended.
train_dataloader, test_dataloader = (
    torch.utils.data.DataLoader(
        dataset,
        batch_size=4,
        shuffle=False,
        drop_last=False,
        collate_fn=Padding(
            pad_token_id=token_to_idx['<PAD>'],
            pad_tag_id=tag_to_idx['<PAD>'],
        ),
    )
    for dataset in (train_dataset, test_dataset)
)

In [21]:
# `optimizer` is the Adam class itself, not an instance: `trainer` instantiates it.
optimizer = torch.optim.Adam

# Positions whose target is the padding tag are excluded from the loss.
pad_tag_id = tag_to_idx['<PAD>']
loss_function = torch.nn.CrossEntropyLoss(ignore_index=pad_tag_id)

In [22]:
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter — it also hides warnings unrelated to
# the training loop; consider narrowing it to the specific message.
import warnings
warnings.simplefilter("ignore")

In [23]:
# Exhaustive search over LSTM hyper-parameters; every key is forwarded to the
# LSTM constructor as a keyword argument ('batchnorm_threshold' ends up as the
# embedding's max_norm).
grid = ParameterGrid({
    'num_layers' : [1, 3],
    'embedding_dim': [100, 300],
    'hidden_dim': [100, 300],
    'dropout': [0.0, 0.25],
    'batchnorm_threshold': [1, 10000],
})

for params in tqdm(grid):
    print(str(params))

    model = LSTM(
        vocab_size  = len(token_to_idx),
        tagset_size = len(tag_to_idx),
        **params
    )
    model.to(device)

    # One TensorBoard run directory per configuration.
    writer = SummaryWriter(f'runs/{str(params)}')

    try:
        call = callback(writer, test_dataloader, loss_function, delimeter=300)

        trainer(count_of_epoch=3,
            dataset_loader=train_dataloader,
            model=model,
            loss_function=loss_function,
            optimizer=optimizer,
            lr=0.001,
            callback=call)
    finally:
        # Flush and release the event file even when the run is interrupted
        # (e.g. by KeyboardInterrupt), so already-logged scalars are not lost.
        writer.close()

  0%|          | 0/32 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 100, 'hidden_dim': 100, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 100, 'hidden_dim': 100, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 100, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 100, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 100, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 100, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.0, 'embedding_dim': 300, 'hidden_dim': 300, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.25, 'embedding_dim': 100, 'hidden_dim': 100, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.25, 'embedding_dim': 100, 'hidden_dim': 100, 'num_layers': 3}


  0%|          | 0/3 [00:00<?, ?it/s]

{'batchnorm_threshold': 1, 'dropout': 0.25, 'embedding_dim': 100, 'hidden_dim': 300, 'num_layers': 1}


  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 