# Stage 2

On this stage we want to compare the performance of a small neural model trained (BERT) on two different types of annotations:
    (1) annotations generated using the best method from the first stage of the project,
    (2) the original, ground-truth annotations provided in the Coll2003 dataset.

Importing required libraries:

In [32]:
%pip install --quiet datasets

In [33]:
from datasets import load_dataset
from collections import defaultdict, Counter
from tqdm import trange, tqdm
from dataclasses import dataclass
import pathlib
import os
import sys
import json
import itertools

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from torch.nn.utils.rnn import pad_sequence

import numpy as np

from transformers import BertTokenizer, BertModel, BertForTokenClassification


In [34]:
API_URL = "https://llm.ispras.ru/api/chat/completions"
API_MODEL_URL = "https://llm.ispras.ru/api/models"
API_KEY = "YOUR_TOKEN"
with open('./secrets') as file:
    data: dict = json.load(file)
    API_KEY = data.get('API_KEY', 'FAILED TO LOAD')

HIDDEN = 512
NER_TAGS = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
ADDITIONAL = {"[CLS]": 9, "[SEP]": 10, "X": 11}
NER_TAGS.update(ADDITIONAL)
TAGS_COUNT = len(NER_TAGS)
DATABASE_DIR = './data'
EMBEDDINGS_PATH = f'{DATABASE_DIR}/embeddings.txt'
CWD = '~/Рабочий стол/NERC_LLM_Ispras/data'
BERT_MODEL = 'bert-base-cased'
BATCH_SIZE = 32
EPOCHS = 5

## Initalizing Neural Network Model

Model will consist of:
    (1) *pre-trained BERT model* for word embeddings, which captures contextual information effectively.
    (2) *bidirectional LSTM (BiLSTM) layer* to process the sequence of embeddings and capture dependencies between words.
    (3) *linear layer* to map the LSTM outputs to the NER tags.

In [35]:
class NERSmall(BertForTokenClassification):
    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None, label_masks=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        sequence_output = outputs[0]  # (b, MAX_LEN, 768)

        token_reprs = [embedding[mask] for mask, embedding in zip(label_masks, sequence_output)]
        token_reprs = pad_sequence(sequences=token_reprs, batch_first=True,
                                   padding_value=-1)  # (b, local_max_len, 768)
        sequence_output = self.dropout(token_reprs)
        logits = self.classifier(sequence_output)  # (b, local_max_len, num_labels)

        outputs = (logits,)
        if labels is not None:
            labels = [label[mask] for mask, label in zip(label_masks, labels)]
            labels = pad_sequence(labels, batch_first=True, padding_value=-1)  # (b, local_max_len)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum')
            mask = labels != -1
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            loss /= mask.float().sum()
            outputs = (loss,) + outputs + (labels,)

        return outputs  # (loss), scores, (hidden_states), (attentions)


## Parsing and Loading Data

To organize the data, we define a simple data structure `InputExample`:

In [36]:
@dataclass
class InputExample:
    guid: str
    text: str
    ner_tag: list[str]

We make `parse_dataformat` function to processe raw text data, splitting it into sentences and their corresponding NER tags. We make it handle empty lines and document separators. It organizes the data into a list of InputExample objects.

In [37]:
def parse_dataformat(base_data: str, set_type: str) -> list[InputExample]:
    data = []
    sentence = []
    ner_tags = []
    for line in base_data.splitlines():
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                data.append((sentence, ner_tags))
                sentence = []
                ner_tags = []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        ner_tags.append(splits[-1])

    if len(sentence) > 0:
        data.append((sentence, ner_tags))
        sentence = []
        ner_tags = []

    return [InputExample(f'{set_type}-{i}', ' '.join(sentence), ner_tag) for i, (sentence, ner_tag) in enumerate(data)]

def load_examples(data_dir) -> tuple[list[InputExample], list[InputExample], list[InputExample]]:
    with open(pathlib.Path.joinpath(pathlib.Path(data_dir), 'test.txt')) as test, open(pathlib.Path.joinpath(pathlib.Path(data_dir), 'train.txt')) as train, open(pathlib.Path.joinpath(pathlib.Path(data_dir), 'valid.txt')) as valid:
        test_data, train_data, valid_data = test.read(), train.read(), valid.read()
    return parse_dataformat(test_data, 'test'), parse_dataformat(train_data, 'train'), parse_dataformat(valid_data, 'valid')


## Custom Dataset


Now we need to create `NERDataSet` class that prepares the data for training. It tokenizes the input text, maps NER tags to their corresponding IDs, and pads sequences to a fixed length. The __getitem__ method processes each example by adding special tokens [CLS] and [SEP], tokenizing the text, and creating attention masks and sentence IDs. The method returns tensors for input IDs, NER tag IDs, attention masks, sentence IDs, and tag masks, which are used by the model during training.

## Training Environment

We are using BERT Tokenizer from `from_pretrained`.

The dataset is loaded into training, validation, and test sets using the load_examples function. DataLoader objects are created for each dataset, enabling efficient batching and shuffling of the data during training and evaluation.

In [38]:
class NERDataSet(Dataset):
    def __init__(self, data: list[InputExample], tokenizer: BertTokenizer, ner_tag_map: dict[str, int], max_len: int = 128):
        self._max_len = max_len
        self._ner_tag_map = ner_tag_map
        self._data = data
        self._tokenizer = tokenizer

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx: int):
        input_example = self._data[idx]

        text = input_example.text
        ner_tags = input_example.ner_tag
        word_tokens = ['[CLS]']
        ner_tag_list = ['[CLS]']
        ner_tag_mask = [0]  # value in {0,1} -- 0 signifies invalid token

        input_ids = [self._tokenizer.convert_tokens_to_ids('[CLS]')]
        ner_tag_ids = [self._ner_tag_map['[CLS]']]

        for word, ner_tag in zip(text.split(), ner_tags):
            tokenized_word = self._tokenizer.tokenize(word)

            word_tokens.extend(tokenized_word)
            assert len(tokenized_word) > 0
            input_ids.extend(map(self._tokenizer.convert_tokens_to_ids, tokenized_word))

            ner_tag_list.append(ner_tag)
            ner_tag_ids.append(self._ner_tag_map[ner_tag])
            ner_tag_mask.append(1)
            # len(tokenized_word) > 1 only if it splits word in between, in which case
            # the first token gets assigned NER tag and the remaining ones get assigned
            # X
            ner_tag_list.extend(itertools.repeat('X', len(tokenized_word) - 1))
            ner_tag_ids.extend(itertools.repeat(self._ner_tag_map['X'], len(tokenized_word) - 1))
            ner_tag_mask.extend(itertools.repeat(0, len(tokenized_word) - 1))

        assert len(word_tokens) == len(ner_tag_list) == len(input_ids) == len(ner_tag_ids) == len(ner_tag_mask)

        if len(word_tokens) >= self._max_len:
            word_tokens = word_tokens[:(self._max_len - 1)]
            ner_tag_list = ner_tag_list[:(self._max_len - 1)]
            input_ids = input_ids[:(self._max_len - 1)]
            ner_tag_ids = ner_tag_ids[:(self._max_len - 1)]
            ner_tag_mask = ner_tag_mask[:(self._max_len - 1)]

        assert len(word_tokens) < self._max_len, len(word_tokens)

        word_tokens.append('[SEP]')
        ner_tag_list.append('[SEP]')
        input_ids.append(self._tokenizer.convert_tokens_to_ids('[SEP]'))
        ner_tag_ids.append(self._ner_tag_map['[SEP]'])
        ner_tag_mask.append(0)

        assert len(word_tokens) == len(ner_tag_list) == len(input_ids) == len(ner_tag_ids) == len(ner_tag_mask)

        sentence_id = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)

        left = self._max_len - len(input_ids)
        input_ids.extend(itertools.repeat(0, left))
        ner_tag_ids.extend(itertools.repeat(self._ner_tag_map['X'], left))
        attention_mask.extend(itertools.repeat(0, left))
        sentence_id.extend(itertools.repeat(0, left))
        ner_tag_mask.extend(itertools.repeat(0, left))


        assert len(word_tokens) == len(ner_tag_list)
        assert len(input_ids) == len(ner_tag_ids) == len(attention_mask) == len(sentence_id) == len(
            ner_tag_mask) == self._max_len, len(input_ids)
        return torch.LongTensor(input_ids), torch.LongTensor(ner_tag_ids), torch.LongTensor(
            attention_mask), torch.LongTensor(sentence_id), torch.BoolTensor(ner_tag_mask)

    @staticmethod
    def collate_fn(batch):
        return {
            'input_ids': torch.stack([x['input_ids'] for x in batch]),
            'attention_mask': torch.tensor([x['attention_mask'] for x in batch])
        }

## Training Environment

We are using BERT Tokenizer from `from_pretrained`.

The dataset is loaded into training, validation, and test sets using the load_examples function. DataLoader objects are created for each dataset, enabling efficient batching and shuffling of the data during training and evaluation.

In [39]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
test, train, valid = load_examples(DATABASE_DIR)
test = NERDataSet(test, tokenizer, NER_TAGS)
train = NERDataSet(train, tokenizer, NER_TAGS)
valid = NERDataSet(valid, tokenizer, NER_TAGS)
train_iter = DataLoader(dataset=train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
eval_iter = DataLoader(dataset=valid, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_iter = DataLoader(dataset=test, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


We will use *CrossEntropy* Loss Function and Stochastic Gradient Descent with a learning rate of 0.001 and momentum of 0.9 to accelerate convergence.

In [40]:
model = NERSmall.from_pretrained(BERT_MODEL, num_labels=TAGS_COUNT).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training the Model

In [41]:
for epoch in trange(EPOCHS, desc='Epoch'):
    running_loss = 0.0
    tr_loss = 0
    nb_tr_steps = 0
    for batch in tqdm(train_iter):
        # input_ids, labels, input_mask, token_type_ids, label_masks = batch
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels, b_input_mask, b_token_type_ids, b_label_masks = batch
        optimizer.zero_grad()

        loss, logits, labels = model(b_input_ids, token_type_ids=b_token_type_ids,
                                         attention_mask=b_input_mask, labels=b_labels,
                                         label_masks=b_label_masks)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # track train loss
        tr_loss += loss.item()
        nb_tr_steps += 1
        # update parameters
        optimizer.step()
        model.zero_grad()
        # print train loss per epoch
    print(f"Train loss: {tr_loss / nb_tr_steps}")

print('Finished Training')

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:47,  1.26it/s][A
  0%|          | 2/439 [00:01<04:56,  1.47it/s][A
  1%|          | 3/439 [00:02<04:45,  1.53it/s][A
  1%|          | 4/439 [00:02<04:37,  1.57it/s][A
  1%|          | 5/439 [00:03<04:41,  1.54it/s][A
  1%|▏         | 6/439 [00:03<04:44,  1.52it/s][A
  2%|▏         | 7/439 [00:04<04:40,  1.54it/s][A
  2%|▏         | 8/439 [00:05<04:35,  1.56it/s][A
  2%|▏         | 9/439 [00:05<04:32,  1.58it/s][A
  2%|▏         | 10/439 [00:06<04:31,  1.58it/s][A
  3%|▎         | 11/439 [00:07<04:30,  1.58it/s][A
  3%|▎         | 12/439 [00:07<04:30,  1.58it/s][A
  3%|▎         | 13/439 [00:08<04:29,  1.58it/s][A
  3%|▎         | 14/439 [00:08<04:26,  1.60it/s][A
  3%|▎         | 15/439 [00:09<04:26,  1.59it/s][A
  4%|▎         | 16/439 [00:10<04:25,  1.59it/s][A
  4%|▍         | 17/439 [00:10<04:28,  1.57it/s][A
  4%|▍         | 18/439 [00:11<04:36,

Train loss: 0.19989937151186082



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:09,  1.41it/s][A
  0%|          | 2/439 [00:01<04:41,  1.55it/s][A
  1%|          | 3/439 [00:01<04:29,  1.62it/s][A
  1%|          | 4/439 [00:02<04:27,  1.62it/s][A
  1%|          | 5/439 [00:03<04:22,  1.65it/s][A
  1%|▏         | 6/439 [00:03<04:20,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:19,  1.67it/s][A
  2%|▏         | 8/439 [00:04<04:17,  1.67it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 14/439 [00:08<04:12,  1.68it/s][A
  3%|▎         | 15/439 [00:09<04:13,  1.67it/s][A
  4%|▎         | 16/439 [00:09<04:11,  1.68it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.68it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.68it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.05360179376875417



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:07,  1.42it/s][A
  0%|          | 2/439 [00:01<04:40,  1.56it/s][A
  1%|          | 3/439 [00:01<04:28,  1.63it/s][A
  1%|          | 4/439 [00:02<04:25,  1.64it/s][A
  1%|          | 5/439 [00:03<04:22,  1.65it/s][A
  1%|▏         | 6/439 [00:03<04:20,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:20,  1.66it/s][A
  2%|▏         | 8/439 [00:04<04:19,  1.66it/s][A
  2%|▏         | 9/439 [00:05<04:17,  1.67it/s][A
  2%|▏         | 10/439 [00:06<04:15,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 14/439 [00:08<04:12,  1.69it/s][A
  3%|▎         | 15/439 [00:09<04:11,  1.68it/s][A
  4%|▎         | 16/439 [00:09<04:11,  1.68it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.69it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.03490082435638714



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:49,  1.25it/s][A
  0%|          | 2/439 [00:01<04:56,  1.47it/s][A
  1%|          | 3/439 [00:01<04:36,  1.58it/s][A
  1%|          | 4/439 [00:02<04:30,  1.61it/s][A
  1%|          | 5/439 [00:03<04:25,  1.64it/s][A
  1%|▏         | 6/439 [00:03<04:21,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:18,  1.67it/s][A
  2%|▏         | 8/439 [00:04<04:17,  1.68it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 13/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 14/439 [00:08<04:11,  1.69it/s][A
  3%|▎         | 15/439 [00:09<04:10,  1.69it/s][A
  4%|▎         | 16/439 [00:09<04:09,  1.70it/s][A
  4%|▍         | 17/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.024840473800426633



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:15,  1.39it/s][A
  0%|          | 2/439 [00:01<04:42,  1.55it/s][A
  1%|          | 3/439 [00:01<04:29,  1.62it/s][A
  1%|          | 4/439 [00:02<04:25,  1.64it/s][A
  1%|          | 5/439 [00:03<04:21,  1.66it/s][A
  1%|▏         | 6/439 [00:03<04:19,  1.67it/s][A
  2%|▏         | 7/439 [00:04<04:17,  1.68it/s][A
  2%|▏         | 8/439 [00:04<04:16,  1.68it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 14/439 [00:08<04:13,  1.68it/s][A
  3%|▎         | 15/439 [00:09<04:11,  1.69it/s][A
  4%|▎         | 16/439 [00:09<04:10,  1.69it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.68it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.018063813425411718
Finished Training



