# Stage 2

In this stage we compare the performance of a small neural model (BERT-based) trained on two different types of annotations: 
    (1) annotations generated using the best method from the first stage of the project,
    (2) the original, ground-truth annotations provided in the CoNLL-2003 dataset.

Importing required libraries:

In [1]:
from datasets import load_dataset
from collections import defaultdict, Counter
from tqdm import trange, tqdm
from dataclasses import dataclass
import pathlib
import os
import sys
import json
import itertools

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np

from transformers import BertTokenizer, BertModel


In [None]:
# Chat-completions and model-listing endpoints of the remote LLM API.
API_URL = "https://llm.ispras.ru/api/chat/completions"
API_MODEL_URL = "https://llm.ispras.ru/api/models"

# Placeholder token; it is replaced by the value stored in ./secrets (a JSON
# object with an 'API_KEY' entry) when that file is present.
API_KEY = "YOUR_TOKEN"
try:
    with open('./secrets', encoding='utf-8') as file:
        data: dict = json.load(file)
except FileNotFoundError:
    # No secrets file: keep the placeholder instead of aborting the whole
    # cell, so the constants below are still defined.
    pass
else:
    API_KEY = data.get('API_KEY', 'FAILED TO LOAD')

# Hidden size (per direction) of the BiLSTM in NERSmall.
HIDDEN = 512
# CoNLL-style BIO tags mapped to integer class ids.
NER_TAGS = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
# Extra labels for the special BERT tokens and for non-initial word pieces ("X").
ADDITIONAL = {"[CLS]": 9, "[SEP]": 10, "X": 11}
NER_TAGS.update(ADDITIONAL)
TAGS_COUNT = len(NER_TAGS)
DATABASE_DIR = './data'
EMBEDDINGS_PATH = f'{DATABASE_DIR}/embeddings.txt'
CWD = '~/Рабочий стол/NERC_LLM_Ispras/data'
BERT_MODEL = 'bert-base-cased'
BATCH_SIZE = 4

## Initializing the Neural Network Model

The model consists of: 
    (1) *pre-trained BERT model* for word embeddings, which captures contextual information effectively. 
    (2) *bidirectional LSTM (BiLSTM) layer* to process the sequence of embeddings and capture dependencies between words. 
    (3) *linear layer* to map the LSTM outputs to the NER tags.

In [83]:
class NERSmall(nn.Module):
    """BERT encoder followed by a BiLSTM and a per-token linear tag classifier.

    Args:
        model_name: Name or path handed to ``BertModel.from_pretrained``.
        embedding_dim: Input feature size of the BiLSTM. Defaults to the
            hidden size of the loaded BERT model, which is the only value
            compatible with feeding BERT's output directly into the LSTM.

    Raises:
        ValueError: If ``embedding_dim`` is given explicitly and differs from
            the BERT hidden size.
    """

    def __init__(self, model_name, embedding_dim=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

        # The LSTM consumes BERT's token representations as-is, so its input
        # size has to equal BERT's hidden size (768 for bert-base models).
        bert_dim = self.bert.config.hidden_size
        if embedding_dim is None:
            embedding_dim = bert_dim
        elif embedding_dim != bert_dim:
            raise ValueError(
                f'embedding_dim={embedding_dim} does not match the hidden size '
                f'of {model_name!r} ({bert_dim})'
            )

        self.bilstm1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=HIDDEN,
            bidirectional=True,
            # BERT emits (batch, seq_len, hidden); without batch_first the
            # LSTM would run its recurrence over the batch axis.
            batch_first=True,
        )
        # Both LSTM directions are concatenated, hence HIDDEN * 2 inputs.
        self.lin = nn.Linear(HIDDEN * 2, TAGS_COUNT)

    def forward(self, x, attention_mask=None):
        """Return per-token tag logits of shape (batch, seq_len, TAGS_COUNT).

        Args:
            x: Token-id tensor of shape (batch, seq_len).
            attention_mask: Optional mask of the same shape as ``x`` with 1
                for real tokens and 0 for padding; forwarded to BERT.
        """
        # BertModel returns an output object; the per-token embeddings live
        # in its ``last_hidden_state`` field.
        hidden = self.bert(x, attention_mask=attention_mask).last_hidden_state
        hidden, _ = self.bilstm1(hidden)
        return self.lin(hidden)


## Parsing and Loading Data

To organize the data, we define a simple data structure `InputExample`:

In [77]:
@dataclass
class InputExample:
    guid: str
    text: str
    ner_tag: str | None

The `parse_dataformat` function processes raw text data, splitting it into sentences and their corresponding NER tags. It handles empty lines and document separators, and organizes the data into a list of `InputExample` objects. 

In [78]:
def parse_dataformat(base_data: str, set_type: str) -> list[InputExample]:
    """Parse CoNLL-style text into one ``InputExample`` per sentence.

    Every non-blank line holds whitespace-separated columns; the first column
    is the word and the last column is its NER tag. Blank lines and lines
    starting with ``-DOCSTART`` end the current sentence.

    Args:
        base_data: Full contents of a data file.
        set_type: Prefix for the generated guids (e.g. ``'train'``).

    Returns:
        Examples with guid ``f'{set_type}-{i}'``, the space-joined words as
        ``text`` and the list of tags as ``ner_tag``.
    """
    sentences = []
    words = []
    tags = []
    for raw_line in base_data.splitlines():
        line = raw_line.strip()
        if not line or line.startswith('-DOCSTART'):
            # Sentence/document boundary: flush whatever has been collected.
            if words:
                sentences.append((words, tags))
                words, tags = [], []
            continue
        columns = line.split()
        words.append(columns[0])
        # splitlines() already dropped the newline, so the tag is taken whole;
        # slicing off a trailing character would corrupt it ('B-ORG' -> 'B-OR').
        tags.append(columns[-1])

    # The file may not end with a blank line.
    if words:
        sentences.append((words, tags))

    return [
        InputExample(f'{set_type}-{i}', ' '.join(sentence), ner_tag)
        for i, (sentence, ner_tag) in enumerate(sentences)
    ]

def load_examples(data_dir) -> tuple[list[InputExample], list[InputExample], list[InputExample]]:
    """Read and parse ``test.txt``, ``train.txt`` and ``valid.txt`` from *data_dir*.

    Args:
        data_dir: Directory (string or path-like) containing the three files.

    Returns:
        A ``(test, train, valid)`` tuple of parsed example lists; note that
        the test split comes first.

    Raises:
        FileNotFoundError: If any of the three files is missing.
    """
    root = pathlib.Path(data_dir)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    test_data, train_data, valid_data = (
        (root / name).read_text(encoding='utf-8')
        for name in ('test.txt', 'train.txt', 'valid.txt')
    )
    return parse_dataformat(test_data, 'test'), parse_dataformat(train_data, 'train'), parse_dataformat(valid_data, 'valid')


## Custom Dataset


Now we need to create the `NERDataSet` class, which prepares the data for training. It tokenizes the input text, maps NER tags to their corresponding IDs, and pads sequences to a fixed length. The `__getitem__` method processes each example by adding the special tokens `[CLS]` and `[SEP]`, tokenizing the text, and creating attention masks and sentence IDs. The method returns tensors for input IDs, NER tag IDs, attention masks, sentence IDs, and tag masks, which are used by the model during training.

In [79]:
class NERDataSet(Dataset):
    """Dataset that turns ``InputExample`` sentences into fixed-length BERT inputs.

    Each item is a 5-tuple of tensors, every one of length ``max_len``:
    input ids, NER tag ids, attention mask, sentence ids and a boolean tag
    mask that is set only on the first word piece of each word.
    """

    def __init__(self, data: list[InputExample], tokenizer: BertTokenizer, ner_tag_map: dict[str, int],
                 max_len: int = 128):
        # data: the examples to serve; tokenizer: splits words into word pieces
        # and maps pieces to ids; ner_tag_map: tag string -> class id (must
        # include '[CLS]', '[SEP]' and 'X'); max_len: length of every output.
        self._data = data
        self._tokenizer = tokenizer
        self._ner_tag_map = ner_tag_map
        self._max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self._data)

    def __getitem__(self, idx):
        """Encode example *idx* as ``[CLS] pieces... [SEP]`` padded to ``max_len``."""
        example = self._data[idx]
        sentence = example.text
        word_tags = example.ner_tag
        tokenizer = self._tokenizer
        tag_to_id = self._ner_tag_map

        # The sequence opens with [CLS], which is never a scored position
        # (mask value 0 marks positions that carry no real word tag).
        pieces = ['[CLS]']
        labels = ['[CLS]']
        label_mask = [0]
        piece_ids = [tokenizer.convert_tokens_to_ids('[CLS]')]
        label_ids = [tag_to_id['[CLS]']]

        for word, label in zip(sentence.split(), word_tags):
            sub_pieces = tokenizer.tokenize(word)
            pieces += sub_pieces
            assert len(sub_pieces) > 0
            piece_ids += [tokenizer.convert_tokens_to_ids(piece) for piece in sub_pieces]

            # Only the first piece of a word carries the word's tag; any
            # further pieces are labelled 'X' and masked out.
            continuation = len(sub_pieces) - 1
            labels += [label] + ['X'] * continuation
            label_ids += [tag_to_id[label]] + [tag_to_id['X']] * continuation
            label_mask += [1] + [0] * continuation

        assert len(pieces) == len(labels) == len(piece_ids) == len(label_ids) == len(label_mask)

        # Leave room for the closing [SEP] by cutting overlong sequences.
        if len(pieces) >= self._max_len:
            keep = self._max_len - 1
            for sequence in (pieces, labels, piece_ids, label_ids, label_mask):
                del sequence[keep:]

        assert len(pieces) < self._max_len, len(pieces)

        pieces.append('[SEP]')
        labels.append('[SEP]')
        piece_ids.append(tokenizer.convert_tokens_to_ids('[SEP]'))
        label_ids.append(tag_to_id['[SEP]'])
        label_mask.append(0)

        assert len(pieces) == len(labels) == len(piece_ids) == len(label_ids) == len(label_mask)

        real_length = len(piece_ids)
        segment_ids = [0] * real_length
        attention = [1] * real_length

        # Right-pad every numeric sequence up to max_len; padding positions
        # get token id 0, tag 'X', and zeros in all masks.
        padding = self._max_len - real_length
        piece_ids += [0] * padding
        label_ids += [tag_to_id['X']] * padding
        attention += [0] * padding
        segment_ids += [0] * padding
        label_mask += [0] * padding

        assert len(pieces) == len(labels)
        assert len(piece_ids) == len(label_ids) == len(attention) == len(segment_ids) == len(
            label_mask) == self._max_len, len(piece_ids)
        return (
            torch.LongTensor(piece_ids),
            torch.LongTensor(label_ids),
            torch.LongTensor(attention),
            torch.LongTensor(segment_ids),
            torch.BoolTensor(label_mask),
        )


## Training Environment

We load the pre-trained BERT tokenizer with `BertTokenizer.from_pretrained`.

The dataset is loaded into training, validation, and test sets using the load_examples function. DataLoader objects are created for each dataset, enabling efficient batching and shuffling of the data during training and evaluation.

In [3]:
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
# load_examples returns the splits in (test, train, valid) order.
test, train, valid = load_examples(DATABASE_DIR)

# The raw splits are lists of InputExample objects, which DataLoader's default
# collation cannot batch; NERDataSet converts each example into a tuple of
# fixed-length tensors first.
train_iter = DataLoader(dataset=NERDataSet(train, tokenizer, NER_TAGS), batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
eval_iter = DataLoader(dataset=NERDataSet(valid, tokenizer, NER_TAGS), batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_iter = DataLoader(dataset=NERDataSet(test, tokenizer, NER_TAGS), batch_size=BATCH_SIZE, shuffle=False, num_workers=4)


NameError: name 'BERT_MODEL' is not defined