# Stage 2

On this stage we want to compare the performance of a small neural model trained (BERT) on two different types of annotations:
    (1) annotations generated using the best method from the first stage of the project,
    (2) the original, ground-truth annotations provided in the Coll2003 dataset.

Importing required libraries:

In [1]:
%pip install --quiet datasets seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [4]:
from datasets import load_dataset
from collections import defaultdict, Counter
from tqdm import trange, tqdm
from dataclasses import dataclass
import shutil
import pathlib
import os
import sys
import json
import itertools
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset, Subset
from torch.nn.utils.rnn import pad_sequence

import numpy as np

from transformers import BertTokenizer, BertModel, BertForTokenClassification
import torch.nn.functional as F
from seqeval.metrics import accuracy_score, f1_score, classification_report

from huggingface_hub import HfApi, PyTorchModelHubMixin, interpreter_login, snapshot_download, Repository

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
shutil.copytree("/content/drive/MyDrive/nerc_test_task_model", "/content/", dirs_exist_ok=True)

'/content/'

Make a repository for a future models.

In [10]:
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

Enter your token (input will not be visible): ··········
Add token as git credential? (Y/n) 


In [11]:
username = HfApi().whoami()["name"]
REPO_NAME = f"{username}/nerc-extraction"

HfApi().create_repo(repo_id=REPO_NAME, private=False, exist_ok=True)

print(f"Repository: '{REPO_NAME}'")

Repository: 'estnafinema0/nerc-extraction'


In [12]:
API_URL = "https://llm.ispras.ru/api/chat/completions"
API_MODEL_URL = "https://llm.ispras.ru/api/models"
API_KEY = "YOUR_TOKEN"
# with open('./secrets') as file:
#     data: dict = json.load(file)
#     API_KEY = data.get('API_KEY', 'FAILED TO LOAD')

HIDDEN = 512
NER_TAGS = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
ADDITIONAL = {"[CLS]": 9, "[SEP]": 10, "X": 11}
NER_TAGS.update(ADDITIONAL)
TAGS_COUNT = len(NER_TAGS)
DATABASE_DIR = './data'
EMBEDDINGS_PATH = f'{DATABASE_DIR}/embeddings.txt'
CWD = '~/Рабочий стол/NERC_LLM_Ispras/data'
BERT_MODEL = 'bert-base-cased'
BATCH_SIZE = 32
EPOCHS = 5
INV_NER_TAGS = {v: k for k, v in NER_TAGS.items()}


## Initalizing Neural Network Model

Model will consist of:
    (1) *pre-trained BERT model* for word embeddings, which captures contextual information effectively.
    (2) *bidirectional LSTM (BiLSTM) layer* to process the sequence of embeddings and capture dependencies between words.
    (3) *linear layer* to map the LSTM outputs to the NER tags.

In [13]:
class NERSmall(BertForTokenClassification, PyTorchModelHubMixin, repo_url=REPO_NAME, license="mit"):
    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None, label_masks=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        sequence_output = outputs[0]  # (b, MAX_LEN, 768)

        token_reprs = [embedding[mask] for mask, embedding in zip(label_masks, sequence_output)]
        token_reprs = pad_sequence(sequences=token_reprs, batch_first=True,
                                   padding_value=-1)  # (b, local_max_len, 768)
        sequence_output = self.dropout(token_reprs)
        logits = self.classifier(sequence_output)  # (b, local_max_len, num_labels)

        outputs = (logits,)
        if labels is not None:
            labels = [label[mask] for mask, label in zip(label_masks, labels)]
            labels = pad_sequence(labels, batch_first=True, padding_value=-1)  # (b, local_max_len)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum')
            mask = labels != -1
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            loss /= mask.float().sum()
            outputs = (loss,) + outputs + (labels,)

        return outputs  # (loss), scores, (hidden_states), (attentions)


## Parsing and Loading Data

To organize the data, we define a simple data structure `InputExample`:

In [14]:
@dataclass
class InputExample:
    guid: str
    text: str
    ner_tag: list[str]

We make `parse_dataformat` function to processe raw text data, splitting it into sentences and their corresponding NER tags. We make it handle empty lines and document separators. It organizes the data into a list of InputExample objects.

In [15]:
def parse_dataformat(base_data: str, set_type: str) -> list[InputExample]:
    data = []
    sentence = []
    ner_tags = []
    for line in base_data.splitlines():
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                data.append((sentence, ner_tags))
                sentence = []
                ner_tags = []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        ner_tags.append(splits[-1])

    if len(sentence) > 0:
        data.append((sentence, ner_tags))
        sentence = []
        ner_tags = []

    return [InputExample(f'{set_type}-{i}', ' '.join(sentence), ner_tag) for i, (sentence, ner_tag) in enumerate(data)]

def load_example(filepath, name) -> tuple[list[InputExample], list[InputExample], list[InputExample]]:
    with open(filepath) as file:
        file_data = file.read()
    return parse_dataformat(file_data, name)

def load_examples(data_dir) -> tuple[list[InputExample], list[InputExample], list[InputExample]]:
    return (load_example(pathlib.Path.joinpath(pathlib.Path(data_dir), 'test.txt'), 'test'),
            load_example(pathlib.Path.joinpath(pathlib.Path(data_dir), 'train.txt'), 'train'),
            load_example(pathlib.Path.joinpath(pathlib.Path(data_dir), 'valid.txt'), 'valid'))

## Custom Dataset


Now we need to create `NERDataSet` class that prepares the data for training. It tokenizes the input text, maps NER tags to their corresponding IDs, and pads sequences to a fixed length. The __getitem__ method processes each example by adding special tokens [CLS] and [SEP], tokenizing the text, and creating attention masks and sentence IDs. The method returns tensors for input IDs, NER tag IDs, attention masks, sentence IDs, and tag masks, which are used by the model during training.

## Training Environment

We are using BERT Tokenizer from `from_pretrained`.

The dataset is loaded into training, validation, and test sets using the load_examples function. DataLoader objects are created for each dataset, enabling efficient batching and shuffling of the data during training and evaluation.

In [16]:
class NERDataSet(Dataset):
    def __init__(self, data: list[InputExample], tokenizer: BertTokenizer, ner_tag_map: dict[str, int], max_len: int = 128):
        self._max_len = max_len
        self._ner_tag_map = ner_tag_map
        self._data = data
        self._tokenizer = tokenizer

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx: int):
        input_example = self._data[idx]

        text = input_example.text
        ner_tags = input_example.ner_tag
        word_tokens = ['[CLS]']
        ner_tag_list = ['[CLS]']
        ner_tag_mask = [0]  # value in {0,1} -- 0 signifies invalid token

        input_ids = [self._tokenizer.convert_tokens_to_ids('[CLS]')]
        ner_tag_ids = [self._ner_tag_map['[CLS]']]

        for word, ner_tag in zip(text.split(), ner_tags):
            tokenized_word = self._tokenizer.tokenize(word)

            word_tokens.extend(tokenized_word)
            assert len(tokenized_word) > 0
            input_ids.extend(map(self._tokenizer.convert_tokens_to_ids, tokenized_word))

            ner_tag_list.append(ner_tag)
            ner_tag_ids.append(self._ner_tag_map[ner_tag])
            ner_tag_mask.append(1)
            # len(tokenized_word) > 1 only if it splits word in between, in which case
            # the first token gets assigned NER tag and the remaining ones get assigned
            # X
            ner_tag_list.extend(itertools.repeat('X', len(tokenized_word) - 1))
            ner_tag_ids.extend(itertools.repeat(self._ner_tag_map['X'], len(tokenized_word) - 1))
            ner_tag_mask.extend(itertools.repeat(0, len(tokenized_word) - 1))

        assert len(word_tokens) == len(ner_tag_list) == len(input_ids) == len(ner_tag_ids) == len(ner_tag_mask)

        if len(word_tokens) >= self._max_len:
            word_tokens = word_tokens[:(self._max_len - 1)]
            ner_tag_list = ner_tag_list[:(self._max_len - 1)]
            input_ids = input_ids[:(self._max_len - 1)]
            ner_tag_ids = ner_tag_ids[:(self._max_len - 1)]
            ner_tag_mask = ner_tag_mask[:(self._max_len - 1)]

        assert len(word_tokens) < self._max_len, len(word_tokens)

        word_tokens.append('[SEP]')
        ner_tag_list.append('[SEP]')
        input_ids.append(self._tokenizer.convert_tokens_to_ids('[SEP]'))
        ner_tag_ids.append(self._ner_tag_map['[SEP]'])
        ner_tag_mask.append(0)

        assert len(word_tokens) == len(ner_tag_list) == len(input_ids) == len(ner_tag_ids) == len(ner_tag_mask)

        sentence_id = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)

        left = self._max_len - len(input_ids)
        input_ids.extend(itertools.repeat(0, left))
        ner_tag_ids.extend(itertools.repeat(self._ner_tag_map['X'], left))
        attention_mask.extend(itertools.repeat(0, left))
        sentence_id.extend(itertools.repeat(0, left))
        ner_tag_mask.extend(itertools.repeat(0, left))


        assert len(word_tokens) == len(ner_tag_list)
        assert len(input_ids) == len(ner_tag_ids) == len(attention_mask) == len(sentence_id) == len(
            ner_tag_mask) == self._max_len, len(input_ids)
        return torch.LongTensor(input_ids), torch.LongTensor(ner_tag_ids), torch.LongTensor(
            attention_mask), torch.LongTensor(sentence_id), torch.BoolTensor(ner_tag_mask)

    @staticmethod
    def collate_fn(batch):
        return {
            'input_ids': torch.stack([x['input_ids'] for x in batch]),
            'attention_mask': torch.tensor([x['attention_mask'] for x in batch])
        }

## Training Environment

We are using BERT Tokenizer from `from_pretrained`.

The dataset is loaded into training, validation, and test sets using the load_examples function. DataLoader objects are created for each dataset, enabling efficient batching and shuffling of the data during training and evaluation.

In [17]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
test, train, valid = load_examples(DATABASE_DIR)
test = NERDataSet(test, tokenizer, NER_TAGS)
train = NERDataSet(train, tokenizer, NER_TAGS)
valid = NERDataSet(valid, tokenizer, NER_TAGS)
train_iter = DataLoader(dataset=train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
eval_iter = DataLoader(dataset=valid, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_iter = DataLoader(dataset=test, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

We will use *CrossEntropy* Loss Function and Stochastic Gradient Descent with a learning rate of 0.001 and momentum of 0.9 to accelerate convergence.

In [None]:
%%script false --no-raise-error

# model_primary = NERSmall.from_pretrained(BERT_MODEL, num_labels=TAGS_COUNT).to(device)
# optimizer_primary = optim.SGD(model_primary.parameters(), lr=0.001, momentum=0.9)

Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training the Primary Model

In [27]:
def train(model, train_iter, optimizer):
  model = model.train()
  for epoch in trange(EPOCHS, desc='Epoch'):
      running_loss = 0.0
      tr_loss = 0
      nb_tr_steps = 0
      for batch in tqdm(train_iter):
          # input_ids, labels, input_mask, token_type_ids, label_masks = batch
          batch = tuple(t.to(device) for t in batch)
          b_input_ids, b_labels, b_input_mask, b_token_type_ids, b_label_masks = batch
          optimizer.zero_grad()

          loss, logits, labels = model(b_input_ids, token_type_ids=b_token_type_ids,
                                          attention_mask=b_input_mask, labels=b_labels,
                                          label_masks=b_label_masks)
          loss.backward()
          optimizer.step()

          running_loss += loss.item()
          # track train loss
          tr_loss += loss.item()
          nb_tr_steps += 1
          # update parameters
          optimizer.step()
          model.zero_grad()
          # print train loss per epoch
      print(f"Train loss: {tr_loss / nb_tr_steps}")

  print('Finished Training')

In [None]:
%%script false --no-raise-error
# train(model_primary, train_iter, optimizer_primary)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:47,  1.26it/s][A
  0%|          | 2/439 [00:01<04:56,  1.47it/s][A
  1%|          | 3/439 [00:02<04:45,  1.53it/s][A
  1%|          | 4/439 [00:02<04:37,  1.57it/s][A
  1%|          | 5/439 [00:03<04:41,  1.54it/s][A
  1%|▏         | 6/439 [00:03<04:44,  1.52it/s][A
  2%|▏         | 7/439 [00:04<04:40,  1.54it/s][A
  2%|▏         | 8/439 [00:05<04:35,  1.56it/s][A
  2%|▏         | 9/439 [00:05<04:32,  1.58it/s][A
  2%|▏         | 10/439 [00:06<04:31,  1.58it/s][A
  3%|▎         | 11/439 [00:07<04:30,  1.58it/s][A
  3%|▎         | 12/439 [00:07<04:30,  1.58it/s][A
  3%|▎         | 13/439 [00:08<04:29,  1.58it/s][A
  3%|▎         | 14/439 [00:08<04:26,  1.60it/s][A
  3%|▎         | 15/439 [00:09<04:26,  1.59it/s][A
  4%|▎         | 16/439 [00:10<04:25,  1.59it/s][A
  4%|▍         | 17/439 [00:10<04:28,  1.57it/s][A
  4%|▍         | 18/439 [00:11<04:36,

Train loss: 0.19989937151186082



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:09,  1.41it/s][A
  0%|          | 2/439 [00:01<04:41,  1.55it/s][A
  1%|          | 3/439 [00:01<04:29,  1.62it/s][A
  1%|          | 4/439 [00:02<04:27,  1.62it/s][A
  1%|          | 5/439 [00:03<04:22,  1.65it/s][A
  1%|▏         | 6/439 [00:03<04:20,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:19,  1.67it/s][A
  2%|▏         | 8/439 [00:04<04:17,  1.67it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 14/439 [00:08<04:12,  1.68it/s][A
  3%|▎         | 15/439 [00:09<04:13,  1.67it/s][A
  4%|▎         | 16/439 [00:09<04:11,  1.68it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.68it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.68it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.05360179376875417



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:07,  1.42it/s][A
  0%|          | 2/439 [00:01<04:40,  1.56it/s][A
  1%|          | 3/439 [00:01<04:28,  1.63it/s][A
  1%|          | 4/439 [00:02<04:25,  1.64it/s][A
  1%|          | 5/439 [00:03<04:22,  1.65it/s][A
  1%|▏         | 6/439 [00:03<04:20,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:20,  1.66it/s][A
  2%|▏         | 8/439 [00:04<04:19,  1.66it/s][A
  2%|▏         | 9/439 [00:05<04:17,  1.67it/s][A
  2%|▏         | 10/439 [00:06<04:15,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 14/439 [00:08<04:12,  1.69it/s][A
  3%|▎         | 15/439 [00:09<04:11,  1.68it/s][A
  4%|▎         | 16/439 [00:09<04:11,  1.68it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.69it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.03490082435638714



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:49,  1.25it/s][A
  0%|          | 2/439 [00:01<04:56,  1.47it/s][A
  1%|          | 3/439 [00:01<04:36,  1.58it/s][A
  1%|          | 4/439 [00:02<04:30,  1.61it/s][A
  1%|          | 5/439 [00:03<04:25,  1.64it/s][A
  1%|▏         | 6/439 [00:03<04:21,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:18,  1.67it/s][A
  2%|▏         | 8/439 [00:04<04:17,  1.68it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 13/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 14/439 [00:08<04:11,  1.69it/s][A
  3%|▎         | 15/439 [00:09<04:10,  1.69it/s][A
  4%|▎         | 16/439 [00:09<04:09,  1.70it/s][A
  4%|▍         | 17/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.024840473800426633



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:15,  1.39it/s][A
  0%|          | 2/439 [00:01<04:42,  1.55it/s][A
  1%|          | 3/439 [00:01<04:29,  1.62it/s][A
  1%|          | 4/439 [00:02<04:25,  1.64it/s][A
  1%|          | 5/439 [00:03<04:21,  1.66it/s][A
  1%|▏         | 6/439 [00:03<04:19,  1.67it/s][A
  2%|▏         | 7/439 [00:04<04:17,  1.68it/s][A
  2%|▏         | 8/439 [00:04<04:16,  1.68it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 14/439 [00:08<04:13,  1.68it/s][A
  3%|▎         | 15/439 [00:09<04:11,  1.69it/s][A
  4%|▎         | 16/439 [00:09<04:10,  1.69it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.68it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.018063813425411718
Finished Training





Save our primary model to HuggingFace.

In [None]:
torch.save(model_primary.state_dict(), 'model.pth')
PRIMARY_MODEL_SAVEPATH = f"{REPO_NAME}primary_model"

HfApi().create_repo(repo_id=PRIMARY_MODEL_SAVEPATH, private=False, exist_ok=True)

model_primary.push_to_hub(PRIMARY_MODEL_SAVEPATH)

print("Primary model upload to Hugging Face succesfully!")

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Primary model upload to Hugging Face succesfully!


To download baseline model from HuggingFace:


In [None]:
# model_primary.load_state_dict(torch.load('model.pth'))

model_primary = NERSmall.from_pretrained(PRIMARY_MODEL_SAVEPATH)

config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

### Evaluating Primary Model

In [19]:
def evaluate(model, eval_iter, optimizer):
  model = model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps = 0
  predictions, true_labels = [], []

  for batch in tqdm(eval_iter):
      batch = tuple(t.to(device) for t in batch)

      b_input_ids, b_labels, b_input_mask, b_token_type_ids, b_label_masks = batch

      with torch.no_grad():
          tmp_eval_loss, logits, reduced_labels = model(b_input_ids,
                                                          token_type_ids=b_token_type_ids,
                                                          attention_mask=b_input_mask,
                                                          labels=b_labels,
                                                          label_masks=b_label_masks)

      logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
      logits = logits.detach().cpu().numpy()
      reduced_labels = reduced_labels.to('cpu').numpy()

      labels_to_append = []
      predictions_to_append = []

      for prediction, r_label in zip(logits, reduced_labels):
          preds = []
          labels = []
          for pred, lab in zip(prediction, r_label):
              if lab.item() == -1:  # masked label; -1 means do not collect this label
                  continue
              preds.append(pred)
              labels.append(lab)
          predictions_to_append.append(preds)
          labels_to_append.append(labels)

      predictions.extend(predictions_to_append)
      true_labels.append(labels_to_append)

      eval_loss += tmp_eval_loss.mean().item()

      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  print(f"Validation loss: {eval_loss}")
  pred_tags = [INV_NER_TAGS[p_i] for p in predictions for p_i in p]
  valid_tags = [INV_NER_TAGS[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
  print(f"Seq eval accuracy: {accuracy_score(valid_tags, pred_tags)}")
  print(f"F1-Score: {f1_score([valid_tags], [pred_tags])}")
  print("Classification report: -- ")
  print(classification_report([valid_tags], [pred_tags]))

In [None]:
evaluate(model_primary, eval_iter, optimizer_primary)

100%|██████████| 102/102 [00:20<00:00,  4.89it/s]


Validation loss: 0.04087882407475263
Seq eval accuracy: 0.9889133526878787
F1-Score: 0.9319836024429015
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.93      0.97      0.95      1837
        MISC       0.85      0.88      0.86       922
         ORG       0.92      0.89      0.90      1341
         PER       0.97      0.97      0.97      1836

   micro avg       0.93      0.94      0.93      5936
   macro avg       0.92      0.93      0.92      5936
weighted avg       0.93      0.94      0.93      5936



Let's test our model

In [None]:
evaluate(model_primary, test_iter, optimizer_primary)

100%|██████████| 108/108 [00:21<00:00,  4.91it/s]


Validation loss: 0.09429696848383173
Seq eval accuracy: 0.9795549044531099
F1-Score: 0.8895904586512321
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.88      0.94      0.91      1666
        MISC       0.75      0.80      0.77       702
         ORG       0.87      0.86      0.86      1661
         PER       0.96      0.94      0.95      1615

   micro avg       0.88      0.90      0.89      5644
   macro avg       0.86      0.88      0.87      5644
weighted avg       0.88      0.90      0.89      5644



Small test.

In [None]:

indices = torch.arange(10*BATCH_SIZE)
test_10 = Subset(test, indices)
test_iter_10 = DataLoader(dataset=test_10, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

evaluate(model_primary, test_iter_10, opt)

100%|██████████| 10/10 [02:45<00:00, 16.60s/it]


Validation loss: 0.056778930500149725
Seq eval accuracy: 0.9866873751941424
F1-Score: 0.9414965986394558
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.93      0.95      0.94       222
        MISC       0.83      0.88      0.85        57
         ORG       0.82      0.83      0.82        65
         PER       0.99      0.96      0.98       390

   micro avg       0.94      0.94      0.94       734
   macro avg       0.89      0.91      0.90       734
weighted avg       0.94      0.94      0.94       734



## Generation of syntetic dataset

Now we need a function to retrieve model answers and generate a dataset to train our NER model.

In [29]:
with open('prompts-latest.json') as file:
    data = json.load(file)

pattern = re.compile(r"""Tokens: '(`.*`(, )?)+'""") #  '`([\w\d.,';":!?\[\]\(\)]+`(, )?)+'

INV_NER_TAGS = {v: k for k, v in NER_TAGS.items()}

keys = list(data.keys())
tokenss = [(list(map(lambda x: x[1:-1], next(re.finditer(pattern, str(key))).group(1).split(', '))),
            list(map(lambda k: INV_NER_TAGS.get(k, None), value['predicted'])))
           for key, value in data.items()]

tokenss = [(l, t) for l, t in tokenss if all(k is not None for k in t)]


In [30]:
def gen_examples(examples):
    nl = '\n'
    return f"""{f'{nl}{nl}'.join(f'{nl}'.join(f'{token} {tag}' for token, tag in zip(*tokens)) for tokens in examples)}"""


In [31]:
with open('data/llm.txt', 'w') as file:
    print(gen_examples(tokenss), file=file)


In [32]:
llm_pure = load_example('data/llm.txt', "llm_pure")
llm_pure = NERDataSet(llm_pure, tokenizer, NER_TAGS)
llm_pure_iter = DataLoader(dataset=llm_pure, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [33]:
model_llm_pure = NERSmall.from_pretrained(BERT_MODEL, num_labels=TAGS_COUNT).to(device)
optimizer_llm_pure = optim.SGD(model_llm_pure.parameters(), lr=0.001, momentum=0.9)

Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
train(model_llm_pure, llm_pure_iter, optimizer_llm_pure)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:54,  1.16it/s][A
  3%|▎         | 2/64 [00:01<00:43,  1.42it/s][A
  5%|▍         | 3/64 [00:02<00:39,  1.54it/s][A
  6%|▋         | 4/64 [00:02<00:37,  1.59it/s][A
  8%|▊         | 5/64 [00:03<00:36,  1.63it/s][A
  9%|▉         | 6/64 [00:03<00:34,  1.66it/s][A
 11%|█         | 7/64 [00:04<00:33,  1.69it/s][A
 12%|█▎        | 8/64 [00:04<00:33,  1.69it/s][A
 14%|█▍        | 9/64 [00:05<00:32,  1.71it/s][A
 16%|█▌        | 10/64 [00:06<00:31,  1.72it/s][A
 17%|█▋        | 11/64 [00:06<00:30,  1.74it/s][A
 19%|█▉        | 12/64 [00:07<00:29,  1.74it/s][A
 20%|██        | 13/64 [00:07<00:29,  1.73it/s][A
 22%|██▏       | 14/64 [00:08<00:28,  1.74it/s][A
 23%|██▎       | 15/64 [00:08<00:28,  1.75it/s][A
 25%|██▌       | 16/64 [00:09<00:27,  1.75it/s][A
 27%|██▋       | 17/64 [00:10<00:26,  1.76it/s][A
 28%|██▊       | 18/64 [00:10<00:26,  1.76it/s][A
 30%

Train loss: 1.157826503738761



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:43,  1.44it/s][A
  3%|▎         | 2/64 [00:01<00:39,  1.58it/s][A
  5%|▍         | 3/64 [00:01<00:37,  1.64it/s][A
  6%|▋         | 4/64 [00:02<00:36,  1.66it/s][A
  8%|▊         | 5/64 [00:03<00:35,  1.68it/s][A
  9%|▉         | 6/64 [00:03<00:34,  1.68it/s][A
 11%|█         | 7/64 [00:04<00:33,  1.69it/s][A
 12%|█▎        | 8/64 [00:04<00:33,  1.69it/s][A
 14%|█▍        | 9/64 [00:05<00:32,  1.69it/s][A
 16%|█▌        | 10/64 [00:06<00:32,  1.68it/s][A
 17%|█▋        | 11/64 [00:06<00:31,  1.68it/s][A
 19%|█▉        | 12/64 [00:07<00:30,  1.68it/s][A
 20%|██        | 13/64 [00:07<00:30,  1.67it/s][A
 22%|██▏       | 14/64 [00:08<00:29,  1.68it/s][A
 23%|██▎       | 15/64 [00:08<00:29,  1.68it/s][A
 25%|██▌       | 16/64 [00:09<00:28,  1.68it/s][A
 27%|██▋       | 17/64 [00:10<00:27,  1.68it/s][A
 28%|██▊       | 18/64 [00:10<00:27,  1.68it/s][A
 30%|██▉       | 19/64 [00:11<00:26,  1.68it/s]

Train loss: 0.753741652239114



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:45,  1.40it/s][A
  3%|▎         | 2/64 [00:01<00:40,  1.52it/s][A
  5%|▍         | 3/64 [00:01<00:38,  1.57it/s][A
  6%|▋         | 4/64 [00:02<00:37,  1.58it/s][A
  8%|▊         | 5/64 [00:03<00:36,  1.60it/s][A
  9%|▉         | 6/64 [00:03<00:36,  1.60it/s][A
 11%|█         | 7/64 [00:04<00:35,  1.61it/s][A
 12%|█▎        | 8/64 [00:05<00:34,  1.61it/s][A
 14%|█▍        | 9/64 [00:05<00:34,  1.61it/s][A
 16%|█▌        | 10/64 [00:06<00:33,  1.60it/s][A
 17%|█▋        | 11/64 [00:06<00:32,  1.61it/s][A
 19%|█▉        | 12/64 [00:07<00:32,  1.60it/s][A
 20%|██        | 13/64 [00:08<00:31,  1.60it/s][A
 22%|██▏       | 14/64 [00:08<00:31,  1.61it/s][A
 23%|██▎       | 15/64 [00:09<00:30,  1.61it/s][A
 25%|██▌       | 16/64 [00:10<00:29,  1.61it/s][A
 27%|██▋       | 17/64 [00:10<00:29,  1.61it/s][A
 28%|██▊       | 18/64 [00:11<00:28,  1.61it/s][A
 30%|██▉       | 19/64 [00:11<00:27,  1.61it/s]

Train loss: 0.6484084823168814



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:46,  1.35it/s][A
  3%|▎         | 2/64 [00:01<00:42,  1.45it/s][A
  5%|▍         | 3/64 [00:02<00:40,  1.51it/s][A
  6%|▋         | 4/64 [00:02<00:39,  1.53it/s][A
  8%|▊         | 5/64 [00:03<00:38,  1.53it/s][A
  9%|▉         | 6/64 [00:03<00:37,  1.54it/s][A
 11%|█         | 7/64 [00:04<00:36,  1.54it/s][A
 12%|█▎        | 8/64 [00:05<00:36,  1.55it/s][A
 14%|█▍        | 9/64 [00:05<00:35,  1.55it/s][A
 16%|█▌        | 10/64 [00:06<00:34,  1.55it/s][A
 17%|█▋        | 11/64 [00:07<00:34,  1.54it/s][A
 19%|█▉        | 12/64 [00:07<00:33,  1.55it/s][A
 20%|██        | 13/64 [00:08<00:32,  1.55it/s][A
 22%|██▏       | 14/64 [00:09<00:32,  1.55it/s][A
 23%|██▎       | 15/64 [00:09<00:31,  1.55it/s][A
 25%|██▌       | 16/64 [00:10<00:31,  1.55it/s][A
 27%|██▋       | 17/64 [00:11<00:30,  1.55it/s][A
 28%|██▊       | 18/64 [00:11<00:29,  1.55it/s][A
 30%|██▉       | 19/64 [00:12<00:29,  1.55it/s]

Train loss: 0.5856812694109976



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:46,  1.36it/s][A
  3%|▎         | 2/64 [00:01<00:42,  1.47it/s][A
  5%|▍         | 3/64 [00:02<00:40,  1.52it/s][A
  6%|▋         | 4/64 [00:02<00:39,  1.54it/s][A
  8%|▊         | 5/64 [00:03<00:38,  1.55it/s][A
  9%|▉         | 6/64 [00:03<00:37,  1.56it/s][A
 11%|█         | 7/64 [00:04<00:36,  1.55it/s][A
 12%|█▎        | 8/64 [00:05<00:35,  1.56it/s][A
 14%|█▍        | 9/64 [00:05<00:35,  1.56it/s][A
 16%|█▌        | 10/64 [00:06<00:34,  1.56it/s][A
 17%|█▋        | 11/64 [00:07<00:33,  1.56it/s][A
 19%|█▉        | 12/64 [00:07<00:33,  1.57it/s][A
 20%|██        | 13/64 [00:08<00:32,  1.57it/s][A
 22%|██▏       | 14/64 [00:09<00:31,  1.57it/s][A
 23%|██▎       | 15/64 [00:09<00:31,  1.58it/s][A
 25%|██▌       | 16/64 [00:10<00:30,  1.58it/s][A
 27%|██▋       | 17/64 [00:10<00:29,  1.58it/s][A
 28%|██▊       | 18/64 [00:11<00:29,  1.58it/s][A
 30%|██▉       | 19/64 [00:12<00:28,  1.58it/s]

Train loss: 0.5433702929876745
Finished Training





In [35]:
evaluate(model_llm_pure, eval_iter, optimizer_llm_pure)

100%|██████████| 102/102 [00:22<00:00,  4.58it/s]


Validation loss: 0.5154326967895031
Seq eval accuracy: 0.8504179412738928
F1-Score: 0.47414279445872254
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.38      0.85      0.53      1837
        MISC       0.03      0.08      0.05       922
         ORG       0.26      0.33      0.29      1341
         PER       0.80      0.93      0.86      1836

   micro avg       0.38      0.64      0.47      5936
   macro avg       0.37      0.55      0.43      5936
weighted avg       0.43      0.64      0.50      5936

