# Stage 2

On this stage we want to compare the performance of a small neural model trained (BERT) on two different types of annotations:
    (1) annotations generated using the best method from the first stage of the project,
    (2) the original, ground-truth annotations provided in the Coll2003 dataset.

Importing required libraries:

In [6]:
%pip install --quiet datasets seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [68]:
from datasets import load_dataset
from collections import defaultdict, Counter
from tqdm import trange, tqdm
from dataclasses import dataclass
import shutil
import pathlib
import os
import sys
import json
import itertools
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset, Subset, ConcatDataset
from torch.nn.utils.rnn import pad_sequence

import numpy as np

from transformers import BertTokenizer, BertModel, BertForTokenClassification
import torch.nn.functional as F
from seqeval.metrics import accuracy_score, f1_score, classification_report

from huggingface_hub import HfApi, PyTorchModelHubMixin, interpreter_login, snapshot_download, Repository

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
shutil.copytree("/content/drive/MyDrive/nerc_test_task_model", "/content/", dirs_exist_ok=True)

'/content/'

Make a repository for a future models.

In [12]:
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

Enter your token (input will not be visible): ··········
Add token as git credential? (Y/n) 


In [13]:
username = HfApi().whoami()["name"]
REPO_NAME = f"{username}/nerc-extraction"

HfApi().create_repo(repo_id=REPO_NAME, private=False, exist_ok=True)

print(f"Repository: '{REPO_NAME}'")

Repository: 'estnafinema0/nerc-extraction'


In [71]:
API_URL = "https://llm.ispras.ru/api/chat/completions"
API_MODEL_URL = "https://llm.ispras.ru/api/models"
API_KEY = "YOUR_TOKEN"
# with open('./secrets') as file:
#     data: dict = json.load(file)
#     API_KEY = data.get('API_KEY', 'FAILED TO LOAD')

HIDDEN = 512
NER_TAGS = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
ADDITIONAL = {"[CLS]": 9, "[SEP]": 10, "X": 11}
NER_TAGS.update(ADDITIONAL)
TAGS_COUNT = len(NER_TAGS)
DATABASE_DIR = './data'
EMBEDDINGS_PATH = f'{DATABASE_DIR}/embeddings.txt'
CWD = '~/Рабочий стол/NERC_LLM_Ispras/data'
BERT_MODEL = 'bert-base-cased'
BATCH_SIZE = 32
EPOCHS = 5
INV_NER_TAGS = {v: k for k, v in NER_TAGS.items()}
PERCENTAGES = [.01, .02, .05, .1, .25, .5, 1.]


## Initalizing Neural Network Model

Model will consist of:
    (1) *pre-trained BERT model* for word embeddings, which captures contextual information effectively.
    (2) *bidirectional LSTM (BiLSTM) layer* to process the sequence of embeddings and capture dependencies between words.
    (3) *linear layer* to map the LSTM outputs to the NER tags.

In [14]:
class NERSmall(BertForTokenClassification, PyTorchModelHubMixin, repo_url=REPO_NAME, license="mit"):
    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None, label_masks=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        sequence_output = outputs[0]  # (b, MAX_LEN, 768)

        token_reprs = [embedding[mask] for mask, embedding in zip(label_masks, sequence_output)]
        token_reprs = pad_sequence(sequences=token_reprs, batch_first=True,
                                   padding_value=-1)  # (b, local_max_len, 768)
        sequence_output = self.dropout(token_reprs)
        logits = self.classifier(sequence_output)  # (b, local_max_len, num_labels)

        outputs = (logits,)
        if labels is not None:
            labels = [label[mask] for mask, label in zip(label_masks, labels)]
            labels = pad_sequence(labels, batch_first=True, padding_value=-1)  # (b, local_max_len)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum')
            mask = labels != -1
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            loss /= mask.float().sum()
            outputs = (loss,) + outputs + (labels,)

        return outputs  # (loss), scores, (hidden_states), (attentions)


## Parsing and Loading Data

To organize the data, we define a simple data structure `InputExample`:

In [15]:
@dataclass
class InputExample:
    guid: str
    text: str
    ner_tag: list[str]

We make `parse_dataformat` function to processe raw text data, splitting it into sentences and their corresponding NER tags. We make it handle empty lines and document separators. It organizes the data into a list of InputExample objects.

In [16]:
def parse_dataformat(base_data: str, set_type: str) -> list[InputExample]:
    data = []
    sentence = []
    ner_tags = []
    for line in base_data.splitlines():
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                data.append((sentence, ner_tags))
                sentence = []
                ner_tags = []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        ner_tags.append(splits[-1])

    if len(sentence) > 0:
        data.append((sentence, ner_tags))
        sentence = []
        ner_tags = []

    return [InputExample(f'{set_type}-{i}', ' '.join(sentence), ner_tag) for i, (sentence, ner_tag) in enumerate(data)]

def load_example(filepath, name) -> tuple[list[InputExample], list[InputExample], list[InputExample]]:
    with open(filepath) as file:
        file_data = file.read()
    return parse_dataformat(file_data, name)

def load_examples(data_dir) -> tuple[list[InputExample], list[InputExample], list[InputExample]]:
    return (load_example(pathlib.Path.joinpath(pathlib.Path(data_dir), 'test.txt'), 'test'),
            load_example(pathlib.Path.joinpath(pathlib.Path(data_dir), 'train.txt'), 'train'),
            load_example(pathlib.Path.joinpath(pathlib.Path(data_dir), 'valid.txt'), 'valid'))

## Custom Dataset


Now we need to create `NERDataSet` class that prepares the data for training. It tokenizes the input text, maps NER tags to their corresponding IDs, and pads sequences to a fixed length. The __getitem__ method processes each example by adding special tokens [CLS] and [SEP], tokenizing the text, and creating attention masks and sentence IDs. The method returns tensors for input IDs, NER tag IDs, attention masks, sentence IDs, and tag masks, which are used by the model during training.

## Training Environment

We are using BERT Tokenizer from `from_pretrained`.

The dataset is loaded into training, validation, and test sets using the load_examples function. DataLoader objects are created for each dataset, enabling efficient batching and shuffling of the data during training and evaluation.

In [17]:
class NERDataSet(Dataset):
    def __init__(self, data: list[InputExample], tokenizer: BertTokenizer, ner_tag_map: dict[str, int], max_len: int = 128):
        self._max_len = max_len
        self._ner_tag_map = ner_tag_map
        self._data = data
        self._tokenizer = tokenizer

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx: int):
        input_example = self._data[idx]

        text = input_example.text
        ner_tags = input_example.ner_tag
        word_tokens = ['[CLS]']
        ner_tag_list = ['[CLS]']
        ner_tag_mask = [0]  # value in {0,1} -- 0 signifies invalid token

        input_ids = [self._tokenizer.convert_tokens_to_ids('[CLS]')]
        ner_tag_ids = [self._ner_tag_map['[CLS]']]

        for word, ner_tag in zip(text.split(), ner_tags):
            tokenized_word = self._tokenizer.tokenize(word)

            word_tokens.extend(tokenized_word)
            assert len(tokenized_word) > 0
            input_ids.extend(map(self._tokenizer.convert_tokens_to_ids, tokenized_word))

            ner_tag_list.append(ner_tag)
            ner_tag_ids.append(self._ner_tag_map[ner_tag])
            ner_tag_mask.append(1)
            # len(tokenized_word) > 1 only if it splits word in between, in which case
            # the first token gets assigned NER tag and the remaining ones get assigned
            # X
            ner_tag_list.extend(itertools.repeat('X', len(tokenized_word) - 1))
            ner_tag_ids.extend(itertools.repeat(self._ner_tag_map['X'], len(tokenized_word) - 1))
            ner_tag_mask.extend(itertools.repeat(0, len(tokenized_word) - 1))

        assert len(word_tokens) == len(ner_tag_list) == len(input_ids) == len(ner_tag_ids) == len(ner_tag_mask)

        if len(word_tokens) >= self._max_len:
            word_tokens = word_tokens[:(self._max_len - 1)]
            ner_tag_list = ner_tag_list[:(self._max_len - 1)]
            input_ids = input_ids[:(self._max_len - 1)]
            ner_tag_ids = ner_tag_ids[:(self._max_len - 1)]
            ner_tag_mask = ner_tag_mask[:(self._max_len - 1)]

        assert len(word_tokens) < self._max_len, len(word_tokens)

        word_tokens.append('[SEP]')
        ner_tag_list.append('[SEP]')
        input_ids.append(self._tokenizer.convert_tokens_to_ids('[SEP]'))
        ner_tag_ids.append(self._ner_tag_map['[SEP]'])
        ner_tag_mask.append(0)

        assert len(word_tokens) == len(ner_tag_list) == len(input_ids) == len(ner_tag_ids) == len(ner_tag_mask)

        sentence_id = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)

        left = self._max_len - len(input_ids)
        input_ids.extend(itertools.repeat(0, left))
        ner_tag_ids.extend(itertools.repeat(self._ner_tag_map['X'], left))
        attention_mask.extend(itertools.repeat(0, left))
        sentence_id.extend(itertools.repeat(0, left))
        ner_tag_mask.extend(itertools.repeat(0, left))


        assert len(word_tokens) == len(ner_tag_list)
        assert len(input_ids) == len(ner_tag_ids) == len(attention_mask) == len(sentence_id) == len(
            ner_tag_mask) == self._max_len, len(input_ids)
        return torch.LongTensor(input_ids), torch.LongTensor(ner_tag_ids), torch.LongTensor(
            attention_mask), torch.LongTensor(sentence_id), torch.BoolTensor(ner_tag_mask)

    @staticmethod
    def collate_fn(batch):
        return {
            'input_ids': torch.stack([x['input_ids'] for x in batch]),
            'attention_mask': torch.tensor([x['attention_mask'] for x in batch])
        }

## Training Environment

We are using BERT Tokenizer from `from_pretrained`.

The dataset is loaded into training, validation, and test sets using the load_examples function. DataLoader objects are created for each dataset, enabling efficient batching and shuffling of the data during training and evaluation.

In [89]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
test, train, valid = load_examples(DATABASE_DIR)
test = NERDataSet(test, tokenizer, NER_TAGS)
train = NERDataSet(train, tokenizer, NER_TAGS)
valid = NERDataSet(valid, tokenizer, NER_TAGS)
train_iter = DataLoader(dataset=train, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
eval_iter = DataLoader(dataset=valid, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_iter = DataLoader(dataset=test, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)


We will use *CrossEntropy* Loss Function and Stochastic Gradient Descent with a learning rate of 0.001 and momentum of 0.9 to accelerate convergence.

In [35]:


# model_primary = NERSmall.from_pretrained(BERT_MODEL, num_labels=TAGS_COUNT).to(device)
optimizer_primary = optim.SGD(model_primary.parameters(), lr=0.001, momentum=0.9)

### Training the Primary Model

In [87]:
def train_model(model, train_iter, optimizer):
  model = model.train()
  for epoch in trange(EPOCHS, desc='Epoch'):
      running_loss = 0.0
      tr_loss = 0
      nb_tr_steps = 0
      for batch in tqdm(train_iter):
          # input_ids, labels, input_mask, token_type_ids, label_masks = batch
          batch = tuple(t.to(device) for t in batch)
          b_input_ids, b_labels, b_input_mask, b_token_type_ids, b_label_masks = batch
          optimizer.zero_grad()

          loss, logits, labels = model(b_input_ids, token_type_ids=b_token_type_ids,
                                          attention_mask=b_input_mask, labels=b_labels,
                                          label_masks=b_label_masks)
          loss.backward()
          optimizer.step()

          running_loss += loss.item()
          # track train loss
          tr_loss += loss.item()
          nb_tr_steps += 1
          # update parameters
          optimizer.step()
          model.zero_grad()
          # print train loss per epoch
      print(f"Train loss: {tr_loss / nb_tr_steps}")

  print('Finished Training')

In [None]:
%%script false --no-raise-error
# train_model(model_primary, train_iter, optimizer_primary)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:47,  1.26it/s][A
  0%|          | 2/439 [00:01<04:56,  1.47it/s][A
  1%|          | 3/439 [00:02<04:45,  1.53it/s][A
  1%|          | 4/439 [00:02<04:37,  1.57it/s][A
  1%|          | 5/439 [00:03<04:41,  1.54it/s][A
  1%|▏         | 6/439 [00:03<04:44,  1.52it/s][A
  2%|▏         | 7/439 [00:04<04:40,  1.54it/s][A
  2%|▏         | 8/439 [00:05<04:35,  1.56it/s][A
  2%|▏         | 9/439 [00:05<04:32,  1.58it/s][A
  2%|▏         | 10/439 [00:06<04:31,  1.58it/s][A
  3%|▎         | 11/439 [00:07<04:30,  1.58it/s][A
  3%|▎         | 12/439 [00:07<04:30,  1.58it/s][A
  3%|▎         | 13/439 [00:08<04:29,  1.58it/s][A
  3%|▎         | 14/439 [00:08<04:26,  1.60it/s][A
  3%|▎         | 15/439 [00:09<04:26,  1.59it/s][A
  4%|▎         | 16/439 [00:10<04:25,  1.59it/s][A
  4%|▍         | 17/439 [00:10<04:28,  1.57it/s][A
  4%|▍         | 18/439 [00:11<04:36,

Train loss: 0.19989937151186082



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:09,  1.41it/s][A
  0%|          | 2/439 [00:01<04:41,  1.55it/s][A
  1%|          | 3/439 [00:01<04:29,  1.62it/s][A
  1%|          | 4/439 [00:02<04:27,  1.62it/s][A
  1%|          | 5/439 [00:03<04:22,  1.65it/s][A
  1%|▏         | 6/439 [00:03<04:20,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:19,  1.67it/s][A
  2%|▏         | 8/439 [00:04<04:17,  1.67it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 14/439 [00:08<04:12,  1.68it/s][A
  3%|▎         | 15/439 [00:09<04:13,  1.67it/s][A
  4%|▎         | 16/439 [00:09<04:11,  1.68it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.68it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.68it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.05360179376875417



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:07,  1.42it/s][A
  0%|          | 2/439 [00:01<04:40,  1.56it/s][A
  1%|          | 3/439 [00:01<04:28,  1.63it/s][A
  1%|          | 4/439 [00:02<04:25,  1.64it/s][A
  1%|          | 5/439 [00:03<04:22,  1.65it/s][A
  1%|▏         | 6/439 [00:03<04:20,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:20,  1.66it/s][A
  2%|▏         | 8/439 [00:04<04:19,  1.66it/s][A
  2%|▏         | 9/439 [00:05<04:17,  1.67it/s][A
  2%|▏         | 10/439 [00:06<04:15,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 14/439 [00:08<04:12,  1.69it/s][A
  3%|▎         | 15/439 [00:09<04:11,  1.68it/s][A
  4%|▎         | 16/439 [00:09<04:11,  1.68it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.69it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.03490082435638714



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:49,  1.25it/s][A
  0%|          | 2/439 [00:01<04:56,  1.47it/s][A
  1%|          | 3/439 [00:01<04:36,  1.58it/s][A
  1%|          | 4/439 [00:02<04:30,  1.61it/s][A
  1%|          | 5/439 [00:03<04:25,  1.64it/s][A
  1%|▏         | 6/439 [00:03<04:21,  1.66it/s][A
  2%|▏         | 7/439 [00:04<04:18,  1.67it/s][A
  2%|▏         | 8/439 [00:04<04:17,  1.68it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 13/439 [00:07<04:12,  1.69it/s][A
  3%|▎         | 14/439 [00:08<04:11,  1.69it/s][A
  3%|▎         | 15/439 [00:09<04:10,  1.69it/s][A
  4%|▎         | 16/439 [00:09<04:09,  1.70it/s][A
  4%|▍         | 17/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.024840473800426633



  0%|          | 0/439 [00:00<?, ?it/s][A
  0%|          | 1/439 [00:00<05:15,  1.39it/s][A
  0%|          | 2/439 [00:01<04:42,  1.55it/s][A
  1%|          | 3/439 [00:01<04:29,  1.62it/s][A
  1%|          | 4/439 [00:02<04:25,  1.64it/s][A
  1%|          | 5/439 [00:03<04:21,  1.66it/s][A
  1%|▏         | 6/439 [00:03<04:19,  1.67it/s][A
  2%|▏         | 7/439 [00:04<04:17,  1.68it/s][A
  2%|▏         | 8/439 [00:04<04:16,  1.68it/s][A
  2%|▏         | 9/439 [00:05<04:15,  1.68it/s][A
  2%|▏         | 10/439 [00:06<04:14,  1.68it/s][A
  3%|▎         | 11/439 [00:06<04:13,  1.69it/s][A
  3%|▎         | 12/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 13/439 [00:07<04:13,  1.68it/s][A
  3%|▎         | 14/439 [00:08<04:13,  1.68it/s][A
  3%|▎         | 15/439 [00:09<04:11,  1.69it/s][A
  4%|▎         | 16/439 [00:09<04:10,  1.69it/s][A
  4%|▍         | 17/439 [00:10<04:10,  1.68it/s][A
  4%|▍         | 18/439 [00:10<04:09,  1.69it/s][A
  4%|▍         | 19/439 [00:1

Train loss: 0.018063813425411718
Finished Training





Save our primary model to HuggingFace.

In [45]:
PRIMARY_MODEL_SAVEPATH = f"{REPO_NAME}primary_model"

HfApi().create_repo(repo_id=PRIMARY_MODEL_SAVEPATH, private=False, exist_ok=True)

model_primary.push_to_hub(PRIMARY_MODEL_SAVEPATH)

print("Primary model upload to Hugging Face succesfully!")

No files have been modified since last commit. Skipping to prevent empty commit.


Primary model upload to Hugging Face succesfully!


To download baseline model from HuggingFace:



In [42]:
# model_primary.load_state_dict(torch.load('model.pth'))

PRIMARY_MODEL_SAVEPATH = f"{REPO_NAME}primary_model"
model_primary = NERSmall.from_pretrained(PRIMARY_MODEL_SAVEPATH).to(device)

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

### Evaluating Primary Model

In [52]:
def evaluate(model, eval_iter, optimizer):
  model = model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps = 0
  predictions, true_labels = [], []

  for batch in tqdm(eval_iter):
      batch = tuple(t.to(device) for t in batch)

      b_input_ids, b_labels, b_input_mask, b_token_type_ids, b_label_masks = batch

      with torch.no_grad():
          tmp_eval_loss, logits, reduced_labels = model(b_input_ids,
                                                          token_type_ids=b_token_type_ids,
                                                          attention_mask=b_input_mask,
                                                          labels=b_labels,
                                                          label_masks=b_label_masks)

      logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
      logits = logits.detach().cpu().numpy()
      reduced_labels = reduced_labels.to('cpu').numpy()

      labels_to_append = []
      predictions_to_append = []

      for prediction, r_label in zip(logits, reduced_labels):
          preds = []
          labels = []
          for pred, lab in zip(prediction, r_label):
              if lab.item() == -1:  # masked label; -1 means do not collect this label
                  continue
              preds.append(pred)
              labels.append(lab)
          predictions_to_append.append(preds)
          labels_to_append.append(labels)

      predictions.extend(predictions_to_append)
      true_labels.append(labels_to_append)

      eval_loss += tmp_eval_loss.mean().item()

      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  print(f"Validation loss: {eval_loss}")
  pred_tags = [INV_NER_TAGS[p_i] for p in predictions for p_i in p]
  valid_tags = [INV_NER_TAGS[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
  print(f"Seq eval accuracy: {accuracy_score(valid_tags, pred_tags)}")
  print(f"F1-Score: {f1_score([valid_tags], [pred_tags])}")
  print("Classification report: -- ")
  print(classification_report([valid_tags], [pred_tags]))

  return eval_loss, accuracy_score(valid_tags, pred_tags), f1_score([valid_tags], [pred_tags]), classification_report([valid_tags], [pred_tags], output_dict=True)

In [None]:
evaluate(model_primary, eval_iter, optimizer_primary)

100%|██████████| 102/102 [00:20<00:00,  4.89it/s]


Validation loss: 0.04087882407475263
Seq eval accuracy: 0.9889133526878787
F1-Score: 0.9319836024429015
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.93      0.97      0.95      1837
        MISC       0.85      0.88      0.86       922
         ORG       0.92      0.89      0.90      1341
         PER       0.97      0.97      0.97      1836

   micro avg       0.93      0.94      0.93      5936
   macro avg       0.92      0.93      0.92      5936
weighted avg       0.93      0.94      0.93      5936



Let's test our model

In [43]:
evaluate(model_primary, test_iter, optimizer_primary)

100%|██████████| 108/108 [00:23<00:00,  4.64it/s]


Validation loss: 0.09429696848383173
Seq eval accuracy: 0.9795549044531099
F1-Score: 0.8895904586512321
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.88      0.94      0.91      1666
        MISC       0.75      0.80      0.77       702
         ORG       0.87      0.86      0.86      1661
         PER       0.96      0.94      0.95      1615

   micro avg       0.88      0.90      0.89      5644
   macro avg       0.86      0.88      0.87      5644
weighted avg       0.88      0.90      0.89      5644



(0.09429696848383173,
 0.9795549044531099,
 0.8895904586512321,
 '              precision    recall  f1-score   support\n\n         LOC       0.88      0.94      0.91      1666\n        MISC       0.75      0.80      0.77       702\n         ORG       0.87      0.86      0.86      1661\n         PER       0.96      0.94      0.95      1615\n\n   micro avg       0.88      0.90      0.89      5644\n   macro avg       0.86      0.88      0.87      5644\nweighted avg       0.88      0.90      0.89      5644\n')

Small test.

In [40]:

indices = torch.arange(10*BATCH_SIZE)
test_10 = Subset(test, indices)
test_iter_10 = DataLoader(dataset=test_10, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

evaluate(model_primary, test_iter_10, optimizer_primary)

100%|██████████| 10/10 [00:02<00:00,  4.30it/s]


Validation loss: 0.444116672873497
Seq eval accuracy: 0.8788551142666963
F1-Score: 0.6367165834719911
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.59      0.87      0.70       222
        MISC       0.01      0.04      0.02        57
         ORG       0.21      0.43      0.28        65
         PER       0.85      0.90      0.87       390

   micro avg       0.54      0.78      0.64       734
   macro avg       0.41      0.56      0.47       734
weighted avg       0.65      0.78      0.70       734



(0.444116672873497,
 0.8788551142666963,
 0.6367165834719911,
 '              precision    recall  f1-score   support\n\n         LOC       0.59      0.87      0.70       222\n        MISC       0.01      0.04      0.02        57\n         ORG       0.21      0.43      0.28        65\n         PER       0.85      0.90      0.87       390\n\n   micro avg       0.54      0.78      0.64       734\n   macro avg       0.41      0.56      0.47       734\nweighted avg       0.65      0.78      0.70       734\n')

## Generation of syntetic dataset

Now we need a function to retrieve model answers and generate a dataset to train our NER model.

In [20]:
with open('prompts-latest.json') as file:
    data = json.load(file)

pattern = re.compile(r"""Tokens: '(`.*`(, )?)+'""") #  '`([\w\d.,';":!?\[\]\(\)]+`(, )?)+'

INV_NER_TAGS = {v: k for k, v in NER_TAGS.items()}

keys = list(data.keys())
tokenss = [(list(map(lambda x: x[1:-1], next(re.finditer(pattern, str(key))).group(1).split(', '))),
            list(map(lambda k: INV_NER_TAGS.get(k, None), value['predicted'])))
           for key, value in data.items()]

tokenss = [(l, t) for l, t in tokenss if all(k is not None for k in t)]


In [21]:
def gen_examples(examples):
    nl = '\n'
    return f"""{f'{nl}{nl}'.join(f'{nl}'.join(f'{token} {tag}' for token, tag in zip(*tokens)) for tokens in examples)}"""


In [22]:
with open('data/llm.txt', 'w') as file:
    print(gen_examples(tokenss), file=file)


In [23]:
llm_pure = load_example('data/llm.txt', "llm_pure")
llm_pure = NERDataSet(llm_pure, tokenizer, NER_TAGS)
llm_pure_iter = DataLoader(dataset=llm_pure, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [24]:
model_llm_pure = NERSmall.from_pretrained(BERT_MODEL, num_labels=TAGS_COUNT).to(device)
optimizer_llm_pure = optim.SGD(model_llm_pure.parameters(), lr=0.001, momentum=0.9)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
 train_model(model_llm_pure, llm_pure_iter, optimizer_llm_pure)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:01<01:44,  1.66s/it][A
  3%|▎         | 2/64 [00:02<01:02,  1.00s/it][A
  5%|▍         | 3/64 [00:02<00:48,  1.26it/s][A
  6%|▋         | 4/64 [00:03<00:41,  1.43it/s][A
  8%|▊         | 5/64 [00:03<00:37,  1.56it/s][A
  9%|▉         | 6/64 [00:04<00:35,  1.64it/s][A
 11%|█         | 7/64 [00:04<00:33,  1.69it/s][A
 12%|█▎        | 8/64 [00:05<00:32,  1.73it/s][A
 14%|█▍        | 9/64 [00:06<00:31,  1.76it/s][A
 16%|█▌        | 10/64 [00:06<00:30,  1.77it/s][A
 17%|█▋        | 11/64 [00:07<00:29,  1.78it/s][A
 19%|█▉        | 12/64 [00:07<00:29,  1.77it/s][A
 20%|██        | 13/64 [00:08<00:29,  1.75it/s][A
 22%|██▏       | 14/64 [00:08<00:28,  1.75it/s][A
 23%|██▎       | 15/64 [00:09<00:27,  1.75it/s][A
 25%|██▌       | 16/64 [00:10<00:27,  1.75it/s][A
 27%|██▋       | 17/64 [00:10<00:27,  1.73it/s][A
 28%|██▊       | 18/64 [00:11<00:26,  1.74it/s][A
 30%

Train loss: 1.1510908249765635



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:42,  1.48it/s][A
  3%|▎         | 2/64 [00:01<00:38,  1.62it/s][A
  5%|▍         | 3/64 [00:01<00:36,  1.67it/s][A
  6%|▋         | 4/64 [00:02<00:35,  1.70it/s][A
  8%|▊         | 5/64 [00:02<00:34,  1.71it/s][A
  9%|▉         | 6/64 [00:03<00:33,  1.71it/s][A
 11%|█         | 7/64 [00:04<00:33,  1.72it/s][A
 12%|█▎        | 8/64 [00:04<00:32,  1.72it/s][A
 14%|█▍        | 9/64 [00:05<00:31,  1.72it/s][A
 16%|█▌        | 10/64 [00:05<00:31,  1.72it/s][A
 17%|█▋        | 11/64 [00:06<00:30,  1.72it/s][A
 19%|█▉        | 12/64 [00:07<00:30,  1.72it/s][A
 20%|██        | 13/64 [00:07<00:29,  1.72it/s][A
 22%|██▏       | 14/64 [00:08<00:29,  1.72it/s][A
 23%|██▎       | 15/64 [00:08<00:28,  1.71it/s][A
 25%|██▌       | 16/64 [00:09<00:28,  1.71it/s][A
 27%|██▋       | 17/64 [00:09<00:27,  1.71it/s][A
 28%|██▊       | 18/64 [00:10<00:26,  1.70it/s][A
 30%|██▉       | 19/64 [00:11<00:26,  1.70it/s]

Train loss: 0.7464661709964275



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:44,  1.41it/s][A
  3%|▎         | 2/64 [00:01<00:40,  1.53it/s][A
  5%|▍         | 3/64 [00:01<00:38,  1.59it/s][A
  6%|▋         | 4/64 [00:02<00:37,  1.60it/s][A
  8%|▊         | 5/64 [00:03<00:36,  1.61it/s][A
  9%|▉         | 6/64 [00:03<00:35,  1.62it/s][A
 11%|█         | 7/64 [00:04<00:34,  1.63it/s][A
 12%|█▎        | 8/64 [00:04<00:34,  1.63it/s][A
 14%|█▍        | 9/64 [00:05<00:33,  1.64it/s][A
 16%|█▌        | 10/64 [00:06<00:33,  1.64it/s][A
 17%|█▋        | 11/64 [00:06<00:32,  1.64it/s][A
 19%|█▉        | 12/64 [00:07<00:31,  1.64it/s][A
 20%|██        | 13/64 [00:08<00:31,  1.64it/s][A
 22%|██▏       | 14/64 [00:08<00:30,  1.64it/s][A
 23%|██▎       | 15/64 [00:09<00:30,  1.62it/s][A
 25%|██▌       | 16/64 [00:09<00:29,  1.61it/s][A
 27%|██▋       | 17/64 [00:10<00:29,  1.61it/s][A
 28%|██▊       | 18/64 [00:11<00:28,  1.61it/s][A
 30%|██▉       | 19/64 [00:11<00:28,  1.59it/s]

Train loss: 0.643373510800302



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:54,  1.15it/s][A
  3%|▎         | 2/64 [00:01<00:44,  1.39it/s][A
  5%|▍         | 3/64 [00:02<00:41,  1.48it/s][A
  6%|▋         | 4/64 [00:02<00:39,  1.52it/s][A
  8%|▊         | 5/64 [00:03<00:38,  1.55it/s][A
  9%|▉         | 6/64 [00:03<00:36,  1.60it/s][A
 11%|█         | 7/64 [00:04<00:34,  1.63it/s][A
 12%|█▎        | 8/64 [00:05<00:33,  1.65it/s][A
 14%|█▍        | 9/64 [00:05<00:33,  1.65it/s][A
 16%|█▌        | 10/64 [00:06<00:32,  1.66it/s][A
 17%|█▋        | 11/64 [00:06<00:31,  1.66it/s][A
 19%|█▉        | 12/64 [00:07<00:31,  1.66it/s][A
 20%|██        | 13/64 [00:08<00:30,  1.66it/s][A
 22%|██▏       | 14/64 [00:08<00:29,  1.67it/s][A
 23%|██▎       | 15/64 [00:09<00:29,  1.67it/s][A
 25%|██▌       | 16/64 [00:09<00:28,  1.67it/s][A
 27%|██▋       | 17/64 [00:10<00:28,  1.66it/s][A
 28%|██▊       | 18/64 [00:11<00:27,  1.67it/s][A
 30%|██▉       | 19/64 [00:11<00:26,  1.68it/s]

Train loss: 0.578693363815546



  0%|          | 0/64 [00:00<?, ?it/s][A
  2%|▏         | 1/64 [00:00<00:51,  1.21it/s][A
  3%|▎         | 2/64 [00:01<00:45,  1.37it/s][A
  5%|▍         | 3/64 [00:02<00:41,  1.47it/s][A
  6%|▋         | 4/64 [00:02<00:39,  1.53it/s][A
  8%|▊         | 5/64 [00:03<00:37,  1.57it/s][A
  9%|▉         | 6/64 [00:03<00:36,  1.59it/s][A
 11%|█         | 7/64 [00:04<00:36,  1.58it/s][A
 12%|█▎        | 8/64 [00:05<00:34,  1.61it/s][A
 14%|█▍        | 9/64 [00:05<00:34,  1.60it/s][A
 16%|█▌        | 10/64 [00:06<00:33,  1.62it/s][A
 17%|█▋        | 11/64 [00:07<00:32,  1.62it/s][A
 19%|█▉        | 12/64 [00:07<00:31,  1.63it/s][A
 20%|██        | 13/64 [00:08<00:31,  1.64it/s][A
 22%|██▏       | 14/64 [00:08<00:30,  1.65it/s][A
 23%|██▎       | 15/64 [00:09<00:29,  1.65it/s][A
 25%|██▌       | 16/64 [00:10<00:29,  1.65it/s][A
 27%|██▋       | 17/64 [00:10<00:28,  1.65it/s][A
 28%|██▊       | 18/64 [00:11<00:28,  1.64it/s][A
 30%|██▉       | 19/64 [00:11<00:27,  1.64it/s]

Train loss: 0.5405911449342966
Finished Training





Save our pure llm model to HuggingFace.

To download baseline model from HuggingFace:



In [46]:
# torch.save(model_llm_pure.state_dict(), 'model.pth')
PURE_LLM_MODEL_SAVEPATH = f"{REPO_NAME}llm_pure_model"

HfApi().create_repo(repo_id=PURE_LLM_MODEL_SAVEPATH, private=False, exist_ok=True)

model_llm_pure.push_to_hub(PURE_LLM_MODEL_SAVEPATH)

print("Pure llm model upload to Hugging Face succesfully!")

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Pure llm model upload to Hugging Face succesfully!


In [48]:
model_llm_pure = NERSmall.from_pretrained(PURE_LLM_MODEL_SAVEPATH).to(device)

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Let's evaluate our pure llm model:

In [47]:
evaluate(model_llm_pure, eval_iter, optimizer_llm_pure)

100%|██████████| 102/102 [00:21<00:00,  4.72it/s]


Validation loss: 0.5344288140973624
Seq eval accuracy: 0.8440270444050426
F1-Score: 0.4679277312962289
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.39      0.83      0.53      1837
        MISC       0.03      0.08      0.05       922
         ORG       0.26      0.40      0.32      1341
         PER       0.73      0.94      0.82      1836

   micro avg       0.37      0.65      0.47      5936
   macro avg       0.35      0.56      0.43      5936
weighted avg       0.41      0.65      0.50      5936



(0.5344288140973624,
 0.8440270444050426,
 0.4679277312962289,
 '              precision    recall  f1-score   support\n\n         LOC       0.39      0.83      0.53      1837\n        MISC       0.03      0.08      0.05       922\n         ORG       0.26      0.40      0.32      1341\n         PER       0.73      0.94      0.82      1836\n\n   micro avg       0.37      0.65      0.47      5936\n   macro avg       0.35      0.56      0.43      5936\nweighted avg       0.41      0.65      0.50      5936\n')

Syntetic dataset size:


In [None]:
len(tokenss)

2037

## Comparative analysis

In [55]:
llm_pure_metrics = evaluate(model_llm_pure, test_iter, optimizer_llm_pure)

100%|██████████| 108/108 [00:24<00:00,  4.43it/s]


Validation loss: 0.5251597337493742
Seq eval accuracy: 0.8393907404614689
F1-Score: 0.46034791506779227
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.39      0.88      0.54      1666
        MISC       0.03      0.08      0.04       702
         ORG       0.28      0.36      0.31      1661
         PER       0.72      0.92      0.81      1615

   micro avg       0.36      0.64      0.46      5644
   macro avg       0.35      0.56      0.43      5644
weighted avg       0.41      0.64      0.49      5644



In [54]:
llm_pure_metrics


(0.444116672873497,
 0.8788551142666963,
 0.6367165834719911,
 {'LOC': {'precision': 0.5884146341463414,
   'recall': 0.8693693693693694,
   'f1-score': 0.7018181818181818,
   'support': 222},
  'MISC': {'precision': 0.010256410256410256,
   'recall': 0.03508771929824561,
   'f1-score': 0.015873015873015876,
   'support': 57},
  'ORG': {'precision': 0.21052631578947367,
   'recall': 0.4307692307692308,
   'f1-score': 0.2828282828282828,
   'support': 65},
  'PER': {'precision': 0.8498789346246973,
   'recall': 0.9,
   'f1-score': 0.8742216687422166,
   'support': 390},
  'micro avg': {'precision': 0.5369504209541628,
   'recall': 0.782016348773842,
   'f1-score': 0.6367165834719911,
   'support': 734},
  'macro avg': {'precision': 0.41476907370423066,
   'recall': 0.5588065798592114,
   'f1-score': 0.4686852873154243,
   'support': 734},
  'weighted avg': {'precision': 0.6489777373229576,
   'recall': 0.782016348773842,
   'f1-score': 0.7030499829178489,
   'support': 734}})

In [56]:
primary_metrics = evaluate(model_primary, test_iter, optimizer_primary)

100%|██████████| 108/108 [00:23<00:00,  4.64it/s]


Validation loss: 0.09429696848383173
Seq eval accuracy: 0.9795549044531099
F1-Score: 0.8895904586512321
Classification report: -- 
              precision    recall  f1-score   support

         LOC       0.88      0.94      0.91      1666
        MISC       0.75      0.80      0.77       702
         ORG       0.87      0.86      0.86      1661
         PER       0.96      0.94      0.95      1615

   micro avg       0.88      0.90      0.89      5644
   macro avg       0.86      0.88      0.87      5644
weighted avg       0.88      0.90      0.89      5644



In [None]:
primary_metrics

In [66]:
def compare_pair(metric1, metric2):
  loss1, acc1, f1_1, _ = metric1
  loss2, acc2, f1_2, _ = metric2
  delta_acc = acc1 - acc2
  magnitude_loss = loss2/loss1 # suppose (sps) that acc1 > acc2
  delta_f1 = f1_1 - f1_2

  return delta_acc, magnitude_loss, delta_f1

def print_comparison(delta_acc, magnitude_loss, delta_f1, better="first"):
  if delta_acc < 0:
    print_comparison(-delta_acc, 1/magnitude_loss, -delta_f1, "second")
    return
  print(f"{better} model is better:\nAccuracy: {delta_acc * 100:.1f}%\nF1-score: {delta_f1*100:.1f}%\nLoss: {magnitude_loss:.3f}")


In [67]:
print_comparison(*compare_pair(primary_metrics, llm_pure_metrics))

first model is better:
Accuracy: 14.0%
F1-score: 42.9%
Loss: 5.569


# Stage 3

## Mix datasets


In [90]:
class HoneyConcatDataset(torch.utils.data.Dataset):
    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, i):
        return tuple(d[i] for d in self.datasets)

    def __len__(self):
        return min(len(d) for d in self.datasets)


In [91]:
def mix_datasets(expert, cheap, percentage):
  base_size = len(cheap)
  additional = Subset(expert, torch.arange(int(base_size * percentage)))
  dataset = ConcatDataset([cheap, additional])

  return DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)


In [93]:
models = []
model_metrics = []
paths = [f"{REPO_NAME}model_percentage{int(p*100)}" for p in PERCENTAGES]

for p, mixed_model_savepath in zip(PERCENTAGES, paths):
  model = NERSmall.from_pretrained(BERT_MODEL, num_labels=TAGS_COUNT).to(device)
  optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
  dataset_iter = mix_datasets(train, llm_pure, p)
  correct_iter = mix_datasets(test, llm_pure, p)
  train_model(model, dataset_iter, optimizer)
  model_metrics.append(evaluate(model, correct_iter, optimizer))

  HfApi().create_repo(repo_id=mixed_model_savepath, private=False, exist_ok=True)
  model_primary.push_to_hub(mixed_model_savepath)
  print(f"Mixed model with percentage {int(p*100)} upload to Hugging Face succesfully!")
  models.append(model)


Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:59,  1.08it/s][A
  3%|▎         | 2/65 [00:01<00:45,  1.38it/s][A
  5%|▍         | 3/65 [00:02<00:41,  1.51it/s][A
  6%|▌         | 4/65 [00:02<00:38,  1.59it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.62it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.65it/s][A
 11%|█         | 7/65 [00:04<00:35,  1.65it/s][A
 12%|█▏        | 8/65 [00:05<00:34,  1.67it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.66it/s][A
 15%|█▌        | 10/65 [00:06<00:32,  1.67it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.68it/s][A
 18%|█▊        | 12/65 [00:07<00:31,  1.68it/s][A
 20%|██        | 13/65 [00:08<00:30,  1.

Train loss: 1.1551043152809144



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:46,  1.39it/s][A
  3%|▎         | 2/65 [00:01<00:41,  1.53it/s][A
  5%|▍         | 3/65 [00:01<00:38,  1.59it/s][A
  6%|▌         | 4/65 [00:02<00:37,  1.61it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.63it/s][A
  9%|▉         | 6/65 [00:03<00:36,  1.63it/s][A
 11%|█         | 7/65 [00:04<00:35,  1.64it/s][A
 12%|█▏        | 8/65 [00:04<00:34,  1.64it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.65it/s][A
 15%|█▌        | 10/65 [00:06<00:33,  1.65it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.65it/s][A
 18%|█▊        | 12/65 [00:07<00:32,  1.66it/s][A
 20%|██        | 13/65 [00:07<00:31,  1.66it/s][A
 22%|██▏       | 14/65 [00:08<00:30,  1.66it/s][A
 23%|██▎       | 15/65 [00:09<00:29,  1.67it/s][A
 25%|██▍       | 16/65 [00:09<00:29,  1.67it/s][A
 26%|██▌       | 17/65 [00:10<00:28,  1.67it/s][A
 28%|██▊       | 18/65 [00:10<00:28,  1.67it/s][A
 29%|██▉       | 19/65 [00:11<00:27,  1.67it/s]

Train loss: 0.7533572201545422



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:45,  1.41it/s][A
  3%|▎         | 2/65 [00:01<00:40,  1.55it/s][A
  5%|▍         | 3/65 [00:01<00:38,  1.61it/s][A
  6%|▌         | 4/65 [00:02<00:37,  1.63it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.65it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.65it/s][A
 11%|█         | 7/65 [00:04<00:35,  1.66it/s][A
 12%|█▏        | 8/65 [00:04<00:34,  1.66it/s][A
 14%|█▍        | 9/65 [00:05<00:34,  1.64it/s][A
 15%|█▌        | 10/65 [00:06<00:33,  1.63it/s][A
 17%|█▋        | 11/65 [00:06<00:33,  1.63it/s][A
 18%|█▊        | 12/65 [00:07<00:32,  1.63it/s][A
 20%|██        | 13/65 [00:08<00:32,  1.61it/s][A
 22%|██▏       | 14/65 [00:08<00:31,  1.60it/s][A
 23%|██▎       | 15/65 [00:09<00:31,  1.61it/s][A
 25%|██▍       | 16/65 [00:09<00:30,  1.61it/s][A
 26%|██▌       | 17/65 [00:10<00:29,  1.62it/s][A
 28%|██▊       | 18/65 [00:11<00:28,  1.63it/s][A
 29%|██▉       | 19/65 [00:11<00:28,  1.63it/s]

Train loss: 0.6506178910915669



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:53,  1.20it/s][A
  3%|▎         | 2/65 [00:01<00:44,  1.43it/s][A
  5%|▍         | 3/65 [00:02<00:40,  1.53it/s][A
  6%|▌         | 4/65 [00:02<00:39,  1.56it/s][A
  8%|▊         | 5/65 [00:03<00:37,  1.59it/s][A
  9%|▉         | 6/65 [00:03<00:36,  1.61it/s][A
 11%|█         | 7/65 [00:04<00:35,  1.62it/s][A
 12%|█▏        | 8/65 [00:05<00:34,  1.63it/s][A
 14%|█▍        | 9/65 [00:05<00:34,  1.64it/s][A
 15%|█▌        | 10/65 [00:06<00:33,  1.65it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.65it/s][A
 18%|█▊        | 12/65 [00:07<00:32,  1.65it/s][A
 20%|██        | 13/65 [00:08<00:31,  1.64it/s][A
 22%|██▏       | 14/65 [00:08<00:31,  1.62it/s][A
 23%|██▎       | 15/65 [00:09<00:31,  1.60it/s][A
 25%|██▍       | 16/65 [00:10<00:30,  1.61it/s][A
 26%|██▌       | 17/65 [00:10<00:29,  1.62it/s][A
 28%|██▊       | 18/65 [00:11<00:29,  1.61it/s][A
 29%|██▉       | 19/65 [00:11<00:28,  1.60it/s]

Train loss: 0.5785831176317655



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:44,  1.43it/s][A
  3%|▎         | 2/65 [00:01<00:40,  1.55it/s][A
  5%|▍         | 3/65 [00:01<00:38,  1.61it/s][A
  6%|▌         | 4/65 [00:02<00:37,  1.64it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.65it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.66it/s][A
 11%|█         | 7/65 [00:04<00:34,  1.67it/s][A
 12%|█▏        | 8/65 [00:04<00:34,  1.67it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.67it/s][A
 15%|█▌        | 10/65 [00:06<00:32,  1.67it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.68it/s][A
 18%|█▊        | 12/65 [00:07<00:31,  1.68it/s][A
 20%|██        | 13/65 [00:07<00:31,  1.68it/s][A
 22%|██▏       | 14/65 [00:08<00:30,  1.68it/s][A
 23%|██▎       | 15/65 [00:09<00:29,  1.68it/s][A
 25%|██▍       | 16/65 [00:09<00:29,  1.68it/s][A
 26%|██▌       | 17/65 [00:10<00:28,  1.67it/s][A
 28%|██▊       | 18/65 [00:10<00:28,  1.67it/s][A
 29%|██▉       | 19/65 [00:11<00:27,  1.67it/s]

Train loss: 0.5368599405655494
Finished Training


100%|██████████| 65/65 [00:13<00:00,  4.76it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation loss: 0.4764656479542072
Seq eval accuracy: 0.8538081444879877
F1-Score: 0.6376014990630856
Classification report: -- 
              precision    recall  f1-score   support

        CLS]       0.00      0.00      0.00         8
         LOC       0.75      0.81      0.78      1661
        MISC       0.46      0.21      0.29      1061
         ORG       0.58      0.43      0.49       816
         PER       0.83      0.77      0.80       818
        SEP]       0.00      0.00      0.00         1

   micro avg       0.70      0.58      0.64      4365
   macro avg       0.44      0.37      0.39      4365
weighted avg       0.66      0.58      0.61      4365



model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Mixed model with percentage 1 upload to Hugging Face succesfully!


Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:50,  1.27it/s][A
  3%|▎         | 2/65 [00:01<00:42,  1.50it/s][A
  5%|▍         | 3/65 [00:01<00:39,  1.58it/s][A
  6%|▌         | 4/65 [00:02<00:37,  1.63it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.65it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.65it/s][A
 11%|█         | 7/65 [00:04<00:35,  1.65it/s][A
 12%|█▏        | 8/65 [00:04<00:34,  1.64it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.65it/s][A
 15%|█▌        | 10/65 [00:06<00:33,  1.65it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.66it/s][A
 18%|█▊        | 12/65 [00:07<00:32,  1.65it/s][A
 20%|██        | 13/65 [00:07<00:31,  1.

Train loss: 1.1413233747849099



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:47,  1.36it/s][A
  3%|▎         | 2/65 [00:01<00:41,  1.51it/s][A
  5%|▍         | 3/65 [00:01<00:39,  1.58it/s][A
  6%|▌         | 4/65 [00:02<00:37,  1.61it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.63it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.64it/s][A
 11%|█         | 7/65 [00:04<00:35,  1.65it/s][A
 12%|█▏        | 8/65 [00:04<00:34,  1.66it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.66it/s][A
 15%|█▌        | 10/65 [00:06<00:33,  1.66it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.67it/s][A
 18%|█▊        | 12/65 [00:07<00:31,  1.67it/s][A
 20%|██        | 13/65 [00:07<00:31,  1.67it/s][A
 22%|██▏       | 14/65 [00:08<00:30,  1.68it/s][A
 23%|██▎       | 15/65 [00:09<00:29,  1.68it/s][A
 25%|██▍       | 16/65 [00:09<00:29,  1.68it/s][A
 26%|██▌       | 17/65 [00:10<00:28,  1.67it/s][A
 28%|██▊       | 18/65 [00:10<00:28,  1.66it/s][A
 29%|██▉       | 19/65 [00:11<00:27,  1.66it/s]

Train loss: 0.7552606967779306



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:50,  1.28it/s][A
  3%|▎         | 2/65 [00:01<00:42,  1.49it/s][A
  5%|▍         | 3/65 [00:01<00:39,  1.58it/s][A
  6%|▌         | 4/65 [00:02<00:37,  1.62it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.65it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.66it/s][A
 11%|█         | 7/65 [00:04<00:34,  1.67it/s][A
 12%|█▏        | 8/65 [00:04<00:33,  1.68it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.68it/s][A
 15%|█▌        | 10/65 [00:06<00:32,  1.69it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.68it/s][A
 18%|█▊        | 12/65 [00:07<00:31,  1.69it/s][A
 20%|██        | 13/65 [00:07<00:30,  1.68it/s][A
 22%|██▏       | 14/65 [00:08<00:30,  1.68it/s][A
 23%|██▎       | 15/65 [00:09<00:29,  1.68it/s][A
 25%|██▍       | 16/65 [00:09<00:29,  1.69it/s][A
 26%|██▌       | 17/65 [00:10<00:28,  1.69it/s][A
 28%|██▊       | 18/65 [00:10<00:27,  1.68it/s][A
 29%|██▉       | 19/65 [00:11<00:27,  1.67it/s]

Train loss: 0.6461997004655692



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:51,  1.25it/s][A
  3%|▎         | 2/65 [00:01<00:43,  1.46it/s][A
  5%|▍         | 3/65 [00:01<00:39,  1.56it/s][A
  6%|▌         | 4/65 [00:02<00:38,  1.60it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.62it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.64it/s][A
 11%|█         | 7/65 [00:04<00:35,  1.65it/s][A
 12%|█▏        | 8/65 [00:04<00:34,  1.66it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.66it/s][A
 15%|█▌        | 10/65 [00:06<00:32,  1.67it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.67it/s][A
 18%|█▊        | 12/65 [00:07<00:31,  1.67it/s][A
 20%|██        | 13/65 [00:07<00:31,  1.67it/s][A
 22%|██▏       | 14/65 [00:08<00:30,  1.67it/s][A
 23%|██▎       | 15/65 [00:09<00:29,  1.67it/s][A
 25%|██▍       | 16/65 [00:09<00:29,  1.67it/s][A
 26%|██▌       | 17/65 [00:10<00:28,  1.67it/s][A
 28%|██▊       | 18/65 [00:10<00:28,  1.67it/s][A
 29%|██▉       | 19/65 [00:11<00:27,  1.66it/s]

Train loss: 0.5770858595004449



  0%|          | 0/65 [00:00<?, ?it/s][A
  2%|▏         | 1/65 [00:00<00:49,  1.29it/s][A
  3%|▎         | 2/65 [00:01<00:42,  1.48it/s][A
  5%|▍         | 3/65 [00:01<00:39,  1.57it/s][A
  6%|▌         | 4/65 [00:02<00:37,  1.61it/s][A
  8%|▊         | 5/65 [00:03<00:36,  1.64it/s][A
  9%|▉         | 6/65 [00:03<00:35,  1.65it/s][A
 11%|█         | 7/65 [00:04<00:34,  1.66it/s][A
 12%|█▏        | 8/65 [00:04<00:34,  1.67it/s][A
 14%|█▍        | 9/65 [00:05<00:33,  1.67it/s][A
 15%|█▌        | 10/65 [00:06<00:32,  1.67it/s][A
 17%|█▋        | 11/65 [00:06<00:32,  1.68it/s][A
 18%|█▊        | 12/65 [00:07<00:31,  1.68it/s][A
 20%|██        | 13/65 [00:07<00:30,  1.68it/s][A
 22%|██▏       | 14/65 [00:08<00:30,  1.68it/s][A
 23%|██▎       | 15/65 [00:09<00:29,  1.68it/s][A
 25%|██▍       | 16/65 [00:09<00:29,  1.68it/s][A
 26%|██▌       | 17/65 [00:10<00:28,  1.68it/s][A
 28%|██▊       | 18/65 [00:10<00:27,  1.68it/s][A
 29%|██▉       | 19/65 [00:11<00:27,  1.68it/s]

Train loss: 0.5404655296068925
Finished Training


100%|██████████| 65/65 [00:13<00:00,  4.81it/s]


Validation loss: 0.4637227952480316
Seq eval accuracy: 0.8596247779751333
F1-Score: 0.6572507001095824
Classification report: -- 
              precision    recall  f1-score   support

        CLS]       0.00      0.00      0.00         8
         LOC       0.80      0.80      0.80      1681
        MISC       0.47      0.28      0.35      1065
         ORG       0.55      0.52      0.54       817
         PER       0.89      0.74      0.81       853
        SEP]       0.00      0.00      0.00         1

   micro avg       0.71      0.61      0.66      4425
   macro avg       0.45      0.39      0.42      4425
weighted avg       0.69      0.61      0.64      4425



model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Mixed model with percentage 2 upload to Hugging Face succesfully!


Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/67 [00:00<?, ?it/s][A
  1%|▏         | 1/67 [00:00<00:52,  1.26it/s][A
  3%|▎         | 2/67 [00:01<00:43,  1.49it/s][A
  4%|▍         | 3/67 [00:01<00:40,  1.58it/s][A
  6%|▌         | 4/67 [00:02<00:38,  1.63it/s][A
  7%|▋         | 5/67 [00:03<00:37,  1.65it/s][A
  9%|▉         | 6/67 [00:03<00:36,  1.67it/s][A
 10%|█         | 7/67 [00:04<00:35,  1.67it/s][A
 12%|█▏        | 8/67 [00:04<00:35,  1.66it/s][A
 13%|█▎        | 9/67 [00:05<00:34,  1.66it/s][A
 15%|█▍        | 10/67 [00:06<00:34,  1.66it/s][A
 16%|█▋        | 11/67 [00:06<00:33,  1.66it/s][A
 18%|█▊        | 12/67 [00:07<00:33,  1.67it/s][A
 19%|█▉        | 13/67 [00:07<00:32,  1.

Train loss: 1.1142931971977006



  0%|          | 0/67 [00:00<?, ?it/s][A
  1%|▏         | 1/67 [00:00<00:48,  1.36it/s][A
  3%|▎         | 2/67 [00:01<00:43,  1.51it/s][A
  4%|▍         | 3/67 [00:01<00:40,  1.58it/s][A
  6%|▌         | 4/67 [00:02<00:39,  1.61it/s][A
  7%|▋         | 5/67 [00:03<00:38,  1.63it/s][A
  9%|▉         | 6/67 [00:03<00:37,  1.64it/s][A
 10%|█         | 7/67 [00:04<00:36,  1.65it/s][A
 12%|█▏        | 8/67 [00:04<00:35,  1.64it/s][A
 13%|█▎        | 9/67 [00:05<00:35,  1.65it/s][A
 15%|█▍        | 10/67 [00:06<00:34,  1.66it/s][A
 16%|█▋        | 11/67 [00:06<00:33,  1.67it/s][A
 18%|█▊        | 12/67 [00:07<00:32,  1.67it/s][A
 19%|█▉        | 13/67 [00:07<00:32,  1.67it/s][A
 21%|██        | 14/67 [00:08<00:31,  1.67it/s][A
 22%|██▏       | 15/67 [00:09<00:31,  1.67it/s][A
 24%|██▍       | 16/67 [00:09<00:30,  1.68it/s][A
 25%|██▌       | 17/67 [00:10<00:29,  1.68it/s][A
 27%|██▋       | 18/67 [00:10<00:29,  1.68it/s][A
 28%|██▊       | 19/67 [00:11<00:28,  1.68it/s]

Train loss: 0.7254876312035233



  0%|          | 0/67 [00:00<?, ?it/s][A
  1%|▏         | 1/67 [00:00<00:47,  1.38it/s][A
  3%|▎         | 2/67 [00:01<00:42,  1.55it/s][A
  4%|▍         | 3/67 [00:01<00:40,  1.59it/s][A
  6%|▌         | 4/67 [00:02<00:38,  1.62it/s][A
  7%|▋         | 5/67 [00:03<00:37,  1.64it/s][A
  9%|▉         | 6/67 [00:03<00:36,  1.65it/s][A
 10%|█         | 7/67 [00:04<00:36,  1.66it/s][A
 12%|█▏        | 8/67 [00:04<00:35,  1.66it/s][A
 13%|█▎        | 9/67 [00:05<00:34,  1.67it/s][A
 15%|█▍        | 10/67 [00:06<00:34,  1.67it/s][A
 16%|█▋        | 11/67 [00:06<00:33,  1.68it/s][A
 18%|█▊        | 12/67 [00:07<00:32,  1.68it/s][A
 19%|█▉        | 13/67 [00:07<00:32,  1.68it/s][A
 21%|██        | 14/67 [00:08<00:31,  1.68it/s][A
 22%|██▏       | 15/67 [00:09<00:30,  1.68it/s][A
 24%|██▍       | 16/67 [00:09<00:30,  1.68it/s][A
 25%|██▌       | 17/67 [00:10<00:29,  1.68it/s][A
 27%|██▋       | 18/67 [00:10<00:29,  1.68it/s][A
 28%|██▊       | 19/67 [00:11<00:28,  1.68it/s]

Train loss: 0.6199224982688676



  0%|          | 0/67 [00:00<?, ?it/s][A
  1%|▏         | 1/67 [00:00<00:48,  1.37it/s][A
  3%|▎         | 2/67 [00:01<00:43,  1.51it/s][A
  4%|▍         | 3/67 [00:01<00:40,  1.57it/s][A
  6%|▌         | 4/67 [00:02<00:39,  1.60it/s][A
  7%|▋         | 5/67 [00:03<00:38,  1.62it/s][A
  9%|▉         | 6/67 [00:03<00:37,  1.63it/s][A
 10%|█         | 7/67 [00:04<00:36,  1.65it/s][A
 12%|█▏        | 8/67 [00:04<00:35,  1.65it/s][A
 13%|█▎        | 9/67 [00:05<00:34,  1.66it/s][A
 15%|█▍        | 10/67 [00:06<00:34,  1.66it/s][A
 16%|█▋        | 11/67 [00:06<00:33,  1.67it/s][A
 18%|█▊        | 12/67 [00:07<00:32,  1.67it/s][A
 19%|█▉        | 13/67 [00:07<00:32,  1.67it/s][A
 21%|██        | 14/67 [00:08<00:31,  1.67it/s][A
 22%|██▏       | 15/67 [00:09<00:31,  1.67it/s][A
 24%|██▍       | 16/67 [00:09<00:30,  1.68it/s][A
 25%|██▌       | 17/67 [00:10<00:29,  1.68it/s][A
 27%|██▋       | 18/67 [00:10<00:29,  1.67it/s][A
 28%|██▊       | 19/67 [00:11<00:28,  1.67it/s]

Train loss: 0.5703826155235519



  0%|          | 0/67 [00:00<?, ?it/s][A
  1%|▏         | 1/67 [00:00<00:51,  1.27it/s][A
  3%|▎         | 2/67 [00:01<00:44,  1.47it/s][A
  4%|▍         | 3/67 [00:01<00:41,  1.56it/s][A
  6%|▌         | 4/67 [00:02<00:39,  1.59it/s][A
  7%|▋         | 5/67 [00:03<00:38,  1.61it/s][A
  9%|▉         | 6/67 [00:03<00:37,  1.64it/s][A
 10%|█         | 7/67 [00:04<00:36,  1.65it/s][A
 12%|█▏        | 8/67 [00:04<00:35,  1.66it/s][A
 13%|█▎        | 9/67 [00:05<00:34,  1.67it/s][A
 15%|█▍        | 10/67 [00:06<00:34,  1.67it/s][A
 16%|█▋        | 11/67 [00:06<00:33,  1.68it/s][A
 18%|█▊        | 12/67 [00:07<00:32,  1.68it/s][A
 19%|█▉        | 13/67 [00:07<00:32,  1.68it/s][A
 21%|██        | 14/67 [00:08<00:31,  1.68it/s][A
 22%|██▏       | 15/67 [00:09<00:30,  1.68it/s][A
 24%|██▍       | 16/67 [00:09<00:30,  1.68it/s][A
 25%|██▌       | 17/67 [00:10<00:29,  1.68it/s][A
 27%|██▋       | 18/67 [00:10<00:29,  1.68it/s][A
 28%|██▊       | 19/67 [00:11<00:28,  1.68it/s]

Train loss: 0.5239344448295992
Finished Training


100%|██████████| 67/67 [00:13<00:00,  4.80it/s]


Validation loss: 0.4483308267237535
Seq eval accuracy: 0.8633487074975866
F1-Score: 0.6579796772265392
Classification report: -- 
              precision    recall  f1-score   support

        CLS]       0.00      0.00      0.00         8
         LOC       0.82      0.78      0.80      1727
        MISC       0.44      0.27      0.34      1071
         ORG       0.55      0.52      0.54       817
         PER       0.91      0.74      0.82       920
        SEP]       0.00      0.00      0.00         1

   micro avg       0.72      0.61      0.66      4544
   macro avg       0.45      0.39      0.42      4544
weighted avg       0.70      0.61      0.65      4544



model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Mixed model with percentage 5 upload to Hugging Face succesfully!


Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/70 [00:00<?, ?it/s][A
  1%|▏         | 1/70 [00:00<00:54,  1.27it/s][A
  3%|▎         | 2/70 [00:01<00:45,  1.50it/s][A
  4%|▍         | 3/70 [00:01<00:42,  1.58it/s][A
  6%|▌         | 4/70 [00:02<00:40,  1.63it/s][A
  7%|▋         | 5/70 [00:03<00:39,  1.65it/s][A
  9%|▊         | 6/70 [00:03<00:38,  1.67it/s][A
 10%|█         | 7/70 [00:04<00:37,  1.68it/s][A
 11%|█▏        | 8/70 [00:04<00:36,  1.68it/s][A
 13%|█▎        | 9/70 [00:05<00:36,  1.69it/s][A
 14%|█▍        | 10/70 [00:06<00:35,  1.68it/s][A
 16%|█▌        | 11/70 [00:06<00:35,  1.68it/s][A
 17%|█▋        | 12/70 [00:07<00:34,  1.68it/s][A
 19%|█▊        | 13/70 [00:07<00:33,  1.

Train loss: 1.0772067495754787



  0%|          | 0/70 [00:00<?, ?it/s][A
  1%|▏         | 1/70 [00:00<00:49,  1.38it/s][A
  3%|▎         | 2/70 [00:01<00:44,  1.53it/s][A
  4%|▍         | 3/70 [00:01<00:42,  1.59it/s][A
  6%|▌         | 4/70 [00:02<00:40,  1.62it/s][A
  7%|▋         | 5/70 [00:03<00:39,  1.63it/s][A
  9%|▊         | 6/70 [00:03<00:39,  1.63it/s][A
 10%|█         | 7/70 [00:04<00:38,  1.64it/s][A
 11%|█▏        | 8/70 [00:04<00:37,  1.64it/s][A
 13%|█▎        | 9/70 [00:05<00:37,  1.64it/s][A
 14%|█▍        | 10/70 [00:06<00:36,  1.65it/s][A
 16%|█▌        | 11/70 [00:06<00:35,  1.66it/s][A
 17%|█▋        | 12/70 [00:07<00:34,  1.66it/s][A
 19%|█▊        | 13/70 [00:07<00:34,  1.67it/s][A
 20%|██        | 14/70 [00:08<00:33,  1.67it/s][A
 21%|██▏       | 15/70 [00:09<00:32,  1.67it/s][A
 23%|██▎       | 16/70 [00:09<00:32,  1.68it/s][A
 24%|██▍       | 17/70 [00:10<00:31,  1.68it/s][A
 26%|██▌       | 18/70 [00:10<00:30,  1.68it/s][A
 27%|██▋       | 19/70 [00:11<00:30,  1.68it/s]

Train loss: 0.6996300663266863



  0%|          | 0/70 [00:00<?, ?it/s][A
  1%|▏         | 1/70 [00:00<00:55,  1.25it/s][A
  3%|▎         | 2/70 [00:01<00:46,  1.46it/s][A
  4%|▍         | 3/70 [00:02<00:43,  1.56it/s][A
  6%|▌         | 4/70 [00:02<00:41,  1.60it/s][A
  7%|▋         | 5/70 [00:03<00:39,  1.63it/s][A
  9%|▊         | 6/70 [00:03<00:38,  1.65it/s][A
 10%|█         | 7/70 [00:04<00:38,  1.66it/s][A
 11%|█▏        | 8/70 [00:04<00:37,  1.67it/s][A
 13%|█▎        | 9/70 [00:05<00:36,  1.67it/s][A
 14%|█▍        | 10/70 [00:06<00:35,  1.67it/s][A
 16%|█▌        | 11/70 [00:06<00:35,  1.68it/s][A
 17%|█▋        | 12/70 [00:07<00:34,  1.68it/s][A
 19%|█▊        | 13/70 [00:07<00:33,  1.68it/s][A
 20%|██        | 14/70 [00:08<00:33,  1.68it/s][A
 21%|██▏       | 15/70 [00:09<00:32,  1.68it/s][A
 23%|██▎       | 16/70 [00:09<00:32,  1.68it/s][A
 24%|██▍       | 17/70 [00:10<00:31,  1.68it/s][A
 26%|██▌       | 18/70 [00:10<00:31,  1.68it/s][A
 27%|██▋       | 19/70 [00:11<00:30,  1.67it/s]

Train loss: 0.6019107299191612



  0%|          | 0/70 [00:00<?, ?it/s][A
  1%|▏         | 1/70 [00:00<00:54,  1.28it/s][A
  3%|▎         | 2/70 [00:01<00:46,  1.48it/s][A
  4%|▍         | 3/70 [00:01<00:42,  1.57it/s][A
  6%|▌         | 4/70 [00:02<00:41,  1.61it/s][A
  7%|▋         | 5/70 [00:03<00:39,  1.63it/s][A
  9%|▊         | 6/70 [00:03<00:39,  1.64it/s][A
 10%|█         | 7/70 [00:04<00:38,  1.65it/s][A
 11%|█▏        | 8/70 [00:04<00:37,  1.66it/s][A
 13%|█▎        | 9/70 [00:05<00:36,  1.66it/s][A
 14%|█▍        | 10/70 [00:06<00:35,  1.67it/s][A
 16%|█▌        | 11/70 [00:06<00:35,  1.67it/s][A
 17%|█▋        | 12/70 [00:07<00:34,  1.67it/s][A
 19%|█▊        | 13/70 [00:07<00:34,  1.67it/s][A
 20%|██        | 14/70 [00:08<00:33,  1.67it/s][A
 21%|██▏       | 15/70 [00:09<00:32,  1.67it/s][A
 23%|██▎       | 16/70 [00:09<00:32,  1.67it/s][A
 24%|██▍       | 17/70 [00:10<00:31,  1.68it/s][A
 26%|██▌       | 18/70 [00:10<00:31,  1.67it/s][A
 27%|██▋       | 19/70 [00:11<00:30,  1.67it/s]

Train loss: 0.5501926737172264



  0%|          | 0/70 [00:00<?, ?it/s][A
  1%|▏         | 1/70 [00:00<00:49,  1.39it/s][A
  3%|▎         | 2/70 [00:01<00:44,  1.54it/s][A
  4%|▍         | 3/70 [00:01<00:41,  1.60it/s][A
  6%|▌         | 4/70 [00:02<00:40,  1.63it/s][A
  7%|▋         | 5/70 [00:03<00:39,  1.65it/s][A
  9%|▊         | 6/70 [00:03<00:38,  1.66it/s][A
 10%|█         | 7/70 [00:04<00:37,  1.67it/s][A
 11%|█▏        | 8/70 [00:04<00:37,  1.67it/s][A
 13%|█▎        | 9/70 [00:05<00:36,  1.67it/s][A
 14%|█▍        | 10/70 [00:06<00:35,  1.68it/s][A
 16%|█▌        | 11/70 [00:06<00:35,  1.68it/s][A
 17%|█▋        | 12/70 [00:07<00:34,  1.68it/s][A
 19%|█▊        | 13/70 [00:07<00:33,  1.68it/s][A
 20%|██        | 14/70 [00:08<00:33,  1.68it/s][A
 21%|██▏       | 15/70 [00:09<00:32,  1.67it/s][A
 23%|██▎       | 16/70 [00:09<00:32,  1.66it/s][A
 24%|██▍       | 17/70 [00:10<00:31,  1.66it/s][A
 26%|██▌       | 18/70 [00:10<00:31,  1.67it/s][A
 27%|██▋       | 19/70 [00:11<00:30,  1.66it/s]

Train loss: 0.5116712880986077
Finished Training


100%|██████████| 70/70 [00:14<00:00,  4.83it/s]


Validation loss: 0.480901848418372
Seq eval accuracy: 0.8556380417335474
F1-Score: 0.6356536502546689
Classification report: -- 
              precision    recall  f1-score   support

        CLS]       0.00      0.00      0.00         8
         LOC       0.75      0.80      0.78      1789
        MISC       0.39      0.22      0.28      1084
         ORG       0.54      0.43      0.48       867
         PER       0.89      0.75      0.81      1016
        SEP]       0.00      0.00      0.00         1

   micro avg       0.69      0.59      0.64      4765
   macro avg       0.43      0.37      0.39      4765
weighted avg       0.66      0.59      0.62      4765



model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Mixed model with percentage 10 upload to Hugging Face succesfully!


Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/80 [00:00<?, ?it/s][A
  1%|▏         | 1/80 [00:00<01:03,  1.24it/s][A
  2%|▎         | 2/80 [00:01<00:52,  1.48it/s][A
  4%|▍         | 3/80 [00:01<00:49,  1.57it/s][A
  5%|▌         | 4/80 [00:02<00:46,  1.62it/s][A
  6%|▋         | 5/80 [00:03<00:45,  1.65it/s][A
  8%|▊         | 6/80 [00:03<00:44,  1.66it/s][A
  9%|▉         | 7/80 [00:04<00:43,  1.67it/s][A
 10%|█         | 8/80 [00:04<00:42,  1.68it/s][A
 11%|█▏        | 9/80 [00:05<00:42,  1.68it/s][A
 12%|█▎        | 10/80 [00:06<00:41,  1.68it/s][A
 14%|█▍        | 11/80 [00:06<00:41,  1.68it/s][A
 15%|█▌        | 12/80 [00:07<00:40,  1.68it/s][A
 16%|█▋        | 13/80 [00:07<00:39,  1.

Train loss: 1.0084951959550381



  0%|          | 0/80 [00:00<?, ?it/s][A
  1%|▏         | 1/80 [00:00<00:57,  1.37it/s][A
  2%|▎         | 2/80 [00:01<00:50,  1.54it/s][A
  4%|▍         | 3/80 [00:01<00:48,  1.59it/s][A
  5%|▌         | 4/80 [00:02<00:46,  1.63it/s][A
  6%|▋         | 5/80 [00:03<00:45,  1.65it/s][A
  8%|▊         | 6/80 [00:03<00:44,  1.66it/s][A
  9%|▉         | 7/80 [00:04<00:43,  1.67it/s][A
 10%|█         | 8/80 [00:04<00:43,  1.67it/s][A
 11%|█▏        | 9/80 [00:05<00:42,  1.68it/s][A
 12%|█▎        | 10/80 [00:06<00:41,  1.67it/s][A
 14%|█▍        | 11/80 [00:06<00:41,  1.67it/s][A
 15%|█▌        | 12/80 [00:07<00:40,  1.67it/s][A
 16%|█▋        | 13/80 [00:07<00:39,  1.68it/s][A
 18%|█▊        | 14/80 [00:08<00:39,  1.68it/s][A
 19%|█▉        | 15/80 [00:09<00:38,  1.68it/s][A
 20%|██        | 16/80 [00:09<00:38,  1.68it/s][A
 21%|██▏       | 17/80 [00:10<00:37,  1.68it/s][A
 22%|██▎       | 18/80 [00:10<00:36,  1.68it/s][A
 24%|██▍       | 19/80 [00:11<00:36,  1.68it/s]

Train loss: 0.6392712458968163



  0%|          | 0/80 [00:00<?, ?it/s][A
  1%|▏         | 1/80 [00:00<01:02,  1.27it/s][A
  2%|▎         | 2/80 [00:01<00:53,  1.46it/s][A
  4%|▍         | 3/80 [00:01<00:49,  1.56it/s][A
  5%|▌         | 4/80 [00:02<00:47,  1.60it/s][A
  6%|▋         | 5/80 [00:03<00:46,  1.62it/s][A
  8%|▊         | 6/80 [00:03<00:45,  1.64it/s][A
  9%|▉         | 7/80 [00:04<00:44,  1.65it/s][A
 10%|█         | 8/80 [00:04<00:43,  1.66it/s][A
 11%|█▏        | 9/80 [00:05<00:42,  1.66it/s][A
 12%|█▎        | 10/80 [00:06<00:42,  1.66it/s][A
 14%|█▍        | 11/80 [00:06<00:41,  1.66it/s][A
 15%|█▌        | 12/80 [00:07<00:40,  1.67it/s][A
 16%|█▋        | 13/80 [00:07<00:40,  1.67it/s][A
 18%|█▊        | 14/80 [00:08<00:39,  1.67it/s][A
 19%|█▉        | 15/80 [00:09<00:38,  1.67it/s][A
 20%|██        | 16/80 [00:09<00:38,  1.67it/s][A
 21%|██▏       | 17/80 [00:10<00:37,  1.67it/s][A
 22%|██▎       | 18/80 [00:10<00:37,  1.67it/s][A
 24%|██▍       | 19/80 [00:11<00:36,  1.67it/s]

Train loss: 0.5511350151151418



  0%|          | 0/80 [00:00<?, ?it/s][A
  1%|▏         | 1/80 [00:00<00:58,  1.35it/s][A
  2%|▎         | 2/80 [00:01<00:51,  1.52it/s][A
  4%|▍         | 3/80 [00:01<00:48,  1.58it/s][A
  5%|▌         | 4/80 [00:02<00:47,  1.62it/s][A
  6%|▋         | 5/80 [00:03<00:45,  1.64it/s][A
  8%|▊         | 6/80 [00:03<00:44,  1.65it/s][A
  9%|▉         | 7/80 [00:04<00:44,  1.65it/s][A
 10%|█         | 8/80 [00:04<00:43,  1.66it/s][A
 11%|█▏        | 9/80 [00:05<00:42,  1.66it/s][A
 12%|█▎        | 10/80 [00:06<00:42,  1.66it/s][A
 14%|█▍        | 11/80 [00:06<00:41,  1.67it/s][A
 15%|█▌        | 12/80 [00:07<00:40,  1.67it/s][A
 16%|█▋        | 13/80 [00:07<00:40,  1.67it/s][A
 18%|█▊        | 14/80 [00:08<00:39,  1.67it/s][A
 19%|█▉        | 15/80 [00:09<00:38,  1.68it/s][A
 20%|██        | 16/80 [00:09<00:38,  1.67it/s][A
 21%|██▏       | 17/80 [00:10<00:37,  1.67it/s][A
 22%|██▎       | 18/80 [00:10<00:37,  1.67it/s][A
 24%|██▍       | 19/80 [00:11<00:36,  1.68it/s]

Train loss: 0.508796464279294



  0%|          | 0/80 [00:00<?, ?it/s][A
  1%|▏         | 1/80 [00:00<00:56,  1.39it/s][A
  2%|▎         | 2/80 [00:01<00:50,  1.54it/s][A
  4%|▍         | 3/80 [00:01<00:48,  1.60it/s][A
  5%|▌         | 4/80 [00:02<00:46,  1.63it/s][A
  6%|▋         | 5/80 [00:03<00:45,  1.65it/s][A
  8%|▊         | 6/80 [00:03<00:44,  1.66it/s][A
  9%|▉         | 7/80 [00:04<00:43,  1.66it/s][A
 10%|█         | 8/80 [00:04<00:43,  1.67it/s][A
 11%|█▏        | 9/80 [00:05<00:42,  1.67it/s][A
 12%|█▎        | 10/80 [00:06<00:41,  1.67it/s][A
 14%|█▍        | 11/80 [00:06<00:41,  1.67it/s][A
 15%|█▌        | 12/80 [00:07<00:40,  1.67it/s][A
 16%|█▋        | 13/80 [00:07<00:40,  1.67it/s][A
 18%|█▊        | 14/80 [00:08<00:39,  1.66it/s][A
 19%|█▉        | 15/80 [00:09<00:39,  1.65it/s][A
 20%|██        | 16/80 [00:09<00:38,  1.65it/s][A
 21%|██▏       | 17/80 [00:10<00:38,  1.65it/s][A
 22%|██▎       | 18/80 [00:10<00:37,  1.66it/s][A
 24%|██▍       | 19/80 [00:11<00:36,  1.67it/s]

Train loss: 0.4781316600739956
Finished Training


100%|██████████| 80/80 [00:16<00:00,  4.83it/s]


Validation loss: 0.46214157100766895
Seq eval accuracy: 0.8629014873895237
F1-Score: 0.6366427840327533
Classification report: -- 
              precision    recall  f1-score   support

        CLS]       0.00      0.00      0.00         8
         LOC       0.71      0.81      0.76      1931
        MISC       0.44      0.17      0.24      1115
         ORG       0.52      0.39      0.44      1009
         PER       0.92      0.77      0.84      1246
        SEP]       0.00      0.00      0.00         1

   micro avg       0.70      0.59      0.64      5310
   macro avg       0.43      0.36      0.38      5310
weighted avg       0.66      0.59      0.61      5310



model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Mixed model with percentage 25 upload to Hugging Face succesfully!


Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/96 [00:00<?, ?it/s][A
  1%|          | 1/96 [00:00<01:18,  1.21it/s][A
  2%|▏         | 2/96 [00:01<01:04,  1.45it/s][A
  3%|▎         | 3/96 [00:02<00:59,  1.56it/s][A
  4%|▍         | 4/96 [00:02<00:56,  1.61it/s][A
  5%|▌         | 5/96 [00:03<00:55,  1.64it/s][A
  6%|▋         | 6/96 [00:03<00:54,  1.66it/s][A
  7%|▋         | 7/96 [00:04<00:53,  1.68it/s][A
  8%|▊         | 8/96 [00:04<00:52,  1.68it/s][A
  9%|▉         | 9/96 [00:05<00:51,  1.68it/s][A
 10%|█         | 10/96 [00:06<00:51,  1.69it/s][A
 11%|█▏        | 11/96 [00:06<00:50,  1.69it/s][A
 12%|█▎        | 12/96 [00:07<00:49,  1.69it/s][A
 14%|█▎        | 13/96 [00:07<00:49,  1.

Train loss: 0.8984471121802926



  0%|          | 0/96 [00:00<?, ?it/s][A
  1%|          | 1/96 [00:00<01:08,  1.38it/s][A
  2%|▏         | 2/96 [00:01<01:01,  1.54it/s][A
  3%|▎         | 3/96 [00:01<00:57,  1.61it/s][A
  4%|▍         | 4/96 [00:02<00:55,  1.64it/s][A
  5%|▌         | 5/96 [00:03<00:54,  1.66it/s][A
  6%|▋         | 6/96 [00:03<00:53,  1.68it/s][A
  7%|▋         | 7/96 [00:04<00:52,  1.68it/s][A
  8%|▊         | 8/96 [00:04<00:52,  1.68it/s][A
  9%|▉         | 9/96 [00:05<00:51,  1.69it/s][A
 10%|█         | 10/96 [00:06<00:51,  1.68it/s][A
 11%|█▏        | 11/96 [00:06<00:50,  1.69it/s][A
 12%|█▎        | 12/96 [00:07<00:49,  1.69it/s][A
 14%|█▎        | 13/96 [00:07<00:49,  1.69it/s][A
 15%|█▍        | 14/96 [00:08<00:48,  1.69it/s][A
 16%|█▌        | 15/96 [00:09<00:47,  1.69it/s][A
 17%|█▋        | 16/96 [00:09<00:47,  1.68it/s][A
 18%|█▊        | 17/96 [00:10<00:47,  1.67it/s][A
 19%|█▉        | 18/96 [00:10<00:46,  1.68it/s][A
 20%|█▉        | 19/96 [00:11<00:45,  1.68it/s]

Train loss: 0.5563099325324098



  0%|          | 0/96 [00:00<?, ?it/s][A
  1%|          | 1/96 [00:00<01:10,  1.35it/s][A
  2%|▏         | 2/96 [00:01<01:02,  1.51it/s][A
  3%|▎         | 3/96 [00:01<00:58,  1.59it/s][A
  4%|▍         | 4/96 [00:02<00:56,  1.61it/s][A
  5%|▌         | 5/96 [00:03<00:55,  1.63it/s][A
  6%|▋         | 6/96 [00:03<00:54,  1.64it/s][A
  7%|▋         | 7/96 [00:04<00:54,  1.65it/s][A
  8%|▊         | 8/96 [00:04<00:53,  1.65it/s][A
  9%|▉         | 9/96 [00:05<00:52,  1.65it/s][A
 10%|█         | 10/96 [00:06<00:52,  1.64it/s][A
 11%|█▏        | 11/96 [00:06<00:51,  1.65it/s][A
 12%|█▎        | 12/96 [00:07<00:50,  1.66it/s][A
 14%|█▎        | 13/96 [00:07<00:49,  1.66it/s][A
 15%|█▍        | 14/96 [00:08<00:49,  1.66it/s][A
 16%|█▌        | 15/96 [00:09<00:48,  1.67it/s][A
 17%|█▋        | 16/96 [00:09<00:47,  1.67it/s][A
 18%|█▊        | 17/96 [00:10<00:47,  1.67it/s][A
 19%|█▉        | 18/96 [00:10<00:46,  1.67it/s][A
 20%|█▉        | 19/96 [00:11<00:46,  1.67it/s]

Train loss: 0.49314487259835005



  0%|          | 0/96 [00:00<?, ?it/s][A
  1%|          | 1/96 [00:00<01:19,  1.20it/s][A
  2%|▏         | 2/96 [00:01<01:05,  1.43it/s][A
  3%|▎         | 3/96 [00:02<01:00,  1.54it/s][A
  4%|▍         | 4/96 [00:02<00:57,  1.59it/s][A
  5%|▌         | 5/96 [00:03<00:56,  1.62it/s][A
  6%|▋         | 6/96 [00:03<00:54,  1.64it/s][A
  7%|▋         | 7/96 [00:04<00:53,  1.65it/s][A
  8%|▊         | 8/96 [00:05<00:53,  1.66it/s][A
  9%|▉         | 9/96 [00:05<00:52,  1.67it/s][A
 10%|█         | 10/96 [00:06<00:51,  1.67it/s][A
 11%|█▏        | 11/96 [00:06<00:50,  1.67it/s][A
 12%|█▎        | 12/96 [00:07<00:50,  1.68it/s][A
 14%|█▎        | 13/96 [00:07<00:49,  1.67it/s][A
 15%|█▍        | 14/96 [00:08<00:48,  1.68it/s][A
 16%|█▌        | 15/96 [00:09<00:48,  1.67it/s][A
 17%|█▋        | 16/96 [00:09<00:47,  1.68it/s][A
 18%|█▊        | 17/96 [00:10<00:47,  1.68it/s][A
 19%|█▉        | 18/96 [00:10<00:46,  1.68it/s][A
 20%|█▉        | 19/96 [00:11<00:45,  1.67it/s]

Train loss: 0.449430116917938



  0%|          | 0/96 [00:00<?, ?it/s][A
  1%|          | 1/96 [00:00<01:10,  1.35it/s][A
  2%|▏         | 2/96 [00:01<01:02,  1.51it/s][A
  3%|▎         | 3/96 [00:01<00:58,  1.58it/s][A
  4%|▍         | 4/96 [00:02<00:56,  1.62it/s][A
  5%|▌         | 5/96 [00:03<00:55,  1.64it/s][A
  6%|▋         | 6/96 [00:03<00:54,  1.65it/s][A
  7%|▋         | 7/96 [00:04<00:53,  1.66it/s][A
  8%|▊         | 8/96 [00:04<00:52,  1.67it/s][A
  9%|▉         | 9/96 [00:05<00:52,  1.67it/s][A
 10%|█         | 10/96 [00:06<00:51,  1.67it/s][A
 11%|█▏        | 11/96 [00:06<00:50,  1.67it/s][A
 12%|█▎        | 12/96 [00:07<00:50,  1.65it/s][A
 14%|█▎        | 13/96 [00:07<00:50,  1.66it/s][A
 15%|█▍        | 14/96 [00:08<00:49,  1.66it/s][A
 16%|█▌        | 15/96 [00:09<00:49,  1.65it/s][A
 17%|█▋        | 16/96 [00:09<00:48,  1.65it/s][A
 18%|█▊        | 17/96 [00:10<00:47,  1.66it/s][A
 19%|█▉        | 18/96 [00:10<00:46,  1.66it/s][A
 20%|█▉        | 19/96 [00:11<00:46,  1.67it/s]

Train loss: 0.42745211472113925
Finished Training


100%|██████████| 96/96 [00:19<00:00,  4.83it/s]


Validation loss: 0.40141961676999927
Seq eval accuracy: 0.8823232662944777
F1-Score: 0.6485614412476471
Classification report: -- 
              precision    recall  f1-score   support

        CLS]       0.00      0.00      0.00         8
         LOC       0.77      0.73      0.75      2104
        MISC       0.46      0.14      0.21      1252
         ORG       0.55      0.53      0.54      1350
         PER       0.94      0.80      0.86      1479
        SEP]       0.00      0.00      0.00         1

   micro avg       0.73      0.58      0.65      6194
   macro avg       0.45      0.37      0.39      6194
weighted avg       0.70      0.58      0.62      6194



model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Mixed model with percentage 50 upload to Hugging Face succesfully!


Some weights of NERSmall were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/128 [00:00<?, ?it/s][A
  1%|          | 1/128 [00:00<01:41,  1.25it/s][A
  2%|▏         | 2/128 [00:01<01:25,  1.48it/s][A
  2%|▏         | 3/128 [00:01<01:19,  1.58it/s][A
  3%|▎         | 4/128 [00:02<01:16,  1.62it/s][A
  4%|▍         | 5/128 [00:03<01:14,  1.65it/s][A
  5%|▍         | 6/128 [00:03<01:13,  1.67it/s][A
  5%|▌         | 7/128 [00:04<01:12,  1.67it/s][A
  6%|▋         | 8/128 [00:04<01:11,  1.68it/s][A
  7%|▋         | 9/128 [00:05<01:10,  1.68it/s][A
  8%|▊         | 10/128 [00:06<01:10,  1.68it/s][A
  9%|▊         | 11/128 [00:06<01:09,  1.68it/s][A
  9%|▉         | 12/128 [00:07<01:08,  1.68it/s][A
 10%|█         | 13/128 [00

Train loss: 0.7070766694378108



  0%|          | 0/128 [00:00<?, ?it/s][A
  1%|          | 1/128 [00:00<01:45,  1.20it/s][A
  2%|▏         | 2/128 [00:01<01:27,  1.45it/s][A
  2%|▏         | 3/128 [00:02<01:20,  1.55it/s][A
  3%|▎         | 4/128 [00:02<01:17,  1.60it/s][A
  4%|▍         | 5/128 [00:03<01:15,  1.63it/s][A
  5%|▍         | 6/128 [00:03<01:14,  1.65it/s][A
  5%|▌         | 7/128 [00:04<01:12,  1.66it/s][A
  6%|▋         | 8/128 [00:04<01:11,  1.67it/s][A
  7%|▋         | 9/128 [00:05<01:10,  1.68it/s][A
  8%|▊         | 10/128 [00:06<01:10,  1.68it/s][A
  9%|▊         | 11/128 [00:06<01:09,  1.68it/s][A
  9%|▉         | 12/128 [00:07<01:09,  1.68it/s][A
 10%|█         | 13/128 [00:07<01:08,  1.68it/s][A
 11%|█         | 14/128 [00:08<01:07,  1.69it/s][A
 12%|█▏        | 15/128 [00:09<01:07,  1.68it/s][A
 12%|█▎        | 16/128 [00:09<01:06,  1.68it/s][A
 13%|█▎        | 17/128 [00:10<01:06,  1.68it/s][A
 14%|█▍        | 18/128 [00:10<01:05,  1.68it/s][A
 15%|█▍        | 19/128 [00:1

Train loss: 0.41938589059282094



  0%|          | 0/128 [00:00<?, ?it/s][A
  1%|          | 1/128 [00:00<01:47,  1.18it/s][A
  2%|▏         | 2/128 [00:01<01:28,  1.42it/s][A
  2%|▏         | 3/128 [00:02<01:21,  1.53it/s][A
  3%|▎         | 4/128 [00:02<01:18,  1.58it/s][A
  4%|▍         | 5/128 [00:03<01:16,  1.61it/s][A
  5%|▍         | 6/128 [00:03<01:14,  1.63it/s][A
  5%|▌         | 7/128 [00:04<01:13,  1.65it/s][A
  6%|▋         | 8/128 [00:05<01:12,  1.66it/s][A
  7%|▋         | 9/128 [00:05<01:11,  1.67it/s][A
  8%|▊         | 10/128 [00:06<01:10,  1.67it/s][A
  9%|▊         | 11/128 [00:06<01:09,  1.67it/s][A
  9%|▉         | 12/128 [00:07<01:09,  1.67it/s][A
 10%|█         | 13/128 [00:08<01:08,  1.67it/s][A
 11%|█         | 14/128 [00:08<01:08,  1.67it/s][A
 12%|█▏        | 15/128 [00:09<01:07,  1.67it/s][A
 12%|█▎        | 16/128 [00:09<01:06,  1.67it/s][A
 13%|█▎        | 17/128 [00:10<01:06,  1.67it/s][A
 14%|█▍        | 18/128 [00:11<01:05,  1.67it/s][A
 15%|█▍        | 19/128 [00:1

Train loss: 0.37330367392860353



  0%|          | 0/128 [00:00<?, ?it/s][A
  1%|          | 1/128 [00:00<01:48,  1.17it/s][A
  2%|▏         | 2/128 [00:01<01:29,  1.41it/s][A
  2%|▏         | 3/128 [00:02<01:22,  1.52it/s][A
  3%|▎         | 4/128 [00:02<01:19,  1.56it/s][A
  4%|▍         | 5/128 [00:03<01:16,  1.60it/s][A
  5%|▍         | 6/128 [00:03<01:15,  1.62it/s][A
  5%|▌         | 7/128 [00:04<01:13,  1.64it/s][A
  6%|▋         | 8/128 [00:05<01:12,  1.65it/s][A
  7%|▋         | 9/128 [00:05<01:11,  1.66it/s][A
  8%|▊         | 10/128 [00:06<01:10,  1.66it/s][A
  9%|▊         | 11/128 [00:06<01:10,  1.66it/s][A
  9%|▉         | 12/128 [00:07<01:09,  1.67it/s][A
 10%|█         | 13/128 [00:08<01:09,  1.67it/s][A
 11%|█         | 14/128 [00:08<01:08,  1.67it/s][A
 12%|█▏        | 15/128 [00:09<01:07,  1.67it/s][A
 12%|█▎        | 16/128 [00:09<01:07,  1.66it/s][A
 13%|█▎        | 17/128 [00:10<01:06,  1.67it/s][A
 14%|█▍        | 18/128 [00:11<01:06,  1.67it/s][A
 15%|█▍        | 19/128 [00:1

Train loss: 0.3442359803011641



  0%|          | 0/128 [00:00<?, ?it/s][A
  1%|          | 1/128 [00:00<01:35,  1.33it/s][A
  2%|▏         | 2/128 [00:01<01:24,  1.49it/s][A
  2%|▏         | 3/128 [00:01<01:19,  1.57it/s][A
  3%|▎         | 4/128 [00:02<01:17,  1.60it/s][A
  4%|▍         | 5/128 [00:03<01:15,  1.62it/s][A
  5%|▍         | 6/128 [00:03<01:14,  1.63it/s][A
  5%|▌         | 7/128 [00:04<01:13,  1.65it/s][A
  6%|▋         | 8/128 [00:04<01:12,  1.66it/s][A
  7%|▋         | 9/128 [00:05<01:11,  1.66it/s][A
  8%|▊         | 10/128 [00:06<01:10,  1.67it/s][A
  9%|▊         | 11/128 [00:06<01:10,  1.66it/s][A
  9%|▉         | 12/128 [00:07<01:09,  1.67it/s][A
 10%|█         | 13/128 [00:07<01:08,  1.67it/s][A
 11%|█         | 14/128 [00:08<01:08,  1.67it/s][A
 12%|█▏        | 15/128 [00:09<01:07,  1.67it/s][A
 12%|█▎        | 16/128 [00:09<01:06,  1.67it/s][A
 13%|█▎        | 17/128 [00:10<01:06,  1.67it/s][A
 14%|█▍        | 18/128 [00:10<01:05,  1.68it/s][A
 15%|█▍        | 19/128 [00:1

Train loss: 0.3317745514214039
Finished Training


100%|██████████| 128/128 [00:26<00:00,  4.88it/s]


Validation loss: 0.3255521811079234
Seq eval accuracy: 0.9101716747260172
F1-Score: 0.6622862794969621
Classification report: -- 
              precision    recall  f1-score   support

        CLS]       0.00      0.00      0.00         8
         LOC       0.74      0.77      0.76      2670
        MISC       0.46      0.23      0.31      1485
         ORG       0.60      0.49      0.54      1725
         PER       0.93      0.80      0.86      1779
        SEP]       0.00      0.00      0.00         1

   micro avg       0.72      0.61      0.66      7668
   macro avg       0.46      0.38      0.41      7668
weighted avg       0.70      0.61      0.64      7668



model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Mixed model with percentage 100 upload to Hugging Face succesfully!


In [102]:
[metrics[1] for metrics in model_metrics  ]

[0.8538081444879877,
 0.8596247779751333,
 0.8633487074975866,
 0.8556380417335474,
 0.8629014873895237,
 0.8823232662944777,
 0.9101716747260172]

In [103]:
[acc for _, acc, *_ in model_metrics]

[0.8538081444879877,
 0.8596247779751333,
 0.8633487074975866,
 0.8556380417335474,
 0.8629014873895237,
 0.8823232662944777,
 0.9101716747260172]

In [101]:
model_metrics[0]

(0.4764656479542072,
 0.8538081444879877,
 0.6376014990630856,
 {'CLS]': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 8},
  'LOC': {'precision': 0.7536231884057971,
   'recall': 0.8139674894641782,
   'f1-score': 0.7826338639652678,
   'support': 1661},
  'MISC': {'precision': 0.45979381443298967,
   'recall': 0.21017907634307256,
   'f1-score': 0.28848641655886154,
   'support': 1061},
  'ORG': {'precision': 0.5754098360655737,
   'recall': 0.43014705882352944,
   'f1-score': 0.49228611500701264,
   'support': 816},
  'PER': {'precision': 0.833555259653795,
   'recall': 0.7652811735941321,
   'f1-score': 0.7979604843849586,
   'support': 818},
  'SEP]': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1},
  'micro avg': {'precision': 0.701098901098901,
   'recall': 0.5846506300114548,
   'f1-score': 0.6376014990630856,
   'support': 4365},
  'macro avg': {'precision': 0.4370636830930259,
   'recall': 0.3699291330374854,
   'f1-score': 0.3935611466526834,


todo:
- [ ] graph (plot of models)
- [ ] load models (MEEEEEEEEH)
- [ ] comparison analysis of all this
- [ ] fix python - remove pip
