<a href="https://colab.research.google.com/github/geleg-rigzin/NER_and_NEL/blob/main/NER_and_NEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!cp -a /content/drive/MyDrive/msu_task_3_ner-main/ ./data/

In [None]:
!pip install numpy==1.21.6 scikit-learn==1.0.2 tensorboard==2.9.0 torch==1.12.1 tqdm==4.64.0 transformers==4.21.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting numpy==1.21.6
  Downloading numpy-1.21.6-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn==1.0.2
  Downloading scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorboard==2.9.0
  Downloading tensorboard-2.9.0-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==1.12.1
  Downloading torch-1.12.1-cp39-cp39-manylinux1_x86_64.whl (776.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.4/776.4 MB[0m [31m2.

In [None]:
import random
from collections import Counter, defaultdict, namedtuple
from typing import Tuple, List, Dict, Any

import torch
import numpy as np

from tqdm import tqdm, trange

In [None]:
def set_global_seed(seed: int) -> None:
    """
    Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (the
    default generator and all CUDA devices), then sets the cuDNN flags
    ``deterministic = True`` and ``benchmark = False``.

    Args:
        seed: value handed to each seeding function.
    """

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_global_seed(42)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
def read_conll2003(
    path: str,
    lower: bool = True,
) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Read a CoNLL-like file into parallel token / label sequences.

    Every non-blank line holds whitespace-separated columns: the first column
    is the token and the last column is its label. Two-column files work as
    before, and files with extra columns in between (e.g. the four-column
    CoNLL-2003 layout) are accepted as well. A blank line closes the current
    sentence.

    Args:
        path: path to the text file (read as UTF-8).
        lower: lowercase every token when True.

    Returns:
        ``(token_seq, label_seq)``: two lists of equal length, one inner list
        per sentence. Consecutive blank lines produce empty sentences; only a
        trailing empty sentence is dropped.

    Raises:
        ValueError: if a non-blank line has fewer than two columns.
    """

    token_seq: List[List[str]] = [[]]
    label_seq: List[List[str]] = [[]]

    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line_no, line in enumerate(tqdm(lines), start=1):
        columns = line.split()

        if not columns:
            # Blank line: start a new sentence.
            token_seq.append([])
            label_seq.append([])
            continue

        if len(columns) < 2:
            raise ValueError(
                f"{path}:{line_no}: expected at least 2 columns, got {len(columns)}"
            )

        token, label = columns[0], columns[-1]
        if lower:
            token = token.lower()
        token_seq[-1].append(token)
        label_seq[-1].append(label)

    # A file ending with a blank line (or an empty file) leaves an empty last sentence.
    if len(token_seq[-1]) == 0:
        token_seq = token_seq[:-1]
        label_seq = label_seq[:-1]
    return token_seq, label_seq

In [None]:
train_token_seq, train_label_seq = read_conll2003("data/train.tsv")
valid_token_seq, valid_label_seq = read_conll2003("data/valid.tsv")
test_token_seq, test_label_seq = read_conll2003("data/test.tsv")

100%|██████████| 219552/219552 [00:00<00:00, 1081066.80it/s]
100%|██████████| 55042/55042 [00:00<00:00, 1042769.36it/s]
100%|██████████| 50348/50348 [00:00<00:00, 592630.02it/s]


In [None]:
for token, label in zip(train_token_seq[0], train_label_seq[0]):
    print(f"{token}\t{label}")

eu	B-ORG
rejects	O
german	B-MISC
call	O
to	O
boycott	O
british	B-MISC
lamb	O
.	O


In [None]:
for token, label in zip(valid_token_seq[0], valid_label_seq[0]):
    print(f"{token}\t{label}")

cricket	O
-	O
leicestershire	B-ORG
take	O
over	O
at	O
top	O
after	O
innings	O
victory	O
.	O


In [None]:
for token, label in zip(test_token_seq[0], test_label_seq[0]):
    print(f"{token}\t{label}")

soccer	O
-	O
japan	B-LOC
get	O
lucky	O
win	O
,	O
china	B-PER
in	O
surprise	O
defeat	O
.	O


In [None]:
def read_data(
    path: str,
    lower: bool = False,
) -> Tuple[List[List[str]], List[List[str]], List[List[int]]]:
    """
    Read a CoNLL-like file whose lines may carry a third, integer person-id column.

    Line layout (whitespace separated): ``token label [person_id]``.
    A blank line closes the current sentence; lines with a single column are
    skipped.

    Person ids are assigned per token as follows:
      * a line with a third column gets ``int(person_id)``;
      * an ``I-PER`` line without a third column inherits the id of the most
        recent line that had one, as long as no other label came in between;
      * every other line gets ``0``.
    The inherited id is reset at every sentence boundary, so an entity never
    continues into the next sentence.

    Args:
        path: path to the text file (read as UTF-8).
        lower: lowercase every token when True.

    Returns:
        ``(token_seq, label_seq, persn_seq)``: three parallel lists with one
        inner list per sentence. Consecutive blank lines produce empty
        sentences; only a trailing empty sentence is dropped.

    Raises:
        ValueError: if a third column is present but is not an integer.
    """

    token_seq: List[List[str]] = [[]]
    label_seq: List[List[str]] = [[]]
    persn_seq: List[List[int]] = [[]]

    # Id of the person entity currently being continued; 0 means "none".
    current_person = 0

    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line in tqdm(lines):
        columns = line.split()

        if not columns:
            # Blank line: start a new sentence and forget the open entity.
            token_seq.append([])
            label_seq.append([])
            persn_seq.append([])
            current_person = 0
            continue

        if len(columns) < 2:
            # A lone column carries no label; such lines are ignored.
            continue

        token, label = columns[0], columns[1]
        if lower:
            token = token.lower()

        if len(columns) > 2:
            current_person = int(columns[2])
        elif label != "I-PER":
            # Any label other than a continuation closes the entity.
            current_person = 0

        token_seq[-1].append(token)
        label_seq[-1].append(label)
        persn_seq[-1].append(current_person)

    # A file ending with a blank line (or an empty file) leaves an empty last sentence.
    if len(token_seq[-1]) == 0:
        token_seq = token_seq[:-1]
        label_seq = label_seq[:-1]
        persn_seq = persn_seq[:-1]
    return token_seq, label_seq, persn_seq

Данные для первого этапа обучения:

In [None]:
train_token_seq, train_label_seq, _ = read_data("/content/train.txt")
valid_token_seq, valid_label_seq, _ = read_data("/content/valid.txt")
test_token_seq, test_label_seq, _ = read_data("/content/test.txt")

100%|██████████| 259393/259393 [00:00<00:00, 1053900.49it/s]
100%|██████████| 53055/53055 [00:00<00:00, 454564.36it/s]
100%|██████████| 49409/49409 [00:00<00:00, 502310.82it/s]


In [None]:
not_empty_train = [i for i in range(len(train_token_seq)) if len(train_token_seq[i]) > 0]
not_empty_valid = [i for i in range(len(valid_token_seq)) if len(valid_token_seq[i]) > 0]
not_empty_test = [i for i in range(len(test_token_seq)) if len(test_token_seq[i]) > 0]

In [None]:
train_token_seq = [train_token_seq[i] for i in not_empty_train]
train_label_seq  = [train_label_seq[i] for i in not_empty_train]

valid_token_seq = [valid_token_seq[i] for i in not_empty_valid]
valid_label_seq  = [valid_label_seq[i] for i in not_empty_valid]

test_token_seq = [test_token_seq[i] for i in not_empty_test]
test_label_seq  = [test_label_seq[i] for i in not_empty_test]

Эксперименты:

In [None]:
data_file_name = '/content/ent_test_1.txt'

In [None]:
train_token_seq, train_label_seq, train_person_seq = read_data(data_file_name)

100%|██████████| 682/682 [00:00<00:00, 134713.92it/s]


In [None]:
not_empty_train = [i for i in range(len(train_token_seq)) if len(train_token_seq[i]) > 0]
train_token_seq = [train_token_seq[i] for i in not_empty_train]
train_label_seq  = [train_label_seq[i] for i in not_empty_train]
train_person_seq = [train_person_seq[i] for i in not_empty_train]

In [None]:
token2cnt = Counter([token for sentence in train_token_seq for token in sentence])

In [None]:
token2cnt.most_common(10)

[('the', 8390),
 ('.', 7374),
 (',', 7290),
 ('of', 3815),
 ('in', 3621),
 ('to', 3424),
 ('a', 3199),
 ('and', 2872),
 ('(', 2861),
 (')', 2861)]

In [None]:


def get_token2idx(
    token2cnt: Dict[str, int],
    min_count: int,
) -> Dict[str, int]:
    """
    Build the token -> index vocabulary for an Embedding layer.

    Indices 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``. Every token
    whose count is at least ``min_count`` receives the next free index, in
    the iteration order of ``token2cnt``.

    Args:
        token2cnt: token frequencies.
        min_count: minimal frequency a token needs to enter the vocabulary.

    Returns:
        Mapping from token to its integer index.
    """

    specials = ("<PAD>", "<UNK>")
    vocabulary: Dict[str, int] = {
        special: position for position, special in enumerate(specials)
    }

    frequent_tokens = (
        token for token, count in token2cnt.items() if count >= min_count
    )
    for position, token in enumerate(frequent_tokens, start=len(specials)):
        vocabulary[token] = position

    return vocabulary

In [None]:
token2idx = get_token2idx(token2cnt, min_count=2)

In [None]:
# Функция для сортировки тегов, чтобы сначала был тег O, потом теги B- и только после теги I- (можно задать вручную)

def sort_labels_func(x: str) -> int:
    """
    Rank a tag for sorting: ``"O"`` -> 0, tags starting with ``"B-"`` -> 1,
    everything else -> 2.
    """
    if x == "O":
        return 0
    return 1 if x.startswith("B-") else 2

label_set = sorted(
    set(label for sentence in train_label_seq for label in sentence),
    key=lambda x: (sort_labels_func(x), x),
)

In [None]:
label_set

['O', 'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER']

In [None]:
def get_label2idx(label_set: List[str]) -> Dict[str, int]:
    """
    Map every label to its position in ``label_set``.

    Args:
        label_set: ordered list of labels.

    Returns:
        Mapping from label to its index in the list.
    """

    return {label: position for position, label in enumerate(label_set)}

In [None]:
label2idx = get_label2idx(label_set)

In [None]:
for token, idx in list(token2idx.items())[:10]:
    print(f"{token}\t{idx}")

<PAD>	0
<UNK>	1
eu	2
german	3
call	4
to	5
boycott	6
british	7
lamb	8
.	9


In [None]:
for label, idx in label2idx.items():
    print(f"{label}\t{idx}")

O	0
B-LOC	1
B-MISC	2
B-ORG	3
B-PER	4
I-LOC	5
I-MISC	6
I-ORG	7
I-PER	8


## BiLSTM-теггер 

In [None]:
class BiLSTM(torch.nn.Module):
    """
    LSTM sequence tagger: embedding -> (bi)directional LSTM -> linear head.

    Token id 0 is treated as padding when sequence lengths are computed in
    ``forward``.
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        hidden_size: int,
        num_layers: int,
        dropout: float,
        bidirectional: bool,
        n_classes: int,
    ):
        """
        Args:
            num_embeddings: vocabulary size of the embedding table.
            embedding_dim: size of each token embedding.
            hidden_size: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
            dropout: dropout passed to ``torch.nn.LSTM``.
            bidirectional: run the LSTM in both directions when True.
            n_classes: number of output labels.
        """
        super().__init__()

        self.embedding = torch.nn.Embedding(num_embeddings=num_embeddings,
                                            embedding_dim=embedding_dim)
        self.rnn = torch.nn.LSTM(input_size=embedding_dim,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 batch_first=True,
                                 dropout=dropout,
                                 bidirectional=bidirectional)
        # The LSTM output width depends on the number of directions, so the
        # head must follow the `bidirectional` flag instead of assuming two.
        num_directions = 2 if bidirectional else 1
        self.head = torch.nn.Linear(num_directions * hidden_size, n_classes)

    def forward(self, tokens: torch.LongTensor) -> torch.Tensor:
        """
        Compute per-token class logits.

        Args:
            tokens: integer tensor of shape (batch, seq_len), right-padded with 0.

        Returns:
            Logits of shape (batch, n_classes, seq_len).
        """
        embed = self.embedding(tokens)

        # Pack the batch so the LSTM does not run over padding positions.
        # Lengths are the number of non-zero (non-padding) ids per row and
        # must live on the CPU for pack_padded_sequence.
        length = (tokens != 0).sum(dim=1).detach().cpu()
        packed_embed = torch.nn.utils.rnn.pack_padded_sequence(
            embed, length, batch_first=True, enforce_sorted=False
        )

        packed_rnn_output, _ = self.rnn(packed_embed)

        # Unpack back into a dense tensor. `total_length` keeps the time axis
        # equal to the input's, even when every row in the batch ends with
        # padding; without it the output would be cut to the longest row.
        rnn_output, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_rnn_output, batch_first=True, total_length=tokens.size(1)
        )

        logits = self.head(rnn_output)
        # Move the class axis to position 1.
        return logits.transpose(1, 2)

In [None]:
model = BiLSTM(
    num_embeddings=len(token2idx),
    embedding_dim=100,
    hidden_size=100,
    num_layers=1,
    dropout=0.0,
    bidirectional=True,
    n_classes=len(label2idx),
).to(device)

In [None]:
model

BiLSTM(
  (embedding): Embedding(10952, 100)
  (rnn): LSTM(100, 100, batch_first=True, bidirectional=True)
  (head): Linear(in_features=200, out_features=9, bias=True)
)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
# `state_dict` has to be *called*: passing the bound method would pickle the method
# object (and the model it is bound to) instead of the plain parameter mapping.
torch.save(model.state_dict(), '/content/drive/MyDrive/transformer_weights/BiLSTM_weights.pt')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def compute_metrics(
    outputs: torch.Tensor,
    labels: torch.LongTensor,
) -> Dict[str, float]:
    """
    Compute token-level NER metrics for one batch.

    Positions whose label equals -1 are excluded. Predictions are the argmax
    of ``outputs`` over dimension 1.

    Args:
        outputs: model scores with the class axis at dimension 1
            (assumed shape (batch, n_classes, seq_len) — TODO confirm for
            every model passed in).
        labels: gold label indices, -1 at positions to ignore; must have the
            shape of ``outputs`` with dimension 1 removed.

    Returns:
        Dict with ``accuracy`` plus ``precision_*``, ``recall_*`` and ``f1_*``
        for the ``micro``, ``macro`` and ``weighted`` averages.
    """

    mask = labels != -1
    # Gold labels are the ground truth and the argmax is the prediction.
    # The order matters: swapping them exchanges precision and recall for
    # the macro and weighted averages.
    y_true = labels[mask].cpu()
    y_pred = torch.argmax(outputs, dim=1)[mask].cpu()

    metrics: Dict[str, float] = {
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
    }

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in scorers:
        for average in ("micro", "macro", "weighted"):
            metrics[f"{metric_name}_{average}"] = scorer(
                y_true=y_true,
                y_pred=y_pred,
                average=average,
                # Classes absent from the batch score 0 instead of raising a warning.
                zero_division=0,
            )

    return metrics

In [None]:
def train_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    # Quoted on purpose: SummaryWriter is imported in a later cell, and an
    # unquoted annotation would be evaluated when this `def` runs.
    writer: "SummaryWriter",
    device: torch.device,
    epoch: int,
) -> None:
    """
    Run one training epoch and log loss and metrics.

    Each batch is a ``(tokens, labels)`` pair; ``tokens`` must support
    ``.to(device)`` and ``tokens['input_ids']``. After the optimisation step
    the batch is passed through the model once more in eval mode (no
    gradients) and the metrics are computed from that pass.

    Args:
        model: model called as ``model(tokens['input_ids'])``.
        dataloader: yields ``(tokens, labels)`` batches.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        writer: TensorBoard writer receiving per-batch and per-epoch scalars.
        device: device the batches are moved to.
        epoch: zero-based epoch index, used to compute the global step.
    """

    model.train()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    n_batches = len(dataloader)

    for i, (tokens, labels) in tqdm(
        enumerate(dataloader),
        total=n_batches,
        desc="loop over train batches",
    ):
        # Global batch counter across epochs, used as the TensorBoard x-axis.
        global_step = epoch * n_batches + i

        tokens, labels = tokens.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(tokens['input_ids'])
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())
        writer.add_scalar("batch loss / train", loss.item(), global_step)

        # Second forward pass in eval mode, so the metrics reflect inference
        # behaviour after the parameter update.
        with torch.no_grad():
            model.eval()
            outputs_inference = model(tokens['input_ids'])
            model.train()

        batch_metrics = compute_metrics(
            outputs=outputs_inference,
            labels=labels,
        )

        for metric_name, metric_value in batch_metrics.items():
            batch_metrics_list[metric_name].append(metric_value)
            writer.add_scalar(
                f"batch {metric_name} / train",
                metric_value,
                global_step,
            )

    avg_loss = np.mean(epoch_loss)
    print(f"Train loss: {avg_loss}\n")
    writer.add_scalar("loss / train", avg_loss, epoch)

    # Epoch metrics are the unweighted mean of the per-batch values.
    for metric_name, metric_value_list in batch_metrics_list.items():
        metric_value = np.mean(metric_value_list)
        print(f"Train {metric_name}: {metric_value}\n")
        writer.add_scalar(f"{metric_name} / train", metric_value, epoch)

In [None]:
def evaluate_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    # Quoted on purpose: SummaryWriter is imported in a later cell, and an
    # unquoted annotation would be evaluated when this `def` runs.
    writer: "SummaryWriter",
    device: torch.device,
    epoch: int,
) -> None:
    """
    Run one evaluation pass and log loss and metrics.

    The model is put in eval mode and all batches are processed without
    gradients. Each batch is a ``(tokens, labels)`` pair; ``tokens`` must
    support ``.to(device)`` and ``tokens['input_ids']``.

    Args:
        model: model called as ``model(tokens['input_ids'])``.
        dataloader: yields ``(tokens, labels)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.
        writer: TensorBoard writer receiving per-batch and per-epoch scalars.
        device: device the batches are moved to.
        epoch: epoch index, used to compute the global step and as the
            x-value of the per-epoch scalars.
    """

    model.eval()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    n_batches = len(dataloader)

    with torch.no_grad():

        for i, (tokens, labels) in tqdm(
            enumerate(dataloader),
            total=n_batches,
            desc="loop over test batches",
        ):
            # Global batch counter across epochs, used as the TensorBoard x-axis.
            global_step = epoch * n_batches + i

            tokens, labels = tokens.to(device), labels.to(device)

            outputs = model(tokens['input_ids'])
            loss = criterion(outputs, labels)

            epoch_loss.append(loss.item())
            writer.add_scalar("batch loss / test", loss.item(), global_step)

            batch_metrics = compute_metrics(
                outputs=outputs,
                labels=labels,
            )

            for metric_name, metric_value in batch_metrics.items():
                batch_metrics_list[metric_name].append(metric_value)
                writer.add_scalar(
                    f"batch {metric_name} / test",
                    metric_value,
                    global_step,
                )

    avg_loss = np.mean(epoch_loss)
    print(f"Test loss:  {avg_loss}\n")
    writer.add_scalar("loss / test", avg_loss, epoch)

    # Epoch metrics are the unweighted mean of the per-batch values.
    for metric_name, metric_value_list in batch_metrics_list.items():
        metric_value = np.mean(metric_value_list)
        print(f"Test {metric_name}: {metric_value}\n")
        writer.add_scalar(f"{metric_name} / test", metric_value, epoch)

In [None]:
def train(
    n_epochs: int,
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    # Quoted on purpose: SummaryWriter is imported in a later cell, and an
    # unquoted annotation would be evaluated when this `def` runs.
    writer: "SummaryWriter",
    device: torch.device,
) -> None:
    """
    Full training loop: one training epoch followed by one evaluation pass,
    repeated ``n_epochs`` times.

    Args:
        n_epochs: number of epochs to run.
        model: model being trained.
        train_dataloader: batches used by ``train_epoch``.
        test_dataloader: batches used by ``evaluate_epoch`` after every epoch.
        optimizer: optimizer passed to ``train_epoch``.
        criterion: loss passed to both loops.
        writer: TensorBoard writer passed to both loops.
        device: device passed to both loops.
    """

    for epoch in range(n_epochs):

        print(f"Epoch [{epoch+1} / {n_epochs}]\n")

        train_epoch(
            model=model,
            dataloader=train_dataloader,
            optimizer=optimizer,
            criterion=criterion,
            writer=writer,
            device=device,
            epoch=epoch,
        )
        evaluate_epoch(
            model=model,
            dataloader=test_dataloader,
            criterion=criterion,
            writer=writer,
            device=device,
            epoch=epoch,
        )

In [None]:
train(n_epochs=12,
      model=model,
      train_dataloader=train_dataloader,
      test_dataloader=valid_dataloader,
      optimizer=optimizer,
      criterion=criterion,
      writer=writer,
      device=device)

In [None]:
!cp -a /content/logs/BiLSTMModel/ /content/drive/MyDrive/BiLSTM

In [None]:
evaluate_epoch(
  model=model,
  dataloader=test_dataloader,
  criterion=criterion,
  writer=writer,
  device=device,
  epoch=12,
)


## Transformers-теггер

In [None]:
from transformers import AutoTokenizer

In [None]:
model_name = "distilbert-base-cased"

In [None]:
model_name = 'DeepPavlov/rubert-base-cased'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
class TransformersDataset(torch.utils.data.Dataset):
    """
    Dataset of pre-tokenised sentences with their NER label indices.

    Items are ``(tokens, label_indices)`` pairs. Labels are converted to
    indices once, at construction time, through the module-level
    ``label2idx`` mapping.
    """

    def __init__(
        self,
        token_seq: List[List[str]],
        label_seq: List[List[str]],
    ):
        self.token_seq = token_seq

        encoded_labels = []
        for sentence_labels in label_seq:
            encoded_labels.append(self.process_labels(sentence_labels, label2idx))
        self.label_seq = encoded_labels

    def __len__(self):
        return len(self.token_seq)

    def __getitem__(
        self,
        idx: int,
    ) -> Tuple[List[str], List[int]]:
        sentence_tokens = self.token_seq[idx]
        sentence_labels = self.label_seq[idx]
        return (sentence_tokens, sentence_labels)

    @staticmethod
    def process_labels(
        labels: List[str],
        label2idx: Dict[str, int],
    ) -> List[int]:
        """
        Convert a list of label strings into the list of their indices.
        """

        indices = []
        for label in labels:
            indices.append(label2idx[label])
        return indices

In [None]:
class TransformersDataset_plus(torch.utils.data.Dataset):
    """
    Dataset of pre-tokenised sentences with NER label indices and per-token
    person ids.

    Items are ``(tokens, label_indices, person_ids)`` triples. Labels are
    converted to indices once, at construction time, through the module-level
    ``label2idx`` mapping; person ids are stored as given.
    """

    def __init__(
        self,
        token_seq: List[List[str]],
        label_seq: List[List[str]],
        persn_seq: List[List[int]],
    ):
        """
        Args:
            token_seq: one list of tokens per sentence.
            label_seq: one list of label strings per sentence.
            persn_seq: one list of integer person ids per sentence.
        """
        self.token_seq = token_seq
        self.label_seq = [self.process_labels(labels, label2idx) for labels in label_seq]
        self.persn_seq = persn_seq

    def __len__(self):
        return len(self.token_seq)

    def __getitem__(
        self,
        idx: int,
    ) -> Tuple[List[str], List[int], List[int]]:
        """Return ``(tokens, label_indices, person_ids)`` of sentence ``idx``."""

        return (self.token_seq[idx], self.label_seq[idx], self.persn_seq[idx])

    @staticmethod
    def process_labels(
        labels: List[str],
        label2idx: Dict[str, int],
    ) -> List[int]:
        """
        Transform list of labels into list of labels' indices.
        """

        return [label2idx[label] for label in labels]

In [None]:
train_dataset = TransformersDataset_plus(
    token_seq=train_token_seq,
    label_seq=train_label_seq,
    persn_seq=train_person_seq
)

In [None]:
train_dataset = TransformersDataset(
    token_seq=train_token_seq,
    label_seq=train_label_seq,
)
valid_dataset = TransformersDataset(
    token_seq=valid_token_seq,
    label_seq=valid_label_seq,
)
test_dataset = TransformersDataset(
    token_seq=test_token_seq,
    label_seq=test_label_seq,
)

In [None]:
train_dataset[1]

(['Один',
  'экземпляр',
  'я',
  'послал',
  'позже',
  'И',
  '.',
  'В',
  '.',
  'Сталину',
  ',',
  'чтобы',
  'он',
  'понял',
  ',',
  'как',
  'дерутся',
  'бойцы',
  '.'],
 [0, 0, 0, 0, 0, 3, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
valid_dataset[0]

In [None]:
test_dataset[0]

In [None]:
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding


class TransformersCollator:
    """
    Collate function for word-level NER with a subword tokenizer.

    Tokenises a batch of pre-split sentences and aligns the word-level labels
    with the resulting subword positions.
    """

    def __init__(
        self,
        tokenizer: "PreTrainedTokenizer",
        tokenizer_kwargs: Dict[str, Any],
        label_padding_value: int,
    ):
        """
        Args:
            tokenizer: callable tokenizer applied to the list of sentences.
            tokenizer_kwargs: keyword arguments forwarded to the tokenizer;
                its output must contain ``offset_mapping``.
            label_padding_value: label written at every position that does
                not start a word.
        """
        self.tokenizer = tokenizer
        self.tokenizer_kwargs = tokenizer_kwargs

        self.label_padding_value = label_padding_value

    def __call__(
        self,
        batch: List[Tuple[List[str], List[int]]],
    ) -> Tuple["BatchEncoding", torch.LongTensor]:
        """
        Args:
            batch: list of ``(tokens, label_indices)`` pairs.

        Returns:
            ``(tokens, labels)``: the tokenizer output without
            ``offset_mapping`` and the aligned label tensor.
        """
        tokens, labels = zip(*batch)

        tokens = self.tokenizer(list(tokens), **self.tokenizer_kwargs)
        labels = self.encode_labels(tokens, labels, self.label_padding_value)

        # Offsets are only needed for the alignment above.
        tokens.pop("offset_mapping")

        return tokens, labels

    @staticmethod
    def encode_labels(
        tokens: "BatchEncoding",
        labels: List[List[int]],
        label_padding_value: int,
    ) -> torch.LongTensor:
        """
        Spread word-level labels over subword positions.

        A position receives a label when its offset pair is ``(0, end)`` with
        ``end != 0``; every other position gets ``label_padding_value``.
        NOTE(review): this treats such positions as "first subword of a word",
        which assumes the tokenizer was called with pre-split words so offsets
        are relative to each word — confirm against ``tokenizer_kwargs``.

        Args:
            tokens: tokenizer output exposing ``offset_mapping``, one sequence
                of ``(start, end)`` pairs per sentence.
            labels: word-level label indices, one list per sentence.
            label_padding_value: fill value for unlabeled positions.

        Returns:
            LongTensor with one row per sentence.

        Raises:
            ValueError: if the number of labelled positions of a sentence
                differs from its number of labels (for instance when the
                sentence was truncated by the tokenizer).
        """

        encoded_labels = []

        for doc_idx, (doc_labels, doc_offset) in enumerate(
            zip(labels, tokens.offset_mapping)
        ):
            arr_offset = np.array(doc_offset)
            word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)

            # Fail loudly instead of returning a sentinel or letting numpy
            # broadcast a single label over several positions.
            n_slots = int(word_start.sum())
            if n_slots != len(doc_labels):
                raise ValueError(
                    f"sentence {doc_idx} of the batch: {len(doc_labels)} labels "
                    f"for {n_slots} word-start positions"
                )

            doc_enc_labels = np.full(len(doc_offset), label_padding_value, dtype=int)
            doc_enc_labels[word_start] = doc_labels
            encoded_labels.append(doc_enc_labels.tolist())

        return torch.LongTensor(encoded_labels)

In [None]:
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding


class TransformersCollator_plus:
    """
    Collate function for NER plus person linking.

    Besides tokenising the sentences and aligning labels and person ids, it
    appends to the batch one positive and one negative name sample for every
    run of identical non-zero person ids found in the batch.
    """

    def __init__(
        self,
        tokenizer: "PreTrainedTokenizer",
        tokenizer_kwargs: Dict[str, Any],
        label_padding_value: int,
        id2person: Dict[int, List[List[str]]],
    ):
        """
        Args:
            tokenizer: callable tokenizer applied to the list of sequences.
            tokenizer_kwargs: keyword arguments forwarded to the tokenizer;
                its output must contain ``offset_mapping``.
            label_padding_value: value written at every position that does
                not start a word.
            id2person: person id -> list of name variants, each variant being
                a list of tokens.
        """
        self.tokenizer = tokenizer
        self.tokenizer_kwargs = tokenizer_kwargs
        self.label_padding_value = label_padding_value
        self.id2person = id2person

    def __call__(
        self,
        batch: List[Tuple[List[str], List[int], List[int]]],
    ) -> Tuple["BatchEncoding", torch.LongTensor, torch.LongTensor]:
        """
        Args:
            batch: list of ``(tokens, label_indices, person_ids)`` triples.

        Returns:
            ``(tokens, labels, persons)``. ``tokens`` is the tokenizer output
            (without ``offset_mapping``) for the sentences followed by the
            sampled name variants: a positive then a negative sample per
            person run. ``labels`` and ``persons`` cover the sentences only.
        """
        tokens, labels, persons = zip(*batch)

        n_sentences = len(tokens)
        tokens = list(tokens)

        # Walk over the non-zero person ids of the whole batch in order and
        # add a (positive, negative) pair whenever the id changes.
        previous_person = 0
        for sentence_persons in persons:
            for person in sentence_persons:
                if person > 0 and person != previous_person:
                    previous_person = person
                    tokens.append(self.positive_sample(person))
                    tokens.append(self.negative_sample(person))

        tokens = self.tokenizer(tokens, **self.tokenizer_kwargs)

        # Only the first `n_sentences` rows are real sentences. Slicing from
        # the front stays correct when no sample was appended (a negative
        # slice `[:-0]` would select nothing).
        labels, persons = self.encode_labels(
            tokens.offset_mapping[:n_sentences],
            labels,
            persons,
            self.label_padding_value,
        )

        tokens.pop("offset_mapping")

        return tokens, labels, persons

    def positive_sample(self, person_id: int):
        """Pick a random name variant of ``person_id``."""
        return random.choice(self.id2person[person_id])

    def negative_sample(self, person_id: int):
        """
        Pick a random name variant of a random person other than ``person_id``.

        Candidates are the keys of ``id2person`` (sorted), so ids need not be
        exactly 1..N.

        Raises:
            IndexError: if ``id2person`` holds no other person.
        """
        other_ids = sorted(pid for pid in self.id2person if pid != person_id)
        return random.choice(self.id2person[random.choice(other_ids)])

    @staticmethod
    def encode_labels(
        tokens_offset_mapping: Any,
        labels: List[List[int]],
        persons: List[List[int]],
        label_padding_value: int,
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
        """
        Spread word-level labels and person ids over subword positions.

        A position receives values when its offset pair is ``(0, end)`` with
        ``end != 0``; every other position gets ``label_padding_value`` in
        both outputs.
        NOTE(review): this treats such positions as "first subword of a word",
        which assumes the tokenizer was called with pre-split words so offsets
        are relative to each word — confirm against ``tokenizer_kwargs``.

        Args:
            tokens_offset_mapping: one sequence of ``(start, end)`` pairs per
                sentence.
            labels: word-level label indices, one list per sentence.
            persons: word-level person ids, one list per sentence.
            label_padding_value: fill value for unlabeled positions.

        Returns:
            ``(labels, persons)`` as two LongTensors with one row per sentence.

        Raises:
            ValueError: if the number of word-start positions of a sentence
                differs from its number of labels or person ids.
        """

        encoded_labels = []
        encoded_persons = []

        for doc_idx, (doc_labels, doc_persons, doc_offset) in enumerate(
            zip(labels, persons, tokens_offset_mapping)
        ):
            arr_offset = np.array(doc_offset)
            word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)

            # Explicit check: numpy would silently broadcast a single value
            # over several positions.
            n_slots = int(word_start.sum())
            if n_slots != len(doc_labels) or n_slots != len(doc_persons):
                raise ValueError(
                    f"sentence {doc_idx} of the batch: {len(doc_labels)} labels and "
                    f"{len(doc_persons)} person ids for {n_slots} word-start positions"
                )

            doc_enc_labels = np.full(len(doc_offset), label_padding_value, dtype=int)
            doc_enc_persons = np.full(len(doc_offset), label_padding_value, dtype=int)
            doc_enc_labels[word_start] = doc_labels
            doc_enc_persons[word_start] = doc_persons

            encoded_labels.append(doc_enc_labels.tolist())
            encoded_persons.append(doc_enc_persons.tolist())

        return torch.LongTensor(encoded_labels), torch.LongTensor(encoded_persons)

In [None]:
tokenizer_kwargs = {
    "is_split_into_words":    True,
    "return_offsets_mapping": True,
    "padding":                True,
    "truncation":             True,
    "max_length":             512,
    "return_tensors":         "pt",
}

In [None]:
id2sent = {
            1:[['Сталин'],['И','.','В','.','Сталин'], ['Коба'], ['Иосиф', 'Виссарионович'], ['Джугашвили']],
            2: [['Жуков'],['Г','.','К','.','Жуков'],['Георгий','Константинович','Жуков']]
          }

In [None]:
collator = TransformersCollator_plus(
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    label_padding_value=-1,
    id2person=id2sent
)

In [None]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
#    shuffle=False,
    collate_fn=collator,
)

Выше эксперименты

In [None]:
collator = TransformersCollator(
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    label_padding_value=-1,
)

In [None]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
#    shuffle=False,
    collate_fn=collator,
)
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=1,  
    shuffle=False, 
    collate_fn=collator,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,  
    shuffle=False, 
    collate_fn=collator,
)

In [None]:
tokens, labels = next(iter(train_dataloader))
tokens = tokens.to(device)
labels = labels.to(device)

In [None]:
tokens, labels, persons = next(iter(train_dataloader))

tokens = tokens.to(device)
labels = labels.to(device)
persons = persons.to(device)


In [None]:
tokens['input_ids'].shape

torch.Size([2, 64])

In [None]:
labels.shape

torch.Size([2, 64])

In [None]:
persons

tensor([[-1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  1,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0, -1,  0,  0, -1,  0,  0,  0,  0, -1]])

In [None]:
[tokenizer.decode(i) for i in tokens['input_ids']]

In [None]:
inputs = tokenizer(train_dataset[0][0], **tokenizer_kwargs)
print(inputs.tokens())

['[CLS]', 'Некоторое', 'время', 'назад', 'Антонова', 'выступила', 'с', 'идеей', 'возрождения', 'ликвидирован', '##ного', 'по', 'приказу', 'Сталина', 'Государственного', 'музея', 'нового', 'западного', 'искусства', ',', 'основу', 'коллекции', 'которого', 'составляли', 'национализирован', '##ные', 'собрания', 'меценат', '##ов', 'Щукина', 'и', 'Морозова', '.', '[SEP]']


In [None]:
inputs

In [None]:
labels

In [None]:
from transformers import AutoModelForTokenClassification

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2idx),
).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2idx),
    output_hidden_states=True
).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

Загрузка весов трансформера для экспериментов

In [None]:
checkpoint = torch.load('/content/drive/MyDrive/transformer_weights/RUBERT_weights.pt', map_location=torch.device('cpu'))
# The file may hold either a plain state dict or a pickled bound `state_dict` method
# (which is what `torch.save(model.state_dict, ...)` writes); call it only in the latter case.
state_dict = checkpoint() if callable(checkpoint) else checkpoint
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
batch_size = 1

In [None]:
class TripletLoss:
  """
  Margin-based triplet loss on cosine similarity between entity mentions and
  dictionary examples.

  The hidden states passed to the loss hold `batch_size` sentence rows (the
  "head") followed by dictionary rows (the "tail"). Tail rows are consumed in
  consecutive pairs, one pair per entity mention; judging by the sign of the
  loss term, row 0 of a pair is the positive example and row 1 the negative.
  """

  def __init__(self, batch_size, indent):
    """
    Args:
      batch_size: number of leading rows of the hidden states that are sentences.
      indent: margin added to every triplet term.
    """
    self.batch_size = batch_size
    self.indent = indent
    # Similarity is taken between two 1-D vectors, hence dim=0.
    self.cos = torch.nn.CosineSimilarity(dim=0)

  def __call__(self, last_hidden_states, persons):
    """
    Compute the summed triplet loss for one batch.

    Args:
      last_hidden_states: hidden states of the sentence rows followed by the
        dictionary rows; assumed (rows, seq_len, dim) — TODO confirm with caller.
      persons: per-token entity ids for the sentence rows; values > 0 mark
        entity tokens, and a run of equal consecutive ids is one mention.

    Returns:
      The sum of all strictly positive triplet terms as a 0-dim tensor, or the
      integer 0 when no term is positive (or there are no entity tokens).
    """
    # Head: sentences of the batch.
    hid_head = last_hidden_states[:self.batch_size]
    # Tail: positive and negative examples from the dictionary.
    hid_tail = last_hidden_states[self.batch_size:]
    # Drop the vectors of tokens that do not belong to any entity.
    mask = persons > 0
    person = persons[mask]
    head = hid_head[mask]
    if person.numel() == 0:
      return 0
    # Lengths of the runs of identical consecutive ids = tokens per mention.
    _, run_lengths = torch.unique_consecutive(person, return_counts=True)
    # Split the head into one chunk of token vectors per mention.
    head_tuple = torch.split(head, run_lengths.tolist())
    # Average the tail over tokens: one vector per dictionary example.
    tail = hid_tail.mean(dim=1)
    tail_tuple = torch.split(tail, 2)
    total = None
    for hd, tl in zip(head_tuple, tail_tuple):
      anchor = hd.mean(dim=0)
      term = self.cos(anchor, tl[1]) - self.cos(anchor, tl[0]) + self.indent
      if term > 0:
        # Out-of-place accumulation: the first term tensor is never mutated.
        total = term if total is None else total + term
    return 0 if total is None else total


In [None]:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir=f"logs/Transformer")

In [None]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
def train_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Run one training epoch of a transformer token classifier.

    Every batch is used for one optimizer step; the model is then re-run on the
    same batch in eval mode to compute metrics. Per-batch loss and metrics, as
    well as their epoch averages, are written to ``writer``; the averages are
    also printed.

    Args:
        model: called as ``model(**tokens)``; its output must expose ``"logits"``.
        dataloader: yields ``(tokens, labels)`` pairs.
        optimizer: optimizer stepping the parameters of ``model``.
        criterion: loss called as ``criterion(logits.transpose(1, 2), labels)``.
        writer: TensorBoard writer receiving the scalars.
        device: device the batch is moved to.
        epoch: zero-based epoch index, used to derive the global step.
    """

    model.train()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    n_batches = len(dataloader)

    for i, (tokens, labels) in tqdm(
        enumerate(dataloader),
        total=n_batches,
        desc="loop over train batches",
    ):
        # Batches carrying the string sentinel 'Error' instead of labels are
        # skipped (presumably produced by the collator for inputs it could not
        # process — confirm). The type is checked first so a label tensor is
        # never compared with a string.
        if isinstance(labels, str) and labels == 'Error':
            continue
        tokens, labels = tokens.to(device), labels.to(device)
        global_step = epoch * n_batches + i

        optimizer.zero_grad()
        outputs = model(**tokens)
        # Dims 1 and 2 of the logits are swapped before the loss (assumes the
        # logits are (batch, seq, classes) and the criterion wants classes on
        # dim 1 — TODO confirm).
        loss = criterion(outputs["logits"].transpose(1, 2), labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss.append(batch_loss)
        writer.add_scalar("batch loss / train", batch_loss, global_step)

        # Metrics come from a second forward pass in eval mode, without
        # gradients.
        with torch.no_grad():
            model.eval()
            outputs_inference = model(**tokens)
            model.train()

        batch_metrics = compute_metrics(
            outputs=outputs_inference["logits"].transpose(1, 2),
            labels=labels,
        )

        for metric_name, metric_value in batch_metrics.items():
            batch_metrics_list[metric_name].append(metric_value)
            writer.add_scalar(
                f"batch {metric_name} / train",
                metric_value,
                global_step,
            )

    avg_loss = np.mean(epoch_loss)
    print(f"Train loss: {avg_loss}\n")
    writer.add_scalar("loss / train", avg_loss, epoch)

    for metric_name, metric_value_list in batch_metrics_list.items():
        metric_value = np.mean(metric_value_list)
        print(f"Train {metric_name}: {metric_value}\n")
        writer.add_scalar(f"{metric_name} / train", metric_value, epoch)

In [None]:
def evaluate_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Evaluate a transformer token classifier over one pass of ``dataloader``.

    Runs the model in eval mode without gradients, logs per-batch loss and
    metrics to ``writer`` under ``"... / test"`` tags, then prints and logs
    their averages over the evaluated batches.

    Args:
        model: called as ``model(**tokens)``; its output must expose ``"logits"``.
        dataloader: yields ``(tokens, labels)`` pairs.
        criterion: loss called as ``criterion(logits.transpose(1, 2), labels)``.
        writer: TensorBoard writer receiving the scalars.
        device: device the batch is moved to.
        epoch: step under which the averages are logged; also used to derive
            the per-batch global step.
    """

    model.eval()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    n_batches = len(dataloader)

    with torch.no_grad():

        for i, (tokens, labels) in tqdm(
            enumerate(dataloader),
            total=n_batches,
            desc="loop over test batches",
        ):
            # Skip batches carrying the string sentinel 'Error' instead of
            # labels; the type is checked first so a label tensor is never
            # compared with a string.
            if isinstance(labels, str) and labels == 'Error':
                continue

            tokens, labels = tokens.to(device), labels.to(device)
            global_step = epoch * n_batches + i

            outputs = model(**tokens)
            logits = outputs["logits"].transpose(1, 2)
            loss = criterion(logits, labels)

            batch_loss = loss.item()
            epoch_loss.append(batch_loss)
            writer.add_scalar("batch loss / test", batch_loss, global_step)

            batch_metrics = compute_metrics(
                outputs=logits,
                labels=labels,
            )

            for metric_name, metric_value in batch_metrics.items():
                batch_metrics_list[metric_name].append(metric_value)
                writer.add_scalar(
                    f"batch {metric_name} / test",
                    metric_value,
                    global_step,
                )

        avg_loss = np.mean(epoch_loss)
        print(f"Test loss:  {avg_loss}\n")
        writer.add_scalar("loss / test", avg_loss, epoch)

        for metric_name, metric_value_list in batch_metrics_list.items():
            # Already a scalar average; no second np.mean is needed.
            metric_value = np.mean(metric_value_list)
            print(f"Test {metric_name}: {metric_value}\n")
            writer.add_scalar(f"{metric_name} / test", metric_value, epoch)

In [None]:
def train(
    n_epochs: int,
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
) -> None:
    """
    Alternate one training epoch and one evaluation pass, ``n_epochs`` times.

    Each iteration prints an epoch banner, calls ``train_epoch`` on
    ``train_dataloader`` and then ``evaluate_epoch`` on ``test_dataloader``,
    passing the zero-based epoch index to both.
    """

    # Arguments that both per-epoch routines receive unchanged.
    shared_kwargs = dict(
        model=model,
        criterion=criterion,
        writer=writer,
        device=device,
    )

    epoch = 0
    while epoch < n_epochs:
        print(f"Epoch [{epoch+1} / {n_epochs}]\n")

        train_epoch(
            dataloader=train_dataloader,
            optimizer=optimizer,
            epoch=epoch,
            **shared_kwargs,
        )
        evaluate_epoch(
            dataloader=test_dataloader,
            epoch=epoch,
            **shared_kwargs,
        )
        epoch += 1

In [None]:
train(n_epochs=8,
      model=model,
      train_dataloader=train_dataloader,
      test_dataloader=valid_dataloader,
      optimizer=optimizer,
      criterion=criterion,
      writer=writer,
      device=device)

Epoch [1 / 8]



loop over train batches: 100%|██████████| 6612/6612 [13:50<00:00,  7.96it/s]


Train loss: 0.056066848826155605

Train accuracy: 0.9888284104778672

Train precision_micro: 0.9888284104778672

Train precision_macro: 0.9552241822073263

Train precision_weighted: 0.9925906946729647

Train recall_micro: 0.9888284104778672

Train recall_macro: 0.9534081542512496

Train recall_weighted: 0.9888284104778672

Train f1_micro: 0.9888284104778672

Train f1_macro: 0.9523089873468832

Train f1_weighted: 0.9898025068578065



loop over test batches: 100%|██████████| 2575/2575 [00:52<00:00, 49.29it/s]


Test loss:  0.024658386643333995

Test accuracy: 0.9935605131184811

Test precision_micro: 0.9935605131184811

Test precision_macro: 0.981890647762415

Test precision_weighted: 0.9938698491750418

Test recall_micro: 0.9935605131184811

Test recall_macro: 0.9803520591133272

Test recall_weighted: 0.9935605131184811

Test f1_micro: 0.9935605131184811

Test f1_macro: 0.9802127450413017

Test f1_weighted: 0.9931641149808017

Epoch [2 / 8]



loop over train batches: 100%|██████████| 6612/6612 [13:43<00:00,  8.03it/s]


Train loss: 0.022317182376600277

Train accuracy: 0.996342637085972

Train precision_micro: 0.996342637085972

Train precision_macro: 0.9857338300209456

Train precision_weighted: 0.9967260551871542

Train recall_micro: 0.996342637085972

Train recall_macro: 0.9851684749934592

Train recall_weighted: 0.996342637085972

Train f1_micro: 0.996342637085972

Train f1_macro: 0.9846255165281877

Train f1_weighted: 0.9962403089393342



loop over test batches: 100%|██████████| 2575/2575 [00:52<00:00, 48.69it/s]


Test loss:  0.02218159767229153

Test accuracy: 0.9944400507589755

Test precision_micro: 0.9944400507589755

Test precision_macro: 0.9845537419349689

Test precision_weighted: 0.995715340394315

Test recall_micro: 0.9944400507589755

Test recall_macro: 0.9841887651459538

Test recall_weighted: 0.9944400507589755

Test f1_micro: 0.9944400507589755

Test f1_macro: 0.9836979753585587

Test f1_weighted: 0.9946100704797733

Epoch [3 / 8]



loop over train batches: 100%|██████████| 6612/6612 [13:46<00:00,  8.00it/s]


Train loss: 0.014736567804603423

Train accuracy: 0.9975956130688567

Train precision_micro: 0.9975956130688567

Train precision_macro: 0.9908526895929831

Train precision_weighted: 0.9979900603207105

Train recall_micro: 0.9975956130688567

Train recall_macro: 0.9907140728735953

Train recall_weighted: 0.9975956130688567

Train f1_micro: 0.9975956130688567

Train f1_macro: 0.9902121558410012

Train f1_weighted: 0.997600183901945



loop over test batches: 100%|██████████| 2575/2575 [00:53<00:00, 47.87it/s]


Test loss:  0.022294635523228738

Test accuracy: 0.9946381633496986

Test precision_micro: 0.9946381633496986

Test precision_macro: 0.9865352464798313

Test precision_weighted: 0.9955775996973124

Test recall_micro: 0.9946381633496986

Test recall_macro: 0.9855785284010388

Test recall_weighted: 0.9946381633496986

Test f1_micro: 0.9946381633496986

Test f1_macro: 0.9853509128088359

Test f1_weighted: 0.9946716698258409

Epoch [4 / 8]



loop over train batches: 100%|██████████| 6612/6612 [13:50<00:00,  7.96it/s]


Train loss: 0.011178229354397422

Train accuracy: 0.9983697991996818

Train precision_micro: 0.9983697991996818

Train precision_macro: 0.9936674979481366

Train precision_weighted: 0.9985972969952103

Train recall_micro: 0.9983697991996818

Train recall_macro: 0.9932795713987382

Train recall_weighted: 0.9983697991996818

Train f1_micro: 0.9983697991996818

Train f1_macro: 0.9931101738218248

Train f1_weighted: 0.9983577837396801



loop over test batches: 100%|██████████| 2575/2575 [00:54<00:00, 47.61it/s]


Test loss:  0.027590975546919756

Test accuracy: 0.9918470627573923

Test precision_micro: 0.9918470627573923

Test precision_macro: 0.9784916933234582

Test precision_weighted: 0.991868269466131

Test recall_micro: 0.9918470627573923

Test recall_macro: 0.9756920009030963

Test recall_weighted: 0.9918470627573923

Test f1_micro: 0.9918470627573923

Test f1_macro: 0.9760583443236253

Test f1_weighted: 0.9911516934980922

Epoch [5 / 8]



loop over train batches: 100%|██████████| 6612/6612 [13:51<00:00,  7.95it/s]


Train loss: 0.007775318792909052

Train accuracy: 0.9989441944170855

Train precision_micro: 0.9989441944170855

Train precision_macro: 0.9960596589199645

Train precision_weighted: 0.999029113823293

Train recall_micro: 0.9989441944170855

Train recall_macro: 0.9958694509185141

Train recall_weighted: 0.9989441944170855

Train f1_micro: 0.9989441944170855

Train f1_macro: 0.9957121804711955

Train f1_weighted: 0.9989032050339739



loop over test batches: 100%|██████████| 2575/2575 [00:57<00:00, 44.99it/s]


Test loss:  0.0284211199206794

Test accuracy: 0.9936026741846493

Test precision_micro: 0.9936026741846493

Test precision_macro: 0.9818879079475574

Test precision_weighted: 0.9942993620432217

Test recall_micro: 0.9936026741846493

Test recall_macro: 0.9808476305243576

Test recall_weighted: 0.9936026741846493

Test f1_micro: 0.9936026741846493

Test f1_macro: 0.9806271972827222

Test f1_weighted: 0.9934604341199167

Epoch [6 / 8]



loop over train batches: 100%|██████████| 6612/6612 [13:57<00:00,  7.90it/s]


Train loss: 0.006093465760784175

Train accuracy: 0.9992778121976886

Train precision_micro: 0.9992778121976886

Train precision_macro: 0.9969225538315193

Train precision_weighted: 0.9993305762031968

Train recall_micro: 0.9992778121976886

Train recall_macro: 0.9965248900222005

Train recall_weighted: 0.9992778121976886

Train f1_micro: 0.9992778121976886

Train f1_macro: 0.9965442955809757

Train f1_weighted: 0.9992431902746863



loop over test batches: 100%|██████████| 2575/2575 [00:55<00:00, 46.45it/s]


Test loss:  0.0316951150209496

Test accuracy: 0.993273768007262

Test precision_micro: 0.993273768007262

Test precision_macro: 0.9803429236688195

Test precision_weighted: 0.9936483026651564

Test recall_micro: 0.993273768007262

Test recall_macro: 0.9791066292144864

Test recall_weighted: 0.993273768007262

Test f1_micro: 0.993273768007262

Test f1_macro: 0.978674155138193

Test f1_weighted: 0.992888621224122

Epoch [7 / 8]



loop over train batches: 100%|██████████| 6612/6612 [13:52<00:00,  7.94it/s]


Train loss: 0.004934873717426103

Train accuracy: 0.9994532555194404

Train precision_micro: 0.9994532555194404

Train precision_macro: 0.9977974453734046

Train precision_weighted: 0.99951053344778

Train recall_micro: 0.9994532555194404

Train recall_macro: 0.9976705902450517

Train recall_weighted: 0.9994532555194404

Train f1_micro: 0.9994532555194404

Train f1_macro: 0.9976226329780622

Train f1_weighted: 0.9994479992998163



loop over test batches: 100%|██████████| 2575/2575 [00:54<00:00, 47.08it/s]


Test loss:  0.035686228609673125

Test accuracy: 0.9941410320682404

Test precision_micro: 0.9941410320682404

Test precision_macro: 0.9810833656970301

Test precision_weighted: 0.9944841992323067

Test recall_micro: 0.9941410320682404

Test recall_macro: 0.9804891807415828

Test recall_weighted: 0.9941410320682404

Test f1_micro: 0.9941410320682404

Test f1_macro: 0.9801152825895159

Test f1_weighted: 0.9938795053681576

Epoch [8 / 8]



loop over train batches: 100%|██████████| 6612/6612 [14:00<00:00,  7.87it/s]


Train loss: 0.004135170928074093

Train accuracy: 0.9994581172851434

Train precision_micro: 0.9994581172851434

Train precision_macro: 0.9980904100256628

Train precision_weighted: 0.9995247361894511

Train recall_micro: 0.9994581172851434

Train recall_macro: 0.9981080543027943

Train recall_weighted: 0.9994581172851434

Train f1_micro: 0.9994581172851434

Train f1_macro: 0.9980047213266702

Train f1_weighted: 0.9994534193446781



loop over test batches: 100%|██████████| 2575/2575 [00:54<00:00, 47.55it/s]


Test loss:  0.03549315295425313

Test accuracy: 0.9942217882908919

Test precision_micro: 0.9942217882908919

Test precision_macro: 0.9829347293207684

Test precision_weighted: 0.9945552448191197

Test recall_micro: 0.9942217882908919

Test recall_macro: 0.9824217725249851

Test recall_weighted: 0.9942217882908919

Test f1_micro: 0.9942217882908919

Test f1_macro: 0.981914576660603

Test f1_weighted: 0.9939018108943533



In [None]:
# NOTE(review): `model.state_dict` is passed without being called, so the bound
# method (and, via pickling, presumably the model object it is bound to) is
# serialized instead of the dict of tensors. The weight-loading cells rely on
# this by calling the loaded object (`state_dict()`); switching to
# `model.state_dict()` here must be done together with those cells.
torch.save(model.state_dict, '/content/drive/MyDrive/transformer_weights/RUBERT_weights.pt')

In [None]:
!cp -a /content/logs/Transformer /content/drive/MyDrive/Transformer

In [None]:
evaluate_epoch(
  model=model,
  dataloader=test_dataloader,
  criterion=criterion,
  writer=writer,
  device=device,
  epoch=8,
)


In [None]:
class BiLSTMAttn(torch.nn.Module):
    """
    LSTM encoder followed by multi-head self-attention and a linear tagging head.

    Token id 0 is treated as padding: it is excluded from the LSTM pass via
    sequence packing and masked out as an attention key. Padding is assumed to
    be trailing (sequence lengths are derived by counting non-zero ids).
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        hidden_size: int,
        num_layers: int,
        dropout: float,
        bidirectional: bool,
        num_heads: int,
        n_classes: int,
    ):
        """
        Args:
            num_embeddings: vocabulary size of the embedding table.
            embedding_dim: size of each token embedding.
            hidden_size: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
            dropout: dropout used by the LSTM and by the attention layer.
            bidirectional: whether the LSTM runs in both directions.
            num_heads: number of attention heads; must divide the LSTM output size.
            n_classes: number of output classes per token.
        """
        super().__init__()

        # Width of the LSTM output: doubled only when both directions are used.
        rnn_output_size = hidden_size * (2 if bidirectional else 1)

        self.embedding = torch.nn.Embedding(num_embeddings=num_embeddings,
                                            embedding_dim=embedding_dim)
        self.rnn = torch.nn.LSTM(input_size=embedding_dim,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 batch_first=True,
                                 dropout=dropout,
                                 bidirectional=bidirectional)

        self.self_attention = torch.nn.MultiheadAttention(embed_dim=rnn_output_size,
                                                          num_heads=num_heads,
                                                          dropout=dropout,
                                                          batch_first=True,
                                                          )
        self.head = torch.nn.Linear(rnn_output_size, n_classes)

    def forward(self, tokens: torch.LongTensor) -> torch.Tensor:
        """
        Compute per-token class logits.

        Args:
            tokens: token ids of shape (batch, seq_len); id 0 marks padding.

        Returns:
            Logits of shape (batch, n_classes, seq_len).
        """
        embed = self.embedding(tokens)
        padding_mask = tokens == 0
        # pack_padded_sequence builds a PackedSequence so that the LSTM does not
        # process padding positions; lengths must live on the CPU.
        lengths = (~padding_mask).sum(dim=1).detach().cpu()
        packed_embed = torch.nn.utils.rnn.pack_padded_sequence(
            embed, lengths, batch_first=True, enforce_sorted=False
        )

        packed_rnn_output, _ = self.rnn(packed_embed)
        # pad_packed_sequence turns the PackedSequence back into a tensor.
        # total_length keeps the time dimension equal to the input's, so the
        # output always lines up with padding_mask even when every sequence
        # in the batch is shorter than seq_len.
        rnn_output, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_rnn_output, batch_first=True, total_length=tokens.size(1)
        )
        attention_out, _ = self.self_attention(query=rnn_output,
                                               key=rnn_output,
                                               value=rnn_output,
                                               key_padding_mask=padding_mask,
                                               need_weights=False)

        logits = self.head(attention_out)
        # Move the class dimension to position 1.
        return logits.transpose(1, 2)

In [None]:
model = BiLSTMAttn(
    num_embeddings=len(token2idx),
    embedding_dim=100,
    hidden_size=100,
    num_layers=1,
    dropout=0.0,
    bidirectional=True,
    num_heads = 10,
    n_classes=len(label2idx),
).to(device)

In [None]:
model

BiLSTMAttn(
  (embedding): Embedding(10952, 100)
  (rnn): LSTM(100, 100, batch_first=True, bidirectional=True)
  (self_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
  )
  (head): Linear(in_features=200, out_features=9, bias=True)
)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
# NOTE(review): `model.state_dict` is passed without being called, so the bound
# method is serialized rather than the dict of tensors; whoever loads this file
# has to call the loaded object to get the weights. In notebook order this cell
# also precedes the training call, so it would store the untrained weights —
# confirm the intended execution order.
torch.save(model.state_dict, '/content/drive/MyDrive/transformer_weights/BiLSTM_attn_weights.pt')

In [None]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir=f"logs/BiLSTMAttn")

In [None]:
def train_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Train ``model`` for a single epoch.

    Each batch drives one optimizer step, after which the model is re-run on
    the same batch in eval mode to obtain metrics. Batch-level and
    epoch-averaged loss/metrics go to ``writer``; the averages are printed too.
    """

    model.train()

    losses = []
    metric_history = defaultdict(list)
    steps_per_epoch = len(dataloader)

    progress = tqdm(
        enumerate(dataloader),
        total=steps_per_epoch,
        desc="loop over train batches",
    )
    for step, (tokens, labels) in progress:
        tokens = tokens.to(device)
        labels = labels.to(device)
        global_step = epoch * steps_per_epoch + step

        # Optimization step.
        optimizer.zero_grad()
        loss = criterion(model(tokens), labels)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        losses.append(loss_value)
        writer.add_scalar("batch loss / train", loss_value, global_step)

        # Metrics are computed from an eval-mode forward pass, without gradients.
        with torch.no_grad():
            model.eval()
            eval_outputs = model(tokens)
            model.train()

        metrics = compute_metrics(outputs=eval_outputs, labels=labels)
        for name in metrics:
            value = metrics[name]
            metric_history[name].append(value)
            writer.add_scalar(f"batch {name} / train", value, global_step)

    avg_loss = np.mean(losses)
    print(f"Train loss: {avg_loss}\n")
    writer.add_scalar("loss / train", avg_loss, epoch)

    for name, values in metric_history.items():
        mean_value = np.mean(values)
        print(f"Train {name}: {mean_value}\n")
        writer.add_scalar(f"{name} / train", mean_value, epoch)

In [None]:
def evaluate_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Evaluate ``model`` over one pass of ``dataloader``.

    Runs in eval mode without gradients, logs per-batch loss and metrics to
    ``writer`` under ``"... / test"`` tags, then prints and logs their averages.

    Args:
        model: called as ``model(tokens)``; its output is passed directly to
            ``criterion`` and ``compute_metrics``.
        dataloader: yields ``(tokens, labels)`` pairs.
        criterion: loss called as ``criterion(outputs, labels)``.
        writer: TensorBoard writer receiving the scalars.
        device: device the batch is moved to.
        epoch: step under which the averages are logged; also used to derive
            the per-batch global step.
    """

    model.eval()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    n_batches = len(dataloader)

    with torch.no_grad():

        for i, (tokens, labels) in tqdm(
            enumerate(dataloader),
            total=n_batches,
            desc="loop over test batches",
        ):

            tokens, labels = tokens.to(device), labels.to(device)
            global_step = epoch * n_batches + i

            outputs = model(tokens)
            loss = criterion(outputs, labels)

            batch_loss = loss.item()
            epoch_loss.append(batch_loss)
            writer.add_scalar("batch loss / test", batch_loss, global_step)

            batch_metrics = compute_metrics(
                outputs=outputs,
                labels=labels,
            )

            for metric_name, metric_value in batch_metrics.items():
                batch_metrics_list[metric_name].append(metric_value)
                writer.add_scalar(
                    f"batch {metric_name} / test",
                    metric_value,
                    global_step,
                )

        avg_loss = np.mean(epoch_loss)
        print(f"Test loss:  {avg_loss}\n")
        writer.add_scalar("loss / test", avg_loss, epoch)

        for metric_name, metric_value_list in batch_metrics_list.items():
            # Already a scalar average; no second np.mean is needed.
            metric_value = np.mean(metric_value_list)
            print(f"Test {metric_name}: {metric_value}\n")
            writer.add_scalar(f"{metric_name} / test", metric_value, epoch)

In [None]:
def train(
    n_epochs: int,
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
) -> None:
    """
    Alternate one training epoch and one evaluation pass, ``n_epochs`` times.

    Each iteration prints an epoch banner, calls ``train_epoch`` on
    ``train_dataloader`` and then ``evaluate_epoch`` on ``test_dataloader``,
    passing the zero-based epoch index to both.
    """

    # Arguments that both per-epoch routines receive unchanged.
    shared_kwargs = dict(
        model=model,
        criterion=criterion,
        writer=writer,
        device=device,
    )

    epoch = 0
    while epoch < n_epochs:
        print(f"Epoch [{epoch+1} / {n_epochs}]\n")

        train_epoch(
            dataloader=train_dataloader,
            optimizer=optimizer,
            epoch=epoch,
            **shared_kwargs,
        )
        evaluate_epoch(
            dataloader=test_dataloader,
            epoch=epoch,
            **shared_kwargs,
        )
        epoch += 1

In [None]:
train(n_epochs=16,
      model=model,
      train_dataloader=train_dataloader,
      test_dataloader=valid_dataloader,
      optimizer=optimizer,
      criterion=criterion,
      writer=writer,
      device=device)

In [None]:
!cp -a /content/logs/BiLSTMAttn /content/drive/MyDrive/BiLSTMlAttn

In [None]:
evaluate_epoch(
  model=model,
  dataloader=test_dataloader,
  criterion=criterion,
  writer=writer,
  device=device,
  epoch=16,
)


# Distil


In [None]:
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
model = BiLSTM(
    num_embeddings=28996,
    embedding_dim=100,
    hidden_size=100,
    num_layers=1,
    dropout=0.0,
    bidirectional=True,
    n_classes=len(label2idx),
).to(device)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
model

In [None]:
model = BiLSTMAttn(
    num_embeddings=28996,
    embedding_dim=100,
    hidden_size=100,
    num_layers=1,
    dropout=0.0,
    bidirectional=True,
    num_heads = 10,
    n_classes=len(label2idx),
).to(device)

In [None]:
model

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
model_name = "distilbert-base-cased"

In [None]:
from transformers import AutoModelForTokenClassification

In [None]:
model_bert = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2idx),
).to(device)

In [None]:
# Checkpoints in this notebook are written by passing `model.state_dict` (the
# bound method) to torch.save, so the loaded object may be a callable that has
# to be invoked to obtain the tensors. A plain state dict (saved via
# `model.state_dict()`) is accepted as well.
state_dict = torch.load('/content/drive/MyDrive/transformer_weights/transformer_weights.pt', map_location=torch.device('cpu'))
weights = state_dict() if callable(state_dict) else state_dict
model_bert.load_state_dict(weights)

In [None]:
train_dataset_bert = TransformersDataset(
    token_seq=train_token_seq,
    label_seq=train_label_seq,
)
valid_dataset_bert = TransformersDataset(
    token_seq=valid_token_seq,
    label_seq=valid_label_seq,
)
test_dataset_bert = TransformersDataset(
    token_seq=test_token_seq,
    label_seq=test_label_seq,
)

In [None]:
collator_bert = TransformersCollator(
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    label_padding_value=-1,
)

In [None]:
train_dataloader_bert = torch.utils.data.DataLoader(
    train_dataset_bert,
    batch_size=2,
    shuffle=True, # training data is reshuffled every epoch
    collate_fn=collator_bert,
)
valid_dataloader_bert = torch.utils.data.DataLoader(
    valid_dataset_bert,
    batch_size=1,  # keep batch_size=1 for correct metric measurements
    shuffle=False, # keep shuffle=False for correct metric measurements
    collate_fn=collator_bert,
)
test_dataloader_bert = torch.utils.data.DataLoader(
    test_dataset_bert,
    batch_size=1,  # keep batch_size=1 for correct metric measurements
    shuffle=False, # keep shuffle=False for correct metric measurements
    collate_fn=collator_bert,
)

In [None]:
tokens, labels = next(iter(train_dataloader_bert))

In [None]:
out = model_bert(**tokens.to(device))

In [None]:
# Create a SummaryWriter for the BiLSTMModel + Transformer experiment.

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir=f"logs/BiLSTM_Transformer_label")

In [None]:
def train_epoch(
    model: torch.nn.Module,
    model_bert: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Run one distillation epoch: train the student on the teacher's predictions.

    For every batch the teacher (``model_bert``, kept in eval mode) predicts a
    label per token and the student (``model``) is optimized with ``criterion``
    against those predicted labels. Metrics are computed against the true
    ``labels`` from an eval-mode forward pass of the student. Per-batch and
    epoch-averaged loss/metrics are written to ``writer``; averages are printed.

    Args:
        model: student, called as ``model(tokens['input_ids'])``.
        model_bert: teacher, called as ``model_bert(**tokens)``; its output
            must expose ``'logits'``.
        dataloader: yields ``(tokens, labels)`` pairs.
        optimizer: optimizer stepping the student's parameters.
        criterion: loss called as ``criterion(student_outputs, teacher_labels)``.
        writer: TensorBoard writer receiving the scalars.
        device: device the batch is moved to.
        epoch: zero-based epoch index, used to derive the global step.
    """
    # Ingredients of the alternative, blended loss (see the commented-out line
    # in the loop); unused while training on the teacher's hard labels.
    a = 0.5
    criterion_mse = torch.nn.MSELoss()
    criterion_ce = torch.nn.CrossEntropyLoss(ignore_index=-1)

    model.train()
    model_bert.eval()
    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    n_batches = len(dataloader)
    # Label value the criterion skips; falls back to the -1 used for label
    # padding elsewhere in this notebook.
    ignore_index = getattr(criterion, "ignore_index", -1)

    for i, (tokens, labels) in tqdm(
        enumerate(dataloader),
        total=n_batches,
        desc="loop over train batches",
    ):

        tokens, labels = tokens.to(device), labels.to(device)
        global_step = epoch * n_batches + i

        optimizer.zero_grad()
        outputs = model(tokens['input_ids'])
        # The teacher is never updated here, so no autograd graph is built.
        with torch.no_grad():
            outputs_bert = model_bert(**tokens)
        teacher_labels = outputs_bert['logits'].argmax(dim=-1)
        # Positions ignored in the true labels (e.g. padding) stay ignored, so
        # the student is not fitted to teacher predictions on them.
        teacher_labels = teacher_labels.masked_fill(
            labels == ignore_index, ignore_index
        )

        #loss = a*criterion_ce(outputs, labels) + (1-a)*criterion_mse(outputs, outputs_bert["logits"].transpose(1, 2))
        loss = criterion(outputs, teacher_labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss.append(batch_loss)
        writer.add_scalar("batch loss / train", batch_loss, global_step)

        # Metrics come from an eval-mode forward pass of the student, without
        # gradients, and are measured against the true labels.
        with torch.no_grad():
            model.eval()
            outputs_inference = model(tokens['input_ids'])
            model.train()

        batch_metrics = compute_metrics(
            outputs=outputs_inference,
            labels=labels,
        )

        for metric_name, metric_value in batch_metrics.items():
            batch_metrics_list[metric_name].append(metric_value)
            writer.add_scalar(
                f"batch {metric_name} / train",
                metric_value,
                global_step,
            )

    avg_loss = np.mean(epoch_loss)
    print(f"Train loss: {avg_loss}\n")
    writer.add_scalar("loss / train", avg_loss, epoch)

    for metric_name, metric_value_list in batch_metrics_list.items():
        metric_value = np.mean(metric_value_list)
        print(f"Train {metric_name}: {metric_value}\n")
        writer.add_scalar(f"{metric_name} / train", metric_value, epoch)

In [None]:
def evaluate_epoch(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Evaluate the student model over one pass of ``dataloader``.

    Runs in eval mode without gradients, feeding only ``tokens['input_ids']``
    to the model; logs per-batch loss and metrics to ``writer`` under
    ``"... / test"`` tags, then prints and logs their averages.

    Args:
        model: called as ``model(tokens['input_ids'])``; its output is passed
            directly to ``criterion`` and ``compute_metrics``.
        dataloader: yields ``(tokens, labels)`` pairs.
        criterion: loss called as ``criterion(outputs, labels)``.
        writer: TensorBoard writer receiving the scalars.
        device: device the batch is moved to.
        epoch: step under which the averages are logged; also used to derive
            the per-batch global step.
    """

    model.eval()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    n_batches = len(dataloader)

    with torch.no_grad():

        for i, (tokens, labels) in tqdm(
            enumerate(dataloader),
            total=n_batches,
            desc="loop over test batches",
        ):

            tokens, labels = tokens.to(device), labels.to(device)
            global_step = epoch * n_batches + i

            outputs = model(tokens['input_ids'])
            loss = criterion(outputs, labels)

            batch_loss = loss.item()
            epoch_loss.append(batch_loss)
            writer.add_scalar("batch loss / test", batch_loss, global_step)

            batch_metrics = compute_metrics(
                outputs=outputs,
                labels=labels,
            )

            for metric_name, metric_value in batch_metrics.items():
                batch_metrics_list[metric_name].append(metric_value)
                writer.add_scalar(
                    f"batch {metric_name} / test",
                    metric_value,
                    global_step,
                )

        avg_loss = np.mean(epoch_loss)
        print(f"Test loss:  {avg_loss}\n")
        writer.add_scalar("loss / test", avg_loss, epoch)

        for metric_name, metric_value_list in batch_metrics_list.items():
            # Already a scalar average; no second np.mean is needed.
            metric_value = np.mean(metric_value_list)
            print(f"Test {metric_name}: {metric_value}\n")
            writer.add_scalar(f"{metric_name} / test", metric_value, epoch)

In [None]:
def train(
    n_epochs: int,
    model: torch.nn.Module,
    model_bert: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    writer: SummaryWriter,
    device: torch.device,
) -> None:
    """
    Run ``n_epochs`` epochs; each is a ``train_epoch`` pass followed by an
    ``evaluate_epoch`` pass.

    :param n_epochs: number of epochs to run
    :param model: network that is trained and evaluated
    :param model_bert: second network forwarded to ``train_epoch`` only
        (presumably the teacher model -- TODO confirm against ``train_epoch``)
    :param train_dataloader: batches for the training pass
    :param test_dataloader: batches for the evaluation pass
    :param optimizer: optimizer forwarded to ``train_epoch``
    :param criterion: loss used by both passes
    :param writer: TensorBoard writer shared by both passes
    :param device: device forwarded to both passes
    """

    for epoch in range(n_epochs):

        print(f"Epoch [{epoch+1} / {n_epochs}]\n")

        # Keyword arguments common to the training and the evaluation pass.
        shared = dict(
            model=model,
            criterion=criterion,
            writer=writer,
            device=device,
            epoch=epoch,
        )

        train_epoch(
            model_bert=model_bert,
            dataloader=train_dataloader,
            optimizer=optimizer,
            **shared,
        )
        evaluate_epoch(dataloader=test_dataloader, **shared)

Простая BiLSTM с коэффициентом 'a' в лоссе, равным 0.5: поровну учитываются норма разности между логитами моделей учителя и ученика и кросс-энтропия

In [None]:

# Train for 12 epochs on train_dataloader_bert; train() evaluates
# valid_dataloader_bert after every epoch.
# NOTE(review): `model`, `model_bert`, `optimizer`, `criterion` and `writer` must
# already be bound to the objects of this experiment -- the cells that build them
# are not next to this call; confirm before re-running.
train(n_epochs=12,
      model=model,
      model_bert=model_bert,
      train_dataloader=train_dataloader_bert,
      #train_dataloader_bert=train_dataloader_bert,
      test_dataloader=valid_dataloader_bert,
      optimizer=optimizer,
      criterion=criterion,
      writer=writer,
      device=device)

Простая BiLSTM

In [None]:
# One evaluation pass over test_dataloader_bert.
# NOTE(review): `epoch=8` is only used by evaluate_epoch to compute TensorBoard
# steps. If `writer` is the one passed to train(), these points land under the
# same "... / test" tags as the per-epoch validation results -- confirm intended.
evaluate_epoch(
  model=model,
  dataloader=test_dataloader_bert,
  criterion=criterion,
  writer=writer,
  device=device,
  epoch=8,
)


BiLSTM по меткам учителя

In [None]:

# Train for 12 epochs on train_dataloader_bert; train() evaluates
# valid_dataloader_bert after every epoch.
# NOTE(review): this call is textually identical to the other experiments' calls;
# what differs must be the objects currently bound to `model`, `optimizer`,
# `criterion` and `writer` -- confirm they were rebuilt for this experiment.
train(n_epochs=12,
      model=model,
      model_bert=model_bert,
      train_dataloader=train_dataloader_bert,
      #train_dataloader_bert=train_dataloader_bert,
      test_dataloader=valid_dataloader_bert,
      optimizer=optimizer,
      criterion=criterion,
      writer=writer,
      device=device)

In [None]:
# One evaluation pass over test_dataloader_bert.
# NOTE(review): `epoch=8` is only used by evaluate_epoch to compute TensorBoard
# steps. If `writer` is the one passed to train(), these points land under the
# same "... / test" tags as the per-epoch validation results -- confirm intended.
evaluate_epoch(
  model=model,
  dataloader=test_dataloader_bert,
  criterion=criterion,
  writer=writer,
  device=device,
  epoch=8,
)


In [None]:
!cp -a ./logs/BiLSTM_Transformer/* /content/drive/MyDrive/BiLSTM_Trans

In [None]:
!cp -a ./logs/BiLSTM_Transformer_label/* /content/drive/MyDrive/BiLSTM_Trans_label

Сеть с вниманием

In [None]:

# Train for 12 epochs on train_dataloader_bert; train() evaluates
# valid_dataloader_bert after every epoch.
# NOTE(review): this call is textually identical to the other experiments' calls;
# what differs must be the objects currently bound to `model`, `optimizer`,
# `criterion` and `writer` -- confirm they were rebuilt for this experiment.
train(n_epochs=12,
      model=model,
      model_bert=model_bert,
      train_dataloader=train_dataloader_bert,
      #train_dataloader_bert=train_dataloader_bert,
      test_dataloader=valid_dataloader_bert,
      optimizer=optimizer,
      criterion=criterion,
      writer=writer,
      device=device)

In [None]:
# One evaluation pass over test_dataloader_bert.
# NOTE(review): `epoch=8` is only used by evaluate_epoch to compute TensorBoard
# steps. If `writer` is the one passed to train(), these points land under the
# same "... / test" tags as the per-epoch validation results -- confirm intended.
evaluate_epoch(
  model=model,
  dataloader=test_dataloader_bert,
  criterion=criterion,
  writer=writer,
  device=device,
  epoch=8,
)


Простая BiLSTM с коэффициентом 'a' в лоссе, равным нулю: учитывается только норма разности между логитами моделей учителя и ученика

In [None]:

# Train for 12 epochs on train_dataloader_bert; train() evaluates
# valid_dataloader_bert after every epoch.
# NOTE(review): this call is textually identical to the other experiments' calls;
# what differs must be the objects currently bound to `model`, `optimizer`,
# `criterion` and `writer` -- confirm they were rebuilt for this experiment.
train(n_epochs=12,
      model=model,
      model_bert=model_bert,
      train_dataloader=train_dataloader_bert,
      #train_dataloader_bert=train_dataloader_bert,
      test_dataloader=valid_dataloader_bert,
      optimizer=optimizer,
      criterion=criterion,
      writer=writer,
      device=device)

In [None]:
# One evaluation pass over test_dataloader_bert.
# NOTE(review): `epoch=8` is only used by evaluate_epoch to compute TensorBoard
# steps. If `writer` is the one passed to train(), these points land under the
# same "... / test" tags as the per-epoch validation results -- confirm intended.
evaluate_epoch(
  model=model,
  dataloader=test_dataloader_bert,
  criterion=criterion,
  writer=writer,
  device=device,
  epoch=8,
)


# Тренировка на векторную близость


In [None]:
# Second constructor argument of TripletLoss.
# Presumably the triplet margin -- TODO confirm against the TripletLoss definition.
indent = 1
# Module-level loss object; train_epoch_plus / evaluate_epoch_plus read it as a global.
triplet_loss = TripletLoss(batch_size, indent)

In [None]:
def train_epoch_plus(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Run one training pass that optimises ``criterion`` plus the global ``triplet_loss``.

    Batches without any positive entry in ``persons`` are skipped, and so are
    batches whose combined loss equals zero. The mean of the remaining batch
    losses is printed at the end.

    :param model: network called as ``model(**tokens)``; its output must expose
        ``"logits"`` and ``"hidden_states"``
    :param dataloader: yields ``(tokens, labels, persons)`` triples
    :param optimizer: optimizer stepped once per used batch
    :param criterion: loss applied to the first ``batch_size`` logits
        (``batch_size`` is a module-level global)
    :param device: device the batch is moved to
    :param epoch: accepted for interface symmetry; not used while TensorBoard
        logging is disabled
    """

    model.train()

    losses = []

    batches = tqdm(
        enumerate(dataloader),
        total=len(dataloader),
        desc="loop over train batches",
    )

    for _, (tokens, labels, persons) in batches:
        # Nothing to train on when the batch has no positive person id.
        if not (persons > 0).any():
            continue

        tokens = tokens.to(device)
        labels = labels.to(device)
        persons = persons.to(device)

        optimizer.zero_grad()
        outputs = model(**tokens)

        tagging_loss = criterion(outputs["logits"][:batch_size].transpose(1, 2), labels)
        metric_loss = triplet_loss(outputs['hidden_states'][-1], persons)
        loss = tagging_loss + metric_loss

        # A loss of exactly zero is skipped without a backward pass.
        if loss == 0:
            continue

        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    avg_loss = np.mean(losses)
    print(f"Train loss: {avg_loss}\n")


In [None]:
def evaluate_epoch_plus(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    device: torch.device,
    epoch: int,
) -> None:
    """
    Run one evaluation pass for the ``criterion`` + global ``triplet_loss`` objective.

    Gradients are disabled. Batches without any positive entry in ``persons``
    and batches whose combined loss equals zero are left out of the average,
    which is printed at the end.

    :param model: network called as ``model(**tokens)``; its output must expose
        ``"logits"`` and ``"hidden_states"``
    :param dataloader: yields ``(tokens, labels, persons)`` triples
    :param criterion: loss applied to the first ``batch_size`` logits
        (``batch_size`` is a module-level global)
    :param device: device the batch is moved to
    :param epoch: accepted for interface symmetry; not used while TensorBoard
        logging is disabled
    """

    model.eval()

    losses = []

    with torch.no_grad():

        batches = tqdm(
            enumerate(dataloader),
            total=len(dataloader),
            desc="loop over test batches",
        )

        for _, (tokens, labels, persons) in batches:
            # Batches without a positive person id are left out.
            if not (persons > 0).any():
                continue

            tokens = tokens.to(device)
            labels = labels.to(device)
            persons = persons.to(device)

            outputs = model(**tokens)

            tagging_loss = criterion(outputs["logits"][:batch_size].transpose(1, 2), labels)
            metric_loss = triplet_loss(outputs['hidden_states'][-1], persons)
            loss = tagging_loss + metric_loss

            # Zero losses are excluded from the average, as in training.
            if loss == 0:
                continue

            losses.append(loss.item())

        avg_loss = np.mean(losses)
        print(f"Test loss:  {avg_loss}\n")


In [None]:
def train_plus(
    n_epochs: int,
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    device: torch.device,
) -> None:
    """
    Run ``n_epochs`` epochs; each is a ``train_epoch_plus`` pass followed by an
    ``evaluate_epoch_plus`` pass.

    :param n_epochs: number of epochs to run
    :param model: network that is trained and evaluated
    :param train_dataloader: batches for the training pass
    :param test_dataloader: batches for the evaluation pass
    :param optimizer: optimizer forwarded to ``train_epoch_plus``
    :param criterion: loss forwarded to both passes
    :param device: device forwarded to both passes
    """

    for epoch in range(n_epochs):

        print(f"Epoch [{epoch+1} / {n_epochs}]\n")

        # Keyword arguments common to the training and the evaluation pass.
        shared = dict(
            model=model,
            criterion=criterion,
            device=device,
            epoch=epoch,
        )

        train_epoch_plus(
            dataloader=train_dataloader,
            optimizer=optimizer,
            **shared,
        )
        evaluate_epoch_plus(dataloader=test_dataloader, **shared)

In [None]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

In [None]:
# One epoch of joint training (cross-entropy + triplet loss).
# NOTE(review): `test_dataloader` is the same object as `train_dataloader`, so the
# "Test loss" printed by this call is measured on the training data, not on a
# held-out split -- confirm this is intended.
train_plus(n_epochs=1,
      model=model,
      train_dataloader=train_dataloader,
      test_dataloader=train_dataloader,
      optimizer=optimizer,
      criterion=criterion,
      #writer=writer,
      device=device)

Epoch [1 / 1]



loop over train batches: 100%|██████████| 158/158 [09:31<00:00,  3.62s/it]


Train loss: 0.48222977928770394



loop over test batches: 100%|██████████| 158/158 [01:28<00:00,  1.79it/s]

Test loss:  0.04150095114003142






In [None]:
# Save the weights only. `state_dict` has to be *called*: passing the bound method
# pickles the method (and with it the whole model object) instead of the parameter
# dictionary, and the file cannot be fed to `load_state_dict` afterwards.
torch.save(model.state_dict(), '/content/drive/MyDrive/transformer_weights/RUBERT_weights_plus.pt')

# 2D Визуализация

In [None]:
#from sklearn.linear_model import Ridge
#from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt

from ipywidgets import interactive, fixed, interact_manual, IntSlider, FloatLogSlider, FloatSlider
from sklearn.datasets import make_classification, make_moons, make_blobs
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.manifold import TSNE, Isomap

In [None]:
def plot_2d_data(data, labels, title='Исходные данные', cmap='tab20', ax=None):
    '''
    Draw a 2D scatter plot coloured by label, with a colorbar.

    :param np.ndarray data: 2D array of points; columns 0 and 1 are plotted
    :param Union[list, np.ndarray] labels: label of every point
    :param str title: plot title
    :param str cmap: name of the colour palette
    :param Optional[matplotlib.axes.Axes] ax: axes to draw on.
        If omitted, a new figure is created and shown immediately.
        Otherwise the plot is added to the given axes and nothing is shown.
    '''
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)

    # Only a figure created here is laid out and shown at the end.
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(1, 1, figsize=(10, 5))

    palette = plt.get_cmap(cmap, n_clusters)
    scatter = ax.scatter(data[:, 0], data[:, 1], c=labels, cmap=palette)

    # Ticks sit at the centres of n_clusters equal bands that start at min(labels)
    # and span n_clusters - 1 units. This presumably matches the colour bands only
    # when the labels are consecutive integers -- TODO confirm.
    band_centres = (np.arange(n_clusters) + 0.5) * (n_clusters - 1) / n_clusters
    cbar = plt.colorbar(scatter, label='Номер кластера', ax=ax)
    cbar.set_ticks(np.min(labels) + band_centres)
    cbar.set_ticklabels(unique_labels)

    ax.set_title(title)
    ax.grid(True)

    if owns_figure:
        fig.tight_layout()
        plt.show()

In [None]:
# Synthetic 5-class dataset: 1000 samples, 200 features, 100 of them informative.
# NOTE(review): `random_state=None` makes this draw non-reproducible, and `data` /
# `labels` are rebound from `entus` / `labus` in a later cell before any plot uses
# them -- this looks like leftover demo code; confirm whether it is still needed.
data, labels = make_classification(
    n_samples=1000, n_features=200, n_informative=100,
    n_repeated=0, n_classes=5, n_clusters_per_class=2, weights=None, 
    flip_y=0.01, class_sep=2.5, hypercube=True, shift=0.0, scale=1.0, 
    shuffle=True, random_state=None
)

In [None]:
def get_entyti_list(last_hidden_states, persons):
  """
  Average hidden vectors over runs of consecutive tokens that share a person id.

  Only the first ``batch_size`` rows of ``last_hidden_states`` are used
  (``batch_size`` is a module-level global). Positions where ``persons`` is not
  positive are dropped. The remaining tokens are flattened in order and cut
  into runs of equal id.

  :param last_hidden_states: tensor of hidden vectors; after slicing, its leading
      dimensions must match ``persons`` so that the boolean mask can index it
  :param persons: tensor of person ids; values <= 0 mean "no person"
  :return: ``(entity_vec, entity_list)``: the mean vector of every run and the
      person id (0-d tensor) of every run, in the same order
  """
  head_states = last_hidden_states[:batch_size]

  # discard vectors of tokens that carry no person id
  keep = persons > 0
  kept_ids = persons[keep]
  kept_vectors = head_states[keep]

  # run-length encode the kept ids: one (id, length) pair per run
  run_ids, run_lengths = [], []
  for pid in kept_ids:
    if run_ids and pid == run_ids[-1]:
      run_lengths[-1] += 1
    else:
      run_ids.append(pid)
      run_lengths.append(1)

  # split the kept vectors run by run and average each piece
  pieces = torch.split(kept_vectors, run_lengths)
  entity_vec = [piece.mean(axis=0) for piece in pieces]

  return entity_vec, run_ids


In [None]:
# Collect one mean hidden vector and one person id per entity run over the
# whole training loader, in inference mode and without gradient tracking.
model.eval()
ent_vec, ent_list = [], []
with torch.no_grad():
  for tokens, labels, persons in train_dataloader:
    # Batches without a positive person id contribute nothing.
    mask = persons > 0
    if len(persons[mask]) == 0:
      continue
    outputs = model(**tokens)
    tmp_ent_vec, tmp_ent_lis = get_entyti_list(outputs['hidden_states'][-1], persons)
    # Extend in place: `ent_vec = ent_vec + tmp` re-copies the whole accumulated
    # list on every batch, which is quadratic in the number of entities.
    ent_vec.extend(tmp_ent_vec)
    ent_list.extend(tmp_ent_lis)

In [None]:
entus = torch.stack(ent_vec)

In [None]:
labus = torch.stack(ent_list)

In [None]:
import gc
# `ent_list` is no longer needed once it has been stacked into `labus`.
# `ent_vec` is deliberately NOT deleted: the cosine-similarity cells further down
# iterate over it, so deleting it here makes a fresh Restart-and-Run-All fail
# with a NameError.
del ent_list
gc.collect()


120

In [None]:
data, labels = entus.detach().numpy(), labus.detach().numpy()

In [None]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

In [None]:
def get_tnse_model(data, labels, perplexity, learning_rate, title):
    """
    Fit a t-SNE embedding of ``data`` and draw it with ``plot_2d_data``.

    :param data: feature matrix passed to ``TSNE.fit_transform``
    :param labels: label of every row, used to colour the points
    :param perplexity: t-SNE perplexity
    :param learning_rate: t-SNE learning rate
    :param title: plot title
    """
    # Fixed random_state and PCA init keep the embedding repeatable for given sliders.
    tsne = TSNE(perplexity=perplexity, learning_rate=learning_rate, random_state=0, init="pca")
    transformed_data = tsne.fit_transform(data)
    plot_2d_data(transformed_data, labels, title=title)
    
interactive_plot = interactive(
    get_tnse_model,
    data=fixed(scaled_data),
    labels=fixed(labels),
    perplexity=IntSlider(min=1, max=100, step=1, value=30, description=r'perplexity'),
    learning_rate=FloatLogSlider(value=200, min=1, max=3, step=0.1, description=r'learning_rate'),
    # The title must be a string: passing the TSNE class itself made the plot
    # title the class repr.
    title=fixed('TSNE')
    )
interactive_plot

interactive(children=(IntSlider(value=30, description='perplexity', min=1), FloatLogSlider(value=200.0, descri…

In [None]:
# Re-create the interactive t-SNE widget for the current `scaled_data` / `labels`.
# `get_tnse_model` is defined in the t-SNE cell above and is reused here rather
# than being redefined with identical code.
interactive_plot = interactive(
    get_tnse_model,
    data=fixed(scaled_data),
    labels=fixed(labels),
    perplexity=IntSlider(min=1, max=100, step=1, value=30, description=r'perplexity'),
    learning_rate=FloatLogSlider(value=200, min=1, max=3, step=0.1, description=r'learning_rate'),
    # The title must be a string: passing the TSNE class itself made the plot
    # title the class repr.
    title=fixed('TSNE')
    )
interactive_plot

interactive(children=(IntSlider(value=30, description='perplexity', min=1), FloatLogSlider(value=200.0, descri…

# Обработка ввода обученной моделью


In [None]:
def sentences_dict_to_tensor(model, dictonary, tokenizer, tokenizer_kwargs, label_padding_value):
  """
  Build one centroid vector per key of ``dictonary`` from the model's last hidden layer.

  Each sentence is tokenized and passed through ``model``. The last hidden
  states of the selected tokens are averaged into a sentence vector, and the
  sentence vectors of one key are averaged into that key's centroid.

  The model is put into eval mode and run without gradient tracking, so the
  returned tensor carries no autograd graph.

  :param model: callable as ``model(**tokens)``, returning a mapping with ``'hidden_states'``
  :param dictonary: mapping ``key -> iterable of sentences``; each sentence is a
      sequence of words
  :param tokenizer: callable as ``tokenizer(list_of_words, **tokenizer_kwargs)``;
      its result must expose ``offset_mapping`` with a leading dimension of 1
      (presumably obtained with ``return_offsets_mapping=True``,
      ``is_split_into_words=True`` and ``return_tensors='pt'`` -- TODO confirm
      against ``tokenizer_kwargs``)
  :param tokenizer_kwargs: extra keyword arguments for the tokenizer
  :param label_padding_value: marker for positions that are not selected
  :return: tensor with one row per key, rows ordered by ``sorted(dictonary.keys())``
  """
  # `model.eval` without parentheses is a no-op; the call is what switches mode.
  model.eval()
  all_entitys = []
  with torch.no_grad():
    for key in sorted(dictonary.keys()):
      one_entity = []
      for sent in dictonary[key]:
        tokens = tokenizer(list(sent), **tokenizer_kwargs)
        # Select tokens whose offset starts at 0 and has a non-zero end
        # (presumably the first sub-token of every word; special tokens have (0, 0)).
        doc_offset = tokens.offset_mapping.squeeze(0)
        token_mask = np.ones(len(doc_offset), dtype=int) * label_padding_value
        arr_offset = np.array(doc_offset)
        token_mask[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = 1
        mask = token_mask != label_padding_value
        # The model does not accept `offset_mapping`, so drop it before the forward pass.
        tokens.pop("offset_mapping")
        outputs = model(**tokens)
        last_hidden_layer = outputs['hidden_states'][-1].squeeze(0)[mask].cpu()

        one_entity.append(last_hidden_layer.mean(0))
      all_entitys.append(torch.stack(one_entity).mean(0))
  return torch.stack(all_entitys)



In [None]:
entity_centr = sentences_dict_to_tensor(model, id2sent, tokenizer, tokenizer_kwargs, label_padding_value=-1)

In [None]:
entity_centr.shape

torch.Size([2, 768])

In [None]:
ent_id = 0

In [None]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
cos_list = []

for vector in ent_vec:
  cos_list.append(cos(vector, entity_centr)[ent_id])
min_cos = torch.min(torch.stack(cos_list))

In [None]:
min_cos

tensor(-0.3941, grad_fn=<MinBackward1>)

In [None]:
ent_id = 1

In [None]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
cos_list = []

for vector in ent_vec:
  cos_list.append(cos(vector, entity_centr)[ent_id])
min_cos_2 = torch.min(torch.stack(cos_list))

In [None]:
min_cos_2

tensor(0.5027, grad_fn=<MinBackward1>)

In [None]:
person_id = []
for vector in ent_vec:
  person_id.append(torch.argmax(cos(vector, entity_centr)) + 1)

In [None]:
person_id

In [None]:
cos_list = []

for vector in ent_vec:
  cos_list.append(cos(vector, entity_centr)[ent_id])
max_cos_1_2 = torch.max(torch.stack(cos_list))

In [None]:
max_cos_1_2

tensor(0.9100, grad_fn=<MaxBackward1>)

In [None]:
# For every mention whose nearest centroid is entity index 1, collect its cosine
# similarity to that centroid.
wrong_person = []
for vector in ent_vec:
  cosin = cos(vector, entity_centr)
  if torch.argmax(cosin) == 1:
    wrong_person.append(cosin[1])
# NOTE: despite the "_min" suffix this is the MAXIMUM of the collected similarities.
wrong_person_min = torch.max(torch.stack(wrong_person))

In [None]:
wrong_person_min

tensor(0.9100, grad_fn=<MaxBackward1>)

In [None]:
# For every mention whose nearest centroid is entity index 0, collect its cosine
# similarity to that centroid.
right_person = []
for vector in ent_vec:
  cosin = cos(vector, entity_centr)
  if torch.argmax(cosin) == 0:
    right_person.append(cosin[0])
# NOTE: despite the "_max" suffix this is the MINIMUM of the collected similarities.
right_person_max = torch.min(torch.stack(right_person))

In [None]:
right_person_max

tensor(0.3414, grad_fn=<MinBackward1>)

Итак, порог (отступ) для неопределяемой сущности:

In [None]:
# Threshold as a weighted average of the two statistics (weights 30/42 and 12/42 sum to 1).
# NOTE(review): the origin of the 30 and 12 is not shown in this notebook --
# presumably counts of mentions per group; TODO document or compute them.
ident_not_id = 30/42 * right_person_max + 12/42 * wrong_person_min

Если косинусное сходство для сущности меньше этого порога, то ставим ей метку 0:

In [None]:
ident_not_id

tensor(0.5039, grad_fn=<AddBackward0>)

In [None]:
!pip install natasha

In [None]:
text_name = '1909-02.txt'

In [None]:
import re
from natasha import (
    Segmenter,
    Doc
)

def text_to_list(text_name):
  """
  Read a text file and split it into sentences, each a list of tokens.

  Sentences come from the natasha ``Segmenter``. Inside a sentence, a token is
  either a run of word characters or one of the punctuation marks
  ``. , ! ? ; ' " : -``; every other character is dropped.

  :param text_name: path of the text file, read as UTF-8
  :return: list of sentences, each a list of token strings
  """
  segmenter = Segmenter()

  # Explicit encoding: the platform default is not guaranteed to be UTF-8, and
  # the input is Russian text.
  with open(text_name, 'r', encoding='utf-8') as f:
    text = f.read()
  doc = Doc(text)
  doc.segment(segmenter)
  return [re.findall(r"[\w]+|[.,!?;'\":-]", sent.text) for sent in doc.sents]

In [None]:
inp = text_to_list(text_name)

In [None]:
def sent_ent_vec(model: torch.nn.Module,
                seq: List[List[str]],
                tokenizer: PreTrainedTokenizer,
                tokenizer_kwargs: Dict[str, Any],
                label_padding_value: int
                ):
    """
    Tag every sentence with ``model`` and build one vector per predicted person entity.

    For each sentence the predicted label ids and the last hidden states of the
    selected tokens are taken. Tokens predicted as ``B-PER`` / ``I-PER`` are
    grouped into entities, and the hidden vectors of each entity are averaged.

    A new entity starts at every ``B-PER``. A run that starts with ``I-PER``
    (no preceding ``B-PER``) is kept as its own entity; previously such a
    prediction made ``torch.split`` fail, because the split sizes did not sum
    to the number of entity tokens.

    The model is put into eval mode once and run without gradient tracking.

    :param model: callable as ``model(**tokens)``, returning a mapping with
        ``'logits'`` and ``'hidden_states'``
    :param seq: sentences, each a list of words
    :param tokenizer: callable as ``tokenizer(list_of_words, **tokenizer_kwargs)``;
        its result must expose ``offset_mapping`` with a leading dimension of 1
    :param tokenizer_kwargs: extra keyword arguments for the tokenizer
    :param label_padding_value: marker for positions that are not selected
    :return: list with one ``[sent, predicted_label_ids, entity_vectors]`` item
        per sentence; ``entity_vectors`` is a list of tensors
    """
    # `label2idx` is a module-level mapping from label name to label id.
    b_per = label2idx['B-PER']
    i_per = label2idx['I-PER']

    return_list = []
    model.eval()
    with torch.no_grad():
        for sent in seq:
            tokens = tokenizer(list(sent), **tokenizer_kwargs)
            # Select tokens whose offset starts at 0 and has a non-zero end
            # (presumably the first sub-token of every word -- TODO confirm).
            doc_offset = tokens.offset_mapping.squeeze(0)
            token_mask = np.ones(len(doc_offset), dtype=int) * label_padding_value
            arr_offset = np.array(doc_offset)
            token_mask[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = 1
            mask = token_mask != label_padding_value

            # The model does not accept `offset_mapping`.
            tokens.pop("offset_mapping")
            outputs = model(**tokens)

            # Predicted label ids (the name `y_true` is kept from the original code).
            y_true = torch.argmax(outputs['logits'].squeeze(0), dim=1)[mask].cpu()
            last_hidden_layer = outputs['hidden_states'][-1].squeeze(0)[mask].cpu()

            # Keep only tokens predicted as part of a person entity.
            mask_for_entity = (y_true == b_per) | (y_true == i_per)
            entity_vector = last_hidden_layer[mask_for_entity]
            persons = y_true[mask_for_entity]

            # Length of every entity: B-PER opens a new one, I-PER extends the
            # current one (or opens the first one when nothing is open yet).
            run_lengths = []
            for pers in persons:
                if pers == b_per or not run_lengths:
                    run_lengths.append(1)
                else:
                    run_lengths[-1] += 1

            if run_lengths:
                pieces = torch.split(entity_vector, run_lengths)
                entitys = [piece.mean(axis=0) for piece in pieces]
            else:
                entitys = []

            return_list.append([sent, y_true, entitys])

    return return_list

In [None]:
test_seq = inp[:10]

In [None]:
test_run = sent_ent_vec(model, test_seq, tokenizer, tokenizer_kwargs, label_padding_value = -1)

In [None]:
test_run[7][1]

tensor([0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0])