### Подготовительный этап

Установим все необходимые библиотеки, а также скачаем датасет.

In [1]:
FIRST_RUN = False    

In [2]:
import os

if FIRST_RUN:
    %pip install numpy
    %pip install torch --index-url https://download.pytorch.org/whl/cu121
    %pip install transformers
    %pip install nerus
    %pip install scikit-learn
    %pip install wget
    file_name = 'nerus_lenta.conllu.gz'
    if not os.path.exists(file_name):
        !python -m wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz

### Импорт основных требуемых библиотек и формирование датасета

In [3]:
# Importing pytorch and the library for TPU execution

import torch

if torch.cuda.is_available():
    dev = torch.device("cuda") # I set it specifically to CUDA
else:
    dev = torch.device("cpu")
    

import numpy as np
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel

In [4]:
from nerus import load_nerus
nerus = load_nerus("nerus_lenta.conllu.gz")

Поставим в соответствие каждому тегу число. В дальнейшем будем хранить метки в виде чисел.

In [5]:
tags_vals = {'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'}
tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tags = {i: t for i, t in enumerate(tags_vals)}
idx2tags[len(tags_vals)] = '<PAD>'

Отберем некоторое количество документов из датасета NERUS и сформируем датасет из предложений. В качестве откликов выступают числовые обозначения тегов токенов.

In [6]:
# Creating new lists and dicts that will be used at a later stage for reference and processing

sentences = []
labels = []
cnt = 0
for d in nerus:
    if cnt > 10:  # Was 10
        break
    for s in d.sents:
        temp_sentence = []
        temp_labels = []
        for t in s.tokens:
            temp_sentence.append(t.text)
            temp_labels.append(tag2idx[t.tag])
        sentences.append(temp_sentence)
        labels.append(temp_labels)
    cnt += 1

<a id='section03'></a>
### Подготовка датасета

Определим некоторые ключевые переменные, которые будут использоваться позже на этапе обучения / точной настройки.

Создадим класс <code>Dataset</code> - он отвечает за предварительную обработку текста перед отправкой в нейронную сеть.

Класс <code>Dataloader</code> используется для передачи данных в нейронную сеть относительно небольшими партиями во избежание нехватки оперативной памяти.

<code>Dataset</code> и <code>Dataloader</code> - это конструкции библиотеки PyTorch. Более детальную информацию можно найти [в документации](https://pytorch.org/docs/stable/data.html) 

#### Класс Dataset
- Этот класс определен для того, чтобы принимать `tokenizer`, `sentences` и `labels` как входные данные, и генерировать токенизированные предложения, а также теги для обучения модели BERT. 
- Для токенизации используется BERT tokenizer. 
- Токенизатор использует метод `encode_plus` для непосредственно токенизации и генерации необходимых выходных данных: `ids`, `attention_mask`
- С более детальной информацией об используемом токенизацторе можно ознакомиться, [по ссылке](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer)

#### Класс Dataloader
- Dataloader выступает в роли так называемого загрузчика данных, который загружает данные в нейронную сеть определенным образом. Данные в нейронную сеть подаются частями, что позволяет избежать переполнения оперативной памяти.

In [7]:
# Defining some key variables that will be used later on in the training

MAX_LEN = 100 # WAS 100
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16 # WAS 16
EPOCHS = 10 # WAS 5
LEARNING_RATE = 1e-05 # WAS 2e-05
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')



In [8]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, sentences, labels, max_len):
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        sentence = str(self.sentences[index])
        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            is_split_into_words=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        label = self.labels[index]
        label.extend([len(tags_vals)]*self.max_len)
        label=label[:self.max_len]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'tags': torch.tensor(label, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len
    
# Creating the dataset and dataloader for the neural network

train_percent = 0.8
train_size = int(train_percent*len(sentences))
# train_dataset=df.sample(frac=train_size,random_state=200).reset_index(drop=True)
# test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_sentences = sentences[0:train_size]
train_labels = labels[0:train_size]

test_sentences = sentences[train_size:]
test_labels = labels[train_size:]

print("FULL Dataset: {}".format(len(sentences)))
print("TRAIN Dataset: {}".format(len(train_sentences)))
print("TEST Dataset: {}".format(len(test_sentences)))

training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)
testing_set = CustomDataset(tokenizer, test_sentences, test_labels, MAX_LEN)

FULL Dataset: 129
TRAIN Dataset: 103
TEST Dataset: 26


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

<a id='section04'></a>
### Проектирование нейронной сети

#### Neural Network
 - Для создание нейронной сети использвется класс `BERTClass`. 
 - Сеть основана на модели `BertForTokenClassification`.

In [10]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertForTokenClassification.from_pretrained('DeepPavlov/rubert-base-cased', 
                                                                          num_labels=len(tags_vals) + 1)
        # self.dropout = torch.nn.Dropout(0.3)
        # self.classifier = torch.nn.Linear(self.l1.config.hidden_size, len(tags_vals) + 1) # Possible error
    
    def forward(self, ids, mask, labels):
        output_1 = self.l1(ids, mask, labels = labels)
        # output_2 = output_1.last_hidden_state
        # dropout_output = self.dropout(output_2)
        # logits = self.classifier(dropout_output)
        return output_1

In [11]:
model = BERTClass()
model.to(dev)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (l1): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=

In [12]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

<a id='section05'></a>
### Обучение модели

In [13]:
from torch import nn
import time

loss_fn = nn.CrossEntropyLoss(ignore_index=len(tags_vals))
def train(epoch):
    start_time = time.time()
    model.train()
    total_loss = 0
    
    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(dev, dtype=torch.long)
        mask = data['mask'].to(dev, dtype=torch.long)
        targets = data['tags'].to(dev, dtype=torch.long)
        # for debug:
        debug_ids = ids.cpu().numpy()
        debug_mask = mask.cpu().numpy()
        debug_targets = targets.cpu().numpy()

        # Forward pass
        outputs = model(ids, mask, labels = targets)
        
        # Calculate loss
        loss = loss_fn(outputs.logits.view(-1, outputs.logits.shape[-1]), targets.view(-1))
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    epoch_duration = time.time() - start_time
    print(f'Epoch {epoch} completed in {epoch_duration:.2f} seconds, Loss: {total_loss/len(training_loader)}')

for epoch in range(EPOCHS):
    train(epoch)    

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 0 completed in 1.12 seconds, Loss: 1.4216500222682953
Epoch 1 completed in 1.00 seconds, Loss: 0.577496312558651
Epoch 2 completed in 0.96 seconds, Loss: 0.4945468194782734
Epoch 3 completed in 1.00 seconds, Loss: 0.5188896059989929
Epoch 4 completed in 1.01 seconds, Loss: 0.5086330845952034
Epoch 5 completed in 0.99 seconds, Loss: 0.5209735333919525
Epoch 6 completed in 0.97 seconds, Loss: 0.47011637687683105
Epoch 7 completed in 0.96 seconds, Loss: 0.5341756790876389
Epoch 8 completed in 0.96 seconds, Loss: 0.4977182075381279
Epoch 9 completed in 0.95 seconds, Loss: 0.5193262994289398


<a id='section06'></a>
### Оценка модели

Произведем оценку модели на ранее отобранных валидационных данных.

In [14]:
from sklearn.metrics import f1_score

def evaluate_model(model, valid_loader):
    model.eval()
    true_labels = []
    pred_labels = []
    
    with torch.no_grad():
        for batch in valid_loader:
            ids = batch['ids'].to(dev, dtype=torch.long)
            mask = batch['mask'].to(dev, dtype=torch.long)
            targets = batch['tags'].to(dev, dtype=torch.long)
            
            outputs = model(ids, mask, labels = targets)
            predictions = torch.argmax(outputs.logits, dim=2)
            
            # For each sequence in the batch
            for i in range(targets.shape[0]):
                # For each token in the sequence
                for j in range(targets.shape[1]):
                    if targets[i, j] != len(tags_vals):  # Exclude padding index
                        true_labels.append(targets[i, j].cpu().item())
                        pred_labels.append(predictions[i, j].cpu().item())
    
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    print(f"F1 Score: {f1}")
    
evaluate_model(model, testing_loader)

F1 Score: 0.8803364360210834


In [15]:
if FIRST_RUN:
    import json
    
    # Load the JSON file
    with open('Soc_Net_Task_3_File_1/content/variants/Soc_Net_Task_3_File_1.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    # Extract the sentences array
    sentences = data['sentences']
    
    # Find the largest list in the sentences array
    largest_sentence = max(sentences, key=len)
    
    print("The largest list in the sentences array is:", largest_sentence)
    print("Length of the largest list:", len(largest_sentence))

In [16]:
import json

def predict_from_json(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    sentences = data['sentences']
    predictions = []
    
    model.eval()
    with torch.no_grad():
        for sentence in sentences:
            encoded_inputs = tokenizer.encode_plus(
                sentence,
                add_special_tokens=True,
                max_length=MAX_LEN,
                padding='max_length',
                return_tensors='pt',
                truncation=True,
                is_split_into_words=True
            )
            ids = encoded_inputs['input_ids'].to(dev, dtype=torch.long)
            mask = encoded_inputs['attention_mask'].to(dev, dtype=torch.long)

            outputs = model(ids, mask, labels=None)
            pred = torch.argmax(outputs.logits, dim=2)

            # Convert predictions to tag names
            predicted_tags = [idx2tags[p.item()] for p in pred[0]]
            predictions.extend(predicted_tags[:len(sentence)])  # Slice to match sentence length
    
    return predictions


predictions = predict_from_json('Soc_Net_Task_3_File_1/content/variants/Soc_Net_Task_3_File_1.json')
print(len(predictions))
print(','.join(predictions))


9441
O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,I-ORG,O,O,I-ORG,O,O,O,O,O,O,O,O,O,I-ORG,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O