# Установка нужных модулей и их импорт

In [1]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
import os
import pickle
import lzma

from collections import Counter
from transformers.modeling_utils import unwrap_model
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForTokenClassification
from typing import Tuple, List, Dict, Any
from tqdm import tqdm

# Переход в рабочую директорию

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
os.chdir('drive/MyDrive/text')

# Загрузка исходных данных датасета

In [5]:
# !wget https://github.com/nerel-ds/NEREL/releases/download/1.1/NEREL-v1.1.zip -O /content/drive/MyDrive/text/archive.zip

In [6]:
# !unzip /content/drive/MyDrive/text/archive.zip -d /content/drive/MyDrive/text/

# Функции для обработки датасета

#### Считывание из txt-файла

In [7]:
def txt_read(file):
    with open(file, 'r') as f:
        data = f.read()
    return data

#### Считывание из ann-файлы

Из ann-файла нам нужны только метки, начало и конец, причем выкинем из него вложенные сущености, оставим только непересекающиеся внешние

In [8]:
def ann_read(file):
    lst = []
    with open(file, 'r') as g:
        for string in g.readlines():
            if string[0] == 'T' and ';' not in string:
                string = string.split()
                lst.append((string[1], int(string[2]), int(string[3])))
    
    lst.sort(key=lambda x: [x[1], -x[2]])
    end = -1
    ans = []
    for i in lst:
        if end < i[1]:
            ans.append(i)
            end = i[2]

    return ans

#### Создание BIO-разметки

In [9]:
def bio(txt, ann, tokenizer):
    txt = txt_read(txt)
    ann = ann_read(ann)
    spans = tokenizer(txt, return_offsets_mapping=True, add_special_tokens=False, truncation=True)['offset_mapping']
    tokens = [txt[i[0]:i[1]] for i in spans]
    bio_labels = []
    for i in spans:
        for j in ann:
            if i[0] == j[1] and i[0] != i[1]:
                bio_labels.append('B-' + j[0])
                break
            elif i[0] > j[1] and i[1] <= j[2] and i[0] != i[1]:
                bio_labels.append('I-' + j[0])
                break
        else: 
            bio_labels.append('O')
    return tokens, bio_labels, spans

#### Считывание всех данных из директории

In [10]:
def all_read(dir, tokenizer):
    tokens = []
    labels = []
    spans = []
    names = {name[:-4] for name in os.listdir(dir)}
    for name in names:
        token, label, span = bio(dir + '/' + name + '.txt', dir + '/' + name + '.ann', tokenizer)
        tokens.append(token)
        labels.append(label)
        spans.append(span)
    return tokens, labels, spans

# Считывание всех данных в BIO-разметку

#### Создание токенизатора

In [11]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

#### Создание train, dev, test частей

In [12]:
TRAIN_TOKENS, TRAIN_LABELS, TRAIN_SPANS = all_read('train', tokenizer)
DEV_TOKENS, DEV_LABELS, DEV_SPANS = all_read('dev', tokenizer)
TEST_TOKENS, TEST_LABELS, TEST_SPANS = all_read('test', tokenizer)

# Подготовка словарей

In [13]:
token2cnt = Counter([token for sentence in TRAIN_TOKENS for token in sentence])

In [15]:
def get_token2idx(
    token2cnt: Dict[str, int],
    min_count: int,
) -> Dict[str, int]:
    """
    Get mapping from tokens to indices to use with Embedding layer.
    """

    token2idx: Dict[str, int] = {}
    token2idx.update({'<PAD>': 0, '<UNK>': 1})

    idx = 2
    for token, token_frequency in tqdm(token2cnt.items()):
        if token_frequency >= min_count:
            token2idx[token] = idx
            idx += 1

    return token2idx

In [16]:
token2idx = get_token2idx(token2cnt, min_count=2)

100%|██████████| 31282/31282 [00:00<00:00, 611259.39it/s]


In [17]:
# Функция для сортировки тегов, чтобы сначала был тег O, потом теги B- и только после теги I-

def sort_labels_func(x: str) -> int:
    if x == "O":
        return 0
    elif x.startswith("B-"):
        return 1
    else:
        return 2

label_set = sorted(
    set(label for sentence in TRAIN_LABELS for label in sentence),
    key=lambda x: (sort_labels_func(x), x),
)

In [18]:
def get_label2idx(label_set: List[str]) -> Dict[str, int]:
    """
    Get mapping from labels to indices.
    """

    label2idx: Dict[str, int] = {label: idx for idx, label in tqdm(enumerate(label_set))}

    return label2idx

In [19]:
label2idx = get_label2idx(label_set)

59it [00:00, 57549.75it/s]


# Подготовка датасета и загрузчика

In [20]:
class NERDataset(torch.utils.data.Dataset):
    """
    PyTorch Dataset for NER.
    """

    def __init__(
        self,
        token_seq: List[List[str]],
        label_seq: List[List[str]],
        token2idx: Dict[str, int],
        label2idx: Dict[str, int],
    ):
        self.token2idx = token2idx
        self.label2idx = label2idx

        self.token_seq = [self.process_tokens(tokens, token2idx) for tokens in token_seq]
        self.label_seq = [self.process_labels(labels, label2idx) for labels in label_seq]

    def __len__(self):
        return len(self.token_seq)

    def __getitem__(
        self,
        idx: int,
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
        return (
            torch.LongTensor(self.token_seq[idx]),
            torch.LongTensor(self.label_seq[idx])
        )
    
    @staticmethod
    def process_tokens(
        tokens: List[str],
        token2idx: Dict[str, int],
        unk: str = "<UNK>",
    ) -> List[int]:
        """
        Transform list of tokens into list of tokens' indices.
        """
        result = [
            token2idx[token] if token in token2idx else token2idx[unk]
            for token in tokens
        ]
        return result

    @staticmethod
    def process_labels(
        labels: List[str],
        label2idx: Dict[str, int],
    ) -> List[int]:
        """
        Transform list of labels into list of labels' indices.
        """
        result = [label2idx[label] for label in labels]
        return result

#### Создание 3 датасетов

In [21]:
train_dataset = NERDataset(
    token_seq=TRAIN_TOKENS,
    label_seq=TRAIN_LABELS,
    token2idx=token2idx,
    label2idx=label2idx,
)
valid_dataset = NERDataset(
    token_seq=DEV_TOKENS,
    label_seq=DEV_LABELS,
    token2idx=token2idx,
    label2idx=label2idx,
)
test_dataset = NERDataset(
    token_seq=TEST_TOKENS,
    label_seq=TEST_LABELS,
    token2idx=token2idx,
    label2idx=label2idx,
)

#### Создание Коллатора

In [22]:
class NERCollator:
    """
    Collator that handles variable-size sentences.
    """

    def __init__(
        self,
        token_padding_value: int,
        label_padding_value: int,
    ):
        self.token_padding_value = token_padding_value
        self.label_padding_value = label_padding_value

    def __call__(
        self,
        batch: List[Tuple[torch.LongTensor, torch.LongTensor]],
    ):

        tokens, labels = zip(*batch)

        tokens = torch.nn.utils.rnn.pad_sequence(tokens, batch_first=True, padding_value=self.token_padding_value)
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=self.label_padding_value)

        return {"input_ids": tokens, "labels": labels}

In [23]:
collator = NERCollator(
    token_padding_value=token2idx["<PAD>"],
    label_padding_value=-100,
)

# Создание, обучение и сохранение модели

In [24]:
model = AutoModelForTokenClassification.from_pretrained("cointegrated/rubert-tiny2", num_labels=len(label_set))  

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized 

In [25]:
training_args = TrainingArguments(
    output_dir=os.getcwd()+'/output_model',
    learning_rate=8e-5,
    weight_decay=1e-3,
    lr_scheduler_type='cosine',
    full_determinism=False,
    seed=42,
    per_device_train_batch_size=2,
    num_train_epochs=42,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=5000
)

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

In [27]:
trainer.train()

***** Running training *****
  Num examples = 746
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 3730
  Number of trainable parameters = 29114579


Step,Training Loss,Validation Loss
500,1.7596,1.496149
1000,1.2331,1.199375
1500,0.9696,1.113023
2000,0.7898,1.059876
2500,0.6957,1.032367
3000,0.6264,1.028914
3500,0.5908,1.038841


***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8
***** Running Evaluation *****
  Num examples = 94
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3730, training_loss=0.9297234788337277, metrics={'train_runtime': 113.5704, 'train_samples_per_second': 65.686, 'train_steps_per_second': 32.843, 'total_flos': 40133599192596.0, 'train_loss': 0.9297234788337277, 'epoch': 10.0})

#### Сохранение модели

In [28]:
final = unwrap_model(trainer.model_wrapped)
final = final.to("cpu")

In [29]:
def compress_data(data):
    return lzma.compress(
        pickle.dumps(data), 
        format=lzma.FORMAT_RAW, 
        filters=[{"id":lzma.FILTER_LZMA2,"dict_size":268435456, "preset":9, "mf":lzma.MF_HC3, "depth":0, "lc":3}]
    )

In [30]:
compressed_state = compress_data(final.state_dict())
compressed_config = compress_data(final.config)

with open("state.xz", "wb") as f:
    f.write(compressed_state)

with open("config.xz", "wb") as f:
    f.write(compressed_config)

In [31]:
idx2label = {}
for key, val in label2idx.items():
    idx2label[val] = key

with open('token2idx.pkl', 'wb') as f:
    pickle.dump(token2idx, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('idx2label.pkl', 'wb') as g:
    pickle.dump(idx2label, g, protocol=pickle.HIGHEST_PROTOCOL)