<a href="https://colab.research.google.com/github/ekaterinatao/NER_biomed_domain/blob/main/transformers_base/%D0%92%D0%9A%D0%A0_nerel_bio_ruBERT_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Инструменты
Предобработанный датасет [NEREL-BIO](https://huggingface.co/datasets/ekaterinatao/nerel_bio_ner_unnested)  
[Чек-пойнт](https://huggingface.co/ekaterinatao/nerel-bio-rubert-base) дообученной модели на всем датасете nerel-bio  

Исходная модель [RuBERT](https://huggingface.co/DeepPavlov/rubert-base-cased)

### Установка зависимостей

In [None]:
!pip install datasets accelerate evaluate wandb seqeval -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m2

In [None]:
import numpy as np
import pandas as pd
import random
from dataclasses import dataclass

import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import evaluate

import warnings
warnings.filterwarnings("ignore")

In [None]:
@dataclass
class TrainingConfig:
    """Hyperparameters and resource identifiers shared by the cells below.

    Every attribute is annotated so that ``@dataclass`` registers it as a real
    field: the values show up in ``repr(config)``, take part in equality and
    can be overridden per instance, e.g. ``TrainingConfig(l_rate=2e-5)``.
    Calling ``TrainingConfig()`` with no arguments yields the same values as
    before.
    """

    seed: int = 64  # seed for every RNG and for TrainingArguments
    dataset: str = 'ekaterinatao/nerel_bio_ner_unnested'  # dataset id to load
    checkpoint: str = 'DeepPavlov/rubert-base-cased'  # base model checkpoint
    n_labels: int = 45  # passed as num_labels when the model is created
    n_epochs: int = 10
    train_batch_size: int = 6  # per-device batch size for training
    eval_batch_size: int = 6  # per-device batch size for evaluation
    # Evaluated once, when the class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    l_rate: float = 5e-05
    w_decay: float = 0.1
    warm_up: float = 0.1  # passed as warmup_ratio

config = TrainingConfig()

In [None]:
# Seed every random number generator the notebook relies on (Python, NumPy,
# PyTorch on CPU and on all CUDA devices) from the single configured value.
seed = config.seed

for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed)

# Ask cuDNN for deterministic kernels and switch off its benchmark mode.
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

### Скачивание датасета

In [None]:
# Load the dataset named in the config and display its splits.
dataset = datasets.load_dataset(config.dataset)
dataset

Downloading readme:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/603k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/612 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/77 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
})

In [None]:
# Print the first training example as "word____...tag_id" rows, with each word
# left-aligned and padded with underscores to 60 characters.
first_example = dataset['train'][0]
for token, ner_tag in zip(first_example['words'], first_example['ner_tags']):
    print(f'{token:_<60}{ner_tag}')

161_________________________________________________________28
ОКБ_________________________________________________________30
12-275 мес__________________________________________________20
95,65±8,4 мес_______________________________________________20
23__________________________________________________________28
14,29%______________________________________________________31
8___________________________________________________________28
23,13 мес___________________________________________________20
11 мес______________________________________________________20
Более 36 мес________________________________________________20
142_________________________________________________________28
104,87 мес__________________________________________________20
15__________________________________________________________28
80,27 мес___________________________________________________20
3___________________________________________________________28
13,45%_________________________________________________

In [None]:
# Labels: one tag name per line in labels.txt; the line position is the tag id.
url = 'https://raw.githubusercontent.com/ekaterinatao/NER_biomed_domain/main/labels.txt'
tags = pd.read_csv(url, names=['tag'])['tag'].tolist()
# Lookup tables in both directions, later handed to the model config.
id_to_tag = dict(enumerate(tags))
tag_to_id = {name: idx for idx, name in id_to_tag.items()}

___
### Токенизация

In [None]:
# Load the tokenizer that belongs to the base checkpoint being fine-tuned.
# (Fixes the misspelled attribute `config.checkpoin`, which raised AttributeError.)
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Tokenize a single training example to compare the number of dataset words
# with the number of sub-word tokens produced by the tokenizer.
example = dataset["train"][64]
tokenized_input = tokenizer(
    example["words"],
    truncation=True,
    max_length=512,
    is_split_into_words=True,
)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

n_words, n_subword_tokens = len(example["words"]), len(tokens)
print(f'Token len initial tokens__{n_words}\n')
print(f'Token len from BERT tokenizer__{n_subword_tokens}')
print(' '.join(tokens))

Token len initial tokens__77

Token len from BERT tokenizer__189
[CLS] И ##БС тремя ишем ##ической болезни сердца 301 1066 США 1517 США транс ##жи ##р ##ных кислот пище ишем ##ической болезни сердца пищевых калорий Т ##ЖК жир ##ы масла Т ##ЖК Т ##ЖК И ##БС лип ##идов плазме лип ##идам биом ##арке ##рам воспал ##ительного процесса лип ##идов смерть Т ##ЖК смертей Т ##ЖК населения ответственных лиц населения Т ##ЖК Аргентине сердца до 2004 года 1 , 5 % Т ##ЖК пищевых снижения уровня Т ##ЖК И ##БС процентной доли снижения И ##БС И ##БС сердца И ##БС 523 ##7 лет ежегодных 17 млн долл . США пищевых Т ##ЖК пищевых проспект ##ивных кого ##рт ##ных исследованиях проспект ##ивных кого ##рт ##ных исследований ежегодно 537 ##3 И ##БС 26 394 года острых случаев И ##БС острых случая И ##БС 87 млн долл . США Аргентине болезни сердца калорий уровень потребления риски возникновения И ##БС популя ##ционной выборки профиля ##м лип ##идов болезни сердца профил ##е лип ##идов предотвратить спасти 523 ##7 

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align word labels to tokens.

    Args:
        examples: batch mapping with ``"words"`` (one list of words per
            example) and ``"ner_tags"`` (one list of label ids per example,
            indexed by word position).

    Returns:
        The tokenizer output for the batch (truncated to 512 tokens) with an
        added ``"labels"`` entry. For every token the label is the id of its
        word if the token is the first piece of that word, and -100 otherwise
        (special tokens and continuation pieces).
    """
    encoded = tokenizer(
        examples["words"],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        # word_ids() yields, per token, the index of its source word or None.
        for word in encoded.word_ids(batch_index=batch_idx):
            starts_new_word = word is not None and word != last_word
            token_labels.append(word_labels[word] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(token_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Apply tokenization and label alignment to every split, one batch at a time.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

In [None]:
# Side-by-side preview of the raw columns and their tokenized counterparts
# for the first five training examples.
df = tokenized_dataset['train']
preview_columns = ('words', 'ner_tags', 'input_ids', 'labels')
pd.DataFrame({column: df[column] for column in preview_columns}).head(5)

Unnamed: 0,words,ner_tags,input_ids,labels
0,"[161, ОКБ, 12-275 мес, 95,65±8,4 мес, 23, 14,2...","[28, 30, 20, 20, 28, 31, 28, 20, 20, 20, 28, 2...","[101, 20132, 39736, 4367, 130, 24408, 57175, 1...","[-100, 28, 30, 20, -100, -100, -100, 20, -100,..."
1,"[ИМО, 201, от 2 мес до 18 лет, до 74%, 5,5%, И...","[5, 28, 17, 31, 31, 5, 5, 2, 5, 32, 32, 5, 32,...","[101, 61991, 806, 2777, 1641, 140, 57175, 2785...","[-100, 5, -100, 28, 17, -100, -100, -100, -100..."
2,"[ОФТГ, 2,5-3,0 кг, ВПГ, 1-го, Панавир, 6 инсти...","[5, 28, 12, 29, 3, 28, 20, 5, 12, 5, 5, 15, 15...","[101, 96813, 814, 784, 140, 128, 146, 130, 142...","[-100, 5, -100, -100, 28, -100, -100, -100, -1..."
3,"[29, целиакией, 8, ИГ, целиакией, 7, 18, АГД, ...","[28, 5, 28, 3, 5, 28, 28, 7, 28, 28, 28, 28, 5...","[101, 7688, 16934, 87843, 152, 24481, 16934, 8...","[-100, 28, 5, -100, 28, 3, 5, -100, 28, 28, 7,..."
4,"[ОА, 248, РА, от 38 до 65 лет, II, четыре, 1-я...","[5, 28, 5, 17, 29, 28, 29, 29, 29, 29, 3, 29, ...","[101, 805, 778, 25401, 34662, 1641, 11683, 278...","[-100, 5, -100, 28, 5, 17, -100, -100, -100, -..."


In [None]:
# Keep only the model inputs and labels. remove_columns() drops the raw
# columns directly instead of running an identity map() over every example.
tokenized_dataset = tokenized_dataset.remove_columns(['id', 'words', 'ner_tags'])
tokenized_dataset

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

### Обучение модели с различными гиперпараметрами (`learning rate`, `weight decay`, `warmup ratio`)

In [None]:
# Load the seqeval metric; compute_metrics calls its compute() method.
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 to get one label id per token; ``labels``
            holds the reference ids, where -100 marks positions to ignore.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the ``overall_*`` entries of the seqeval result.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    predicted_tags, gold_tags = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions labelled -100 are excluded from both sequences.
        scored = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        predicted_tags.append([tags[pred] for pred, _ in scored])
        gold_tags.append([tags[gold] for _, gold in scored])

    scores = seqeval.compute(predictions=predicted_tags, references=gold_tags)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
# Collator built from the tokenizer; it is passed to every Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Report which device the config resolved to ("cuda" or "cpu").
print(f'device is {config.device}')

device is cuda


____
# Подбор гиперпараметров

In [None]:
# Hyperparameter grid: the search loop trains one model per combination.
# Candidate learning rates.
LR_VALUES = (1e-5, 2e-5, 5e-5)
# Candidate weight-decay values.
DECAY_VALUES = (1e-4, 0.1)
# Candidate warm-up ratios (passed as warmup_ratio).
WARM_UP_VALUES = (0, 0.1)

In [None]:
# Grid search: fine-tune a fresh model for every (learning rate, weight decay,
# warm-up ratio) combination and evaluate it on the validation split after
# each epoch. The validation scores of every run are collected so that the
# best combination can be read from a table instead of from the training logs.
grid_search_results = []

for LR in LR_VALUES:
    for WD in DECAY_VALUES:
        for WU in WARM_UP_VALUES:

            # Start every run from the same pre-trained checkpoint.
            model = AutoModelForTokenClassification.from_pretrained(
                config.checkpoint, num_labels=config.n_labels,
                id2label=id_to_tag, label2id=tag_to_id
            )
            print(f'Log: training for l_r:{LR}, w_d:{WD}, w_up:{WU}...')

            training_args = TrainingArguments(
                output_dir="token_class_model",
                num_train_epochs=config.n_epochs,
                learning_rate=LR,
                weight_decay=WD,
                warmup_ratio=WU,
                per_device_train_batch_size=config.train_batch_size,
                per_device_eval_batch_size=config.eval_batch_size,
                group_by_length=True,
                optim="adamw_torch",
                lr_scheduler_type="cosine",
                evaluation_strategy="epoch",
                seed = config.seed,
                data_seed = config.seed,
                push_to_hub=False,
                save_strategy="no"  # search runs are compared, not kept
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["valid"],
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
            )

            trainer.train()

            # Per-epoch validation entries logged by the Trainer; the metric
            # keys are the compute_metrics keys prefixed with "eval_".
            eval_logs = [entry for entry in trainer.state.log_history if "eval_f1" in entry]
            final_eval = eval_logs[-1] if eval_logs else {}
            grid_search_results.append({
                "learning_rate": LR,
                "weight_decay": WD,
                "warmup_ratio": WU,
                "final_f1": final_eval.get("eval_f1"),
                "final_loss": final_eval.get("eval_loss"),
                "best_f1": max((entry["eval_f1"] for entry in eval_logs), default=None),
            })

# Summary of all runs, best final validation F1 first.
pd.DataFrame(grid_search_results).sort_values("final_f1", ascending=False)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:1e-05, w_d:0.0001, w_up:0...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.497717,0.520128,0.457552,0.486837,0.627972
2,No log,1.058169,0.643254,0.597092,0.619314,0.727428
3,No log,0.841241,0.697225,0.659944,0.678072,0.778089
4,No log,0.750172,0.715015,0.690197,0.702387,0.797669
5,1.063100,0.704931,0.729736,0.707317,0.718352,0.805439
6,1.063100,0.666475,0.73641,0.714822,0.725455,0.812587
7,1.063100,0.654234,0.732614,0.716463,0.724449,0.812898
8,1.063100,0.646552,0.739518,0.719747,0.729498,0.816628
9,1.063100,0.647828,0.738232,0.720919,0.729473,0.81554
10,0.420600,0.647565,0.738015,0.722092,0.729967,0.815385


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:1e-05, w_d:0.0001, w_up:0.1...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.167491,0.258602,0.163931,0.20066,0.437451
2,No log,1.228741,0.599041,0.556754,0.577124,0.692618
3,No log,0.931805,0.681116,0.641182,0.660546,0.760995
4,No log,0.804476,0.705396,0.680582,0.692767,0.788345
5,1.322100,0.727224,0.721272,0.702158,0.711586,0.801709
6,1.322100,0.68136,0.738307,0.721857,0.729989,0.813986
7,1.322100,0.670152,0.739851,0.722326,0.730984,0.816628
8,1.322100,0.650956,0.745215,0.730535,0.737802,0.822222
9,1.322100,0.651183,0.744347,0.733349,0.738807,0.822222
10,0.415900,0.650567,0.744291,0.733818,0.739017,0.822999


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:1e-05, w_d:0.1, w_up:0...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.568406,0.494294,0.41651,0.452081,0.609635
2,No log,1.098555,0.622239,0.587946,0.604606,0.711888
3,No log,0.870704,0.694335,0.661116,0.677319,0.773737
4,No log,0.769822,0.710922,0.686914,0.698712,0.792075
5,1.087400,0.7159,0.719615,0.70122,0.710298,0.801088
6,1.087400,0.676004,0.735831,0.718574,0.7271,0.813986
7,1.087400,0.660523,0.738215,0.723499,0.730783,0.818337
8,1.087400,0.650194,0.742461,0.727486,0.734897,0.821445
9,1.087400,0.649826,0.741059,0.728893,0.734926,0.820824
10,0.414800,0.649129,0.743082,0.730535,0.736755,0.822378


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:1e-05, w_d:0.1, w_up:0.1...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.167506,0.258602,0.163931,0.20066,0.437451
2,No log,1.228689,0.59889,0.556754,0.577054,0.692463
3,No log,0.931587,0.67992,0.641651,0.660232,0.760528
4,No log,0.802212,0.70727,0.682223,0.694521,0.7885
5,1.322200,0.726597,0.72162,0.702158,0.711756,0.801865
6,1.322200,0.681729,0.735527,0.718105,0.726712,0.8115
7,1.322200,0.671547,0.73889,0.721388,0.730034,0.816472
8,1.322200,0.651006,0.74629,0.731238,0.738688,0.822222
9,1.322200,0.651092,0.745359,0.734522,0.739901,0.823155
10,0.416100,0.650467,0.745896,0.735225,0.740522,0.822999


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:2e-05, w_d:0.0001, w_up:0...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.177941,0.599654,0.568715,0.583775,0.699145
2,No log,0.789737,0.709701,0.689728,0.699572,0.794406
3,No log,0.645379,0.735047,0.717636,0.726237,0.816006
4,No log,0.621908,0.745625,0.729362,0.737404,0.822533
5,0.792800,0.606606,0.759924,0.749765,0.754811,0.831857
6,0.792800,0.589996,0.764999,0.756567,0.760759,0.838228
7,0.792800,0.581318,0.767166,0.75727,0.762186,0.840093
8,0.792800,0.575676,0.771368,0.761961,0.766635,0.842269
9,0.792800,0.580757,0.770655,0.761257,0.765927,0.841958
10,0.223700,0.582098,0.770655,0.761257,0.765927,0.842113


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:2e-05, w_d:0.0001, w_up:0.1...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.586825,0.50239,0.41909,0.456975,0.610412
2,No log,0.884827,0.686836,0.652205,0.669073,0.770474
3,No log,0.681562,0.728846,0.711069,0.719848,0.812743
4,No log,0.643673,0.746462,0.729831,0.738053,0.820513
5,1.035000,0.618301,0.759686,0.749531,0.754574,0.82906
6,1.035000,0.602772,0.761939,0.752111,0.756993,0.833722
7,1.035000,0.588415,0.764106,0.755863,0.759962,0.839472
8,1.035000,0.580502,0.771842,0.76243,0.767107,0.841492
9,1.035000,0.586383,0.769742,0.761257,0.765476,0.841647
10,0.227200,0.587188,0.769888,0.760319,0.765074,0.841647


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:2e-05, w_d:0.1, w_up:0...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.177909,0.599654,0.568715,0.583775,0.699145
2,No log,0.789881,0.710907,0.690901,0.700761,0.795027
3,No log,0.646175,0.734532,0.71834,0.726346,0.815851
4,No log,0.618923,0.746344,0.730066,0.738115,0.822999
5,0.792800,0.60506,0.758842,0.749765,0.754276,0.831857
6,0.792800,0.590085,0.763843,0.757036,0.760424,0.837918
7,0.792800,0.580731,0.769194,0.758912,0.764018,0.84087
8,0.792800,0.574004,0.772846,0.763602,0.768196,0.843046
9,0.792800,0.578525,0.769888,0.760319,0.765074,0.842113
10,0.223500,0.579906,0.770052,0.761023,0.765511,0.841958


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:2e-05, w_d:0.1, w_up:0.1...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.583525,0.501682,0.419794,0.457099,0.612121
2,No log,0.886042,0.68997,0.655019,0.67204,0.771096
3,No log,0.679105,0.731121,0.712946,0.721919,0.812121
4,No log,0.636498,0.742384,0.725844,0.734021,0.818493
5,1.034600,0.624545,0.758464,0.746013,0.752187,0.827506
6,1.034600,0.59541,0.770457,0.759615,0.764998,0.839161
7,1.034600,0.585721,0.767591,0.75985,0.763701,0.841336
8,1.034600,0.580062,0.767921,0.761257,0.764574,0.841958
9,1.034600,0.586209,0.769067,0.761492,0.76526,0.842424
10,0.226700,0.586446,0.769614,0.761492,0.765531,0.842424


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:5e-05, w_d:0.0001, w_up:0...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.802316,0.690891,0.668856,0.679695,0.781507
2,No log,0.59058,0.765114,0.750938,0.75796,0.83885
3,No log,0.552147,0.768201,0.76712,0.76766,0.845688
4,No log,0.578454,0.767929,0.765947,0.766937,0.845221
5,0.529600,0.593931,0.785459,0.772749,0.779052,0.853768
6,0.529600,0.584552,0.783828,0.786585,0.785204,0.854545
7,0.529600,0.612773,0.786689,0.784475,0.78558,0.855167
8,0.529600,0.613842,0.786283,0.787758,0.78702,0.856566
9,0.529600,0.623049,0.785346,0.78682,0.786082,0.856566
10,0.079800,0.62438,0.785047,0.787992,0.786517,0.856255


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:5e-05, w_d:0.0001, w_up:0.1...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.149979,0.613824,0.574812,0.593678,0.707226
2,No log,0.677157,0.728023,0.703096,0.715342,0.813054
3,No log,0.565484,0.76824,0.755629,0.761882,0.838073
4,No log,0.561818,0.764412,0.75258,0.75845,0.840249
5,0.806500,0.616475,0.768688,0.766886,0.767786,0.841492
6,0.806500,0.599701,0.779838,0.769231,0.774498,0.847552
7,0.806500,0.605144,0.781672,0.774156,0.777896,0.850194
8,0.806500,0.607227,0.783917,0.781895,0.782905,0.854545
9,0.806500,0.62141,0.787429,0.784475,0.785949,0.854856
10,0.101200,0.623333,0.786638,0.78424,0.785437,0.855167


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:5e-05, w_d:0.1, w_up:0...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.805948,0.683666,0.66651,0.674979,0.779176
2,No log,0.575234,0.765,0.753518,0.759216,0.842113
3,No log,0.544066,0.766209,0.759381,0.76278,0.843357
4,No log,0.58462,0.766047,0.764071,0.765058,0.843667
5,0.531200,0.600345,0.771388,0.767589,0.769484,0.850505
6,0.531200,0.603919,0.777856,0.77439,0.776119,0.849728
7,0.531200,0.61886,0.77859,0.77439,0.776484,0.850971
8,0.531200,0.628763,0.782283,0.782833,0.782558,0.853147
9,0.531200,0.631612,0.784471,0.781895,0.783181,0.853924
10,0.080500,0.635425,0.782895,0.781426,0.78216,0.853613


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Log: training for l_r:5e-05, w_d:0.1, w_up:0.1...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.14925,0.614612,0.574109,0.59367,0.708003
2,No log,0.661019,0.727294,0.715525,0.721362,0.813675
3,No log,0.575303,0.758038,0.757505,0.757771,0.838384
4,No log,0.562447,0.773137,0.764071,0.768577,0.845688
5,0.795100,0.621142,0.766604,0.768762,0.767681,0.841026
6,0.795100,0.625854,0.777517,0.77697,0.777243,0.848485
7,0.795100,0.619768,0.783714,0.780957,0.782333,0.851593
8,0.795100,0.626221,0.783032,0.792214,0.787596,0.853147
9,0.795100,0.634473,0.785398,0.789634,0.78751,0.855633
10,0.093900,0.637206,0.787328,0.792683,0.789996,0.85641


___
# Обучение с подобранными гиперпараметрами
Лучшими оказались гиперпараметры: `learning_rate`= 5e-5, `weight_decay` = 0.1, `warm_up` = 0.1  

In [None]:
# Interactive Hugging Face Hub login; the final training run below is
# configured with push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Log in to Weights & Biases; the final training run reports to it
# (report_to="wandb").
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import os
# NOTE(review): presumably read by the W&B integration to choose the project
# the run is logged under — confirm.
os.environ["WANDB_PROJECT"]="ner_bert_nerel_bio"
# Used below both as the Trainer output_dir and as the save_model() target.
hf_repo_id = "ekaterinatao/nerel-bio-rubert-base"

In [None]:
# Final run: fine-tune a fresh copy of the base checkpoint with the
# hyperparameters stored in the config, evaluating on the validation split
# after every epoch and reporting to W&B.
model = AutoModelForTokenClassification.from_pretrained(
    config.checkpoint, num_labels=config.n_labels, id2label=id_to_tag, label2id=tag_to_id)

training_args = TrainingArguments(
    # NOTE(review): with push_to_hub=True the Hub repository name is presumably
    # derived from this path — confirm.
    output_dir=hf_repo_id,
    num_train_epochs=config.n_epochs,
    learning_rate=config.l_rate,
    weight_decay=config.w_decay,
    warmup_ratio=config.warm_up,
    per_device_train_batch_size=config.train_batch_size,
    per_device_eval_batch_size=config.eval_batch_size,
    group_by_length=True,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    seed = config.seed,
    data_seed = config.seed,
    push_to_hub=True,
    # No intermediate checkpoints; the model is saved explicitly with
    # trainer.save_model() in a later cell.
    save_strategy="no",
    report_to="wandb",
    run_name="rubert-base",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[34m[1mwandb[0m: Currently logged in as: [33mtaoea[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.121077,0.61956,0.58091,0.599613,0.71251
2,No log,0.679956,0.733317,0.716463,0.724792,0.813675
3,No log,0.598507,0.744463,0.748827,0.746639,0.830303
4,No log,0.567296,0.760768,0.762195,0.761481,0.840249
5,0.795400,0.566465,0.775077,0.770169,0.772615,0.848485
6,0.795400,0.593402,0.782598,0.774156,0.778354,0.85439
7,0.795400,0.580425,0.779481,0.775094,0.777281,0.852681
8,0.795400,0.607486,0.783897,0.787758,0.785823,0.857653
9,0.795400,0.613945,0.788746,0.788931,0.788838,0.861383
10,0.102400,0.612167,0.787304,0.788227,0.787765,0.86014


TrainOutput(global_step=1020, training_loss=0.4412419216305602, metrics={'train_runtime': 139.794, 'train_samples_per_second': 43.779, 'train_steps_per_second': 7.296, 'total_flos': 856255185792720.0, 'train_loss': 0.4412419216305602, 'epoch': 10.0})

In [None]:
# Finish the active W&B run; the cell output shows its history and summary.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.022 MB uploaded\r'), FloatProgress(value=0.05566678272980501, max=1.…

0,1
eval/accuracy,▁▆▇▇▇█████
eval/f1,▁▆▆▇▇█████
eval/loss,█▂▁▁▁▁▁▂▂▂
eval/precision,▁▆▆▇▇█████
eval/recall,▁▆▇▇▇█████
eval/runtime,█▄▆▃▅▁▃▅▁▁
eval/samples_per_second,▁▅▃▆▄█▆▄██
eval/steps_per_second,▁▅▃▆▄█▆▄██
train/epoch,▁▂▃▃▄▄▅▆▆▇███
train/global_step,▁▂▃▃▄▄▅▆▆▇███

0,1
eval/accuracy,0.86014
eval/f1,0.78777
eval/loss,0.61217
eval/precision,0.7873
eval/recall,0.78823
eval/runtime,0.8926
eval/samples_per_second,86.265
eval/steps_per_second,14.564
train/epoch,10.0
train/global_step,1020.0


In [None]:
# Save the fine-tuned model under hf_repo_id; the cell output shows the model
# files being uploaded as well.
trainer.save_model(hf_repo_id)

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

### Оценка качества на тестовой выборке

In [None]:
# Run the fine-tuned model on the test split, which was not used for training
# or for the per-epoch validation.
predictions = trainer.predict(test_dataset=tokenized_dataset["test"])

In [None]:
# Test-set loss, the compute_metrics scores (prefixed with "test_") and
# runtime statistics.
predictions.metrics

{'test_loss': 0.6296454071998596,
 'test_precision': 0.7918465227817746,
 'test_recall': 0.7890083632019116,
 'test_f1': 0.7904248952722921,
 'test_accuracy': 0.8646103896103896,
 'test_runtime': 0.8113,
 'test_samples_per_second': 94.915,
 'test_steps_per_second': 16.025}

Оценка модели на абстракте, которого не было в тестовом наборе

In [None]:
# Inference pipeline around the fine-tuned model for tagging raw text. As the
# output further down shows, entities are reported per sub-word token.
ner_bio = pipeline("ner", model=model, tokenizer=tokenizer, device=config.device)

In [None]:
abstract = """Цель. Оценить выживаемость у пациентов с болезнью Фабри (БФ) в зависимости от вида заместительной почечной терапии, и определить роль диализного скрининга в ранней диагностике БФ у родственников.
Материалы и методы. В исследование включали взрослых (старше 18 лет) пациентов с подтвержденным диагнозом БФ. Терминальная стадия хронической почечной недостаточности (тХПН) диагностировали в соответствии с рекомендациями Научного общества нефрологов России (2016) и KDIGO (2012). На основании опроса пробандов выявляли его родственников, которые могли унаследовать мутантный ген.
Результаты. У 50 (24,9%) из 201 обследованных пациентов с БФ диагностирована тХПН, в том числе у 48 (40%) из 120 мужчин и 2 (2,7%) из 81 женщин. Оценка кумулятивной частоты методом Каплана-Майера демонстрирует выраженное увеличение частоты регистрации тХПН к возрасту 20-30 лет, а к возрасту 50 лет ожидаемое количество пациентов с тХПН составляет 95%. Пяти из 50 больных с тХПН была выполнена трансплантация почки, в среднем, через 17 месяцев (диапазон от 7 до 70 месяцев) после инициации лечения гемодиализом. Умерло 15 (30%) из 50 пациентов, получавших лечение гемодиализом. Все умершие пациенты были мужского пола. Медиана возраста на момент летального исхода составила 45 (39; 58) лет. Среди пациентов, которым проведена трансплантация почки, летальных исходов зарегистрировано не было. У 44 (88%) из 50 пациентов диагноз БФ установлен, в среднем, через 1 год (диапазон от 0 до 12 лет) после начала лечения программным гемодиализом, в том числе у одного пациента – после трансплантации почки. Среди 44 пробандов, выявленных при всероссийском диализном скрининге, проведен семейный скрининг. Патогенная мутация в гене GLA диагностирована у 89 (57%) из 156 обследованных родственников диализных пробандов, в том числе у 18 детей моложе 18 лет, клинические проявления БФ имелись у 48 родственников. У 80,4% обследованных родственников диализных пробандов обнаружено поражение почек, преимущественно на ранних стадиях.
Заключение. ТХПН нередкое осложнение БФ, ассоциированное с неблагоприятным прогнозом. Однако диализный скрининг –  эффективный способ выявления пробандов с БФ, открывающий возможность установить диагноз БФ у родственников на ранних стадиях, когда лечение наиболее эффективно.
"""

In [None]:
# Tag the abstract; each result holds the entity label, score, token index,
# token text and character offsets.
ner_bio(abstract)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'SCIPROC',
  'score': 0.76758903,
  'index': 1,
  'word': 'Цель',
  'start': 0,
  'end': 4},
 {'entity': 'SCIPROC',
  'score': 0.5501071,
  'index': 2,
  'word': '.',
  'start': 4,
  'end': 5},
 {'entity': 'FINDING',
  'score': 0.8515094,
  'index': 3,
  'word': 'Оцен',
  'start': 6,
  'end': 10},
 {'entity': 'FINDING',
  'score': 0.9006018,
  'index': 4,
  'word': '##ить',
  'start': 10,
  'end': 13},
 {'entity': 'PHYS',
  'score': 0.77701473,
  'index': 5,
  'word': 'выжив',
  'start': 14,
  'end': 19},
 {'entity': 'PHYS',
  'score': 0.7618812,
  'index': 6,
  'word': '##аемость',
  'start': 19,
  'end': 26},
 {'entity': 'PERSON',
  'score': 0.9907691,
  'index': 7,
  'word': 'у',
  'start': 27,
  'end': 28},
 {'entity': 'PERSON',
  'score': 0.9979462,
  'index': 8,
  'word': 'пациентов',
  'start': 29,
  'end': 38},
 {'entity': 'PERSON',
  'score': 0.85923153,
  'index': 9,
  'word': 'с',
  'start': 39,
  'end': 40},
 {'entity': 'DISO',
  'score': 0.99781764,
  'index': 