<a href="https://colab.research.google.com/github/ekaterinatao/NER_biomed_domain/blob/main/transformers_base/%D0%92%D0%9A%D0%A0_nerel_bio_RuBioBERT_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Инструменты
Предобработанный дасасет [NEREL-BIO](https://huggingface.co/datasets/ekaterinatao/nerel_bio_ner_unnested)  
[Чек-пойнт](https://huggingface.co/ekaterinatao/nerel-bio-RuBioBERT-base) дообученной модели на всем датасете nerel-bio  

Исходная модель [RuBioBERT](https://huggingface.co/alexyalunin/RuBioBERT)

### Установка зависимостей

In [1]:
!pip install datasets accelerate evaluate wandb seqeval -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m17

In [2]:
import numpy as np
import pandas as pd
import random
from dataclasses import dataclass

import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import evaluate

import warnings
warnings.filterwarnings("ignore")

In [3]:
@dataclass
class TrainingConfig:
    seed = 64
    dataset = 'ekaterinatao/nerel_bio_ner_unnested'
    checkpoint = 'alexyalunin/RuBioBERT'
    n_labels = 45
    n_epochs = 10
    train_batch_size = 4
    eval_batch_size = 4
    device = "cuda" if torch.cuda.is_available() else "cpu"
    l_rate = 5e-05
    w_decay = 0.1
    warm_up = 0.1

config = TrainingConfig()

In [4]:
seed = config.seed

random.seed(seed)
np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Скачивание датасета

In [5]:
dataset = datasets.load_dataset(config.dataset)
dataset

Downloading readme:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/603k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/612 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/77 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
})

In [6]:
# Загрузка словаря токенов
url = 'https://raw.githubusercontent.com/ekaterinatao/NER_biomed_domain/main/labels.txt'
tags = pd.read_csv(url, names=['tag']).values.tolist()
tags = [item for sublist in tags for item in sublist]
tag_to_id = {tag: i for i, tag in enumerate(tags)}
id_to_tag = {i: tag for i, tag in enumerate(tags)}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

tokenizer_config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, max_length=512, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_idxs = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_idxs:
            if word_idx is None:
                label_ids.append(-100) # Set the special tokens to -100.
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
tokenized_dataset = dataset.map(
    tokenize_and_align_labels, batched=True,
    remove_columns = ['id', 'words', 'ner_tags']
)
tokenized_dataset

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

# Обучение модели

In [10]:
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [tags[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tags[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [11]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [12]:
print(f'device is {config.device}')

device is cuda


In [13]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [15]:
import os
os.environ["WANDB_PROJECT"]="ner_bert_nerel_bio"
hf_repo_id = "ekaterinatao/nerel-bio-RuBioBERT-base"

# Обучение на полном датасете

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    config.checkpoint, num_labels=config.n_labels, id2label=id_to_tag, label2id=tag_to_id)

training_args = TrainingArguments(
    output_dir=hf_repo_id,
    num_train_epochs=config.n_epochs,
    learning_rate=config.l_rate,
    weight_decay=config.w_decay,
    warmup_ratio=config.warm_up,
    per_device_train_batch_size=config.train_batch_size,
    per_device_eval_batch_size=config.eval_batch_size,
    group_by_length=True,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    seed = config.seed,
    data_seed = config.seed,
    push_to_hub=True,
    save_strategy="no",
    report_to="wandb",
    run_name="RuBioBERT-base_batch_4",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[34m[1mwandb[0m: Currently logged in as: [33mtaoea[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.893706,0.736277,0.707786,0.721751,0.799845
2,No log,0.575283,0.790725,0.779784,0.785217,0.852681
3,No log,0.530835,0.78935,0.785647,0.787494,0.855944
4,0.902500,0.547688,0.795641,0.796201,0.795921,0.861849
5,0.902500,0.573182,0.801075,0.803705,0.802388,0.864646
6,0.902500,0.555094,0.808541,0.808161,0.808351,0.869308
7,0.133000,0.58126,0.808161,0.808161,0.808161,0.870085
8,0.133000,0.600553,0.805343,0.813086,0.809196,0.868687
9,0.133000,0.618291,0.810389,0.815901,0.813135,0.872261
10,0.040600,0.618693,0.810723,0.815666,0.813187,0.872106


TrainOutput(global_step=1530, training_loss=0.3522425438453949, metrics={'train_runtime': 155.4088, 'train_samples_per_second': 39.38, 'train_steps_per_second': 9.845, 'total_flos': 868743082982520.0, 'train_loss': 0.3522425438453949, 'epoch': 10.0})

In [None]:
wandb.finish()

VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▆▆▇▇█████
eval/f1,▁▆▆▇▇█████
eval/loss,█▂▁▁▂▁▂▂▃▃
eval/precision,▁▆▆▇▇██▇██
eval/recall,▁▆▆▇▇▇▇███
eval/runtime,▃▂▂▂▁█▁▂▁█
eval/samples_per_second,▆▇▇▇█▁▇▇█▁
eval/steps_per_second,▆▇▇▇█▁▇▇█▁
train/epoch,▁▂▃▃▃▄▅▅▆▆▇███
train/global_step,▁▂▃▃▃▄▅▅▆▆▇███

0,1
eval/accuracy,0.87211
eval/f1,0.81319
eval/loss,0.61869
eval/precision,0.81072
eval/recall,0.81567
eval/runtime,1.1608
eval/samples_per_second,66.332
eval/steps_per_second,17.229
train/epoch,10.0
train/global_step,1530.0


In [None]:
trainer.save_model(hf_repo_id)

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

### Оценка качества на тестовой выборке

In [None]:
predictions = trainer.predict(test_dataset=tokenized_dataset["test"])

In [None]:
predictions.metrics

{'test_loss': 0.5603048801422119,
 'test_precision': 0.8299205010840761,
 'test_recall': 0.8231780167264038,
 'test_f1': 0.8265355086372361,
 'test_accuracy': 0.8898814357641709,
 'test_runtime': 0.8317,
 'test_samples_per_second': 92.578,
 'test_steps_per_second': 24.046}

Оценка модели на абстракте, которого не было в тестовом наборе

In [None]:
ner_bio = pipeline("ner", model=model, tokenizer=tokenizer, device=config.device)

In [None]:
abstract = """Цель. Оценить выживаемость у пациентов с болезнью Фабри (БФ) в зависимости от вида заместительной почечной терапии, и определить роль диализного скрининга в ранней диагностике БФ у родственников.
Материалы и методы. В исследование включали взрослых (старше 18 лет) пациентов с подтвержденным диагнозом БФ. Терминальная стадия хронической почечной недостаточности (тХПН) диагностировали в соответствии с рекомендациями Научного общества нефрологов России (2016) и KDIGO (2012). На основании опроса пробандов выявляли его родственников, которые могли унаследовать мутантный ген.
Результаты. У 50 (24,9%) из 201 обследованных пациентов с БФ диагностирована тХПН, в том числе у 48 (40%) из 120 мужчин и 2 (2,7%) из 81 женщин. Оценка кумулятивной частоты методом Каплана-Майера демонстрирует выраженное увеличение частоты регистрации тХПН к возрасту 20-30 лет, а к возрасту 50 лет ожидаемое количество пациентов с тХПН составляет 95%. Пяти из 50 больных с тХПН была выполнена трансплантация почки, в среднем, через 17 месяцев (диапазон от 7 до 70 месяцев) после инициации лечения гемодиализом. Умерло 15 (30%) из 50 пациентов, получавших лечение гемодиализом. Все умершие пациенты были мужского пола. Медиана возраста на момент летального исхода составила 45 (39; 58) лет. Среди пациентов, которым проведена трансплантация почки, летальных исходов зарегистрировано не было. У 44 (88%) из 50 пациентов диагноз БФ установлен, в среднем, через 1 год (диапазон от 0 до 12 лет) после начала лечения программным гемодиализом, в том числе у одного пациента – после трансплантации почки. Среди 44 пробандов, выявленных при всероссийском диализном скрининге, проведен семейный скрининг. Патогенная мутация в гене GLA диагностирована у 89 (57%) из 156 обследованных родственников диализных пробандов, в том числе у 18 детей моложе 18 лет, клинические проявления БФ имелись у 48 родственников. У 80,4% обследованных родственников диализных пробандов обнаружено поражение почек, преимущественно на ранних стадиях.
Заключение. ТХПН нередкое осложнение БФ, ассоциированное с неблагоприятным прогнозом. Однако диализный скрининг –  эффективный способ выявления пробандов с БФ, открывающий возможность установить диагноз БФ у родственников на ранних стадиях, когда лечение наиболее эффективно.
"""

In [None]:
ner_bio(abstract)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'SCIPROC',
  'score': 0.16079731,
  'index': 1,
  'word': 'Цель',
  'start': 0,
  'end': 4},
 {'entity': 'SCIPROC',
  'score': 0.23720512,
  'index': 2,
  'word': '.',
  'start': 4,
  'end': 5},
 {'entity': 'SCIPROC',
  'score': 0.5926297,
  'index': 3,
  'word': 'Оцени',
  'start': 6,
  'end': 11},
 {'entity': 'FINDING',
  'score': 0.66653377,
  'index': 4,
  'word': '##ть',
  'start': 11,
  'end': 13},
 {'entity': 'PHYS',
  'score': 0.91353697,
  'index': 5,
  'word': 'выжи',
  'start': 14,
  'end': 18},
 {'entity': 'PHYS',
  'score': 0.8990054,
  'index': 6,
  'word': '##ваемость',
  'start': 18,
  'end': 26},
 {'entity': 'PERSON',
  'score': 0.34884652,
  'index': 7,
  'word': 'у',
  'start': 27,
  'end': 28},
 {'entity': 'PERSON',
  'score': 0.98654556,
  'index': 8,
  'word': 'пациентов',
  'start': 29,
  'end': 38},
 {'entity': 'DISO',
  'score': 0.98012084,
  'index': 9,
  'word': 'с',
  'start': 39,
  'end': 40},
 {'entity': 'DISO',
  'score': 0.9962953,
  'index':

#_____
___
# Обучение на 70% датасета

In [19]:
train_dataset_70 = tokenized_dataset['train'].select(
    random.sample(
        range(tokenized_dataset['train'].num_rows),
        int(tokenized_dataset['train'].num_rows * 0.7)
    )
)

In [20]:
model = AutoModelForTokenClassification.from_pretrained(
    config.checkpoint, num_labels=config.n_labels, id2label=id_to_tag, label2id=tag_to_id)

training_args = TrainingArguments(
    output_dir=hf_repo_id,
    num_train_epochs=config.n_epochs,
    learning_rate=config.l_rate,
    weight_decay=config.w_decay,
    warmup_ratio=config.warm_up,
    per_device_train_batch_size=config.train_batch_size,
    per_device_eval_batch_size=config.eval_batch_size,
    group_by_length=True,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    seed = config.seed,
    data_seed = config.seed,
    push_to_hub=True,
    save_strategy="no",
    report_to="wandb",
    run_name="RuBioBERT-base_70%_of_data",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_70,
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[34m[1mwandb[0m: Currently logged in as: [33mtaoea[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.08277,0.676427,0.6447,0.660183,0.747786
2,No log,0.673041,0.765089,0.743199,0.753985,0.834188
3,No log,0.597802,0.777434,0.765947,0.771648,0.845688
4,No log,0.634298,0.77321,0.767589,0.77039,0.844755
5,0.782300,0.599693,0.798863,0.790807,0.794814,0.861538
6,0.782300,0.623642,0.787411,0.777439,0.782393,0.853924
7,0.782300,0.63306,0.790939,0.786116,0.78852,0.856721
8,0.782300,0.666185,0.789028,0.785882,0.787452,0.855944
9,0.782300,0.670574,0.793673,0.788462,0.791059,0.85843
10,0.077000,0.671194,0.793673,0.788462,0.791059,0.85812


TrainOutput(global_step=1070, training_loss=0.4041379315830837, metrics={'train_runtime': 129.6103, 'train_samples_per_second': 33.022, 'train_steps_per_second': 8.256, 'total_flos': 606938351693040.0, 'train_loss': 0.4041379315830837, 'epoch': 10.0})

In [21]:
wandb.finish()

VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▆▇▇██████
eval/f1,▁▆▇▇█▇████
eval/loss,█▂▁▂▁▁▂▂▂▂
eval/precision,▁▆▇▇█▇█▇██
eval/recall,▁▆▇▇█▇████
eval/runtime,▁▁▁▁▁█▅▅▅▁
eval/samples_per_second,▇███▇▁▃▃▃▇
eval/steps_per_second,▇███▇▁▃▃▃▇
train/epoch,▁▂▃▃▄▄▅▆▆▇▇██
train/global_step,▁▂▃▃▄▄▅▆▆▇▇██

0,1
eval/accuracy,0.85812
eval/f1,0.79106
eval/loss,0.67119
eval/precision,0.79367
eval/recall,0.78846
eval/runtime,0.9356
eval/samples_per_second,82.3
eval/steps_per_second,21.377
train/epoch,10.0
train/global_step,1070.0


In [22]:
predictions = trainer.predict(test_dataset=tokenized_dataset["test"])

In [23]:
predictions.metrics

{'test_loss': 0.5828642249107361,
 'test_precision': 0.8193939393939393,
 'test_recall': 0.8076463560334528,
 'test_f1': 0.8134777376654632,
 'test_accuracy': 0.881760597693682,
 'test_runtime': 1.9576,
 'test_samples_per_second': 39.335,
 'test_steps_per_second': 10.217}

#_____
___
# Обучение на 50% датасета

In [16]:
train_dataset_50 = tokenized_dataset['train'].select(
    random.sample(
        range(tokenized_dataset['train'].num_rows),
        int(tokenized_dataset['train'].num_rows * 0.5)
    )
)

In [18]:
model = AutoModelForTokenClassification.from_pretrained(
    config.checkpoint, num_labels=config.n_labels, id2label=id_to_tag, label2id=tag_to_id)

training_args = TrainingArguments(
    output_dir=hf_repo_id,
    num_train_epochs=config.n_epochs,
    learning_rate=config.l_rate,
    weight_decay=config.w_decay,
    warmup_ratio=config.warm_up,
    per_device_train_batch_size=config.train_batch_size,
    per_device_eval_batch_size=config.eval_batch_size,
    group_by_length=True,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    seed = config.seed,
    data_seed = config.seed,
    push_to_hub=True,
    save_strategy="no",
    report_to="wandb",
    logging_steps=16,
    run_name="RuBioBERT-base_50%_of_data",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_50,
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[34m[1mwandb[0m: Currently logged in as: [33mtaoea[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.8088,1.402433,0.597264,0.553002,0.574282,0.683605
2,0.7203,0.758823,0.763879,0.738977,0.751222,0.820824
3,0.4601,0.636499,0.771572,0.754925,0.763158,0.835742
4,0.2678,0.595391,0.79294,0.784944,0.788922,0.853458
5,0.1765,0.606268,0.794362,0.786351,0.790336,0.85641
6,0.1466,0.644648,0.77992,0.777908,0.778913,0.849106
7,0.1084,0.656092,0.788805,0.786585,0.787694,0.854701
8,0.0781,0.648455,0.792982,0.789634,0.791304,0.855322
9,0.0556,0.6714,0.792413,0.788696,0.79055,0.85641
10,0.05,0.675577,0.793307,0.7894,0.791348,0.856876


TrainOutput(global_step=770, training_loss=0.4690267160147816, metrics={'train_runtime': 107.7234, 'train_samples_per_second': 28.406, 'train_steps_per_second': 7.148, 'total_flos': 440698198154220.0, 'train_loss': 0.4690267160147816, 'epoch': 10.0})

In [19]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.022 MB uploaded\r'), FloatProgress(value=0.05494930400412442, max=1.…

0,1
eval/accuracy,▁▇▇███████
eval/f1,▁▇▇███████
eval/loss,█▂▁▁▁▁▂▁▂▂
eval/precision,▁▇▇██▇████
eval/recall,▁▇▇███████
eval/runtime,▇▃▅▃▄▆█▁▁▁
eval/samples_per_second,▁▅▃▄▄▂▁███
eval/steps_per_second,▁▅▃▄▄▂▁███
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.85688
eval/f1,0.79135
eval/loss,0.67558
eval/precision,0.79331
eval/recall,0.7894
eval/runtime,0.9033
eval/samples_per_second,85.245
eval/steps_per_second,22.142
train/epoch,10.0
train/global_step,770.0


In [20]:
predictions = trainer.predict(test_dataset=tokenized_dataset["test"])

In [21]:
predictions.metrics

{'test_loss': 0.5908907055854797,
 'test_precision': 0.8163512857836002,
 'test_recall': 0.8040621266427718,
 'test_f1': 0.8101601059347538,
 'test_accuracy': 0.8777001786584375,
 'test_runtime': 0.8572,
 'test_samples_per_second': 89.824,
 'test_steps_per_second': 23.331}

**Вывод**: в первом приближении качество обучения на 100%, 70% и 50% датасета не различается.