<a href="https://colab.research.google.com/github/ekaterinatao/NER_biomed_domain/blob/main/active_learning/%D0%92%D0%9A%D0%A0_nerel_bio_RuBioBERT_active_learning_v4_mnlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Инструменты
Предобработанный дасасет [NEREL-BIO](https://huggingface.co/datasets/ekaterinatao/nerel_bio_ner_unnested)  

Исходная модель [RuBioBERT](https://huggingface.co/alexyalunin/RuBioBERT)

### Установка зависимостей

In [1]:
!pip install datasets accelerate evaluate wandb seqeval -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.3 M

In [2]:
import numpy as np
import pandas as pd
import random
from dataclasses import dataclass

import torch
from torch.nn.functional import softmax
import datasets
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import (AutoTokenizer,
                          DataCollatorForTokenClassification,
                          AutoModelForTokenClassification,
                          TrainingArguments, Trainer,
                          pipeline)
from transformers.trainer_utils import get_last_checkpoint
import evaluate

import warnings
warnings.filterwarnings("ignore")

In [3]:
@dataclass
class TrainingConfig:
    seed = 64
    dataset = 'ekaterinatao/nerel_bio_ner_unnested'
    checkpoint = 'alexyalunin/RuBioBERT'
    n_labels = 45
    n_epochs = 10
    train_batch_size = 4
    eval_batch_size = 4
    device = "cuda" if torch.cuda.is_available() else "cpu"
    l_rate = 5e-05
    w_decay = 0.1
    warm_up = 0.1

config = TrainingConfig()

In [4]:
seed = config.seed

random.seed(seed)
np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Скачивание датасета

In [5]:
dataset = datasets.load_dataset(config.dataset)
dataset

Downloading readme:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/603k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/612 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/77 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
})

In [6]:
# Labels
url = 'https://raw.githubusercontent.com/ekaterinatao/NER_biomed_domain/main/labels.txt'
tags = pd.read_csv(url, names=['tag']).values.tolist()
tags = [item for sublist in tags for item in sublist]
tag_to_id = {tag: i for i, tag in enumerate(tags)}
id_to_tag = {i: tag for i, tag in enumerate(tags)}

___
### Токенизация

In [7]:
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

tokenizer_config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, max_length=512, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_idxs = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_idxs:
            if word_idx is None:
                label_ids.append(-100) # Set the special tokens to -100.
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
tokenized_dataset = dataset.map(
    tokenize_and_align_labels, batched=True,
    #remove_columns = ['id', 'words', 'ner_tags']
)
tokenized_dataset

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

# Обучение модели

In [10]:
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [tags[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tags[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [11]:
def choose_samples_mnlp(test_predictions, n_tokens) -> np.ndarray:
    """Функция для реализации стратеги отбора примеров для разметки
    на основе MNLP"""

    test_pred_pt = torch.from_numpy(test_predictions)
    probas = softmax(test_pred_pt, dim=-1).detach().cpu().numpy()
    scores = np.array([
        -np.sum(np.log(np.max(i, axis=1))) / len(i) for i in probas
    ])
    args = np.argsort(-scores)

    return args[:n_tokens]

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [13]:
model = AutoModelForTokenClassification.from_pretrained(
    config.checkpoint, num_labels=config.n_labels, id2label=id_to_tag, label2id=tag_to_id
)

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
print(f'device is {config.device}')

device is cuda


In [15]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [17]:
import os
os.environ["WANDB_PROJECT"]="ner_bert_nerel_bio"
hf_repo_id = "ekaterinatao/nerel-bio-RuBioBERT-al"

In [18]:
def train_and_predict(
    train_dataset, eval_dataset, test_dataset,
    model, tokenizer, data_collator, compute_metrics
):
    "Функция для тренировки модели, сохранения метрик и предсказаний"

    global config
    training_args = TrainingArguments(
        output_dir=hf_repo_id,
        num_train_epochs=config.n_epochs,
        learning_rate=config.l_rate,
        weight_decay=config.w_decay,
        warmup_ratio=config.warm_up,
        per_device_train_batch_size=config.train_batch_size,
        per_device_eval_batch_size=config.eval_batch_size,
        group_by_length=True,
        optim="adamw_torch",
        lr_scheduler_type="cosine",
        evaluation_strategy="epoch",
        seed = config.seed,
        data_seed = config.seed,
        #use_cpu=True,
        push_to_hub=True,
        save_strategy="no",
        report_to="wandb",
        logging_steps=16,
        run_name="RuBioBERT_AL_v4",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    train_result = trainer.train()
    test_predictions = trainer.predict(
        test_dataset, metric_key_prefix="pred"
    )
    metrics = {}
    metrics['train_loss'] = train_result.metrics['train_loss']
    metrics.update(trainer.evaluate(eval_dataset))
    metrics.update(test_predictions.metrics)
    probas = test_predictions.predictions

    return metrics, probas, trainer

In [19]:
def run_active_learning(
    dataset, al_iters, init_train_size, choose_tokens,
    model, tokenizer, data_collator, compute_metrics
):
    "Функция для реализации цикла активного обучения"

    train_dataset = dataset['train'].select(
        random.sample(
            range(dataset['train'].num_rows),
            int(dataset['train'].num_rows * init_train_size)
        )
    )
    test_dataset = dataset['train'].filter(
        lambda s: s['id'] not in train_dataset['id']
    )
    all_metrics = {}

    for iter in range(al_iters):
        print(f'Training using {train_dataset.num_rows} samples')

        eval_metrics, test_predictions, trainer = train_and_predict(
            train_dataset, dataset['valid'], test_dataset,
            model, tokenizer, data_collator, compute_metrics
        )

        new_args = choose_samples_mnlp(test_predictions, choose_tokens)
        new_train_samples = test_dataset.select(new_args)
        extended_train_dataset = concatenate_datasets(
            [train_dataset, new_train_samples]
        )

        train_dataset = extended_train_dataset
        test_dataset = dataset['train'].filter(
            lambda s: s['id'] not in train_dataset['id']
        )

        all_metrics[f'{iter}'] = {'metrics': eval_metrics, 'train_size': train_dataset.num_rows}

    return all_metrics, trainer

In [20]:
metrics, trainer = run_active_learning(
    dataset=tokenized_dataset, al_iters=7, init_train_size=0.1, choose_tokens=61,
    model=model, tokenizer=tokenizer,
    data_collator=data_collator, compute_metrics=compute_metrics
)

Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 61 samples


[34m[1mwandb[0m: Currently logged in as: [33mtaoea[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,3.3855,2.925789,0.049296,0.001642,0.003177,0.158042
2,2.267,2.012529,0.420856,0.343574,0.378309,0.537685
3,1.3506,1.434593,0.603119,0.562383,0.582039,0.693551
4,0.8601,1.24049,0.645208,0.620544,0.632636,0.727894
5,0.6634,1.12402,0.660775,0.640009,0.650226,0.742191
6,0.4729,1.049967,0.69108,0.670497,0.680633,0.770319
7,0.3967,1.00027,0.703881,0.689024,0.696374,0.781974
8,0.3229,1.009882,0.695569,0.684803,0.690144,0.779021
9,0.2958,1.000625,0.702774,0.689259,0.695951,0.784771
10,0.2825,1.000039,0.702774,0.689259,0.695951,0.784771


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 122 samples


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5628,0.865911,0.727535,0.710131,0.718728,0.802176
2,0.3821,0.800964,0.748732,0.726782,0.737594,0.818026
3,0.3342,0.806068,0.745331,0.720685,0.732801,0.817094
4,0.2514,0.762279,0.751892,0.745544,0.748705,0.826729
5,0.1397,0.761715,0.763252,0.753049,0.758116,0.833256
6,0.1028,0.812299,0.75849,0.749062,0.753746,0.830925
7,0.0946,0.809445,0.750351,0.751407,0.750879,0.828594
8,0.0476,0.812492,0.763028,0.758912,0.760964,0.837296
9,0.0471,0.819832,0.764998,0.759615,0.762297,0.837141
10,0.0479,0.823464,0.765304,0.759381,0.762331,0.83683


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 183 samples


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1936,0.742198,0.755425,0.751173,0.753293,0.831857
2,0.1358,0.781195,0.751839,0.743199,0.747494,0.82906
3,0.1109,0.778787,0.75,0.754925,0.752454,0.831857
4,0.0718,0.780028,0.769374,0.772983,0.771175,0.844289
5,0.0428,0.834578,0.766824,0.764306,0.765563,0.840404
6,0.0322,0.825753,0.765325,0.76712,0.766222,0.841803
7,0.0235,0.848445,0.758428,0.765009,0.761705,0.83885
8,0.0163,0.847994,0.771167,0.768996,0.77008,0.843978
9,0.0118,0.849698,0.7723,0.771576,0.771938,0.844911
10,0.0088,0.850637,0.772717,0.771811,0.772263,0.844911


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 244 samples


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1654,0.680812,0.764939,0.768527,0.766729,0.84258
2,0.1697,0.732331,0.763895,0.76712,0.765504,0.839938
3,0.0902,0.766527,0.776319,0.776501,0.77641,0.849417
4,0.079,0.775407,0.763287,0.76454,0.763913,0.842424
5,0.0364,0.761999,0.765766,0.777439,0.771558,0.84662
6,0.0263,0.796551,0.770195,0.778143,0.774148,0.848019
7,0.0129,0.819894,0.770557,0.775797,0.773168,0.84864
8,0.0085,0.833156,0.77186,0.778377,0.775105,0.85035
9,0.0057,0.836889,0.773129,0.780019,0.776558,0.850971
10,0.007,0.838428,0.773488,0.780019,0.77674,0.851282


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 305 samples


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1422,0.751495,0.747661,0.749765,0.748712,0.830769
2,0.0929,0.786132,0.763666,0.769934,0.766787,0.845221
3,0.0773,0.772859,0.765854,0.773218,0.769518,0.845221
4,0.0565,0.784471,0.77827,0.782833,0.780545,0.851593
5,0.0223,0.800218,0.769536,0.773687,0.771606,0.847397
6,0.0182,0.789694,0.780613,0.781895,0.781254,0.853458
7,0.0156,0.835446,0.782396,0.785882,0.784135,0.85237
8,0.0064,0.840614,0.782396,0.785882,0.784135,0.853302
9,0.0043,0.848809,0.782964,0.78682,0.784887,0.854079
10,0.0038,0.849244,0.782446,0.786116,0.784277,0.853768


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 366 samples


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.127,0.746309,0.783656,0.778143,0.78089,0.853613
2,0.0514,0.826447,0.783164,0.776735,0.779936,0.847863
3,0.0524,0.797915,0.774602,0.775328,0.774965,0.846775
4,0.023,0.821456,0.781279,0.779081,0.780178,0.851127
5,0.0417,0.821247,0.779312,0.780957,0.780134,0.852681
6,0.015,0.869418,0.782548,0.782364,0.782456,0.85439
7,0.007,0.862448,0.787565,0.78424,0.785899,0.855944
8,0.0032,0.859354,0.785932,0.786116,0.786024,0.85641
9,0.0031,0.865057,0.787189,0.78682,0.787004,0.857032
10,0.0053,0.866488,0.787474,0.787289,0.787381,0.857032


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 427 samples


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1246,0.739138,0.784123,0.792214,0.788147,0.858275
2,0.0441,0.707252,0.788163,0.787054,0.787609,0.858897
3,0.0362,0.724527,0.777518,0.778612,0.778064,0.854856
4,0.0484,0.713519,0.786533,0.788931,0.78773,0.861694
5,0.0239,0.788612,0.783322,0.78424,0.783781,0.855012
6,0.0059,0.789787,0.785351,0.797139,0.791201,0.861694
7,0.0102,0.824214,0.785034,0.792214,0.788607,0.860295
8,0.0016,0.82163,0.789131,0.796904,0.792999,0.864336
9,0.0025,0.83034,0.789865,0.796904,0.793369,0.86418
10,0.0021,0.832297,0.789682,0.796904,0.793277,0.86418


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

In [21]:
wandb.finish()

VBox(children=(Label(value='0.023 MB of 0.023 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅▇▇▇▇▇█████████████████████████████████
eval/f1,▁▄▇▇▇▇▇▇████████████████████████████████
eval/loss,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▂▂▁▁▂▂▂▁▁▁▁▁▁
eval/precision,▁▅▇▇▇▇▇█████████████████████████████████
eval/recall,▁▄▆▇▇▇▇▇████████████████████████████████
eval/runtime,▁▁▃▂▆▁▁▄▁▁▅▂▁▁▂▁▂▁▁▂▇█▆▁▁▆▂▁▁▁▂▂▁▂▂▁▂▇▂▂
eval/samples_per_second,▇█▆▇▃█▇▅██▃▇██▇█▇██▇▁▁▃██▃▇▇█▇▇▇█▇▇█▇▂▇▇
eval/steps_per_second,▇█▆▇▃█▇▅██▃▇██▇█▇██▇▁▁▃██▃▇▇█▇▇▇█▇▇█▇▂▇▇
train/epoch,▂▅█▂▄▇▂▃▄▆█▂▃▄▆▇▁▂▃▄▅▇▇▁▂▃▄▅▆▇█▁▂▃▄▅▅▆▇█
train/global_step,▁▂▂▁▂▃▁▂▂▃▄▂▂▃▄▅▁▂▂▃▄▅▆▁▂▃▄▄▅▆▇▁▂▃▄▅▅▆▇█

0,1
eval/accuracy,0.86418
eval/f1,0.79328
eval/loss,0.8323
eval/precision,0.78968
eval/recall,0.7969
eval/runtime,0.9082
eval/samples_per_second,84.785
eval/steps_per_second,22.022
train/epoch,10.0
train/global_step,1070.0


In [22]:
metrics

{'0': {'metrics': {'train_loss': 1.0297480553388596,
   'eval_loss': 1.0000391006469727,
   'eval_precision': 0.7027737924438068,
   'eval_recall': 0.6892589118198874,
   'eval_f1': 0.6959507459152261,
   'eval_accuracy': 0.7847707847707848,
   'eval_runtime': 1.201,
   'eval_samples_per_second': 64.113,
   'eval_steps_per_second': 16.653,
   'epoch': 10.0,
   'pred_loss': 0.7819056510925293,
   'pred_precision': 0.7273767885532592,
   'pred_recall': 0.7065291247143122,
   'pred_f1': 0.7168014037726389,
   'pred_accuracy': 0.8206197854588796,
   'pred_runtime': 6.7995,
   'pred_samples_per_second': 81.035,
   'pred_steps_per_second': 20.296},
  'train_size': 122},
 '1': {'metrics': {'train_loss': 0.2042718634490044,
   'eval_loss': 0.8234638571739197,
   'eval_precision': 0.7653037107066887,
   'eval_recall': 0.7593808630393997,
   'eval_f1': 0.7623307828134197,
   'eval_accuracy': 0.8368298368298368,
   'eval_runtime': 0.8978,
   'eval_samples_per_second': 85.765,
   'eval_steps_per_s

**Вывод**: к 5 итерации активного обучения (50% датасета) достигается максимальное качество.  

### Оценка качества на тестовой выборке

In [23]:
predictions = trainer.predict(test_dataset=tokenized_dataset["test"])

In [24]:
predictions.metrics

{'test_loss': 0.7540450692176819,
 'test_precision': 0.8198219870098629,
 'test_recall': 0.814336917562724,
 'test_f1': 0.817070246943179,
 'test_accuracy': 0.8851713496832874,
 'test_runtime': 1.6279,
 'test_samples_per_second': 47.301,
 'test_steps_per_second': 12.286}

Оценка модели на абстракте, которого не было в тестовом наборе

In [25]:
ner_bio = pipeline("ner", model=model, tokenizer=tokenizer, device=config.device)

In [26]:
abstract = """Цель. Оценить выживаемость у пациентов с болезнью Фабри (БФ) в зависимости от вида заместительной почечной терапии, и определить роль диализного скрининга в ранней диагностике БФ у родственников.
Материалы и методы. В исследование включали взрослых (старше 18 лет) пациентов с подтвержденным диагнозом БФ. Терминальная стадия хронической почечной недостаточности (тХПН) диагностировали в соответствии с рекомендациями Научного общества нефрологов России (2016) и KDIGO (2012). На основании опроса пробандов выявляли его родственников, которые могли унаследовать мутантный ген.
Результаты. У 50 (24,9%) из 201 обследованных пациентов с БФ диагностирована тХПН, в том числе у 48 (40%) из 120 мужчин и 2 (2,7%) из 81 женщин. Оценка кумулятивной частоты методом Каплана-Майера демонстрирует выраженное увеличение частоты регистрации тХПН к возрасту 20-30 лет, а к возрасту 50 лет ожидаемое количество пациентов с тХПН составляет 95%. Пяти из 50 больных с тХПН была выполнена трансплантация почки, в среднем, через 17 месяцев (диапазон от 7 до 70 месяцев) после инициации лечения гемодиализом. Умерло 15 (30%) из 50 пациентов, получавших лечение гемодиализом. Все умершие пациенты были мужского пола. Медиана возраста на момент летального исхода составила 45 (39; 58) лет. Среди пациентов, которым проведена трансплантация почки, летальных исходов зарегистрировано не было. У 44 (88%) из 50 пациентов диагноз БФ установлен, в среднем, через 1 год (диапазон от 0 до 12 лет) после начала лечения программным гемодиализом, в том числе у одного пациента – после трансплантации почки. Среди 44 пробандов, выявленных при всероссийском диализном скрининге, проведен семейный скрининг. Патогенная мутация в гене GLA диагностирована у 89 (57%) из 156 обследованных родственников диализных пробандов, в том числе у 18 детей моложе 18 лет, клинические проявления БФ имелись у 48 родственников. У 80,4% обследованных родственников диализных пробандов обнаружено поражение почек, преимущественно на ранних стадиях.
Заключение. ТХПН нередкое осложнение БФ, ассоциированное с неблагоприятным прогнозом. Однако диализный скрининг –  эффективный способ выявления пробандов с БФ, открывающий возможность установить диагноз БФ у родственников на ранних стадиях, когда лечение наиболее эффективно.
"""

In [27]:
ner_bio(abstract)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'DATE',
  'score': 0.10373853,
  'index': 1,
  'word': 'Цель',
  'start': 0,
  'end': 4},
 {'entity': 'FINDING',
  'score': 0.13174264,
  'index': 2,
  'word': '.',
  'start': 4,
  'end': 5},
 {'entity': 'SCIPROC',
  'score': 0.53618705,
  'index': 3,
  'word': 'Оцени',
  'start': 6,
  'end': 11},
 {'entity': 'FINDING',
  'score': 0.83551633,
  'index': 4,
  'word': '##ть',
  'start': 11,
  'end': 13},
 {'entity': 'PHYS',
  'score': 0.9970161,
  'index': 5,
  'word': 'выжи',
  'start': 14,
  'end': 18},
 {'entity': 'PHYS',
  'score': 0.9975267,
  'index': 6,
  'word': '##ваемость',
  'start': 18,
  'end': 26},
 {'entity': 'PERSON',
  'score': 0.79657394,
  'index': 7,
  'word': 'у',
  'start': 27,
  'end': 28},
 {'entity': 'PERSON',
  'score': 0.99926406,
  'index': 8,
  'word': 'пациентов',
  'start': 29,
  'end': 38},
 {'entity': 'DISO',
  'score': 0.9991222,
  'index': 9,
  'word': 'с',
  'start': 39,
  'end': 40},
 {'entity': 'DISO',
  'score': 0.9995628,
  'index': 10,