<a href="https://colab.research.google.com/github/ekaterinatao/NER_biomed_domain/blob/main/active_learning/%D0%92%D0%9A%D0%A0_nerel_bio_RuBioBERT_active_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Инструменты
Предобработанный датасет [NEREL-BIO](https://huggingface.co/datasets/ekaterinatao/nerel_bio_ner_unnested)  
[Чек-пойнт](https://huggingface.co/ekaterinatao/nerel-bio-RuBioBERT-al) дообученной модели на всем датасете nerel-bio  

Исходная модель [RuBioBERT](https://huggingface.co/alexyalunin/RuBioBERT)

### Установка зависимостей

In [1]:
# Install the extra libraries this notebook needs (Colab-style shell command).
# NOTE(review): no versions are pinned, so a fresh environment may resolve different releases.
!pip install datasets accelerate evaluate wandb seqeval -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m14.8

In [2]:
import numpy as np
import pandas as pd
import random
from dataclasses import dataclass

import torch
import datasets
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers.trainer_utils import get_last_checkpoint
from transformers import pipeline
import evaluate

import warnings
warnings.filterwarnings("ignore")

In [3]:
@dataclass
class TrainingConfig:
    """Hyper-parameters and resource names shared by the whole notebook.

    Every attribute carries a type annotation so that ``@dataclass`` treats it
    as a real field. Without annotations the decorator generates an ``__init__``
    with no parameters, so values could not be overridden per experiment
    (``TrainingConfig(seed=1)`` raised ``TypeError``) and ``repr``/``==``
    ignored them.
    """

    seed: int = 64  # seeds the Python / NumPy / torch RNGs and the Trainer
    dataset: str = 'ekaterinatao/nerel_bio_ner_unnested'  # dataset id given to load_dataset
    checkpoint: str = 'alexyalunin/RuBioBERT'  # checkpoint for both tokenizer and model
    n_labels: int = 45  # passed as num_labels when the model is created
    n_epochs: int = 10  # num_train_epochs of every training run
    train_batch_size: int = 4  # per-device train batch size
    eval_batch_size: int = 4  # per-device eval batch size
    # Evaluated once, when the class body runs.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    l_rate: float = 5e-05  # learning rate
    w_decay: float = 0.1  # weight decay
    warm_up: float = 0.1  # warm-up ratio

config = TrainingConfig()

In [4]:
seed = config.seed

# Make cuDNN pick deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Feed the same seed to every random number generator the notebook relies on.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

### Скачивание датасета

In [5]:
# Load the dataset named in the config; the displayed result has train / valid / test splits.
dataset = datasets.load_dataset(config.dataset)
dataset

Downloading readme:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/603k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/612 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/77 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags'],
        num_rows: 77
    })
})

In [6]:
# Label vocabulary: labels.txt is read as a single 'tag' column (assumes one tag
# per line); the row order defines the integer id of each tag.
url = 'https://raw.githubusercontent.com/ekaterinatao/NER_biomed_domain/main/labels.txt'
tags = pd.read_csv(url, names=['tag'])['tag'].tolist()
id_to_tag = dict(enumerate(tags))
tag_to_id = {tag: idx for idx, tag in id_to_tag.items()}

___
### Токенизация

In [7]:
# Tokenizer that belongs to the same checkpoint the model is later loaded from.
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

tokenizer_config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to tokens.

    Args:
        examples: batch with a "words" entry (one list of words per sentence)
            and a "ner_tags" entry (one tag per word).

    Returns:
        The tokenizer output with an extra "labels" entry. The first token of
        every word carries that word's tag; special tokens and the remaining
        tokens of a word are set to -100.
    """
    encoded = tokenizer(
        examples["words"],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # Word index of every token; None marks a special token.
        token_words = encoded.word_ids(batch_index=row)
        # Pair each token with the word index of the token before it so that
        # only the first token of a word receives a real tag.
        aligned_labels.append([
            word_tags[current]
            if current is not None and current != previous
            else -100
            for current, previous in zip(token_words, [None, *token_words])
        ])

    encoded["labels"] = aligned_labels
    return encoded

In [9]:
tokenized_dataset = dataset.map(
    tokenize_and_align_labels, batched=True,
    # The original columns are kept on purpose: run_active_learning filters rows by 'id'.
)
tokenized_dataset

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 612
    })
    valid: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
    test: Dataset({
        features: ['id', 'words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

# Обучение модели

In [10]:
# Sequence-labelling metric object; compute_metrics calls its .compute().
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn model outputs into overall precision / recall / F1 / accuracy.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to get one tag id per token; ``labels`` holds the reference
            tag ids, with -100 marking positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the overall scores reported by ``seqeval``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        scored = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([tags[predicted] for predicted, _ in scored])
        true_labels.append([tags[gold] for _, gold in scored])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [11]:
def choose_samples_mnlp(probas, n_tokens) -> np.ndarray:
    """Функция для реализации стратеги отбора примеров для разметки
    на основе MNLP"""

    scores = np.array([
        -np.sum(np.log(np.max(i, axis=1))) / len(i) for i in probas
    ])
    args = np.argsort(scores)

    return args[:n_tokens]

In [12]:
# Collator built around the tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [13]:
# Token-classification model loaded from the base checkpoint; the id <-> tag
# mappings come from the label list read from labels.txt.
# NOTE(review): num_labels is taken from config.n_labels, not len(tags) — confirm they agree.
model = AutoModelForTokenClassification.from_pretrained(
    config.checkpoint, num_labels=config.n_labels, id2label=id_to_tag, label2id=tag_to_id
)

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Show which device the config resolved to ("cuda" when torch sees a GPU, otherwise "cpu").
print(f'device is {config.device}')

device is cuda


In [15]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login; train_and_predict creates its Trainer with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
import wandb
# Interactive Weights & Biases login; train_and_predict sets report_to="wandb".
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [17]:
import os
# W&B project name, set through the WANDB_PROJECT environment variable.
os.environ["WANDB_PROJECT"]="ner_bert_nerel_bio"
# Repository id; train_and_predict uses it as output_dir and the save cell as the target path.
hf_repo_id = "ekaterinatao/nerel-bio-RuBioBERT-al"

In [18]:
def train_and_predict(
    train_dataset, eval_dataset, test_dataset,
    model, tokenizer, data_collator, compute_metrics
):
    """Train the model, then collect metrics and predictions.

    Hyper-parameters are read from the module-level ``config``; the output
    directory is the module-level ``hf_repo_id``.

    Args:
        train_dataset: data the model is trained on.
        eval_dataset: data evaluated after every epoch and once more at the end.
        test_dataset: data passed to ``trainer.predict``.
        model, tokenizer, data_collator, compute_metrics: forwarded to the
            ``Trainer`` unchanged.

    Returns:
        Tuple ``(metrics, probas, trainer)``:
          * ``metrics``: the training loss under 'train_loss', the final
            evaluation metrics, and the prediction metrics (their keys use the
            prefix "pred_");
          * ``probas``: the ``predictions`` attribute of the ``predict``
            output. NOTE(review): for this model these are presumably raw
            logits rather than probabilities — confirm before taking logs;
          * ``trainer``: the ``Trainer`` that was used.
    """
    hyper_params = dict(
        output_dir=hf_repo_id,
        num_train_epochs=config.n_epochs,
        learning_rate=config.l_rate,
        weight_decay=config.w_decay,
        warmup_ratio=config.warm_up,
        per_device_train_batch_size=config.train_batch_size,
        per_device_eval_batch_size=config.eval_batch_size,
        group_by_length=True,
        optim="adamw_torch",
        lr_scheduler_type="cosine",
        evaluation_strategy="epoch",
        seed=config.seed,
        data_seed=config.seed,
        push_to_hub=True,
        save_strategy="no",
        report_to="wandb",
        logging_steps=16,
        run_name="RuBioBERT_AL",
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**hyper_params),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    train_output = trainer.train()
    # Prediction runs before the final evaluation, as in the original order.
    prediction_output = trainer.predict(test_dataset, metric_key_prefix="pred_")

    metrics = {
        'train_loss': train_output.metrics['train_loss'],
        **trainer.evaluate(eval_dataset),
        **prediction_output.metrics,
    }

    return metrics, prediction_output.predictions, trainer

In [19]:
def _unlabelled_pool(full_train, labelled):
    """Return the rows of ``full_train`` whose 'id' does not occur in ``labelled``."""
    # Build the id set once; looking ids up in the column for every row
    # re-materialised the column each time and made the filter quadratic.
    labelled_ids = set(labelled['id'])
    return full_train.filter(lambda sample: sample['id'] not in labelled_ids)


def run_active_learning(
    dataset, al_iters, train_size,
    model, tokenizer, data_collator, compute_metrics
):
    """Run the active-learning loop.

    A random fraction of ``dataset['train']`` forms the initial labelled set;
    the remaining rows are the unlabelled pool. Every round trains on the
    labelled set, evaluates on ``dataset['valid']``, scores the pool and moves
    the samples chosen by ``choose_samples_mnlp`` from the pool into the
    labelled set. The number of samples moved per round equals the size of the
    initial labelled set.

    Args:
        dataset: mapping with 'train' and 'valid' datasets; rows need an 'id'
            column (assumed to hold unique, hashable values).
        al_iters: number of training rounds.
        train_size: fraction of the training split used as the initial
            labelled set.
        model, tokenizer, data_collator, compute_metrics: forwarded to
            ``train_and_predict``. The same ``model`` object is passed in
            every round.

    Returns:
        Tuple ``(all_metrics, trainer)``. ``all_metrics`` maps the round number
        (as a string) to ``{'metrics': ..., 'train_size': ...}``, where
        ``train_size`` is the number of rows the model was trained on in that
        round. ``trainer`` comes from the last completed round and is ``None``
        when no round ran.
    """
    full_train = dataset['train']
    train_dataset = full_train.select(
        random.sample(
            range(full_train.num_rows),
            int(full_train.num_rows * train_size)
        )
    )
    pool = _unlabelled_pool(full_train, train_dataset)
    query_size = train_dataset.num_rows
    all_metrics = {}
    trainer = None  # stays None when al_iters == 0

    for al_round in range(al_iters):
        print(f'Training using {train_dataset.num_rows}')

        eval_metrics, pool_predictions, trainer = train_and_predict(
            train_dataset, dataset['valid'], pool,
            model, tokenizer, data_collator, compute_metrics
        )
        # Record the size *before* the labelled set grows, so that it matches
        # the data these metrics were obtained with.
        all_metrics[f'{al_round}'] = {
            'metrics': eval_metrics,
            'train_size': train_dataset.num_rows,
        }

        if al_round == al_iters - 1:
            # No further training follows; selecting more samples is wasted work.
            break

        picked = choose_samples_mnlp(pool_predictions, query_size)
        train_dataset = concatenate_datasets([train_dataset, pool.select(picked)])
        pool = _unlabelled_pool(full_train, train_dataset)
        if pool.num_rows == 0:
            print('Unlabelled pool is exhausted, stopping early')
            break

    return all_metrics, trainer

In [20]:
# Start from a random 10% of the training split (train_size=0.1) and run
# 7 training rounds (al_iters=7).
metrics, trainer = run_active_learning(
    dataset=tokenized_dataset, al_iters=7, train_size=0.1,
    model=model, tokenizer=tokenizer,
    data_collator=data_collator, compute_metrics=compute_metrics
)

Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 61


[34m[1mwandb[0m: Currently logged in as: [33mtaoea[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,3.3855,2.925789,0.049296,0.001642,0.003177,0.158042
2,2.267,2.012529,0.420856,0.343574,0.378309,0.537685
3,1.3506,1.434593,0.603119,0.562383,0.582039,0.693551
4,0.8601,1.24049,0.645208,0.620544,0.632636,0.727894
5,0.6634,1.12402,0.660775,0.640009,0.650226,0.742191
6,0.4729,1.049967,0.69108,0.670497,0.680633,0.770319
7,0.3967,1.00027,0.703881,0.689024,0.696374,0.781974
8,0.3229,1.009882,0.695569,0.684803,0.690144,0.779021
9,0.2958,1.000625,0.702774,0.689259,0.695951,0.784771
10,0.2825,1.000039,0.702774,0.689259,0.695951,0.784771


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 122


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5164,0.891704,0.7315,0.704737,0.717869,0.798135
2,0.3964,0.802977,0.733652,0.720919,0.72723,0.812432
3,0.2812,0.799037,0.741912,0.726079,0.73391,0.81554
4,0.1775,0.761486,0.746682,0.738977,0.74281,0.823155
5,0.1573,0.765908,0.753351,0.751407,0.752378,0.828127
6,0.0894,0.803739,0.748111,0.743199,0.745647,0.823932
7,0.0826,0.807202,0.752777,0.746951,0.749853,0.827195
8,0.0623,0.825924,0.750591,0.744606,0.747587,0.827506
9,0.053,0.831907,0.754199,0.747655,0.750913,0.829215
10,0.0495,0.831941,0.754378,0.747655,0.751001,0.829215


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 183


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2571,0.65983,0.763102,0.761492,0.762296,0.83512
2,0.12,0.677009,0.769087,0.770169,0.769627,0.842735
3,0.1408,0.772668,0.745263,0.747186,0.746223,0.824087
4,0.1198,0.721608,0.763998,0.77439,0.769159,0.84289
5,0.0585,0.775844,0.760348,0.758208,0.759277,0.837762
6,0.0349,0.752171,0.769841,0.773452,0.771642,0.845066
7,0.0247,0.808208,0.770623,0.768996,0.769809,0.844134
8,0.0189,0.783893,0.775035,0.780488,0.777752,0.847863
9,0.015,0.78771,0.774953,0.779315,0.777128,0.847552
10,0.0126,0.790171,0.774276,0.777908,0.776088,0.846931


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 244


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1863,0.739497,0.777408,0.766651,0.771992,0.840715
2,0.0909,0.802778,0.767404,0.765244,0.766322,0.839782
3,0.0719,0.767696,0.773276,0.765478,0.769358,0.843667
4,0.0611,0.832969,0.753086,0.758208,0.755639,0.830925
5,0.0404,0.833712,0.751614,0.764306,0.757907,0.833411
6,0.0215,0.85773,0.780482,0.782129,0.781305,0.848485
7,0.0171,0.853097,0.781499,0.782598,0.782048,0.847397
8,0.0098,0.888705,0.774648,0.773921,0.774284,0.842735
9,0.0073,0.891537,0.773815,0.773452,0.773634,0.84289
10,0.0095,0.891938,0.774519,0.774156,0.774337,0.843201


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 305


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1526,0.71096,0.781857,0.774156,0.777987,0.846931
2,0.0949,0.760942,0.769249,0.777908,0.773554,0.843667
3,0.0634,0.765341,0.781322,0.792683,0.786962,0.852059
4,0.0413,0.80933,0.784198,0.779784,0.781985,0.851282
5,0.0277,0.800503,0.79194,0.792683,0.792311,0.855167
6,0.0176,0.806643,0.788847,0.792917,0.790877,0.854701
7,0.0136,0.838673,0.790979,0.797842,0.794396,0.855478
8,0.0044,0.839996,0.794412,0.800188,0.797289,0.857964
9,0.0032,0.847771,0.792765,0.79667,0.794713,0.85641
10,0.0043,0.848609,0.792814,0.796904,0.794854,0.856721


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 366


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1212,0.738065,0.780988,0.786116,0.783544,0.851282
2,0.0598,0.760692,0.789898,0.781191,0.785521,0.853458
3,0.06,0.788295,0.769177,0.778377,0.77375,0.845532
4,0.0277,0.835833,0.785864,0.790103,0.787978,0.853147
5,0.0278,0.835904,0.788049,0.791745,0.789892,0.852681
6,0.0158,0.869691,0.788183,0.79151,0.789843,0.854079
7,0.0081,0.849534,0.787921,0.7894,0.78866,0.854079
8,0.002,0.887479,0.792049,0.789634,0.79084,0.855944
9,0.0045,0.88743,0.79346,0.791041,0.792249,0.856566
10,0.0032,0.888071,0.793184,0.79151,0.792347,0.856566


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

Training using 427


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0824,0.749376,0.791306,0.79409,0.792696,0.85641
2,0.0463,0.833653,0.778141,0.781426,0.77978,0.849106
3,0.0368,0.836467,0.789277,0.79409,0.791676,0.853768
4,0.0214,0.860872,0.788818,0.787523,0.78817,0.856099
5,0.0117,0.870145,0.792577,0.791276,0.791926,0.857653
6,0.0069,0.860464,0.788907,0.790572,0.789739,0.855789
7,0.004,0.889022,0.793208,0.799719,0.79645,0.859674
8,0.0035,0.89954,0.792211,0.79667,0.794434,0.858897
9,0.0021,0.902412,0.794296,0.796904,0.795598,0.86014
10,0.0014,0.902636,0.794296,0.796904,0.795598,0.86014


Filter:   0%|          | 0/612 [00:00<?, ? examples/s]

In [21]:
# Close the Weights & Biases run; the run history and summary are printed as output.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅▇▇▇▇▇█████████████████████████████████
eval/f1,▁▄▇▇▇▇▇▇████████████████████████████████
eval/loss,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▁▁▁▂▂▂▁▁▂▂▂▁▁▂▂▂▂
eval/precision,▁▄▇▇▇▇▇█████████████████████████████████
eval/recall,▁▄▆▇▇▇▇▇████████████████████████████████
eval/runtime,▁▁▁█▇▄▁▁▁▁▁▁▁▁▃▂▂▁▁▁▂▁▁▁▁▁▁▂▂▁▁▂▁▁▁▁▁▂▁▂
eval/samples_per_second,▇██▁▂▄██▇█████▅▆▆███▅▇█████▅▅██▅█████▆█▇
eval/steps_per_second,▇██▁▂▄██▇█████▅▆▆███▅▇█████▅▅██▅█████▆█▇
train/epoch,▂▅█▂▄▇▂▃▄▆█▂▃▄▆▇▁▂▃▄▅▇▇▁▂▃▄▅▆▇█▁▂▃▄▅▅▆▇█
train/global_step,▁▂▂▁▂▃▁▂▂▃▄▂▂▃▄▅▁▂▂▃▄▅▆▁▂▃▄▄▅▆▇▁▂▃▄▅▅▆▇█

0,1
eval/accuracy,0.86014
eval/f1,0.7956
eval/loss,0.90264
eval/precision,0.7943
eval/recall,0.7969
eval/runtime,0.9933
eval/samples_per_second,77.517
eval/steps_per_second,20.134
train/epoch,10.0
train/global_step,1070.0


In [34]:
# Per-round metrics returned by run_active_learning, keyed by round number.
metrics

{'0': {'metrics': {'train_loss': 1.0297480553388596,
   'eval_loss': 1.0000391006469727,
   'eval_precision': 0.7027737924438068,
   'eval_recall': 0.6892589118198874,
   'eval_f1': 0.6959507459152261,
   'eval_accuracy': 0.7847707847707848,
   'eval_runtime': 1.0287,
   'eval_samples_per_second': 74.854,
   'eval_steps_per_second': 19.443,
   'epoch': 10.0,
   'pred__loss': 0.7819056510925293,
   'pred__precision': 0.7273767885532592,
   'pred__recall': 0.7065291247143122,
   'pred__f1': 0.7168014037726389,
   'pred__accuracy': 0.8206197854588796,
   'pred__runtime': 11.7906,
   'pred__samples_per_second': 46.732,
   'pred__steps_per_second': 11.704},
  'train_size': 61},
 '1': {'metrics': {'train_loss': 0.18960652644595793,
   'eval_loss': 0.8319405913352966,
   'eval_precision': 0.7543776620918126,
   'eval_recall': 0.7476547842401501,
   'eval_f1': 0.7510011778563015,
   'eval_accuracy': 0.8292152292152292,
   'eval_runtime': 0.8664,
   'eval_samples_per_second': 88.874,
   'eval_s

**Вывод**: к 5 итерации активного обучения (50% датасета) достигается максимальное качество.  

In [23]:
# Save the model held by the trainer returned from run_active_learning under hf_repo_id.
trainer.save_model(hf_repo_id)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

### Оценка качества на тестовой выборке

In [35]:
# Score the test split, which the active-learning loop never draws samples from.
predictions = trainer.predict(test_dataset=tokenized_dataset["test"])

In [36]:
# Test-split metrics (keys carry the default "test" prefix, as the output shows).
predictions.metrics

{'test_loss': 0.7208327054977417,
 'test_precision': 0.8238841978287093,
 'test_recall': 0.8160095579450418,
 'test_f1': 0.8199279711884755,
 'test_accuracy': 0.882410264739321,
 'test_runtime': 0.9198,
 'test_samples_per_second': 83.718,
 'test_steps_per_second': 21.745}

Оценка модели на абстракте, которого не было в тестовом наборе

In [37]:
# NER inference pipeline around the trained model and its tokenizer.
# NOTE(review): no aggregation_strategy is passed; the recorded output lists
# individual word pieces (e.g. '##ть') rather than merged entities.
ner_bio = pipeline("ner", model=model, tokenizer=tokenizer, device=config.device)

In [38]:
abstract = """Цель. Оценить выживаемость у пациентов с болезнью Фабри (БФ) в зависимости от вида заместительной почечной терапии, и определить роль диализного скрининга в ранней диагностике БФ у родственников.
Материалы и методы. В исследование включали взрослых (старше 18 лет) пациентов с подтвержденным диагнозом БФ. Терминальная стадия хронической почечной недостаточности (тХПН) диагностировали в соответствии с рекомендациями Научного общества нефрологов России (2016) и KDIGO (2012). На основании опроса пробандов выявляли его родственников, которые могли унаследовать мутантный ген.
Результаты. У 50 (24,9%) из 201 обследованных пациентов с БФ диагностирована тХПН, в том числе у 48 (40%) из 120 мужчин и 2 (2,7%) из 81 женщин. Оценка кумулятивной частоты методом Каплана-Майера демонстрирует выраженное увеличение частоты регистрации тХПН к возрасту 20-30 лет, а к возрасту 50 лет ожидаемое количество пациентов с тХПН составляет 95%. Пяти из 50 больных с тХПН была выполнена трансплантация почки, в среднем, через 17 месяцев (диапазон от 7 до 70 месяцев) после инициации лечения гемодиализом. Умерло 15 (30%) из 50 пациентов, получавших лечение гемодиализом. Все умершие пациенты были мужского пола. Медиана возраста на момент летального исхода составила 45 (39; 58) лет. Среди пациентов, которым проведена трансплантация почки, летальных исходов зарегистрировано не было. У 44 (88%) из 50 пациентов диагноз БФ установлен, в среднем, через 1 год (диапазон от 0 до 12 лет) после начала лечения программным гемодиализом, в том числе у одного пациента – после трансплантации почки. Среди 44 пробандов, выявленных при всероссийском диализном скрининге, проведен семейный скрининг. Патогенная мутация в гене GLA диагностирована у 89 (57%) из 156 обследованных родственников диализных пробандов, в том числе у 18 детей моложе 18 лет, клинические проявления БФ имелись у 48 родственников. У 80,4% обследованных родственников диализных пробандов обнаружено поражение почек, преимущественно на ранних стадиях.
Заключение. ТХПН нередкое осложнение БФ, ассоциированное с неблагоприятным прогнозом. Однако диализный скрининг –  эффективный способ выявления пробандов с БФ, открывающий возможность установить диагноз БФ у родственников на ранних стадиях, когда лечение наиболее эффективно.
"""

In [39]:
# Run the pipeline on the sample abstract and display the tagged tokens.
ner_bio(abstract)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'SCIPROC',
  'score': 0.25182974,
  'index': 1,
  'word': 'Цель',
  'start': 0,
  'end': 4},
 {'entity': 'SCIPROC',
  'score': 0.48384497,
  'index': 2,
  'word': '.',
  'start': 4,
  'end': 5},
 {'entity': 'FINDING',
  'score': 0.533997,
  'index': 3,
  'word': 'Оцени',
  'start': 6,
  'end': 11},
 {'entity': 'FINDING',
  'score': 0.8935187,
  'index': 4,
  'word': '##ть',
  'start': 11,
  'end': 13},
 {'entity': 'PHYS',
  'score': 0.9943568,
  'index': 5,
  'word': 'выжи',
  'start': 14,
  'end': 18},
 {'entity': 'PHYS',
  'score': 0.99370664,
  'index': 6,
  'word': '##ваемость',
  'start': 18,
  'end': 26},
 {'entity': 'PERSON',
  'score': 0.7056615,
  'index': 7,
  'word': 'у',
  'start': 27,
  'end': 28},
 {'entity': 'PERSON',
  'score': 0.99944156,
  'index': 8,
  'word': 'пациентов',
  'start': 29,
  'end': 38},
 {'entity': 'DISO',
  'score': 0.99791306,
  'index': 9,
  'word': 'с',
  'start': 39,
  'end': 40},
 {'entity': 'DISO',
  'score': 0.9996755,
  'index': 10