1. Дообучить BERT на задачу NER
2. Дообучить GPT на генерацию текста
3. Дообучить T5 на задачу суммаризации текста

In [1]:
import torch
import re
import os
import logging
import numpy as np
import pandas as pd
from razdel import tokenize
from corus import load_rudrec
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from collections import Counter, defaultdict
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TextDataset, AutoModelForCausalLM, AutoModelForTokenClassification, DataCollatorForTokenClassification, pipeline
from transformers.trainer import logger as noisy_logger

2024-01-28 22:13:07.186585: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 22:13:07.186621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 22:13:07.188096: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 22:13:07.196880: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Дообучение BERT

In [2]:
data_path_r = r'/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/rudrec/'

In [3]:
model_checkpoint = r'cointegrated/rubert-tiny'
batch_size = 16

In [4]:
drugs = list(load_rudrec(os.path.join(data_path_r, r'rudrec_annotated.json')))

In [5]:
# Count annotated entities per type and, for each type, how often every
# surface string occurs.
ents = Counter()
type2text = defaultdict(Counter)
for record in drugs:
    for entity in record.entities:
        kind = entity.entity_type
        ents.update([kind])
        type2text[kind].update([entity.entity_text])

# Report the types from most to least frequent together with the three most
# common surface strings of each type.
for kind, total in ents.most_common():
    print(kind, total)
    print(type2text[kind].most_common(3))

DI 1401
[('простуды', 64), ('ОРВИ', 47), ('профилактики', 42)]
Drugname 1043
[('Виферон', 33), ('Анаферон', 25), ('Циклоферон', 24)]
Drugform 836
[('таблетки', 154), ('таблеток', 79), ('свечи', 63)]
ADR 720
[('аллергия', 16), ('слабость', 13), ('диарея', 12)]
Drugclass 330
[('противовирусный', 21), ('противовирусное', 18), ('противовирусных', 13)]
Finding 236
[('аллергии', 12), ('температуры', 6), ('сонливости', 5)]


In [6]:
def extract_labels(item):
    """Convert one annotated record into word-level BIO tags.

    The text is split into words with ``tokenize``; every entity span is then
    projected onto the words it overlaps. The first overlapped word receives
    ``B-<type>``, the following ones ``I-<type>``; all other words stay ``O``.

    Args:
        item: object exposing ``text`` and ``entities``; each entity must
            provide ``start``, ``end`` and ``entity_type``.

    Returns:
        dict with ``'tokens'`` (list of word strings) and ``'tags'`` (list of
        tag strings of the same length).
    """
    raw_toks = list(tokenize(item.text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)
    # Map every character offset to the index of the word covering it
    # (None for characters that belong to no word).
    char2word = [None] * len(item.text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)
    for e in item.entities:
        e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})
        if not e_words:
            # The span overlaps no word, so there is nothing to tag; without
            # this guard e_words[0] would raise IndexError.
            continue
        word_labels[e_words[0]] = 'B-' + e.entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + e.entity_type
    return {'tokens': words, 'tags': word_labels}

In [7]:
ner_data = [extract_labels(item) for item in drugs]
ner_train, ner_test = train_test_split(ner_data, test_size=0.1, random_state=1)

In [8]:
pd.options.display.max_colwidth = 300
pd.DataFrame(ner_train).sample(3)

Unnamed: 0,tokens,tags
316,"[Препарат, "", Ликопид, "", покупала, для, лечения, герпеса, .]","[O, O, B-Drugname, O, O, O, O, B-DI, O]"
885,"[Первые, сутки, .]","[O, O, O]"
577,"[Если, раньше, было, лучше, ,, то, сейчас, качесво, препарата, оставляет, желать, лучшего, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [9]:
# Collect the distinct tags of the training split in sorted order and put the
# outside tag 'O' (when present) first so that it receives index 0.
unique_tags = set()
for sample in ner_train:
    unique_tags.update(sample['tags'])
label_list = sorted(unique_tags - {'O'})
if 'O' in unique_tags:
    label_list.insert(0, 'O')
label_list

['O',
 'B-ADR',
 'B-DI',
 'B-Drugclass',
 'B-Drugform',
 'B-Drugname',
 'B-Finding',
 'I-ADR',
 'I-DI',
 'I-Drugclass',
 'I-Drugform',
 'I-Drugname',
 'I-Finding']

In [10]:
# Wrap the train/test lists of {'tokens', 'tags'} dicts into a DatasetDict
# (going through pandas DataFrames). Note that this rebinds `ner_data`, which
# previously held the plain list of extracted examples.
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 4328
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 481
    })
})

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: batch with ``"tokens"`` (lists of words) and ``"tags"``
            (lists of tags, one per word).
        label_all_tokens: when true, every sub-token of a word carries the
            word's tag; otherwise only the first sub-token does and the rest
            get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, per
        sentence, one label id per sub-token. Positions without a word id
        (special tokens) are labelled -100 so that the loss ignores them.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_label_ids = []
    for row, word_tags in enumerate(examples['tags']):
        aligned = []
        prev_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # No word id: a special token.
                aligned.append(-100)
            elif word != prev_word or label_all_tokens:
                # First sub-token of a word, or a continuation that should
                # inherit the word's tag.
                aligned.append(word_tags[word])
            else:
                aligned.append(-100)
            prev_word = word
        # Tags given as strings are converted to their index in label_list;
        # anything else (e.g. -100) is kept as is.
        all_label_ids.append(
            [label_list.index(tag) if isinstance(tag, str) else tag for tag in aligned]
        )
    encoded["labels"] = all_label_ids
    return encoded

In [13]:
tokenize_and_align_labels(ner_data['train'][22:23])

{'input_ids': [[2, 3130, 3374, 23324, 871, 314, 1556, 14068, 16902, 1029, 6899, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}

In [14]:
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/4328 [00:00<?, ? examples/s]

Map:   0%|          | 0/481 [00:00<?, ? examples/s]

In [15]:
# Load the checkpoint with a token-classification head sized to the number of
# tags, then store both directions of the id <-> tag mapping in the config.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Training configuration for the first NER run: evaluation once per epoch,
# no checkpoint saving and no external reporting.
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [18]:
metric = load_metric(r'seqeval')

  metric = load_metric(r'seqeval')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [19]:
example = ner_train[4]
labels = example['tags']
metric.compute(predictions=[labels], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 1.0}

In [20]:
def compute_metrics(p):
    """Turn raw predictions into the overall scores reported by ``metric``.

    Args:
        p: pair ``(predictions, labels)``. Predictions are reduced with
            ``argmax`` over axis 2; labels hold label ids, with -100 marking
            positions to ignore.

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` values.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label (-100 is ignored).
        kept = [(guess, gold) for guess, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[guess] for guess, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])
    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    summary = {}
    for key in ("precision", "recall", "f1", "accuracy"):
        summary[key] = results["overall_" + key]
    return summary

In [21]:
# Trainer for the NER model: trains on the tokenized train split, evaluates
# on the tokenized test split and scores with compute_metrics.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [22]:
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 2.5628395080566406,
 'eval_precision': 0.020123839009287926,
 'eval_recall': 0.11875761266747868,
 'eval_f1': 0.034415813625132366,
 'eval_accuracy': 0.09160815780675359,
 'eval_runtime': 2.7154,
 'eval_samples_per_second': 177.139,
 'eval_steps_per_second': 11.416}

In [23]:
# Freeze every parameter of the BERT encoder so that only the remaining
# parameters (the classification head) receive gradients.
model.bert.requires_grad_(False)

In [24]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)
        print(param)

classifier.weight
Parameter containing:
tensor([[-0.0254,  0.0040, -0.0014,  ..., -0.0161, -0.0037, -0.0215],
        [-0.0262, -0.0322,  0.0139,  ..., -0.0156,  0.0122, -0.0087],
        [-0.0007, -0.0423,  0.0350,  ...,  0.0338, -0.0270,  0.0033],
        ...,
        [-0.0278, -0.0005,  0.0129,  ...,  0.0081,  0.0052,  0.0039],
        [ 0.0304,  0.0134,  0.0062,  ...,  0.0108,  0.0056, -0.0339],
        [-0.0140, -0.0046, -0.0118,  ..., -0.0006, -0.0151, -0.0342]],
       device='cuda:0', requires_grad=True)
classifier.bias
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0',
       requires_grad=True)


In [25]:
noisy_logger.setLevel(logging.WARNING)

In [26]:
trainer.train()

  0%|          | 0/2710 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 1.9466941356658936, 'eval_precision': 0.02390057361376673, 'eval_recall': 0.030450669914738125, 'eval_f1': 0.02678093197643278, 'eval_accuracy': 0.6762788365095286, 'eval_runtime': 0.5729, 'eval_samples_per_second': 839.615, 'eval_steps_per_second': 54.112, 'epoch': 1.0}
{'loss': 1.9936, 'learning_rate': 1.6309963099630997e-05, 'epoch': 1.85}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 1.4911056756973267, 'eval_precision': 0.029411764705882353, 'eval_recall': 0.0018270401948842874, 'eval_f1': 0.003440366972477064, 'eval_accuracy': 0.812019391507857, 'eval_runtime': 0.6053, 'eval_samples_per_second': 794.65, 'eval_steps_per_second': 51.214, 'epoch': 2.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 1.192145824432373, 'eval_precision': 0.25, 'eval_recall': 0.0006090133982947625, 'eval_f1': 0.001215066828675577, 'eval_accuracy': 0.8182046138415245, 'eval_runtime': 0.7261, 'eval_samples_per_second': 662.433, 'eval_steps_per_second': 42.693, 'epoch': 3.0}
{'loss': 1.2333, 'learning_rate': 1.2619926199261994e-05, 'epoch': 3.69}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 1.0157833099365234, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8183717820127048, 'eval_runtime': 0.5443, 'eval_samples_per_second': 883.768, 'eval_steps_per_second': 56.958, 'epoch': 4.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.9175371527671814, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8183717820127048, 'eval_runtime': 0.5778, 'eval_samples_per_second': 832.427, 'eval_steps_per_second': 53.649, 'epoch': 5.0}
{'loss': 0.9003, 'learning_rate': 8.92988929889299e-06, 'epoch': 5.54}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8635350465774536, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8183717820127048, 'eval_runtime': 0.6874, 'eval_samples_per_second': 699.754, 'eval_steps_per_second': 45.098, 'epoch': 6.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8332812786102295, 'eval_precision': 1.0, 'eval_recall': 0.0018270401948842874, 'eval_f1': 0.00364741641337386, 'eval_accuracy': 0.8186225342694751, 'eval_runtime': 0.6301, 'eval_samples_per_second': 763.403, 'eval_steps_per_second': 49.201, 'epoch': 7.0}
{'loss': 0.7981, 'learning_rate': 5.2398523985239855e-06, 'epoch': 7.38}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8164395093917847, 'eval_precision': 1.0, 'eval_recall': 0.0030450669914738123, 'eval_f1': 0.00607164541590771, 'eval_accuracy': 0.8187897024406553, 'eval_runtime': 0.5465, 'eval_samples_per_second': 880.096, 'eval_steps_per_second': 56.721, 'epoch': 8.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8077921867370605, 'eval_precision': 1.0, 'eval_recall': 0.0030450669914738123, 'eval_f1': 0.00607164541590771, 'eval_accuracy': 0.8187897024406553, 'eval_runtime': 0.6412, 'eval_samples_per_second': 750.111, 'eval_steps_per_second': 48.344, 'epoch': 9.0}
{'loss': 0.7511, 'learning_rate': 1.5498154981549817e-06, 'epoch': 9.23}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.8050906658172607, 'eval_precision': 1.0, 'eval_recall': 0.0030450669914738123, 'eval_f1': 0.00607164541590771, 'eval_accuracy': 0.8187897024406553, 'eval_runtime': 0.539, 'eval_samples_per_second': 892.363, 'eval_steps_per_second': 57.512, 'epoch': 10.0}
{'train_runtime': 38.4755, 'train_samples_per_second': 1124.873, 'train_steps_per_second': 70.434, 'train_loss': 1.1050868016767326, 'epoch': 10.0}


TrainOutput(global_step=2710, training_loss=1.1050868016767326, metrics={'train_runtime': 38.4755, 'train_samples_per_second': 1124.873, 'train_steps_per_second': 70.434, 'train_loss': 1.1050868016767326, 'epoch': 10.0})

In [27]:
# Make every parameter of the model trainable again.
model.requires_grad_(True)

In [28]:
args = TrainingArguments(
    "ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
)

In [29]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
trainer.train()

  0%|          | 0/5420 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.6071533560752869, 'eval_precision': 0.6429809358752167, 'eval_recall': 0.2259439707673569, 'eval_f1': 0.33438485804416407, 'eval_accuracy': 0.8448679371447676, 'eval_runtime': 0.5556, 'eval_samples_per_second': 865.663, 'eval_steps_per_second': 55.791, 'epoch': 1.0}
{'loss': 0.5609, 'learning_rate': 9.07749077490775e-06, 'epoch': 1.85}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.5356666445732117, 'eval_precision': 0.5959821428571429, 'eval_recall': 0.32521315468940315, 'eval_f1': 0.42080378250591016, 'eval_accuracy': 0.856486125041792, 'eval_runtime': 0.6261, 'eval_samples_per_second': 768.22, 'eval_steps_per_second': 49.511, 'epoch': 2.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4938695728778839, 'eval_precision': 0.5878070973612375, 'eval_recall': 0.3934226552984166, 'eval_f1': 0.4713608172199927, 'eval_accuracy': 0.8643430290872618, 'eval_runtime': 0.6597, 'eval_samples_per_second': 729.13, 'eval_steps_per_second': 46.992, 'epoch': 3.0}
{'loss': 0.4479, 'learning_rate': 8.154981549815498e-06, 'epoch': 3.69}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.468112051486969, 'eval_precision': 0.5764525993883792, 'eval_recall': 0.4591961023142509, 'eval_f1': 0.5111864406779661, 'eval_accuracy': 0.87094617184888, 'eval_runtime': 0.6742, 'eval_samples_per_second': 713.484, 'eval_steps_per_second': 45.983, 'epoch': 4.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4426218867301941, 'eval_precision': 0.6027104136947218, 'eval_recall': 0.5146163215590743, 'eval_f1': 0.5551905387647832, 'eval_accuracy': 0.87905382815112, 'eval_runtime': 0.6388, 'eval_samples_per_second': 752.966, 'eval_steps_per_second': 48.528, 'epoch': 5.0}
{'loss': 0.3877, 'learning_rate': 7.232472324723247e-06, 'epoch': 5.54}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4242127239704132, 'eval_precision': 0.5953488372093023, 'eval_recall': 0.5456760048721072, 'eval_f1': 0.5694312043215761, 'eval_accuracy': 0.8821464393179539, 'eval_runtime': 0.6929, 'eval_samples_per_second': 694.152, 'eval_steps_per_second': 44.737, 'epoch': 6.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.4124130308628082, 'eval_precision': 0.6019736842105263, 'eval_recall': 0.5572472594397077, 'eval_f1': 0.5787476280834916, 'eval_accuracy': 0.8851554663991976, 'eval_runtime': 0.6553, 'eval_samples_per_second': 734.005, 'eval_steps_per_second': 47.306, 'epoch': 7.0}
{'loss': 0.3484, 'learning_rate': 6.309963099630997e-06, 'epoch': 7.38}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.40231767296791077, 'eval_precision': 0.6295037389530931, 'eval_recall': 0.5639464068209501, 'eval_f1': 0.5949245101188564, 'eval_accuracy': 0.8882480775660314, 'eval_runtime': 0.5536, 'eval_samples_per_second': 868.82, 'eval_steps_per_second': 55.995, 'epoch': 8.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.39269474148750305, 'eval_precision': 0.6269052352551359, 'eval_recall': 0.5761266747868453, 'eval_f1': 0.6004443033957474, 'eval_accuracy': 0.8902540956201939, 'eval_runtime': 0.6814, 'eval_samples_per_second': 705.85, 'eval_steps_per_second': 45.491, 'epoch': 9.0}
{'loss': 0.3267, 'learning_rate': 5.387453874538746e-06, 'epoch': 9.23}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.3823018968105316, 'eval_precision': 0.5947083583884546, 'eval_recall': 0.6023142509135201, 'eval_f1': 0.5984871406959152, 'eval_accuracy': 0.8902540956201939, 'eval_runtime': 0.7548, 'eval_samples_per_second': 637.248, 'eval_steps_per_second': 41.07, 'epoch': 10.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.3759065866470337, 'eval_precision': 0.6153362664990571, 'eval_recall': 0.5962241169305724, 'eval_f1': 0.6056294463346736, 'eval_accuracy': 0.892761618187897, 'eval_runtime': 0.704, 'eval_samples_per_second': 683.192, 'eval_steps_per_second': 44.031, 'epoch': 11.0}
{'loss': 0.3061, 'learning_rate': 4.464944649446495e-06, 'epoch': 11.07}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.37303224205970764, 'eval_precision': 0.601571946795647, 'eval_recall': 0.6059683313032886, 'eval_f1': 0.6037621359223301, 'eval_accuracy': 0.8925944500167168, 'eval_runtime': 0.6281, 'eval_samples_per_second': 765.834, 'eval_steps_per_second': 49.357, 'epoch': 12.0}
{'loss': 0.2883, 'learning_rate': 3.5424354243542435e-06, 'epoch': 12.92}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.3665774166584015, 'eval_precision': 0.6153381642512077, 'eval_recall': 0.620584652862363, 'eval_f1': 0.6179502728926622, 'eval_accuracy': 0.8952691407556002, 'eval_runtime': 0.7303, 'eval_samples_per_second': 658.589, 'eval_steps_per_second': 42.445, 'epoch': 13.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.36489608883857727, 'eval_precision': 0.6117717003567182, 'eval_recall': 0.6266747868453106, 'eval_f1': 0.6191335740072202, 'eval_accuracy': 0.8951019725844199, 'eval_runtime': 0.7112, 'eval_samples_per_second': 676.277, 'eval_steps_per_second': 43.585, 'epoch': 14.0}
{'loss': 0.2799, 'learning_rate': 2.6199261992619928e-06, 'epoch': 14.76}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.3607807159423828, 'eval_precision': 0.6244700181708056, 'eval_recall': 0.6278928136419001, 'eval_f1': 0.6261767385362891, 'eval_accuracy': 0.8973587428953528, 'eval_runtime': 0.6695, 'eval_samples_per_second': 718.456, 'eval_steps_per_second': 46.304, 'epoch': 15.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.35692039132118225, 'eval_precision': 0.6262870987280436, 'eval_recall': 0.6297198538367844, 'eval_f1': 0.627998785302156, 'eval_accuracy': 0.8984453360080241, 'eval_runtime': 0.7143, 'eval_samples_per_second': 673.354, 'eval_steps_per_second': 43.397, 'epoch': 16.0}
{'loss': 0.2663, 'learning_rate': 1.6974169741697418e-06, 'epoch': 16.61}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.3562316298484802, 'eval_precision': 0.6213301378070701, 'eval_recall': 0.6315468940316687, 'eval_f1': 0.6263968589549984, 'eval_accuracy': 0.8978602474088934, 'eval_runtime': 0.665, 'eval_samples_per_second': 723.296, 'eval_steps_per_second': 46.616, 'epoch': 17.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.35572418570518494, 'eval_precision': 0.6241810601548541, 'eval_recall': 0.6382460414129111, 'eval_f1': 0.6311352002408913, 'eval_accuracy': 0.8985289200936142, 'eval_runtime': 0.638, 'eval_samples_per_second': 753.898, 'eval_steps_per_second': 48.588, 'epoch': 18.0}
{'loss': 0.268, 'learning_rate': 7.749077490774908e-07, 'epoch': 18.45}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.3551291227340698, 'eval_precision': 0.6263408820023838, 'eval_recall': 0.6400730816077954, 'eval_f1': 0.633132530120482, 'eval_accuracy': 0.8987796723503845, 'eval_runtime': 0.6096, 'eval_samples_per_second': 788.986, 'eval_steps_per_second': 50.849, 'epoch': 19.0}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.35484275221824646, 'eval_precision': 0.6259678379988088, 'eval_recall': 0.6400730816077954, 'eval_f1': 0.6329418849744054, 'eval_accuracy': 0.8987796723503845, 'eval_runtime': 0.5976, 'eval_samples_per_second': 804.899, 'eval_steps_per_second': 51.875, 'epoch': 20.0}
{'train_runtime': 180.7794, 'train_samples_per_second': 478.816, 'train_steps_per_second': 29.981, 'train_loss': 0.3410950636071913, 'epoch': 20.0}


TrainOutput(global_step=5420, training_loss=0.3410950636071913, metrics={'train_runtime': 180.7794, 'train_samples_per_second': 478.816, 'train_steps_per_second': 29.981, 'train_loss': 0.3410950636071913, 'epoch': 20.0})

In [31]:
trainer.evaluate()

  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.35484275221824646,
 'eval_precision': 0.6259678379988088,
 'eval_recall': 0.6400730816077954,
 'eval_f1': 0.6329418849744054,
 'eval_accuracy': 0.8987796723503845,
 'eval_runtime': 0.6682,
 'eval_samples_per_second': 719.853,
 'eval_steps_per_second': 46.394,
 'epoch': 20.0}

In [32]:
# Predict on the test split and take the most likely label id per sub-token.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
# zero_division=0 matches the call in compute_metrics and avoids the
# undefined-metric warning for entity types that are never predicted.
results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
results

  0%|          | 0/31 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'ADR': {'precision': 0.39906103286384975,
  'recall': 0.32075471698113206,
  'f1': 0.3556485355648536,
  'number': 265},
 'DI': {'precision': 0.4483985765124555,
  'recall': 0.5526315789473685,
  'f1': 0.49508840864440085,
  'number': 456},
 'Drugclass': {'precision': 0.7771084337349398,
  'recall': 0.8012422360248447,
  'f1': 0.7889908256880734,
  'number': 161},
 'Drugform': {'precision': 0.8202247191011236,
  'recall': 0.8202247191011236,
  'f1': 0.8202247191011236,
  'number': 267},
 'Drugname': {'precision': 0.7770700636942676,
  'recall': 0.912718204488778,
  'f1': 0.8394495412844037,
  'number': 401},
 'Finding': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 92},
 'overall_precision': 0.6259678379988088,
 'overall_recall': 0.6400730816077954,
 'overall_f1': 0.6329418849744054,
 'overall_accuracy': 0.8987796723503845}

In [33]:
from itertools import chain

# Flatten the per-sentence tag lists in linear time; sum(lists, []) copies the
# accumulated list again on every addition.
flat_labels = list(chain.from_iterable(true_labels))
flat_predictions = list(chain.from_iterable(true_predictions))

# Token-level confusion matrix: rows are true tags, columns predicted tags,
# both ordered as in label_list.
cm = pd.DataFrame(
    confusion_matrix(flat_labels, flat_predictions, labels=label_list),
    index=label_list,
    columns=label_list
)
cm

Unnamed: 0,O,B-ADR,B-DI,B-Drugclass,B-Drugform,B-Drugname,B-Finding,I-ADR,I-DI,I-Drugclass,I-Drugform,I-Drugname,I-Finding
O,9598,15,77,12,29,42,0,5,13,0,0,0,0
B-ADR,87,94,73,5,1,0,0,4,1,0,0,0,0
B-DI,153,18,270,2,8,4,0,0,1,0,0,0,0
B-Drugclass,18,0,10,129,0,3,0,0,1,0,0,0,0
B-Drugform,35,0,3,0,220,9,0,0,0,0,0,0,0
B-Drugname,17,0,5,1,2,375,0,0,1,0,0,0,0
B-Finding,21,31,24,7,5,3,0,0,1,0,0,0,0
I-ADR,109,16,23,0,2,1,0,32,19,0,0,0,0
I-DI,143,19,43,3,0,3,0,6,35,0,0,0,0
I-Drugclass,0,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
model.save_pretrained(os.path.join(data_path_r, r'ner_bert.bin'))
tokenizer.save_pretrained(os.path.join(data_path_r, r'ner_bert.bin'))

('/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/rudrec/ner_bert.bin/tokenizer_config.json',
 '/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/rudrec/ner_bert.bin/special_tokens_map.json',
 '/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/rudrec/ner_bert.bin/vocab.txt',
 '/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/rudrec/ner_bert.bin/added_tokens.json',
 '/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/rudrec/ner_bert.bin/tokenizer.json')

In [35]:
# Rebuild a plain-text sentence from one example of the test split. (The
# former assignment from ner_train[8] was overwritten immediately and has
# been dropped.)
text = ' '.join(ner_test[4]['tokens'])
text

'Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .'

In [36]:
# Encode the sentence as PyTorch tensors and move them to the model's device.
tokens = tokenizer(text, return_tensors='pt')
tokens = {k: v.to(model.device) for k, v in tokens.items()}
# Forward pass without gradient tracking.
with torch.no_grad():
    pred = model(**tokens)
pred.logits.shape

torch.Size([1, 29, 13])

In [37]:
# Most likely label id for every sub-token of the (single) input sequence.
indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()
token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
# Print each sub-token next to its predicted tag in two padded columns.
for piece, label_id in zip(token_text, indices):
    tag = label_list[label_id]
    print(f'{piece:15s} {tag:10s}')

[CLS]           O         
О               O         
##хо            O         
##тно           O         
при             O         
##мен           O         
##я             O         
##ю             O         
его             O         
при             O         
борьбе          O         
с               O         
нас             B-DI      
##мор           B-DI      
##ком           B-DI      
,               O         
что             O         
в               O         
м               O         
##ое            O         
##м             O         
случае          O         
я               O         
##вление        O         
очень           O         
часто           O         
##е             O         
.               O         
[SEP]           O         


In [38]:
pipe = pipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='average', device=0)

In [39]:
print(text)
print(pipe(text))

Охотно применяю его при борьбе с насморком , что в моем случае явление очень частое .
[{'entity_group': 'DI', 'score': 0.7405183, 'word': 'насморком', 'start': 33, 'end': 42}]


### Дообучение GPT

In [2]:
data_path = r'/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/recepies/'

In [3]:
df_rec = pd.read_csv(os.path.join(data_path, r'all_recepies_inter.csv'), sep='\t')

In [4]:
data = df_rec.loc[:5000, 'Инструкции']

In [5]:
def build_text_files(data_json, dest_path):
    """Write all texts into one file as a single whitespace-normalised line.

    Each item is converted with ``str``, stripped, has every whitespace
    character replaced by a space, and is followed by two spaces.

    Args:
        data_json: iterable of texts (any objects convertible with ``str``).
        dest_path: path of the file to create or overwrite.
    """
    chunks = [re.sub(r"\s", " ", str(texts).strip()) + "  " for texts in data_json]
    # The context manager guarantees the handle is flushed and closed; UTF-8
    # is explicit so the output does not depend on the platform's locale.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write("".join(chunks))

In [6]:
# Fixed seed so the train/test text files are the same on every run (the NER
# split above is seeded in the same way).
train, test = train_test_split(data, test_size=0.15, random_state=1)
build_text_files(train, os.path.join(data_path, r'train_dataset.txt'))
build_text_files(test, os.path.join(data_path, r'test_dataset.txt'))

In [7]:
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
train_path = os.path.join(data_path, r'train_dataset.txt')
test_path = os.path.join(data_path, r'test_dataset.txt')

In [14]:
def load_dataset(train_path, test_path, tokenizer):
    """Build the language-modelling datasets and their collator.

    NOTE: this name shadows ``load_dataset`` imported from ``datasets`` at the
    top of the file.

    Args:
        train_path: path of the training text file.
        test_path: path of the evaluation text file.
        tokenizer: tokenizer passed to both datasets and to the collator.

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``; both datasets
        use a block size of 128 and the collator is created with ``mlm=False``.
    """
    block_len = 128
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_len)
        for path in (train_path, test_path)
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator


train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

In [9]:
model = AutoModelForCausalLM.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")

In [10]:
# Training configuration for GPT fine-tuning.
training_args = TrainingArguments(
    output_dir="./gpt2-chief", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    # eval_steps only takes effect with a step-based evaluation strategy; with
    # the default strategy no evaluation is run during training at all.
    evaluation_strategy="steps",
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

In [None]:
# Trainer for the causal language model: uses the text datasets and the
# language-modelling collator built above, then starts fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()

In [None]:
trainer.save_model(data_path)

In [None]:
tokenizer.save_pretrained(os.path.join(data_path, r'gpt_chf'))

In [None]:
model.save_pretrained(os.path.join(data_path, r'model_gpt_chf'))

In [None]:
tokenizer = AutoTokenizer.from_pretrained(os.path.join(data_path, r'gpt_chf'))
model1 = AutoModelForCausalLM.from_pretrained(os.path.join(data_path, r'model_gpt_chf'))

In [None]:
prefix = 'берем свежие томаты '

In [None]:
tokens = tokenizer(prefix, return_tensors='pt')

In [None]:
# Number of tokens in the prompt; generation may add up to 50 more.
size = tokens['input_ids'].shape[1]
# Deterministic beam search. No temperature is passed: it only rescales
# logits when sampling, so it had no effect with do_sample=False.
output = model1.generate(
    **tokens,
    do_sample=False,
    max_length=size+50,
    repetition_penalty=5.,
    num_beams=10,
)
decoded = tokenizer.decode(output[0])
# Cut the first len(prefix) characters of the decoded text and print the
# prefix followed by the continuation.
result = decoded[len(prefix):]
print(prefix + result)

### Дообучение T5

In [2]:
from datasets import load_dataset

In [3]:
dataset_train = load_dataset('IlyaGusev/gazeta', revision="v1.0", split= 'train[:10%]')
dataset_test = load_dataset('IlyaGusev/gazeta', revision="v1.0", split= 'test[:10%]')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
model_name = "IlyaGusev/rut5_base_sum_gazeta"

In [5]:
def len_tok(text):
    """Return the number of whitespace-separated pieces in ``text``."""
    pieces = text.split()
    return len(pieces)

In [6]:
# Longest summary and longest title of the training split, measured in
# whitespace-separated words.
max_len_sum = max(len_tok(summary) for summary in dataset_train['summary'])
max_len_tl = max(len_tok(title) for title in dataset_train['title'])
max_len_sum, max_len_tl

(75, 18)

In [7]:
max_len_sum, max_len_tl = 60, 15

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize a batch of summaries (inputs) and titles (targets).

    Both fields are padded/truncated to fixed lengths (``max_len_sum`` for the
    summaries, ``max_len_tl`` for the titles).

    Args:
        batch: mapping with ``'summary'`` and ``'title'`` lists of strings.

    Returns:
        The tokenized summaries with an added ``'labels'`` entry holding the
        title token ids, where padding positions are replaced by -100.
    """
    tokenized_input = tokenizer(batch['summary'], padding='max_length', truncation=True, max_length=max_len_sum)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tl)
    # Label positions set to -100 are ignored by the loss; leaving the pad id
    # in place would train the model to predict padding after every title.
    pad_id = tokenizer.pad_token_id
    tokenized_input['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in ids]
        for ids in tokenized_label['input_ids']
    ]
    return tokenized_input


# Tokenize both splits in batches of 8 examples.
dataset_train = dataset_train.map(tokenize, batched=True, batch_size=8)
dataset_test = dataset_test.map(tokenize, batched=True, batch_size=8)
# Return the model input columns as numpy arrays.
dataset_train.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
dataset_test.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

In [9]:
dataset_train.save_to_disk(r'/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/gazeta/train')
dataset_test.save_to_disk(r'/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/gazeta/test')

Saving the dataset (0/1 shards):   0%|          | 0/5240 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/577 [00:00<?, ? examples/s]

In [10]:
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [11]:
# Output directory and training configuration for T5 fine-tuning.
output_dir = r'/media/dmitriy/Disk/Downloads/ai_nlp_hw_data/hw_14/gazeta/output'
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_accumulation_steps=1, # Number of eval steps to keep on the GPU (the higher, the more VRAM is used)
    prediction_loss_only=True, # Compute only the loss and no other metrics during evaluation; this uses less RAM
    learning_rate=0.00001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=1000, # How often to save a checkpoint
    save_total_limit=1, # Maximum number of checkpoints to keep
    remove_unused_columns=True, # Removes unused columns from the dataset
    run_name='run_gazeta', # Wandb run name
    logging_steps=500, # How often to log loss to wandb
    eval_steps=500, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to also log the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found during evaluation at the end of training.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False # Best model is the one with the lowest loss, not highest.
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test
)
trainer.train()

In [None]:
trainer.save_model(output_dir + '/model')

In [None]:
INX = 100
print("SUMMARY: | {}".format(dataset_test['summary'][INX]))
print("TITLE: | {}".format(dataset_test['title'][INX]))

In [None]:
device = "cuda"

In [None]:
input_text = dataset_test['summary'][INX]
# Switch to evaluation mode so dropout is disabled during generation.
model.eval()
# Put the inputs on whatever device the model is on, rather than on a
# hard-coded device name that may not match it.
run_device = model.device
with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')
    source_ids = tokenized_text['input_ids'].to(run_device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(run_device, dtype = torch.long)
    # Beam search without sampling. No temperature is passed: it only affects
    # sampling, so it had no effect here.
    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask,
        max_length=512,
        num_beams=7,
        repetition_penalty=1,
        length_penalty=1,
        early_stopping=True,
        no_repeat_ngram_size=2
    )
    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print("\noutput:\n" + pred)