In [None]:
%pip install datasets >> None
%pip install evaluate >> None
%pip install transformers >> None
%pip install rouge-score >> None
%pip install nltk >> None
%pip install ipywidgets >> None
%pip install transformers[torch] >> None
%pip install accelerate -U >> None


Логин в Hugging Face. Может запросить access token.

In [2]:
import transformers
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub via the notebook login widget.
notebook_login()

# Print the installed transformers version for reference.
print(transformers.__version__)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

4.43.2


Fine-tuning модели для задачи суммаризации

Используем модель t5-small и датасет XSum (extreme summarization): новости и их саммари в одном предложении.

In [3]:
# Identifiers of the dataset to load and the pretrained checkpoint to fine-tune.
dataset = "xsum"
model_checkpoint = "t5-small"

Загружаем и исследуем датасет

In [4]:
from datasets import load_dataset
from evaluate import load

# ROUGE scorer, used by the evaluation callback defined later in the notebook.
metric = load("rouge")

# Load the dataset named by `dataset`; trust_remote_code=True permits running
# the dataset's own loading script.
raw_datasets = load_dataset(dataset, trust_remote_code=True)

Изучаем выборку

In [5]:
# Display the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [6]:
# Peek at the first two training records (returned as a dict of column -> list).
raw_datasets["train"][:2]

  'A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice have appealed for information about the attack.\nInsp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.\n"While the exact cause is still under investigation, it is thought that the fire was started deliberately."'],
 'summary': ['Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway a

In [7]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

Предобработка данных

Используем токенайзер от модели

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the chosen checkpoint so token ids line up
# with the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Compare how the tokenizer encodes a Russian sentence and an English one.
for sample_text in ('Я иду в школу', 'Im going to school'):
    print(tokenizer(sample_text))

{'input_ids': [3, 2, 3, 2795, 5814, 3700, 8724, 3, 2, 12377, 6588, 3700, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1318, 352, 12, 496, 1], 'attention_mask': [1, 1, 1, 1, 1]}


Задаём функцию для преобразования текста в токены. Важно - добавляем к тексту промпт ("summarize:").

In [10]:
# Task prompt prepended to every source document.
prefix = "summarize: "

# Truncation limits (in tokens) for the source document and the target summary.
max_target_length = 128
max_input_length = 1024

def preprocess_function(examples):
    """Tokenize a batch of records into model inputs and labels.

    Args:
        examples: Mapping with a "document" list (source texts) and a
            "summary" list (reference summaries).

    Returns:
        The tokenizer output for the prompt-prefixed documents, extended with
        a "labels" entry holding the token ids of the tokenized summaries.
    """
    prompted_docs = []
    for document in examples["document"]:
        prompted_docs.append(prefix + document)

    encoded = tokenizer(prompted_docs, max_length=max_input_length, truncation=True)

    # Summaries are passed as `text_target` so they are tokenized as targets.
    encoded_targets = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
# Sanity check: run the preprocessing on the first two training records.
preprocess_function(raw_datasets['train'][:2])

Токенизируем датасет с добавленным промптом 

In [11]:
# Tokenize every split; batched=True hands preprocess_function batches of
# records (a dict of lists) instead of one record at a time.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Дообучаем модель

Так как исходная и новая модели работают в режиме Sequence-to-Sequence, используем AutoModelForSeq2SeqLM

In [12]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained checkpoint as a sequence-to-sequence language model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) 

Задаём гиперпараметры обучения

In [14]:
import torch

# Per-device batch size used for both training and evaluation.
batch_size = 4

# Strip any "org/" namespace from the checkpoint id to build the output name.
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",  # output directory (and Hub repo name)
    eval_strategy="epoch",           # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,              # keep at most three checkpoints on disk
    num_train_epochs=1,
    predict_with_generate=True,      # evaluate on generated text, as ROUGE needs
    # Enable fp16 mixed precision only when a CUDA device is present, so the
    # cell also runs on machines without one.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

Задаём упаковщик (data collator), который собирает примеры в батчи для параллельного обучения

In [15]:
# Collator that assembles tokenized examples into batches for the seq2seq model,
# padding inputs and labels to a common length within each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Используем метрику оценки Rouge, также будем замерять длину сгенерированных summary

In [16]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first
            element is taken as the generated token ids.

    Returns:
        dict: every score returned by ``metric.compute`` scaled by 100, plus
        ``"gen_len"`` (mean number of non-padding tokens per prediction);
        all values are rounded to 4 decimals.

    Note:
        ``nltk.sent_tokenize`` requires NLTK's punkt sentence-tokenizer data
        to be available locally.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored/padded positions and cannot be decoded, so swap it
    # for the pad token in both predictions and labels before decoding.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True,
    )
    # Express every score as a percentage
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length (padding positions are not counted)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Передаём параметры в trainer

In [17]:
# Wire the model, training arguments, tokenized train/validation splits,
# collator, tokenizer and metric callback into a single trainer.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Запускаем обучение

In [18]:
# Run fine-tuning with the arguments configured above.
trainer.train()

  0%|          | 0/51012 [00:00<?, ?it/s]

{'loss': 3.1294, 'grad_norm': 3.2733824253082275, 'learning_rate': 1.980396769387595e-05, 'epoch': 0.01}


KeyboardInterrupt: 