In [7]:
# 1. Импорт библиотек
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

In [8]:

# 2. Load the data
dataset = load_dataset("RussianNLP/Mixed-Summarization-Dataset")

# The "test" split is the one bound to the validation variable here.
train_ds, val_ds = dataset["train"], dataset["test"]


# 3. Tokenization
# Alternative checkpoints: "t5-small", "google/pegasus-xsum".
# NOTE(review): the dataset id suggests Russian text while bart-base is
# presumably an English-pretrained checkpoint - confirm the model choice.
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
def preprocess(examples):
    """Tokenize a batch of source texts and their reference summaries.

    Args:
        examples: Batch mapping with a "text" entry (source documents) and a
            "summary" entry (target summaries); both are passed straight to
            the module-level ``tokenizer``.

    Returns:
        The tokenizer output for the source texts (truncated/padded to 512
        tokens) with an added "labels" entry holding the summary token ids
        (truncated/padded to 128 tokens). Padding positions inside the labels
        are replaced by -100.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",  # or "longest" to save memory
    )

    labels = tokenizer(
        examples["summary"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # -100 is the index ignored by the cross-entropy loss. Without this
    # replacement the model would be trained to predict the padding tokens,
    # which dominate short summaries padded out to 128 positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the batched preprocessing over the training split first, then the
# validation split; each result is bound to its own name.
tokenized_train, tokenized_val = (
    split.map(preprocess, batched=True) for split in (train_ds, val_ds)
)

Map:   0%|          | 0/197561 [00:00<?, ? examples/s]

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

In [None]:
# 5. Model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 6. Training
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and the Trainer's `tokenizer` argument to
# `processing_class` - confirm against the installed version.
args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",   # run validation at the end of every epoch
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_steps=100,
    # Mixed precision is only usable on a CUDA device; enabling it
    # unconditionally makes this cell fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)

trainer.train()

In [None]:
# Inference: summarize a single document with the model.
text = "Long article text goes here..."

# Switch off dropout so that generation is not perturbed by training-mode layers.
model.eval()

# The tokenizer returns CPU tensors; move them to whatever device the model
# lives on (it may be on the GPU after training), otherwise generate() fails
# with a device mismatch.
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
summary_ids = model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))