In [None]:
!pip install torch numpy peft datasets evaluate rouge_chinese -q

## 导包

In [None]:
import torch
import numpy as np
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
from rouge_chinese import Rouge

In [None]:
# Identifier of the pretrained checkpoint; passed to both
# AutoTokenizer.from_pretrained and AutoModelForSeq2SeqLM.from_pretrained below.
model_name_or_path = "Langboat/mengzi-t5-base"
# Dataset name/path handed to `load_dataset` below.
file_path = "hugcyp/LCSTS"

## 加载数据集

In [None]:
# Load the dataset identified by `file_path`; num_proc=4 asks `datasets` to use
# four processes while preparing it.
ds = load_dataset(file_path, num_proc=4)

In [None]:
# Print the loaded object to inspect what it contains before preprocessing.
print(ds)

In [None]:
# Keep only the first 8000 examples of the "train" split; other splits are left
# untouched. Presumably this is to keep fine-tuning time manageable.
ds["train"] = ds["train"].select(range(8000))

## 数据预处理

In [None]:
# Tokenizer belonging to the checkpoint named by `model_name_or_path`; used for
# preprocessing, batch collation and decoding predictions during evaluation.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [None]:
def data_pipe(example):
    """Tokenize one batch of (text, summary) pairs for seq2seq training.

    Args:
        example: batch mapping with a "text" list (source documents) and a
            "summary" list (reference summaries).

    Returns:
        The tokenizer output for the prompted source texts (truncated to 64
        tokens), with an extra "labels" entry holding the token ids of the
        summaries (truncated to 32 tokens).
    """
    # Prepend the task instruction to every source document.
    prompts = []
    for document in example["text"]:
        prompts.append("摘要生成：\n" + document)

    model_inputs = tokenizer(text=prompts, truncation=True, max_length=64)

    # `text_target` routes the summaries through the target-side tokenization.
    summary_encodings = tokenizer(
        text_target=example["summary"], truncation=True, max_length=32
    )

    model_inputs["labels"] = summary_encodings["input_ids"]
    return model_inputs

In [None]:
# Run `data_pipe` over the dataset in batches (batched=True hands the function
# a dict of column lists rather than a single row).
tokenized_ds = ds.map(data_pipe, batched=True)

## 模型

In [None]:
# Load the pretrained sequence-to-sequence model that will be adapted with LoRA.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

In [None]:
# LoRA adapter configuration.
# The task is declared through `task_type` (not `peft_type`, which identifies
# the adapter method itself); with the task type set, `get_peft_model` can
# select the wrapper intended for sequence-to-sequence language models.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,  # adapters are being trained, not only used for inference
    r=8,                   # rank of the low-rank update matrices
    lora_alpha=32,         # scaling factor applied to the LoRA update
    lora_dropout=0.1,      # dropout applied inside the LoRA layers
)

In [None]:
# Wrap the base model according to `peft_config`.
# NOTE: this rebinds `model`; re-running this cell on its own would pass the
# already wrapped model back into get_peft_model, so re-run the model-loading
# cell first.
model = get_peft_model(model, peft_config)

In [None]:
# Sanity check: report the trainable-parameter summary of the wrapped model.
model.print_trainable_parameters()

## 性能指标

In [None]:
# Single ROUGE scorer instance, reused by `compute_metrics` on every evaluation.
rouge = Rouge()

In [None]:
def compute_metrics(evalPred):
    """Compute character-level ROUGE F1 scores for generated summaries.

    Args:
        evalPred: pair ``(predictions, labels)`` of token-id arrays. Positions
            holding -100 are treated as padding in both arrays. ``predictions``
            may also be a tuple whose first element is the token-id array.

    Returns:
        dict with keys "rouge-1", "rouge-2" and "rouge-l", each mapped to the
        averaged F1 score reported by the ROUGE scorer.
    """
    predictions, labels = evalPred
    # Some setups hand over a tuple of outputs; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id, so it must be
    # replaced before decoding -- in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put a space between characters so the scorer treats each character as
    # one token. A prediction that decodes to nothing is replaced by a
    # placeholder token, because the scorer raises on an empty hypothesis and
    # that would abort the whole evaluation; the placeholder simply scores 0.
    decode_preds = [" ".join(p) if p.strip() else "<empty>" for p in decode_preds]
    decode_labels = [" ".join(l) for l in decode_labels]

    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"],
    }

## 训练参数

In [None]:
# Training / evaluation hyperparameters for the Seq2SeqTrainer.
# NOTE(review): generation length is left at the model's default while targets
# are tokenized up to 32 tokens -- consider setting `generation_max_length` if
# evaluated summaries look truncated.
training_args = Seq2SeqTrainingArguments(
    output_dir="./mengzi_lcsts",
    num_train_epochs=5,
    learning_rate=5e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    # Checkpoint on the same schedule as evaluation so every evaluated state
    # has a checkpoint that can be restored as the best model.
    save_strategy="epoch",
    save_total_limit=3,
    # `metric_for_best_model` only selects a checkpoint when the best model is
    # actually loaded at the end of training.
    load_best_model_at_end=True,
    metric_for_best_model="rouge-l",
    greater_is_better=True,  # higher ROUGE-L is better
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
)

## Trainer

In [None]:
# Collator that assembles tokenized examples into padded seq2seq batches.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Wire model, data, metrics and hyperparameters together.
# NOTE(review): newer transformers releases may expect `processing_class`
# instead of `tokenizer` here -- confirm against the installed version.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=seq2seq_collator,
    compute_metrics=compute_metrics,
)

## 训练

In [None]:
# Start fine-tuning; per `training_args`, evaluation runs once per epoch.
trainer.train()