In [1]:
from tokenize import tokenize

#导入相关的包
import torch
from transformers import AutoModel, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer,Seq2SeqTrainingArguments, AutoTokenizer

In [2]:
#加载数据集
from datasets import Dataset
ds = Dataset.load_from_disk("./nlpcc_2017")
ds

Dataset({
    features: ['title', 'content'],
    num_rows: 5000
})

In [3]:
ds = ds.train_test_split(100,seed=42)

In [4]:
#数据处理
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("D:\Hugging Face Hub\mengzi-t5-base",use_sentencepiece=True)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
def process_fun(examples):
    content =["摘要生成:\n"+i for i in examples["content"]]#相当于在提示模型要干什么
    inputs = tokenizer(content,max_length=384,truncation=True)
    labels = tokenizer(examples["title"],max_length=384,truncation=True)
    inputs["labels"] = labels["input_ids"]
    return inputs

In [6]:
tokenizer_ds = ds.map(process_fun,batched=True)

Map:   0%|          | 0/4900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [7]:
#创建模型
model = AutoModelForSeq2SeqLM.from_pretrained("D:\Hugging Face Hub\mengzi-t5-base")


In [8]:

#创建评估函数
import numpy as np
from rouge_chinese import Rouge
def compute_metric(pred):
    predictions,labels = pred
    decode_preds = tokenizer.batch_decode(predictions,skip_special_tokens=True)
    labels =np.where(labels !=100,labels,tokenizer.pad_token_id)
    decode_labels = tokenizer.batch_decode(labels,skip_special_tokens=True)
    decode_preds = [" ".join(i) for i in decode_preds]
    decode_labels = [" ".join(i) for i in decode_labels]
    scores = Rouge.get_scores(decode_preds,decode_labels)
    return {
        "rouge-1":scores["rouge-1"]["f"],
        "rouge-2":scores["rouge-2"]["f"],
        "rouge-l":scores["rouge-l"]["f"],
    }

In [9]:
#配置训练参数
args = Seq2SeqTrainingArguments(
    output_dir="./seq2seq",
    per_device_eval_batch_size=4,
    per_gpu_train_batch_size=2,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model = "rouge-1",
    predict_with_generate=True

)

In [13]:
#
trainer = Seq2SeqTrainer(
    args = args,
    model = model,
    train_dataset=tokenizer_ds["train"],
    eval_dataset = tokenizer_ds["test"],
    compute_metrics = compute_metric,
    tokenizer = tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer)
)

  trainer = Seq2SeqTrainer(
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


In [14]:
#训练
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [15]:
#模型推理
from transformers import pipeline
pipe = pipeline("text2text-generation",model=model,tokenizer=tokenizer,device=0)

Device set to use cuda:0


In [21]:
pipe("摘要生成：\n"+ds["test"][2]["content"],max_length=128,do_sample=True)

Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': '黑龙江省:非法生产、销售和使用“伪基站”致“伪基站”案件,共打掉200个“伪基站”案件。'}]

In [20]:
ds["test"][2]["title"]

'黑龙江:报道揭露黑龙江”伪基站“盛行,每天向百万手机用户发放垃圾短信。'