# 基于GLM的文本摘要

#### Step1. 导入相关包

In [None]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

#### Step2. 加载数据集

In [None]:
# Load the dataset that was previously saved to the local "./nlpcc_2017/" directory.
ds = Dataset.load_from_disk("./nlpcc_2017/")
# Bare expression: shows the dataset summary when run as the last line of a notebook cell.
ds

In [None]:
# Hold out 100 examples as the "test" split; the fixed seed keeps the split reproducible.
ds = ds.train_test_split(test_size=100, seed=42)

In [None]:
# Bare expression: displays the training split produced by train_test_split.
ds["train"]

#### Step3. 数据处理

In [None]:
# Load the tokenizer of the GLM checkpoint. trust_remote_code=True allows
# transformers to execute the tokenizer code shipped in the model repository,
# so only use it with a repository you trust.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-large-chinese", trust_remote_code=True)
# Uncomment the next line to inspect the tokenizer in a notebook cell.
# tokenizer

def process_func(examples):
    """Turn a batch of raw examples into GLM generation inputs.

    Each article in ``examples["content"]`` is wrapped as
    ``"摘要生成:\\n" + article + tokenizer.mask_token``, tokenized with
    truncation/padding to 384 tokens, and then handed to the tokenizer's
    ``build_inputs_for_generation`` together with ``examples["title"]`` as the
    generation targets (at most 64 generated tokens).

    Args:
        examples: batched mapping with "content" and "title" columns
            (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer.build_inputs_for_generation`` returns for the
        batch (presumably input ids, position ids, attention mask and labels;
        verify against the GLM tokenizer implementation).
    """
    prompts = []
    for article in examples["content"]:
        # The mask token marks the blank that the model is asked to fill with the summary.
        prompts.append("摘要生成:\n" + article + tokenizer.mask_token)

    encoded = tokenizer(
        prompts,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    return tokenizer.build_inputs_for_generation(
        encoded,
        targets=examples["title"],
        padding=True,
        max_gen_length=64,
    )

# Apply process_func to every split in batches and drop all original columns,
# keeping only the fields produced by the tokenizer.
tokenized_ds = ds.map(process_func, batched=True, remove_columns=ds["train"].column_names)
print(tokenized_ds)
# Print the decoded first training example. A bare expression in the middle of
# a cell is evaluated and thrown away, so it has to be printed to be visible.
print(tokenizer.decode(tokenized_ds["train"][0]["input_ids"]))
# The labels are shown raw instead of decoded because they contain -100 entries,
# which are not valid token ids.
tokenized_ds["train"][0]["labels"]

In [None]:
# Show the raw token ids of the first processed training example.
print(tokenized_ds["train"][0]["input_ids"])

#### Step4. 创建模型

In [None]:
# Load the GLM checkpoint through the seq2seq auto class. trust_remote_code=True
# lets transformers execute the modeling code shipped in the model repository,
# so only use it with a repository you trust.
model = AutoModelForSeq2SeqLM.from_pretrained("THUDM/glm-large-chinese", trust_remote_code=True)

#### Step6. 配置训练参数

In [None]:
# Training configuration.
args = Seq2SeqTrainingArguments(
    # Checkpoints and logs are written below this directory.
    output_dir="./summary_glm",
    # A single pass over the training split.
    num_train_epochs=1,
    # 4 samples per forward pass x 8 accumulation steps = 32 samples per
    # optimizer update on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=8,
    # Report the training loss every 8 update steps.
    logging_steps=8,
)

#### Step7. 创建训练器

In [None]:
# Build the trainer from the arguments, model and tokenized training split.
# No eval_dataset or data_collator is passed here, so the trainer falls back to
# its default collator (NOTE(review): this presumably works because
# process_func already pads every example to a fixed length; confirm).
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    train_dataset=tokenized_ds["train"],
    tokenizer=tokenizer
)

#### Step8. 模型训练

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

#### Step9. 模型推理

In [None]:
# Summarize the last test article as a quick sanity check of the fine-tuned model.
input_text = ds["test"][-1]["content"]
# Use exactly the prompt prefix that process_func used during training
# ("摘要生成:\n", without a space before the newline) so that the inference
# input has the same format the model was fine-tuned on.
inputs = tokenizer("摘要生成:\n" + input_text + tokenizer.mask_token, return_tensors="pt")
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)
# Move the inputs to wherever the model lives instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# trainer.train() leaves the model in training mode; switch to evaluation mode
# so that dropout does not perturb generation.
model.eval()
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)
# decode() takes token ids and returns a str, so the tensor must be converted
# with .tolist() *before* decoding (a str has no .tolist()).
tokenizer.decode(output[0].tolist())

In [None]:
import torch
# Put the model into evaluation mode before batch prediction; eval() returns
# the module itself, so the rebinding keeps the same object.
model = model.eval()

def predic_test():
    """Generate a summary for every example in ``ds["test"]``.

    Each article is wrapped in the same prompt format used for training
    (``"摘要生成:\\n" + content + mask token``), passed through
    ``model.generate`` with sampling, and the text following the
    ``<|startofpiece|>`` marker is kept as the summary, with the
    ``<|endofpiece|>`` marker removed.

    Returns:
        list[str]: one generated summary per test example, in dataset order.
    """
    predict = []
    with torch.inference_mode():
        for d in ds["test"]:
            # Same prompt prefix as process_func, so inference matches training.
            inputs = tokenizer("摘要生成:\n" + d["content"] + tokenizer.mask_token, return_tensors="pt")
            inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)
            # Follow the model's device instead of hard-coding "cuda".
            inputs = inputs.to(model.device)
            output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)
            decoded = tokenizer.decode(output[0].tolist())
            # Keep only the generated piece and strip the end-of-piece marker
            # (spelled "<|endofpiece|>", mirroring "<|startofpiece|>").
            summary = decoded.split("<|startofpiece|>")[1].replace("<|endofpiece|>", "").strip()
            predict.append(summary)
            print("curID:", len(predict))
    return predict

In [None]:
# Generate summaries for the whole test split (one model.generate call per example).
result = predic_test()

In [None]:
from rouge_chinese import Rouge

rouge = Rouge()

# Insert a space between every character of each prediction and reference
# title (presumably so that ROUGE scores character-level overlap for Chinese
# text; confirm against the rouge_chinese documentation).
decode_preds = [" ".join(prediction) for prediction in result]
decode_labels = [" ".join(title) for title in ds["test"]["title"]]

# avg=True returns one averaged score dict instead of one per example.
scores = rouge.get_scores(decode_preds, decode_labels, avg=True)

# Report only the "f" value of each ROUGE variant.
{metric: scores[metric]["f"] for metric in ("rouge-1", "rouge-2", "rouge-l")}