# PEFT 微调用于对话摘要的生成式 AI 模型


## First, check that the correct kernel is chosen.

<img src="img/kernel_set_up.png" width="300"/>

You can click on that to see and check the details of the image, kernel, and instance type.

<img src="img/w3_kernel_and_instance_type.png" width="600"/>

In [2]:
# 导入datasets库的load_dataset函数，用于加载数据集。
from datasets import load_dataset

# 导入transformers库的一些模型和功能。AutoModelForSeq2SeqLM用于加载序列到序列的模型，AutoTokenizer用于加载tokenizer，
# GenerationConfig用于配置模型生成的参数，TrainingArguments用于配置训练参数，Trainer用于训练模型。
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

# 导入torch库，是一个用于深度学习的开源库。
import torch

# 导入time库，用于处理时间相关的操作。
import time

# 导入evaluate模块，通常用于评估模型的性能（这个模块的具体内容会根据你的代码库和项目有所不同）。
import evaluate

# 导入pandas库，这是一个用于数据处理和分析的库。
import pandas as pd

# 导入numpy库，这是一个用于数值计算的库。
import numpy as np

# 导入tqdm库，这是一个快速，可扩展的Python进度条，可以在长循环中添加一个进度提示信息，用户只需要封装任意的迭代器tqdm(iterator)。
from tqdm import tqdm

# tqdm库的progress_apply方法可以方便地将进度条应用到Pandas的DataFrame或Series的apply方法上。
tqdm.pandas()

## 加载数据集和 LLM

In [3]:
# 从Hugging Face的数据集库中加载一个名为"knkarthick/dialogsum"的数据集，并将其保存到变量dataset中。

# 设置Hugging Face数据集库中的数据集名称。
huggingface_dataset_name = "knkarthick/dialogsum"

# 使用load_dataset函数加载指定名称的数据集。
dataset = load_dataset(huggingface_dataset_name)

# 显示加载的数据集。
dataset

Found cached dataset csv (/root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-1d0df498900a79f1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
# 从Hugging Face的模型库中加载一个预训练的序列到序列模型和对应的tokenizer。这里选择的模型是谷歌的'flan-t5-base'模型。
# AutoModelForSeq2SeqLM.from_pretrained函数会根据提供的名称从Hugging Face的模型库中查找并下载对应的预训练模型。
# 这个模型是一个序列到序列（Seq2Seq）模型，适用于像机器翻译、文本摘要等任务。
# AutoTokenizer.from_pretrained函数同样会从模型库中查找并下载对应的tokenizer。
# Tokenizer是用于将原始文本转化为模型可以处理的格式的工具。

# 设置Hugging Face模型库中的模型名称。
model_name='google/flan-t5-base'

# 使用AutoModelForSeq2SeqLM的from_pretrained方法加载预训练模型。torch_dtype=torch.bfloat16表示模型的张量数据类型为bfloat16，
# 这种数据类型可以在保持大部分精度的同时，减少模型的内存占用，加速计算。
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# 使用AutoTokenizer的from_pretrained方法加载与预训练模型相对应的tokenizer，用于将原始文本转化为模型可以处理的格式。
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# 这段代码定义了一个函数print_number_of_trainable_model_parameters，用于打印给定模型的可训练参数数量、总参数数量以及可训练参数占总参数的百分比。

# 定义函数，接受一个模型作为输入。
def print_number_of_trainable_model_parameters(model):
    # 初始化可训练参数和总参数的计数器。
    trainable_model_params = 0
    all_model_params = 0

    # 遍历模型的所有参数。
    for _, param in model.named_parameters():
        # 计算所有参数的总数。numel()函数返回参数中元素的数量。
        all_model_params += param.numel()

        # 如果参数需要梯度（即可训练），则计算可训练参数的数量。
        if param.requires_grad:
            trainable_model_params += param.numel()

    # 返回一个格式化的字符串，显示可训练参数数量、总参数数量和可训练参数占总参数的百分比。
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params}%"

# 调用函数，打印模型的参数信息。
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.0%


## 使用Zero Shot 推理测试模型

In [6]:
# 这段代码的功能是从测试数据集中选择一条对话数据，使用预训练的模型生成对应的对话摘要，并将人工编写的摘要（基线）和模型生成的摘要进行比较。

# 选择测试数据集中的一个索引。
index = 200

# 从测试数据集中获取对话和摘要。
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

# 构建模型输入的提示，包括要求模型进行对话摘要的指示和实际的对话内容。
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# 使用tokenizer将输入提示转换为模型可以处理的形式，返回张量格式的输入。
inputs = tokenizer(prompt, return_tensors='pt')

# 使用模型生成摘要。generate函数接受输入ID和最大新生成令牌数（这里设为200个），然后返回生成的摘要的ID。
# decode函数将生成摘要的ID转换回文本形式，并跳过特殊令牌（比如开始和结束令牌）。
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

# 定义一个分隔线，用于在打印输出时分隔不同部分的内容。
dash_line = ('-'.join('' for x in range(100)))

# 打印输入的提示、基线人工摘要和模型生成的摘要。
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

## 执行全面微调 Perform Full Fine-Tuning

In [7]:
# 这段代码首先定义了一个函数tokenize_function，该函数将原始对话和摘要文本转换为模型训练所需的格式，然后对数据集应用这个函数，最后移除了数据集中不需要的列。

# 定义函数，接受一个example（包含对话和摘要的字典）作为输入。
def tokenize_function(example):
    # 定义模型输入的提示，包括开始和结束的提示语。
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '

    # 将对话和提示语合并，生成模型的输入。
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # 使用tokenizer将输入提示转换为模型可以处理的形式，返回张量格式的输入。
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids

    # 使用tokenizer将摘要转换为模型可以处理的形式，作为训练目标（标签）。
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    # 返回处理后的example。
    return example

# 使用map函数将tokenize_function应用到数据集的每一个元素，参数batched=True表示按批次处理数据。
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 使用remove_columns函数移除数据集中不需要的列，包括'id'、'topic'、'dialogue'和'summary'。
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

Loading cached processed dataset at /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-1d0df498900a79f1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-5f46830011c7eba2.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-1d0df498900a79f1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-6aff51518f8c17a6.arrow


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [8]:
# 使用filter函数筛选出数据集中索引为100的倍数的样本。
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-1d0df498900a79f1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-dc63ff0c1c2e5347.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-1d0df498900a79f1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-54224b7cd734d868.arrow


Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [9]:
# 打印训练、验证和测试数据集的形状，即样本数量。
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

# 打印数据集的详细信息，包括数据集的名称、版本、描述、许可证、格式等。
print(tokenized_datasets)

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [10]:
# 设置输出目录的名称，其中包含了当前的时间戳，以区分不同的训练运行。
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

# 初始化TrainingArguments实例，定义了训练过程中的一些参数。
training_args = TrainingArguments(
    # 指定模型和训练日志的保存路径。
    output_dir=output_dir,
    # 设置学习率，决定了模型参数在训练中的更新速度。
    learning_rate=1e-5,
    # 设置训练的轮数，每一轮会遍历整个训练数据集一次。
    num_train_epochs=1,
    # 设置权重衰减，用于正则化模型，防止过拟合。
    weight_decay=0.01,
    # 指定每隔多少步打印训练日志。
    logging_steps=1,
    # 指定训练的最大步数，如果设为1，那么训练将在完成一步后立即结束。
    max_steps=1
)

# 初始化Trainer实例，用于进行模型训练。
trainer = Trainer(
    # 指定要训练的模型。
    model=original_model,
    # 指定训练参数。
    args=training_args,
    # 指定训练数据集。
    train_dataset=tokenized_datasets['train'],
    # 指定验证数据集，用于在训练过程中评估模型的性能。
    eval_dataset=tokenized_datasets['validation']
)

In [11]:
# 开始训练模型
trainer.train()



Step,Training Loss
1,49.5


TrainOutput(global_step=1, training_loss=49.5, metrics={'train_runtime': 3.2011, 'train_samples_per_second': 2.499, 'train_steps_per_second': 0.312, 'total_flos': 5478058819584.0, 'train_loss': 49.5, 'epoch': 0.06})

In [12]:
!aws s3 cp --recursive s3://dsoaws/models/flan-dialogue-summary-checkpoint/ ./flan-dialogue-summary-checkpoint/

download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/generation_config.json to flan-dialogue-summary-checkpoint/generation_config.json
download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/training_args.bin to flan-dialogue-summary-checkpoint/training_args.bin
download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/config.json to flan-dialogue-summary-checkpoint/config.json
download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/rng_state.pth to flan-dialogue-summary-checkpoint/rng_state.pth
download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/scheduler.pt to flan-dialogue-summary-checkpoint/scheduler.pt
download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/trainer_state.json to flan-dialogue-summary-checkpoint/trainer_state.json
download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/pytorch_model.bin to flan-dialogue-summary-checkpoint/pytorch_model.bin
download: s3://dsoaws/models/flan-dialogue-summary-checkpoint/optimizer.pt to fl

In [12]:
!ls -al ./flan-dialogue-summary-checkpoint/pytorch_model.bin

-rw-r--r-- 1 root root 990408885 Apr 10 17:51 ./flan-dialogue-summary-checkpoint/pytorch_model.bin


## 对模型进行定性评估

In [13]:
import torch

# 检查是否有可用的GPU
print(torch.cuda.is_available())

# 获取GPU的数量
print(torch.cuda.device_count())

# 获取当前设备的索引，如果你在使用GPU，这将返回你正在使用的GPU的索引
print(torch.cuda.current_device())

# 获取当前设备的名称
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
1
0
Tesla T4


In [14]:
# 判断设备是否有GPU资源
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)

# 从指定路径加载预训练的序列到序列的语言模型，并指定模型的数据类型为bfloat16
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./flan-dialogue-summary-checkpoint", torch_dtype=torch.bfloat16).to(device)
# torch_dtype=torch.bfloat16是一个指定模型数据类型的参数。
# bfloat16是一种16位宽的浮点数数据类型，相比32位的float32数据类型，它可以减少模型的内存占用和计算时间，但可能会略微降低模型的精度。

In [15]:
# 这段代码的主要目的是比较两个模型：original_model和instruct_model在对同一对话进行摘要的性能。同时，也与人类的base line摘要进行了比较。

# 选择测试数据集中的一个样本的索引
index = 200
# 提取此索引对应的对话
dialogue = dataset['test'][index]['dialogue']
# 提取此索引对应的人类基线摘要
human_baseline_summary = dataset['test'][index]['summary']

# 构造模型的输入提示，包含对话和需要模型进行的任务
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# 使用tokenizer将输入提示转化为模型可以接受的输入形式
# 准备输入数据时，确保与模型在同一设备上
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# 使用original_model生成对话摘要
original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
# 将模型生成的摘要从id形式转化为文本形式
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# 使用instruct_model生成对话摘要
instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
# 将模型生成的摘要从id形式转化为文本形式
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

# 打印出人类基线摘要、original_model生成的摘要和instruct_model生成的摘要
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1#: I'm thinking of upgrading my computer.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.


## 对模型进行定量评估

In [16]:
rouge = evaluate.load('rouge')  # 加载'rouge'评价指标
# ROUGE (Recall-Oriented Understudy for Gisting Evaluation)是一种评估自动文本摘要的方法，尤其在机器翻译和自动摘要生成等任务中使用较多。
# ROUGE主要通过比较机器生成的摘要和人类编写的参考摘要之间的重叠来评价机器生成的摘要的质量。

生成测试数据集样本的输出（为了节省时间，只有 10 个对话和摘要），然后保存结果。

In [17]:
# 这段代码的主要目标是对测试数据集中的一些对话进行摘要，并将人类编写的摘要与两种不同模型生成的摘要进行比较。

# 从数据集中获取测试数据的前10条对话和对应的人类编写的摘要
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

# 初始化列表，用于保存两种模型生成的摘要
original_model_summaries = []
instruct_model_summaries = []

# 对测试数据中的每一条对话进行迭代
for _, dialogue in enumerate(tqdm(dialogues)):
    # 定义模型的输入提示
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    # 使用tokenizer处理输入提示，并获取输入的id
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # 使用original_model生成对话摘要
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    # 将模型生成的摘要从id形式转化为文本形式
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    # 将生成的摘要添加到original_model_summaries列表中
    original_model_summaries.append(original_model_text_output)

    # 使用instruct_model生成对话摘要
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    # 将模型生成的摘要从id形式转化为文本形式
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    # 将生成的摘要添加到instruct_model_summaries列表中
    instruct_model_summaries.append(instruct_model_text_output)

# 将人类编写的摘要、original_model生成的摘要和instruct_model生成的摘要组合在一起
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

# 设置pandas显示列的最大宽度
pd.set_option('display.max_colwidth', 500)
# 创建一个pandas的DataFrame，用于显示每一条对话的人类编写的摘要、original_model生成的摘要和instruct_model生成的摘要
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df  # 显示这个DataFrame

100%|██████████| 10/10 [00:15<00:00,  1.57s/it]


Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.,#Person1#: I need to take a dictation for you.,#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# all office communications are restricted to email correspondence and official memos and the use of Instant Message programs by employees during working hours is strictly prohibited. #Person1# wants to change the communication methods and Ms. Dawson tells #Person1# it applies to internal and external communications.
1,"In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.",#Person1#: I need to take a dictation for you.,#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# all office communications are restricted to email correspondence and official memos and the use of Instant Message programs by employees during working hours is strictly prohibited. #Person1# wants to change the communication methods and Ms. Dawson tells #Person1# it applies to internal and external communications.
2,Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# still insists.,#Person1#: I need to take a dictation for you.,#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# all office communications are restricted to email correspondence and official memos and the use of Instant Message programs by employees during working hours is strictly prohibited. #Person1# wants to change the communication methods and Ms. Dawson tells #Person1# it applies to internal and external communications.
3,#Person2# arrives late because of traffic jam. #Person1# persuades #Person2# to use public transportations to keep healthy and to protect the environment.,The traffic jam at the Carrefour intersection is a problem.,#Person2# got stuck in traffic again. #Person1# suggests #Person2# start taking public transport system to work and suggests #Person2# start biking to work when it's nicer outside. #Person2# agrees.
4,#Person2# decides to follow #Person1#'s suggestions on quitting driving to work and will try to use public transportations.,The traffic jam at the Carrefour intersection is a problem.,#Person2# got stuck in traffic again. #Person1# suggests #Person2# start taking public transport system to work and suggests #Person2# start biking to work when it's nicer outside. #Person2# agrees.
5,"#Person2# complains to #Person1# about the traffic jam, #Person1# suggests quitting driving and taking public transportation instead.",The traffic jam at the Carrefour intersection is a problem.,#Person2# got stuck in traffic again. #Person1# suggests #Person2# start taking public transport system to work and suggests #Person2# start biking to work when it's nicer outside. #Person2# agrees.
6,#Person1# tells Kate that Masha and Hero get divorced. Kate is surprised because she thought they are perfect couple.,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can't believe it. #Person1# tells Kate they are having a separation for 2 months and filed for divorce. Kate thinks it's surprising and can't believe it.
7,#Person1# tells Kate that Masha and Hero are getting a peaceful divorce. Kate feels surprised and asks about their kids.,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can't believe it. #Person1# tells Kate they are having a separation for 2 months and filed for divorce. Kate thinks it's surprising and can't believe it.
8,#Person1# and Kate talk about the divorce between Masha and Hero. Kate feels surprised because she thought they are well matched,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can't believe it. #Person1# tells Kate they are having a separation for 2 months and filed for divorce. Kate thinks it's surprising and can't believe it.
9,#Person1# and Brian are at the birthday party of Brian. Brian thinks #Person1# looks great and is popular.,"#Person1#: Happy birthday, Brian. #Person2#: I'm so happy you're having a good time. #Person1#: Thank you, I'm sure you're having a good time. #Person2#: Thank you, I'm sure you're having a good time. #Person1#: Thank you, I'm sure you're having a good time. #Person2#: Thank you, I'm sure you're having a good time. #Person1#: Thank you, I'm sure you're having a good time.",Brian's birthday is coming. #Person1# invites Brian to have a dance and Brian compliments #Person1#'s looks. Brian thinks #Person1# looks great and invites #Person1# to have a drink together.


In [19]:
# 这段代码的主要目标是计算并打印两种模型生成的摘要与人类编写的摘要之间的ROUGE得分。ROUGE得分是用来评估自动文本摘要或机器翻译的质量的一种指标。

# 使用ROUGE指标计算original_model生成的摘要和人类编写的摘要之间的相似度
original_model_results = rouge.compute(
    predictions=original_model_summaries,  # original_model生成的摘要
    references=human_baseline_summaries[0:len(original_model_summaries)],  # 对应的人类编写的摘要
    use_aggregator=True,  # 使用聚合器，用于计算所有摘要的平均ROUGE得分
    use_stemmer=True,  # 使用词干提取，将单词变为其基本形式以进行比较
)

# 使用ROUGE指标计算instruct_model生成的摘要和人类编写的摘要之间的相似度
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,  # instruct_model生成的摘要
    references=human_baseline_summaries[0:len(instruct_model_summaries)],  # 对应的人类编写的摘要
    use_aggregator=True,  # 使用聚合器，用于计算所有摘要的平均ROUGE得分
    use_stemmer=True,  # 使用词干提取，将单词变为其基本形式以进行比较
)

# 打印original_model的ROUGE得分
print('ORIGINAL MODEL:')
print(original_model_results)

# 打印instruct_model的ROUGE得分
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': 0.23884559093833285, 'rouge2': 0.11535720375106562, 'rougeL': 0.21714203657752046, 'rougeLsum': 0.2175800707655546}
INSTRUCT MODEL:
{'rouge1': 0.41026607717457186, 'rouge2': 0.17840645241958838, 'rougeL': 0.2977022096267017, 'rougeLsum': 0.2987374187518165}


In [20]:
# 引入pandas库，并读取CSV文件的数据，将其存储在results变量中
results = pd.read_csv("data-peft/dialogue-summary-training-results-peft.csv")

# 从results数据框架中获取'human_baseline_summaries'列的值，并将其转换为numpy数组，存储在human_baseline_summaries变量中
human_baseline_summaries = results['human_baseline_summaries'].values

# 从results数据框架中获取'original_model_summaries'列的值，并将其转换为numpy数组，存储在original_model_summaries变量中
original_model_summaries = results['original_model_summaries'].values

# 从results数据框架中获取'instruct_model_summaries'列的值，并将其转换为numpy数组，存储在instruct_model_summaries变量中
instruct_model_summaries = results['instruct_model_summaries'].values

# 使用ROUGE (Recall-Oriented Understudy for Gisting Evaluation) 度量方法计算原始模型的预测摘要和人类基线摘要之间的相似度
# 使用聚合器和词干提取器进行计算，结果存储在original_model_results变量中
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# 使用ROUGE度量方法计算指导模型的预测摘要和人类基线摘要之间的相似度
# 使用聚合器和词干提取器进行计算，结果存储在instruct_model_results变量中
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# 打印原始模型的评估结果
print('ORIGINAL MODEL:')
print(original_model_results)

# 打印指导模型的评估结果
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': 0.2334158581572823, 'rouge2': 0.07603964187010573, 'rougeL': 0.20145520923859048, 'rougeLsum': 0.20145899339006135}
INSTRUCT MODEL:
{'rouge1': 0.42161291557556113, 'rouge2': 0.18035380596301792, 'rougeL': 0.3384439349963909, 'rougeLsum': 0.33835653595561666}


In [21]:
# 打印一条消息，表示我们将计算指导模型相对于人类基线模型的百分比改进情况
print("Absolute percentage improvement of INSTRUCT MODEL over HUMAN BASELINE")

# 计算指导模型的评估结果和原始模型的评估结果的差异，并将其转换为numpy数组
# 这个差异就是指导模型相对于原始模型的改进程度
improvement = (np.array(list(instruct_model_results.values())) - np.array(list(original_model_results.values())))

# 遍历指导模型的评估结果的每一个键（即评估指标名称）和对应的改进程度
# 将每一个改进程度乘以100，转换为百分比，并保留两位小数
# 打印每一个评估指标的名称和对应的百分比改进情况
for key, value in zip(instruct_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of INSTRUCT MODEL over HUMAN BASELINE
rouge1: 18.82%
rouge2: 10.43%
rougeL: 13.70%
rougeLsum: 13.69%


## Parameter Efficient Fine-Tuning (PEFT)

![](img/peft_lora.png)

## Lora的例子：

假设你有一本非常厚的书，这本书包含了许多的信息。然而，你需要在一个小书包中带着这本书。显然，这本书太大，无法放入书包。这个时候，Lora就像是一个压缩工具，它可以将这本书压缩成一本小册子，这本小册子虽然比原书小，但它包含了原书的主要信息。

当你需要在外面查阅书中的信息时，你不再需要带着那本厚重的书，而是可以带着那本压缩后的小册子。这样，你不仅可以节省空间，还可以节省查阅信息所需的时间。

这就是Lora的主要思想。在语言模型中，"厚重的书"就是原始的Attention矩阵，"小册子"就是Lora参数。

-------------------------------------

用一个更形象的比喻来解释Lora参数是如何捕获原始矩阵的主要信息的。

想象一下，你正在看一部电影。这部电影很长，有很多细节和复杂的情节。但是，你只有几分钟的时间，你想了解电影的大致内容。

这时，你可能会选择看电影的预告片。预告片是电影的一个简化版本，它虽然没有电影的所有细节，但是它可以在短时间内给你一个关于电影的大体概念。

在这个比喻中，电影就像是我们的原始矩阵，预告片就像是我们的Lora参数。Lora参数通过保留最重要的特征（就像预告片保留电影的关键情节）来简化原始矩阵，从而使我们可以用更少的资源（就像看预告片比看整部电影更节约时间）来理解和使用模型。

----------------------------

让我们看一个更具体的例子来说明Lora在模型微调中的应用。

假设我们有一个预训练的语言模型，它已经被训练来理解和生成自然语言文本。我们希望微调这个模型来执行一个特殊的任务，比如情感分析，即预测输入文本的情感是积极还是消极。

在原始模型（没有应用Lora）中，我们需要调整模型中的所有参数来适应新的任务。这个过程可能会非常耗费计算资源，并且有可能导致过拟合，因为模型可能会过度地适应新的训练数据。

然而，如果我们应用Lora，我们可以将模型的注意力参数从一个高维空间（原始参数空间）映射到一个低维空间（Lora参数空间）。这样，我们只需要微调这个低维空间中的参数，而不是原始的高维空间中的所有参数。

例如，假设原始的注意力参数是一个10000x10000的矩阵，我们可以通过Lora将其映射到一个10x10的矩阵。然后，我们只需要微调这个10x10的矩阵，而不是原始的10000x10000的矩阵。这将大大减少微调过程中所需的计算资源，并且有可能提高模型在新任务上的性能，因为低维空间中的参数可以更好地捕获原始矩阵的主要信息。

总的来说，Lora在模型微调中的应用就是通过将模型的参数映射到一个低维空间，然后在这个低维空间中进行微调，从而提高模型的效率和性能。

In [24]:
# 引入需要的模块和类
from peft import LoraConfig, get_peft_model, TaskType

# 创建一个LoraConfig对象，用于配置LORA（Low-Rank Adaptation）模型的参数
lora_config = LoraConfig(
    r=32, # Rank，表示低秩适配的秩，即在低秩空间中的维度
    lora_alpha=32, # alpha参数，表示在训练过程中对LORA层的缩放因子
    target_modules=["q", "v"], # target_modules参数，表示要应用LORA的模块
    lora_dropout=0.05, # dropout参数，表示在LORA层中的dropout比例
    bias="none", # bias参数，表示是否在LORA层中添加偏置项
    task_type=TaskType.SEQ_2_SEQ_LM # task_type参数，表示任务类型，这里是序列到序列的语言模型任务
)

`r`：这是低秩适配的秩，表示在低秩空间中的维度。这个值越大，低维空间的维度就越高，可以捕获更多的原始信息，但计算和存储需求也会更高。相反，如果这个值越小，低维空间的维度就越低，虽然计算和存储需求会降低，但可能会丢失一些原始信息。

`lora_alpha`：这是一个缩放因子，用于在训练过程中调整Lora层的学习率。这个值越大，Lora层的学习率就越高，模型可能会更快地适应新的任务，但也可能会导致过拟合。相反，如果这个值越小，Lora层的学习率就越低，过拟合的风险可能会降低，但模型可能需要更长的时间来适应新的任务。

`target_modules`：这是一个列表，定义了要应用Lora的模块。在这个例子中，我们选择了"q"和"v"，这表示我们将Lora应用到注意力机制的查询向量（"q"）和值向量（"v"）。

`lora_dropout`：这是在Lora层中的dropout比例，用于防止过拟合。在训练过程中，每个步骤中都会随机地将一部分Lora层的神经元设为0，这可以帮助模型更好地泛化到未见过的数据。

`bias`：这个参数表示是否在Lora层中添加偏置项。在这个例子中，我们选择了"none"，表示我们不在Lora层中添加偏置项。

`task_type`：这个参数表示任务类型。在这个例子中，我们选择了TaskType.SEQ_2_SEQ_LM，表示我们的任务是一个序列到序列的语言模型任务。

In [25]:
# 使用原始模型和LORA配置信息，获取带有PEFT（Performance-Enhancing Fine-Tuning）的模型
peft_model = get_peft_model(original_model, lora_config)

# 打印出PEFT模型的可训练参数的数量
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.4092820552029972%


In [26]:
# 这段代码的主要目的是设置PEFT模型的训练参数，并创建一个用于训练模型的训练器。

# 使用当前时间戳创建一个唯一的输出目录，用于保存训练过程中的模型和日志文件
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

# 创建一个TrainingArguments对象，该对象包含了训练参数
peft_training_args = TrainingArguments(
    output_dir=output_dir,  # 指定输出目录
    auto_find_batch_size=True,  # 自动寻找合适的批次大小
    learning_rate=1e-3,  # 设置学习率，这里的学习率比全模型微调的学习率要高
    num_train_epochs=1,  # 设置训练轮次为1
    logging_steps=1,  # 每训练1步就记录日志
    max_steps=1  # 最大训练步数为1
)

# 创建一个Trainer对象，用于训练模型
peft_trainer = Trainer(
    model=peft_model,  # 指定要训练的模型
    args=peft_training_args,  # 指定训练参数
    train_dataset=tokenized_datasets["train"],  # 指定训练数据集
)

In [27]:
# 这段代码的主要目的是开始训练模型并保存训练后的模型和对应的分词器。

# 使用Trainer对象开始训练模型
peft_trainer.train()

# 指定保存模型的路径
peft_model_path="./peft-dialogue-summary-checkpoint-local"

# 保存训练后的模型到指定路径
peft_trainer.model.save_pretrained(peft_model_path)

# 保存模型使用的分词器到指定路径
tokenizer.save_pretrained(peft_model_path)



Step,Training Loss
1,49.25


('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [28]:
!aws s3 cp --recursive s3://dsoaws/models/peft-dialogue-summary-checkpoint/ ./peft-dialogue-summary-checkpoint-from-s3/ 

download: s3://dsoaws/models/peft-dialogue-summary-checkpoint/adapter_config.json to peft-dialogue-summary-checkpoint-from-s3/adapter_config.json
download: s3://dsoaws/models/peft-dialogue-summary-checkpoint/special_tokens_map.json to peft-dialogue-summary-checkpoint-from-s3/special_tokens_map.json
download: s3://dsoaws/models/peft-dialogue-summary-checkpoint/tokenizer_config.json to peft-dialogue-summary-checkpoint-from-s3/tokenizer_config.json
download: s3://dsoaws/models/peft-dialogue-summary-checkpoint/tokenizer.json to peft-dialogue-summary-checkpoint-from-s3/tokenizer.json
download: s3://dsoaws/models/peft-dialogue-summary-checkpoint/adapter_model.bin to peft-dialogue-summary-checkpoint-from-s3/adapter_model.bin


In [29]:
!ls -al ./peft-dialogue-summary-checkpoint-from-s3/adapter_model.bin

-rw-r--r-- 1 root root 14208525 Jun 15 23:37 ./peft-dialogue-summary-checkpoint-from-s3/adapter_model.bin


In [31]:
# 这段代码的主要目的是从预训练的模型中加载一个模型，并且初始化一个针对这个模型的分词器。之后，使用这个预训练模型和分词器创建一个PEFT模型。

# 导入PEFT模型和配置类
from peft import PeftModel, PeftConfig

# 使用"google/flan-t5-base"作为预训练模型，加载一个用于序列到序列的学习模型
# 并指定模型使用的数据类型为bfloat16
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16).to(device)

# 从同样的预训练模型"google/flan-t5-base"中加载分词器
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# 从本地路径'./peft-dialogue-summary-checkpoint-from-s3/'加载预训练的PEFT模型
# 并指定模型使用的数据类型为bfloat16
# 通过设置is_trainable参数为False，指定模型在接下来的使用中不会被训练
peft_model = PeftModel.from_pretrained(peft_model_base, 
                                       './peft-dialogue-summary-checkpoint-from-s3/', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False).to(device)

In [32]:
# 调用print_number_of_trainable_model_parameters函数，打印PEFT模型中的可训练参数的数量
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable model parameters: 0.0%


##  对模型进行定性评估

In [34]:
# 这段代码的主目的是使用三个不同的模型（原始模型、指导模型和PEFT模型）对同一段对话进行摘要，并将结果与人工摘要进行比较

# 选择要摘要的对话的索引
index = 200

# 从数据集的测试集中获取对话文本
dialogue = dataset['test'][index]['dialogue']

# 从数据集的测试集中获取人工摘要
baseline_human_summary = dataset['test'][index]['summary']

# 构建模型输入的提示
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

# 使用分词器将提示转换为模型可以接受的输入形式
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# 使用原始模型生成摘要，设置最大新令牌数为200，束宽为1
original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
# 使用分词器将模型输出的ID解码为文本
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# 使用指导模型生成摘要
instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
# 解码指导模型的输出
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

# 使用PEFT模型生成摘要
peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
# 解码PEFT模型的输出
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

# 打印出不同模型的摘要结果以及人工摘要
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Talk to a computer expert.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.
---------------------------------------------------------------------------------------------------
PEFT MODEL: #Person1# recommends adding a painting program to #Person2#'s software and upgrading hardware. #Person2# also wants to upgrade the hardware because it's outdated now.


## 对模型进行定量评估

In [36]:
# 这段代码的主要目的是对数据集中的前十个对话样本进行摘要，并将这些摘要以及对应的人工摘要存储在一个Pandas DataFrame中。

# 从测试数据集中获取前十个对话样本
dialogues = dataset['test'][0:10]['dialogue']

# 获取这些对话的人工摘要
human_baseline_summaries = dataset['test'][0:10]['summary']

# 创建空列表，用于存储不同模型生成的摘要
original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

# 使用tqdm函数创建一个进度条，并对每一个对话样本进行处理
for idx, dialogue in enumerate(tqdm(dialogues)):
    # 创建模型输入的提示
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    # 使用分词器将提示转换为模型可以接受的输入形式
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # 从人工摘要列表中获取当前对话的人工摘要
    human_baseline_text_output = human_baseline_summaries[idx]
    
    # 使用原始模型生成摘要，并解码为文本
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    # 使用指导模型生成摘要，并解码为文本
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    # 使用PEFT模型生成摘要，并解码为文本
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    # 将生成的摘要添加到对应的列表中
    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

# 使用zip函数将人工摘要和各个模型的摘要配对
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))

# 使用Pandas创建一个DataFrame，每一列代表一个摘要来源
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])

# 显示这个DataFrame
df

100%|██████████| 10/10 [00:38<00:00,  3.86s/it]


Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.,Employees are now allowed to use the Instant Messaging system.,#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# all office communications are restricted to email correspondence and official memos and the use of Instant Message programs by employees during working hours is strictly prohibited. #Person1# wants to change the communication methods and Ms. Dawson tells #Person1# it applies to internal and external communications.,#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# that all office communications are restricted to email correspondence and official memos. #Person1# wants to change the communication methods and asks Ms. Dawson to get the memo typed up and distributed to all employees before 4 pm.
1,"In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.","#Person1#: I'm sorry, Ms. Dawson. I'm going to give you a memo.",#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# all office communications are restricted to email correspondence and official memos and the use of Instant Message programs by employees during working hours is strictly prohibited. #Person1# wants to change the communication methods and Ms. Dawson tells #Person1# it applies to internal and external communications.,#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# that all office communications are restricted to email correspondence and official memos. #Person1# wants to change the communication methods and asks Ms. Dawson to get the memo typed up and distributed to all employees before 4 pm.
2,Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# still insists.,"#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. #Person2#: Yes, sir. #Person1#: It should apply to all communications, not only intra-office communications, but also external communications. #Person2#: It should apply to external communications. #Person1#: It should apply to internal and external communications. #Person2#: Yes, it should apply to internal and external communications. #Person2#: It should apply to internal and external communica...",#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# all office communications are restricted to email correspondence and official memos and the use of Instant Message programs by employees during working hours is strictly prohibited. #Person1# wants to change the communication methods and Ms. Dawson tells #Person1# it applies to internal and external communications.,#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# that all office communications are restricted to email correspondence and official memos. #Person1# wants to change the communication methods and asks Ms. Dawson to get the memo typed up and distributed to all employees before 4 pm.
3,#Person2# arrives late because of traffic jam. #Person1# persuades #Person2# to use public transportations to keep healthy and to protect the environment.,The traffic jam at the intersection of Carrefour and Carrefour is really bad.,#Person2# got stuck in traffic again. #Person1# suggests #Person2# start taking public transport system to work and suggests #Person2# start biking to work when it's nicer outside. #Person2# agrees.,#Person2# got stuck in traffic and #Person1# suggests #Person2# start taking public transport system to work. #Person2# thinks it's better for the environment and #Person2# will miss having freedom with a car. #Person1# suggests biking to work when it's nicer outside.
4,#Person2# decides to follow #Person1#'s suggestions on quitting driving to work and will try to use public transportations.,#Person2#: I'm stuck in traffic again. #Person1#: I'm stuck in traffic again. #Person2#: I'm stuck in traffic again. #Person1#: I'm a bit afraid of the traffic. #Person2#: I'm going to have to consider taking public transport. #Person2#: I'm going to try to get a subway to work. #Person1#: I'm going to try to take the subway. #Person2#: I'm going to try to try to bike to work. #Person1#: I'm going to try to get a bike to work. #Person2#: I'm going to try to get a bike to work.,#Person2# got stuck in traffic again. #Person1# suggests #Person2# start taking public transport system to work and suggests #Person2# start biking to work when it's nicer outside. #Person2# agrees.,#Person2# got stuck in traffic and #Person1# suggests #Person2# start taking public transport system to work. #Person2# thinks it's better for the environment and #Person2# will miss having freedom with a car. #Person1# suggests biking to work when it's nicer outside.
5,"#Person2# complains to #Person1# about the traffic jam, #Person1# suggests quitting driving and taking public transportation instead.",People are talking about the traffic problems in their lives.,#Person2# got stuck in traffic again. #Person1# suggests #Person2# start taking public transport system to work and suggests #Person2# start biking to work when it's nicer outside. #Person2# agrees.,#Person2# got stuck in traffic and #Person1# suggests #Person2# start taking public transport system to work. #Person2# thinks it's better for the environment and #Person2# will miss having freedom with a car. #Person1# suggests biking to work when it's nicer outside.
6,#Person1# tells Kate that Masha and Hero get divorced. Kate is surprised because she thought they are perfect couple.,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can't believe it. #Person1# tells Kate they are having a separation for 2 months and filed for divorce. Kate thinks it's surprising and can't believe it.,Kate tells #Person2# Masha and Hero are getting divorced. #Person2# thinks it's surprising because they are having a separation for 2 months and filed for divorce. #Person2# thinks it's the change from all the back stepping.
7,#Person1# tells Kate that Masha and Hero are getting a peaceful divorce. Kate feels surprised and asks about their kids.,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can't believe it. #Person1# tells Kate they are having a separation for 2 months and filed for divorce. Kate thinks it's surprising and can't believe it.,Kate tells #Person2# Masha and Hero are getting divorced. #Person2# thinks it's surprising because they are having a separation for 2 months and filed for divorce. #Person2# thinks it's the change from all the back stepping.
8,#Person1# and Kate talk about the divorce between Masha and Hero. Kate feels surprised because she thought they are well matched,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can't believe it. #Person1# tells Kate they are having a separation for 2 months and filed for divorce. Kate thinks it's surprising and can't believe it.,Kate tells #Person2# Masha and Hero are getting divorced. #Person2# thinks it's surprising because they are having a separation for 2 months and filed for divorce. #Person2# thinks it's the change from all the back stepping.
9,#Person1# and Brian are at the birthday party of Brian. Brian thinks #Person1# looks great and is popular.,"#Person1: Happy Birthday, Brian!",Brian's birthday is coming. #Person1# invites Brian to have a dance and Brian compliments #Person1#'s looks. Brian thinks #Person1# looks great and invites #Person1# to have a drink together.,Brian remembers his birthday and invites #Person1# to the party. Brian is popular with everyone and looks pretty today. #Person1# and Brian will have a drink together to celebrate Brian's birthday.


In [37]:
# 这段代码的主要目标是使用ROUGE（Recall-Oriented Understudy for Gisting Evaluation）指标对三个模型（原始模型，指导模型，PEFT模型）生成的摘要进行评估，并将结果打印出来。

# 加载ROUGE评估工具
rouge = evaluate.load('rouge')

# rouge.compute函数接受以下参数：
# - predictions：模型生成的摘要。
# - references：参考摘要，这里使用的是人工摘要。
# - use_aggregator：如果为True，将返回所有评分的平均值和95%置信区间。
# - use_stemmer：如果为True，将在计算ROUGE分数前对预测和参考摘要进行词干化。

# use_aggregator：该参数决定了是否对所有评分的结果进行聚合。
# 如果设置为 True，那么结果将包含所有评分的平均值（mean）以及一个 95% 的置信区间（confidence interval）。
# 置信区间是一种用于估计某个参数的区间，如果样本足够大，那么 95% 的概率下，参数的真实值会在这个区间内。

# use_stemmer：该参数决定了在计算 ROUGE 分数之前，是否对预测和参考摘要进行词干化（stemming）。
# 词干化是一种处理单词的方式，将单词简化为其词干或者词根形式，例如，“running”、“runner”和“ran”经过词干化后都可能变为“run”。
# 这样可以帮助评估过程忽略单词的具体形式，只关注其基本含义。如果设置为 True，那么在计算评分之前，会先对预测和参考摘要进行词干化处理。

# 对原始模型生成的摘要进行评估，参考标准为人工摘要
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# 对指导模型生成的摘要进行评估
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# 对PEFT模型生成的摘要进行评估
peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# 打印三个模型的评估结果
print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.28937301301960516, 'rouge2': 0.11499567424080169, 'rougeL': 0.24994683805358164, 'rougeLsum': 0.25145160436685865}
INSTRUCT MODEL:
{'rouge1': 0.41026607717457186, 'rouge2': 0.17840645241958838, 'rougeL': 0.2977022096267017, 'rougeLsum': 0.2987374187518165}
PEFT MODEL:
{'rouge1': 0.3725351062275605, 'rouge2': 0.12138811933618107, 'rougeL': 0.27620639623170606, 'rougeLsum': 0.2758134870822362}


In [38]:
# 这段代码的主要目标是从 results 的 DataFrame 中提取出摘要，
# 并使用 ROUGE（Recall-Oriented Understudy for Gisting Evaluation）指标来评估由三个模型（原始模型，指导模型，PEFT模型）生成的摘要。
# 打印出了每个模型的评估结果。

# 从 DataFrame 'results' 中提取人工生成的基线摘要
human_baseline_summaries = results['human_baseline_summaries'].values

# 从 DataFrame 'results' 中提取原始模型生成的摘要
original_model_summaries = results['original_model_summaries'].values

# 从 DataFrame 'results' 中提取指导模型生成的摘要
instruct_model_summaries = results['instruct_model_summaries'].values

# 从 DataFrame 'results' 中提取 PEFT 模型生成的摘要
peft_model_summaries     = results['peft_model_summaries'].values

# 使用 ROUGE 指标对原始模型生成的摘要进行评估
original_model_results = rouge.compute(
    predictions=original_model_summaries,  # 原始模型生成的摘要
    references=human_baseline_summaries[0:len(original_model_summaries)],  # 人工生成的参考摘要
    use_aggregator=True,  # 返回所有评分的平均值和 95% 置信区间
    use_stemmer=True,  # 在计算 ROUGE 分数之前对预测和参考摘要进行词干化
)

# 使用 ROUGE 指标对指导模型生成的摘要进行评估
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,  # 指导模型生成的摘要
    references=human_baseline_summaries[0:len(instruct_model_summaries)],  # 人工生成的参考摘要
    use_aggregator=True,  # 返回所有评分的平均值和 95% 置信区间
    use_stemmer=True,  # 在计算 ROUGE 分数之前对预测和参考摘要进行词干化
)

# 使用 ROUGE 指标对 PEFT 模型生成的摘要进行评估
peft_model_results = rouge.compute(
    predictions=peft_model_summaries,  # PEFT 模型生成的摘要
    references=human_baseline_summaries[0:len(peft_model_summaries)],  # 人工生成的参考摘要
    use_aggregator=True,  # 返回所有评分的平均值和 95% 置信区间
    use_stemmer=True,  # 在计算 ROUGE 分数之前对预测和参考摘要进行词干化
)

# 打印三个模型的评估结果
print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.2334158581572823, 'rouge2': 0.07603964187010573, 'rougeL': 0.20145520923859048, 'rougeLsum': 0.20145899339006135}
INSTRUCT MODEL:
{'rouge1': 0.42161291557556113, 'rouge2': 0.18035380596301792, 'rougeL': 0.3384439349963909, 'rougeLsum': 0.33835653595561666}
PEFT MODEL:
{'rouge1': 0.40810631575616746, 'rouge2': 0.1633255794568712, 'rougeL': 0.32507074586565354, 'rougeLsum': 0.3248950182867091}


In [39]:
# 这段代码的目的是计算 PEFT 模型在 ROUGE 评分上相对于原始模型的绝对百分比改进，并将每个 ROUGE 指标的改进打印出来。

# 打印说明信息
print("Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE")

# 计算 PEFT 模型的 ROUGE 分数相对于原始模型的绝对改进
# 首先将每个模型的 ROUGE 分数从字典中提取出来，转换为 numpy 数组
# 然后计算两个数组的差，得到每个 ROUGE 指标的改进
improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))

# 循环遍历每个 ROUGE 指标及其对应的改进，打印改进的百分比
# 字符串格式化用于保留两位小数
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE
rouge1: 17.47%
rouge2: 8.73%
rougeL: 12.36%
rougeLsum: 12.34%


In [40]:
# 打印说明信息
print("Absolute percentage improvement of PEFT MODEL over INSTRUCT MODEL")

# 计算 PEFT 模型的 ROUGE 分数相对于指导模型的绝对改进
# 首先将每个模型的 ROUGE 分数从字典中提取出来，转换为 numpy 数组
# 然后计算两个数组的差，得到每个 ROUGE 指标的改进
improvement = (np.array(list(peft_model_results.values())) - np.array(list(instruct_model_results.values())))

# 循环遍历每个 ROUGE 指标及其对应的改进，打印改进的百分比
# 字符串格式化用于保留两位小数
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over INSTRUCT MODEL
rouge1: -1.35%
rouge2: -1.70%
rougeL: -1.34%
rougeLsum: -1.35%


# Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>