In [None]:
# %pip freeze > peft_requirement.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification,
    AutoConfig,
    get_cosine_schedule_with_warmup,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from peft import get_peft_model, PrefixTuningConfig, LoraConfig, TaskType

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from tqdm.auto import tqdm

# 固定seed
torch.manual_seed(42)
# 确定设备：如果有GPU可用则使用GPU，否则使用CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#如果GPU可以，可以改为20
num_epochs = 5
patience = 5

training_record = {}

# load dataset

In [2]:
# tokenizer，加载bert的分词器,uncased就是不区分大小写
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# dataset，加载数据里
dataset_sst2 = load_dataset(
    "parquet",
    data_files={
        "train": "./sst2/data/train-00000-of-00001.parquet",
        "validation": "./sst2/data/validation-00000-of-00001.parquet"
        })

# preprocessing
def collate_fn(batch):
    #对字符串文本，进行编码，变为id,longest就是最长，padding就是填充,truncation为True就是截断
    inputs = tokenizer([x["sentence"] for x in batch], padding="longest", truncation=True, return_tensors="pt", max_length=512)
    labels = torch.tensor([x["label"] for x in batch])
    return inputs, labels

train_loader = DataLoader(dataset_sst2["train"], batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(dataset_sst2["validation"], batch_size=32, collate_fn=collate_fn)


# define evaluattion and training function

In [3]:
def evaluate(model, val_loader):
    model.eval()
    val_loss = 0
    val_acc = 0
    with torch.no_grad():# 在评估过程中关闭梯度计算
        total_samples = 0 #统计验证集总样本数量
        for inputs, labels in val_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()} #输入是一个字典，所以拿value
            labels = labels.to(device)
            probs = model(**inputs)
            probs = probs.logits.squeeze()
            loss = F.cross_entropy(probs, labels.float()) #求损失
            val_loss += loss.item()
            val_acc += ((probs > 0.5) == labels).sum().item() #模型的预测结果与实际标签是否相等,求和得到预测正确数量
            total_samples += len(labels)

    val_loss /= len(val_loader)
    val_acc /= total_samples
    return val_loss, val_acc


def train(model, train_loader, val_loader, device, num_epochs=3, patience=3):
    # 将模型移动到指定设备
    model.to(device)

    # 定义优化器
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    # 计算训练步数总数
    total_steps = num_epochs * len(train_loader)

    # 使用transformers库中的余弦学习率调度器进行学习率调整
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.2 * total_steps), #前20%步，学习率提升
        num_training_steps=total_steps
    )

    # 提前停止训练的控制变量
    best_val_acc = -1
    cur = 0

    # 存储训练和验证指标的容器
    history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}

    for epoch in tqdm(range(num_epochs)):
        # 进入训练模式
        model.train()
        train_loss = 0
        train_acc = 0
        total_samples = 0

        # 对训练数据进行迭代
        for inputs, labels in train_loader:
            # 将数据移动到指定设备
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)

            # 前向传播并计算损失
            optimizer.zero_grad()
            probs = model(**inputs) # **代表字典解包，inputs 中的键名必须与模型 forward() 方法的参数名完全一致
            probs = probs.logits.squeeze()
            loss = F.binary_cross_entropy_with_logits(probs, labels.float())
            loss.backward()
            optimizer.step()
            scheduler.step()

            # 收集指标
            train_loss += loss.item()
            train_acc += ((probs > 0.5) == labels).sum().item()
            total_samples += len(labels)

        train_loss /= len(train_loader)
        train_acc  /= total_samples

        # 进行验证
        val_loss, val_acc = evaluate(model, val_loader)

        # 记录指标
        print(f"epoch {epoch}: train_loss {train_loss:.4f}, train_acc {train_acc:.4f}, val_loss {val_loss:.4f}, val_acc {val_acc:.4f}")
        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        # 提前停止训练
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            cur = 0
        else:
            cur += 1
        if cur >= patience:
            print("提前停止训练")
            break

    return history

# Prefix Tuning

In [4]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
peft_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,    # 序列分类
    num_virtual_tokens=20,
    prefix_projection=True,        # 默认 False；True 时可加 MLP 映射
    encoder_hidden_size=512  # 设置 MLP 映射的维度
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # 查看可训练参数（<1%）

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

trainable params: 9,865,473 || all params: 119,348,482 || trainable%: 8.266106811480014


In [5]:
training_record["prefix_tuning"] = train(model, train_loader, val_loader, device, num_epochs=10, patience=5)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0: train_loss 0.6176, train_acc 0.5447, val_loss 52.8172, val_acc 0.8463
epoch 1: train_loss 0.3238, train_acc 0.8643, val_loss 54.1661, val_acc 0.8761
epoch 2: train_loss 0.2720, train_acc 0.8825, val_loss 59.6702, val_acc 0.8876
epoch 3: train_loss 0.2515, train_acc 0.8924, val_loss 55.8352, val_acc 0.8945
epoch 4: train_loss 0.2376, train_acc 0.8985, val_loss 57.7168, val_acc 0.9071
epoch 5: train_loss 0.2264, train_acc 0.9038, val_loss 60.9866, val_acc 0.9083
epoch 6: train_loss 0.2192, train_acc 0.9069, val_loss 60.5933, val_acc 0.9128
epoch 7: train_loss 0.2144, train_acc 0.9093, val_loss 59.8683, val_acc 0.9128
epoch 8: train_loss 0.2124, train_acc 0.9104, val_loss 61.1281, val_acc 0.9151
epoch 9: train_loss 0.2114, train_acc 0.9111, val_loss 60.4365, val_acc 0.9151


In [6]:
del model

# P-tuning-v2

In [12]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
peft_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,    # 序列分类
    num_virtual_tokens=32,
    prefix_projection=False        # 默认 False；True 时可加 MLP 映射
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # 查看可训练参数（<1%）

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

trainable params: 590,593 || all params: 110,073,602 || trainable%: 0.5365437209913417


In [13]:
training_record["P-tuning-v2"] = train(model, train_loader, val_loader, device, num_epochs=10, patience=5)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0: train_loss 0.6885, train_acc 0.4495, val_loss 54.7002, val_acc 0.4908
epoch 1: train_loss 0.6847, train_acc 0.4463, val_loss 54.4352, val_acc 0.5046
epoch 2: train_loss 0.6767, train_acc 0.4746, val_loss 54.0218, val_acc 0.6227
epoch 3: train_loss 0.6650, train_acc 0.5310, val_loss 53.6759, val_acc 0.6686
epoch 4: train_loss 0.6516, train_acc 0.5730, val_loss 53.4537, val_acc 0.6961
epoch 5: train_loss 0.6401, train_acc 0.6063, val_loss 53.3117, val_acc 0.7190
epoch 6: train_loss 0.6282, train_acc 0.6247, val_loss 53.2884, val_acc 0.7271
epoch 7: train_loss 0.6220, train_acc 0.6339, val_loss 53.2778, val_acc 0.7317
epoch 8: train_loss 0.6191, train_acc 0.6385, val_loss 53.2710, val_acc 0.7328
epoch 9: train_loss 0.6170, train_acc 0.6435, val_loss 53.2746, val_acc 0.7317


In [9]:
del model

# LoRA

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,    # 序列分类
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # 查看可训练参数（<1%）

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

trainable params: 295,681 || all params: 109,778,690 || trainable%: 0.26934280232347463


In [11]:
training_record["LoRA"] = train(model, train_loader, val_loader, device, num_epochs=10, patience=5)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0: train_loss 0.6996, train_acc 0.4427, val_loss 54.5700, val_acc 0.4908
epoch 1: train_loss 0.4900, train_acc 0.7105, val_loss 54.1376, val_acc 0.8658
epoch 2: train_loss 0.3167, train_acc 0.8694, val_loss 56.5489, val_acc 0.8784
epoch 3: train_loss 0.2926, train_acc 0.8786, val_loss 58.1786, val_acc 0.8945
epoch 4: train_loss 0.2769, train_acc 0.8844, val_loss 58.2912, val_acc 0.9014
epoch 5: train_loss 0.2688, train_acc 0.8880, val_loss 58.3683, val_acc 0.9025
epoch 6: train_loss 0.2638, train_acc 0.8896, val_loss 59.0802, val_acc 0.9025
epoch 7: train_loss 0.2603, train_acc 0.8906, val_loss 59.2532, val_acc 0.9014
epoch 8: train_loss 0.2589, train_acc 0.8905, val_loss 59.2620, val_acc 0.9060
epoch 9: train_loss 0.2583, train_acc 0.8917, val_loss 59.1033, val_acc 0.9037
