In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I've been waiting for a HuggingFace course my whole life.")
print(result)
results = classifier(
  ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)
print(results)

# 微调预训练模型

In [2]:
from torch.utils.data import Dataset
import json

class AFQMC(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, data_file):
        Data = {}
        with open(data_file, 'rt') as f:
            for idx, line in enumerate(f):
                sample = json.loads(line.strip())
                Data[idx] = sample
        return Data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_data = AFQMC('data/afqmc_public/train.json')
valid_data = AFQMC('data/afqmc_public/dev.json')

print(train_data[0])

{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}


In [3]:
train_data.__getitem__(2)

{'sentence1': '帮我看一下本月花呗账单有没有结清', 'sentence2': '下月花呗账单', 'label': '0'}

In [16]:
# # 如果数据集很大，难以一次加载到内存中，可以继承IterableDataset

# from torch.utils.data import IterableDataset
# import json

# class IterableAFQMC(IterableDataset):
#     def __init__(self,data_file):
#         self.data_file = data_file
#     def __iter__(self):
#         with open(self.data_file, 'rt') as f:
#             for line in f:
#                 sample = json.loads(line.strip())
#                 yield sample


# train_data = IterableAFQMC('data/afqmc_public/train.json')
# print(next(iter(train_data)))

In [None]:
# Data loader
# 按批次处理数据，将样本转换成模型可接受的输入格式

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

checkpoint = 'bert-base-chinese'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    batch_sentence_1, batch_sentence_2 = [],[]
    batch_label = []
    for sample in batch_samples:
        batch_sentence_1.append(sample['sentence1'])
        batch_sentence_2.append(sample['sentence2'])
        batch_label.append(int(sample['label']))

    X = tokenizer(
        batch_sentence_1,
        batch_sentence_2,
        padding = True,
        truncation = True,
        return_tensors = "pt"

    ).to(device) 

    y = torch.tensor(batch_label).to(device) 
    return X,y


train_dataloader = DataLoader(train_data, batch_size = 4, shuffle=True,collate_fn=collote_fn)
batch_X, batch_y = next(iter(train_dataloader))
valid_dataloader  = DataLoader(valid_data, batch_size = 4, shuffle=False, collate_fn=collote_fn)
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)
print(batch_X)
print(batch_y)

# Dataloader按照设置的Batch size =4 每次对四个样本进行编码，并自动对每个batch中的样本进行补全和截断，
# 选择了BERT模型作为checkpoint, 所以每个样本都被处理成 [CLS]sen1[SEP]sen2[SEP]

batch_X shape: {'input_ids': torch.Size([4, 41]), 'token_type_ids': torch.Size([4, 41]), 'attention_mask': torch.Size([4, 41])}
batch_y shape: torch.Size([4])
{'input_ids': tensor([[ 101, 5709, 1446, 1146, 3309,  743,  702, 2797, 3322,  852, 3221, 1372,
         5543, 4500,  115,  115,  115, 1779, 2797, 3322,  115,  115,  115, 1779,
         1377,  809,  743, 1408,  102, 5709, 1446, 1372, 1377,  809, 4500,  754,
         3867, 6589, 3221, 1408,  102],
        [ 101, 4509, 6435,  955, 1446, 4518, 7481, 2144, 3417, 1914,  719,  102,
          791, 1921, 4509, 6435, 6010, 6009,  955, 1446, 5543, 2144, 3417, 1408,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
        [ 101, 4500, 4916, 1146, 2940,  955, 1446, 1048, 2622, 4638, 3833, 1220,
         1357, 3867,  749,  102, 6010, 6009, 4916, 1050, 2940,  955, 1446, 1048,
         2622, 1171, 3766, 3300,  749, 1408,  102,    0,    0,    0,    0,    0,
            0,    

NVIDIA GeForce RTX 5060 Ti with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_70 sm_75 sm_80 sm_86 sm_90.
If you want to use the NVIDIA GeForce RTX 5060 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [5]:
# 分类任务 构建模型
from torch import nn
from transformers import AutoModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using',device,'device')

class BertForPairwiseCLS(nn.Module):
    def __init__(self):
        super(BertForPairwiseCLS,self).__init__()
        self.bert_encoder = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768,2)

    def forward(self, x):
        bert_output = self.bert_encoder(**x)
        cls_vector = bert_output.last_hidden_state[:,0,:]
        cls_vectors = self.dropout(cls_vector)
        logits = self.classifier(cls_vectors)
        return logits
    

model = BertForPairwiseCLS().to(device)
print(model)

"""
这里模型首先将输入送入到 BERT 模型中，
将每一个 token 都编码为维度为 768 的向量，
然后从输出序列中取出第一个 [CLS] token 的编码表示作为整个句子对的语义表示，
再送入到一个线性全连接层中预测两个类别的分数。

last_hidden_state的形状是 (batch_size, seq_len, hidden_dim)
(32,128,768)
32个批次的样本
128序列长度，每个序列都是128长度
768 每个token的维度

[:,0,:] 也就是取所有批次数据，每个序列的第一个token也就是【cls】，这个token的向量
这种方式简单粗暴，但是相当于在 Transformers 模型外又包了一层，因此无法再调用 Transformers 库预置的模型函数。

"""



using cuda device
BertForPairwiseCLS(
  (bert_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

'\n这里模型首先将输入送入到 BERT 模型中，\n将每一个 token 都编码为维度为 768 的向量，\n然后从输出序列中取出第一个 [CLS] token 的编码表示作为整个句子对的语义表示，\n再送入到一个线性全连接层中预测两个类别的分数。\n\nlast_hidden_state的形状是 (batch_size, seq_len, hidden_dim)\n(32,128,768)\n32个批次的样本\n128序列长度，每个序列都是128长度\n768 每个token的维度\n\n[:,0,:] 也就是取所有批次数据，每个序列的第一个token也就是【cls】，这个token的向量\n这种方式简单粗暴，但是相当于在 Transformers 模型外又包了一层，因此无法再调用 Transformers 库预置的模型函数。\n\n'

In [6]:
# 更加常见的写法

from torch import nn
from transformers import AutoConfig
from transformers import BertModel,BertPreTrainedModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('device', device)

class BertForPairwiseCLS(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(768,2)
        self.post_init()

    def forward(self,x):
        bert_output = self.bert(**x)
        cls_vectors = bert_output.last_hidden_state[:,0,:]
        cls_vectors = self.dropout(cls_vectors)
        logits = self.classifier(cls_vectors)
        return logits
    
config = AutoConfig.from_pretrained(checkpoint)
model = BertForPairwiseCLS.from_pretrained(checkpoint, config=config).to(device)
print(model)


Some weights of BertForPairwiseCLS were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device cuda
BertForPairwiseCLS(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [7]:
outputs = model(batch_X)
print(outputs.shape)

RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [1]:
import torch
print(torch.__version__)          # PyTorch 版本
print(torch.version.cuda)         # 绑定的 CUDA 版本
print(torch.cuda.is_available())  # CUDA 是否可用
print(torch.cuda.get_device_capability(0))  # 当前 GPU 的计算能力（如 (8, 6)）

2.7.1+cu126
12.6
True
(12, 0)


NVIDIA GeForce RTX 5060 Ti with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_70 sm_75 sm_80 sm_86 sm_90.
If you want to use the NVIDIA GeForce RTX 5060 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [1]:
import torch
print(torch.version.cuda) 

12.6


In [None]:
# 训练模型时，每一轮epoch分为训练循环和验证测试循环，循环中计算损失，优化模型参数
from tqdm.auto import tqdm
def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss:{0:>7f}')

    finish_step_num = (epoch-1)* len(dataloader)

    model.train()
    for step, (X,y) in enumerate(dataloader, start=1):
        X,y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred,y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss/(finish_step_num + step):>7f}')
        progress_bar.update(1)
    return total_loss

def test_loop(dataloader, model, mode='Test'):
    assert mode in ['Valid','Test']

    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += (pred.argmax(1)  == y).type(torch.float).sum().item()

    correct/=size
    print(f"{mode} Accuracy: {(100*correct):>0.1f}%\n")

In [None]:
# 优化器 会随着训练过程逐步减小学习率
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr = 5e-5)

# 默认情况下， 优化器会线性衰减学习率
# 训练部署 step = 训练轮数 乘以 每一轮中的步数（也就是训练dataloader的大小）


In [None]:
from transformers import get_scheduler

epochs = 3
num_training_steps = epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)


In [None]:
# 完整的训练过程 
from transformers import AdamW, get_scheduler
learning_rate = 1e-5
epoch_num = 3

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num* len(train_dataloader),
)

total_loss = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    test_loop(valid_data, model, mode='Valid')