In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from torch.optim.lr_scheduler import LambdaLR
from sklearn.metrics import classification_report

In [2]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


set_seed(42)

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### 数据构建

In [4]:
# 加载mrpc数据集
dataset = load_dataset('glue', 'mrpc')

Using the latest cached version of the module from C:\Users\dcdmm\.cache\huggingface\modules\datasets_modules\datasets\glue\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad (last modified on Wed Apr 27 20:43:06 2022) since it couldn't be found locally at glue., or remotely on the Hugging Face Hub.
Reusing dataset glue (C:\Users\dcdmm\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
df_train = dataset['train'].to_pandas()
df_val = dataset['test'].to_pandas()
df_test = dataset['validation'].to_pandas()

In [6]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(3668, 4)
(1725, 4)
(408, 4)


In [7]:
df_train.head()  # label:whether the sentences in the pair are semantically equivalent

Unnamed: 0,sentence1,sentence2,label,idx
0,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr...",1,0
1,Yucaipa owned Dominick 's before selling the c...,Yucaipa bought Dominick 's in 1995 for $ 693 m...,0,1
2,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ...",1,2
3,"Around 0335 GMT , Tab shares were up 19 cents ...","Tab shares jumped 20 cents , or 4.6 % , to set...",0,3
4,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...,1,4


In [8]:
df_train.info()  # 没有缺失值

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3668 entries, 0 to 3667
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence1  3668 non-null   object
 1   sentence2  3668 non-null   object
 2   label      3668 non-null   int64 
 3   idx        3668 non-null   int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 100.4+ KB


In [9]:
class CustomDataset(Dataset):
    def __init__(self, data, max_length, pretrained_model='albert-base-v2'):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sent1 = str(self.data.loc[index, 'sentence1'])
        sent2 = str(self.data.loc[index, 'sentence2'])

        encoded_pair = self.tokenizer(text=sent1, text_pair=sent2,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=self.max_length,
                                      return_tensors='pt')

        token_ids = encoded_pair['input_ids'].squeeze(0)
        attn_masks = encoded_pair['attention_mask'].squeeze(0)
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)

        label = self.data.loc[index, 'label']
        return token_ids, attn_masks, token_type_ids, label

In [10]:
bert_model = "albert-base-v2"  # 预训练模型名称
maxlen = 128
bs = 32  # 批次大小

train_set = CustomDataset(df_train, maxlen, bert_model)
val_set = CustomDataset(df_val, maxlen, bert_model)
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
val_loader = DataLoader(val_set, batch_size=bs)

### 模型构建

In [11]:
class SentencePairClassifier(nn.Module):
    def __init__(self, pretrained_model="albert-base-v2", hidden_size=768):
        super(SentencePairClassifier, self).__init__()
        self.bert_layer = AutoModel.from_pretrained(pretrained_model)
        self.hidden_size = hidden_size  # 不同预训练模型有不同的隐藏层大小
        self.dropout = nn.Dropout(p=0.5)
        self.cls_layer = nn.Linear(self.hidden_size, 1)  # 下游分类任务

    def forward(self, input_ids, attn_masks, token_type_ids):
        outputs = self.bert_layer(input_ids, attn_masks, token_type_ids)
        logits = self.cls_layer(self.dropout(outputs['pooler_output']))
        return logits

In [12]:
net = SentencePairClassifier()
net.to(device)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentencePairClassifier(
  (bert_layer): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias

### 模型训练

In [13]:
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # 学习率预热(线性增加)
            return float(current_step) / float(max(1, num_warmup_steps))
        # 学习率线性衰减(最小为0)
        # num_training_steps后学习率恒为0
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda)

In [14]:
def evaluate_loss(net, device, criterion, dataloader):
    net.eval()

    loss, count = 0, 0
    with torch.no_grad():
        for seq, attn_masks, token_type_ids, labels in dataloader:
            seq, attn_masks, token_type_ids, labels = seq.to(device), attn_masks.to(device), token_type_ids.to(
                device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1

    return loss / count  # 每轮的平均损失


def train_bert(net, criterion, opti, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    for ep in range(epochs):
        net.train()
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(train_loader):
            seq, attn_masks, token_type_ids, labels = seq.to(device), attn_masks.to(device), token_type_ids.to(
                device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            # logits.shape=(32, 1)
            loss = criterion(logits.squeeze(-1), labels.to(torch.float))
            ###########################################################################
            # 梯度累加可以先累加多个batch的梯度再进行一次参数更新,相当于增大了batch-size
            loss = loss / iters_to_accumulate
            loss.backward()  # 每次获取1个batch的数据,计算1次梯度,梯度不清空,不断累加
            if (it + 1) % iters_to_accumulate == 0:  # 累加一定次数后,根据累加的梯度更新网络参数,然后清空梯度,进行下一次循环
                opti.step()
                lr_scheduler.step()
                opti.zero_grad()
            ###########################################################################
        val_loss = evaluate_loss(net, device, criterion, val_loader)
        print("Epoch {} complete! Validation Loss : {}".format(ep + 1, val_loss))

In [15]:
iters_to_accumulate = 2
lr = 2e-5  # 学习率
epochs = 5  # 训练轮数

criterion = nn.BCEWithLogitsLoss()  # 二分类交叉熵损失函数
opti = optim.AdamW(net.parameters(), lr=lr, weight_decay=1e-2)  # 优化器
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # 总步数
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=0,
                                               num_training_steps=t_total)  # 学习率策略

In [16]:
train_bert(net, criterion, opti, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

Epoch 1 complete! Validation Loss : 0.46110450945518633
Epoch 2 complete! Validation Loss : 0.34429635962954275
Epoch 3 complete! Validation Loss : 0.33958045338039045
Epoch 4 complete! Validation Loss : 0.36206909600231385
Epoch 5 complete! Validation Loss : 0.375990265911376


### 评估

In [17]:
def test_prediction(net, device, dataloader):
    net.eval()
    probs_all = []
    with torch.no_grad():
        for seq, attn_masks, token_type_ids, _ in dataloader:
            seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
            logits = net(seq, attn_masks, token_type_ids).squeeze(-1)
            probs = torch.sigmoid(logits.unsqueeze(-1))
            probs = probs.detach().cpu().numpy().squeeze(-1)
            probs_all.extend(probs.tolist())
    return np.array(probs_all)

In [22]:
test_set = CustomDataset(df_test, maxlen, bert_model)
test_loader = DataLoader(test_set, batch_size=bs)
test_probs_all = test_prediction(net, device, test_loader)
print(test_probs_all)  # 概率向量
print(test_probs_all.shape)

[0.99298429 0.04565214 0.83503133 0.90706313 0.04442825 0.99266899
 0.08452644 0.99114311 0.99298179 0.9937017  0.99283469 0.04185068
 0.03460848 0.95590115 0.96891731 0.99206102 0.98882383 0.04016789
 0.99308264 0.90801013 0.04755341 0.33021644 0.05235379 0.99285853
 0.94888848 0.68846685 0.97375488 0.99145329 0.98986584 0.99317825
 0.54840916 0.9933843  0.99123281 0.99275726 0.98958123 0.89708847
 0.04817563 0.04130866 0.98649317 0.99266982 0.11675536 0.99151719
 0.30865005 0.05822035 0.03599201 0.99234009 0.99354941 0.04328445
 0.99353904 0.98876733 0.96335608 0.99286193 0.9889642  0.9933694
 0.99088436 0.99284726 0.85534787 0.99275619 0.99353927 0.98396075
 0.03855406 0.34536195 0.99367887 0.99213785 0.99090648 0.09596301
 0.99262166 0.99033499 0.07382733 0.99459141 0.98917496 0.96623868
 0.9934783  0.98693538 0.99254227 0.98829657 0.96153784 0.99395555
 0.99337834 0.99188143 0.35985035 0.98949534 0.99277675 0.05196293
 0.9846732  0.38236961 0.98823255 0.1154686  0.99321717 0.99366

In [21]:
threshold = 0.6  # 阈值
test_bool_all = np.where(test_probs_all >= threshold, 1, 0)
test_bool_all

array([1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,

In [19]:
labels_test = df_test['label']
print(classification_report(labels_test, test_bool_all))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77       129
           1       0.89      0.91      0.90       279

    accuracy                           0.86       408
   macro avg       0.84      0.83      0.84       408
weighted avg       0.86      0.86      0.86       408

