In [10]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.nn.functional as F
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import BertModel
import torch.optim as optim
import numpy as np
import random

In [11]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


set_seed(42)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [13]:
class Dataset(Data.Dataset):
    """定义数据集"""

    def __init__(self, split):
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """定义索引方式"""
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label


dataset = Dataset('train')  # torch.utils.data.Dataset

for text, label in dataset:
    # 调用__getitem__方法
    print(text)
    print(label)
    break

Using custom data configuration default
Reusing dataset chn_senti_corp (C:\Users\dcdmm\.cache\huggingface\datasets\seamew___chn_senti_corp\default\0.0.0\1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般
1


In [14]:
model_ckpt = "bert-base-chinese"

token = BertTokenizer.from_pretrained(model_ckpt)
print(token)

pretrained = BertModel.from_pretrained(model_ckpt)
print(pretrained)

# 冻结网络层参数(不进行梯度更新)
for param in pretrained.parameters():
    param.requires_grad = False

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [15]:
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # 批量编码句子
    data = token(text=sents,
                 truncation=True,
                 padding='max_length',
                 max_length=512,
                 return_token_type_ids=True,
                 return_attention_mask=True,
                 return_tensors='pt')

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels


# 数据处理
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=4,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

print(len(loader))

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    print(input_ids)
    print(input_ids.shape)
    print(attention_mask)
    print(token_type_ids)
    print(labels)
    model_result = pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
    print(model_result.last_hidden_state.shape)
    break

2400
tensor([[ 101, 1599, 3614,  ...,    0,    0,    0],
        [ 101, 2218, 3221,  ...,    0,    0,    0],
        [ 101, 3193, 7623,  ...,    0,    0,    0],
        [ 101, 5466, 1767,  ...,    0,    0,    0]])
torch.Size([4, 512])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
tensor([1, 1, 1, 0])
torch.Size([4, 512, 768])


In [16]:
class Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model, dropout_ratio=0.3):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)  # 二分类任务
        self.pretrained = pretrained_model
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
        out = self.fc(self.dropout(out.pooler_output))
        out = out.softmax(dim=1)  # 模型预测值
        return out

Dropout is a powerful and widely used technique to regularize the training of deep neural networks.
Though effective and performing well, the randomness introduced by dropout causes unnegligible inconsistency between training and inference.

<img src="../../Other/img/R-drop.png">

With the basic negative log-likelihood learning objective $\mathcal{L}_{N L L}^{i}$ of the two forward passes:

$$ \mathcal{L}_{NLL1}^{i} =  -\log \mathcal{P}_{1}^{w}\left(y_{i} \mid x_{i}\right)-\log \mathcal{P}_{2}^{w}\left(y_{i} \mid x_{i}\right) $$

the final training objective is to minimize $\mathcal{L}^{i}$ for data $(x_i, y_i)$:

\begin{aligned}
\mathcal{L}^{i}=\mathcal{L}_{NLL1}^{i}+\alpha \cdot \mathcal{L}_{KL2}^{i}=&-\log \mathcal{P}_{1}^{w}\left(y_{i} \mid x_{i}\right)-\log \mathcal{P}_{2}^{w}\left(y_{i} \mid x_{i}\right) \\
&+\frac{\alpha}{2}\left[\mathcal{D}_{K L}\left(\mathcal{P}_{1}^{w}\left(y_{i} \mid x_{i}\right) \| \mathcal{P}_{2}^{w}\left(y_{i} \mid x_{i}\right)\right)+\mathcal{D}_{K L}\left(\mathcal{P}_{2}^{w}\left(y_{i} \mid x_{i}\right) \| \mathcal{P}_{1}^{w}\left(y_{i} \mid x_{i}\right)\right)\right]
\end{aligned}

In [17]:
# 损失函数
criterion = torch.nn.CrossEntropyLoss()

model = Model(pretrained)
model = model.to(device)  # 模型设备切换

# 优化器
optimizer = optim.AdamW(model.parameters(), lr=5e-4)


def compute_kl_loss(p, q):
    # KL散度具有不对称性
    p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
    q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')

    # You can choose whether to use function "sum" and "mean" depending on your task
    p_loss = p_loss.sum()
    q_loss = q_loss.sum()

    pq_loss = (p_loss + q_loss) / 2  # 两次KL散度的平均
    return pq_loss


# 模型训练
model.train()
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    # 数据设备切换
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    labels = labels.to(device)

    ###########################################################################
    # step 1. 相同的数据送入一个带有dropout的模型两次
    logits = model(input_ids=input_ids,
                   attention_mask=attention_mask,
                   token_type_ids=token_type_ids)

    logits2 = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
    ###########################################################################

    ###########################################################################
    # step 2. 计算总损失
    ce_loss = 0.5 * (criterion(logits, labels) + criterion(logits2, labels))
    kl_loss = compute_kl_loss(logits, logits2)  # 约束两次的输出一致
    alpha = 1  # alphe为KL散度的权重(超参数)
    loss = ce_loss + alpha * kl_loss
    ###########################################################################

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 20 == 0:
        out = logits.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 300:
        break

0 0.7189090847969055 0.75
20 1.0231369733810425 0.0
40 1.0207936763763428 0.25
60 0.982821524143219 0.25
80 0.7893469333648682 0.5
100 0.5838350057601929 0.75
120 0.6930160522460938 0.25
140 0.7196400165557861 0.75
160 0.5752890706062317 0.75
180 0.7348844408988953 0.5
200 0.6555920243263245 0.75
220 0.5124338269233704 1.0
240 0.6854724884033203 0.75
260 0.5981273055076599 0.75
280 0.5034768581390381 1.0
300 0.6566712260246277 0.75


In [18]:
# 模型验证
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        # 数据设备切换
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)

        if i == 5:
            break
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    return correct / total

# no R-drop: 0.775
# R-drop: 0.7875
test()

Using custom data configuration default
Reusing dataset chn_senti_corp (C:\Users\dcdmm\.cache\huggingface\datasets\seamew___chn_senti_corp\default\0.0.0\1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


0.7875