In [1]:
import torch
import torch.utils.data as Data
from datasets import load_from_disk
from transformers import BertTokenizer
from transformers import BertModel
import torch.optim as optim
import numpy as np
import random

In [2]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # A bool that, if True, causes cuDNN to only use deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


set_seed(42)

In [3]:
class Dataset(Data.Dataset):
    """定义数据集"""

    def __init__(self, split):
        dataset_init = load_from_disk("dataset")  # 从HuggingFace保存的数据集
        if split == 'train':
            self.dataset = dataset_init['train']
        elif split == 'validation':
            self.dataset = dataset_init['validation']
        else:
            self.dataset = dataset_init['test']

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """定义索引方式"""
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label


def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # 批量编码句子
    data = token(text=sents,
                 truncation=True,
                 padding='max_length',
                 max_length=512,
                 return_token_type_ids=True,
                 return_attention_mask=True,
                 return_tensors='pt')

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels


class Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)  # 二分类任务
        self.pretrained = pretrained_model

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)

        out = self.fc(out.pooler_output)
        out = out.softmax(dim=1)
        return out

In [4]:
dataset = Dataset('train')
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=32,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

token = BertTokenizer.from_pretrained('bert-base-chinese')
pretrained = BertModel.from_pretrained('bert-base-chinese')

# 损失函数
criterion = torch.nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Model(pretrained)
model = model.to(device)

# 不同网络层拥有不同的优化器参数
optimizer = optim.AdamW([dict(params=model.fc.parameters(), lr=2e-5),
                         dict(params=model.pretrained.parameters(), lr=5e-6)])

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
class FGM():
    """Fast Gradient Sign Method"""

    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self,
               emb_name,  # 添加扰动的embedding层名称
               epsilon=1.0):  # 扰动项中的\epsilon
        for name, param in self.model.named_parameters():
            if param.requires_grad and name == emb_name:
                self.backup[name] = param.detach().clone()
                norm = torch.linalg.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm  # \epsilon * (g / ||g||_2)
                    param.data.add_(r_at)  # embedding层参数增加扰动\Delta x

    def restore(self, emb_name):
        for name, param in self.model.named_parameters():
            if param.requires_grad and name == emb_name:
                param.data = self.backup[name]  # 恢复embedding层原有参数值
        self.backup = {}

In [6]:
###########################################################################
# step 1. 初始化
fgm = FGM(model)
###########################################################################

model.train()
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    labels = labels.to(device)
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)
    loss = criterion(out, labels)
    loss.backward()  # 未进行对抗训练的梯度

    ###########################################################################
    # step 2. 对抗训练
    fgm.attack(emb_name='pretrained.embeddings.word_embeddings.weight', epsilon=1.)
    out_adv = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
    loss_adv = criterion(out_adv, labels)
    loss_adv.backward()  # 对抗训练的梯度
    fgm.restore(emb_name='pretrained.embeddings.word_embeddings.weight')  # 恢复embedding层原有参数值
    ###########################################################################

    optimizer.step()  # 梯度累加
    optimizer.zero_grad()

    if i % 10 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)

    if i == 200:
        break

0 0.7277681827545166 0.25
10 0.6629133224487305 0.75
20 0.6543726325035095 0.6875
30 0.5911693572998047 0.9375
40 0.5501372814178467 0.8125
50 0.5407504439353943 0.8125
60 0.46793609857559204 0.84375
70 0.44919583201408386 0.875
80 0.42310693860054016 0.9375
90 0.47143206000328064 0.84375
100 0.3642938435077667 0.96875
110 0.396707147359848 0.90625
120 0.4522962272167206 0.84375
130 0.42607593536376953 0.875
140 0.43155309557914734 0.875
150 0.4146895110607147 0.875
160 0.3909030258655548 0.9375
170 0.41944530606269836 0.90625
180 0.37423813343048096 0.9375
190 0.4125397801399231 0.90625
200 0.4017783999443054 0.90625


In [7]:
# 模型验证
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)
        if i == 5:
            break
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    return correct / total


# 未添加对抗训练:0.88125
# 添加对抗训练后:0.89375
test()

0.89375