In [1]:
import torch
import torch.utils.data as Data
from datasets import load_from_disk
from transformers import BertTokenizer, BertModel
import torch.optim as optim
import time
from sklearn.metrics import accuracy_score
import numpy as np
import random
import copy

In [2]:
dataset = load_from_disk('seamew_ChnSentiCorp/')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [3]:
class CFG:
    seed = 42
    epochs = 5
    model_name = "bert-base-chinese"
    lr = 5e-4
    verbose = 100
    batch_size = 16

In [4]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # A bool that, if True, causes cuDNN to only use deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子

set_seed(CFG.seed)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
class Dataset(Data.Dataset):
    """定义数据集"""

    def __init__(self, data, split):
        self.dataset = data[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """定义索引方式"""
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label


dataset_train = Dataset(dataset, 'train')  # torch.utils.data.Dataset

for text, label in dataset_train:
    # 调用__getitem__方法
    print(text)
    print(label)
    break

选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般
1


In [7]:
tokenizer = BertTokenizer.from_pretrained(CFG.model_name)
print(tokenizer.model_input_names)
print(tokenizer)

pretrained = BertModel.from_pretrained(CFG.model_name)
print(pretrained.num_parameters())

# 冻结网络层参数(不进行梯度更新)
for param in pretrained.parameters():
    param.requires_grad = False

['input_ids', 'token_type_ids', 'attention_mask']
BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


102267648


In [8]:
def get_collate_fn(tokenizer, max_len=512):
    """返回collate_fun函数(通过闭包函数引入形参)"""

    def collate_fn(data):
        sents = [i[0] for i in data]
        labels = [i[1] for i in data]

        # 批量编码句子
        text_t = tokenizer(text=sents,
                           truncation=True,
                           padding='max_length',
                           max_length=max_len,
                           return_token_type_ids=True,
                           return_attention_mask=True,
                           return_tensors='pt')

        input_ids = text_t['input_ids']
        attention_mask = text_t['attention_mask']
        token_type_ids = text_t['token_type_ids']
        labels = torch.LongTensor(labels)
        return input_ids, attention_mask, token_type_ids, labels

    return collate_fn


# 数据处理
dataloader_valid = torch.utils.data.DataLoader(dataset=Dataset(dataset, 'validation'),
                                               batch_size=CFG.batch_size,
                                               collate_fn=get_collate_fn(tokenizer))

dataloader_train = torch.utils.data.DataLoader(dataset=dataset_train,
                                               batch_size=CFG.batch_size,
                                               collate_fn=get_collate_fn(tokenizer),
                                               shuffle=True,
                                               drop_last=False)

print(len(dataloader_train))

for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(dataloader_train):
    print(input_ids)
    print(input_ids.shape)
    print(attention_mask)
    print(token_type_ids)
    print(labels)
    model_result = pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
    print(model_result.last_hidden_state.shape)
    break

600
tensor([[ 101, 1599, 3614,  ...,    0,    0,    0],
        [ 101, 2218, 3221,  ...,    0,    0,    0],
        [ 101, 3193, 7623,  ...,    0,    0,    0],
        ...,
        [ 101, 2190,  754,  ...,    0,    0,    0],
        [ 101, 2242, 2391,  ...,    0,    0,    0],
        [ 101, 6163, 5143,  ...,    0,    0,    0]])
torch.Size([16, 512])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
tensor([1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0])
torch.Size([16, 512, 768])


In [9]:
class Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)  # 二分类任务
        self.pretrained = pretrained_model

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)

        out = self.fc(out.pooler_output)
        out = out.softmax(dim=1)  # 模型预测值
        return out

In [10]:
# 损失函数
criterion_cross_entropy = torch.nn.CrossEntropyLoss()

model_bert_base = Model(copy.deepcopy(pretrained))  # 必须进行深拷贝(pretrained(模型子网络结构)会参与梯度更新)
model_bert_base = model_bert_base.to(device)  # 模型设备切换

# 优化器
optimizer_adamw = optim.AdamW(model_bert_base.parameters(), lr=CFG.lr)  # 模型设备切换必须在优化器定义前执行

In [11]:
# 模型训练
def train(model, dataloader, criterion, optimizer, device):
    model.train()

    for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(dataloader):
        # 数据设备切换
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
        loss = criterion(out, labels)  # 损失值
        loss.backward()
        optimizer.step()

        if idx % CFG.verbose == 0 and idx > 0:
            predict = out.argmax(dim=1).cpu().numpy()
            accuracy = accuracy_score(labels.cpu().numpy(), predict)  # 评估指标
            print('| step {:5d} | loss {:8.5f} | accuracy {:8.5f} |'.format(idx, loss.item(), accuracy))

In [12]:
# 模型验证
def evaluate(model, dataloader, device):
    model.eval()

    predict_list = []
    y_true_list = []
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, labels in dataloader:
            # 数据设备切换
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
            predict_list.append(out.cpu())
            y_true_list.extend(labels.tolist())

    predict_all = torch.cat(predict_list, dim=0)  # 合并所有批次的预测结果
    y_true_all = torch.tensor(y_true_list)
    accuracy = accuracy_score(y_true_all.numpy(), predict_all.argmax(dim=1).numpy())  # 评估指标
    return accuracy

In [13]:
for epoch in range(1, CFG.epochs + 1):
    epoch_start_time = time.time()
    
    train(model_bert_base, dataloader_train, criterion_cross_entropy, optimizer_adamw, device)
    accu_val = evaluate(model_bert_base, dataloader_valid, device)
    
    print('-' * 63)
    print('| end of epoch {:5d} | time: {:5.2f}s | '
          'valid accuracy {:8.5f} |'.format(epoch,
                                            time.time() - epoch_start_time,
                                            accu_val))
    print('-' * 63)

| step   100 | loss  0.64576 | accuracy  0.62500 |
| step   200 | loss  0.49378 | accuracy  0.81250 |
| step   300 | loss  0.49802 | accuracy  0.87500 |
| step   400 | loss  0.66452 | accuracy  0.56250 |
| step   500 | loss  0.53502 | accuracy  0.75000 |
---------------------------------------------------------------
| end of epoch     1 | time: 46.39s | valid accuracy  0.83000 |
---------------------------------------------------------------
| step   100 | loss  0.52727 | accuracy  0.81250 |
| step   200 | loss  0.48297 | accuracy  0.87500 |
| step   300 | loss  0.49245 | accuracy  0.81250 |
| step   400 | loss  0.42374 | accuracy  0.87500 |
| step   500 | loss  0.47599 | accuracy  0.81250 |
---------------------------------------------------------------
| end of epoch     2 | time: 46.29s | valid accuracy  0.83333 |
---------------------------------------------------------------
| step   100 | loss  0.48862 | accuracy  0.87500 |
| step   200 | loss  0.44639 | accuracy  0.81250 |
| st