In [1]:
import torch
import torch.utils.data as Data
from datasets import load_from_disk
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import numpy as np
import random
import math

In [2]:
dataset = load_from_disk('seamew_ChnSentiCorp/')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [3]:
class CFG:
    seed = 42
    epochs = 5
    model_name = "bert-base-chinese"
    batch_size = 64
    lr = 5e-4
    
    num_warmup_steps = 50
    num_training_steps = math.ceil(len(dataset['train']) / batch_size) * epochs  # 向上取整

In [4]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # A bool that, if True, causes cuDNN to only use deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


set_seed(CFG.seed)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
class Dataset(Data.Dataset):
    """定义数据集"""

    def __init__(self, data, split):
        self.split = split
        self.dataset = data[split]

    # 必须实现__len__魔法方法
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """定义索引方式"""
        text = self.dataset[i]['text']
        if self.split == 'test':
            return text,  # 测试数据集不含标签
        else:
            label = self.dataset[i]['label']
            return text, label


dataset_train = Dataset(dataset, 'train')
dataset_validation = Dataset(dataset, 'validation')
dataset_test = Dataset(dataset, 'test')

for text, label in dataset_train:
    # 调用__getitem__方法
    print(text)
    print(label)
    break

for text in dataset_test:
    # 调用__getitem__方法
    print(text)  # 元组
    break

选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般
1
('这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般',)


In [7]:
tokenizer = BertTokenizer.from_pretrained(CFG.model_name)
print(tokenizer.model_input_names)
print(tokenizer)

pretrained = BertModel.from_pretrained(CFG.model_name)
print(pretrained.num_parameters())

# 冻结网络层参数(不进行梯度更新)
for param in pretrained.parameters():
    param.requires_grad = False

['input_ids', 'token_type_ids', 'attention_mask']
BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
102267648


In [8]:
def get_collate_fn(tokenizer, max_len=512):
    """返回collate_fun函数(通过闭包函数引入形参)"""

    def collate_fn(data):
        sents = [i[0] for i in data]

        # 批量编码句子
        text_token = tokenizer(text=sents,
                               truncation=True,
                               padding='max_length',
                               max_length=max_len,
                               return_token_type_ids=True,
                               return_attention_mask=True,
                               return_tensors='pt')

        input_ids = text_token['input_ids']
        attention_mask = text_token['attention_mask']
        token_type_ids = text_token['token_type_ids']
        # 返回值必须为字典(键与模型forward方法形参对应)
        result = {'input_ids': input_ids,  # ★★★★★对应模型forward方法input_ids参数
                  'attention_mask': attention_mask,  # ★★★★★对应模型forward方法attention_mask参数
                  "token_type_ids": token_type_ids}  # ★★★★对应模型forward方法token_type_ids参数

        if len(data[0]) == 1:
            return result  # 测试数据集不含标签
        else:
            labels = [i[1] for i in data]
            labels = torch.LongTensor(labels)
            result['labels'] = labels  # ★★★★对应模型forward方法labels参数
            return result

    return collate_fn

In [9]:
dataLoader_test = Data.DataLoader(dataset=dataset_test, batch_size=2, collate_fn=get_collate_fn(tokenizer, max_len=512))
for i in dataLoader_test:
    print(i)
    break

{'input_ids': tensor([[ 101, 6821,  702,  ...,    0,    0,    0],
        [ 101, 2577, 4708,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


In [10]:
class Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)  # 二分类任务;768:模型hidden_size
        self.pretrained = pretrained_model
        self.criterion = torch.nn.CrossEntropyLoss()  # 损失函数

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        out = self.pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)

        out = self.fc(out.pooler_output)
        out = out.softmax(dim=1)
        loss = None
        if labels is not None:  # 若包含标签
            loss = self.criterion(out, labels)

        if loss is not None:
            return (loss, out)
        else:
            return out

model = Model(pretrained)
model = model.to(device)

In [11]:
def compute_metrics(pred):
    """验证数据集评估函数"""
    labels = pred.label_ids
    preds = pred.predictions 
    preds_argmax = preds.argmax(-1)
    acc = accuracy_score(labels, preds_argmax)
    return {"accuracy": acc} 


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # 学习率预热(线性增加)
            return float(current_step) / float(max(1, num_warmup_steps))
        # 学习率线性衰减(最小为0)
        # num_training_steps后学习率恒为0
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda)


model_name = f"{CFG.model_name}-finetuned-emotion"
optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)  # 优化器
scheduler_lr = get_linear_schedule_with_warmup(optimizer, CFG.num_warmup_steps, CFG.num_training_steps)  # 学习率预热(必须为LambdaLR对象)

In [None]:
training_args = TrainingArguments(
    output_dir=model_name,
    overwrite_output_dir=False,
    save_total_limit=None, 
    seed=CFG.seed,
    num_train_epochs=CFG.epochs,
    max_grad_norm=1.0,
    gradient_accumulation_steps=1, 
    per_device_train_batch_size=CFG.batch_size, 
    per_device_eval_batch_size=CFG.batch_size, 
    dataloader_drop_last=False,  
    eval_strategy="epoch", 
    eval_steps=None, 
    logging_strategy='epoch', 
    save_strategy='epoch',
    log_level='passive', 
    load_best_model_at_end=False, 
    metric_for_best_model=None,
    disable_tqdm=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    data_collator=get_collate_fn(tokenizer, max_len=512),  # 对应pytorch torch.utils.data.DataLoade 参数collate_fn
    optimizers=(optimizer, scheduler_lr),  
    compute_metrics=compute_metrics)

trainer.train()  # 模型训练

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6058,0.534635,0.805
2,0.5195,0.511587,0.815833
3,0.5045,0.506941,0.814167
4,0.4959,0.494394,0.830833
5,0.4915,0.49323,0.831667


TrainOutput(global_step=750, training_loss=0.5234297993977864, metrics={'train_runtime': 182.4005, 'train_samples_per_second': 263.157, 'train_steps_per_second': 4.112, 'total_flos': 0.0, 'train_loss': 0.5234297993977864, 'epoch': 5.0})

In [13]:
optimizer  # 初始化学习率0.0005,最终学习率归0(get_linear_schedule_with_warmup学习率预热归0)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.0005
    lr: 0.0
    maximize: False
    weight_decay: 0.01
)

In [14]:
preds_output = trainer.predict(dataset_validation)  # 预测和评估包含标签的验证数据集
preds_output

PredictionOutput(predictions=array([[0.28464532, 0.7153547 ],
       [0.02458108, 0.975419  ],
       [0.26128414, 0.7387159 ],
       ...,
       [0.51695514, 0.48304483],
       [0.16963767, 0.8303623 ],
       [0.9440778 , 0.05592221]], dtype=float32), label_ids=array([1, 1, 0, ..., 0, 1, 0]), metrics={'test_loss': 0.49323007464408875, 'test_accuracy': 0.8316666666666667, 'test_runtime': 3.8466, 'test_samples_per_second': 311.96, 'test_steps_per_second': 4.939})

In [15]:
print(preds_output.predictions)  # 预测结果
print(type(preds_output.predictions))
print(preds_output.predictions.shape)

[[0.28464532 0.7153547 ]
 [0.02458108 0.975419  ]
 [0.26128414 0.7387159 ]
 ...
 [0.51695514 0.48304483]
 [0.16963767 0.8303623 ]
 [0.9440778  0.05592221]]
<class 'numpy.ndarray'>
(1200, 2)


In [16]:
preds_output.metrics  # 评估结果

{'test_loss': 0.49323007464408875,
 'test_accuracy': 0.8316666666666667,
 'test_runtime': 3.8466,
 'test_samples_per_second': 311.96,
 'test_steps_per_second': 4.939}

In [17]:
for i in dataLoader_test:
    print(i)
    break

{'input_ids': tensor([[ 101, 6821,  702,  ...,    0,    0,    0],
        [ 101, 2577, 4708,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


In [18]:
trainer.predict(dataset_test)  # 预测不含标签的测试数据集

PredictionOutput(predictions=array([[0.72549987, 0.2745002 ],
       [0.9635792 , 0.03642078],
       [0.3198018 , 0.6801982 ],
       ...,
       [0.21934602, 0.780654  ],
       [0.12182001, 0.87817997],
       [0.9847793 , 0.01522069]], dtype=float32), label_ids=None, metrics={'test_runtime': 3.8365, 'test_samples_per_second': 312.787, 'test_steps_per_second': 4.952})

In [19]:
def predict(model, data_loader):
    """预测不含标签的测试数据集(自定义)"""
    model.eval()  # Sets the module in evaluation mode.
    predict_list = []
    with torch.no_grad():
        for i in data_loader:
            input_ids = i['input_ids'].to(device)
            attention_mask = i['attention_mask'].to(device)
            token_type_ids = i['token_type_ids'].to(device)
            output = model(input_ids, attention_mask, token_type_ids)
            predict_list.append(output)
    predict_all = torch.cat(predict_list, dim=0)
    return predict_all


result = predict(model, dataLoader_test)
print(result)

tensor([[0.7255, 0.2745],
        [0.9636, 0.0364],
        [0.3198, 0.6802],
        ...,
        [0.2193, 0.7807],
        [0.1218, 0.8782],
        [0.9848, 0.0152]], device='cuda:0')
