In [20]:
import torch
import torch.utils.data as Data
from datasets import load_from_disk
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import numpy as np
import random
import math

In [21]:
dataset = load_from_disk('seamew_ChnSentiCorp/')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [22]:
class CFG:
    seed = 42
    epochs = 5
    model_name = "bert-base-chinese"
    batch_size = 64
    lr = 5e-4
    
    num_warmup_steps = 50
    num_training_steps = math.ceil(len(dataset['train']) / batch_size) * epochs  # 向上取整

In [23]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # A bool that, if True, causes cuDNN to only use deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


set_seed(CFG.seed)

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [25]:
class Dataset(Data.Dataset):
    """定义数据集"""

    def __init__(self, data, split):
        self.split = split
        self.dataset = data[split]

    # 必须实现__len__魔法方法
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """定义索引方式"""
        text = self.dataset[i]['text']
        if self.split == 'test':
            return text,  # 测试数据集不含标签
        else:
            label = self.dataset[i]['label']
            return text, label


dataset_train = Dataset(dataset, 'train')
dataset_validation = Dataset(dataset, 'validation')
dataset_test = Dataset(dataset, 'test')

for text, label in dataset_train:
    # 调用__getitem__方法
    print(text)
    print(label)
    break

for text in dataset_test:
    # 调用__getitem__方法
    print(text)  # 元组
    break

选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般
1
('这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般',)


In [26]:
tokenizer = BertTokenizer.from_pretrained(CFG.model_name)
print(tokenizer.model_input_names)
print(tokenizer)

pretrained = BertModel.from_pretrained(CFG.model_name)
print(pretrained.num_parameters())

# 冻结网络层参数(不进行梯度更新)
for param in pretrained.parameters():
    param.requires_grad = False

['input_ids', 'token_type_ids', 'attention_mask']
BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


102267648


In [27]:
def get_collate_fn(tokenizer, max_len=512):
    """返回collate_fun函数(通过闭包函数引入形参)"""

    def collate_fn(data):
        sents = [i[0] for i in data]

        # 批量编码句子
        text_token = tokenizer(text=sents,
                               truncation=True,
                               padding='max_length',
                               max_length=max_len,
                               return_token_type_ids=True,
                               return_attention_mask=True,
                               return_tensors='pt')

        input_ids = text_token['input_ids']
        attention_mask = text_token['attention_mask']
        token_type_ids = text_token['token_type_ids']
        # 返回值必须为字典(键与模型forward方法形参对应)
        result = {'input_ids': input_ids,  # ★★★★★对应模型forward方法input_ids参数
                  'attention_mask': attention_mask,  # ★★★★★对应模型forward方法attention_mask参数
                  "token_type_ids": token_type_ids}  # ★★★★对应模型forward方法token_type_ids参数

        if len(data[0]) == 1:
            return result  # 测试数据集不含标签
        else:
            labels = [i[1] for i in data]
            labels = torch.LongTensor(labels)
            result['labels'] = labels  # ★★★★对应模型forward方法labels参数
            return result

    return collate_fn

In [28]:
dataLoader_test = Data.DataLoader(dataset=dataset_test, batch_size=2, collate_fn=get_collate_fn(tokenizer, max_len=512))
for i in dataLoader_test:
    print(i)
    break

{'input_ids': tensor([[ 101, 6821,  702,  ...,    0,    0,    0],
        [ 101, 2577, 4708,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


In [29]:
class Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)  # 二分类任务;768:模型hidden_size
        self.pretrained = pretrained_model
        self.criterion = torch.nn.CrossEntropyLoss()  # 损失函数

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        out = self.pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)

        out = self.fc(out.pooler_output)
        out = out.softmax(dim=1)
        loss = None
        if labels is not None:  # 若包含标签
            loss = self.criterion(out, labels)

        # 训练与评估阶段
        # ★★★★★
        # 返回值为一个元组
        # 元组的第一个元素必须为该批次数据的损失值
        # 元组的第二个元素为该批次数据的预测值(可选)
        # * 验证数据集评估函数指标的计算
        # * predict方法预测结果(predictions)与评估结果(metrics)(结合输入labels)的计算
        if loss is not None:
            return (loss, out)
        # 预测阶段
        # ★★★★★
        # 返回值为模型的预测结果
        else:
            return out

model = Model(pretrained)
model = model.to(device)

In [30]:
def compute_metrics(pred):
    """验证数据集评估函数"""
    labels = pred.label_ids  # 对应自定义模型forward函数输入:labels
    preds = pred.predictions  # 对应自定义模型forward函数返回值的第二个元素
    preds_argmax = preds.argmax(-1)
    acc = accuracy_score(labels, preds_argmax)
    return {"accuracy": acc}  # return a dictionary string to metric value


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # 学习率预热(线性增加)
            return float(current_step) / float(max(1, num_warmup_steps))
        # 学习率线性衰减(最小为0)
        # num_training_steps后学习率恒为0
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda)


model_name = f"{CFG.model_name}-finetuned-emotion"
optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)  # 优化器
scheduler_lr = get_linear_schedule_with_warmup(optimizer, CFG.num_warmup_steps, CFG.num_training_steps)  # 学习率预热(必须为LambdaLR对象)

In [31]:
# 主要调节的超参数
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written.
    output_dir=model_name,
    # If True, overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint directory.
    overwrite_output_dir=False,  # 默认:False
    # save_total_limit (int, optional) —If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir.
    save_total_limit=None,  # 默认:None

    seed=CFG.seed,

    # Total number of training epochs to perform
    num_train_epochs=CFG.epochs,  # 默认:3.0
    # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs. I
    # max_steps=100,  # 默认:-1

    #  Maximum gradient norm (for gradient clipping).
    max_grad_norm=1.0,  # 默认:1.0
    # Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
    gradient_accumulation_steps=1,  # 默认:1
    
    # 对应pytorch DataLoader 参数batch_size
    # The batch size per GPU/TPU core/CPU for training.
    per_device_train_batch_size=CFG.batch_size,  # 默认:8
    # The batch size per GPU/TPU core/CPU for evaluation.
    # 对应pytorch DataLoader 参数batch_size
    per_device_eval_batch_size=CFG.batch_size,  # 默认:8
    # Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not.
    # 对应pytorch DataLoader 参数drop_last
    dataloader_drop_last=False,  # 默认:False

    # The evaluation strategy to adopt during training. Possible values are:
    # "no": No evaluation is done during training.
    # "steps": Evaluation is done (and logged) every eval_steps.
    # "epoch": Evaluation is done at the end of each epoch.
    evaluation_strategy="epoch",  # 默认:'no'
    # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
    eval_steps=None,  # 默认None

    # The logging strategy to adopt during training. Possible values are:
    # "no": No logging is done during training.
    # "epoch": Logging is done at the end of each epoch.
    # "steps": Logging is done every logging_steps.
    logging_strategy='epoch',  # 默认:'steps'
    # Number of update steps between two logs if logging_strategy="steps".
    # logging_steps=500,  # 默认:500

    # The checkpoint save strategy to adopt during training. Possible values are:
    # "no": No save is done during training.
    # "epoch": Save is done at the end of each epoch.
    # "steps": Save is done every save_steps.
    save_strategy='epoch',  # 默认:'steps'
    #  Number of updates steps before two checkpoint saves if save_strategy="steps".
    save_steps=500,  # 默认:500

    # Logger log level to use on the main process. Possible choices are the log levels as strings: ‘debug’, ‘info’, ‘warning’, ‘error’ and ‘critical’, plus a ‘passive’ level which doesn’t set anything and lets the application set the level.
    log_level='passive',  # 默认:'passive'

    # Whether or not to load the best model found during training at the end of training.
    # When set to True, the parameters save_strategy needs to be the same as evaluation_strategy, and in the case it is “steps”, save_steps must be a round multiple of eval_steps.
    load_best_model_at_end=False,  # 默认load_best_model_at_end=False
    # Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models.
    # Must be the name of a metric returned by the evaluation with or without the prefix "eval_".
    # Will default to "loss" if unspecified and load_best_model_at_end=True (to use the evaluation loss).
    metric_for_best_model=None,

    disable_tqdm=False,  # 是否使用tqdm显示进度(.py运行时设置disable_tqdm=True)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    data_collator=get_collate_fn(tokenizer, max_len=512),  # 对应pytorch torch.utils.data.DataLoade 参数collate_fn
    optimizers=(optimizer, scheduler_lr),  # 自定义优化器与学习率预热
    compute_metrics=compute_metrics)

trainer.train()  # 模型训练

Epoch,Training Loss,Validation Loss,Accuracy
1,0.606,0.543939,0.784167
2,0.5244,0.512102,0.821667
3,0.5047,0.500557,0.825833
4,0.4966,0.497011,0.83
5,0.4932,0.495462,0.829167


TrainOutput(global_step=750, training_loss=0.5249578043619791, metrics={'train_runtime': 255.5864, 'train_samples_per_second': 187.803, 'train_steps_per_second': 2.934, 'total_flos': 0.0, 'train_loss': 0.5249578043619791, 'epoch': 5.0})

In [32]:
optimizer  # 初始化学习率0.0005,最终学习率归0(get_linear_schedule_with_warmup学习率预热归0)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    initial_lr: 0.0005
    lr: 0.0
    maximize: False
    weight_decay: 0.01
)

In [33]:
# Run prediction and returns predictions and potential metrics.
# Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in `evaluate()`.
preds_output = trainer.predict(dataset_validation)  # 预测和评估包含标签的验证数据集
preds_output

PredictionOutput(predictions=array([[0.30090708, 0.699093  ],
       [0.02519588, 0.97480416],
       [0.22255814, 0.7774418 ],
       ...,
       [0.51666945, 0.48333058],
       [0.18903433, 0.8109657 ],
       [0.93123865, 0.06876133]], dtype=float32), label_ids=array([1, 1, 0, ..., 0, 1, 0]), metrics={'test_loss': 0.495461642742157, 'test_accuracy': 0.8291666666666667, 'test_runtime': 5.1215, 'test_samples_per_second': 234.308, 'test_steps_per_second': 3.71})

In [34]:
print(preds_output.predictions)  # 预测结果
print(type(preds_output.predictions))
print(preds_output.predictions.shape)

[[0.30090708 0.699093  ]
 [0.02519588 0.97480416]
 [0.22255814 0.7774418 ]
 ...
 [0.51666945 0.48333058]
 [0.18903433 0.8109657 ]
 [0.93123865 0.06876133]]
<class 'numpy.ndarray'>
(1200, 2)


In [35]:
preds_output.metrics  # 评估结果

{'test_loss': 0.495461642742157,
 'test_accuracy': 0.8291666666666667,
 'test_runtime': 5.1215,
 'test_samples_per_second': 234.308,
 'test_steps_per_second': 3.71}

In [36]:
for i in dataLoader_test:
    print(i)
    break

{'input_ids': tensor([[ 101, 6821,  702,  ...,    0,    0,    0],
        [ 101, 2577, 4708,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


In [37]:
trainer.predict(dataset_test)  # 预测不含标签的测试数据集

PredictionOutput(predictions=array([[0.74712986, 0.25287017],
       [0.9722216 , 0.02777843],
       [0.40959132, 0.5904087 ],
       ...,
       [0.20984258, 0.7901574 ],
       [0.09190209, 0.9080979 ],
       [0.985579  , 0.01442092]], dtype=float32), label_ids=None, metrics={'test_runtime': 5.1017, 'test_samples_per_second': 235.214, 'test_steps_per_second': 3.724})

In [38]:
def predict(model, data_loader):
    """预测不含标签的测试数据集(自定义)"""
    model.eval()  # Sets the module in evaluation mode.
    predict_list = []
    with torch.no_grad():
        for i in data_loader:
            input_ids = i['input_ids'].to(device)
            attention_mask = i['attention_mask'].to(device)
            token_type_ids = i['token_type_ids'].to(device)
            output = model(input_ids, attention_mask, token_type_ids)
            predict_list.append(output)
    predict_all = torch.cat(predict_list, dim=0)
    return predict_all


result = predict(model, dataLoader_test)
print(result)

tensor([[0.7471, 0.2529],
        [0.9722, 0.0278],
        [0.4096, 0.5904],
        ...,
        [0.2098, 0.7902],
        [0.0919, 0.9081],
        [0.9856, 0.0144]], device='cuda:0')
