In [1]:
from pytorch_lightning import Trainer, seed_everything, LightningDataModule, LightningModule
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch.utils.data as Data
from datasets import load_from_disk
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from sklearn.metrics import accuracy_score
from torch.optim.lr_scheduler import LambdaLR

In [2]:
# Make your code reproducible by calling this method at the beginning of your run.
# This covers PyTorch, NumPy and Python random number generators.
seed_everything(2022)

Global seed set to 2022


2022

In [3]:
class Dataset(Data.Dataset):
    """定义数据集"""

    def __init__(self, split):
        # self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)
        self.dataset = load_from_disk('seamew_ChnSentiCorp')[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """定义索引方式"""
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label


dataset_train = Dataset('train')  # torch.utils.data.Dataset

for text, label in dataset_train:
    # 调用__getitem__方法
    print(text)
    print(label)
    break

选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般
1


In [4]:
model_ckpt = "bert-base-chinese"

token = BertTokenizer.from_pretrained(model_ckpt)
print(token.model_input_names)
pretrained = BertModel.from_pretrained(model_ckpt)
print(pretrained.num_parameters())

['input_ids', 'token_type_ids', 'attention_mask']


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


102267648


In [5]:
def get_collate_fn(tokenizer, max_len=512):
    """返回collate_fun函数(通过闭包函数引入形参)"""

    def collate_fn(data):
        sents = [i[0] for i in data]
        labels = [i[1] for i in data]

        # 批量编码句子
        data = tokenizer(text=sents,
                         truncation=True,
                         padding='max_length',
                         max_length=max_len,
                         return_token_type_ids=True,
                         return_attention_mask=True,
                         return_tensors='pt')

        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        token_type_ids = data['token_type_ids']
        labels = torch.LongTensor(labels)
        return input_ids, attention_mask, token_type_ids, labels

    return collate_fn

In [6]:
class CustomDataModule(LightningDataModule):  # 必须继承自LightningDataModule
    def __init__(self, tokenizer, batch_size=16):
        super().__init__()
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.d_train = Dataset('train')
        self.d_valid = Dataset('validation')

    def train_dataloader(self):
        """返回训练数据集"""

        dataloader_train = torch.utils.data.DataLoader(dataset=self.d_train,
                                                       batch_size=self.batch_size,
                                                       collate_fn=get_collate_fn(self.tokenizer),
                                                       shuffle=True)
        return dataloader_train

    def val_dataloader(self):
        """返回验证数据集"""

        dataloader_valid = torch.utils.data.DataLoader(dataset=self.d_valid,
                                                       batch_size=self.batch_size,
                                                       collate_fn=get_collate_fn(self.tokenizer),
                                                       shuffle=False)
        return dataloader_valid

In [7]:
cdm = CustomDataModule(token)

for batch_idx, batch in enumerate(cdm.train_dataloader()):
    input_ids, attention_mask, token_type_ids, labels = batch
    print(input_ids)
    print(input_ids.shape)
    print(attention_mask)
    print(token_type_ids)
    print(labels)
    model_result = pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
    print(model_result.last_hidden_state.shape)
    break

tensor([[ 101, 2190,  754,  ...,    0,    0,    0],
        [ 101, 4692,  749,  ...,    0,    0,    0],
        [ 101,  122,  510,  ...,    0,    0,    0],
        ...,
        [ 101, 3300, 4157,  ...,    0,    0,    0],
        [ 101, 1355, 4638,  ...,    0,    0,    0],
        [ 101,  122,  119,  ...,    0,    0,    0]])
torch.Size([16, 512])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
tensor([1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0])
torch.Size([16, 512, 768])


In [8]:
def get_parameters(model,
                   encoder_layer_init_lr=2e-5,  # bert模型最后一个encoder结构的学习率
                   multiplier=0.95,  # 衰退因子
                   custom_lr=1e-4):  # 自定义的网络层学习率
    parameters = []
    lr = encoder_layer_init_lr

    # encoder层:
    # bert-larger共有24个encoder结构(分别为encoder.layer.0, encoder.layer.1, ......, encoder.layer.23)
    # bert-base共有12个encoder结构(分别为encoder.layer.0, encoder.layer.1, ......, encoder.layer.11)
    for layer in range(11, -1, -1):
        layer_params = {
            'params': [param for name, param in model.named_parameters() if f'encoder.layer.{layer}.' in name],
            # 关键字in表示是否包含
            'lr': lr
        }
        parameters.append(layer_params)
        lr *= multiplier  # 上个encoder结构的学习率 = 该encoder结构的学习率 * 衰退因子

    # 自定义网络层:下游任务自定义的网络层(具体任务对应修改)
    custom_params = {
        'params': [param for name, param in model.named_parameters() if 'linear' in name],
        'lr': custom_lr
    }
    parameters.append(custom_params)
    return parameters  # 这里bert模型的embedding层未加入优化器(即不参与梯度更新)


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # 学习率预热(线性增加)
            return float(current_step) / float(max(1, num_warmup_steps))
        # 学习率线性衰减(最小为0)
        # num_training_steps后学习率恒为0
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda)

In [9]:
class PLModel(LightningModule):  # 必须继承自LightningModule
    """下游训练任务模型"""

    def __init__(self, pretrained_model):
        super().__init__()
        self.model = nn.ModuleDict({
            'pretrained': pretrained_model,
            'fc': torch.nn.Linear(768, 2)  # 二分类任务
        })

    def forward(self, *args):
        input_ids, attention_mask, token_type_ids = args
        out = self.model['pretrained'](input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids)

        out = self.model['fc'](out.pooler_output)
        out = out.softmax(dim=1)  # 模型预测值
        return out  #  预测结果

    def training_step(self, batch, batch_idx):
        """
        单步(step)训练过程(Trainer实例fit方法调用)

        Parameters
        ---------
        batch : Tuple
            训练数据集每个批次数据
        batch_idx : int
            该批次数据索引

        Returns
        -------
        tr_loss : tensor
            该批次数据损失值(进度条中`loss=`的值)
        """

        input_ids, attention_mask, token_type_ids, labels = batch
        out = self(input_ids, attention_mask, token_type_ids)  # 执行forward方法
        tr_loss = nn.CrossEntropyLoss()(out, labels)
        if batch_idx % 100 == 0 and batch_idx > 0:
            predict = out.argmax(dim=1).cpu().numpy()
            accuracy = accuracy_score(labels.cpu().numpy(), predict)  # 评估指标
            print('| step {:5d} | loss {:8.5f} | accuracy {:8.5f} |'.format(batch_idx, tr_loss.item(), accuracy))
        return tr_loss

    def validation_step(self, batch, batch_idx):
        """
        单步(step)验证过程(Trainer实例fit方法,validate方法调用)

        Parameters
        ---------
        batch : Tuple
            验证数据集每个批次数据
        batch_idx : int
            该批次数据索引

        Returns
        -------
        result: dict
            自定义的一些返回值(供validation_epoch_end方法计算)
        """

        input_ids, attention_mask, token_type_ids, labels = batch
        out = self(input_ids, attention_mask, token_type_ids)  # 执行forward方法
        val_loss = nn.CrossEntropyLoss()(out, labels)
        self.log("val_loss", val_loss.item())  # Log the metric you want to monitor using log() method
        return {"loss": val_loss, "preds": out, "labels": labels}

    def validation_epoch_end(self, outputs):
        """
        轮次(epoch)验证评估(Trainer实例fit方法,validate方法调用)
        调用顺序:
                1. 计算该epoch前两步验证数据的结果(没有进行训练时)
                2. 该epoch训练过程结束后,计算该epoch所有验证数据的结果(每训练一轮进行一次验证)

        Parameters
        ---------
        outputs : list
            outputs计算过程如下:
            outputs = []
            for val_batch in CustomDataModule(token).val_dataloader:
                out = validation_step(val_batch)
                outputs.append(out)
            所有validation_step方法返回值组成的列表
        """

        preds = torch.cat([x["preds"] for x in outputs]).cpu()
        labels = torch.cat([x["labels"] for x in outputs]).cpu()
        val_epoch_loss = torch.stack([x["loss"] for x in outputs]).mean().cpu().item()
        val_epoch_acc = accuracy_score(labels.numpy(), preds.argmax(dim=1).numpy())  # 验证数据集准确率
        print('|valid loss {:8.5f} | valid accuracy {:8.5f}|'.format(val_epoch_loss, val_epoch_acc))
        # self.log("val_loss", val_epoch_loss.item())  # Log the metric you want to monitor using log() method
        self.log_dict({'val_loss': val_epoch_loss, 'val_acc': val_epoch_acc})  # 仅validate方法打印并返回该字典

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """
        单次预测(Trainer实例predict方法调用)

        Parameters
        ---------
        batch : Tuple
            验证数据集每个批次数据
        batch_idx : int
            该批次数据索引
        dataloader_idx : int
            Index of the current dataloader
        
        Returns
        -------
        out: tensor
            单次预测结果
        """

        input_ids, attention_mask, token_type_ids, _ = batch
        out = self(input_ids, attention_mask, token_type_ids)  # 执行forward方法
        return out

    def configure_optimizers(self):
        # 优化器
        parameters = get_parameters(self.model, 2e-5, 0.95, 1e-4)
        optimizer = torch.optim.AdamW(parameters, lr=1e-4)
        # self.trainer.estimated_stepping_batches等价于len(dataloader_train)
        scheduler = {
            "scheduler": get_linear_schedule_with_warmup(optimizer, 0, self.trainer.estimated_stepping_batches * 2),
            # The scheduler instance
            # The unit of the scheduler's step size, could also be 'step'.
            # 'epoch' updates the scheduler on epoch end whereas 'step'
            # updates it after a optimizer update.
            "interval": "step",
            # How many epochs/steps should pass between calls to
            # `scheduler.step()`. 1 corresponds to updating the learning
            # rate after every epoch/step.
            "frequency": 1}
        return [optimizer], [
            scheduler]  # **Two lists** - The first list has multiple optimizers, and the second has multiple LR schedulers

In [10]:
model = PLModel(pretrained)
model

PLModel(
  (model): ModuleDict(
    (pretrained): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                

In [11]:
trainer = Trainer(
    max_epochs=5,  # Stop training once this number of epochs is reached
    # max_steps=100,  #  Stop training after this number of steps
    # Supports passing different accelerator types (“cpu”, “gpu”, “tpu”, “ipu”, “hpu”, “mps, “auto”) as well as custom accelerator instances.
    accelerator="gpu",
    enable_checkpointing=False,  # If True, enable checkpointing.
    callbacks=[EarlyStopping(monitor="val_loss",  # quantity to be monitored
                             # one of 'min', 'max'. In 'min' mode, training will stop when the quantity monitored has stopped decreasing and in 'max' mode it will stop when the quantity monitored has stopped increasing.
                             mode="min",
                             # minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than or equal to min_delta, will count as no improvement.
                             min_delta=0.0,
                             # number of checks with no improvement after which training will be stopped.
                             patience=20,
                             # verbosity mode.
                             verbose=True)])  # 加入早停机制
trainer.fit(model=model, datamodule=cdm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
  rank_zero_warn(

  | Name  | Type       | Params
-------------------------------------
0 | model | ModuleDict | 102 M 
-------------------------------------
102 M     Trainable params
0         Non-trainable params
102 M     Total params
409.077   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


|valid loss  0.69143 | valid accuracy  0.59375|


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

| step   100 | loss  0.47934 | accuracy  0.81250 |
| step   200 | loss  0.49270 | accuracy  0.81250 |
| step   300 | loss  0.32406 | accuracy  1.00000 |
| step   400 | loss  0.56155 | accuracy  0.75000 |
| step   500 | loss  0.35649 | accuracy  0.93750 |


Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.397


|valid loss  0.39747 | valid accuracy  0.91250|
| step   100 | loss  0.31428 | accuracy  1.00000 |
| step   200 | loss  0.31394 | accuracy  1.00000 |
| step   300 | loss  0.41560 | accuracy  0.87500 |
| step   400 | loss  0.41738 | accuracy  0.87500 |
| step   500 | loss  0.31879 | accuracy  1.00000 |


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.393


|valid loss  0.39343 | valid accuracy  0.91500|
| step   100 | loss  0.31382 | accuracy  1.00000 |
| step   200 | loss  0.37612 | accuracy  0.93750 |
| step   300 | loss  0.35675 | accuracy  0.93750 |
| step   400 | loss  0.38119 | accuracy  0.93750 |
| step   500 | loss  0.37525 | accuracy  0.93750 |


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.012 >= min_delta = 0.0. New best score: 0.382


|valid loss  0.38158 | valid accuracy  0.93167|
| step   100 | loss  0.33717 | accuracy  1.00000 |
| step   200 | loss  0.38693 | accuracy  0.93750 |
| step   300 | loss  0.43730 | accuracy  0.87500 |
| step   400 | loss  0.32148 | accuracy  1.00000 |
| step   500 | loss  0.41800 | accuracy  0.87500 |


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.003 >= min_delta = 0.0. New best score: 0.378


|valid loss  0.37838 | valid accuracy  0.93167|
| step   100 | loss  0.31406 | accuracy  1.00000 |
| step   200 | loss  0.34210 | accuracy  0.93750 |
| step   300 | loss  0.31363 | accuracy  1.00000 |
| step   400 | loss  0.31369 | accuracy  1.00000 |
| step   500 | loss  0.37603 | accuracy  0.93750 |


Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


|valid loss  0.38702 | valid accuracy  0.92583|


In [12]:
validate_result = trainer.validate(model=model, datamodule=cdm)
validate_result

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

|valid loss  0.38702 | valid accuracy  0.92583|
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.9258333333333333
        val_loss             0.387024462223053
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.387024462223053, 'val_acc': 0.9258333333333333}]

In [13]:
dataloader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=8,
                                              collate_fn=get_collate_fn(token),
                                              shuffle=False)
predict_list = trainer.predict(model=model, dataloaders=dataloader_test)
print(len(predict_list),  # 所有单次预测结果组成的列表
      len(dataloader_test))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 600it [00:00, ?it/s]

150 150


In [14]:
trainer.save_checkpoint(filepath='model_checkpoint.pkl')  # Runs routine to create a checkpoint.

In [15]:
# LightningModule from checkpoint
model_load = PLModel.load_from_checkpoint("model_checkpoint.pkl",
                                          pretrained_model=BertModel.from_pretrained('bert-base-chinese'))
model_load

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


PLModel(
  (model): ModuleDict(
    (pretrained): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                

In [16]:
trainer.validate(model=model_load, datamodule=cdm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

|valid loss  0.38702 | valid accuracy  0.92583|
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.9258333333333333
        val_loss             0.387024462223053
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.387024462223053, 'val_acc': 0.9258333333333333}]

In [17]:
# nn.Module from checkpoint
checkpoint = torch.load("model_checkpoint.pkl")
print(checkpoint.keys())

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])
