In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
import numpy as np
import random
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import math

In [2]:
emotions = load_dataset(path='dair-ai/emotion')
emotions

Using the latest cached version of the dataset since dair-ai/emotion couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'split' at C:\Users\duanm\.cache\huggingface\datasets\dair-ai___emotion\split\0.0.0\cab853a1dbdf4c42c2b3ef2173804746df8825fe (last modified on Wed Nov 13 16:08:02 2024).


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
class CFG:
    seed = 42
    epochs = 4
    model_name = "bert-base-uncased"
    batch_size = 4
    lr = 4e-5
    
    num_warmup_steps = 50
    num_training_steps = math.ceil(len(emotions['train']) / batch_size) * epochs  # 向上取整

In [4]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # A bool that, if True, causes cuDNN to only use deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


seed = 2022
set_seed(seed)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
print(tokenizer.model_input_names)

pretrained = AutoModel.from_pretrained(CFG.model_name)
print(pretrained.num_parameters())



['input_ids', 'token_type_ids', 'attention_mask']




109482240


In [7]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


# 批次处理,整个数据集同时进行处理
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
emotions_encoded  # 原有数据与map函数新增数据('input_ids', 'token_type_ids', 'attention_mask')的联合

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [8]:
emotions_encoded = emotions_encoded.remove_columns(['text'])  # 'text'列不参与训练(即不进入自定义模型forward函数)
emotions_encoded.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
emotions_encoded['train'][:]  # 字典

{'label': tensor([0, 0, 3,  ..., 1, 3, 0]),
 'input_ids': tensor([[  101,  1045,  2134,  ...,     0,     0,     0],
         [  101,  1045,  2064,  ...,     0,     0,     0],
         [  101, 10047,  9775,  ...,     0,     0,     0],
         ...,
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2113,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [9]:
class Customize_Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        self.classifier = torch.nn.Linear(768, num_labels)  # 多分类任务
        self.pretrained = pretrained_model
        self.dropout = nn.Dropout()
        self.loss_fct = nn.CrossEntropyLoss()  # 损失函数

    def forward(self,
                input_ids,  # ★★★★★训练阶段对应emotions_encoded['train']中的input_ids
                attention_mask,  # ★★★★★训练阶段对应emotions_encoded['train']中的attention_mask
                token_type_ids,  # ★★★★★训练阶段对应emotions_encoded['train']中的token_type_ids
                labels=None):  # 标签;★★★★★训练阶段对应emotions_encoded['train']中的labels
        outputs = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:  # 若包含标签
            loss = self.loss_fct(logits.view(-1, num_labels), labels.view(-1))

        # 训练与评估阶段
        # ★★★★★
        # 返回值为一个元组
        # 元组的第一个元素必须为该批次数据的损失值
        # 元组的第二个元素为该批次数据的预测值(可选)
        # * 验证数据集评估函数指标的计算
        # * predict方法预测结果(predictions)与评估结果(metrics)(结合输入labels)的计算
        if loss is not None:
            return (loss, logits)
        # 预测阶段
        # ★★★★★
        # 返回值为模型的预测结果
        else:
            return logits


num_labels = 6

model = Customize_Model(pretrained, num_labels)
model = model.to(device)

In [10]:
def compute_metrics(pred):
    """验证数据集评估函数"""
    labels = pred.label_ids  # 对应自定义模型forward函数输入:labels
    preds = pred.predictions  # 对应自定义模型forward函数返回值的第二个元素
    preds_argmax = preds.argmax(-1)
    f1 = f1_score(labels, preds_argmax, average="weighted")
    acc = accuracy_score(labels, preds_argmax)
    return {"accuracy": acc, "f1": f1}  # return a dictionary string to metric values


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # 学习率预热(线性增加)
            return float(current_step) / float(max(1, num_warmup_steps))
        # 学习率线性衰减(最小为0)
        # num_training_steps后学习率恒为0
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda)


model_name = f"{CFG.model_name}-finetuned-emotion"
optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)  # 优化器
scheduler_lr = get_linear_schedule_with_warmup(optimizer, CFG.num_warmup_steps, CFG.num_training_steps)  # 学习率预热(必须为LambdaLR对象)

In [None]:
# 主要调节的超参数
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written.
    output_dir=model_name,
    # If True, overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint directory.
    overwrite_output_dir=False,  # 默认:False
    # save_total_limit (int, optional) —If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir.
    save_total_limit=None,  # 默认:None

    seed=42,

    # Total number of training epochs to perform
    num_train_epochs=CFG.epochs,  # 默认:3.0
    # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs.
    # max_steps=2000,  # 默认:-1

    # Maximum gradient norm (for gradient clipping).
    max_grad_norm=1.0,  # 默认:1.0
    # Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
    gradient_accumulation_steps=1,  # 默认:1

    # 对应pytorch DataLoader 参数batch_size
    # The batch size per GPU/TPU core/CPU for training.
    per_device_train_batch_size=CFG.batch_size,  # 默认:8
    # 对应pytorch DataLoader 参数batch_size
    # The batch size per GPU/TPU core/CPU for evaluation.
    per_device_eval_batch_size=CFG.batch_size,  # 默认:8
    # Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not.
    # 对应pytorch DataLoader 参数drop_last
    dataloader_drop_last=False,  # 默认:False

    # The evaluation strategy to adopt during training. Possible values are:
    # "no": No evaluation is done during training.
    # "steps": Evaluation is done (and logged) every eval_steps.
    # "epoch": Evaluation is done at the end of each epoch.
    eval_strategy="epoch",  # 默认:'no'
    # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
    eval_steps=None,  # 默认None

    # The logging strategy to adopt during training. Possible values are:
    # "no": No logging is done during training.
    # "epoch": Logging is done at the end of each epoch.
    # "steps": Logging is done every logging_steps.
    logging_strategy='epoch',  # 默认:'steps'
    # Number of update steps between two logs if logging_strategy="steps".
    # logging_steps=500,  # 默认:500

    # The checkpoint save strategy to adopt during training. Possible values are:
    # "no": No save is done during training.
    # "epoch": Save is done at the end of each epoch.
    # "steps": Save is done every save_steps.
    save_strategy='epoch',  # 默认:'steps'
    # Number of updates steps before two checkpoint saves if save_strategy="steps".
    save_steps=500,  # 默认:500

    # Logger log level to use on the main process. Possible choices are the log levels as strings: ‘debug’, ‘info’, ‘warning’, ‘error’ and ‘critical’, plus a ‘passive’ level which doesn’t set anything and lets the application set the level.
    log_level='passive',  # 默认:'passive'

    # Whether or not to load the best model found during training at the end of training.
    # When set to True, the parameters save_strategy needs to be the same as evaluation_strategy, and in the case it is “steps”, save_steps must be a round multiple of eval_steps.
    load_best_model_at_end=False,  # 默认load_best_model_at_end=False
    # Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models.
    # Must be the name of a metric returned by the evaluation with or without the prefix "eval_".
    # Will default to "loss" if unspecified and load_best_model_at_end=True (to use the evaluation loss).
    metric_for_best_model=None,

    # 原理: Activation checkpointing is a technique that trades compute for memory. Instead of keeping tensors needed for backward alive until they are used in gradient computation during backward, forward computation in checkpointed regions omits saving tensors for backward and recomputes them during the backward pass. Activation checkpointing can be applied to any part of a model.
    # 
    # transformer.Trainer内部调用源码:
    # ```python
    # # Activate gradient checkpointing if needed
    # if args.gradient_checkpointing:
    #     if args.gradient_checkpointing_kwargs is None:
    #         gradient_checkpointing_kwargs = {}
    #     else:
    #         gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs
    #     self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
    # ````
    # If True, use gradient checkpointing to save memory at the expense of slower backward pass.
    gradient_checkpointing=True,  # 默认gradient_checkpointing=False
    # Key word arguments to be passed to the gradient_checkpointing_enable method.
    gradient_checkpointing_kwargs=None,  # 默认gradient_checkpointing_kwargs=None

    disable_tqdm=False,  # Whether or not to disable the tqdm progress bars and table of metrics produced by ~notebook.(.py运行时设置disable_tqdm=True)

    # The list of integrations to report the results and logs to. Supported platforms are "azure_ml", "clearml", "codecarbon", "comet_ml", "dagshub", "dvclive", "flyte", "mlflow", "neptune", "tensorboard", and "wandb". Use "all" to report to all integrations installed, "none" for no integrations.
    report_to="wandb",  # 默认:'all'
    # A descriptor for the run. Typically used for wandb, mlflow and comet logging. If not specified, will be the same as output_dir.
    run_name="Trainer_log"
)

# TrainingArguments优化器参数:
# optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`):
#     The optimizer to use, such as "adamw_torch", "adamw_torch_fused", "adamw_apex_fused", "adamw_anyprecision",
#     "adafactor". See `OptimizerNames` in [training_args.py](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py)
#     for a full list of optimizers.
# learning_rate (`float`, *optional*, defaults to 5e-5):
#     The initial learning rate for [`AdamW`] optimizer.
# weight_decay (`float`, *optional*, defaults to 0):
#     The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [`AdamW`]
#     optimizer.
# adam_beta1 (`float`, *optional*, defaults to 0.9):
#     The beta1 hyperparameter for the [`AdamW`] optimizer.
# adam_beta2 (`float`, *optional*, defaults to 0.999):
#     The beta2 hyperparameter for the [`AdamW`] optimizer.
# adam_epsilon (`float`, *optional*, defaults to 1e-8):
#     The epsilon hyperparameter for the [`AdamW`] optimizer.

# TrainingArguments学习率调整参数:
# lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`):
#     The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values.
# lr_scheduler_kwargs ('dict', *optional*, defaults to {}):
#     The extra arguments for the lr_scheduler. See the documentation of each scheduler for possible values.
# warmup_ratio (`float`, *optional*, defaults to 0.0):
#     Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
# warmup_steps (`int`, *optional*, defaults to 0):
#     Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`.

# optimizers (Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR], optional) — A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],  # 类型:datasets.arrow_dataset.Dataset
    eval_dataset=emotions_encoded["validation"],  # 类型:datasets.arrow_dataset.Dataset
    optimizers=(optimizer, scheduler_lr),  # 自定义优化器与学习率预热
    compute_metrics=compute_metrics,
    # The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will default to [`default_data_collator`] if no `tokenizer` is provided, an instance of [`DataCollatorWithPadding`] otherwise
    data_collator=None,  # 默认:None
    # Processing class used to process the data. If provided, will be used to automatically process the inputs for the model, and it will be saved along the model to make it easier to rerun an interrupted training or reuse the fine-tuned model.
    processing_class=tokenizer  # 默认:None
    )
trainer.train()  # 模型训练

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01127777777777889, max=1.0)…

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4939,0.248475,0.933,0.933268
2,0.2193,0.26662,0.932,0.931252
3,0.1664,0.267592,0.9435,0.943788
4,0.0923,0.326478,0.942,0.942032


TrainOutput(global_step=16000, training_loss=0.24299878311157228, metrics={'train_runtime': 1481.8381, 'train_samples_per_second': 43.19, 'train_steps_per_second': 10.797, 'total_flos': 0.0, 'train_loss': 0.24299878311157228, 'epoch': 4.0})

In [12]:
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 4e-05
    lr: 0.0
    maximize: False
    weight_decay: 0.01
)

In [13]:
# Run prediction and returns predictions and potential metrics.
# Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in `evaluate()`.
preds_output = trainer.predict(emotions_encoded["validation"])  # 预测和评估包含标签的验证数据集
preds_output

PredictionOutput(predictions=array([[10.026421  , -1.2493951 , -2.0832083 , -1.463818  , -2.0060067 ,
        -1.7284527 ],
       [10.087032  , -1.3979279 , -1.9050596 , -1.516679  , -1.9069802 ,
        -1.6398629 ],
       [-2.3224888 ,  5.4730434 ,  4.8907237 , -2.767652  , -3.536753  ,
        -2.8313003 ],
       ...,
       [-2.0526452 ,  9.663821  , -0.52668995, -2.7965734 , -2.6153255 ,
        -2.043326  ],
       [-1.9870696 ,  8.883894  ,  1.147463  , -2.7566857 , -3.1054263 ,
        -2.5724087 ],
       [-2.1567388 ,  9.649207  , -0.5594587 , -2.8418207 , -2.5324154 ,
        -1.9988272 ]], dtype=float32), label_ids=array([0, 0, 2, ..., 1, 1, 1], dtype=int64), metrics={'test_loss': 0.3264780044555664, 'test_accuracy': 0.942, 'test_f1': 0.9420317738680476, 'test_runtime': 6.641, 'test_samples_per_second': 301.159, 'test_steps_per_second': 75.29})

In [14]:
print(preds_output.predictions)  # 预测结果
print(type(preds_output.predictions))
print(preds_output.predictions.shape)

[[10.026421   -1.2493951  -2.0832083  -1.463818   -2.0060067  -1.7284527 ]
 [10.087032   -1.3979279  -1.9050596  -1.516679   -1.9069802  -1.6398629 ]
 [-2.3224888   5.4730434   4.8907237  -2.767652   -3.536753   -2.8313003 ]
 ...
 [-2.0526452   9.663821   -0.52668995 -2.7965734  -2.6153255  -2.043326  ]
 [-1.9870696   8.883894    1.147463   -2.7566857  -3.1054263  -2.5724087 ]
 [-2.1567388   9.649207   -0.5594587  -2.8418207  -2.5324154  -1.9988272 ]]
<class 'numpy.ndarray'>
(2000, 6)


In [15]:
preds_output.metrics  # 评估结果

{'test_loss': 0.3264780044555664,
 'test_accuracy': 0.942,
 'test_f1': 0.9420317738680476,
 'test_runtime': 6.641,
 'test_samples_per_second': 301.159,
 'test_steps_per_second': 75.29}

In [16]:
test_dataset = emotions_encoded["test"].remove_columns(['label'])
test_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [17]:
trainer.predict(test_dataset)  # 预测不含标签的测试数据集

PredictionOutput(predictions=array([[10.029535  , -1.2211038 , -2.0303333 , -1.796847  , -1.8621632 ,
        -1.6016061 ],
       [10.07516   , -1.3358469 , -1.9799502 , -1.5594311 , -1.9193146 ,
        -1.6537375 ],
       [10.068274  , -1.3482922 , -1.9481524 , -1.6336232 , -1.8776803 ,
        -1.6334101 ],
       ...,
       [-2.0575485 ,  9.643328  , -0.64553326, -2.8232603 , -2.5103939 ,
        -1.9840453 ],
       [-2.1385372 ,  9.623734  , -0.67554325, -2.8852339 , -2.3299308 ,
        -1.9631996 ],
       [-2.4684706 , -2.1929386 , -3.0233455 , -1.8496861 ,  5.593462  ,
         3.6263864 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 6.6473, 'test_samples_per_second': 300.872, 'test_steps_per_second': 75.218})