In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments

In [2]:
# Load the dataset identified by "dataset"; the repr shown below lists its
# train / validation / test splits and their columns.
emotions = load_dataset("dataset")
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
# Checkpoint name shared by the tokenizer and the encoder.
model_ckpt = "bert-base-uncased"

# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [4]:
# Load the tokenizer that belongs to the checkpoint named by `model_ckpt`.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the module-level tokenizer.

    Padding and truncation are both enabled, so every sequence in the batch
    comes back with the same length.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


# Batched processing: with batch_size=None each whole split is processed at once
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
emotions_encoded  # original columns plus those added by map ('input_ids', 'token_type_ids', 'attention_mask')

Loading cached processed dataset at dataset/train/cache-9aa6937aa7a0509c.arrow
Loading cached processed dataset at dataset/validation/cache-72fd4e085f395d06.arrow
Loading cached processed dataset at dataset/test/cache-81fab418ff2a6a26.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [6]:
# NOTE(review): this cell reassigns `emotions_encoded` from itself; re-running it
# after 'text' is already gone presumably fails — confirm before re-executing.
emotions_encoded = emotions_encoded.remove_columns(['text'])  # 'text' takes no part in training (it is not passed to the custom model's forward)
# Return the listed columns as torch tensors when the dataset is indexed.
emotions_encoded.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
emotions_encoded['train'][:]  # dict form

{'label': tensor([0, 0, 3,  ..., 1, 3, 0]),
 'input_ids': tensor([[  101,  1045,  2134,  ...,     0,     0,     0],
         [  101,  1045,  2064,  ...,     0,     0,     0],
         [  101, 10047,  9775,  ...,     0,     0,     0],
         ...,
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2113,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [7]:
class Customize_Model(torch.nn.Module):
    """Downstream classification model: pretrained encoder + dropout + linear head.

    The encoder is called with ``input_ids``, ``attention_mask`` and
    ``token_type_ids``; element ``[1]`` of its output is used as the pooled
    sentence representation and fed to the classification head.

    Args:
        pretrained_model: Encoder module. Its output must be indexable and
            hold the pooled representation at index 1. If it exposes
            ``config.hidden_size`` that value sizes the classifier input;
            otherwise 768 is assumed.
        num_labels: Number of target classes (multi-class classification).
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        # Keep the class count on the instance so forward() does not depend on
        # a module-level global with the same name.
        self.num_labels = num_labels
        # Size the head from the encoder's config when available (768 fallback).
        encoder_config = getattr(pretrained_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 768)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)  # multi-class head
        self.pretrained = pretrained_model
        self.dropout = nn.Dropout()
        self.loss_fct = nn.CrossEntropyLoss()  # loss function

    def forward(self,
                input_ids,  # during training: the 'input_ids' column of the dataset
                attention_mask,  # during training: the 'attention_mask' column of the dataset
                token_type_ids,  # during training: the 'token_type_ids' column of the dataset
                labels=None):  # gold labels; omitted at pure prediction time
        """Run the encoder and classification head.

        Returns:
            ``(loss, logits)`` when ``labels`` is given (training / evaluation):
            the first element must be the batch loss, the second is the batch
            predictions used for evaluation metrics and by ``predict``.
            ``logits`` alone when ``labels`` is None (prediction).
        """
        outputs = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Prediction phase: no labels, return the raw predictions only.
        if labels is None:
            return logits

        # Training / evaluation phase: return a tuple whose first element is the loss.
        loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss, logits)


num_labels = 6  # number of target classes handed to the classification head
# Load the bare pretrained encoder for the checkpoint named by `model_ckpt`.
model_from_pretrained = AutoModel.from_pretrained(model_ckpt)

# Wrap the encoder with the custom head and move the result to `device`.
model = Customize_Model(model_from_pretrained, num_labels)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def compute_metrics(pred):
    """Evaluation callback: accuracy and weighted F1 on the evaluated dataset.

    Args:
        pred: Object exposing ``label_ids`` (the labels fed to the model's
            forward) and ``predictions`` (the second element returned by the
            model's forward); the arg-max over the last axis is taken as the
            predicted class.

    Returns:
        Dict mapping the metric names "accuracy" and "f1" to their values.
    """
    gold = pred.label_ids
    predicted_classes = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(gold, predicted_classes, average="weighted")
    accuracy = accuracy_score(gold, predicted_classes)
    return {"accuracy": accuracy, "f1": weighted_f1}

In [9]:
# Main hyperparameters to tune
batch_size = 64
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written.
    output_dir=model_name,
    # If True, overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint directory.
    overwrite_output_dir=False,  # default: False
    seed=42,

    # Total number of training epochs to perform
    num_train_epochs=2,  # default: 3.0
    # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs.
    # max_steps=2000,  # default: -1

    #  Maximum gradient norm (for gradient clipping).
    max_grad_norm=1.0,  # default: 1.0

    # The batch size per GPU/TPU core/CPU for training.
    per_device_train_batch_size=batch_size,  # default: 8
    # The batch size per GPU/TPU core/CPU for evaluation.
    per_device_eval_batch_size=batch_size,  # default: 8

    # Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
    gradient_accumulation_steps=1,  # default: 1

    # HuggingFace AdamW optimizer hyperparameters
    # The initial learning rate for AdamW optimizer.
    learning_rate=2e-5,  # default: 5e-5
    # The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in AdamW optimizer.
    weight_decay=0.01,  # default: 0
    # The beta1 hyperparameter for the AdamW optimizer.
    adam_beta1=0.9,  # default: 0.9
    # The beta2 hyperparameter for the AdamW optimizer.
    adam_beta2=0.999,  # default: 0.999
    # The epsilon hyperparameter for the AdamW optimizer.
    adam_epsilon=1e-8,  # default: 1e-8

    # HuggingFace learning-rate schedule / warmup hyperparameters
    #  The scheduler type to use. See the documentation of SchedulerType for all possible values.
    lr_scheduler_type='linear',  # default: 'linear'
    # Ratio of total training steps used for a linear warmup from 0 to learning_rate.
    warmup_ratio=0.0,  # default: 0.0
    # Number of steps used for a linear warmup from 0 to learning_rate. Overrides any effect of warmup_ratio.
    warmup_steps=0,  # default: 0

    # The evaluation strategy to adopt during training. Possible values are:
    # "no": No evaluation is done during training.
    # "steps": Evaluation is done (and logged) every eval_steps.
    # "epoch": Evaluation is done at the end of each epoch.
    evaluation_strategy="epoch",  # default: 'no'
    # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
    eval_steps=None,  # default: None
    # The logging strategy to adopt during training. Possible values are:
    # "no": No logging is done during training.
    # "epoch": Logging is done at the end of each epoch.
    # "steps": Logging is done every logging_steps.
    logging_strategy='epoch',  # default: 'steps'
    # Number of update steps between two logs if logging_strategy="steps".
    # logging_steps=500,  # default: 500
    # #################################################################
    # The checkpoint save strategy to adopt during training. Possible values are:
    # "no": No save is done during training.
    # "epoch": Save is done at the end of each epoch.
    # "steps": Save is done every save_steps.
    save_strategy='epoch',  # default: 'steps'
    #  Number of updates steps before two checkpoint saves if save_strategy="steps".
    save_steps=500,  # default: 500
    # #################################################################
    # save_total_limit (int, optional) - If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir.
    save_total_limit=None,  # default: None
    # Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level.
    log_level='passive',  # default: 'passive'
    # #################################################################
    # Whether or not to load the best model found during training at the end of training.
    # When set to True, the parameters save_strategy needs to be the same as evaluation_strategy, and in the case it is "steps", save_steps must be a round multiple of eval_steps.
    load_best_model_at_end=False,  # default: load_best_model_at_end=False
    # Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models.
    # Must be the name of a metric returned by the evaluation with or without the prefix "eval_".
    # Will default to "loss" if unspecified and load_best_model_at_end=True (to use the evaluation loss).
    metric_for_best_model=None,
    disable_tqdm=False,  # whether to disable the tqdm progress display
)

# Recommended: customize the optimizer and learning-rate warmup through the `optimizers` argument
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],  # a datasets.arrow_dataset.Dataset
    eval_dataset=emotions_encoded["validation"],  # a datasets.arrow_dataset.Dataset
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)
trainer.train()  # train the model

***** Running training *****
  Num examples = 16000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 500


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.948,0.346318,0.8945,0.891665
2,0.2692,0.215337,0.9185,0.918275


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to bert-base-uncased-finetuned-emotion/checkpoint-250
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-uncased-finetuned-emotion/checkpoint-250/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-emotion/checkpoint-250/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to bert-base-uncased-finetuned-emotion/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-uncased-finetuned-emotion/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-emotion/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=500, training_loss=0.6085659484863282, metrics={'train_runtime': 116.2753, 'train_samples_per_second': 275.209, 'train_steps_per_second': 4.3, 'total_flos': 0.0, 'train_loss': 0.6085659484863282, 'epoch': 2.0})

In [10]:
# Run prediction and return the predictions and potential metrics.
# Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in `evaluate()`.
preds_output = trainer.predict(emotions_encoded["validation"])  # argument is a datasets.arrow_dataset.Dataset; the validation split (contains labels)
preds_output

***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


PredictionOutput(predictions=array([[ 4.8580265 , -0.59786206, -1.1437703 , -0.9777483 , -0.80289173,
        -1.6467062 ],
       [ 4.945601  , -0.5794875 , -1.6377996 , -0.05106468, -0.8756255 ,
        -1.6242746 ],
       [-1.5682827 ,  2.4421551 ,  3.1642022 , -1.0934749 , -1.7419844 ,
        -0.8062323 ],
       ...,
       [-0.95017964,  5.242554  ,  0.7574893 , -1.4055537 , -1.5912442 ,
        -0.9436278 ],
       [-2.3446176 ,  2.9856758 ,  2.889302  , -1.2757487 , -1.4608138 ,
        -0.16156055],
       [-1.424035  ,  5.2089243 ,  0.7045717 , -1.7643484 , -0.9928238 ,
        -0.48339003]], dtype=float32), label_ids=array([0, 0, 2, ..., 1, 1, 1]), metrics={'test_loss': 0.21533739566802979, 'test_accuracy': 0.9185, 'test_f1': 0.9182753687347617, 'test_runtime': 2.0455, 'test_samples_per_second': 977.753, 'test_steps_per_second': 15.644})

In [11]:
# Inspect the raw predictions: their values, container type and shape.
print(preds_output.predictions)  # prediction results
print(type(preds_output.predictions))
print(preds_output.predictions.shape)

[[ 4.8580265  -0.59786206 -1.1437703  -0.9777483  -0.80289173 -1.6467062 ]
 [ 4.945601   -0.5794875  -1.6377996  -0.05106468 -0.8756255  -1.6242746 ]
 [-1.5682827   2.4421551   3.1642022  -1.0934749  -1.7419844  -0.8062323 ]
 ...
 [-0.95017964  5.242554    0.7574893  -1.4055537  -1.5912442  -0.9436278 ]
 [-2.3446176   2.9856758   2.889302   -1.2757487  -1.4608138  -0.16156055]
 [-1.424035    5.2089243   0.7045717  -1.7643484  -0.9928238  -0.48339003]]
<class 'numpy.ndarray'>
(2000, 6)


In [12]:
# Metrics reported by predict() for the labeled validation split.
preds_output.metrics

{'test_loss': 0.21533739566802979,
 'test_accuracy': 0.9185,
 'test_f1': 0.9182753687347617,
 'test_runtime': 2.0455,
 'test_samples_per_second': 977.753,
 'test_steps_per_second': 15.644}

In [13]:
# Build an unlabeled copy of the test split by dropping its 'label' column.
test_dataset = emotions_encoded["test"].remove_columns(['label'])
test_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [19]:
trainer.predict(test_dataset)  # predict on the test dataset that has no labels

***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


PredictionOutput(predictions=array([[ 4.62717   ,  0.20967638, -1.2161316 ,  0.26625907, -1.5343432 ,
        -1.7819738 ],
       [ 5.02729   , -0.6014791 , -1.3574317 , -0.636112  , -0.9252229 ,
        -1.6474454 ],
       [ 4.92349   , -0.79665595, -1.6364161 , -0.63495547, -0.5856782 ,
        -1.5237921 ],
       ...,
       [-1.0013064 ,  5.374679  ,  0.56565815, -1.4822636 , -1.3467855 ,
        -0.92645323],
       [-0.9053741 ,  5.1751623 ,  0.05427762, -1.5159779 , -0.80188656,
        -0.87655663],
       [-1.2898622 , -1.5045867 , -0.49642894, -1.3738636 ,  2.2142951 ,
         2.386966  ]], dtype=float32), label_ids=None, metrics={'test_runtime': 1.8663, 'test_samples_per_second': 1071.612, 'test_steps_per_second': 17.146})