In [1]:
import copy
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
import numpy as np
import random

In [2]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # A bool that, if True, causes cuDNN to only use deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


seed = 2022
set_seed(seed)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
emotions = load_dataset(path='emotion')
emotions

Using the latest cached version of the module from C:\Users\dcdmm\.cache\huggingface\modules\datasets_modules\datasets\emotion\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705 (last modified on Fri Apr  1 16:15:56 2022) since it couldn't be found locally at emotion., or remotely on the Hugging Face Hub.
Using custom data configuration default
Reusing dataset emotion (C:\Users\dcdmm\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
print(tokenizer.model_input_names)

model_from_pretrained = AutoModel.from_pretrained(model_ckpt)
print(model_from_pretrained.num_parameters())

['input_ids', 'token_type_ids', 'attention_mask']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


109482240


In [6]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


# 批次处理,整个数据集同时进行处理
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
emotions_encoded  # 原有数据与map函数新增数据('input_ids', 'token_type_ids', 'attention_mask')的联合

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [7]:
emotions_encoded = emotions_encoded.remove_columns(['text'])  # 'text'列不参与训练(即不进入自定义模型forward函数)
emotions_encoded.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
emotions_encoded['train'][:]  # 字典形式

{'label': tensor([0, 0, 3,  ..., 1, 3, 0]),
 'input_ids': tensor([[  101,  1045,  2134,  ...,     0,     0,     0],
         [  101,  1045,  2064,  ...,     0,     0,     0],
         [  101, 10047,  9775,  ...,     0,     0,     0],
         ...,
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2113,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [8]:
class Customize_Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        self.classifier = torch.nn.Linear(768, num_labels)  # 多分类任务
        self.pretrained = pretrained_model
        self.dropout = nn.Dropout()
        self.loss_fct = nn.CrossEntropyLoss()  # 损失函数

    def forward(self,
                input_ids,  # ★★★★★训练阶段对应emotions_encoded['train']中的input_ids
                attention_mask,  # ★★★★★训练阶段对应emotions_encoded['train']中的attention_mask
                token_type_ids,  # ★★★★★训练阶段对应emotions_encoded['train']中的token_type_ids
                labels=None):  # 标签;★★★★★训练阶段对应emotions_encoded['train']中的labels
        outputs = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:  # 若包含标签
            loss = self.loss_fct(logits.view(-1, num_labels), labels.view(-1))

        # 训练与评估阶段
        # ★★★★★
        # 返回值为一个元组
        # 元组的第一个元素必须为该批次数据的损失值
        # 元组的第二个元素为该批次数据的预测值(可选)
        # * 验证数据集评估函数指标的计算
        # * predict方法预测结果(predictions)与评估结果(metrics)(结合输入labels)的计算
        if loss is not None:
            return (loss, logits)
        # 预测阶段
        # ★★★★★
        # 返回值为模型的预测结果
        else:
            return logits


num_labels = 6

model = Customize_Model(copy.deepcopy(model_from_pretrained),  # 必须进行深拷贝(pretrained(模型子网络结构)会参与梯度更新)
                        num_labels)
model = model.to(device)

In [9]:
def compute_metrics(pred):
    """验证数据集评估函数"""
    labels = pred.label_ids  # 对应自定义模型forward函数输入:labels
    preds = pred.predictions  # 对应自定义模型forward函数返回值的第二个元素
    preds_argmax = preds.argmax(-1)
    f1 = f1_score(labels, preds_argmax, average="weighted")
    acc = accuracy_score(labels, preds_argmax)
    return {"accuracy": acc, "f1": f1}  # return a dictionary string to metric values

In [10]:
# 主要调节的超参数
batch_size = 4
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written.
    output_dir=model_name,
    # If True, overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint directory.
    overwrite_output_dir=False,  # 默认:False
    # save_total_limit (int, optional) —If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir.
    save_total_limit=None,  # 默认:None

    seed=42,

    # Total number of training epochs to perform
    num_train_epochs=2,  # 默认:3.0
    # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs. I
    # max_steps=2000,  # 默认:-1

    #  Maximum gradient norm (for gradient clipping).
    max_grad_norm=1.0,  # 默认:1.0

    # The batch size per GPU/TPU core/CPU for training.
    per_device_train_batch_size=batch_size,  # 默认:8
    # The batch size per GPU/TPU core/CPU for evaluation.
    per_device_eval_batch_size=batch_size,  # 默认:8

    # Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
    gradient_accumulation_steps=1,  # 默认:1

    # HuggingFace AdamW优化器超参数
    # The initial learning rate for AdamW optimizer.
    learning_rate=2e-5,  # 默认: 5e-5
    # The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in AdamW optimizer.
    weight_decay=0.01,  # 默认:0
    # The beta1 hyperparameter for the AdamW optimizer.
    adam_beta1=0.9,  # 默认:0.9
    # The beta2 hyperparameter for the AdamW optimizer.
    adam_beta2=0.999,  # 默认:0.999
    # The epsilon hyperparameter for the AdamW optimizer.
    adam_epsilon=1e-8,  # 默认:1e-8

    # HuggingFace 不同学习率预热超参数
    #  The scheduler type to use. See the documentation of SchedulerType for all possible values.
    lr_scheduler_type='linear',  # 默认:'linear'
    # Ratio of total training steps used for a linear warmup from 0 to learning_rate.
    warmup_ratio=0.0,  # 默认:0.0
    # Number of steps used for a linear warmup from 0 to learning_rate. Overrides any effect of warmup_ratio.
    # warmup_steps=0,  # 默认0

    # The evaluation strategy to adopt during training. Possible values are:
    # "no": No evaluation is done during training.
    # "steps": Evaluation is done (and logged) every eval_steps.
    # "epoch": Evaluation is done at the end of each epoch.
    evaluation_strategy="epoch",  # 默认:'no'
    # Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as logging_steps if not set.
    eval_steps=None,  # 默认None

    # The logging strategy to adopt during training. Possible values are:
    # "no": No logging is done during training.
    # "epoch": Logging is done at the end of each epoch.
    # "steps": Logging is done every logging_steps.
    logging_strategy='epoch',  # 默认:'steps'
    # Number of update steps between two logs if logging_strategy="steps".
    # logging_steps=500,  # 默认:500

    # The checkpoint save strategy to adopt during training. Possible values are:
    # "no": No save is done during training.
    # "epoch": Save is done at the end of each epoch.
    # "steps": Save is done every save_steps.
    save_strategy='epoch',  # 默认:'steps'
    #  Number of updates steps before two checkpoint saves if save_strategy="steps".
    save_steps=500,  # 默认:500

    # Logger log level to use on the main process. Possible choices are the log levels as strings: ‘debug’, ‘info’, ‘warning’, ‘error’ and ‘critical’, plus a ‘passive’ level which doesn’t set anything and lets the application set the level.
    log_level='passive',  # 默认:'passive'

    # Whether or not to load the best model found during training at the end of training.
    # When set to True, the parameters save_strategy needs to be the same as evaluation_strategy, and in the case it is “steps”, save_steps must be a round multiple of eval_steps.
    load_best_model_at_end=False,  # 默认load_best_model_at_end=False
    # Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models.
    # Must be the name of a metric returned by the evaluation with or without the prefix "eval_".
    # Will default to "loss" if unspecified and load_best_model_at_end=True (to use the evaluation loss).
    metric_for_best_model=None,

    disable_tqdm=False,  # 是否使用tqdm显示进度(.py运行时设置disable_tqdm=True)
)

# 建议通过optimizers参数自定义优化器与学习率预热
# optimizers (Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR], optional) — A tuple containing the optimizer and the scheduler to use.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],  # 类型:datasets.arrow_dataset.Dataset
    eval_dataset=emotions_encoded["validation"],  # 类型:datasets.arrow_dataset.Dataset
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)
trainer.train()  # 模型训练

***** Running training *****
  Num examples = 16000
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 8000


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4587,0.264718,0.935,0.934866
2,0.1804,0.217183,0.937,0.937069


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 4
Saving model checkpoint to bert-base-uncased-finetuned-emotion\checkpoint-4000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-uncased-finetuned-emotion\checkpoint-4000\tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-emotion\checkpoint-4000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 4
Saving model checkpoint to bert-base-uncased-finetuned-emotion\checkpoint-8000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-uncased-finetuned-emotion\checkpoint-8000\tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-emotion\checkpoint-8000\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=8000, training_loss=0.31951457214355466, metrics={'train_runtime': 2362.4992, 'train_samples_per_second': 13.545, 'train_steps_per_second': 3.386, 'total_flos': 0.0, 'train_loss': 0.31951457214355466, 'epoch': 2.0})

In [11]:
# Run prediction and returns predictions and potential metrics.
# Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in `evaluate()`.
preds_output = trainer.predict(emotions_encoded["validation"])  # 类型:datasets.arrow_dataset.Dataset;验证数据集(包含标签)
preds_output

***** Running Prediction *****
  Num examples = 2000
  Batch size = 4


PredictionOutput(predictions=array([[ 8.74499   , -1.6454196 , -1.3407766 , -1.3644311 , -1.6235143 ,
        -1.8594097 ],
       [ 8.740201  , -1.5983361 , -1.753396  , -1.2184671 , -1.410719  ,
        -1.6200553 ],
       [-2.1097214 ,  2.4274385 ,  5.151494  , -1.6792418 , -2.2815318 ,
        -2.322027  ],
       ...,
       [-2.0081527 ,  8.669637  , -0.66810536, -2.04628   , -2.075761  ,
        -1.5750041 ],
       [-2.1995275 ,  3.2017984 ,  4.738045  , -1.6810935 , -2.2954392 ,
        -2.4768968 ],
       [-2.0613208 ,  8.638503  , -1.208316  , -2.001938  , -1.7295588 ,
        -1.0406128 ]], dtype=float32), label_ids=array([0, 0, 2, ..., 1, 1, 1], dtype=int64), metrics={'test_loss': 0.21718303859233856, 'test_accuracy': 0.937, 'test_f1': 0.9370687362877385, 'test_runtime': 33.3576, 'test_samples_per_second': 59.956, 'test_steps_per_second': 14.989})

In [12]:
print(preds_output.predictions)  # 预测结果
print(type(preds_output.predictions))
print(preds_output.predictions.shape)

[[ 8.74499    -1.6454196  -1.3407766  -1.3644311  -1.6235143  -1.8594097 ]
 [ 8.740201   -1.5983361  -1.753396   -1.2184671  -1.410719   -1.6200553 ]
 [-2.1097214   2.4274385   5.151494   -1.6792418  -2.2815318  -2.322027  ]
 ...
 [-2.0081527   8.669637   -0.66810536 -2.04628    -2.075761   -1.5750041 ]
 [-2.1995275   3.2017984   4.738045   -1.6810935  -2.2954392  -2.4768968 ]
 [-2.0613208   8.638503   -1.208316   -2.001938   -1.7295588  -1.0406128 ]]
<class 'numpy.ndarray'>
(2000, 6)


In [13]:
preds_output.metrics

{'test_loss': 0.21718303859233856,
 'test_accuracy': 0.937,
 'test_f1': 0.9370687362877385,
 'test_runtime': 33.3576,
 'test_samples_per_second': 59.956,
 'test_steps_per_second': 14.989}

In [14]:
test_dataset = emotions_encoded["test"].remove_columns(['label'])
test_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [15]:
trainer.predict(test_dataset)  # 预测不含标签的测试数据集

***** Running Prediction *****
  Num examples = 2000
  Batch size = 4


PredictionOutput(predictions=array([[ 8.547188 , -1.1954035, -1.6077578, -1.3239136, -1.8036917,
        -1.7479213],
       [ 8.737176 , -1.561123 , -1.5552757, -1.1923537, -1.7461095,
        -1.7708706],
       [ 8.780567 , -1.6004115, -1.5823025, -1.1673901, -1.728886 ,
        -1.6798965],
       ...,
       [-1.7076166,  8.718234 , -1.158481 , -1.8851666, -1.949625 ,
        -1.4237279],
       [-1.8153396,  8.687426 , -1.2887311, -1.8366973, -1.5447539,
        -1.5415577],
       [-1.3077655, -2.1242712, -2.5342546, -1.3820418,  5.358166 ,
         3.467203 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 35.9695, 'test_samples_per_second': 55.603, 'test_steps_per_second': 13.901})