In [29]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments

In [30]:
# Load the dataset identified by "emotions_dataset" and display its summary
# (splits, feature names and row counts).
emotions = load_dataset("emotions_dataset")
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [31]:
# Checkpoint name used for both the tokenizer and the pretrained encoder.
model_ckpt = "bert-base-uncased"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [32]:
# Load the tokenizer for the checkpoint named by `model_ckpt` (the same name
# is later passed to AutoModel.from_pretrained) and display its configuration.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenizer

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/ma

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [33]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the notebook's `tokenizer`.

    Args:
        batch: Mapping that contains a "text" entry holding the raw strings.

    Returns:
        Whatever `tokenizer` returns when called with padding and truncation
        both enabled.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded


# Batched processing: with batch_size=None each whole split is processed at once.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
emotions_encoded  # union of the original columns and those added by map ('input_ids', 'token_type_ids', 'attention_mask')

Loading cached processed dataset at emotions_dataset/train/cache-e0dea2b01136f4ee.arrow
Loading cached processed dataset at emotions_dataset/validation/cache-d5b5af9c9d12db6c.arrow
Loading cached processed dataset at emotions_dataset/test/cache-b1978ac53e8209ee.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [34]:
# The 'text' column takes no part in training (it is not an argument of the
# custom model's forward), so it is dropped.
# The guard keeps this cell re-runnable: once 'text' is gone, calling
# remove_columns(['text']) again would fail on the missing column.
if "text" in emotions_encoded["train"].column_names:
    emotions_encoded = emotions_encoded.remove_columns(["text"])
# Return the listed columns as torch tensors when rows are accessed.
emotions_encoded.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
emotions_encoded['train'][:]

{'label': tensor([0, 0, 3,  ..., 1, 3, 0]),
 'input_ids': tensor([[  101,  1045,  2134,  ...,     0,     0,     0],
         [  101,  1045,  2064,  ...,     0,     0,     0],
         [  101, 10047,  9775,  ...,     0,     0,     0],
         ...,
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  1045,  2113,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [27]:
class Customize_Model(torch.nn.Module):
    """Downstream classification model: pretrained encoder + dropout + linear head.

    Args:
        pretrained_model: Encoder module. It is called with ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` keyword arguments, and
            the element at index 1 of its output is used as the pooled
            representation fed to the classifier.
        num_labels: Number of target classes (output width of the linear head).
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        # Keep the class count on the instance so forward() does not depend on
        # a module-level `num_labels` variable being defined.
        self.num_labels = num_labels
        # Follow the encoder's own hidden size when its config exposes one;
        # otherwise fall back to 768, the value that was previously hard-coded.
        encoder_config = getattr(pretrained_model, "config", None)
        hidden_size = getattr(encoder_config, "hidden_size", 768)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)  # multi-class head
        self.pretrained = pretrained_model
        self.dropout = nn.Dropout()

    def forward(self,
                input_ids,  # during training: the dataset's input_ids column
                attention_mask,  # during training: the dataset's attention_mask column
                token_type_ids,  # during training: the dataset's token_type_ids column
                labels=None):  # target class ids; optional
        """Compute class logits and, when labels are supplied, the loss.

        Returns:
            ``(loss, logits)`` when ``labels`` is given. The loss must be the
            first element; the second element is what the evaluation function
            receives as the model output.
            ``(logits,)`` when ``labels`` is None, so the model can also be
            called on unlabeled inputs instead of failing on ``labels.view``.
        """
        outputs = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids)
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is None:
            return (logits,)

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss, logits)


# Number of target classes handed to the classification head.
num_labels = 6
# Pretrained encoder for the checkpoint named by `model_ckpt`.
model_from_pretrained = AutoModel.from_pretrained(model_ckpt)

# Wrap the encoder with the classification head and move the result to `device`.
model = Customize_Model(model_from_pretrained, num_labels).to(device)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/re

In [23]:
def compute_metrics(pred):
    """Evaluation function for the validation dataset.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels, i.e. the
            ``labels`` input of the custom model's forward) and
            ``predictions`` (the second element returned by that forward).

    Returns:
        Dict mapping metric names to values: "accuracy" and weighted "f1".
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest score along the last axis.
    y_pred = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    accuracy = accuracy_score(y_true, y_pred)
    return {"accuracy": accuracy, "f1": weighted_f1}

In [28]:
# Main hyperparameters to tune.
batch_size = 64
model_name = f"{model_ckpt}-finetuned-emotion"

training_args = TrainingArguments(
    # ---- Output and reproducibility ----
    output_dir=model_name,  # directory for model predictions and checkpoints
    seed=42,

    # ---- Training length and batch sizes ----
    num_train_epochs=2,  # total epochs; default: 3.0
    # max_steps=2000,  # default: -1; a positive value overrides num_train_epochs
    per_device_train_batch_size=batch_size,  # per GPU/TPU core/CPU, training; default: 8
    per_device_eval_batch_size=batch_size,  # per GPU/TPU core/CPU, evaluation; default: 8

    # ---- AdamW optimizer ----
    learning_rate=2e-5,  # initial learning rate; default: 5e-5
    weight_decay=0.01,  # applied to all layers except bias and LayerNorm weights; default: 0
    adam_beta1=0.9,  # default: 0.9
    adam_beta2=0.999,  # default: 0.999
    adam_epsilon=1e-8,  # default: 1e-8
    max_grad_norm=1.0,  # maximum gradient norm for clipping; default: 1.0

    # ---- Learning-rate schedule ----
    lr_scheduler_type='linear',  # default: 'linear'
    warmup_ratio=0.0,  # share of total steps used for linear warmup; default: 0.0
    warmup_steps=0,  # warmup step count, overrides warmup_ratio; default: 0

    # ---- Evaluation, logging and checkpointing ----
    # Strategy values: "no" (never), "steps" (every N steps), "epoch" (end of each epoch).
    evaluation_strategy="epoch",  # default: 'no'
    logging_strategy='epoch',  # default: 'steps'
    # logging_steps=500,  # default: 500; only used when logging_strategy="steps"
    save_strategy='epoch',  # default: 'steps'
    # save_steps=500,  # default: 500; only used when save_strategy="steps"
    # Main-process log level: 'debug', 'info', 'warning', 'error', 'critical',
    # or 'passive' (sets nothing and lets the application decide).
    log_level='passive',  # default: 'passive'
    disable_tqdm=False,  # keep the tqdm progress bars
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()  # run fine-tuning

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 16000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 500


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0023,0.351801,0.897,0.895335
2,0.2638,0.22564,0.919,0.919262


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to bert-base-uncased-finetuned-emotion/checkpoint-250
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-uncased-finetuned-emotion/checkpoint-250/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-emotion/checkpoint-250/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to bert-base-uncased-finetuned-emotion/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-uncased-finetuned-emotion/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-emotion/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=500, training_loss=0.6330383148193359, metrics={'train_runtime': 116.0527, 'train_samples_per_second': 275.737, 'train_steps_per_second': 4.308, 'total_flos': 0.0, 'train_loss': 0.6330383148193359, 'epoch': 2.0})

In [38]:
# Default prediction: run the trainer's predict step on the validation split
# and display the returned predictions, label ids and metrics.
preds_output = trainer.predict(emotions_encoded["validation"])
preds_output

***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


PredictionOutput(predictions=array([[ 4.763738  , -0.43802044, -1.0107248 , -0.6432574 , -1.1671257 ,
        -1.4157267 ],
       [ 4.622076  , -0.4318239 , -1.9421813 , -0.43927562, -0.61563396,
        -1.2663845 ],
       [-1.339176  ,  1.9858333 ,  3.1078916 , -0.6731394 , -1.5203284 ,
        -0.7745205 ],
       ...,
       [-1.1167469 ,  4.7990003 , -0.26819718, -1.4773217 , -1.4515011 ,
        -0.85388523],
       [-2.1689475 ,  2.6882067 ,  2.618569  , -0.9024664 , -1.3230274 ,
        -0.6409765 ],
       [-1.5250525 ,  4.1931148 , -0.5475913 , -1.9164736 , -1.077943  ,
         0.05993806]], dtype=float32), label_ids=array([0, 0, 2, ..., 1, 1, 1]), metrics={'test_loss': 0.2256401628255844, 'test_accuracy': 0.919, 'test_f1': 0.9192620494174079, 'test_runtime': 2.0237, 'test_samples_per_second': 988.273, 'test_steps_per_second': 15.812})

In [40]:
# Show the prediction results, then their type and shape, one item per line.
print(
    preds_output.predictions,
    type(preds_output.predictions),
    preds_output.predictions.shape,
    sep="\n",
)

[[ 4.763738   -0.43802044 -1.0107248  -0.6432574  -1.1671257  -1.4157267 ]
 [ 4.622076   -0.4318239  -1.9421813  -0.43927562 -0.61563396 -1.2663845 ]
 [-1.339176    1.9858333   3.1078916  -0.6731394  -1.5203284  -0.7745205 ]
 ...
 [-1.1167469   4.7990003  -0.26819718 -1.4773217  -1.4515011  -0.85388523]
 [-2.1689475   2.6882067   2.618569   -0.9024664  -1.3230274  -0.6409765 ]
 [-1.5250525   4.1931148  -0.5475913  -1.9164736  -1.077943    0.05993806]]
<class 'numpy.ndarray'>
(2000, 6)


In [12]:
# Metrics reported by the predict call.
# NOTE(review): the stored output of this cell (In [12]) does not match the
# metrics shown in the predict cell's output (In [38]); it appears to come
# from an earlier run - re-execute the notebook top to bottom to refresh it.
preds_output.metrics

{'test_loss': 0.21254733204841614,
 'test_accuracy': 0.9235,
 'test_f1': 0.9238430642610074,
 'test_runtime': 2.2745,
 'test_samples_per_second': 879.316,
 'test_steps_per_second': 14.069}