In [14]:
import torch
import torch.utils.data as Data
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import BertModel
from sklearn.metrics import accuracy_score
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from transformers import Trainer, TrainingArguments

In [15]:
model_ckpt = "bert-base-chinese"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [16]:
class Dataset(Data.Dataset):
    """定义数据集"""

    def __init__(self, split):
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        """定义索引方式"""
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']
        return text, label


dataset_train = Dataset('train')
dataset_validation = Dataset('validation')

for text, label in dataset_train:
    # 调用__getitem__方法
    print(text)
    print(label)
    break

选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般
1


In [17]:
token = BertTokenizer.from_pretrained(model_ckpt)


def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    # 批量编码句子
    data = token(text=sents,
                 truncation=True,
                 padding='max_length',
                 max_length=512,
                 return_token_type_ids=True,
                 return_attention_mask=True,
                 return_tensors='pt')

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)
    # 返回值必须为字典(键与模型forward方法形参对应)
    return {'input_ids': input_ids,  # ★★★★★对应模型forward方法input_ids参数
            'attention_mask': attention_mask,  # ★★★★★对应模型forward方法attention_mask参数
            "token_type_ids": token_type_ids,  # ★★★★对应模型forward方法token_type_ids参数
            'labels': labels}  # ★★★★对应模型forward方法labels参数

loading file https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/36acdf4f3edf0a14ffb2b2c68ba47e93abd9448825202377ddb16dae8114fe07.accd894ff58c6ff7bd4f3072890776c14f4ea34fcc08e79cd88c2d157756dceb
loading file https://huggingface.co/bert-base-chinese/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-chinese/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-chinese/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/2dc6085404c55008ba7fc09ab7483ef3f0a4ca2496ccee0cdbf51c2b5d529dff.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-chinese/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6cc404ca8136bc87bae0fb24f2259904943d776a6c5ddc26598bbdc319476f42.0f9bcd8314d841c06633e7b92b04509f1802c16796ee67

In [18]:
class Model(torch.nn.Module):
    """下游训练任务模型"""

    def __init__(self, pretrained_model):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)  # 二分类任务
        self.pretrained = pretrained_model
        self.criterion = torch.nn.CrossEntropyLoss()  # 损失函数

    def forward(self, input_ids, attention_mask, token_type_ids, labels):
        out = self.pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)

        out = self.fc(out.pooler_output)
        out = out.softmax(dim=1)
        loss = self.criterion(out, labels)

        # ★★★★★
        # 返回值为一个元组
        # 元组的第一个元素必须为该批次训练数据的损失值
        # 元素的第二个元素用于评估函数计算验证数据集(若有)的输出
        return (loss, out)


pretrained = BertModel.from_pretrained(model_ckpt)

# 冻结网络层参数(不进行梯度更新)
for param in pretrained.parameters():
    param.requires_grad = False

model = Model(pretrained)
model = model.to(device)

loading configuration file https://huggingface.co/bert-base-chinese/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6cc404ca8136bc87bae0fb24f2259904943d776a6c5ddc26598bbdc319476f42.0f9bcd8314d841c06633e7b92b04509f1802c16796ee67b0f1177065739e24ae
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size

In [21]:
def compute_metrics(pred):
    """验证数据集评估函数"""
    labels = pred.label_ids  # 对应自定义模型forward函数输入:labels
    preds = pred.predictions  # 对应自定义模型forward函数返回值的第二个元素
    preds_argmax = preds.argmax(-1)
    acc = accuracy_score(labels, preds_argmax)
    return {"accuracy": acc}  # return a dictionary string to metric value


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # 学习率预热(线性增加)
            return float(current_step) / float(max(1, num_warmup_steps))
        # 学习率线性衰减(最小为0)
        # num_training_steps后学习率恒为0
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return LambdaLR(optimizer, lr_lambda)


model_name = f"{model_ckpt}-finetuned-emotion"
batch_size = 64  # 批次大小
epochs = 3.0  # 训练轮数
steps_all = int(len(dataset_train) / batch_size) * epochs  # 总学习步数
optimizer = optim.AdamW(model.parameters(), lr=5e-4)  # 优化器
scheduler_lr = get_linear_schedule_with_warmup(optimizer, 50, 0.9 * steps_all)  # 学习率预热(必须为LambdaLR对象)

In [22]:
# 主要调节的超参数
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written.
    output_dir=model_name,
    seed=42,

    # Total number of training epochs to perform
    num_train_epochs=epochs,  # 默认:3.0
    # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs. I
    # max_steps=300,  # 默认:-1

    #  Maximum gradient norm (for gradient clipping).
    max_grad_norm=1.0,  # 默认:1.0

    # 对应pytorch DataLoader 参数batch_size
    # The batch size per GPU/TPU core/CPU for training.
    per_device_train_batch_size=batch_size,  # 默认:8
    # The batch size per GPU/TPU core/CPU for evaluation.
    # 对应pytorch DataLoader 参数batch_size
    per_device_eval_batch_size=batch_size,  # 默认:8
    # Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not.
    # 对应pytorch DataLoader 参数drop_last
    dataloader_drop_last=False,  # 默认:False

    # The evaluation strategy to adopt during training. Possible values are:
    # "no": No evaluation is done during training.
    # "steps": Evaluation is done (and logged) every eval_steps.
    # "epoch": Evaluation is done at the end of each epoch.
    evaluation_strategy="epoch",  # 默认:'no'
    # The logging strategy to adopt during training. Possible values are:
    # "no": No logging is done during training.
    # "epoch": Logging is done at the end of each epoch.
    # "steps": Logging is done every logging_steps.
    logging_strategy='epoch',  # 默认:'steps'
    # Number of update steps between two logs if logging_strategy="steps".
    # logging_steps=500,  # 默认:500
    # The checkpoint save strategy to adopt during training. Possible values are:
    # "no": No save is done during training.
    # "epoch": Save is done at the end of each epoch.
    # "steps": Save is done every save_steps.
    # Logger log level to use on the main process. Possible choices are the log levels as strings: ‘debug’, ‘info’, ‘warning’, ‘error’ and ‘critical’, plus a ‘passive’ level which doesn’t set anything and lets the application set the level.
    log_level='passive',  # 默认'passive'
    save_strategy='epoch',  # 默认:'steps'
    # Number of updates steps before two checkpoint saves if save_strategy="steps".
    # save_steps=500,  # 默认:500
    disable_tqdm=False,  # 使用tqdm显示进度
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    data_collator=collate_fn,  # 对应pytorch DataLoader 参数collate_fn
    optimizers=(optimizer, scheduler_lr),  # 自定义优化器与学习率预热
    compute_metrics=compute_metrics,
    tokenizer=token)

trainer.train()  # 模型训练

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 9600
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 450


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6069,0.541185,0.8
2,0.5273,0.524135,0.796667
3,0.515,0.517951,0.813333


***** Running Evaluation *****
  Num examples = 1200
  Batch size = 64
Saving model checkpoint to bert-base-chinese-finetuned-emotion/checkpoint-150
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-chinese-finetuned-emotion/checkpoint-150/tokenizer_config.json
Special tokens file saved in bert-base-chinese-finetuned-emotion/checkpoint-150/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 64
Saving model checkpoint to bert-base-chinese-finetuned-emotion/checkpoint-300
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in bert-base-chinese-finetuned-emotion/checkpoint-300/tokenizer_config.json
Special tokens file saved in bert-base-chinese-finetuned-emotion/checkpoint-300/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1200
  Batch size = 64
Saving model checkpoint to bert-base-chinese-finetuned-emotion/checkpoint-4

TrainOutput(global_step=450, training_loss=0.5497383456759982, metrics={'train_runtime': 226.8045, 'train_samples_per_second': 126.982, 'train_steps_per_second': 1.984, 'total_flos': 0.0, 'train_loss': 0.5497383456759982, 'epoch': 3.0})

In [23]:
optimizer  # 初始化学习率0.0005,最终学习率归0(get_linear_schedule_with_warmup学习率预热归0)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 0.0005
    lr: 0.0
    maximize: False
    weight_decay: 0.01
)