### 1. 确认模型任务的原有行为

In [61]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Baseline check: rbt3 with a newly initialised 2-class head, before any fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

# Map class ids to display labels ("差评！" = negative review, "好评！" = positive review).
model.config.id2label = {0: "差评！", 1: "好评！"}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Sample sentence, roughly "The food is a bit salty!".
sen = "饭菜有些咸！"
pipe(sen)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'label': '好评！', 'score': 0.5472492575645447}]

### 2. 模型微调

#### 2.1 准备环境，导入相关包

In [41]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

#### 2.2 准备数据

- 加载数据

In [42]:
# Read the whole CSV file as a single "train" split.
dataset = load_dataset("csv", data_files="./waimai_10k.csv", split="train")
# Keep only the rows whose "review" field is not None.
dataset = dataset.filter(lambda x: x["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 11987
})

- 数据预处理

In [43]:
import torch

# Tokenizer from the same "hfl/rbt3" checkpoint as the model that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and attach the matching labels.

    Args:
        examples: Batch providing a "review" column (the texts) and a
            "label" column.

    Returns:
        The tokenizer output, truncated to a maximum length of 128, with
        the batch labels stored under the "labels" key.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize in batches and drop the original columns so only the tokenizer
# outputs and "labels" remain.
tokenized_dataset = dataset.map(process_function, batched=True, remove_columns=dataset.column_names)
tokenized_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 11987
})

- 分割数据集

In [44]:
# Hold out 10% of the examples for evaluation. The fixed seed makes the split
# (and therefore the reported metrics) reproducible across kernel restarts.
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10788
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1199
    })
})

#### 2.3 加载预训练模型

In [45]:
# Load the pretrained rbt3 encoder with a newly initialised classification head.
# num_labels is spelled out (two classes: label ids 0 and 1) so this cell agrees
# with the baseline cell instead of relying on the library default.
# No manual .cuda()/.to("mps") call is made here; device placement is left to
# the Trainer created later in the notebook.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3", num_labels=2)
model.config

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

#### 2.4 训练与评估

- 定义评估函数

In [46]:
import evaluate

# Metric objects used by eval_metric below.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: Pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` over its last axis to obtain class ids.

    Returns:
        dict: The accuracy result merged with the F1 result.
    """
    scores, references = eval_predict
    predicted_ids = scores.argmax(axis=-1)
    results = acc_metric.compute(predictions=predicted_ids, references=references)
    results.update(f1_metric.compute(predictions=predicted_ids, references=references))
    return results

- 定义训练参数

In [47]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
train_args = TrainingArguments(output_dir="./checkpoints",      # output directory
                               per_device_train_batch_size=64,  # batch size used for training
                               per_device_eval_batch_size=128,  # batch size used for evaluation
                               logging_steps=10,                # how often (in steps) to log
                               eval_strategy="epoch",           # evaluation strategy
                               save_strategy="epoch",           # checkpoint saving strategy
                               save_total_limit=3,              # maximum number of checkpoints kept
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric used to select the best model
                               load_best_model_at_end=True)     # load the best model once training finishes
train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=

In [48]:
from transformers import DataCollatorWithPadding
# Wire model, arguments, data splits and metrics together. The tokenizer call
# in process_function does not pad, so DataCollatorWithPadding is used to pad
# each batch at collation time.
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

- 执行训练

In [49]:
# Run fine-tuning; per train_args, evaluation and checkpoint saving happen once per epoch.
trainer.train()

  2%|▏         | 10/507 [00:03<02:18,  3.59it/s]

{'loss': 0.5992, 'grad_norm': 1.7134801149368286, 'learning_rate': 1.9605522682445763e-05, 'epoch': 0.06}


  4%|▍         | 20/507 [00:06<02:53,  2.81it/s]

{'loss': 0.5047, 'grad_norm': 2.125014543533325, 'learning_rate': 1.921104536489152e-05, 'epoch': 0.12}


  6%|▌         | 30/507 [00:09<02:42,  2.93it/s]

{'loss': 0.4028, 'grad_norm': 2.1130874156951904, 'learning_rate': 1.881656804733728e-05, 'epoch': 0.18}


  8%|▊         | 40/507 [00:13<02:34,  3.02it/s]

{'loss': 0.3768, 'grad_norm': 2.4480748176574707, 'learning_rate': 1.842209072978304e-05, 'epoch': 0.24}


 10%|▉         | 50/507 [00:16<02:52,  2.66it/s]

{'loss': 0.3205, 'grad_norm': 2.2368876934051514, 'learning_rate': 1.80276134122288e-05, 'epoch': 0.3}


 12%|█▏        | 60/507 [00:20<02:23,  3.11it/s]

{'loss': 0.3327, 'grad_norm': 4.769203186035156, 'learning_rate': 1.7633136094674557e-05, 'epoch': 0.36}


 14%|█▍        | 70/507 [00:23<02:34,  2.83it/s]

{'loss': 0.3337, 'grad_norm': 2.2559540271759033, 'learning_rate': 1.7238658777120315e-05, 'epoch': 0.41}


 16%|█▌        | 80/507 [00:27<02:33,  2.79it/s]

{'loss': 0.3223, 'grad_norm': 4.772592067718506, 'learning_rate': 1.6844181459566076e-05, 'epoch': 0.47}


 18%|█▊        | 90/507 [00:30<02:07,  3.27it/s]

{'loss': 0.3165, 'grad_norm': 2.260765790939331, 'learning_rate': 1.6449704142011837e-05, 'epoch': 0.53}


 20%|█▉        | 100/507 [00:33<02:30,  2.71it/s]

{'loss': 0.3109, 'grad_norm': 4.239144325256348, 'learning_rate': 1.6055226824457594e-05, 'epoch': 0.59}


 22%|██▏       | 110/507 [00:37<02:27,  2.70it/s]

{'loss': 0.2334, 'grad_norm': 2.9362125396728516, 'learning_rate': 1.5660749506903355e-05, 'epoch': 0.65}


 24%|██▎       | 120/507 [00:41<02:18,  2.79it/s]

{'loss': 0.3027, 'grad_norm': 3.3372623920440674, 'learning_rate': 1.5266272189349113e-05, 'epoch': 0.71}


 26%|██▌       | 130/507 [00:44<02:11,  2.86it/s]

{'loss': 0.303, 'grad_norm': 3.895632743835449, 'learning_rate': 1.4871794871794874e-05, 'epoch': 0.77}


 28%|██▊       | 140/507 [00:48<02:19,  2.64it/s]

{'loss': 0.3294, 'grad_norm': 6.658329963684082, 'learning_rate': 1.4477317554240633e-05, 'epoch': 0.83}


 30%|██▉       | 150/507 [00:51<01:58,  3.02it/s]

{'loss': 0.2848, 'grad_norm': 2.6907379627227783, 'learning_rate': 1.4082840236686392e-05, 'epoch': 0.89}


 32%|███▏      | 160/507 [00:54<02:02,  2.83it/s]

{'loss': 0.2948, 'grad_norm': 2.9135537147521973, 'learning_rate': 1.3688362919132151e-05, 'epoch': 0.95}


 33%|███▎      | 169/507 [00:58<02:37,  2.15it/s]
 33%|███▎      | 169/507 [01:01<02:37,  2.15it/s]

{'eval_loss': 0.24542610347270966, 'eval_accuracy': 0.9132610508757297, 'eval_f1': 0.8663239074550129, 'eval_runtime': 2.7408, 'eval_samples_per_second': 437.471, 'eval_steps_per_second': 3.649, 'epoch': 1.0}


 34%|███▎      | 170/507 [01:01<07:44,  1.38s/it]

{'loss': 0.253, 'grad_norm': 2.638707399368286, 'learning_rate': 1.3293885601577909e-05, 'epoch': 1.01}


 36%|███▌      | 180/507 [01:05<02:04,  2.64it/s]

{'loss': 0.252, 'grad_norm': 4.371758937835693, 'learning_rate': 1.2899408284023668e-05, 'epoch': 1.07}


 37%|███▋      | 190/507 [01:09<01:56,  2.72it/s]

{'loss': 0.235, 'grad_norm': 3.4787752628326416, 'learning_rate': 1.250493096646943e-05, 'epoch': 1.12}


 39%|███▉      | 200/507 [01:12<01:37,  3.16it/s]

{'loss': 0.2768, 'grad_norm': 2.4601473808288574, 'learning_rate': 1.2110453648915189e-05, 'epoch': 1.18}


 41%|████▏     | 210/507 [01:15<01:40,  2.95it/s]

{'loss': 0.2345, 'grad_norm': 2.4465718269348145, 'learning_rate': 1.1715976331360948e-05, 'epoch': 1.24}


 43%|████▎     | 220/507 [01:19<01:43,  2.77it/s]

{'loss': 0.2527, 'grad_norm': 3.817941427230835, 'learning_rate': 1.1321499013806707e-05, 'epoch': 1.3}


 45%|████▌     | 230/507 [01:22<01:33,  2.97it/s]

{'loss': 0.2863, 'grad_norm': 2.899588108062744, 'learning_rate': 1.0927021696252466e-05, 'epoch': 1.36}


 47%|████▋     | 240/507 [01:26<01:15,  3.56it/s]

{'loss': 0.2678, 'grad_norm': 3.582040309906006, 'learning_rate': 1.0532544378698226e-05, 'epoch': 1.42}


 49%|████▉     | 250/507 [01:29<01:33,  2.76it/s]

{'loss': 0.2577, 'grad_norm': 3.276808738708496, 'learning_rate': 1.0138067061143987e-05, 'epoch': 1.48}


 51%|█████▏    | 260/507 [01:32<01:21,  3.03it/s]

{'loss': 0.2795, 'grad_norm': 4.357559680938721, 'learning_rate': 9.743589743589744e-06, 'epoch': 1.54}


 53%|█████▎    | 270/507 [01:35<01:13,  3.24it/s]

{'loss': 0.226, 'grad_norm': 3.8930981159210205, 'learning_rate': 9.349112426035503e-06, 'epoch': 1.6}


 55%|█████▌    | 280/507 [01:39<01:17,  2.94it/s]

{'loss': 0.2286, 'grad_norm': 2.2554616928100586, 'learning_rate': 8.954635108481263e-06, 'epoch': 1.66}


 57%|█████▋    | 290/507 [01:43<01:19,  2.74it/s]

{'loss': 0.2396, 'grad_norm': 2.8817203044891357, 'learning_rate': 8.560157790927024e-06, 'epoch': 1.72}


 59%|█████▉    | 300/507 [01:46<01:20,  2.56it/s]

{'loss': 0.2425, 'grad_norm': 4.655620574951172, 'learning_rate': 8.165680473372781e-06, 'epoch': 1.78}


 61%|██████    | 310/507 [01:50<00:55,  3.55it/s]

{'loss': 0.2894, 'grad_norm': 2.336229085922241, 'learning_rate': 7.77120315581854e-06, 'epoch': 1.83}


 63%|██████▎   | 320/507 [01:53<01:04,  2.90it/s]

{'loss': 0.2429, 'grad_norm': 3.9970695972442627, 'learning_rate': 7.3767258382643005e-06, 'epoch': 1.89}


 65%|██████▌   | 330/507 [01:56<01:03,  2.77it/s]

{'loss': 0.2501, 'grad_norm': 3.426359176635742, 'learning_rate': 6.98224852071006e-06, 'epoch': 1.95}


 67%|██████▋   | 338/507 [01:59<01:00,  2.79it/s]
 67%|██████▋   | 338/507 [02:02<01:00,  2.79it/s]

{'eval_loss': 0.23134386539459229, 'eval_accuracy': 0.9182652210175146, 'eval_f1': 0.8759493670886076, 'eval_runtime': 2.3069, 'eval_samples_per_second': 519.736, 'eval_steps_per_second': 4.335, 'epoch': 2.0}


 67%|██████▋   | 340/507 [02:03<02:37,  1.06it/s]

{'loss': 0.1962, 'grad_norm': 3.9682326316833496, 'learning_rate': 6.587771203155819e-06, 'epoch': 2.01}


 69%|██████▉   | 350/507 [02:06<00:59,  2.62it/s]

{'loss': 0.217, 'grad_norm': 3.336657762527466, 'learning_rate': 6.193293885601579e-06, 'epoch': 2.07}


 71%|███████   | 360/507 [02:10<00:47,  3.12it/s]

{'loss': 0.3004, 'grad_norm': 4.232394218444824, 'learning_rate': 5.7988165680473375e-06, 'epoch': 2.13}


 73%|███████▎  | 370/507 [02:13<00:50,  2.71it/s]

{'loss': 0.2093, 'grad_norm': 3.9263665676116943, 'learning_rate': 5.404339250493097e-06, 'epoch': 2.19}


 75%|███████▍  | 380/507 [02:17<00:44,  2.86it/s]

{'loss': 0.209, 'grad_norm': 3.313805103302002, 'learning_rate': 5.009861932938857e-06, 'epoch': 2.25}


 77%|███████▋  | 390/507 [02:20<00:36,  3.19it/s]

{'loss': 0.2256, 'grad_norm': 2.428868055343628, 'learning_rate': 4.615384615384616e-06, 'epoch': 2.31}


 79%|███████▉  | 400/507 [02:23<00:31,  3.41it/s]

{'loss': 0.2293, 'grad_norm': 4.139084339141846, 'learning_rate': 4.220907297830375e-06, 'epoch': 2.37}


 81%|████████  | 410/507 [02:27<00:37,  2.61it/s]

{'loss': 0.2423, 'grad_norm': 3.2295563220977783, 'learning_rate': 3.826429980276135e-06, 'epoch': 2.43}


 83%|████████▎ | 420/507 [02:31<00:32,  2.68it/s]

{'loss': 0.2142, 'grad_norm': 2.2318713665008545, 'learning_rate': 3.4319526627218935e-06, 'epoch': 2.49}


 85%|████████▍ | 430/507 [02:34<00:25,  3.04it/s]

{'loss': 0.1668, 'grad_norm': 3.9607889652252197, 'learning_rate': 3.037475345167653e-06, 'epoch': 2.54}


 87%|████████▋ | 440/507 [02:38<00:21,  3.07it/s]

{'loss': 0.2314, 'grad_norm': 2.5219662189483643, 'learning_rate': 2.6429980276134125e-06, 'epoch': 2.6}


 89%|████████▉ | 450/507 [02:41<00:19,  2.92it/s]

{'loss': 0.224, 'grad_norm': 3.141319751739502, 'learning_rate': 2.2485207100591717e-06, 'epoch': 2.66}


 91%|█████████ | 460/507 [02:45<00:18,  2.61it/s]

{'loss': 0.2475, 'grad_norm': 2.478579044342041, 'learning_rate': 1.8540433925049312e-06, 'epoch': 2.72}


 93%|█████████▎| 470/507 [02:48<00:11,  3.28it/s]

{'loss': 0.2305, 'grad_norm': 4.76114559173584, 'learning_rate': 1.4595660749506904e-06, 'epoch': 2.78}


 95%|█████████▍| 480/507 [02:52<00:09,  2.76it/s]

{'loss': 0.2119, 'grad_norm': 1.7352054119110107, 'learning_rate': 1.06508875739645e-06, 'epoch': 2.84}


 97%|█████████▋| 490/507 [02:55<00:06,  2.75it/s]

{'loss': 0.2462, 'grad_norm': 2.7153799533843994, 'learning_rate': 6.706114398422091e-07, 'epoch': 2.9}


 99%|█████████▊| 500/507 [02:59<00:02,  2.91it/s]

{'loss': 0.2512, 'grad_norm': 3.6124022006988525, 'learning_rate': 2.7613412228796843e-07, 'epoch': 2.96}


100%|██████████| 507/507 [03:01<00:00,  3.40it/s]
100%|██████████| 507/507 [03:03<00:00,  3.40it/s]

{'eval_loss': 0.23135297000408173, 'eval_accuracy': 0.9174311926605505, 'eval_f1': 0.8748419721871049, 'eval_runtime': 2.3411, 'eval_samples_per_second': 512.143, 'eval_steps_per_second': 4.271, 'epoch': 3.0}


100%|██████████| 507/507 [03:04<00:00,  2.75it/s]

{'train_runtime': 184.3464, 'train_samples_per_second': 175.561, 'train_steps_per_second': 2.75, 'train_loss': 0.27392145938421847, 'epoch': 3.0}





TrainOutput(global_step=507, training_loss=0.27392145938421847, metrics={'train_runtime': 184.3464, 'train_samples_per_second': 175.561, 'train_steps_per_second': 2.75, 'total_flos': 471647796782640.0, 'train_loss': 0.27392145938421847, 'epoch': 3.0})

- 评估

In [51]:
# Evaluate on the eval_dataset given to the Trainer. train_args sets
# load_best_model_at_end=True, so this scores the best checkpoint by F1.
trainer.evaluate()

100%|██████████| 10/10 [00:01<00:00,  5.10it/s]


{'eval_loss': 0.23134386539459229,
 'eval_accuracy': 0.9182652210175146,
 'eval_f1': 0.8759493670886076,
 'eval_runtime': 2.2119,
 'eval_samples_per_second': 542.056,
 'eval_steps_per_second': 4.521,
 'epoch': 3.0}

#### 2.5 模型预测

In [52]:
# Get raw per-class scores, label ids and metrics for the held-out split.
trainer.predict(tokenized_datasets["test"])

100%|██████████| 10/10 [00:01<00:00,  5.14it/s]


PredictionOutput(predictions=array([[ 1.2873403, -1.068345 ],
       [-2.1651404,  2.165618 ],
       [ 1.9800029, -2.0699615],
       ...,
       [-0.5200064,  0.4800528],
       [ 1.6012837, -1.5406739],
       [ 1.6972494, -2.3993273]], dtype=float32), label_ids=array([0, 1, 0, ..., 1, 0, 0]), metrics={'test_loss': 0.23134386539459229, 'test_accuracy': 0.9182652210175146, 'test_f1': 0.8759493670886076, 'test_runtime': 2.1935, 'test_samples_per_second': 546.621, 'test_steps_per_second': 4.559})

In [53]:
from transformers import pipeline

# Display names for the two class ids ("差评！" = negative review, "好评！" = positive review).
id2_label = {0: "差评！", 1: "好评！"}
model.config.id2label = id2_label
# Run the pipeline on whatever device the fine-tuned model currently lives on.
# A hard-coded device=0 assumes a CUDA GPU and fails on CPU-only or MPS machines.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)

# Sample sentence, roughly "The food is a bit salty!".
sen = "饭菜有些咸！"
pipe(sen)

[{'label': '差评！', 'score': 0.9463842511177063}]

#### 2.6 微调模型保存

- 保存到本地

In [56]:
# Write the model and the tokenizer into the same local directory so that
# both can be reloaded from that single path.
local_model_path = './my-awesome-model'
model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)

('./my-awesome-model/tokenizer_config.json',
 './my-awesome-model/special_tokens_map.json',
 './my-awesome-model/vocab.txt',
 './my-awesome-model/added_tokens.json',
 './my-awesome-model/tokenizer.json')

- 从本地加载和预测

In [57]:
# Reload the fine-tuned model from the local directory.
model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
# NOTE(review): depends on `id2_label` defined in the earlier prediction cell,
# so this cell is not runnable on its own in a fresh kernel.
model.config.id2label = id2_label

tokenizer = AutoTokenizer.from_pretrained(local_model_path)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Same sample sentence as before, to compare with the pre-save prediction.
sen = "饭菜有些咸！"
pipe(sen)

[{'label': '差评！', 'score': 0.9463842511177063}]