### 1. 确认模型任务的原有行为

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Baseline check: load the pretrained checkpoint with a 2-class head and look
# at what it predicts before any fine-tuning has happened.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")
model = AutoModelForSequenceClassification.from_pretrained(
    "hfl/rbt3",
    num_labels=2,
)

# Map class indices to the labels the pipeline should display.
model.config.id2label = {0: "差评！", 1: "好评！"}

pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

sen = "饭菜有些咸！"
pipe(sen)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'label': '差评！', 'score': 0.5466088652610779}]

### 2. 模型微调

#### 2.1 准备环境，导入相关包

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

#### 2.2 准备数据

- 加载数据

In [3]:
def _has_review(example):
    """Return True when the row's "review" field is not None."""
    return example["review"] is not None


# Read the CSV as a single "train" split and drop rows without review text.
dataset = (
    load_dataset("csv", data_files="./waimai_10k.csv", split="train")
    .filter(_has_review)
)
dataset

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Dataset({
    features: ['label', 'review'],
    num_rows: 11987
})

- 数据预处理

In [4]:
import torch

# Tokenizer belonging to the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")


def process_function(examples):
    """Tokenize the "review" texts of a batch and attach their labels.

    Args:
        examples: a batch exposing the "review" and "label" columns.

    Returns:
        The tokenizer output (truncated to at most 128 tokens, no padding
        requested here) with an extra "labels" entry copied from "label".
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize in batches; the original columns are removed so that only the
# fields produced by process_function remain.
tokenized_dataset = dataset.map(
    process_function,
    remove_columns=dataset.column_names,
    batched=True,
)
tokenized_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 11987
})

- 分割数据集

In [5]:
# Hold out 10% of the examples for evaluation. The fixed seed makes the split,
# and therefore every metric reported afterwards, reproducible across
# "Restart Kernel -> Run All" executions.
tokenized_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10788
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1199
    })
})

#### 2.3 加载预训练模型

In [6]:
# Load the checkpoint with an explicit 2-class classification head, matching
# the baseline model created in section 1. The model is not moved to a device
# manually here; the Trainer is relied on for device placement.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3", num_labels=2)
model.config

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "hfl/rbt3",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

#### 2.4 训练与评估

- 定义评估函数

In [7]:
import evaluate

# Metric objects are loaded once and reused by every evaluation pass.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: a pair ``(predictions, labels)``. The predictions are
            reduced to class ids with an argmax over the last axis.

    Returns:
        A single dict holding the accuracy result merged with the F1 result.
    """
    raw_outputs, references = eval_predict
    predicted_ids = raw_outputs.argmax(axis=-1)
    scores = acc_metric.compute(predictions=predicted_ids, references=references)
    scores.update(f1_metric.compute(predictions=predicted_ids, references=references))
    return scores

- 定义训练参数

In [8]:
train_args = TrainingArguments(
    output_dir="./checkpoints",        # directory checkpoints are written to
    per_device_train_batch_size=64,    # batch size per device while training
    per_device_eval_batch_size=128,    # batch size per device while evaluating
    logging_steps=10,                  # logging frequency, in steps
    eval_strategy="epoch",             # evaluation strategy
    save_strategy="epoch",             # checkpoint saving strategy
    save_total_limit=3,                # maximum number of checkpoints kept
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # weight decay
    metric_for_best_model="f1",        # metric used to select the best model
    load_best_model_at_end=True,       # load the best model once training ends
)
train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=

In [9]:
from transformers import DataCollatorWithPadding
# Padding is applied per batch by the collator; process_function itself does
# not request any padding.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=padding_collator,
    compute_metrics=eval_metric,
)

- 执行训练

In [10]:
# Fine-tune the model. Per train_args, evaluation and checkpoint saving are
# both configured with the "epoch" strategy.
trainer.train()

  2%|▏         | 10/507 [00:04<03:00,  2.75it/s]

{'loss': 0.615, 'grad_norm': 1.8407313823699951, 'learning_rate': 1.9605522682445763e-05, 'epoch': 0.06}


  4%|▍         | 20/507 [00:07<02:42,  2.99it/s]

{'loss': 0.4912, 'grad_norm': 2.28098464012146, 'learning_rate': 1.921104536489152e-05, 'epoch': 0.12}


  6%|▌         | 30/507 [00:11<02:49,  2.82it/s]

{'loss': 0.4087, 'grad_norm': 3.172823667526245, 'learning_rate': 1.881656804733728e-05, 'epoch': 0.18}


  8%|▊         | 40/507 [00:14<02:58,  2.62it/s]

{'loss': 0.3397, 'grad_norm': 3.5258448123931885, 'learning_rate': 1.842209072978304e-05, 'epoch': 0.24}


 10%|▉         | 50/507 [00:18<02:57,  2.58it/s]

{'loss': 0.3588, 'grad_norm': 3.596682548522949, 'learning_rate': 1.80276134122288e-05, 'epoch': 0.3}


 12%|█▏        | 60/507 [00:22<02:43,  2.73it/s]

{'loss': 0.3524, 'grad_norm': 2.0398333072662354, 'learning_rate': 1.7633136094674557e-05, 'epoch': 0.36}


 14%|█▍        | 70/507 [00:25<02:32,  2.86it/s]

{'loss': 0.3783, 'grad_norm': 5.001039981842041, 'learning_rate': 1.7238658777120315e-05, 'epoch': 0.41}


 16%|█▌        | 80/507 [00:29<02:39,  2.68it/s]

{'loss': 0.3375, 'grad_norm': 2.783374547958374, 'learning_rate': 1.6844181459566076e-05, 'epoch': 0.47}


 18%|█▊        | 90/507 [00:33<02:34,  2.70it/s]

{'loss': 0.2593, 'grad_norm': 4.875743389129639, 'learning_rate': 1.6449704142011837e-05, 'epoch': 0.53}


 20%|█▉        | 100/507 [00:36<02:08,  3.17it/s]

{'loss': 0.2711, 'grad_norm': 2.658362865447998, 'learning_rate': 1.6055226824457594e-05, 'epoch': 0.59}


 22%|██▏       | 110/507 [00:40<02:36,  2.54it/s]

{'loss': 0.2785, 'grad_norm': 3.4528117179870605, 'learning_rate': 1.5660749506903355e-05, 'epoch': 0.65}


 24%|██▎       | 120/507 [00:43<02:19,  2.77it/s]

{'loss': 0.2852, 'grad_norm': 4.155889987945557, 'learning_rate': 1.5266272189349113e-05, 'epoch': 0.71}


 26%|██▌       | 130/507 [00:47<02:11,  2.86it/s]

{'loss': 0.2571, 'grad_norm': 2.0127270221710205, 'learning_rate': 1.4871794871794874e-05, 'epoch': 0.77}


 28%|██▊       | 140/507 [00:50<02:11,  2.79it/s]

{'loss': 0.2688, 'grad_norm': 2.631019353866577, 'learning_rate': 1.4477317554240633e-05, 'epoch': 0.83}


 30%|██▉       | 150/507 [00:54<02:12,  2.69it/s]

{'loss': 0.3077, 'grad_norm': 2.5243868827819824, 'learning_rate': 1.4082840236686392e-05, 'epoch': 0.89}


 32%|███▏      | 160/507 [00:57<02:03,  2.80it/s]

{'loss': 0.3371, 'grad_norm': 5.9462971687316895, 'learning_rate': 1.3688362919132151e-05, 'epoch': 0.95}


                                                 
 33%|███▎      | 169/507 [01:04<02:07,  2.65it/s]

{'eval_loss': 0.2778220474720001, 'eval_accuracy': 0.8957464553794829, 'eval_f1': 0.8378728923476005, 'eval_runtime': 2.6953, 'eval_samples_per_second': 444.846, 'eval_steps_per_second': 3.71, 'epoch': 1.0}


 34%|███▎      | 170/507 [01:04<07:10,  1.28s/it]

{'loss': 0.3016, 'grad_norm': 1.7471638917922974, 'learning_rate': 1.3293885601577909e-05, 'epoch': 1.01}


 36%|███▌      | 180/507 [01:07<01:47,  3.05it/s]

{'loss': 0.2408, 'grad_norm': 4.324486255645752, 'learning_rate': 1.2899408284023668e-05, 'epoch': 1.07}


 37%|███▋      | 190/507 [01:11<01:39,  3.19it/s]

{'loss': 0.2435, 'grad_norm': 3.3531062602996826, 'learning_rate': 1.250493096646943e-05, 'epoch': 1.12}


 39%|███▉      | 200/507 [01:14<01:34,  3.24it/s]

{'loss': 0.265, 'grad_norm': 2.4972119331359863, 'learning_rate': 1.2110453648915189e-05, 'epoch': 1.18}


 41%|████▏     | 210/507 [01:17<01:20,  3.70it/s]

{'loss': 0.2389, 'grad_norm': 2.7506918907165527, 'learning_rate': 1.1715976331360948e-05, 'epoch': 1.24}


 43%|████▎     | 220/507 [01:21<01:40,  2.86it/s]

{'loss': 0.2754, 'grad_norm': 5.1883649826049805, 'learning_rate': 1.1321499013806707e-05, 'epoch': 1.3}


 45%|████▌     | 230/507 [01:24<01:34,  2.94it/s]

{'loss': 0.2438, 'grad_norm': 4.316596031188965, 'learning_rate': 1.0927021696252466e-05, 'epoch': 1.36}


 47%|████▋     | 240/507 [01:28<01:27,  3.04it/s]

{'loss': 0.2443, 'grad_norm': 2.6976797580718994, 'learning_rate': 1.0532544378698226e-05, 'epoch': 1.42}


 49%|████▉     | 250/507 [01:31<01:28,  2.89it/s]

{'loss': 0.2397, 'grad_norm': 2.765881299972534, 'learning_rate': 1.0138067061143987e-05, 'epoch': 1.48}


 51%|█████▏    | 260/507 [01:34<01:28,  2.78it/s]

{'loss': 0.314, 'grad_norm': 4.0533294677734375, 'learning_rate': 9.743589743589744e-06, 'epoch': 1.54}


 53%|█████▎    | 270/507 [01:38<01:17,  3.04it/s]

{'loss': 0.2171, 'grad_norm': 2.269906759262085, 'learning_rate': 9.349112426035503e-06, 'epoch': 1.6}


 55%|█████▌    | 280/507 [01:41<01:19,  2.85it/s]

{'loss': 0.2586, 'grad_norm': 1.838826298713684, 'learning_rate': 8.954635108481263e-06, 'epoch': 1.66}


 57%|█████▋    | 290/507 [01:45<01:11,  3.02it/s]

{'loss': 0.2326, 'grad_norm': 3.9657132625579834, 'learning_rate': 8.560157790927024e-06, 'epoch': 1.72}


 59%|█████▉    | 300/507 [01:48<01:04,  3.21it/s]

{'loss': 0.2089, 'grad_norm': 2.4238359928131104, 'learning_rate': 8.165680473372781e-06, 'epoch': 1.78}


 61%|██████    | 310/507 [01:51<01:03,  3.13it/s]

{'loss': 0.2397, 'grad_norm': 4.144314289093018, 'learning_rate': 7.77120315581854e-06, 'epoch': 1.83}


 63%|██████▎   | 320/507 [01:55<01:08,  2.75it/s]

{'loss': 0.2346, 'grad_norm': 3.450509548187256, 'learning_rate': 7.3767258382643005e-06, 'epoch': 1.89}


 65%|██████▌   | 330/507 [01:58<01:00,  2.94it/s]

{'loss': 0.2434, 'grad_norm': 2.632070541381836, 'learning_rate': 6.98224852071006e-06, 'epoch': 1.95}


                                                 
 67%|██████▋   | 338/507 [02:03<00:46,  3.67it/s]

{'eval_loss': 0.2719721496105194, 'eval_accuracy': 0.9032527105921602, 'eval_f1': 0.8465608465608465, 'eval_runtime': 2.2779, 'eval_samples_per_second': 526.355, 'eval_steps_per_second': 4.39, 'epoch': 2.0}


 67%|██████▋   | 340/507 [02:04<02:22,  1.17it/s]

{'loss': 0.2618, 'grad_norm': 2.2482895851135254, 'learning_rate': 6.587771203155819e-06, 'epoch': 2.01}


 69%|██████▉   | 350/507 [02:07<00:54,  2.88it/s]

{'loss': 0.2469, 'grad_norm': 4.963229179382324, 'learning_rate': 6.193293885601579e-06, 'epoch': 2.07}


 71%|███████   | 360/507 [02:11<00:56,  2.60it/s]

{'loss': 0.248, 'grad_norm': 3.213634490966797, 'learning_rate': 5.7988165680473375e-06, 'epoch': 2.13}


 73%|███████▎  | 370/507 [02:15<00:49,  2.77it/s]

{'loss': 0.1985, 'grad_norm': 2.4743404388427734, 'learning_rate': 5.404339250493097e-06, 'epoch': 2.19}


 75%|███████▍  | 380/507 [02:19<00:46,  2.72it/s]

{'loss': 0.2146, 'grad_norm': 3.061187505722046, 'learning_rate': 5.009861932938857e-06, 'epoch': 2.25}


 77%|███████▋  | 390/507 [02:22<00:36,  3.24it/s]

{'loss': 0.1982, 'grad_norm': 1.8067915439605713, 'learning_rate': 4.615384615384616e-06, 'epoch': 2.31}


 79%|███████▉  | 400/507 [02:25<00:34,  3.08it/s]

{'loss': 0.2229, 'grad_norm': 2.916599750518799, 'learning_rate': 4.220907297830375e-06, 'epoch': 2.37}


 81%|████████  | 410/507 [02:29<00:36,  2.63it/s]

{'loss': 0.2561, 'grad_norm': 3.7369627952575684, 'learning_rate': 3.826429980276135e-06, 'epoch': 2.43}


 83%|████████▎ | 420/507 [02:33<00:30,  2.86it/s]

{'loss': 0.21, 'grad_norm': 3.8664727210998535, 'learning_rate': 3.4319526627218935e-06, 'epoch': 2.49}


 85%|████████▍ | 430/507 [02:35<00:21,  3.61it/s]

{'loss': 0.2325, 'grad_norm': 2.325525999069214, 'learning_rate': 3.037475345167653e-06, 'epoch': 2.54}


 87%|████████▋ | 440/507 [02:39<00:24,  2.75it/s]

{'loss': 0.2526, 'grad_norm': 4.785170555114746, 'learning_rate': 2.6429980276134125e-06, 'epoch': 2.6}


 89%|████████▉ | 450/507 [02:42<00:20,  2.83it/s]

{'loss': 0.2285, 'grad_norm': 2.6223301887512207, 'learning_rate': 2.2485207100591717e-06, 'epoch': 2.66}


 91%|█████████ | 460/507 [02:46<00:16,  2.77it/s]

{'loss': 0.2051, 'grad_norm': 2.9570510387420654, 'learning_rate': 1.8540433925049312e-06, 'epoch': 2.72}


 93%|█████████▎| 470/507 [02:49<00:13,  2.83it/s]

{'loss': 0.2188, 'grad_norm': 1.9123773574829102, 'learning_rate': 1.4595660749506904e-06, 'epoch': 2.78}


 95%|█████████▍| 480/507 [02:53<00:10,  2.56it/s]

{'loss': 0.1855, 'grad_norm': 2.722668409347534, 'learning_rate': 1.06508875739645e-06, 'epoch': 2.84}


 97%|█████████▋| 490/507 [02:56<00:05,  3.10it/s]

{'loss': 0.2277, 'grad_norm': 3.0212206840515137, 'learning_rate': 6.706114398422091e-07, 'epoch': 2.9}


 99%|█████████▊| 500/507 [02:59<00:02,  3.05it/s]

{'loss': 0.2215, 'grad_norm': 2.5214579105377197, 'learning_rate': 2.7613412228796843e-07, 'epoch': 2.96}


                                                 
100%|██████████| 507/507 [03:04<00:00,  3.28it/s]

{'eval_loss': 0.2666938602924347, 'eval_accuracy': 0.9049207673060884, 'eval_f1': 0.8488063660477454, 'eval_runtime': 2.3621, 'eval_samples_per_second': 507.59, 'eval_steps_per_second': 4.233, 'epoch': 3.0}


100%|██████████| 507/507 [03:04<00:00,  2.75it/s]

{'train_runtime': 184.6857, 'train_samples_per_second': 175.238, 'train_steps_per_second': 2.745, 'train_loss': 0.27138275907354714, 'epoch': 3.0}





TrainOutput(global_step=507, training_loss=0.27138275907354714, metrics={'train_runtime': 184.6857, 'train_samples_per_second': 175.238, 'train_steps_per_second': 2.745, 'total_flos': 466439293911168.0, 'train_loss': 0.27138275907354714, 'epoch': 3.0})

- 评估

In [11]:
# Score the trainer's current model on the eval_dataset it was constructed
# with (the "test" split of tokenized_datasets).
trainer.evaluate()

100%|██████████| 10/10 [00:02<00:00,  4.96it/s]


{'eval_loss': 0.2666938602924347,
 'eval_accuracy': 0.9049207673060884,
 'eval_f1': 0.8488063660477454,
 'eval_runtime': 2.2742,
 'eval_samples_per_second': 527.211,
 'eval_steps_per_second': 4.397,
 'epoch': 3.0}

#### 2.5 模型预测

In [12]:
# Run inference on the held-out split. The displayed result carries the raw
# per-class scores, the reference label ids, and metrics prefixed with "test_".
trainer.predict(tokenized_datasets["test"])

100%|██████████| 10/10 [00:02<00:00,  4.97it/s]


PredictionOutput(predictions=array([[ 2.0266874 , -2.6203175 ],
       [ 1.6777284 , -1.997865  ],
       [ 0.19724365, -0.6200299 ],
       ...,
       [ 1.4871122 , -2.272513  ],
       [ 1.4682922 , -1.8287487 ],
       [-2.2058794 ,  1.8980082 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 1]), metrics={'test_loss': 0.2666938602924347, 'test_accuracy': 0.9049207673060884, 'test_f1': 0.8488063660477454, 'test_runtime': 2.2742, 'test_samples_per_second': 527.208, 'test_steps_per_second': 4.397})

In [13]:
from transformers import pipeline

# Display names for the two class ids.
id2_label = {0: "差评！", 1: "好评！"}
model.config.id2label = id2_label
# Keep the reverse mapping in sync so the config stays self-consistent when the
# model is saved.
model.config.label2id = {label: idx for idx, label in id2_label.items()}

# Run the pipeline on whatever device the model already lives on after
# training, instead of hard-coding accelerator index 0, which is not available
# on every machine (e.g. CPU-only environments).
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)

sen = "饭菜有些咸！"
pipe(sen)

[{'label': '差评！', 'score': 0.9448801279067993}]

#### 2.6 微调模型保存

- 保存到本地

In [14]:
# Write the model and the tokenizer files into the same directory so that it
# can later be passed to from_pretrained.
local_model_path = "./my-awesome-model"
model.save_pretrained(save_directory=local_model_path)
tokenizer.save_pretrained(save_directory=local_model_path)

('./my-awesome-model/tokenizer_config.json',
 './my-awesome-model/special_tokens_map.json',
 './my-awesome-model/vocab.txt',
 './my-awesome-model/added_tokens.json',
 './my-awesome-model/tokenizer.json')

- 从本地加载和预测

In [15]:
# Reload the tokenizer and the fine-tuned model from the local directory and
# repeat the same prediction as a round-trip check.
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
model.config.id2label = id2_label

pipe = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

sen = "饭菜有些咸！"
pipe(sen)

[{'label': '差评！', 'score': 0.9448801279067993}]