In [2]:
from utils import *
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
import numpy as np
import evaluate

In [3]:
# 指定文件路径
train_file_path = 'data/train.json'  # 训练集文件路径
test_file_path = 'data/test.json'    # 验证集（或测试集）文件路径

# 加载数据集
dataset = load_dataset('json', data_files={'train': train_file_path, 'test': test_file_path})

label_map, label_nums = generate_label_map_and_count(train_file_path)
print("Label Map:", label_map)
print("Number of Labels:", label_nums)

# 加载模型
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=label_nums)


Label Map: OrderedDict([('O', 0), ('B-LOC', 1), ('B-ORG', 2), ('B-PER', 3), ('I-LOC', 4), ('I-ORG', 5), ('I-PER', 6)])
Number of Labels: 7


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 数据预处理

In [4]:
# 应用预处理到加载的数据集
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128, return_offsets_mapping=True)
    labels = []
    
    for i, doc_labels in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # 获取每个令牌对应的词汇ID
        label_ids = [-100 if id is None else label_map[doc_labels[id]] for id in word_ids]
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    tokenized_inputs.pop("offset_mapping") # 不需要返回偏移量映射给模型
    return tokenized_inputs


tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


# 评估函数定义

In [5]:

metric = evaluate.load('seqeval')
label_map = {'O': 0, 'B-LOC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-ORG': 5, 'I-PER': 6}
# 反转label_map，用于ID到标签的转换
id_to_label = {id: label for label, id in label_map.items()}

def compute_metrics(eval_preds):
    seqeval_metric = evaluate.load('seqeval')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=2)
    
    true_labels = []
    pred_labels = []
    for i, label_seq in enumerate(labels):
        true_seq = []
        pred_seq = []
        for j, label_id in enumerate(label_seq):
            if label_id != -100:  # 忽略特殊的-100标签
                true_seq.append(id_to_label.get(label_id, "O"))
                
                pred_id = predictions[i, j]
                pred_seq.append(id_to_label.get(pred_id, "O"))
                    
        true_labels.append(true_seq)
        pred_labels.append(pred_seq)
    
    results = seqeval_metric.compute(predictions=pred_labels, references=true_labels, scheme="IOB2", mode="strict")
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [6]:

print(tokenized_dataset["train"].column_names)
tokenized_dataset["train"][0]["text"]
tokenized_dataset["train"][0]["labels"]
print(tokenized_dataset["test"].column_names)

['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask']
['labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask']


In [7]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir="./tmp",         # 输出目录
    num_train_epochs=3,             # 训练轮数
    per_device_train_batch_size=8,  # 训练批次大小
    per_device_eval_batch_size=16,   # 评估批次大小
    warmup_steps=500,               # 预热步数
    weight_decay=0.01,              # 权重衰减
    logging_dir="./logs",           # 日志目录
    logging_strategy='epoch',# 日志记录步数
    evaluation_strategy='epoch',    # 评估策略
    save_strategy='epoch',          # 保存策略
    load_best_model_at_end=True,    # 训练结束时加载最佳模型
)

In [8]:
# 初始化Trainer
trainer = Trainer(
    model=model,                         # 使用预处理后初始化的模型
    args=training_args,                  # 训练参数
    train_dataset=tokenized_dataset["train"], # 训练数据集
    eval_dataset=tokenized_dataset["test"],     # 验证/测试数据集
    compute_metrics=compute_metrics,
)

# 开始训练
trainer.train()
# 训练结束后手动保存模型
trainer.save_model("./models/my-bert-chinese-ner")



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0808,0.040567,0.919086,0.883507,0.900946,0.98893
2,0.0271,0.038639,0.920483,0.919157,0.91982,0.99081
3,0.0123,0.036514,0.93489,0.922938,0.928876,0.991904


In [9]:
# 评估模型
trainer.evaluate()

{'eval_loss': 0.036513760685920715,
 'eval_precision': 0.9348896589458325,
 'eval_recall': 0.9229384227583723,
 'eval_f1': 0.9288756002536922,
 'eval_accuracy': 0.9919044956518636,
 'eval_runtime': 12.0875,
 'eval_samples_per_second': 361.116,
 'eval_steps_per_second': 22.585,
 'epoch': 3.0}