In [30]:
# Import the required packages
from transformers import AutoTokenizer, AutoModelForTokenClassification,TrainingArguments, Trainer,DataCollatorForTokenClassification
import evaluate
from datasets import load_dataset

In [31]:
# Load the dataset from the local ./peoples-daily-ner directory
# (the cache_dir argument sets the dataset cache directory).
ner_datasets = load_dataset('./peoples-daily-ner',cache_dir='./')
# Display the DatasetDict to inspect the available splits and their columns.
ner_datasets

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 166920
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 18552
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 37096
    })
})

In [32]:
# Peek at the first training example: its id, token list and integer NER tag ids.
ner_datasets["train"][0]

{'id': '0',
 'tokens': ['海',
  '钓',
  '比',
  '赛',
  '地',
  '点',
  '在',
  '厦',
  '门',
  '与',
  '金',
  '门',
  '之',
  '间',
  '的',
  '海',
  '域',
  '。'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0]}

In [33]:
# Show the column schema of the training split (ner_tags holds ClassLabel ids).
ner_datasets["train"].features

{'id': Value('string'),
 'tokens': List(Value('string')),
 'ner_tags': List(ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']))}

In [34]:
# Tag names in id order; used later to turn integer tag ids into label strings
# and to size the classification head.
label_list = ner_datasets["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [35]:
# Dataset preprocessing: load the tokenizer from the local checkpoint directory.
# The path is written as a raw string because a Windows path in a normal string
# literal contains "\H" and "\c", which are invalid escape sequences (they emit a
# SyntaxWarning on Python 3.12+). The resulting string value is unchanged.
tokenizer = AutoTokenizer.from_pretrained(r"D:\Hugging Face Hub\chinese-macbert-base")

In [36]:
# is_split_into_words=True: the list of pre-split characters is encoded as ONE
# sentence rather than as a batch of separate texts. A single input word may
# still be split into several tokens, so tokens and words are not guaranteed
# to line up one-to-one.
tokenizer(ner_datasets["train"][0]["tokens"],is_split_into_words=True)

{'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [37]:
def process_function(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to tokens.

    Args:
        examples: A batch mapping with "tokens" (one list of words per
            sentence) and "ner_tags" (one list of integer tag ids per
            sentence, parallel to "tokens").

    Returns:
        The tokenizer output for the batch with an added "labels" entry:
        one list per sentence holding, for every produced token, the tag of
        the word it came from, or -100 where the token has no source word
        (word id is None, e.g. special tokens).
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
    )

    def align(batch_idx, word_tags):
        # Every token inherits the tag of its originating word; tokens without
        # a word get -100 (the same sentinel eval_metric filters out).
        return [
            -100 if wid is None else word_tags[wid]
            for wid in encoded.word_ids(batch_index=batch_idx)
        ]

    encoded["labels"] = [
        align(batch_idx, word_tags)
        for batch_idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [38]:
# Apply process_function to every split; batched=True hands it a batch of
# examples per call, which is what its batch_index-based alignment expects.
tokenized_datasets = ner_datasets.map(process_function,batched=True)
tokenized_datasets

Map:   0%|          | 0/166920 [00:00<?, ? examples/s]

Map:   0%|          | 0/18552 [00:00<?, ? examples/s]

Map:   0%|          | 0/37096 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 166920
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 18552
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 37096
    })
})

In [39]:
# Sanity-check one processed example: it should now carry input_ids,
# token_type_ids, attention_mask and the aligned labels.
print(tokenized_datasets["train"][0])

{'id': '0', 'tokens': ['海', '钓', '比', '赛', '地', '点', '在', '厦', '门', '与', '金', '门', '之', '间', '的', '海', '域', '。'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 3862, 7157, 3683, 6612, 1765, 4157, 1762, 1336, 7305, 680, 7032, 7305, 722, 7313, 4638, 3862, 1818, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0, 0, -100]}


In [40]:
# Create the model. The classification head defaults to a 2-label task, so
# num_labels is set explicitly to the size of our tag set.
# The path is a raw string: a Windows path in a normal string literal contains
# "\H" and "\c", which are invalid escape sequences (SyntaxWarning on
# Python 3.12+). The resulting string value is unchanged.
model = AutoModelForTokenClassification.from_pretrained(
    r"D:\Hugging Face Hub\chinese-macbert-base",
    num_labels=len(label_list),
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at D:\Hugging Face Hub\chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Create the evaluation metric: load the "seqeval" module via `evaluate`
# and display it to see its expected inputs and options.
seqeval = evaluate.load("seqeval")
seqeval

EvaluationModule(name: "seqeval", module_type: "metric", features: {'predictions': List(Value('string')), 'references': List(Value('string'))}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of shape (n_samples,), weights for individual samples. default: None
    zero_division: Which value to substitute as a

In [42]:
import numpy as np
def eval_metric(pred):
    """Compute the overall entity-level F1 for a batch of predictions.

    Args:
        pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-token class scores (argmax is taken over the last axis);
            ``labels`` holds the gold tag ids, with -100 marking positions
            to skip.

    Returns:
        ``{"f1": <overall_f1>}`` as reported by seqeval in strict IOB2 mode.
    """
    scores, gold_ids = pred
    pred_ids = np.argmax(scores, axis=-1)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions with a real gold label (-100 marks skipped tokens).
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    result = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
        mode="strict",
        scheme="IOB2",
    )
    return {"f1": result["overall_f1"]}

In [43]:
# Create the training arguments.
# NOTE(review): eval_strategy, save_strategy and load_best_model_at_end are
# commented out below, so no per-epoch evaluation/checkpoint selection is
# configured here; metric_for_best_model presumably has no effect until they
# are re-enabled — confirm against the TrainingArguments documentation.
args = TrainingArguments(
    output_dir="./model_for_ner",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    #eval_strategy="epoch",
    #save_strategy="epoch",
    # "f1" is the key returned by eval_metric.
    metric_for_best_model="f1",
    #load_best_model_at_end=True,
    # Log the training loss every 50 steps.
    logging_steps=50
)

In [44]:
# Create the Trainer: trains on the train split, uses the validation split as
# the default evaluation set, and scores with eval_metric.
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    compute_metrics = eval_metric,
    # Examples were tokenized without padding in process_function, so batches
    # are assembled by the token-classification collator.
    data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)
)

In [23]:
# Train the model (the training loss is logged every `logging_steps` steps, see args).
trainer.train()

Step,Training Loss
50,0.2121
100,0.0801
150,0.0646
200,0.0745
250,0.0598
300,0.053
350,0.0522
400,0.0584
450,0.048
500,0.05


KeyboardInterrupt: 

In [None]:
# Evaluate on the test split (passed explicitly instead of the Trainer's
# default validation set).
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

In [24]:
# Model inference: map class ids to tag names on the model config so that
# predictions are reported with names from label_list.
from transformers import pipeline
model.config.id2label = dict(enumerate(label_list))

In [29]:
# Build a token-classification pipeline around the fine-tuned model.
# device=0 selects the first CUDA device; aggregation_strategy="simple" makes
# the pipeline return grouped entities (each result has an "entity_group").
ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer,device=0,aggregation_strategy="simple")

Device set to use cuda:0


In [None]:
# Run NER on one sentence and collect the entity strings grouped by entity type.
x = "小明在北京上班"
res = ner_pipe(x)  # reuse x so the text that is sliced below is the text that was tagged

# Must be a dict (entity type -> list of surface strings). The previous list
# initialisation raised TypeError as soon as it was indexed by a string key.
ner_result = {}
for entity in res:
    # Slice the original text by the reported character offsets rather than
    # using the pipeline's reconstructed word (presumably because that word is
    # re-joined from tokens and may contain spaces for Chinese — confirm).
    ner_result.setdefault(entity["entity_group"], []).append(x[entity["start"]:entity["end"]])
ner_result