# 命名实体标注(Named Entity Recognition, NER)

In [1]:
# 载入相关套件
from transformers import pipeline

In [5]:
# 载入模型
nlp = pipeline("ner")

In [10]:
# 测试资料
sequence = "Hugging Face Inc. is a company based in New York City. " \
           "Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge."

# 推测答案
import pandas as pd
df = pd.DataFrame(nlp(sequence))
df

Unnamed: 0,word,score,entity,index,start,end
0,Hu,0.999511,I-ORG,1,0,2
1,##gging,0.989597,I-ORG,2,2,7
2,Face,0.99797,I-ORG,3,8,12
3,Inc,0.999376,I-ORG,4,13,16
4,New,0.999341,I-LOC,11,40,43
5,York,0.999193,I-LOC,12,44,48
6,City,0.999341,I-LOC,13,49,53
7,D,0.986336,I-LOC,19,79,80
8,##UM,0.939624,I-LOC,20,80,82
9,##BO,0.912139,I-LOC,21,82,84


## 结合Tokenizer

In [11]:
# 载入相关套件
from transformers import TFAutoModelForTokenClassification, AutoTokenizer
import tensorflow as tf

# 结合分词器(Tokenizer)
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = TFAutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some layers from the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing TFBertForTokenClassification: ['dropout_147']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [15]:
# NER 类别 
label_list = [
    "O",       # 非实体
    "B-MISC",  # 杂项实体的开头，接在另一杂项实体的后面
    "I-MISC",  # 杂项实体
    "B-PER",   # 人名的开头，接在另一人名的后面
    "I-PER",   # 人名
    "B-ORG",   # 组织的开头，接在另一组织的后面
    "I-ORG",   # 组织
    "B-LOC",   # 地名的开头，接在另一地名的后面
    "I-LOC"    # 地名
]

# 测试资料
sequence = "Hugging Face Inc. is a company based in New York City. " \
           "Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge."

# 推测答案
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
inputs = tokenizer.encode(sequence, return_tensors="tf")
outputs = model(inputs)[0]
predictions = tf.argmax(outputs, axis=2)
print([(token, label_list[prediction]) for token, prediction in 
       zip(tokens, predictions[0].numpy())])

[('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
