In [3]:
import os
# Set the http_proxy / https_proxy environment variables to the local proxy
# address on port 7890.
# NOTE(review): presumably required to reach huggingface.co from this machine — confirm.
os.environ.update({
    'http_proxy': 'http://127.0.0.1:7890',
    'https_proxy': 'http://127.0.0.1:7890',
})

## basics

- NER（命名实体识别）也属于 token-classification 任务：为句子中的每个 token 预测一个标签；

## 应用

- models: 
    - `dslim/bert-large-NER`
- dataset
    - `conll2003`
        - 四类实体：PER（人名）、LOC（地点）、ORG（组织）和 MISC（其他）

In [4]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)

# Identifier of the model repository on huggingface.co
model_id = "dslim/bert-large-NER"

# Instantiate the tokenizer and the token-classification model from that repository
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
import torch

# Build the token-classification (NER) pipeline from the model and tokenizer
# loaded in the previous cell.
# Run on the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) so the cell does not fail on machines without a GPU.
token_clf = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [6]:
# Sanity-check the pipeline on a single sentence; the bare last expression
# makes the notebook display the list of predicted entities.
example = "My name is Wolfgang and I live in Berlin"
ner_results = token_clf(example)
ner_results

[{'entity': 'B-PER',
  'score': 0.9971501,
  'index': 4,
  'word': 'Wolfgang',
  'start': 11,
  'end': 19},
 {'entity': 'B-LOC',
  'score': 0.9986046,
  'index': 9,
  'word': 'Berlin',
  'start': 34,
  'end': 40}]

In [7]:
from datasets import load_dataset
from evaluate import evaluator

# Evaluator for the token-classification task
task_evaluator = evaluator("token-classification")

# The validation split of conll2003 serves as the evaluation set
eval_dataset = load_dataset("conll2003", split="validation")

# Score the pipeline with the seqeval metric to obtain the baseline numbers
results = task_evaluator.compute(
    model_or_pipeline=token_clf, data=eval_dataset, metric="seqeval"
)
results

{'LOC': {'precision': 0.9680563075257174,
  'recall': 0.9733260751224823,
  'f1': 0.9706840390879478,
  'number': 1837},
 'MISC': {'precision': 0.8955696202531646,
  'recall': 0.920824295010846,
  'f1': 0.9080213903743315,
  'number': 922},
 'ORG': {'precision': 0.9360294117647059,
  'recall': 0.9492915734526473,
  'f1': 0.9426138467234357,
  'number': 1341},
 'PER': {'precision': 0.9825613079019073,
  'recall': 0.9788273615635179,
  'f1': 0.980690780527604,
  'number': 1842},
 'overall_precision': 0.9537562604340568,
 'overall_recall': 0.9614607876135981,
 'overall_f1': 0.957593027153872,
 'overall_accuracy': 0.9928157003231961,
 'total_time_in_seconds': 36.903920194999955,
 'samples_per_second': 88.06652471680601,
 'latency_in_seconds': 0.011355052367692292}