In [194]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [2]:
checkpoint = "bert-base-cased"

# Tokenizing

In [3]:
# Sample sentence for the tokenizer demos.
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."

# Load the tokenizer that matches `checkpoint`, then encode the sentence with it.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
encoding = tokenizer(example)

In [4]:
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [7]:
tokenizer.is_fast

True

In [6]:
encoding.is_fast

True

In [8]:
encoding.tokens()

['[CLS]',
 'My',
 'name',
 'is',
 'S',
 '##yl',
 '##va',
 '##in',
 'and',
 'I',
 'work',
 'at',
 'Hu',
 '##gging',
 'Face',
 'in',
 'Brooklyn',
 '.',
 '[SEP]']

In [9]:
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]

In [11]:
pd.DataFrame({"token": encoding.tokens(), "id": encoding.word_ids()})

Unnamed: 0,token,id
0,[CLS],
1,My,0.0
2,name,1.0
3,is,2.0
4,S,3.0
5,##yl,3.0
6,##va,3.0
7,##in,3.0
8,and,4.0
9,I,5.0


In [12]:
tokenizer1 = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer2 = AutoTokenizer.from_pretrained("roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
encoding1 = tokenizer1("81s")
encoding2 = tokenizer2("81s")

In [15]:
print(encoding1.tokens())

['[CLS]', '81', '##s', '[SEP]']


In [16]:
print(encoding2.tokens())

['<s>', '81', 's', '</s>']


In [27]:
start, end = encoding.word_to_chars(3)
example[start: end]

'Sylvain'

In [29]:
# Encode three sentences in one call: passing a list of strings yields one row
# per sentence for every field of the result.
# NOTE: this rebinds `encoding`; from here on the name refers to this batch, not
# to the single-sentence encoding inspected in the cells above.
encoding = tokenizer(["Sentence 1.", "Sentence 2.", "Sentence 3."])

In [32]:
encoding

{'input_ids': [[101, 14895, 5208, 2093, 122, 119, 102], [101, 14895, 5208, 2093, 123, 119, 102], [101, 14895, 5208, 2093, 124, 119, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}

# Token classification

In [46]:
checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"

In [47]:
token_classifier = pipeline("token-classification", model=checkpoint)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."

## Token-level classification

In [59]:
outputs = token_classifier(example)

In [60]:
outputs

[{'entity': 'I-PER',
  'score': 0.99938285,
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.99815494,
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.99590707,
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.99923277,
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931,
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.976115,
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887976,
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.9932106,
  'index': 16,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

## Grouping of classifications

In [178]:
token_grouped_classifier = pipeline("token-classification", aggregation_strategy="simple", model=checkpoint)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [179]:
outputs = token_grouped_classifier(example)

In [180]:
outputs

[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

# Without pipeline

In [182]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [183]:
# Tokenize into PyTorch tensors and also request the per-token character offsets.
# The offsets stay in `inputs` for the span lookups in later cells; the model is
# called with only the three tensors named explicitly below.
inputs = tokenizer(example, return_tensors="pt", return_offsets_mapping=True)

# Inference only: disable gradient tracking so no autograd graph is built for
# the forward pass. The logits are numerically unchanged.
with torch.no_grad():
    outputs = model(
        input_ids=inputs.input_ids,
        token_type_ids=inputs.token_type_ids,
        attention_mask=inputs.attention_mask,
    )

In [184]:
print(inputs["input_ids"].shape)
print(outputs["logits"].shape)

torch.Size([1, 19])
torch.Size([1, 19, 9])


In [185]:
# Drop the batch dimension first (single sentence), then derive per-token values.
# preds: index of the highest-scoring label for each token.
preds = outputs["logits"].squeeze(0).argmax(dim=-1).tolist()
# probs: softmax over the label dimension, one list of class probabilities per token.
probs = F.softmax(outputs["logits"].squeeze(0), dim=-1).tolist()

In [186]:
preds

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]

## Label definitions

In [187]:
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

## Aggregate token predictions

In [188]:
tokens = inputs.tokens()

# One record per token whose predicted label is not "O": the label string, the
# probability of that predicted label, and the token text.
results = [
    {
        "entity": model.config.id2label[label_id],
        "score": probs[pos][label_id],
        "word": tokens[pos],
    }
    for pos, label_id in enumerate(preds)
    if model.config.id2label[label_id] != "O"
]

In [189]:
results

[{'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'S'},
 {'entity': 'I-PER', 'score': 0.9981548190116882, 'word': '##yl'},
 {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va'},
 {'entity': 'I-PER', 'score': 0.9992327690124512, 'word': '##in'},
 {'entity': 'I-ORG', 'score': 0.9738931059837341, 'word': 'Hu'},
 {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': '##gging'},
 {'entity': 'I-ORG', 'score': 0.9887974858283997, 'word': 'Face'},
 {'entity': 'I-LOC', 'score': 0.99321049451828, 'word': 'Brooklyn'}]

In [190]:
# Text span for each token, without the extra characters ("##")
offsets = inputs.offset_mapping.squeeze(0).tolist()

In [191]:
# Slice the original sentence with each token's (start, end) character span.
for char_start, char_end in offsets:
    print(example[char_start:char_end])


My
name
is
S
yl
va
in
and
I
work
at
Hu
gging
Face
in
Brooklyn
.



## Fully recreate pipeline results

In [192]:
tokens = inputs.tokens()
offsets = inputs.offset_mapping.squeeze(0).tolist()
results = []

# Per-token records for every non-"O" prediction, each with its character span
# in `example`.
for pos, label_id in enumerate(preds):
    tag = model.config.id2label[label_id]
    if tag == "O":
        continue  # non-entity tokens are left out of the results
    span_start, span_end = offsets[pos]
    results.append(
        dict(
            entity=tag,
            score=probs[pos][label_id],
            word=tokens[pos],
            start=span_start,
            end=span_end,
        )
    )

In [193]:
results

[{'entity': 'I-PER',
  'score': 0.9993828535079956,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.9981548190116882,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.995907187461853,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.9992327690124512,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931059837341,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.9761149883270264,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887974858283997,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.99321049451828,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

## Fully recreate pipeline results with grouping

In [198]:
def group_entities(preds, probs, offsets, text, id2label):
    """Merge per-token predictions into entity spans.

    A span opens at any token whose label is not "O" (either "B-X" or "I-X")
    and extends over the directly following tokens labelled "I-X". A following
    "B-X" token, or a token of another type, opens a new span instead.

    Args:
        preds: Predicted label id for each token.
        probs: For each token, the list of class probabilities;
            ``probs[i][preds[i]]`` is the score of token ``i``.
        offsets: ``(start, end)`` character span of each token within ``text``.
        text: The string that was tokenized.
        id2label: Mapping from label id to label string ("O", "B-X", "I-X").

    Returns:
        A list of dicts with the keys ``entity_group``, ``score`` (mean of the
        member tokens' scores), ``word``, ``start`` and ``end``.
    """
    grouped = []
    idx = 0
    while idx < len(preds):
        label = id2label[preds[idx]]
        if label == "O":
            idx += 1
            continue

        # Remove B- or I-
        entity_type = label[2:]

        # The opening token always belongs to the span, whether it is tagged
        # B- or I-, so the span is never empty and `end` is always bound.
        start, end = offsets[idx]
        # Each token contributes the probability of its OWN predicted class.
        all_scores = [probs[idx][preds[idx]]]
        idx += 1

        # Extend the span over the following I-<type> tokens.
        while idx < len(preds) and id2label[preds[idx]] == f"I-{entity_type}":
            all_scores.append(probs[idx][preds[idx]])
            _, end = offsets[idx]
            idx += 1

        # Score is the mean over all tokens in the grouped entity.
        grouped.append({
            "entity_group": entity_type,
            "score": np.mean(all_scores).item(),
            "word": text[start:end],
            "start": start,
            "end": end,
        })
        # No extra increment here: idx already points at the first token after
        # the span, which must still be examined (it may open another entity).
    return grouped


results = group_entities(preds, probs, offsets, example, model.config.id2label)

In [199]:
results

[{'entity_group': 'PER',
  'score': 0.998169407248497,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796018600463867,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.99321049451828,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]