In [71]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [2]:
# Load the "conll2003" dataset; the result holds the train/validation/test splits shown in the next cell.
raw_datasets = load_dataset("conll2003")

In [3]:
# Show each split with its feature columns and row count.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

# Investigate data

In [5]:
# Words of the first training example (the text is stored already split into words).
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [6]:
# Integer-encoded NER tags of the same example, one tag per word.
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

## Label meanings

In [28]:
# Label strings for the NER tags; an integer tag is used as an index into this list.
raw_datasets["train"].features["ner_tags"].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

## Match labels to text

In [55]:
def pp_words_and_labels(words, labels, label_names, max_line=80):
    """Pretty-print words with their label names aligned underneath.

    Each word and its label name are padded to a shared column width and
    followed by one space, so the two printed rows line up column by column.

    Args:
        words: Sequence of word strings.
        labels: Sequence of integer label ids, paired with ``words`` by
            position (as with ``zip``, surplus items in the longer
            sequence are ignored).
        label_names: Sequence mapping a label id to its display name.
        max_line: Soft width limit. A pair of rows is printed as soon as
            the word row grows past this many characters, so a printed
            row can exceed the limit by up to one column.

    Returns:
        None. The aligned rows are written to standard output.
    """
    word_row = ""
    label_row = ""

    for word, label in zip(words, labels):
        label_name = label_names[label]
        # +1 keeps a single separating space after the wider of the two.
        column_width = max(len(word), len(label_name)) + 1
        word_row += word.ljust(column_width)
        label_row += label_name.ljust(column_width)
        if len(word_row) > max_line:
            print(word_row)
            print(label_row)
            word_row = ""
            label_row = ""

    # Only flush a non-empty remainder; otherwise empty input, or a wrap
    # triggered by the very last word, would print two blank lines.
    if word_row:
        print(word_row)
        print(label_row)

### NER labels

In [56]:
# First training sentence with its NER label printed under each word.
pp_words_and_labels(
    words=raw_datasets["train"][0]["tokens"],
    labels=raw_datasets["train"][0]["ner_tags"],
    label_names=raw_datasets["train"].features["ner_tags"].feature.names
)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


In [57]:
# A longer sentence (index 4); max_line is raised so it wraps into two row pairs.
pp_words_and_labels(
    words=raw_datasets["train"][4]["tokens"],
    labels=raw_datasets["train"][4]["ner_tags"],
    label_names=raw_datasets["train"].features["ner_tags"].feature.names,
    max_line=110
)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         
should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


### POS labels

In [65]:
# First training sentence with its part-of-speech tag under each word.
pp_words_and_labels(
    words=raw_datasets["train"][0]["tokens"],
    labels=raw_datasets["train"][0]["pos_tags"],
    label_names=raw_datasets["train"].features["pos_tags"].feature.names,
)

EU  rejects German call to boycott British lamb . 
NNP VBZ     JJ     NN   TO VB      JJ      NN   . 


In [67]:
# Sentence 4 with its part-of-speech tags, using a wider line limit.
pp_words_and_labels(
    words=raw_datasets["train"][4]["tokens"],
    labels=raw_datasets["train"][4]["pos_tags"],
    label_names=raw_datasets["train"].features["pos_tags"].feature.names,
    max_line=110,
)

Germany 's  representative to the European Union 's  veterinary committee Werner Zwingmann said on Wednesday consumers 
NNP     POS NN             TO DT  NNP      NNP   POS JJ         NN        NNP    NNP       VBD  IN NNP       NNS       
should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
MD     VB  NN        IN   NNS       JJ    IN   NNP     IN    DT  JJ         NN     VBD JJR     . 


### Chunking labels

In [68]:
# First training sentence with its chunking tag under each word.
pp_words_and_labels(
    words=raw_datasets["train"][0]["tokens"],
    labels=raw_datasets["train"][0]["chunk_tags"],
    label_names=raw_datasets["train"].features["chunk_tags"].feature.names,
)

EU   rejects German call to   boycott British lamb . 
B-NP B-VP    B-NP   I-NP B-VP I-VP    B-NP    I-NP O 


In [70]:
# Sentence 4 with its chunking tags, using a wider line limit.
pp_words_and_labels(
    words=raw_datasets["train"][4]["tokens"],
    labels=raw_datasets["train"][4]["chunk_tags"],
    label_names=raw_datasets["train"].features["chunk_tags"].feature.names,
    max_line=115,
)

Germany 's   representative to   the  European Union 's   veterinary committee Werner Zwingmann said on   Wednesday 
B-NP    B-NP I-NP           B-PP B-NP I-NP     I-NP  B-NP I-NP       I-NP      I-NP   I-NP      B-VP B-PP B-NP      
consumers should buy  sheepmeat from countries other  than Britain until  the  scientific advice was  clearer . 
I-NP      B-VP   I-VP B-NP      B-PP B-NP      B-ADJP B-PP B-NP    B-SBAR B-NP I-NP       I-NP   B-VP B-ADJP  O 


# Define tokenizer

In [72]:
# Checkpoint name handed to AutoTokenizer.from_pretrained in the next cell.
model_checkpoint = "bert-base-cased"

In [73]:
# Load the tokenizer for the chosen checkpoint.
# NOTE(review): the recorded output shows a read timeout against huggingface.co, yet the
# following cells use the tokenizer — presumably it was served from a local cache; confirm
# this cell also succeeds on a fresh machine.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 9d0a3653-f952-4a05-b940-35c98af5052c)')' thrown while requesting HEAD https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json


In [74]:
# NOTE(review): word_ids() is used further down and is presumably only offered by fast tokenizers — confirm.
tokenizer.is_fast

True

## The text is pre-tokenized, so we need to tell the tokenizer to handle this

In [75]:
# The example is a list of words rather than a raw string, so flag it as already split into words.
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

In [76]:
# Tokens actually produced: 'lamb' is split into 'la' + '##mb', and [CLS]/[SEP] are added.
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [83]:
# Word-level NER tags for the sentence that was tokenized above.
labels = raw_datasets["train"][0]["ner_tags"]

### Label and token mismatch

The lengths of the label list and the token list no longer match, because some words were split into several sub-word tokens and special tokens were added.

In [89]:
# Number of word-level labels vs. number of tokens produced by the tokenizer.
print(len(labels), len(inputs.tokens()))

9 12


### Solution
Match each token to its corresponding word, then expand the label list so that it has one label per token.

In [88]:
# For every token, the index of the word it came from (None for the special tokens).
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [90]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels into one label per token.

    Args:
        labels: Integer label ids, one per word.
        word_ids: For every token, the index of the word it belongs to,
            or ``None`` for a special token.

    Returns:
        A list with one entry per item of ``word_ids``: ``-100`` for
        special tokens, the word's label for the first token of a word,
        and for each following token of the same word that label with odd
        values bumped by one (turning a B-XXX id into its I-XXX id).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: nothing to label, and it ends the current word.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_id]
        # A further piece of the word just seen must not start a new entity,
        # so an odd (B-XXX) id moves on to the matching I-XXX id.
        if word_id == previous_word and tag % 2 == 1:
            tag += 1
        previous_word = word_id
        aligned.append(tag)
    return aligned

In [91]:
# Inputs for the alignment function; `labels` repeats the earlier assignment so this cell stands alone.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()

In [94]:
# Word-level labels first, then the token-aligned labels (-100 marks the special tokens).
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
