# Token Classification

In [1]:
# Built-in library
import re
import json
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display settings: show up to 1,000 rows/columns and up to 600
# characters per cell before the rendered output is truncated.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

### Load Data

- The data can be found [here](https://huggingface.co/datasets/conll2003).

In [2]:
from datasets import load_dataset
from datasets.dataset_dict import Dataset, DatasetDict


# Identifier of the dataset on the Hugging Face Hub (see the link above).
PATH: str = "conll2003"
# Load the dataset; the result holds the train/validation/test splits shown below.
raw_datasets: DatasetDict = load_dataset(path=PATH)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
# Peek at the first training example: the raw tokens plus integer-encoded tags.
raw_datasets.get("train")[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [4]:
# The feature schema lists, for every tag column, the name behind each integer id.
raw_datasets.get("train").features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [5]:
# Names of the NER classes; an integer NER tag is an index into this list.
label_names: list[str] = (
    raw_datasets.get("train").features.get("ner_tags").feature.names
)
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### Labels

```text
- O means the word doesn’t correspond to any entity.
- B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.
- B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.
- B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.
- B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.
```

<br>

```python
{
    "O" :      0,
    "B-PER" :  1,
    "I-PER" :  2,
    "B-ORG" :  3,
    "I-ORG" :  4,
    "B-LOC" :  5,
    "I-LOC" :  6,
    "B-MISC" : 7,
    "I-MISC" : 8,
}
```

In [6]:
# Tokens and NER tags of the first training example.
words: list[str] = raw_datasets["train"][0]["tokens"]
# The tags are stored as integer class ids, not as strings.
labels: list[int] = raw_datasets["train"][0]["ner_tags"]

print(f"words: {words}")
print(f"labels: {labels}")

In [7]:
# Render the first training example as two aligned lines:
# the tokens on top and the name of each NER tag directly underneath.
words: list[str] = raw_datasets["train"][0]["tokens"]
labels: list[int] = raw_datasets["train"][0]["ner_tags"]  # integer class ids
line1: str = ""
line2: str = ""

for word, label in zip(words, labels):
    full_label = label_names[label]
    # Pad both cells to the wider of the two, plus one separating space.
    max_length = max(len(word), len(full_label))
    line1 += word.ljust(max_length + 1)
    line2 += full_label.ljust(max_length + 1)

print(line1)
print(line2)

In [8]:
def display_tokens_nerTags(*, idx: int, split: str = "train") -> None:
    """Print the tokens of one example with their NER tag names aligned underneath.

    Reads the module-level ``raw_datasets`` and ``label_names``.

    Args:
        idx: Position of the example inside the chosen split.
        split: Name of the split to read from; defaults to ``"train"``.

    Returns:
        None. Two lines are printed: the tokens, then the tag names, with each
        column padded to the wider of the two entries plus one space.
    """
    # Fetch the example once instead of indexing the dataset for every field.
    example = raw_datasets[split][idx]
    words: list[str] = example["tokens"]
    labels: list[int] = example["ner_tags"]  # integer class ids

    token_cells: list[str] = []
    tag_cells: list[str] = []
    for word, label in zip(words, labels):
        tag_name = label_names[label]
        # Column width: the wider entry plus one separating space.
        width = max(len(word), len(tag_name)) + 1
        token_cells.append(word.ljust(width))
        tag_cells.append(tag_name.ljust(width))

    print("".join(token_cells))
    print("".join(tag_cells))

In [9]:
# Show the aligned token/tag view for a couple of other training examples.
display_tokens_nerTags(idx=1)

display_tokens_nerTags(idx=5)

### Create A Tokenizer Object

In [10]:
from transformers import AutoTokenizer


# Name of the pretrained checkpoint; the same name is used later to load the model.
model_checkpoint: str = "bert-base-cased"
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Check that the tokenizer object is backed by 🤗 Tokenizers:
# the word_ids() alignment used in the following cells relies on a fast tokenizer.
assert tokenizer.is_fast is True

In [11]:
# Tokenize a pre-tokenized input using is_split_into_words=True:
# the example is already a list of words, so it is passed as such.
texts: list[str] = raw_datasets["train"][0]["tokens"]
# NOTE: the result is dict-like but also exposes helpers such as .tokens() and .word_ids().
inputs: dict[str, Any] = tokenizer(texts, is_split_into_words=True)

print(f"texts: {texts}")
print(f"tokens: {inputs.tokens()}")

In [12]:
# For every token, get the index of the word it was produced from
# (None marks the special tokens; a repeated index means the word was split).
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

#### Note:

```text
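- The list of word IDs (one entry per token) is longer than the list of labels (one entry per word), because the tokenizer adds special tokens and may split a word into several sub-tokens.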
```

In [13]:
# There is one word ID per token but only one label per word,
# so the two lists have different lengths and cannot be paired up directly.
output: list[Optional[int]] = inputs.word_ids()
original: list[int] = raw_datasets["train"][0]["ner_tags"]

print((output, original))
print(f"Size: {len(output)} != {len(original)}")

In [14]:
def align_labels_with_tokens(labels: list[int], word_ids: list[Optional[int]]):
    """Expand word-level labels into one label per token.

    Args:
        labels: Label id of every word, indexed by word position.
        word_ids: For every token, the index of the word it belongs to, or
            None for a special token.

    Returns:
        A list with one entry per element of ``word_ids``:
        -100 for special tokens, the word's own label for the first token of a
        word, and for every following token of the same word the word's label
        with odd ids bumped by one (B-XXX becomes I-XXX).
    """
    aligned: list[int] = []
    previous_word: Optional[int] = None

    for word_id in word_ids:
        if word_id is None:
            # Special token: -100 is the value ignored by the loss.
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label.
            aligned.append(labels[word_id])
        else:
            # Continuation of the previous word: odd ids are B-XXX tags,
            # and the matching I-XXX tag is the next id.
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_id

    return aligned

In [15]:
# Word-level NER tag ids of the first training example
labels = raw_datasets["train"][0]["ner_tags"]
# Index of the source word for every token (None for special tokens)
word_ids = inputs.word_ids()

print(f'tokens: {raw_datasets["train"][0]["tokens"]}')
print(f"labels: {labels}")
print(f"word_ids: {word_ids}")

<br>

```text
- The function added the -100 for the two special tokens at the beginning and the end, and a new 0 for our word that was split into two tokens.
```

In [16]:
# Expand the word-level labels so that there is exactly one label per token.
new_labels: list[int] = align_labels_with_tokens(labels, word_ids)

print(f"labels: {labels}")
print(f"new_labels: {new_labels}")

<br><br>

```text
Ex 2:
- Some researchers prefer to attribute only one label per word, and assign -100 to the other subtokens in a given word. This is to avoid long words that split into lots of subtokens contributing heavily to the loss. Change the previous function to align labels with input IDs by following this rule.
```

In [17]:
def align_labels_with_tokens_ex_2(
    labels: list[int], word_ids: list[Optional[int]]
) -> list[int]:
    """Implementation of Ex 2: label only the first token of every word.

    Args:
        labels: Label id of every word, indexed by word position.
        word_ids: For every token, the index of the word it belongs to, or
            None for a special token.

    Returns:
        A list with one entry per element of ``word_ids``: the word's label for
        the first token of each word, and -100 for special tokens and for all
        remaining sub-tokens of a word.
    """
    new_labels: list[int] = []
    current_word: Optional[int] = None

    for word_id in word_ids:
        if word_id is None or word_id == current_word:
            # Special token, or a further sub-token of the current word:
            # both are excluded from the loss.
            new_labels.append(-100)
        else:
            # First token of a new word carries the word's label.
            new_labels.append(labels[word_id])
        current_word = word_id

    return new_labels

In [18]:
# Same example as before, but only the first token of every word keeps a label.
new_labels: list[int] = align_labels_with_tokens_ex_2(labels, word_ids)

print(f"labels: {labels}")
print(f"new_labels: {new_labels}")

```text
- To preprocess the entire dataset, we tokenize all inputs and align the labels with the corresponding tokens using align_labels_with_tokens(). To improve processing speed, we create a function that handles a list of examples and use Dataset.map() with batched=True. 
- Additionally, because the inputs are now lists of texts (or lists of lists of words), we pass the index of the desired example to word_ids() to get the word IDs of that example.
```

In [19]:
def tokenize_and_align_labels(examples: dict[str, Any]):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Uses the module-level ``tokenizer`` and ``align_labels_with_tokens``.

    Args:
        examples: Batch with a ``"tokens"`` entry (one list of words per
            example) and a ``"ner_tags"`` entry (one list of word-level label
            ids per example).

    Returns:
        The tokenizer output for the batch with an extra ``"labels"`` entry
        holding, for every example, the labels aligned to its tokens.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    word_level_labels: list[list[int]] = examples["ner_tags"]

    # word_ids(i) returns the token-to-word mapping of the i-th example in the batch.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(word_level_labels)
    ]
    return tokenized_inputs

In [20]:
# Apply the tokenization on the entire dataset.
# batched=True hands tokenize_and_align_labels a batch of examples per call;
# the original columns are removed so only the function's outputs remain.
tokenized_datasets: DatasetDict = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [21]:
# Inspect the first preprocessed training example.
print(tokenized_datasets.get("train")[0])

### Fine-tuning the model with the Trainer API

```text
- The actual code using the Trainer will be the same as before; the only changes are the way the data is collated into a batch and the metric computation function.


Data collation
--------------
- We can’t just use a DataCollatorWithPadding like in Chapter 3 because that only pads the inputs (input IDs, attention mask, and token type IDs). 
- Here our labels should be padded the exact same way as the inputs so that they stay the same size, using -100 as a value so that the corresponding predictions are ignored in the loss computation.
- This is all done by a DataCollatorForTokenClassification. Like the DataCollatorWithPadding, it takes the tokenizer used to preprocess the inputs.
```

In [22]:
from transformers import DataCollatorForTokenClassification


# Collator that builds the batches; it is given the same tokenizer that was
# used to preprocess the inputs.
data_collator: DataCollatorForTokenClassification = DataCollatorForTokenClassification(
    tokenizer=tokenizer
)

2023-10-26 22:23:56.414954: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
# Test it on a few samples
# The collator returns a dict-like batch (read with .get below), not a single tensor.
batch: dict[str, Any] = data_collator([tokenized_datasets["train"][i] for i in range(2)])

print(batch.get("labels"))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [24]:
# Original data (before applying data collator):
# print the un-padded labels of the same two examples for comparison.
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

<br>

### Metrics

```text
- To have the Trainer compute a metric every epoch, we will need to define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.

- The conventional approach for assessing token classification predictions is through the application of the seqeval metric. Before employing this metric, it's essential to install the seqeval library.
```

<br>

```python
!pip install seqeval
```

In [25]:
import evaluate


# Load the seqeval metric (the seqeval package must be installed, see above).
metric = evaluate.load("seqeval")

In [26]:
# Sample: turn the integer NER tags of the first training example into tag names.
# The ids and the names are kept in separate variables so that each name
# holds a single type.
label_ids: list[int] = raw_datasets["train"][0]["ner_tags"]
labels: list[str] = [label_names[i] for i in label_ids]
print(labels)

In [27]:
# Start from a copy of the reference tags, i.e. a perfect prediction.
predictions: list[str] = labels.copy()
# Simulate prediction: introduce a single mistake by predicting "O" for the third token
predictions[2] = "O"
# Both arguments are wrapped in a list so that each holds one sequence per sentence.
print(metric.compute(predictions=[predictions], references=[labels]))

In [28]:
def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy with the seqeval metric.

    Uses the module-level ``metric`` and ``label_names``.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class of every
            token is the argmax of ``logits`` over its last axis; positions
            whose label is -100 are ignored.
            # assumes logits and labels cover the same examples and tokens — TODO confirm

    Returns:
        Dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from the metric's ``overall_*`` entries.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping the ignored index (-100).
    true_labels = []
    for label_row in labels:
        true_labels.append(
            [label_names[tag_id] for tag_id in label_row if tag_id != -100]
        )

    # Predicted tag names at exactly the positions that were kept above.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        true_predictions.append(
            [
                label_names[pred_id]
                for pred_id, tag_id in zip(pred_row, label_row)
                if tag_id != -100
            ]
        )

    scores = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

### Defining the model


```text
- For token classification, the AutoModelForTokenClassification class is used. 
- It's important to provide information about the number of labels, either through the num_labels argument or by setting id2label and label2id dictionaries for proper inference and mapping of IDs to labels.
```

In [29]:
# Two-way mapping between integer class ids and NER tag names.
# (The previous annotation `dict[str:Any]` was a slice subscript, not a key/value type pair.)
id2label: dict[int, str] = dict(enumerate(label_names))
label2id: dict[str, int] = {label: idx for idx, label in id2label.items()}

print(id2label)
print(label2id)

In [30]:
from transformers import AutoModelForTokenClassification


# Now we can just pass them to the AutoModelForTokenClassification.from_pretrained() method,
# and they will be set in the model’s configuration and then properly saved and uploaded to the Hub:
# (num_labels is not passed explicitly; the next cell checks the resulting value.)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Check that the model has the right number of labels:
# it should equal len(label_names), i.e. 9 for this dataset.
model.config.num_labels

9