<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/06_Transformers/07_token_classif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's dependencies into the environment of the running
# kernel. `%pip` (unlike `!pip`) always targets the kernel's own interpreter,
# and quoting the extras spec keeps the shell from treating `[torch]` as a glob.
%pip install -q rich
%pip install -q "transformers[torch]"
%pip install -q torch datasets evaluate

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.9 MB/s

# Token Classification

In [2]:
# Built-in library
import re
import json
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# %load_ext lab_black

# auto reload imports
# %load_ext autoreload
# %autoreload 2



### Load Data

- The data can be found [here](https://huggingface.co/datasets/conll2003).

In [3]:
from datasets import load_dataset
from datasets.dataset_dict import Dataset, DatasetDict


# Identifier of the dataset handed to `load_dataset`.
PATH: str = "conll2003"
# Load every split of the dataset (train / validation / test).
raw_datasets: DatasetDict = load_dataset(path=PATH)
# Show the splits with their feature names and row counts.
raw_datasets

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
raw_datasets.get("train")[0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
raw_datasets.get("train").features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [6]:
# Names of the NER classes. The integer stored in `ner_tags` is an index into
# this list (e.g. 3 -> "B-ORG").
label_names: list[str] = (
    raw_datasets.get("train").features.get("ner_tags").feature.names
)
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### Labels

```text
- O means the word doesn’t correspond to any entity.
- B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.
- B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.
- B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.
- B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.
```

<br>

```python
{
    "O" :      0,
    "B-PER" :  1,
    "I-PER" :  2,
    "B-ORG" :  3,
    "I-ORG" :  4,
    "B-LOC" :  5,
    "I-LOC" :  6,
    "B-MISC" : 7,
    "I-MISC" : 8,
}
```

In [7]:
# First training example: its words and the integer NER tag id of each word.
words: list[str] = raw_datasets["train"][0]["tokens"]
labels: list[int] = raw_datasets["train"][0]["ner_tags"]

print(f"words: {words}")
print(f"labels: {labels}")

In [8]:
# Print the first training example as two aligned rows: the words on the first
# row and the name of each word's NER tag directly underneath.
words: list[str] = raw_datasets["train"][0]["tokens"]
labels: list[int] = raw_datasets["train"][0]["ner_tags"]  # integer tag ids
line1: str = ""
line2: str = ""

for word, label in zip(words, labels):
    full_label = label_names[label]
    # Pad both cells to the wider of the two, plus one separating space, so
    # that the columns line up.
    max_length = max(len(word), len(full_label))
    line1 += word.ljust(max_length + 1)
    line2 += full_label.ljust(max_length + 1)

print(line1)
print(line2)

In [9]:
def display_tokens_nerTags(*, idx: int, split: str = "train") -> None:
    """Print one example's tokens with the matching NER tag names underneath.

    Each token and its tag name are padded to a common width so that the two
    printed rows line up column by column.

    Params:
        idx: Position of the example within the chosen split.
        split: Key of the split in `raw_datasets` to read from. Defaults to
            "train", which is the split this function always used before.

    Returns:
        None. The two aligned rows are printed.
    """
    # Fetch the example once instead of indexing the dataset for every field.
    example = raw_datasets[split][idx]
    words: list[str] = example["tokens"]
    labels: list[int] = example["ner_tags"]  # integer tag ids, not names
    line1: str = ""
    line2: str = ""

    for word, label in zip(words, labels):
        full_label = label_names[label]
        # Column width: the wider of word/tag plus one separating space.
        width = max(len(word), len(full_label)) + 1
        line1 += word.ljust(width)
        line2 += full_label.ljust(width)

    print(line1)
    print(line2)

In [10]:
display_tokens_nerTags(idx=1)

display_tokens_nerTags(idx=5)

### Create A Tokenizer Object

In [11]:
from transformers import AutoTokenizer


# Pretrained checkpoint whose tokenizer is loaded here.
model_checkpoint: str = "bert-base-cased"
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Check that the tokenizer object is backed by 🤗 Tokenizers:
# (the `word_ids()` calls used for label alignment rely on a fast tokenizer)
assert tokenizer.is_fast is True

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [12]:
# Tokenize a pre-tokenized input using is_split_into_words=True:
texts: list[str] = raw_datasets["train"][0]["tokens"]
# NOTE(review): the returned object also exposes `.tokens()` and `.word_ids()`,
# so it is richer than the plain `dict` the annotation suggests.
inputs: dict[str, Any] = tokenizer(texts, is_split_into_words=True)

# Compare the original words with the tokens the tokenizer produced.
print(f"texts: {texts}")
print(f"tokens: {inputs.tokens()}")

In [13]:
# For every token, the index of the word it was produced from.
# `None` marks a special token; a repeated index marks a word that was split
# into several sub-tokens.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

#### Note:

```text
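- The number of tokens produced by the tokenizer (and therefore the number of word IDs) differs from the number of labels, because special tokens are added and some words are split into several sub-tokens.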
```

In [14]:
# `word_ids()` has one entry per token, whereas `ner_tags` has one entry per
# word, so the two sequences do not have the same length.
output: list[Optional[int]] = inputs.word_ids()
original: list[int] = raw_datasets["train"][0]["ner_tags"]

print((output, original))
print(f"Size: {len(output)} != {len(original)}")

In [15]:
def align_labels_with_tokens(labels: list[int], word_ids: list[Optional[int]]):
    """Expand word-level labels into one label per token.

    Rules applied to each token:
      - a special token (word id `None`) gets -100;
      - the first token of a word gets that word's label;
      - every further token of the same word gets the word's label, with an
        odd (B-XXX) label bumped by one to its I-XXX counterpart.

    e.g.
    word labels:    [3, 0, 0, 5, 0]
    word ids:       [None, 0, 1, 2, 3, 3, 4, None]
    aligned labels: [-100, 3, 0, 0, 5, 6, 0, -100]

    Params:
        labels: One label id per word.
        word_ids: For each token, the index of its word, or None.

    Returns:
        A list with one label id per entry of `word_ids`.
    """
    aligned: list[int] = []
    previous_word: Optional[int] = None

    for word_id in word_ids:
        starts_new_word = word_id != previous_word
        previous_word = word_id

        # Special tokens never carry a real label.
        if word_id is None:
            aligned.append(-100)
            continue

        tag: int = labels[word_id]
        # Continuation of a word: odd ids are B-XXX, the next even id is I-XXX.
        if not starts_new_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned

In [16]:
labels = raw_datasets["train"][0]["ner_tags"]
# For each token, the index of the word it belongs to (None for special tokens)
word_ids = inputs.word_ids()

print(f'tokens: {raw_datasets["train"][0]["tokens"]}')
print(f"labels: {labels}")
print(f"word_ids: {word_ids}")

<br>

```text
- The function adds -100 for the two special tokens at the beginning and the end, and an extra 0 for the word that was split into two tokens.
```

In [17]:
new_labels: list[int] = align_labels_with_tokens(labels, word_ids)

print(f"labels: {labels}")
print(f"new_labels: {new_labels}")

<br><br>

```text
Ex 2:
- Some researchers prefer to attribute only one label per word, and assign -100 to the other subtokens in a given word. This is to avoid long words that split into lots of subtokens contributing heavily to the loss. Change the previous function to align labels with input IDs by following this rule.
```

In [18]:
def align_labels_with_tokens_ex_2(
    labels: list[int], word_ids: list[Optional[int]]
) -> list[int]:
    """Implementation of Ex 2: label only the first token of every word.

    Special tokens (word id `None`) and every sub-token after the first one of
    a word receive -100, so that each word contributes exactly one label.

    e.g.
    word labels:    [3, 0, 0, 5, 0]
    word ids:       [None, 0, 1, 2, 3, 3, 4, None]
    aligned labels: [-100, 3, 0, 0, 5, -100, 0, -100]

    Params:
        labels: One label id per word.
        word_ids: For each token, the index of its word, or None.

    Returns:
        A list with one label id per entry of `word_ids`.
    """
    new_labels: list[int] = []
    current_word: Optional[int] = None

    for word_id in word_ids:
        if word_id is None or word_id == current_word:
            # Special token, or a continuation sub-token of the current word.
            new_labels.append(-100)
        else:
            # First token of a new word keeps the word's own label.
            new_labels.append(labels[word_id])
        current_word = word_id

    return new_labels

In [19]:
new_labels: list[int] = align_labels_with_tokens_ex_2(labels, word_ids)

print(f"labels: {labels}")
print(f"new_labels: {new_labels}")

```text
- To preprocess the entire dataset, we tokenize all inputs and align the labels with the corresponding tokens using align_labels_with_tokens(). To improve processing speed, we create a function that handles a list of examples and use Dataset.map() with batched=True.
- Additionally, because the tokenizer input is now a batch (a list of lists of words), we pass the index of the desired example to word_ids() to obtain the word IDs of that example.
```

In [20]:
def tokenize_and_align_labels(examples: dict[str, Any]):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Params:
        examples: A batch with a "tokens" entry (one list of words per
            example) and a "ner_tags" entry (one list of label ids per
            example).

    Returns:
        The tokenizer output for the batch, extended with a "labels" entry
        that holds, for every example, the labels aligned to its tokens.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    # `word_ids(row)` returns the token-to-word mapping of one example in the
    # batch; it drives the alignment of that example's word-level labels.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [21]:
# Apply the tokenization on the entire dataset
# `raw_datasets` holds several splits, so the mapping runs once per split.
# The original columns are removed; what remains is whatever
# `tokenize_and_align_labels` returns (tokenizer outputs plus "labels").
tokenized_datasets: DatasetDict = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [22]:
print(tokenized_datasets.get("train")[0])

### Fine-tuning the model with the Trainer API

```text
- The actual code using the Trainer will be the same as before; the only changes are the way the data is collated into a batch and the metric computation function.


Data collation
--------------
- We can’t just use a DataCollatorWithPadding like in Chapter 3 because that only pads the inputs (input IDs, attention mask, and token type IDs).
- Here our labels should be padded the exact same way as the inputs so that they stay the same size, using -100 as a value so that the corresponding predictions are ignored in the loss computation.
- This is all done by a DataCollatorForTokenClassification. Like the DataCollatorWithPadding, it takes the tokenizer used to preprocess the inputs
```

In [23]:
from transformers import DataCollatorForTokenClassification


data_collator: DataCollatorForTokenClassification = DataCollatorForTokenClassification(
    tokenizer=tokenizer
)

In [24]:
# Test it on a few samples
# It dynamically pads the input IDs, attention mask, token type IDs and the aligned labels.
# NOTE(review): the collated batch is accessed like a mapping (`.get("labels")`),
# so it is annotated as a dict of tensors rather than a single tensor — confirm
# the exact return type against the collator's documentation.
batch: dict[str, torch.Tensor] = data_collator([tokenized_datasets["train"][i] for i in range(2)])

print(batch.get("labels"))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [25]:
# Original data (before applying data collator)
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

<br>

### Metrics

```text
- To have the Trainer compute a metric every epoch, we will need to define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.

- The conventional approach for assessing token classification predictions is through the application of the seqeval metric. Before employing this metric, it's essential to install the seqeval library.
```

<br>

```python
!pip install seqeval
```

In [26]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=e87bc328f19964dbe301aab2f6774935d560dfb52d81234b315d3b58a881cbfa
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [27]:
import evaluate


metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [28]:
# Sample: the NER tag names (not ids) of the first training example.
labels: list[str] = [
    label_names[tag_id] for tag_id in raw_datasets["train"][0]["ner_tags"]
]
print(labels)

In [29]:
predictions: list[str] = labels.copy()

# Simulate prediction
predictions[2] = "O"
print(metric.compute(predictions=[predictions], references=[labels]))

In [30]:
def compute_metrics(eval_preds) -> dict[str, Any]:
    """Turn raw model outputs into overall precision, recall, F1 and accuracy.

    Params:
        eval_preds: A `(logits, labels)` pair. The predicted class of each
            token is the argmax of `logits` over the last axis; positions
            whose label is -100 are excluded from scoring.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the "overall_*" entries computed by `metric`.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    ignore_index = -100

    # Reference label names, skipping ignored positions.
    true_labels = [
        [label_names[tag] for tag in label_row if tag != ignore_index]
        for label_row in labels
    ]
    # Predicted label names, kept only where the reference label is not ignored.
    true_predictions = [
        [
            label_names[pred]
            for pred, tag in zip(pred_row, label_row)
            if tag != ignore_index
        ]
        for pred_row, label_row in zip(predictions, labels)
    ]
    scores = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

### Defining the model


```text
- For token classification, the AutoModelForTokenClassification class is used.
- It's important to provide information about the number of labels, either through the num_labels argument or by setting id2label and label2id dictionaries for proper inference and mapping of IDs to labels.
```

In [31]:
print(f"label_names: {label_names}")

In [32]:
# Two-way mapping between integer class ids and label names. The id of a label
# is its position in `label_names`.
id2label: dict[int, str] = dict(enumerate(label_names))
label2id: dict[str, int] = {label: idx for idx, label in id2label.items()}

print(id2label)
print(label2id)

In [33]:
from transformers import AutoModelForTokenClassification


# Now we can just pass them to the AutoModelForTokenClassification.from_pretrained() method,
# and they will be set in the model’s configuration and then properly saved and uploaded to the Hub:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Check that the model has the right number of labels:
model.config.num_labels

9

### Fine-tuning the model

```text

- We just need to do two last things before we define our Trainer: log in to Hugging Face and define our training arguments. If you’re working in a notebook, there’s a convenience function to help you with the login:
```
<br>

```python
from huggingface_hub import notebook_login

notebook_login()
```

```text
====================================================================================================
```

```sh
# On a terminal
huggingface-cli login
```

<br>

```text
Once this is done, define the TrainingArguments:
```

In [35]:
from huggingface_hub import notebook_login

# Login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
from transformers import TrainingArguments


# Training configuration. Everything a reader might want to tune is a constant.
OUTPUT_DIR: str = "bert-finetuned-ner"  # passed as the first positional argument below
STRATEGY: str = "epoch"  # shared by evaluation and checkpoint saving
LEARNING_RATE: float = 2e-5
NUM_EPOCHS: int = 3
WEIGHT_DECAY: float = 0.01


# NOTE(review): the install log shows transformers 4.35.0; newer releases
# reportedly rename `evaluation_strategy` to `eval_strategy` — confirm before
# upgrading the library.
# `push_to_hub=True` is why the Hugging Face login is required beforehand.
args: TrainingArguments = TrainingArguments(
    OUTPUT_DIR,
    evaluation_strategy=STRATEGY,
    save_strategy=STRATEGY,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
)

In [38]:
from transformers import Trainer


# Train the model!
# The train split is used for fitting and the validation split for evaluation;
# batches are built by `data_collator` and scored with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets.get("train"),
    eval_dataset=tokenized_datasets.get("validation"),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Runs the fine-tuning loop; the returned TrainOutput is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0791,0.066409,0.910101,0.937058,0.923383,0.981559
2,0.0398,0.060371,0.927419,0.948334,0.93776,0.985371
3,0.025,0.059124,0.931904,0.951195,0.941451,0.986313


TrainOutput(global_step=5268, training_loss=0.06524142217889553, metrics={'train_runtime': 616.1229, 'train_samples_per_second': 68.368, 'train_steps_per_second': 8.55, 'total_flos': 921792849708600.0, 'train_loss': 0.06524142217889553, 'epoch': 3.0})

```text
- During training, the model is saved and uploaded to the Hub in the background at regular intervals.
- This allows for easy resumption of training on another machine if needed.
- After training, the push_to_hub() method ensures the most recent model version is uploaded.
```

In [39]:
trainer.push_to_hub(commit_message="Training complete")

'https://huggingface.co/chineidu/bert-finetuned-ner/tree/main/'

### Using the fine-tuned model

```text
- We’ve already shown you how you can use the model we fine-tuned on the Model Hub with the inference widget.
- To use it locally in a pipeline, you just have to specify the proper model identifier.
```

In [40]:
from transformers import Pipeline, pipeline


# NOTE: this rebinds `model_checkpoint` from the base checkpoint to the
# fine-tuned model on the Hub. Re-running an earlier cell that reads
# `model_checkpoint` after this one will therefore pick up the fine-tuned model.
model_checkpoint: str = "chineidu/bert-finetuned-ner"
text: str = "My name is Chineidu and I work at Indicina in Lagos, Nigeria."
# `pipeline` is a factory function; the object it returns is a `Pipeline`.
# aggregation_strategy="simple" asks the pipeline to merge tokens of the same
# entity into a single result (see the pipeline documentation).
token_classifier: Pipeline = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

print(token_classifier(text))

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [44]:
# Run the fine-tuned token classifier over several sentences at once.
texts: list[str] = [
    "Mauricio Pochettino is the head coach of Chelsea FC.",
    "Olumo Rock is a landmark in Abeokuta, Nigeria.",
    "Lionel Messi won the latest edition of the FIFA World Cup in 2022.",
    "Solvify is a technology company that specializes in Internet-related services and products.",
    "The Harry Potter series, authored by J.K. Rowling, remains a bestseller worldwide.",
    "On September 20, 2023, Apple unveiled its latest iPhone model at the tech conference in San Francisco.",
]
print(token_classifier(texts))