<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/06_Transformers/NLP-With-Transformers/02a-NER-football.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers accelerate datasets \
  seqeval mlxtend watermark rich

In [2]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,transformers --conda

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

numpy       : 1.26.4
pandas      : 2.1.4
polars      : 0.20.2
mlxtend     : 0.23.1
transformers: 4.42.4

conda environment: n/a



In [3]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# # Black code formatter (Optional)
# %load_ext lab_black

# # auto reload imports
# %load_ext autoreload
# %autoreload 2

### Load Data

In [4]:
# fp: str = "../../../data/ner_data.jsonl"

# with open(fp, "r") as f:
#     json_data = [json.loads(line) for line in f]

# print(len(json_data))

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from datasets.features.features import ClassLabel, Sequence

# data: DatasetDict = load_dataset("ontonotes/conll2012_ontonotesv5", "english_v4")
# data.save_to_disk("../../../data/conll2012_ontonotesv5")

fp: str = "../../../data/conll2012_ontonotesv5"
fp: str = "/content/drive/MyDrive/My doc/Deep Learning/Data/conll2012_ontonotesv5"
ds_dict: DatasetDict = load_from_disk(dataset_path=fp)
ds_dict

DatasetDict({
    train: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 1940
    })
    validation: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 222
    })
    test: Dataset({
        features: ['document_id', 'sentences'],
        num_rows: 222
    })
})

In [7]:
ds_dict["train"].features["sentences"][0]

{'part_id': Value(dtype='int32', id=None),
 'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['XX', '``', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'parse_tree': Value(dtype='string', id=None),
 'predicate_lemmas': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'predicate_framenet_ids': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'word_senses': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None),
 'speaker': Value(dtype='string', id=None),
 'named_entities': Sequence(feature=ClassLabel(names=['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FA

In [8]:
tags: ClassLabel = ds_dict["train"].features["sentences"][0]["named_entities"].feature
print(tags)

ClassLabel(names=['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT', 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW', 'B-LANGUAGE', 'I-LANGUAGE'], id=None)


In [9]:
index2tag: dict[int, str] = {tags.str2int(tag): tag for tag in tags.names}
tag2index: dict[str, int] = {tag: tags.str2int(tag) for tag in tags.names}

print(index2tag)

{0: 'O', 1: 'B-PERSON', 2: 'I-PERSON', 3: 'B-NORP', 4: 'I-NORP', 5: 'B-FAC', 6: 'I-FAC', 7: 'B-ORG', 8: 'I-ORG', 9: 'B-GPE', 10: 'I-GPE', 11: 'B-LOC', 12: 'I-LOC', 13: 'B-PRODUCT', 14: 'I-PRODUCT', 15: 'B-DATE', 16: 'I-DATE', 17: 'B-TIME', 18: 'I-TIME', 19: 'B-PERCENT', 20: 'I-PERCENT', 21: 'B-MONEY', 22: 'I-MONEY', 23: 'B-QUANTITY', 24: 'I-QUANTITY', 25: 'B-ORDINAL', 26: 'I-ORDINAL', 27: 'B-CARDINAL', 28: 'I-CARDINAL', 29: 'B-EVENT', 30: 'I-EVENT', 31: 'B-WORK_OF_ART', 32: 'I-WORK_OF_ART', 33: 'B-LAW', 34: 'I-LAW', 35: 'B-LANGUAGE', 36: 'I-LANGUAGE'}


### Comment

- Drop irrelevant columns

In [10]:
def extract_tokens_and_labels(example: dict[str, Any]) -> dict[str, Any]:
    tokens = example["sentences"][0]["words"]
    labels = example["sentences"][0]["named_entities"]

    return {"tokens": tokens, "labels": labels}

In [11]:
# ClassLabel object
class_label = ClassLabel(num_classes=len(tags.names), names=tags.names)

# Create a copy
ds_dict: DatasetDict = ds_dict.copy()

for split in ds_dict.keys():
    ds_dict[split] = ds_dict[split].map(extract_tokens_and_labels)
    ds_dict[split] = ds_dict[split].remove_columns(["sentences"])
    # Update the 'labels' column to use ClassLabel
    ds_dict[split] = ds_dict[split].cast_column(
        column="labels", feature=Sequence(class_label)
    )
    ds_dict[split] = ds_dict[split].map(
        lambda x: {"ner_labels": [index2tag[t] for t in x["labels"]]}
    )

ds_dict

{'train': Dataset({
     features: ['document_id', 'tokens', 'labels', 'ner_labels'],
     num_rows: 1940
 }),
 'validation': Dataset({
     features: ['document_id', 'tokens', 'labels', 'ner_labels'],
     num_rows: 222
 }),
 'test': Dataset({
     features: ['document_id', 'tokens', 'labels', 'ner_labels'],
     num_rows: 222
 })}

In [12]:
ds_dict["train"][10]

{'document_id': 'bc/cnn/00/cnn_0007',
 'tokens': ['Journalists', 'sources', 'and', 'jail', '/.'],
 'labels': [0, 0, 0, 0, 0],
 'ner_labels': ['O', 'O', 'O', 'O', 'O']}

In [13]:
df: pl.DataFrame = ds_dict["train"].to_polars()
df = df.with_columns(token_length=pl.col("tokens").map_elements(lambda x: len(x)))
df.head()

document_id,tokens,labels,ner_labels,token_length
str,list[str],list[i64],list[str],i64
"""bc/cctv/00/cctv_0001""","[""What"", ""kind"", … ""?""]","[0, 0, … 0]","[""O"", ""O"", … ""O""]",5
"""bc/cctv/00/cctv_0002""","[""Abramov"", ""had"", … "".""]","[1, 0, … 0]","[""B-PERSON"", ""O"", … ""O""]",14
"""bc/cctv/00/cctv_0003""","[""Hello"", "","", … "".""]","[0, 0, … 0]","[""O"", ""O"", … ""O""]",5
"""bc/cctv/00/cctv_0004""","[""There"", ""will"", … "".""]","[0, 0, … 0]","[""O"", ""O"", … ""O""]",13
"""bc/cnn/00/cnn_0001""","[""Sunday"", ""the"", … ""/.""]","[15, 0, … 0]","[""B-DATE"", ""O"", … ""O""]",7


In [14]:
# Check the distribution of all the tags
from collections import Counter, defaultdict


splits_freq: defaultdict = defaultdict(Counter)

for split, ds in ds_dict.items():
    for row in ds["ner_labels"]:
        for tag in row:
            # Focus on the `beginning` tags
            if tag.startswith("B"):
                tag_type: str = tag.split("-")[1]
                splits_freq[split][tag_type] += 1


# The tags are roughly equally distributed
pl.DataFrame(splits_freq).to_pandas()

Unnamed: 0,train,validation,test
0,"{'PERSON': 774, 'GPE': 1069, 'TIME': 88, 'CARDINAL': 243, 'DATE': 842, 'LOC': 109, 'ORG': 1005, 'NORP': 432, 'PRODUCT': 77, 'WORK_OF_ART': 56, 'ORDINAL': 44, 'FAC': 47, 'MONEY': 119, 'EVENT': 26, 'PERCENT': 47, 'QUANTITY': 11, 'LAW': 13, 'LANGUAGE': 6}","{'DATE': 105, 'GPE': 142, 'WORK_OF_ART': 4, 'PERSON': 100, 'ORDINAL': 12, 'ORG': 125, 'PRODUCT': 9, 'CARDINAL': 38, 'FAC': 4, 'TIME': 10, 'NORP': 46, 'EVENT': 6, 'LOC': 4, 'QUANTITY': 1, 'MONEY': 16, 'PERCENT': 7}","{'GPE': 128, 'DATE': 103, 'PERSON': 81, 'LOC': 12, 'TIME': 13, 'NORP': 47, 'MONEY': 8, 'ORG': 118, 'EVENT': 5, 'CARDINAL': 33, 'WORK_OF_ART': 9, 'PRODUCT': 9, 'ORDINAL': 9, 'FAC': 2, 'QUANTITY': 2, 'PERCENT': 6, 'LANGUAGE': 1}"


In [15]:
from transformers import AutoTokenizer


chkpoint: str = "dslim/distilbert-NER"  # "dslim/bert-base-NER"

tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(chkpoint)

In [16]:
text: str = " ".join(ds_dict["train"]["tokens"][-3])
inp_labels: list[int] = ds_dict["train"]["labels"][-3]
tokens: list[str] = tokenizer(text).tokens()  # tokenizer.tokenize(text)
inputs = tokenizer(text, return_tensors="pt")

tokens

['[CLS]',
 'Note',
 ':',
 'There',
 "'",
 's',
 'a',
 'piece',
 'by',
 'George',
 'Pack',
 '##er',
 'in',
 'The',
 'New',
 'Yorker',
 '-',
 'L',
 '##RB',
 '-',
 "'",
 "'",
 'Bet',
 '##ray',
 '##ed',
 ':',
 'the',
 'Iraqi',
 '##s',
 'who',
 'trusted',
 'America',
 'the',
 'most',
 "'",
 "'",
 '-',
 'R',
 '##RB',
 '-',
 'about',
 'all',
 'this',
 ',',
 'but',
 'it',
 "'",
 's',
 'way',
 'too',
 'long',
 'to',
 'blog',
 'here',
 '.',
 '[SEP]']

In [17]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [18]:
from transformers import AutoModel


bert_model = AutoModel.from_pretrained(chkpoint)
bert_model(**inputs)["last_hidden_state"].shape

torch.Size([1, 56, 768])

### Tokenization Pipeline

<br>

[![image.png](https://i.postimg.cc/zvJ0SrZz/image.png)](https://postimg.cc/dkx2wgVp)

<br>

- `Normalization`: it involves cleaning raw text by removing whitespace, accents, and standardizing Unicode characters. It also includes lowercasing to reduce vocabulary size. After normalization, our example string becomes "jack sparrow loves new york!".

- `Pretokenization`: it splits text into words for easier tokenization. For English and similar languages, this is simple. For languages like Chinese, it's more complex and might require language-specific libraries.

- `Tokenizer model`: it splits words into subwords to reduce vocabulary size and out-of-vocabulary tokens. This is done using algorithms like BPE, Unigram, and WordPiece. For example, "jack sparrow" might become "[jack, spa, rrow]".

- `Postprocessing`: it's the final step in tokenization, where additional tokens (like [CLS] and [SEP]) are added to the beginning and end of the token sequence to prepare it for input into a model like BERT.


### Named Entity Recognition

[![image.png](https://i.postimg.cc/26YwrD5Q/image.png)](https://postimg.cc/rdhWN7xs)

<br>

- In `token classification`, assign the label (e.g., B-PER) to the first subword ("Chr") and ignore subsequent subwords ("##ista"). This convention follows the BERT paper and maintains the IOB2 format. Postprocessing can propagate the label to all subwords.

<br>

### Classification Head

In [19]:
import torch
from torch import nn, Tensor
from transformers import DistilBertConfig, DistilBertModel
from transformers.models.distilbert.modeling_distilbert import DistilBertPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput


class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    class_config = DistilBertConfig

    def __init__(self, config) -> None:
        super().__init__(config)

        # Load model body
        self.model = DistilBertModel(config)

        # Setup classifiaction head
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(config.seq_classif_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Tensor | None = None,
        attention_mask: Tensor | None = None,
        token_type_ids: Tensor | None = None,
        labels: Tensor | None = None,
        **kwargs
    ) -> TokenClassifierOutput:
        # Get the encoder representations using the body
        outputs: dict = self.model(input_ids, attention_mask, token_type_ids, **kwargs)

        # Apply classifier to encoder representation
        sequence_output: Tensor = self.dropout(outputs["last_hidden_state"])
        logits: Tensor = self.classifier(sequence_output)

        # Calculate the loss
        loss: Tensor | None = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [20]:
tags

ClassLabel(names=['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT', 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW', 'B-LANGUAGE', 'I-LANGUAGE'], id=None)

In [21]:
config = DistilBertConfig.from_pretrained(
    chkpoint, num_labels=len(tags.names), id2label=index2tag, label2id=tag2index
)
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PERSON",
    "2": "I-PERSON",
    "3": "B-NORP",
    "4": "I-NORP",
    "5": "B-FAC",
    "6": "I-FAC",
    "7": "B-ORG",
    "8": "I-ORG",
    "9": "B-GPE",
    "10": "I-GPE",
    "11": "B-LOC",
    "12": "I-LOC",
    "13": "B-PRODUCT",
    "14": "I-PRODUCT",
    "15": "B-DATE",
    "16": "I-DATE",
    "17": "B-TIME",
    "18": "I-TIME",
    "19": "B-PERCENT",
    "20": "I-PERCENT",
    "21": "B-MONEY",
    "22": "I-MONEY",
    "23": "B-QUANTITY",
    "24": "I-QUANTITY",
    "25": "B-ORDINAL",
    "26": "I-ORDINAL",
    "27": "B-CARDINAL",
    "28": "I-CARDINAL",
    "29": "B-EVENT",
    "30": "I-EVENT",
    "31": "B-WORK_OF_ART",
    "32": "I-WORK_OF_ART",
    "33": "B-LAW",
    "34": "I-LAW",
    "35": "B-LANGUA

In [22]:
device: str = "cuda" if torch.cuda.is_available() else "cpu"
model = DistilBertForTokenClassification(config=config).to(device)


# Check that the tokenizer and model were properly initialized
inputs = tokenizer(text, return_tensors="pt")
data: dict[str, int] = {
    col: int(val) for col, val in zip(tokens, inputs.input_ids.flatten())
}
print(data)

pl.DataFrame(data)

{'[CLS]': 101, 'Note': 5322, ':': 131, 'There': 1247, "'": 112, 's': 188, 'a': 170, 'piece': 2727, 'by': 1118, 'George': 1667, 'Pack': 14667, '##er': 1200, 'in': 1107, 'The': 1109, 'New': 1203, 'Yorker': 20998, '-': 118, 'L': 149, '##RB': 22672, 'Bet': 26615, '##ray': 6447, '##ed': 1174, 'the': 1103, 'Iraqi': 8612, '##s': 1116, 'who': 1150, 'trusted': 9373, 'America': 1738, 'most': 1211, 'R': 155, 'about': 1164, 'all': 1155, 'this': 1142, ',': 117, 'but': 1133, 'it': 1122, 'way': 1236, 'too': 1315, 'long': 1263, 'to': 1106, 'blog': 10679, 'here': 1303, '.': 119, '[SEP]': 102}


[CLS],Note,:,There,',s,a,piece,by,George,Pack,##er,in,The,New,Yorker,-,L,##RB,Bet,##ray,##ed,the,Iraqi,##s,who,trusted,America,most,R,about,all,this,",",but,it,way,too,long,to,blog,here,.,[SEP]
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
101,5322,131,1247,112,188,170,2727,1118,1667,14667,1200,1107,1109,1203,20998,118,149,22672,26615,6447,1174,1103,8612,1116,1150,9373,1738,1211,155,1164,1155,1142,117,1133,1122,1236,1315,1263,1106,10679,1303,119,102


In [23]:
inputs_device: dict[str, Tensor] = {
    k: torch.tensor(v).to(device) for k, v in inputs.items()
}
inputs_device

{'input_ids': tensor([[  101,  5322,   131,  1247,   112,   188,   170,  2727,  1118,  1667,
          14667,  1200,  1107,  1109,  1203, 20998,   118,   149, 22672,   118,
            112,   112, 26615,  6447,  1174,   131,  1103,  8612,  1116,  1150,
           9373,  1738,  1103,  1211,   112,   112,   118,   155, 22672,   118,
           1164,  1155,  1142,   117,  1133,  1122,   112,   188,  1236,  1315,
           1263,  1106, 10679,  1303,   119,   102]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [24]:
inputs_device

{'input_ids': tensor([[  101,  5322,   131,  1247,   112,   188,   170,  2727,  1118,  1667,
          14667,  1200,  1107,  1109,  1203, 20998,   118,   149, 22672,   118,
            112,   112, 26615,  6447,  1174,   131,  1103,  8612,  1116,  1150,
           9373,  1738,  1103,  1211,   112,   112,   118,   155, 22672,   118,
           1164,  1155,  1142,   117,  1133,  1122,   112,   188,  1236,  1315,
           1263,  1106, 10679,  1303,   119,   102]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [25]:
# Pass the input to the model and extract the predictions
inputs_device: dict[str, Tensor] = {
    k: torch.tensor(v).to(device) for k, v in inputs.items()
}
outputs = model(**inputs_device).logits
predictions: Tensor = torch.argmax(outputs, dim=-1)
print(f"Number of tokens in sequence: {len(tokens)}")

data: dict[str, str] = {
    col: tags.names[p] for col, p in zip(tokens, predictions.flatten())
}

pl.DataFrame(data)

Number of tokens in sequence: 56


[CLS],Note,:,There,',s,a,piece,by,George,Pack,##er,in,The,New,Yorker,-,L,##RB,Bet,##ray,##ed,the,Iraqi,##s,who,trusted,America,most,R,about,all,this,",",but,it,way,too,long,to,blog,here,.,[SEP]
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""B-LANGUAGE""","""I-ORDINAL""","""B-ORDINAL""","""B-WORK_OF_ART""","""B-GPE""","""B-LAW""","""B-MONEY""","""B-DATE""","""B-EVENT""","""I-PERSON""","""I-TIME""","""I-FAC""","""I-NORP""","""I-ORDINAL""","""B-LAW""","""B-MONEY""","""I-QUANTITY""","""B-GPE""","""I-ORDINAL""","""I-FAC""","""B-GPE""","""B-LANGUAGE""","""I-ORG""","""B-MONEY""","""B-ORDINAL""","""I-LOC""","""B-NORP""","""B-MONEY""","""B-GPE""","""B-LAW""","""I-FAC""","""I-QUANTITY""","""I-LANGUAGE""","""B-EVENT""","""B-LOC""","""B-NORP""","""I-LOC""","""I-QUANTITY""","""B-QUANTITY""","""I-PERSON""","""I-NORP""","""I-NORP""","""O""","""I-TIME"""


In [26]:
# text.split()
# pattern: str = r"[,.:;?!\s]+"
# tokens: list[str] = re.compile(pattern).split(text)
# tokens

In [27]:
sample = ds_dict["train"][10]

words, inp_labels = sample["tokens"], sample["labels"]

len(words), len(inp_labels)

(5, 5)

In [28]:
# Tokenize each word and specify that the input sequence has already been split into words.
tokenized_input = tokenizer(words, is_split_into_words=True)
tokens: list[str] = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]', 'Journalists', 'sources', 'and', 'jail', '/', '.', '[SEP]']

In [29]:
word_ids = tokenized_input.word_ids()
word_ids

[None, 0, 1, 2, 3, 4, 4, None]

In [30]:
# Align the labels
prev_word_idx: int | None = None
label_ids: list[int] = []

for word_idx in word_ids:
    if word_idx is None or word_idx == prev_word_idx:
        label_ids.append(-100)
    else:
        label_ids.append(inp_labels[word_idx])
    prev_word_idx = word_idx


# Update the labels
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, inp_labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7
Tokens,[CLS],Journalists,sources,and,jail,/,.,[SEP]
Word IDs,,0,1,2,3,4,4,
label IDs,-100,0,0,0,0,0,-100,-100
Labels,0,0,0,0,0,,,


#### Comment

- The ID `-100` is used to mask subword representations in PyTorch's cross-entropy loss to ignore them during training, preventing their influence on the model's learning.

In [31]:
# Putting it together
import os


os.environ["TOKENIZERS_PARALLELISM"] = "true"


def tokenize_and_align_labels(examples: dict[str, list]) -> dict[str, list]:
    labels: list[list[int]] = []

    # Tokenize each word and specify that the input sequence has
    # already been split into words.
    tokenized_inputs: dict[str, list] = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=config.max_position_embeddings,
        padding="max_length",
    )

    for idx, label in enumerate(examples["labels"]):
        word_ids: list[int | None] = tokenized_inputs.word_ids(batch_index=idx)
        prev_word_idx: int | None = None
        label_ids: list[int] = []

        for word_idx in word_ids:
            if word_idx is None or word_idx == prev_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            prev_word_idx = word_idx
        labels.append(label_ids)

    # Add the labels to the inputs
    tokenized_inputs["labels"] = labels

    return tokenized_inputs


def encode_dataset(corpus: DatasetDict) -> DatasetDict:
    encoded_corpus = DatasetDict()
    for split, dataset in corpus.items():
        encoded_corpus[split] = dataset.map(
            tokenize_and_align_labels,
            batched=True,
            remove_columns=["tokens", "ner_labels"],
            desc=f"Running tokenizer on {split} split",
        )
    return encoded_corpus

In [32]:
ds_dict_encoded: DatasetDict = encode_dataset(ds_dict)
ds_dict_encoded

Running tokenizer on train split:   0%|          | 0/1940 [00:00<?, ? examples/s]

Running tokenizer on validation split:   0%|          | 0/222 [00:00<?, ? examples/s]

Running tokenizer on test split:   0%|          | 0/222 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document_id', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1940
    })
    validation: Dataset({
        features: ['document_id', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 222
    })
    test: Dataset({
        features: ['document_id', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 222
    })
})

<br>

### Performance Metrics

- Evaluating NER models involves precision, recall, and F1-score.
- All words of an entity must be predicted correctly.
- The `seqeval` library can compute these metrics using the `classification_report()` function.

In [33]:
from seqeval.metrics import classification_report


def align_predictions(
    predictions: np.ndarray, label_ids: np.ndarray
) -> tuple[list[list[str]], list[list[str]]]:
    """
    Align predictions with label IDs and convert them to tag strings. It's
    required to convert the model outputs to the format `seqeval` expects.

    Parameters
    ----------
    predictions : np.ndarray
        The prediction tensor of shape (batch_size, seq_len, num_classes).
    label_ids : np.ndarray
        The label IDs tensor of shape (batch_size, seq_len).

    Returns
    -------
    tuple[list[list[str]], list[list[str]]]
        A tuple containing two lists:
        - preds_list: List of lists containing predicted tags for each sequence.
        - labels_list: List of lists containing true tags for each sequence.
    """
    preds: np.ndarray = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list: list[list[str]] = []
    preds_list: list[list[str]] = []

    for batch_idx in range(batch_size):
        example_labels: list[str] = []
        example_preds: list[str] = []
        for seq_idx in range(seq_len):
            # Skip the special tokens (label IDs = -100)
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

In [34]:
from huggingface_hub import login, notebook_login
# from dotenv import load_dotenv, find_dotenv


# # Load the .env file
# _ = load_dotenv(find_dotenv())


# Access an environment variable
# HF_NOTEBOOK_TOKEN = os.getenv("HF_NOTEBOOK_TOKEN")
# login(token=HF_NOTEBOOK_TOKEN, add_to_git_credential=True)

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
from transformers import TrainingArguments


# Define the training arguments
num_epochs: int = 3
batch_size: int = 24  # (4 because data is small)
logging_steps: int = len(ds_dict_encoded["train"]) // batch_size
model_name: str = f"{chkpoint}-finetuned"

# The model's predictions are evaluated on the validation set after each epoch.
# Weight decay is adjusted, and checkpointing is disabled to speed up training.
training_args = TrainingArguments(
    output_dir=model_name,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [36]:
from seqeval.metrics import accuracy_score, f1_score


def compute_metrics(eval_pred: tuple[Any, Any]) -> dict[str, float]:
    """
    Compute F1 score for the predictions.

    Parameters
    ----------
    eval_pred :tuple[Any, Any]
        A tuple containing model predictions and true labels.

    Returns
    -------
    dict[str, float]
        A dictionary containing the F1 score.
    """
    y_pred, y_true = align_predictions(*eval_pred)
    return {"f1": f1_score(y_true, y_pred)}

#### Comment

- The labels in sequence classification are padded with `-100` to avoid affecting the loss.
- A `model_init()` method is used to load an untrained model for training to avoid creating a new model for each Trainer.

In [37]:
from transformers import DataCollatorForTokenClassification


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [38]:
def model_init() -> DistilBertForTokenClassification:
    """
    Initialize and return a DistilBertForTokenClassification model.

    Returns
    -------
    DistilBertForTokenClassification
        A pre-trained BERT model for token classification, loaded with the specified
        checkpoint and configuration, and moved to the specified device.
    """
    return DistilBertForTokenClassification.from_pretrained(
        chkpoint, config=config, ignore_mismatched_sizes=True
    ).to(device)

### Model Training

In [39]:
from transformers import Trainer


# A model_init() method is used to load an untrained model for
# training to avoid creating a new model for each Trainer.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds_dict_encoded["train"],
    eval_dataset=ds_dict_encoded["validation"],
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,1.0541,0.724767,0.393085
2,0.6283,0.60199,0.462052
3,0.5061,0.565164,0.478764


TrainOutput(global_step=243, training_loss=0.7273975380163624, metrics={'train_runtime': 271.2317, 'train_samples_per_second': 21.458, 'train_steps_per_second': 0.896, 'total_flos': 760882256547840.0, 'train_loss': 0.7273975380163624, 'epoch': 3.0})

In [40]:
trainer.push_to_hub(commit_message="Training completed!")

events.out.tfevents.1725148274.d0b0eca3538a.13189.0:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/chineidu/distilbert-NER-finetuned/commit/3e6869698c27808acac8ace2e4187d8eb91766f3', commit_message='Training completed!', commit_description='', oid='3e6869698c27808acac8ace2e4187d8eb91766f3', pr_url=None, pr_revision=None, pr_num=None)

In [41]:
def tag_text(
    text: str,
    tags: ClassLabel,
    model: DistilBertModel,
    tokenizer: AutoTokenizer,
) -> pl.DataFrame:
    # Get tokens with special characters
    tokens: list[str] = tokenizer(text).tokens()

    # Encode
    inputs: Tensor = tokenizer(
        text, padding=True, truncation=True, max_length=512, return_tensors="pt"
    )
    inputs = {k: torch.tensor(v).to(device) for k, v in inputs.items()}

    # Get predictions
    outputs: Tensor = model(**inputs).logits

    # Get predictions
    predictions: Tensor = torch.argmax(outputs, dim=-1)

    data: dict[str, str] = {
        col: tags.names[p]
        for col, p in zip(tokens, predictions.flatten().cpu().numpy())
    }

    return pl.DataFrame(data)

In [42]:
text: str = (
    "Neidu Emmanuel recently announced plans to host a Tesla AI Day at the Gigafactory "
    "in Austin, Texas. The event is scheduled for October 18th, 2024. During the "
    "event, Neidu is expected to unveil new advancements in Tesla's autonomous driving "
    "technology and discuss the company's ambitious goals for artificial intelligence. "
    "The event is expected to attract thousands of attendees, including investors, "
    "tech enthusiasts, and industry experts."
)

tag_text(text, tags, trainer.model, tokenizer)

[CLS],N,##ei,##du,Emmanuel,recently,announced,plans,to,host,a,Te,##sla,AI,Day,at,the,G,##iga,##fa,##ctor,##y,in,Austin,",",Texas,.,The,event,is,scheduled,for,October,18th,202,##4,During,expected,un,##ve,##il,new,advancement,##s,',s,autonomous,driving,technology,and,discuss,company,ambitious,goals,artificial,intelligence,attract,thousands,of,attendees,including,investors,tech,enthusiasts,industry,experts,[SEP]
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""B-GPE""","""I-ORG""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""I-PERSON""","""O""","""O""","""O""","""I-ORG""","""O""","""O""","""O""","""B-GPE""","""O""","""O""","""O""","""O""","""O""","""O""","""I-DATE""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""B-GPE""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""I-ORG""","""O""","""B-DATE""","""O""","""I-ORG""","""O""","""O""","""O""","""I-ORG""","""O""","""O""","""I-DATE"""


In [45]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/distilbert-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

ner_results = nlp(text)
print(ner_results)


[{'entity': 'B-PER', 'score': 0.95814115, 'index': 1, 'word': 'N', 'start': 0, 'end': 1}, {'entity': 'B-PER', 'score': 0.85098195, 'index': 2, 'word': '##ei', 'start': 1, 'end': 3}, {'entity': 'B-PER', 'score': 0.92637885, 'index': 3, 'word': '##du', 'start': 3, 'end': 5}, {'entity': 'I-PER', 'score': 0.8446359, 'index': 4, 'word': 'Emmanuel', 'start': 6, 'end': 14}, {'entity': 'B-MISC', 'score': 0.97945064, 'index': 11, 'word': 'Te', 'start': 50, 'end': 52}, {'entity': 'B-MISC', 'score': 0.9907212, 'index': 12, 'word': '##sla', 'start': 52, 'end': 55}, {'entity': 'I-MISC', 'score': 0.980295, 'index': 13, 'word': 'AI', 'start': 56, 'end': 58}, {'entity': 'I-MISC', 'score': 0.99176544, 'index': 14, 'word': 'Day', 'start': 59, 'end': 62}, {'entity': 'B-LOC', 'score': 0.96773815, 'index': 17, 'word': 'G', 'start': 70, 'end': 71}, {'entity': 'B-LOC', 'score': 0.95314467, 'index': 18, 'word': '##iga', 'start': 71, 'end': 74}, {'entity': 'B-LOC', 'score': 0.8271949, 'index': 19, 'word': '##f