<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/06_Transformers/NLP-With-Transformers/02-NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiingual Name Entity Recognition



In [1]:
!pip install -q transformers accelerate datasets \
  seqeval mlxtend watermark rich

In [2]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,transformers --conda

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

numpy       : 1.26.4
pandas      : 2.1.4
polars      : 0.20.2
mlxtend     : 0.23.1
transformers: 4.42.4

conda environment: n/a



In [3]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(500)

warnings.filterwarnings("ignore")


# # Black code formatter (Optional)
# %load_ext lab_black

# # auto reload imports
# %load_ext autoreload
# %autoreload 2

In [4]:
from datasets import Dataset, get_dataset_config_names, load_dataset


xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [5]:
# Select the config that starts with `PAN`
pan_subsets: list[str] = [s for s in xtreme_subsets if s.startswith("PAN")]
print(pan_subsets[:3])

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']


In [6]:
from datasets import DatasetDict


# Load English subset
en_dataset: DatasetDict = load_dataset("xtreme", name="PAN-X.en")
en_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [7]:
# Shuffle
en_dataset['train'].shuffle(seed=123)

# Select a specific number of records
print(en_dataset['train'].select(range(10)))

# Check the number pf records
print(f"Number of records: {len(en_dataset['train']):,}")

# OR
print(f"Number of records: {en_dataset['train'].num_rows:,}")

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 10
})
Number of records: 20,000
Number of records: 20,000


#### Coment

- To create a realistic Swiss corpus, we'll sample PAN-X corpora in German (62.9%), French (22.9%), Italian (8.4%), and English (5.9%) according to their spoken proportions in Switzerland.
- This will create a language imbalance simulating real-world datasets.
- We'll use a Python `defaultdict` to store language codes and corresponding PAN-X corpora.

In [8]:
from collections import defaultdict


seed: int = 123
langs: list[str] = ["de", "fr", "it", "en"]
fracs: list[float] = [0.629, 0.229, 0.084, 0.059]
panx_ch: defaultdict = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load dataset
    ds: DatasetDict = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle and downsample each split according to spoken proportion
    for split in ds:
      num_rows: int = ds[split].num_rows
      panx_ch[lang][split] = ds[split].shuffle(seed=seed).select(range(int(frac * num_rows)))


In [9]:
panx_ch

defaultdict(datasets.dataset_dict.DatasetDict,
            {'de': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 12580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
             }),
             'fr': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 4580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 2290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'la

In [10]:
for key, val in panx_ch.items():
  print(val)
  break

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 6290
    })
})


In [11]:
# Number of training examples per language
res: dict[str, int] = {key:val["train"].num_rows for key, val in panx_ch.items()}
pl.DataFrame(res)


de,fr,it,en
i64,i64,i64,i64
12580,4580,1680,1180


In [12]:
# Inspect one of the languages
sample = panx_ch["en"]["train"][0]
sample

{'tokens': ['Collin', 'Peterson', '(', 'D-MN', ')'],
 'ner_tags': [1, 2, 0, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en']}

In [13]:
print(panx_ch["en"]["train"].features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


In [14]:
ner_tags = panx_ch["en"]["train"].features["ner_tags"]
tags = ner_tags.feature
print(f"{ner_tags = }")
print(f"{tags = }")

ner_tags = Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
tags = ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [15]:
for idx, _ in enumerate(tags.names):
  console.print(f"{idx} ==> {tags.int2str(idx)}")

In [16]:
pprint(panx_ch["en"]["train"][10])

{'langs': ['en',
           'en',
           'en',
           'en',
           'en',
           'en',
           'en',
           'en',
           'en',
           'en',
           'en',
           'en',
           'en'],
 'ner_tags': [0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
 'tokens': ['Tries',
            ':',
            'Todd',
            'Carney',
            ',',
            'Ricky',
            'Leutele',
            ',',
            'Michael',
            'Gordon',
            ',',
            'Jeff',
            'Robson']}


In [17]:
def create_tag_names(batch: DatasetDict) -> dict[str, Any]:
  return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}


panx_de: DatasetDict = panx_ch["de"].map(create_tag_names)

panx_de["train"][5]

{'tokens': ['Die',
  'meisten',
  'seiner',
  'Gemälde',
  'sind',
  'im',
  'Statens',
  'Museum',
  'for',
  'Kunst',
  'zu',
  'sehen',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 0, 0, 0],
 'langs': ['de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de'],
 'ner_tags_str': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O']}

In [18]:
pl.DataFrame(panx_de["train"].select_columns(['tokens', 'ner_tags_str'])[5]).transpose()

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12
str,str,str,str,str,str,str,str,str,str,str,str,str
"""Die""","""meisten""","""seiner""","""Gemälde""","""sind""","""im""","""Statens""","""Museum""","""for""","""Kunst""","""zu""","""sehen""","""."""
"""O""","""O""","""O""","""O""","""O""","""O""","""B-ORG""","""I-ORG""","""I-ORG""","""I-ORG""","""O""","""O""","""O"""


In [19]:
# Check the distribution of all the tags
from collections import Counter


splits_freq = defaultdict(Counter)

for split, ds in panx_de.items():
  for row in ds["ner_tags_str"]:
    for tag in row:
      # Focus on the `beginning` tags
      if tag.startswith("B"):
        tag_type: str = tag.split("-")[1]
        splits_freq[split][tag_type] += 1

In [20]:
# The tags are roughly equally distributed
console.print(f"{splits_freq = }")

pl.DataFrame(splits_freq).to_pandas()

Unnamed: 0,train,validation,test
0,"{'PER': 5862, 'LOC': 6039, 'ORG': 5413}","{'PER': 2837, 'LOC': 3095, 'ORG': 2721}","{'ORG': 2620, 'PER': 2962, 'LOC': 3145}"


<br>

## Multilingual Transformers

- Multilingual transformers are trained on multilingual corpora and they can generalize well across languages for various tasks.

- This approach can outperform monolingual models for cross-lingual transfer, avoiding the need to train separate models for each language.

- `CoNLL-2002` and `CoNLL-2003` are commonly used benchmarks for NER in these languages.

- Multilingual transformer models are usually evaluated in three different ways:
  - `en`: Fine-tune on the English training data and then evaluate on each language's test set.
  - `each`: Fine-tune and evaluate on monolingual test data to measure per-language performance.
  - `all`: Fine-tune on all the training data to evaluate on all on each languag's test set.

  <br>

- [XLM-RoBERTa](https://huggingface.co/docs/transformers/en/
model_doc/xlm-roberta) (`XLM-R`) model will be used.

- XLM-RoBERTa (XLM-R) is a multilingual transformer model trained on a massive dataset of text and code in 100 different languages.
  
- It can be used for a variety of tasks, including:

  - Text classification: Sentiment analysis, topic classification, etc.
  - Question answering: Extracting answers to questions from a given text.
  - Translation: Translating text from one language to another.
  - Named entity recognition: Identifying named entities in text, such as people, organizations, and locations.

- XLM-R achieves state-of-the-art results on many cross-lingual benchmarks, and it is a powerful tool for natural language processing in multiple languages.

#### Tokenization: SentencePiece



In [21]:
from transformers import AutoTokenizer


bert_model_name: str = "bert-base-cased"
xlmr_model_name: str = "xlm-roberta-base"

bert_tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [22]:
text: str = "Jack Sparrow loves New York!"

bert_tokens: list[str] = bert_tokenizer(text).tokens()
xlmr_tokens: list[str] = xlmr_tokenizer(text).tokens()

print(f"{bert_tokens = }")
print(f"{xlmr_tokens = }")

bert_tokens = ['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
xlmr_tokens = ['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']


### Tokenization Pipeline

<br>

[![image.png](https://i.postimg.cc/zvJ0SrZz/image.png)](https://postimg.cc/dkx2wgVp)

<br>

- `Normalization`: it involves cleaning raw text by removing whitespace, accents, and standardizing Unicode characters. It also includes lowercasing to reduce vocabulary size. After normalization, our example string becomes "jack sparrow loves new york!".

- `Pretokenization`: it splits text into words for easier tokenization. For English and similar languages, this is simple. For languages like Chinese, it's more complex and might require language-specific libraries.

- `Tokenizer model`: it splits words into subwords to reduce vocabulary size and out-of-vocabulary tokens. This is done using algorithms like BPE, Unigram, and WordPiece. For example, "jack sparrow" might become "[jack, spa, rrow]".

- `Postprocessing`: it's the final step in tokenization, where additional tokens (like [CLS] and [SEP]) are added to the beginning and end of the token sequence to prepare it for input into a model like BERT.


<hr><br>

#### SentencePiece Tokenizer

- The SentencePiece tokenizer is a flexible subword segmentation method that handles multilingual text effectively by preserving whitespace and using Unicode characters.

- It avoids ambiguities in detokenization and can be used for various languages without relying on language-specific pretokenizers.

### Transformers For Classification Vs NER

**1.) Classification**

[![image.png](https://i.postimg.cc/Mph53R2r/image.png)](https://postimg.cc/tsNFYsgh)

<br>

**2.) Named Entity Recognition**

[![image.png](https://i.postimg.cc/26YwrD5Q/image.png)](https://postimg.cc/rdhWN7xs)

<br>

- In `token classification`, assign the label (e.g., B-PER) to the first subword ("Chr") and ignore subsequent subwords ("##ista"). This convention follows the BERT paper and maintains the IOB2 format. Postprocessing can propagate the label to all subwords.


### Anatomy Transformers Model Class

- The Transformers API organizes models by task, but this can be limiting.
- If a desired task model doesn't exist, it can be difficult to implement a custom model.
- However, Transformers provides tools to load pretrained weights and use task-specific helper functions, making it easier to create custom models.

### Bodies And Heads

- Transformers are versatile due to their `body-head` structure.
- The `head` is task-specific, while the `body` is task-agnostic.
- The body includes embeddings and transformer layers. The Transformers code reflects this structure with classes like BertModel and GPT2Model that return hidden states.
- Task-specific models such as `BertForMaskedLM` or `BertForSequenceClassification` use the base model and add the necessary head on top of the hidden states.


<hr><br>

### Creating Custom Model For Token Classification

- Building a custom token classification head for XLM-R is similar to RoBERTa and an existing class, XLMRobertaForTokenClassification, can be used directly for this task.
- However, going through the exercise of building a custom model for educational purposes.

- We need a data structure for our XLM-R NER tagger.
- It will have a configuration object and a forward function.

In [23]:
from torch import Tensor
import torch.nn as nn
from transformers import XLMRobertaConfig, XLMRobertaModel
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel


class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    class_config = XLMRobertaConfig

    def __init__(self, config) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels

        # Load model body
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # Setup classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Load and initialize weights
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        **kwargs
    ):
        # Get the encoder representations using model body
        outputs: Tensor = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs
        )

        # Apply classifier to encoder reps
        sequence_output: Tensor = self.dropout(outputs[0])
        logits: Tensor = self.classifier(sequence_output)

        # Calculate losses
        loss: Optional[Tensor] = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Return model output object
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

#### Comment

- The config_class ensures standard XLM-R settings.
- The super() method calls the initialization function of the RobertaPreTrainedModel class.
- The model body is extended with a classification head.
- The forward() method defines the model's behavior.
- The hidden state is fed through the dropout and classification layers.
- The loss is calculated if labels are provided.
- The outputs are wrapped in a TokenClassifierOutput object.
- This custom transformer model inherits from a PreTrainedModel, allowing access to useful Transformer utilities.

<br>

### Loading A Custom Model


In [24]:
tags.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [25]:
from transformers import AutoConfig


index2tag: dict[int, str] = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index: dict[str, int] = {tag: idx for idx, tag in enumerate(tags.names)}

xlmr_config: XLMRobertaConfig = AutoConfig.from_pretrained(
    xlmr_model_name, num_labels=len(tags.names), id2label=index2tag,
    label2id=tag2index
)
xlmr_config

XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.4",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

In [26]:
import torch


device: str = "cuda" if torch.cuda.is_available() else "cpu"
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name, config=xlmr_config
).to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
text

'Jack Sparrow loves New York!'

In [28]:
# Check that the tokenizer and model were properly initialized
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
data: dict[str, int] = {
    col: int(val) for col, val in zip(xlmr_tokens, input_ids.flatten())
}
print(data)

pl.DataFrame(data)

{'<s>': 0, '▁Jack': 21763, '▁Spar': 37456, 'row': 15555, '▁love': 5161, 's': 7, '▁New': 2356, '▁York': 5753, '!': 38, '</s>': 2}


<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,21763,37456,15555,5161,7,2356,5753,38,2


In [29]:
# Pass the input to the model and extract the predictions
outputs = xlmr_model(input_ids.to(device)).logits
predictions: Tensor = torch.argmax(outputs, dim=-1)
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")

# (batch_size, num_tokens, num_tags)
print(f"Shape of outputs: {outputs.shape}")

# (batch_size, num_tokens)
print(f"Shape of predictions: {predictions.shape}")

print(f"{predictions = }")

Number of tokens in sequence: 10
Shape of outputs: torch.Size([1, 10, 7])
Shape of predictions: torch.Size([1, 10])
predictions = tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [30]:
data: dict[str, str] = {
    col: tags.names[p] for col, p in zip(xlmr_tokens, predictions.flatten())
}
print(data)

pl.DataFrame(data)

{'<s>': 'O', '▁Jack': 'O', '▁Spar': 'O', 'row': 'O', '▁love': 'O', 's': 'O', '▁New': 'O', '▁York': 'O', '!': 'O', '</s>': 'O'}


<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
str,str,str,str,str,str,str,str,str,str
"""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O""","""O"""


In [31]:
def tag_text(
    text: str,
    tags: list[str],
    model: RobertaModel,
    tokenizer: AutoTokenizer,
) -> pl.DataFrame:
    # Get tokens with special characters
    tokens: list[str] = tokenizer(text).tokens()

    # Encode
    input_ids: Tensor = tokenizer(text, return_tensors="pt").input_ids.to(device)

    # Get predictions
    outputs: Tensor = model(input_ids).logits

    # Get predictions
    predictions: Tensor = torch.argmax(outputs, dim=-1)

    data: dict[str, str] = {
        col: tags.names[p]
        for col, p in zip(tokens, predictions.flatten().cpu().numpy())
    }

    return pl.DataFrame(data)

<br>

### Tokenizing Texts For NER

- To prepare the dataset for fine-tuning with XLM-R, we need to tokenize it. - Datasets provides a `map()` function for this.

- We need to define a function with the signature:
```py
function(examples: Dict[str, List]) -> Dict[str, List] to process the dataset.
```

- The XLM-R tokenizer's output returns input IDs for the model.
- To use the model, we need to add an attention mask and label IDs that indicate which tokens correspond to each NER tag.
- These can be obtained from the dataset's examples.

In [32]:
sample

{'tokens': ['Collin', 'Peterson', '(', 'D-MN', ')'],
 'ner_tags': [1, 2, 0, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en']}

In [33]:
words, labels = sample["tokens"], sample["ner_tags"]

# Tokenize each word and specify that the input sequence has already been split into words.
tokenized_input = xlmr_tokenizer(
    words, is_split_into_words=True
)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pl.DataFrame(tokens).transpose()

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10
str,str,str,str,str,str,str,str,str,str,str
"""<s>""","""▁Colli""","""n""","""▁Peter""","""son""","""▁(""","""▁D""","""-""","""MN""","""▁)""","""</s>"""


#### Comment

- The word `Collin` was split into `Colli` and `n` by the `SentencePience` tokenizer.
- Since we want to follow the convection that only `Colli` should be associated with the `B-PER` label,  we need a way to mask the subword representations after the subword.
- This can be done using `.word_ids()` method.
- `word_ids` has mapped each subword to the corresponding index in the words sequence, so the first subword, “__Colli", is assigned the index 0, while
“▁Peter” and “son” are assigned the index 1 (since “Peterson” is the second
word in words).
- We can also see that special tokens like `<s>` and `<\s>` are mapped to
None.
- Let's set `-100` as the label for these special tokens and the subwords we wish to mask during training.

In [34]:
word_ids = tokenized_input.word_ids()
data: dict[str, int] = {
    col: idx_ for col, idx_ in zip(tokens, word_ids)
}

pl.DataFrame(data)

<s>,▁Colli,n,▁Peter,son,▁(,▁D,-,MN,▁),</s>
null,i64,i64,i64,i64,i64,i64,i64,i64,i64,null
,0,0,1,1,2,3,3,3,4,


In [35]:
word_ids, labels

([None, 0, 0, 1, 1, 2, 3, 3, 3, 4, None], [1, 2, 0, 0, 0])

In [36]:
prev_word_idx: int | None = None
label_ids: list[int] = []

for word_idx in word_ids:
  if word_idx is None or word_idx == prev_word_idx:
    label_ids.append(-100)
  else:
    label_ids.append(labels[word_idx])
  prev_word_idx = word_idx


# Update the labels
labels = [index2tag[l] if l != -100 else "UKN" for l in label_ids]
index = ["Tokens", "Word IDs", "label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Tokens,<s>,▁Colli,n,▁Peter,son,▁(,▁D,-,MN,▁),</s>
Word IDs,,0,0,1,1,2,3,3,3,4,
label IDs,-100,1,-100,2,-100,0,0,-100,-100,0,-100
Labels,UKN,B-PER,UKN,I-PER,UKN,O,O,UKN,UKN,O,UKN


#### Comment

- The ID `-100` is used to mask subword representations in PyTorch's cross-entropy loss to ignore them during training, preventing their influence on the model's learning.

In [37]:
# Putting it together

def tokenize_and_align_labels(examples: dict[str, list]) -> dict[str, list]:
    """
    Tokenize and align labels for named entity recognition.

    Parameters
    ----------
    examples : dict[str, list]
        A dictionary containing 'tokens' and 'ner_tags' as keys.
        'tokens' : list of str
            List of words in the input sequence.
        'ner_tags' : list of int
            List of corresponding NER tags for each word.

    Returns
    -------
    dict[str, list]
        A dictionary containing tokenized inputs and aligned labels.
        'input_ids' : list of int
            Tokenized input sequence.
        'attention_mask' : list of int
            Attention mask for the tokenized sequence.
        'labels' : list of int
            Aligned labels for the tokenized sequence.

    Notes
    -----
    This function assumes the existence of a tokenizer named 'xlmr_tokenizer'.
    """
    labels: list[list[int]] = []

    # Tokenize each word and specify that the input sequence has
    # already been split into words.
    tokenized_inputs: dict[str, list] = xlmr_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    for idx, label in enumerate(examples["ner_tags"]):
        word_ids: list[int | None] = tokenized_inputs.word_ids(batch_index=idx)
        prev_word_idx: int | None = None
        label_ids: list[int] = []

        for word_idx in word_ids:
            if word_idx is None or word_idx == prev_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            prev_word_idx = word_idx
        labels.append(label_ids)

    # Add the labels to the inputs
    tokenized_inputs["labels"] = labels

    return tokenized_inputs


def encode_panx_dataset(corpus: Dataset) -> Dataset:
    """
    Encode the PAN-X dataset by applying tokenization and label alignment.

    Parameters
    ----------
    corpus : Dataset
        The input corpus to be encoded.

    Returns
    -------
    Dataset
        The encoded dataset with tokenized inputs and aligned labels.

    Notes
    -----
    This function uses the `tokenize_and_align_labels` function to process
    the dataset. It removes the original 'tokens', 'ner_tags', and 'langs'
    columns from the dataset.
    """
    return corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=["tokens", "ner_tags", "langs"],
        desc="Running tokenizer on dataset",
    )

In [38]:
# Apply it to the entire dataset
panx_de_encoded = encode_panx_dataset(panx_ch["de"])

Running tokenizer on dataset:   0%|          | 0/6290 [00:00<?, ? examples/s]

<br>

### Performance Metrics

- Evaluating NER models involves precision, recall, and F1-score.
- All words of an entity must be predicted correctly.
- The `seqeval` library can compute these metrics using the `classification_report()` function.

In [38]:
from seqeval.metrics import classification_report


# Example usage
y_true: list[list[str]] = [
    ["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"],
]
y_pred: list[list[str]] = [
    ["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"],
]
print(classification_report(y_true, y_pred))