# Token Classification

##### Named Entity Recognition (NER): 

Finding the entities (such as persons, locations, or organisations) in a sentence. This can be formulated as attributing a label to each token, with one class per entity and one class for "no entity".

##### PART OF SPEECH:
Mark each word in a sentence as corresponding to a particular part of speech (such as noun, verb, adjective, etc.)

##### Chunking 
Find the tokens that belong to the same entity. This task (which can be combined with POS or NER) can be formulated as attributing one label (usually B-) to any tokens that are at the beginning of a chunk, another label (usually I-) to tokens that are inside a chunk, and a third label (usually O) to tokens that don't belong to any chunk.

In [1]:
! pip install transformers datasets tokenizer seqeval -q 

In [2]:
import datasets 
import numpy as np
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification

conll2003 = datasets.load_dataset("conll2003",trust_remote_code=True)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
conll2003.column_names

{'train': ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 'validation': ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
 'test': ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']}

In [4]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
conll2003.shape

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [6]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [7]:
conll2003["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [8]:
conll2003["train"].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

### Bert Tokenizer 

In [9]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [10]:
example_text = conll2003["train"][0]

tokenized_input = tokenizer(example_text['tokens'],is_split_into_words=True)

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

word_ids = tokenized_input.word_ids()

word_ids

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]

In [11]:
len(example_text['ner_tags']), len(tokenized_input["input_ids"])

# This is a classic example of the sub-token problem:
# the input_ids returned by the tokenizer are longer than ner_tags (the labels of our dataset)

(9, 11)

In [12]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

##### So `tokens` (the tokens behind `tokenized_input["input_ids"]`) has 11 entries: 2 extra special tokens, `[CLS]` and `[SEP]`, were added at the start and end,
##### but our `ner_tags` (the labels of our data) contain only 9 entries.

### The below function tokenize_and_align_labels does 2 jobs 

1. set -100 as the label for these special tokens and the subwords we wish to mask during training
2. mask the subword representations after the first subword 

#### then we align the labels with the token ids using the strategy we picked

In [13]:
def tokenize_and_align_labels(example, label_all_token=True):
    """Tokenize a batch of pre-split sentences and align the NER labels to the tokens.

    Relies on the module-level ``tokenizer``.

    Args:
        example: Batch mapping with a 'tokens' entry (one list of words per
            sentence) and an 'ner_tags' entry (one list of per-word label ids
            per sentence).
        label_all_token: If true, every sub-token of a word receives that
            word's label. Otherwise only the first sub-token is labelled and
            the remaining sub-tokens get -100.

    Returns:
        The tokenizer output with an added 'labels' entry holding one aligned
        label list per sentence in the batch.
    """
    tokenized_input = tokenizer(example['tokens'], truncation=True, is_split_into_words=True, padding=True)
    all_labels = []

    for i, labels in enumerate(example['ner_tags']):
        # word_ids() maps every token of sentence i back to the index of the
        # word it came from in the initial sentence (None for special tokens).
        word_ids = tokenized_input.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have no source word: label them -100 so they
                # are masked during training.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a word: use the word's label as it is.
                label_ids.append(labels[word_idx])
            else:
                # Subsequent sub-token of the same word: keep the word's label
                # only when label_all_token is set, otherwise mask with -100.
                label_ids.append(labels[word_idx] if label_all_token else -100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    # Attach the aligned labels for the whole batch only after every sentence
    # has been processed.
    tokenized_input['labels'] = all_labels
    return tokenized_input

In [14]:
def tokenize_and_align_labels(examples, label_all_token=True):
    """Tokenize a batch of word-split sentences and attach token-aligned NER labels.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with 'tokens' (one list of words per sentence)
            and 'ner_tags' (one list of per-word label ids per sentence).
        label_all_token: When true, every sub-token of a word carries the
            word's label; when false, only the first sub-token does and the
            rest are set to -100.

    Returns:
        The tokenizer output, extended with a 'labels' entry that holds one
        aligned label list per sentence.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, padding=True)

    aligned = []
    for batch_pos, word_tags in enumerate(examples['ner_tags']):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_pos):
            if current_word is None:
                # Token does not map to any word (special token).
                tag = -100
            elif current_word == last_word and not label_all_token:
                # Continuation sub-token that should be masked.
                tag = -100
            else:
                # First sub-token of a word, or a continuation that keeps the label.
                tag = word_tags[current_word]
            row.append(tag)
            last_word = current_word
        aligned.append(row)

    encoded['labels'] = aligned
    return encoded

In [15]:
q = tokenize_and_align_labels(conll2003['train'][4:5])
print(q)

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


##### now the labels are properly aligned with the tokens, including the special tokens

In [16]:
for tokens, label in zip(tokenizer.convert_ids_to_tokens(q['input_ids'][0]),q['labels'][0]):
    print(f"{tokens:_<40} {label}")

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [17]:
tokenized_dataset = conll2003.map(tokenize_and_align_labels, batched=True)

In [18]:
! pip install torch torchvision torchaudio


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [19]:
import torch
print(torch.__version__)

2.4.1


In [20]:
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased',num_labels=9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
! pip install --upgrade accelerate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [22]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    'test-ner',
    eval_strategy= "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [23]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [26]:
# ! pip install evaluate

import evaluate

metric = evaluate.load('seqeval')

example = conll2003['train'][0]

In [27]:
label_list = conll2003['train'].features['ner_tags'].feature.names

label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

## Now let's calculate the metric on a single example, i.e. the variable `example`

In [28]:
labels =  [label_list[i] for i in example['ner_tags']]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [29]:
metric.compute(predictions=[labels],references=[labels])

{'MISC': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'ORG': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(1.0),
 'overall_f1': np.float64(1.0),
 'overall_accuracy': 1.0}

#### Compute Metrics 

In [None]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis = 2)
    prediction  = 

* O means the word doesn't correspond to any entity 
* B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity
* B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity
* B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity
* B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity