# Token Classification (HuggingFace)

- NER, POS Tagging, Chunking (which tokens belong to the same entity)

## 1. Load the data

CoNLL-2003 dataset

In [1]:
import os

# Point both the HTTP and HTTPS proxy environment variables at the same proxy.
# NOTE(review): presumably needed so the downloads below work from behind this
# proxy -- adjust or remove when running on another network.
os.environ.update({
    'http_proxy':  'http://192.41.170.23:3128',
    'https_proxy': 'http://192.41.170.23:3128',
})

In [2]:
from datasets import load_dataset

# Load CoNLL-2003; the result is a DatasetDict with train / validation / test splits.
raw_datasets = load_dataset("conll2003")

Found cached dataset conll2003 (/home/chaklams/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [5]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [6]:
raw_datasets["train"][0]["pos_tags"]

[22, 42, 16, 21, 35, 37, 16, 21, 7]

In [7]:
raw_datasets["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [8]:
raw_datasets["train"].features["pos_tags"]

Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None)

In [10]:
# The NER column is a Sequence of ClassLabel; its `names` list maps each integer
# tag id (the list index) to its tag string, e.g. 3 -> 'B-ORG'.
ner_features = raw_datasets["train"].features["ner_tags"]
label_names  = ner_features.feature.names

In [11]:
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

## 2. Preprocessing

Tokenization (numericalization), aligning labels

In [12]:
from transformers import AutoTokenizer

# Cased checkpoint: the tokenizer keeps capitalisation (see 'EU', 'German' in the
# tokenized output further down).
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [13]:
tokenizer("Chaky loves deep learning")

{'input_ids': [101, 24705, 3781, 7871, 1996, 3776, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [14]:
tokenizer.decode([101, 24705, 3781, 7871, 1996, 3776, 102])

'[CLS] Chaky loves deep learning [SEP]'

In [15]:
tokenizer.is_fast  # True when the tokenizer is a "fast" one backed by the Rust `tokenizers` library;
# word_ids(), used below to align labels with sub-word tokens, is only available on fast tokenizers

True

In [17]:
tokens = raw_datasets["train"][0]["tokens"]
tokens

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [20]:
# Be aware that our inputs are already split into words, so we tell the
# tokenizer to treat each list item as one word (is_split_into_words=True).

inputs = tokenizer(tokens, is_split_into_words=True)
inputs

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [23]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [24]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [26]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer token.

    Args:
        labels: label ids, one per original word.
        word_ids: for each token, the index of the word it came from, or
            None for tokens that belong to no word (e.g. [CLS] / [SEP]).

    Returns:
        A list as long as ``word_ids``: -100 for tokens without a word, the
        word's label for the first token of a word, and for the following
        tokens of the same word the label with odd ids bumped by one (with the
        CoNLL-2003 label order this turns B-XXX into the matching I-XXX).
    """
    aligned = []
    previous_word = None

    for word_id in word_ids:
        if word_id is None:
            # Token without a source word: -100 is the index the loss ignores.
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label as-is.
            aligned.append(labels[word_id])
        else:
            # Continuation of the previous word: odd (B-) ids become the next (I-) id.
            tag = labels[word_id]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_id

    return aligned

In [27]:
# Compare the word-level tags of the first training sentence with the
# token-level tags produced by the alignment helper.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels, align_labels_with_tokens(labels, word_ids), sep="\n")

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [30]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: a batch with a "tokens" entry (one word list per sentence)
            and an "ner_tags" entry (one label-id list per sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        holds, per sentence, the labels aligned to the produced tokens.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # word_ids(idx) gives the token -> word mapping of the idx-th sentence in the batch.
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(idx))
        for idx, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [32]:
# Preprocess every split in batches; the original columns are removed so only
# the tokenizer outputs and the aligned "labels" remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/15 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/chaklams/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-b255492edbda1b2a.arrow
Loading cached processed dataset at /home/chaklams/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-68b2047c122dc838.arrow


In [33]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [34]:
tokenized_datasets["train"][0]['input_ids']

[101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102]

In [35]:
tokenizer.decode(tokenized_datasets["train"][0]['input_ids'])

'[CLS] EU rejects German call to boycott British lamb. [SEP]'

In [36]:
tokenized_datasets["train"][0]['token_type_ids']

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [37]:
tokenized_datasets["train"][0]['attention_mask']

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [38]:
tokenized_datasets["train"][0]['labels']

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

## 3. Dataloader

In [42]:
from transformers import DataCollatorForTokenClassification
# Collator that pads every example in a batch to the same length; the labels are
# padded as well, with -100 (see the sample batch below).

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [44]:
test = [tokenized_datasets["train"][i] for i in range(2)]

In [45]:
data_collator(test)

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
            119,   102],
         [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
              0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
         [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [47]:
from torch.utils.data import DataLoader

# Only the training loader shuffles; the validation loader keeps dataset order.
# Both pad per batch through the token-classification collator.
train_loader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
val_loader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [49]:
# for batch in train_loader:
#     print(batch)
#     break

## 4. Model

The second part of the pipeline: a pretrained BERT encoder with a token-classification head on top.

In [None]:
from transformers import AutoModelForTokenClassification

# Map class indices <-> tag names so the model config carries the NER tag names.
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

# Auto classes cannot be instantiated directly -- they must be built with
# from_pretrained() (or from_config()). This loads the pretrained encoder from
# `checkpoint` and adds a newly initialised token-classification head with one
# output per entry in id2label. All weights are trainable unless frozen explicitly.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, id2label=id2label, label2id=label2id)