# Named Entity Recognition (HuggingFace)

Reference:  https://huggingface.co/learn

## 1. Load the data

CoNLL-2003 dataset

In [4]:
from datasets import load_dataset
# Load the CoNLL-2003 dataset; the result is a DatasetDict with train / validation / test splits.
raw_datasets = load_dataset('conll2003')

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [7]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [8]:
raw_datasets["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [10]:
# The ner_tags column stores integer class ids; pull the matching label strings
# out of the ClassLabel feature so ids can be mapped back to names (index == id).
ner_features = raw_datasets["train"].features["ner_tags"]
label_names  = ner_features.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

## 2. Preprocessing

Tokenization (numericalization), aligning labels

In [11]:
from transformers import AutoTokenizer

# Pretrained checkpoint used for both the tokenizer (here) and the model (section 4).
checkpoint = "bert-base-cased"
tokenizer  = AutoTokenizer.from_pretrained(checkpoint)

In [13]:
tokenizer("Deep learning is fun!")

{'input_ids': [101, 7786, 3776, 1110, 4106, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [14]:
tokenizer.decode([101, 7786, 3776, 1110, 4106, 106, 102])

'[CLS] Deep learning is fun! [SEP]'

In [15]:
# True when the loaded tokenizer is a "fast" HuggingFace tokenizer (a BertTokenizerFast in this notebook).
tokenizer.is_fast

True

In [16]:
tokens = raw_datasets["train"][0]["tokens"]

In [17]:
tokens

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [18]:
# `tokens` is already a list of words, so tell the tokenizer not to split on whitespace itself.
# A word may still be broken into several sub-word tokens (e.g. 'lamb' -> 'la', '##mb').
inputs = tokenizer(tokens, is_split_into_words=True)
inputs

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [21]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [22]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [23]:
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [24]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level NER labels to one label per sub-word token.

    Args:
        labels: label ids, one per original word.
        word_ids: for every token, the index of the word it came from, or
            None, e.g. [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None].

    Returns:
        A list with one entry per element of ``word_ids``:
        * -100 where the word id is None (the index marked as "ignored"),
        * the word's label for the first token of a word,
        * for later tokens of the same word, the word's label with odd ids
          bumped by one (B-XXX -> I-XXX in this notebook's label list).
    """
    aligned = []
    previous_word = None

    for word_id in word_ids:
        if word_id is None:
            # No source word for this token: mark the position as ignored.
            previous_word = None
            aligned.append(-100)
            continue

        label = labels[word_id]
        if word_id == previous_word:
            # Continuation of the previous word: a B-XXX id (odd) becomes I-XXX.
            if label % 2 == 1:
                label += 1
        else:
            # First token of a new word keeps the word's label unchanged.
            previous_word = word_id
        aligned.append(label)

    return aligned

In [25]:
labels = raw_datasets["train"][0]["ner_tags"]
labels

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [26]:
word_ids = inputs.word_ids()
word_ids

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [28]:
print(align_labels_with_tokens(labels, word_ids))

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [29]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Reads ``examples["tokens"]`` and ``examples["ner_tags"]``, tokenizes the
    words with truncation enabled, and stores the labels aligned to the
    resulting tokens under the ``'labels'`` key of the tokenizer output,
    which is returned.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # word_ids(idx) gives the token -> word mapping for the idx-th sentence of the batch.
    encoded['labels'] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(idx))
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [30]:
# Apply the preprocessing to every split in batches, and drop the original
# columns so only the tokenizer outputs and the aligned 'labels' remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [31]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [35]:
tokenized_datasets["train"][1]['input_ids']

[101, 1943, 14428, 102]

In [36]:
tokenizer.decode(tokenized_datasets["train"][1]['input_ids'])

'[CLS] Peter Blackburn [SEP]'

In [37]:
tokenized_datasets["train"][1]['labels']

[-100, 1, 2, -100]

## 3. Dataloader

In [38]:
from transformers import DataCollatorForTokenClassification

# Collator that pads every sequence in a batch to the same length;
# padded label positions are filled with -100, padded input ids with 0.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [41]:
test = [tokenized_datasets["train"][i] for i in range(2)]

In [42]:
test

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'token_type_ids': [0, 0, 0, 0],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]}]

In [43]:
data_collator(test)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [44]:
from torch.utils.data import DataLoader

# Batches of 8, padded per batch by the collator; only the training loader is shuffled.
train_loader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=data_collator, batch_size=8)
val_loader   = DataLoader(tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8)

## 4. Model

The second part of the Pipeline

In [45]:
# Mappings between integer class ids and label strings, stored in the model config.
# The ids are kept as ints (not strings): id2label is {int: str} and label2id is
# {str: int}, so both directions agree with the integer class ids used in `labels`.
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [46]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head and pass in the
# label mappings. The classifier weights are newly initialized, so the model must
# be fine-tuned before it is useful.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5. Metrics

We need to define `compute_metrics()`, which takes lists of predictions and labels and returns a dictionary with the metric names and values.

Note: `pip install seqeval`

In [47]:
import evaluate

# seqeval scores whole entities from lists of label strings (not integer ids).
metric = evaluate.load("seqeval")

In [48]:
labels = raw_datasets["train"][0]["ner_tags"]

In [50]:
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [51]:
labels2 = labels.copy()
labels2[2] = 'B-ORG'
labels2

['B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [52]:
metric.compute(predictions=[labels2], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 0.5,
  'recall': 1.0,
  'f1': 0.6666666666666666,
  'number': 1},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.8888888888888888}

## 6. Optimizer

In [45]:
from torch.optim import AdamW

# AdamW = Adam with decoupled weight decay; the learning-rate decay itself
# comes from the scheduler created in section 8.
optimizer = AdamW(model.parameters(), lr=2e-5)

## 7. Accelerator

Normally you would write a plain training loop for a single device.

HuggingFace provides a wrapper called `Accelerator`, which lets the same
loop use your available hardware resources in parallel.

In [46]:
from accelerate import Accelerator

In [47]:
accelerator = Accelerator()

# Hand the model, optimizer and both dataloaders to the accelerator; the
# returned (prepared) objects replace the originals from here on.
model, optimizer, train_loader, val_loader = \
    accelerator.prepare(model, optimizer, train_loader, val_loader)

## 8. Learning rate scheduler

In [48]:
from transformers import get_scheduler

# Total number of optimizer steps = epochs * batches per epoch.
# len(train_loader) is read from the dataloader returned by accelerator.prepare().
num_train_epochs = 1
num_update_steps_per_epoch = len(train_loader)
num_train_steps = num_train_epochs * num_update_steps_per_epoch

# Linear learning-rate schedule with no warmup, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps)

## 9. Repository

A repository is free cloud storage hosted by HuggingFace.

It is very useful because the model can be uploaded to HuggingFace at
regular intervals (here, after every epoch). If something suddenly crashes,
you can resume, because your weights have already been pushed to the HuggingFace repo.

In [49]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [50]:
from huggingface_hub import Repository, get_full_repo_name

# Build the full Hub repo id ("<namespace>/<model_name>") for the logged-in account.
model_name = "bert-finetuned-ner-accelerate"
repo_name  = get_full_repo_name(model_name)
repo_name

'Chaklam/bert-finetuned-ner-accelerate'

In [51]:
# git-lfs install hints (presumably required by Repository -- confirm):
#   sudo apt install git-lfs
#   brew install git-lfs
#   or download it from the git-lfs website

import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Local folder that is a clone of the Hub repo; the training loop saves the model
# and tokenizer into it and pushes from it.
output_dir = "bert-finetuned-ner-accelerate"
repo       = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/Users/chaklam/Github/Natural-Language-Processing/Code/04 - Huggingface/code-along/bert-finetuned-ner-accelerate is already a clone of https://huggingface.co/Chaklam/bert-finetuned-ner-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


## 10. Training

In [52]:
#convert predictions and labels into strings, like 
#what our metric object expects

def postprocess(predictions, labels):
    """Turn batched prediction / label id tensors into lists of label strings.

    Both tensors are detached, moved to CPU and converted to numpy. Every
    position whose label is -100 is dropped from both outputs; the remaining
    ids are looked up in ``label_names``.

    Returns:
        (true_labels, true_predictions) -- note the order: labels first.
    """
    pred_ids  = predictions.detach().cpu().clone().numpy()
    label_ids = labels.detach().cpu().clone().numpy()

    # Reference labels: keep every position that is not the ignore index.
    true_labels = []
    for label_row in label_ids:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predictions: keep exactly the positions kept for the labels.
    true_predictions = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        kept = []
        for pred, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(label_names[pred])
        true_predictions.append(kept)

    return true_labels, true_predictions
    

In [53]:
from tqdm.auto import tqdm #progress bar
import torch

progress_bar = tqdm(range(num_train_steps))

for epoch in range(num_train_epochs):
    # ---- training ----
    model.train()
    for batch in train_loader:
        outputs = model(**batch)  # ** because the batch is a dict of keyword inputs (input_ids=..., labels=...)
        loss    = outputs.loss
        accelerator.backward(loss)  # used in place of loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- evaluation ----
    model.eval()  # evaluation mode (e.g. dropout is disabled)
    for batch in val_loader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels      = batch["labels"]

        # Pad predictions and labels to the same length across processes before gathering.
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels      = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered      = accelerator.gather(labels)

        # postprocess() returns (true_labels, true_predictions) in that order.
        # Unpacking them the other way round would feed the references in as
        # predictions and swap the reported precision and recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()

    print(
        f"epoch {epoch}: ",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        }
    )

    # ---- save and upload the model ----
    accelerator.wait_for_everyone()  # synchronize all processes before saving
    unwrapped_model = accelerator.unwrap_model(model)  # get the underlying model back from the prepared one
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

    # Only the main process writes the tokenizer and pushes to the Hub.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
    

  0%|          | 0/1756 [00:00<?, ?it/s]

## 11. Inference!!!

In [1]:
from transformers import pipeline

# Load the fine-tuned model that was pushed to the Hub during training.
checkpoint = "Chaklam/bert-finetuned-ner-accelerate"

# aggregation_strategy="simple" makes the pipeline return grouped entities
# ('entity_group' with character 'start'/'end' offsets) rather than one result per token.
clf        = pipeline("token-classification", model=checkpoint, aggregation_strategy="simple")

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [2]:
clf("Ayush and Chaklam are going to play soccer today at AIT, Bangkok, Thailand and eat some snacks")

[{'entity_group': 'PER',
  'score': 0.9725842,
  'word': 'Ayush',
  'start': 0,
  'end': 5},
 {'entity_group': 'PER',
  'score': 0.98298603,
  'word': 'Chaklam',
  'start': 10,
  'end': 17},
 {'entity_group': 'LOC',
  'score': 0.7392455,
  'word': 'AIT',
  'start': 52,
  'end': 55},
 {'entity_group': 'LOC',
  'score': 0.99720275,
  'word': 'Bangkok',
  'start': 57,
  'end': 64},
 {'entity_group': 'LOC',
  'score': 0.9977047,
  'word': 'Thailand',
  'start': 66,
  'end': 74}]