# Token Classification (HuggingFace)
- NER, POS tagging, chunking (which tokens belong to the same entity)

## 1. Load the data

CoNLL-2003 dataset

In [69]:
#uncomment this if you are not using our department puffer
# import os
# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'

In [70]:
from datasets import load_dataset

raw_datasets = load_dataset('conll2003')

Found cached dataset conll2003 (C:/Users/Guntsv/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [71]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [72]:
raw_datasets['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [73]:
raw_datasets['train'][0]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [74]:
raw_datasets['train'][0]['pos_tags']

[22, 42, 16, 21, 35, 37, 16, 21, 7]

In [75]:
raw_datasets['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [76]:
# `ner_tags` is a Sequence of ClassLabel, so the tag strings live on the inner feature;
# the position of a name in this list is the integer id stored in the dataset.
ner_features = raw_datasets['train'].features['ner_tags']
label_names = ner_features.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

## 2. Preprocessing

Tokenization (numericalization), aligning labels

In [77]:
from transformers import AutoTokenizer
# The same checkpoint name is reused further down to load the model that is fine-tuned,
# so the tokenizer and the model come from the same pretrained checkpoint.
checkpoint = 'bert-base-cased'
tokenizer  = AutoTokenizer.from_pretrained(checkpoint)

In [78]:
tokenizer("Chaky love deep learning")

{'input_ids': [101, 24705, 3781, 1567, 1996, 3776, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [79]:
tokenizer.decode([101, 24705, 3781, 1567, 1996, 3776, 102])

'[CLS] Chaky love deep learning [SEP]'

In [80]:
tokenizer.is_fast #True when the tokenizer uses HuggingFace's optimized ("fast")
#implementation; the word_ids() mapping used below for label alignment relies on it

True

In [81]:
tokens = raw_datasets['train'][0]['tokens']
tokens

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [82]:
#Be aware that our input is already split into words, so we pass
#is_split_into_words=True and let the tokenizer only do the sub-word splitting.

inputs = tokenizer(tokens, is_split_into_words=True)
inputs

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [83]:
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [84]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [85]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per sub-word token.

    labels   : list of integer tag ids, one per original word.
    word_ids : for every token, the index of the word it came from, or None
               (as seen above for the [CLS] / [SEP] positions).

    Returns a list as long as word_ids:
      * tokens whose word id is None get -100, the index HuggingFace ignores;
      * the first token of a word keeps the word's label;
      * every further token of the same word gets the word's label, with odd
        ids bumped by one (in this notebook's label list that turns B-XXX
        into the matching I-XXX).
    """
    aligned = []
    previous_word = None

    for word_id in word_ids:
        if word_id is None:
            #no source word: ignored by the loss, and it also ends the current word
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_id]
        if word_id == previous_word:
            #continuation piece of the same word: odd id (B-) -> next id (I-)
            if tag % 2 == 1:
                tag += 1
        else:
            #first piece of a new word keeps the label unchanged
            previous_word = word_id
        aligned.append(tag)

    return aligned

In [86]:
labels = raw_datasets['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels,word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [87]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    examples is a batch with the columns 'tokens' and 'ner_tags'.
    Returns the tokenizer output with an extra 'labels' entry that holds, for
    each sentence, the result of align_labels_with_tokens.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    #word_ids(idx) is the token -> word mapping of the idx-th sentence in the batch
    encoded['labels'] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(idx))
        for idx, word_labels in enumerate(examples['ner_tags'])
    ]
    return encoded

In [88]:
#Preprocess every split in batches; the original columns are dropped so that only
#the tokenizer outputs and the aligned labels remain.
tokenized_datasets = raw_datasets.map(
    function=tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
)

Loading cached processed dataset at C:\Users\Guntsv\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-3c07aff338c7bbaa.arrow


  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\Guntsv\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-be0a42cc0b608f54.arrow


In [89]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [90]:
tokenized_datasets['train'][0]['input_ids']

[101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102]

In [91]:
tokenizer.decode(tokenized_datasets['train'][0]['input_ids'])

'[CLS] EU rejects German call to boycott British lamb. [SEP]'

In [92]:
tokenized_datasets['train'][0]['token_type_ids']

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [93]:
tokenized_datasets['train'][0]['attention_mask']

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [94]:
tokenized_datasets['train'][0]['labels']

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [95]:
from transformers import DataCollatorForTokenClassification
#huggingface is very kind to make a data collator for each pipeline;
#as the example below shows, this one pads the inputs and the labels (with -100) of a batch

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [96]:
test = [tokenized_datasets['train'][i] for i in range(2)]

In [97]:
data_collator(test)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
            119,   102],
         [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
              0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
         [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

## 3. Dataloader

In [98]:
from torch.utils.data import DataLoader

#Only the training split is shuffled; every split uses the same collator and batch size.
train_loader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
val_loader = DataLoader(
    tokenized_datasets['validation'],
    collate_fn=data_collator,
    batch_size=8,
)
test_loader = DataLoader(
    tokenized_datasets['test'],
    collate_fn=data_collator,
    batch_size=8,
)

In [99]:
# for batch in train_loader:
#     print(batch)
#     break

## 4. Model
The second part of the Pipeline

In [100]:
#id2label maps the integer class index to its tag name and label2id is the exact
#inverse (tag name -> integer index). Building the keys with str(i) would leave
#string ids such as "0" as the values of label2id.
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [101]:
from transformers import AutoModelForTokenClassification
#Loads the pretrained checkpoint as a token-classification model. Some of its weights are
#newly initialised (see the warning below); note that the optimizer further down is given
#model.parameters(), i.e. every weight is fine-tuned, not only the new ones.

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, id2label = id2label, label2id= label2id
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [102]:
model.config.num_labels

9

## 5. Metrics

We need to define `compute_metrics()`, which takes the lists of predictions and labels and returns a dictionary with the metric names and values.

In [103]:
# !pip install seqeval

In [104]:
import evaluate

metric = evaluate.load('seqeval')

In [105]:
labels = raw_datasets['train'][0]['ner_tags']
labels

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [106]:
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [107]:
#metric.compute
#let's create a fake prediction by perturbing the labels: change the value at index 2
pred = labels.copy()
pred[2] = 'B-ORG'
pred

['B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [108]:
metric.compute(predictions=[pred], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 0.5,
  'recall': 1.0,
  'f1': 0.6666666666666666,
  'number': 1},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.8888888888888888}

In [109]:
# import numpy as np

# def compute_metric(eval_preds):
#     logits, labels = eval_preds
#     pred           = np.argmax(logits,axis = -1)

#     #Remove ignore index (special tokens) and convert to labels
#     true_labels = [
#         [label_names[l] for l in label if l != -100] 
#         for label in labels
#     ]
#     true_pred   = [
#         [label_names[p] for (p,l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(pred, labels)
#     ]
#     all_metrics = metric.compute(predictions=true_pred, references=true_labels)

#     return {
#         'precision' : all_metrics['overall_precision'],
#         'recall'    : all_metrics['overall_recall'],
#         'f1'        : all_metrics['overall_f1'],
#         'accuracy'  : all_metrics['overall_accuracy']
#     }

## 6. Optimizer

In [110]:
from torch.optim import AdamW

#AdamW = Adam with (decoupled) weight decay; model.parameters() hands it every weight of the model
optimizer = AdamW(model.parameters(),lr=2e-5)

## 7. Accelerator 

So usually, you just train right...

But HuggingFace provides a wrapper called `Accelerator`,
which lets the same training loop use whatever hardware is available (e.g. several GPUs in parallel).

In [111]:
from accelerate import Accelerator

#prepare() returns wrapped versions of the objects, in the order they were passed in.
#The wrapped optimizer must be bound back to `optimizer` (the name used by the
#scheduler and the training loop); a misspelled target would leave those using
#the unwrapped optimizer.
accelerator = Accelerator()
model, optimizer, train_loader, eval_loader = \
    accelerator.prepare(model,optimizer,train_loader,val_loader)

## 8. Learning rate scheduler

In [112]:
from transformers import get_scheduler

#Total number of optimizer steps = batches per epoch x number of epochs
num_train_epochs = 1
num_update_step_per_epoch = len(train_loader)
num_train_steps = num_update_step_per_epoch * num_train_epochs

#'linear' schedule over the whole run, without warm-up steps
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

## 9. Repository 

Repository is like a free-cloud space, hosted by HuggingFace.

It is very useful because, at regular intervals (here, after every epoch), your model is uploaded 
to the HuggingFace Hub. If something suddenly crashes, 
you can resume, because your weights have been pushed to the HuggingFace repo.

In [113]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [114]:
!git config --global credential.helper

In [115]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "bert-fintuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'guntsv/bert-fintuned-ner-accelerate'

In [116]:
#git-lfs has to be installed first, e.g.:
#  sudo apt install git-lfs
#  brew install git-lfs
#  or download it from the git-lfs website
import os
os.environ["TOKENIZERS_PARALLELISM"] = 'true'

#Local folder that is a clone of the Hub repo; the training loop saves into it and pushes from it
output_dir = "bert-fintuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

c:\Users\Guntsv\Documents\GitHub\DSAI-AIT-2022\Course\Natural Language Understanding\Labolatory - Code Along\04 - Huggingface\code-along\bert-fintuned-ner-accelerate is already a clone of https://huggingface.co/guntsv/bert-fintuned-ner-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


## 10. Training

In [117]:
#convert predictions and labels into lists of tag strings,
#which is what our metric object expects

def postprocess(predictions, labels):
    """Turn id tensors into lists of tag strings for the metric.

    predictions, labels : tensors that are iterated row by row
                          (presumably one row per sentence -- confirm with caller).
    Positions whose label is -100 are dropped from both outputs.

    Returns (true_labels, true_predictions) -- labels first, predictions second.
    """
    pred_rows  = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    true_predictions = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        #the label decides which positions are kept, the prediction supplies the tag
        kept = [label_names[p] for (p, l) in zip(pred_row, label_row) if l != -100]
        true_predictions.append(kept)

    return true_labels, true_predictions

In [118]:
from tqdm.auto import tqdm #progress bar
import torch

progress_bar = tqdm(range(num_train_steps))

for epoch in range(num_train_epochs):
    #---- training ----
    model.train()
    for batch in train_loader:
        outputs = model(**batch) #** because the batch is a dict of keyword inputs (input_ids = ....   )
        loss = outputs.loss
        accelerator.backward(loss) #instead of loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    #---- evaluation ----
    model.eval() #switch layers such as dropout to inference behaviour
    #iterate the loader returned by accelerator.prepare(), not the raw val_loader
    for batch in eval_loader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim = -1)
        labels      = batch["labels"] #the collator stores the targets under 'labels'

        #necessary to pad predictions and labels to same length... if not.... crash
        predictions = accelerator.pad_across_processes(predictions, dim = -1 , pad_index= -100)
        labels      = accelerator.pad_across_processes(labels, dim = -1 , pad_index= -100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered      = accelerator.gather(labels)

        #postprocess returns (true_labels, true_predictions) in that order; unpacking
        #them the other way round would swap precision and recall
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions = true_predictions, references = true_labels)

    results = metric.compute()

    print(
        f"epoch {epoch}",
        {
            key: results[f'overall_{key}']
            for key in ['precision','recall','f1','accuracy']
        }
    )

    #save and upload your model
    accelerator.wait_for_everyone() #many processes
    unwrapped_model = accelerator.unwrap_model(model) #get the plain model back out of the accelerator wrapper
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(commit_message=f'Training in progress epoch {epoch}', blocking=False)

  0%|          | 0/1756 [00:00<?, ?it/s]

KeyboardInterrupt: 

## 11. Inference!!!

In [None]:
from transformers import pipeline

#NOTE: this rebinds `checkpoint`, which earlier held the base model name 'bert-base-cased';
#it now names the Hub repo that the training loop pushes the fine-tuned model to.
checkpoint = 'guntsv/bert-fintuned-ner-accelerate'

clf = pipeline('token-classification', model=checkpoint, aggregation_strategy = 'simple')

In [None]:
clf("Ayush and Chaky are going to play soccer today at AIT, Bangkok")