### Lecture 15
Fine-tuning BERT for NER. 

Code adapted from: https://huggingface.co/docs/transformers/en/tasks/token_classification

In [1]:
#!pip install seqeval

In [2]:
import numpy as np 
import torch 
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForTokenClassification, 
                          TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline)
import evaluate

### 1. Dataset

In [3]:
# Dataset with labeled NER BIO tags; later cells read its "train" and "validation" splits.
# NOTE: trust_remote_code=True allows the dataset's own loading script to run locally,
# so only enable it for dataset sources you trust.
wnut = load_dataset("wnut_17", trust_remote_code=True)

In [4]:
# Names of every tag a token can receive; a tag's position in this list is its integer id
ner_feature = wnut["train"].features["ner_tags"]
label_list = ner_feature.feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [5]:
# Transform tag names into integer classes (y=0, y=1, y=2, ...) and keep the reverse lookup too.
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = dict(enumerate(label_list))

In [6]:
label2id

{'O': 0,
 'B-corporation': 1,
 'I-corporation': 2,
 'B-creative-work': 3,
 'I-creative-work': 4,
 'B-group': 5,
 'I-group': 6,
 'B-location': 7,
 'I-location': 8,
 'B-person': 9,
 'I-person': 10,
 'B-product': 11,
 'I-product': 12}

In [7]:
# Example with the named entity "Rick and Morty"
wnut["validation"][2]

{'id': '2',
 'tokens': ['All',
  'I',
  "'",
  've',
  'been',
  'doing',
  'is',
  'BINGE',
  'watching',
  'Rick',
  'and',
  'Morty',
  '😂'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0]}

### 2. Tokenization and pre-processing

We'll load a smaller "distilled" BERT model. 

In [8]:
# Checkpoint id; the same name is reused when the model is loaded so tokenizer and model match.
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)

In [9]:
# Here's how the tokenizer works: the example is already a list of words
# (hence is_split_into_words=True), and we convert the resulting ids back to
# token strings to see the pieces the model will actually receive.
example = wnut["validation"][2]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'all',
 'i',
 "'",
 've',
 'been',
 'doing',
 'is',
 'bing',
 '##e',
 'watching',
 'rick',
 'and',
 'mort',
 '##y',
 '[UNK]',
 '[SEP]']

In [10]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align their word-level tags to tokens.

    Args:
        examples: a batch with a "tokens" entry (one list of words per sentence)
            and a "ner_tags" entry (one list of integer tags per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one label list per sentence. Only the first token of each word carries
        that word's tag; special tokens and the remaining pieces of a word are
        set to -100 (compute_metrics drops those positions before scoring).
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for sentence_idx, word_tags in enumerate(examples["ner_tags"]):
        # For each token, the index of the word it came from (None for special tokens).
        word_ids = encoded.word_ids(batch_index=sentence_idx)
        # Shift by one so every token can be compared with the token before it.
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            -100 if (current is None or current == preceding) else word_tags[current]
            for current, preceding in zip(word_ids, preceding_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded

In [11]:
# Tokenize and align the dataset; batched=True hands the function a batch of examples per call.
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

In [12]:
# Class to help us create batches (handed to the Trainer as its data_collator)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### 3. Model & training

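We'll now load the same "distilled" BERT checkpoint as a model, this time with a token-classification head on top. The head's weights are newly initialized, so they must be learned during fine-tuning.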

In [13]:
model_name

'distilbert/distilbert-base-uncased'

In [14]:
# Load the pretrained checkpoint with a token-classification head sized to our tag set.
# The id <-> label maps are passed along so predictions can be reported as tag names.
model = AutoModelForTokenClassification.from_pretrained(model_name, 
                                                        num_labels=len(label2id), 
                                                        id2label=id2label, 
                                                        label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Where do I specify the optimizer?! 
# (No optimizer is passed here, so the Trainer uses its default; see the `optim` argument.)
training_args = TrainingArguments(
    output_dir="saved_models",      # where to save the trained model checkpoints
    learning_rate=2e-5,             # initial learning rate
    per_device_train_batch_size=50, # examples per device in each training batch
    per_device_eval_batch_size=50,  # examples per device in each evaluation batch
    num_train_epochs=2,             # epoch = 1 time thru the training data 
    eval_strategy="epoch",          # evaluates on the dev set each epoch
    save_strategy="epoch",          # saves a checkpoint of the model each epoch
)

What metrics do we want? 

In [22]:
seqeval = evaluate.load("seqeval") # Entity-level precision, recall, F1 (plus token-level accuracy)

def compute_metrics(p):
    """Turn raw model outputs into overall precision, recall, F1 and accuracy.

    Args:
        p: a (predictions, labels) pair. The predicted class is taken as the
            argmax over axis 2 of predictions, so it is assumed to be shaped
            (batch, sequence, num_labels) -- TODO confirm. labels holds integer
            tag ids, with -100 at positions that should not be scored.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy", copied from the
        "overall_*" entries that seqeval reports.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions carrying a real label; -100 marks positions to ignore.
        scored = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in scored])
        true_labels.append([label_list[gold] for _, gold in scored])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [23]:
# Bundle the model, hyperparameters, data splits, batching and metric function into one trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["validation"],  # the dev set scored by compute_metrics
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [24]:
# Takes about 20 minutes on Katie's Apple M1 CPUs... 
# Runs fine-tuning; the table below reports the compute_metrics values on the dev set per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.24108,0.604853,0.417464,0.493984,0.945769
2,No log,0.233566,0.652406,0.437799,0.52398,0.947613


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=136, training_loss=0.13805177632500143, metrics={'train_runtime': 1159.4324, 'train_samples_per_second': 5.855, 'train_steps_per_second': 0.117, 'total_flos': 102032801108760.0, 'train_loss': 0.13805177632500143, 'epoch': 2.0})

In [25]:
# Save the final model and tokenizer 
# Both are written to the same directory so they can be reloaded together from one path.
model.save_pretrained("saved_models")
tokenizer.save_pretrained("saved_models")

('saved_models/tokenizer_config.json',
 'saved_models/special_tokens_map.json',
 'saved_models/vocab.txt',
 'saved_models/added_tokens.json',
 'saved_models/tokenizer.json')

### 4. Test-time inference 

In [26]:
# Build an NER inference pipeline from the fine-tuned model and tokenizer saved on disk.
classifier = pipeline("ner", model="saved_models", tokenizer="saved_models")

In [27]:
classifier("Katie Keith is a professor at Williams College!")

[{'entity': 'B-person',
  'score': 0.7615155,
  'index': 1,
  'word': 'katie',
  'start': 0,
  'end': 5},
 {'entity': 'I-person',
  'score': 0.5479589,
  'index': 2,
  'word': 'keith',
  'start': 6,
  'end': 11},
 {'entity': 'B-location',
  'score': 0.27801132,
  'index': 7,
  'word': 'williams',
  'start': 30,
  'end': 38},
 {'entity': 'I-location',
  'score': 0.24160397,
  'index': 8,
  'word': 'college',
  'start': 39,
  'end': 46}]