In [1]:
import numpy as np
from tqdm.auto import tqdm
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    get_scheduler,
    pipeline
)
from accelerate import Accelerator
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

In [2]:
raw_datasets = load_dataset("conll2003")

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

# Investigate data

In [4]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [5]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

## Label meanings

In [6]:
raw_datasets["train"].features["ner_tags"].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

## Match labels to text

In [7]:
def pp_words_and_labels(words, labels, label_names, max_line=80):
    line1 = ""
    line2 = ""
    
    for word, label in zip(words, labels):
        full_label = label_names[label]
        max_length = max(len(word), len(full_label))
        line1 += word + " " * (max_length - len(word) + 1)
        line2 += full_label + " " * (max_length - len(full_label) + 1)
        if len(line1) > max_line:
            print(line1)
            print(line2)
            line1 = ""
            line2 = ""
    print(line1)
    print(line2)

### NER labels

In [8]:
pp_words_and_labels(
    words=raw_datasets["train"][0]["tokens"],
    labels=raw_datasets["train"][0]["ner_tags"],
    label_names=raw_datasets["train"].features["ner_tags"].feature.names
)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


In [9]:
pp_words_and_labels(
    words=raw_datasets["train"][4]["tokens"],
    labels=raw_datasets["train"][4]["ner_tags"],
    label_names=raw_datasets["train"].features["ner_tags"].feature.names,
    max_line=110
)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         
should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


### POS labels

In [10]:
pp_words_and_labels(
    raw_datasets["train"][0]["tokens"],
    raw_datasets["train"][0]["pos_tags"],
    raw_datasets["train"].features["pos_tags"].feature.names,
)

EU  rejects German call to boycott British lamb . 
NNP VBZ     JJ     NN   TO VB      JJ      NN   . 


In [11]:
pp_words_and_labels(
    raw_datasets["train"][4]["tokens"],
    raw_datasets["train"][4]["pos_tags"],
    raw_datasets["train"].features["pos_tags"].feature.names,
    max_line=110
)

Germany 's  representative to the European Union 's  veterinary committee Werner Zwingmann said on Wednesday consumers 
NNP     POS NN             TO DT  NNP      NNP   POS JJ         NN        NNP    NNP       VBD  IN NNP       NNS       
should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
MD     VB  NN        IN   NNS       JJ    IN   NNP     IN    DT  JJ         NN     VBD JJR     . 


### Chunking labels

In [12]:
pp_words_and_labels(
    raw_datasets["train"][0]["tokens"],
    raw_datasets["train"][0]["chunk_tags"],
    raw_datasets["train"].features["chunk_tags"].feature.names,
)

EU   rejects German call to   boycott British lamb . 
B-NP B-VP    B-NP   I-NP B-VP I-VP    B-NP    I-NP O 


In [13]:
pp_words_and_labels(
    raw_datasets["train"][4]["tokens"],
    raw_datasets["train"][4]["chunk_tags"],
    raw_datasets["train"].features["chunk_tags"].feature.names,
    max_line=115
)

Germany 's   representative to   the  European Union 's   veterinary committee Werner Zwingmann said on   Wednesday 
B-NP    B-NP I-NP           B-PP B-NP I-NP     I-NP  B-NP I-NP       I-NP      I-NP   I-NP      B-VP B-PP B-NP      
consumers should buy  sheepmeat from countries other  than Britain until  the  scientific advice was  clearer . 
I-NP      B-VP   I-VP B-NP      B-PP B-NP      B-ADJP B-PP B-NP    B-SBAR B-NP I-NP       I-NP   B-VP B-ADJP  O 


# Define tokenizer

In [14]:
model_checkpoint = "bert-base-cased"

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [16]:
tokenizer.is_fast

True

## Text is pre-tokenized, need to tell our tokenizer to handle this

In [17]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

In [18]:
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [19]:
labels = raw_datasets["train"][0]["ner_tags"]

### Label and token mismatch

Length of labels & tokens no longer match, since words were split up and special tokens were added

In [20]:
print(len(labels), len(inputs.tokens()))

9 12


### Solution
Match tokens to corresponding word and then expanding the label list to match the tokens

In [21]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [22]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If label is B-XXX, change it to I-XXX since we're continuing the same word
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

In [23]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()

In [24]:
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


# Tokenize
Tokenize while aligning labels

In [25]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [26]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [27]:
tokenized_datasets["train"][0]

{'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}

# Model fine-tuning

## Data collation
Need to pad both inputs and labels

In [28]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [29]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [30]:
batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [31]:
[tokenizer.decode(ids) for ids in batch.input_ids.tolist()]

['[CLS] EU rejects German call to boycott British lamb. [SEP]',
 '[CLS] Peter Blackburn [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]']

In [32]:
batch.labels

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [33]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


## Metrics

In [34]:
metric = evaluate.load("seqeval")

In [35]:
labels = raw_datasets["train"][0]["ner_tags"]
label_names = raw_datasets["train"].features["ner_tags"].feature.names
labels = [label_names[i] for i in labels]

In [36]:
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [37]:
# Make some fake predictions to look at metrics
predictions = labels.copy()
predictions[2] = "O"
predictions[-2] = "I-MISC"

In [38]:
print(predictions)
print(label_names)

['B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O']
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [39]:
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.3333333333333333,
 'overall_f1': 0.4,
 'overall_accuracy': 0.7777777777777778}

In [40]:
def prepped_labels_and_preds(labels, predictions, label_names):
    # Remove masked indices and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels, true_predictions = prepped_labels_and_preds(labels, predictions, label_names)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


## Model

In [41]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [42]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [43]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
model.config.num_labels

9

In [45]:
args = TrainingArguments(
    output_dir="../temp/07/bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    overwrite_output_dir=True,
)

## Train model

In [46]:
# Training on 25% of the data to save time
n_train_use = int(0.25 * tokenized_datasets["train"].num_rows)
n_eval_use = int(0.25 * tokenized_datasets["validation"].num_rows)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(n_train_use)),
    eval_dataset=tokenized_datasets["validation"].shuffle(seed=42).select(range(n_eval_use)),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [47]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.100703,0.835897,0.888889,0.861579,0.971915
2,0.257200,0.083818,0.886259,0.918882,0.902276,0.979492
3,0.064400,0.088002,0.896689,0.922972,0.909641,0.980575


TrainOutput(global_step=1317, training_loss=0.1325849462116679, metrics={'train_runtime': 144.7877, 'train_samples_per_second': 72.727, 'train_steps_per_second': 9.096, 'total_flos': 233000610434676.0, 'train_loss': 0.1325849462116679, 'epoch': 3.0})

## Accelerated training

In [48]:
batch_size = 8

### Data loaders

In [49]:
train_dataloader = DataLoader(
    tokenized_datasets["train"].shuffle(seed=42).select(range(n_train_use)),
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"].shuffle(seed=42).select(range(n_eval_use)),
    collate_fn=data_collator,
    batch_size=batch_size
)

In [50]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

### Model

In [51]:
# Start fresh with our model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Optimizer

In [52]:
optimizer = AdamW(model.parameters(), lr=2e-5)

### Prepare with accelerator

In [53]:
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

### Scheduler

In [54]:
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

### Training

In [55]:
output_dir = "../temp/07/bert-finetuned-ner-accelerate"

#### Define a post-processing function to simplify evaluation

In [56]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    true_labels, true_predictions = prepped_labels_and_preds(labels, predictions, label_names)
    return true_labels, true_predictions

#### Training loop

In [57]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    
    # Training block
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation block
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Update padding to match (different processes might have different batch max lengths)
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"Epoch {epoch}: ", {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]}
    )

    # Save results
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/1317 [00:00<?, ?it/s]

Epoch 0:  {'precision': 0.8657123381049762, 'recall': 0.8089171974522293, 'f1': 0.8363516628251564, 'accuracy': 0.9657204714938658}
Epoch 1:  {'precision': 0.901840490797546, 'recall': 0.8574206092028516, 'f1': 0.8790697674418604, 'accuracy': 0.9765455857589608}
Epoch 2:  {'precision': 0.9120654396728016, 'recall': 0.8802631578947369, 'f1': 0.8958821560093739, 'accuracy': 0.9779889343276401}


# Estimation using the fine-tuned model

In [58]:
local_model_checkpoint = "../temp/07/bert-finetuned-ner-accelerate/"

In [59]:
token_classifier = pipeline(
    "token-classification", model=local_model_checkpoint, aggregation_strategy="simple"
)

In [60]:
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER',
  'score': 0.9926579,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.85100394,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity_group': 'MISC',
  'score': 0.4837528,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity_group': 'ORG',
  'score': 0.5974179,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9922236,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]