In [2]:
%%capture

!pip install -r requirements.txt

# Named Entity Recognition

Named Entity Recognition (NER), also known as Token Classification, is a popular NLP modeling task in which you fine-tune a model to recognize words, phrases, and concepts from a given body of text.

[![Alt text](https://github.com/Ben-Epstein/domino-dca-notebooks/blob/main/reference-project-ner/images/ner.png?raw=true)](https://github.com/dominodatalab/reference-project-ner)


This is a very powerful technique that can be applied to almost any domain:
* Extracting executive names from a financial report
* Saving ingredients from a recipe
* Identifying streets, names, and addresses to remove PII from data

In this notebook, we will fine-tune a model from the extremely popular BERT family (in this case, [DistilBERT](https://huggingface.co/distilbert-base-cased)) on the task of NER. We will perform NER over the [wikiann](https://huggingface.co/datasets/wikiann/viewer/en) dataset, which contains Wikipedia articles and entities (known as "spans") pertaining to people (PER), locations (LOC), and organizations (ORG).




In [65]:
import os

import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
from matplotlib import pyplot as plt
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import mlflow

# Terminal escape sequences used to colour entity spans in printed text.
# ENDC is appended after each coloured span to end the colouring.
GREEN = '\033[92m'
BLUE = '\033[94m'
CYAN = '\033[96m'
ENDC = '\033[0m'

# Entity type -> colour used when printing a span of that type.
SPAN_MAP = {
    "ORG": GREEN,
    "LOC": BLUE,
    "PER": CYAN
}

# Switches for MLflow logging: the DATA flag gates dataset logging and the
# MODEL flag gates model logging in the MLflow cells further down.
LOG_DATA_MLFLOW = True
LOG_MODEL_MLFLOW = True

## Load our data

First, we will load our dataset. We can look at a particular sample, and view the spans that have been labeled.

In [11]:
# Name of the pretrained checkpoint. The same name is used again later when
# loading the model to fine-tune, so tokenizer and model always match.
HF_MODEL = "distilbert-base-cased"
# Our model's tokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [73]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset

# Load our DatasetDict
ds = load_dataset("wikiann", "en")
print("Splits:", list(ds.keys()))

# Fetch the first training example once; every preview below reads from it.
first_example = ds["train"][0]
print("Columns", list(first_example.keys()))

# Extract our labels. The tag column is called `tags` in some datasets and
# `ner_tags` in others, so pick whichever one is present.
tags_name = "tags" if "tags" in ds["train"].features else "ner_tags"
assert tags_name in ds["train"].features, (
    "Your dataset must have `tags` or `ner_tags` to perform token classification"
)

labels = ds["train"].features[tags_name].feature.names
print("Labels", labels)

# Preview one sample with its labelled spans highlighted in colour.
tokens = first_example["tokens"]
inp = tokenizer.convert_tokens_to_string(tokens)
print("Input:", inp)
print("Spans:", first_example["spans"])
for span in first_example["spans"]:
    # Spans look like "ORG: some text". Split on the first separator only, so
    # span text that itself contains ": " does not break the unpacking.
    ent, spantext = span.split(": ", 1)
    color = SPAN_MAP[ent]
    inp = inp.replace(spantext, f"{color}{spantext}{ENDC}")
print("Text with spans:", inp)

Splits: ['validation', 'test', 'train']
Columns ['tokens', 'ner_tags', 'langs', 'spans']
Labels ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
Input: R.H. Saunders ( St. Lawrence River ) ( 968 MW )
Spans: ['ORG: R.H. Saunders', 'ORG: St. Lawrence River']
Text with spans: [92mR.H. Saunders[0m ( [92mSt. Lawrence River[0m ) ( 968 MW )


## Preprocessing

Before training, we need to tokenize our inputs and align their labels.

Specifically, tokenizing adds special tokens such as `[CLS]` and `[SEP]` and can split a single word into several subword tokens, which creates mismatches between the tokenized inputs and their word-level labels.
    
We realign the tokens and labels by:
  1. Mapping all tokens to their corresponding word with the `word_ids` method.
  2. Assigning the label -100 to the special tokens `[CLS]` and `[SEP]`
      so they're ignored by the PyTorch loss function.
  3. Only labeling the first token of a given word. Assign -100 to
      other subtokens from the same word.
        
For more information, see: https://huggingface.co/docs/transformers/tasks/token_classification

In [33]:
from datasets.formatting.formatting import LazyBatch
from transformers import BatchEncoding
from datasets import Dataset, DatasetDict
from typing import Dict, List

def tokenize_and_align_labels(examples: Dict[str, List]) -> Dict[str, List]:
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Args:
        examples: A batch holding a "tokens" column (one list of words per
            sentence) and a tag column named by the module-level `tags_name`
            (one list of integer labels per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. The
        first token of every word carries that word's label; tokens without a
        word (special tokens) and the remaining tokens of a word carry -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples[tags_name]):
        # word id of every produced token (None for tokens that belong to no word)
        token_word_ids = encoded.word_ids(batch_index=batch_idx)
        # Pair every token's word id with the word id of the token before it;
        # the first token has no predecessor, hence the leading None.
        with_previous = zip([None, *token_word_ids], token_word_ids)
        aligned_labels.append(
            [
                word_labels[current] if current is not None and current != previous else -100
                for previous, current in with_previous
            ]
        )

    encoded["labels"] = aligned_labels
    return encoded


# Apply the tokenization/label alignment to every split; batched=True hands the
# function whole batches of examples rather than one example at a time.
ds_encoded = ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

## Data Collation

When processing our data during training, we need to pad each sample to the longest one in the batch. The `DataCollatorForTokenClassification` handles this for us.

In [34]:
from transformers import DataCollatorForTokenClassification

# (Assuming PyTorch) we create a collator that pads every sample in a batch to
# the longest sample of that batch; padding happens at batch creation time,
# not up front for the whole dataset.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Evaluation

During training, we want to monitor the progress of the model. After each epoch, we measure that progress on the evaluation split using `seqeval`, the standard metric in NER.

Our `compute_metrics` function will measure the precision, recall, f1, and accuracy of our predictions.

In [36]:
from typing import Dict
from transformers import EvalPrediction
import evaluate
import numpy as np


seqeval = evaluate.load("seqeval")


def compute_metrics(p: EvalPrediction) -> Dict[str, float]:
    """Score predictions with seqeval: precision, recall, f1 and accuracy.

    Positions whose reference label is -100 are dropped before scoring. Those
    are the positions that were masked out during label alignment (special
    tokens and the non-first tokens of a word).

    Args:
        p: A (predictions, label_ids) pair, where predictions holds per-token
            class scores with the class dimension on axis 2.

    Returns:
        A dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only the positions that carry a real label.
        kept = [
            (pred_id, ref_id)
            for pred_id, ref_id in zip(predicted_row, reference_row)
            if ref_id != -100
        ]
        # Convert class ids back to tag names, which is what seqeval scores.
        true_predictions.append([labels[pred_id] for pred_id, _ in kept])
        true_labels.append([labels[ref_id] for _, ref_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    reported = {
        "precision": "overall_precision",
        "recall": "overall_recall",
        "f1": "overall_f1",
        "accuracy": "overall_accuracy",
    }
    return {name: results[key] for name, key in reported.items()}

## Load our model and train!

We load our model using the `AutoModelForTokenClassification` entrypoint, which ensures our model is properly prepared for the task of token classification (NER). We provide it with our labels and their mappings for easier evaluation.

In [37]:
# The model config stores the mapping between class indices and tag names in
# both directions; both are derived from the dataset's label list.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

# Load the pretrained checkpoint with a token-classification head sized to
# our label set. This is the model we fine-tune.
model = AutoModelForTokenClassification.from_pretrained(
    HF_MODEL,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Automatic integration with Domino's MLflow environment

_Your metrics and models will be logged_

In [39]:
import os
import mlflow.transformers

# Configure MLflow autologging for the transformers flavour. The two notebook
# flags decide whether data-related and model-related items are requested.
# NOTE(review): confirm in the MLflow docs how much of this the transformers
# autolog integration actually records on its own.
mlflow.transformers.autolog(
    log_input_examples=LOG_DATA_MLFLOW,
    log_datasets=LOG_DATA_MLFLOW,
    log_model_signatures=LOG_MODEL_MLFLOW,
    log_models=LOG_MODEL_MLFLOW,
)

# Run name (also reused later as the folder name for the saved model) and the
# experiment that runs are recorded under.
mlflow_run_name = "ner-wikipedia-run-1"
exp = mlflow.set_experiment("mlflow-ner")

# Presumably read by the Hugging Face MLflow integration during training
# (parameter flattening and artifact logging) - confirm against its docs.
os.environ.update(
    {
        "MLFLOW_FLATTEN_PARAMS": "1",
        "HF_MLFLOW_LOG_ARTIFACTS": "1",
    }
)

2023/10/21 21:58:39 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-ner' does not exist. Creating a new experiment.


In [41]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./output",
    # Hand the run name chosen in the MLflow setup to the Trainer so the
    # tracked run carries it instead of an auto-generated name.
    run_name=mlflow_run_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=3,
    logging_steps=500,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    seed=101,
)

# Evaluate on the validation split when the dataset has one, else on test.
has_val = "validation" in ds
eval_split = "validation" if has_val else "test"

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded[eval_split],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # Stop as soon as one evaluation fails to improve on the best so far.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

trainer.train()


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3098,0.266697,0.772363,0.804468,0.788089,0.918136
2,0.2176,0.254495,0.779828,0.815496,0.797263,0.922743
3,0.1579,0.256304,0.794636,0.82306,0.808598,0.926517


Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/10/21 22:01:55 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/10/21 22:04:06 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

2023/10/21 22:06:29 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


TrainOutput(global_step=3750, training_loss=0.25097258911132814, metrics={'train_runtime': 441.6407, 'train_samples_per_second': 135.857, 'train_steps_per_second': 8.491, 'total_flos': 481475700577344.0, 'train_loss': 0.25097258911132814, 'epoch': 3.0})

## Logging your Data and Models to MLflow

_If you've set either of these values (`LOG_MODEL_MLFLOW`, `LOG_DATA_MLFLOW`) to True, we will log the model and the input dataset to MLflow directly within Domino._

In [42]:
# Attach the extra logging to the most recently active MLflow run (presumably
# the one created by trainer.train() - verify if other runs were started).
run_id = mlflow.last_active_run().info.run_id
with mlflow.start_run(run_id=run_id):
    if LOG_MODEL_MLFLOW:
        # Log model and tokenizer together as one transformers-flavour model.
        model_config = dict(model=trainer.model, tokenizer=trainer.tokenizer)
        mlflow.transformers.log_model(model_config, artifact_path="model")
    if LOG_DATA_MLFLOW:
        # Record every split as an input of the run, tagged with its split name.
        for split, split_ds in ds.items():
            data = mlflow.data.from_huggingface(split_ds)
            mlflow.log_input(data, context=split)


  mlflow.transformers.log_model(model_config, artifact_path="model")
  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Downloading (…)solve/main/README.md:   0%|          | 0.00/8.81k [00:00<?, ?B/s]



## Using our fine-tuned model

_Lastly, we can save the model directly to disk, and load it to make predictions_

In [64]:
from transformers import pipeline

# Save the fine-tuned model to disk, then load it back from that same path.
model_dir = f"/mnt/artifacts/{mlflow_run_name}"
trainer.save_model(model_dir)

token_classifier = pipeline(model=model_dir, aggregation_strategy="simple", task="token-classification")
sentence = "The Microsoft spokesperson Ronald Ramer lives in New York"
tokens = token_classifier(sentence)

# Colour each predicted entity using the character offsets the pipeline
# returns, rather than searching for the decoded `word`: the decoded text can
# differ from the input, and a text search would also recolour repeated words.
# NOTE(review): offsets are only returned when the tokenizer provides them
# (fast tokenizers) - confirm if the tokenizer is ever swapped.
# Work from right to left so earlier offsets stay valid while inserting codes.
for token in sorted(tokens, key=lambda t: t["start"], reverse=True):
    color = SPAN_MAP[token["entity_group"]]
    start, end = token["start"], token["end"]
    sentence = f"{sentence[:start]}{color}{sentence[start:end]}{ENDC}{sentence[end:]}"

print("Prediction:", sentence)

Prediction: The [92mMicrosoft[0m spokesperson [96mRonald Ramer[0m lives in [94mNew York[0m
