<a href="https://colab.research.google.com/github/cswamy/pytorch/blob/main/NER_finetuned_bert_base_cased_conll.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a target="_blank" href="https://colab.research.google.com/github/cswamy/pytorch/blob/main/notebooks/NER_finetuned_bert_base_cased_conll.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

### **Notes**

Notebook to finetune a bert-base-cased model for named entity recognition using the conll2003 dataset.

Inspired by hugging face tutorial: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

**Resources**

1.   Hugging face checkpoint: https://huggingface.co/bert-base-cased
2.   Original bert-base-cased paper: https://arxiv.org/abs/1810.04805
3.   Conll 2003 dataset: https://huggingface.co/datasets/conll2003


### **Setup**

In [1]:
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

### **Download dataset**

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/519.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-

In [3]:
from datasets import load_dataset

raw_datasets = load_dataset(path="conll2003")
raw_datasets

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
class_names = raw_datasets["train"].features["ner_tags"].feature.names
class_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### **Tokenize**

In [5]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.1


#### Setup tokenizer

In [6]:
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

#### Test tokenizer on one input

In [7]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
print(f"Tokenizer output: {inputs}")
print(f"Output tokens: {inputs.tokens()}")
print(f"Output word ids: {inputs.word_ids()}")
print(f"Lengths of raw tokens, ner_tags, tokenized tokens and word_ids: {len(raw_datasets['train'][0]['tokens']), len(raw_datasets['train'][0]['ner_tags']), len(inputs.tokens()), len(inputs.word_ids())}")

Tokenizer output: {'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Output tokens: ['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
Output word ids: [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
Lengths of raw tokens, ner_tags, tokenized tokens and word_ids: (9, 9, 12, 12)


#### Define tokenize functions

In [8]:
def tokenize_and_align_labels(examples):
  """
  Function to tokenize raw dataset and align tokenized inputs with labels (ner_tags).
  Aligning is required since tokenization inserts new tokens (e.g. [CLS]) and creates
  sub-word tokens.
  """
  labels = examples["ner_tags"]
  tokenized_inputs = tokenizer(examples["tokens"],
                               truncation=True,
                               is_split_into_words=True)

  # New labels list for all examples in batch
  new_labels_list = []
  for i, label in enumerate(labels):

    tokens = tokenized_inputs.tokens(i)
    word_ids = tokenized_inputs.word_ids(i)

    # New labels list for current example
    new_labels = []
    current_word = None
    for word_id in word_ids:
      if word_id != current_word:
        # Start a new word
        current_word = word_id
        # Set to -100 for SEP token
        new_label = -100 if word_id is None else label[word_id]
        new_labels.append(new_label)
      elif word_id is None:
        # Set to -100 for CLS
        new_labels.append(-100)
      else:
        # Sub-word: same label as previous word
        new_label = label[word_id]
        # If label is B-XXX, change to I-XXX
        if new_label % 2 == 1:
          new_label += 1
        new_labels.append(new_label)

    new_labels_list.append(new_labels)

  tokenized_inputs["labels"] = new_labels_list
  return tokenized_inputs

#### Tokenize and align labels

In [9]:
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels,
                                      batched=True,
                                      remove_columns=raw_datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

### **Prepare dataloaders**

In [10]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

BATCH_SIZE = 8
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

train_dataloader = DataLoader(dataset=tokenized_datasets["train"],
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              collate_fn=data_collator)

val_dataloader = DataLoader(dataset=tokenized_datasets["validation"],
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            collate_fn=data_collator)

len(train_dataloader), len(val_dataloader), class_names

(1756,
 407,
 ['O',
  'B-PER',
  'I-PER',
  'B-ORG',
  'I-ORG',
  'B-LOC',
  'I-LOC',
  'B-MISC',
  'I-MISC'])

### **Train model**

#### Training setup

In [11]:
from torch.optim import lr_scheduler
from transformers import AutoModelForTokenClassification

# Setup id2label and label2id dicts
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}

# Instantiate model
model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                        id2label=id2label,
                                                        label2id=label2id).to(device)

# Setup optimiser
optimiser = torch.optim.AdamW(params=model.parameters(),
                              lr=2e-5)

# Setup scheduler
lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=optimiser)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Define accuracy function from torchmetrics

In [12]:
!pip install torchmetrics

from torchmetrics import Accuracy
acc_fn = Accuracy(task="multiclass", num_classes=len(class_names)).to(device)

Collecting torchmetrics
  Downloading torchmetrics-1.1.1-py3-none-any.whl (763 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m763.4/763.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.9.0 torchmetrics-1.1.1


#### Define preprocessor for accuracy

In [13]:
def process_for_acc(preds_batch:torch.tensor,
                    labels_batch:torch.tensor):
  """
  Function to remove unwanted labels and predictions for torchmetrics Accuracy.
  Args:
    preds_batch: batch of predictions
    labels_batch: batch of truth labels
  Returns:
    Tuple of two lists with cleaned up preds_batch and labels_batch.
  """
  # Convert tensors to lists of lists
  batched_preds_list = preds_batch.tolist()
  batched_labels_list = labels_batch.tolist()

  processed_preds_batch = []
  processed_labels_batch = []
  # Process each preds and labels
  for i, label_list in enumerate(batched_labels_list):
    # Collect indices from where label is -100
    idx_to_remove = []
    for j, label in enumerate(label_list):
      if label == -100:
        idx_to_remove.append(j)
    # Get corresponding preds
    pred_list = batched_preds_list[i]
    # Remove indices from both preds and labels
    filtered_label_list = [label_list[l] for l in range(len(label_list)) if l not in idx_to_remove]
    filtered_pred_list = [pred_list[p] for p in range(len(pred_list)) if p not in idx_to_remove]
    # Append tensorised versions back to create batched lists
    processed_labels_batch.append(filtered_label_list)
    processed_preds_batch.append(filtered_pred_list)

  # Return processed lists
  return (processed_preds_batch, processed_labels_batch)

#### Train loop

In [14]:
from tqdm.auto import tqdm

EPOCHS = 5

# Variables to accumulate loss and acc across batches
train_loss, train_acc = 0, 0

model.train()
for epoch in tqdm(range(EPOCHS)):
  for batch in train_dataloader:

    # Send data to device
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass
    outputs = model(**batch)

    # Get loss and accumulate
    loss = outputs.loss
    train_loss += loss

    # Get logits and preds
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)

    # Use preprocessor function for accuracy
    # Process list of lists and calculate acc for each sample in batch
    acc_batch = 0
    preds_processed, labels_processed = process_for_acc(preds, batch["labels"])
    for i, label in enumerate(labels_processed):
      label = torch.tensor(label)
      pred = torch.tensor(preds_processed[i])
      acc = acc_fn(pred, label)
      acc_batch += acc.item()
    # Average acc across samples in batch
    acc_batch /= len(labels_processed)
    # Accumulate accuracy over batches
    train_acc += acc_batch

    # Zero grad optimiser
    optimiser.zero_grad()

    # Backpropagate loss
    loss.backward()

    # Step optimiser and scheduler
    optimiser.step()
    lr_scheduler.step()

  # Average loss and acc across batches
  train_loss /= len(train_dataloader)
  train_acc /= len(train_dataloader)

  # Print progress
  print(f"Epoch: {epoch+1} | Training loss: {train_loss:.4f} | Training acc: {train_acc:.2%}")

  0%|          | 0/5 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1 | Training loss: 0.1420 | Training acc: 95.70%
Epoch: 2 | Training loss: 0.0469 | Training acc: 98.77%
Epoch: 3 | Training loss: 0.0266 | Training acc: 99.31%
Epoch: 4 | Training loss: 0.0176 | Training acc: 99.61%
Epoch: 5 | Training loss: 0.0139 | Training acc: 99.66%


#### Eval loop

In [15]:
# Variables to track eval loss and accuracy
val_loss, val_acc = 0, 0

model.eval()
with torch.inference_mode():
  # Loop through validation batches
  for batch in val_dataloader:

    # Send data to device
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass
    outputs = model(**batch)

    # Get and accumulate loss
    val_loss += outputs.loss

    # Get preds
    preds = torch.argmax(outputs.logits, dim=-1)

    # Use preprocessor and accumulate accuracy
    acc_batch = 0
    preds_processed, labels_processed = process_for_acc(preds, batch["labels"])
    for i, label in enumerate(labels_processed):
      label = torch.tensor(label)
      pred = torch.tensor(preds_processed[i])
      acc = acc_fn(pred, label)
      acc_batch += acc.item()
    # Average acc across samples in batch
    acc_batch /= len(labels_processed)
    # Accumulate accuracy over batches
    val_acc += acc_batch

  # Average loss and acc across batches
  val_loss /= len(val_dataloader)
  val_acc /= len(val_dataloader)

# Print outputs
print(f"Validation loss: {val_loss:.4f} | Validation accuracy: {val_acc:.2%}")

Validation loss: 0.0693 | Validation accuracy: 98.67%


### **Save model**

In [17]:
!git clone https://github.com/cswamy/pytorch

Cloning into 'pytorch'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 19 (delta 3), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (19/19), 5.65 KiB | 5.65 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [19]:
from pytorch.scripts import utils
utils.save_model(model=model,
                 target_dir="models",
                 model_name="bertbasecased_finetuned_conll.pth")

[INFO] Saving model to: models/bertbasecased_finetuned_conll.pth


### **Make predictions**

#### Test set

In [20]:
# Create test dataloader
test_dataloader = DataLoader(dataset=tokenized_datasets["test"],
                             batch_size=BATCH_SIZE,
                             shuffle=False,
                             collate_fn=data_collator)

In [21]:
test_loss, test_acc = 0, 0

model.eval()
with torch.inference_mode():
  for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass
    outputs = model(**batch)

    # Get and accumulate loss
    test_loss += outputs.loss

    # Get preds
    preds = torch.argmax(outputs.logits, dim=-1)

    # Use preprocessor and accumulate accuracy
    acc_batch = 0
    preds_processed, labels_processed = process_for_acc(preds, batch["labels"])
    for i, label in enumerate(labels_processed):
      label = torch.tensor(label)
      pred = torch.tensor(preds_processed[i])
      acc = acc_fn(pred, label)
      acc_batch += acc.item()
    # Average acc across samples in batch
    acc_batch /= len(labels_processed)
    # Accumulate accuracy over batches
    test_acc += acc_batch

  # Average loss and acc across batches
  test_loss /= len(test_dataloader)
  test_acc /= len(test_dataloader)

# Print outputs
print(f"Test loss: {test_loss:.4f} | Test accuracy: {test_acc:.2%}")

Test loss: 0.1810 | Test accuracy: 96.70%


#### New sentences

In [22]:
import re
def pred_ner(new_text:str,
             model:torch.nn.Module,
             tokenizer,
             device:torch.device):
  """
  Function for named entity recognition on new text.
  Args:
    new_text(str): A new sentence to classify entities on.
    model(torch.nn.Module): Trained pytorch model for NER.
    tokenizer: tokenizer for the model.
    device(torch.device): Device setting
  Returns:
    List of dicts with words and entities in text.
  """
  new_text_tokens = new_text.split(' ')
  tokenized_sample = tokenizer(new_text_tokens, is_split_into_words=True)
  input_to_model = {k: torch.tensor(v).unsqueeze(dim=0).to(device) for k, v in tokenized_sample.items()}
  outputs = model(**input_to_model)
  preds = torch.argmax(outputs.logits, dim=-1)
  preds_list = preds.squeeze(dim=0).tolist()

  # Remove CLS and SEP tokens from all lists
  tokenized_tokens = tokenized_sample.tokens()[1:-1]
  word_ids = tokenized_sample.word_ids()[1:-1]
  preds_list = preds_list[1:-1]

  # Remove pred = 0 from tokens and word ids (0's are non-entities)
  ix_remove = []
  for i, pred in enumerate(preds_list):
    if pred == 0:
      ix_remove.append(i)
  filtered_tokens = [tokenized_tokens[t] for t in range(len(tokenized_tokens)) if t not in ix_remove]
  filtered_wordids = [word_ids[w] for w in range(len(word_ids)) if w not in ix_remove]
  filtered_preds = [preds_list[p] for p in range(len(preds_list)) if p not in ix_remove]

  # Create list with words from original text and predictions
  current_word = None
  results_list = []
  for i, word in enumerate(filtered_wordids):
    if word != current_word:
      if filtered_preds[i] % 2 == 1:
        results_dict = {}
        results_dict["word"] = re.sub(r'[^\w\s]', '', new_text_tokens[word])
        results_dict["pred"] = filtered_preds[i]
        results_list.append(results_dict)
        current_word = word
      else:
        tmp_dict = results_list[-1]
        tmp_dict["word"] = new_text_tokens[word-1] + ' ' + new_text_tokens[word]

  # Finally convert predictions to entity categories
  # Person, Organization, Location and Miscellaneous
  for pred in results_list:
    if pred["pred"] <= 2:
      pred["pred"] = "Person"
    elif pred["pred"] <= 4:
      pred["pred"] = "Organisation"
    elif pred["pred"] <= 6:
      pred["pred"] = "Location"
    else:
      pred["pred"] = "Miscellaneous"

  return results_list

In [23]:
new_text = "Barack Obama was the 44th President of the United States."
results = pred_ner(new_text, model, tokenizer, device)
results

[{'word': 'Barack Obama', 'pred': 'Person'},
 {'word': 'United States.', 'pred': 'Location'}]

### **Depoly to hugging face**

In [24]:
from pathlib import Path

# Create folders
demo_path = Path("demos/bert_ner")
demo_path.mkdir(parents=True, exist_ok=True)

In [25]:
# Move model to demo folder
!mv models/bertbasecased_finetuned_conll.pth demos/bert_ner

In [26]:
# Create class_names.txt file
class_names_path = demo_path / "class_names.txt"
with open(class_names_path, "w") as f:
  f.write("\n".join(class_names))

In [27]:
%%writefile demos/bert_ner/model.py
from transformers import AutoModelForTokenClassification, AutoTokenizer

def create_bertcased_ner(class_names):
  """
  Initializes tokenizer and model for a bert-cased checkpoint.
  Args:
    class_names: List of classnames
  Returns:
    Instance of model and tokenizer
  """
  checkpoint = "bert-base-cased"
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  # Setup id2label and label2id dicts
  id2label = {i: label for i, label in enumerate(class_names)}
  label2id = {label: i for i, label in enumerate(class_names)}

  # Instantiate model
  model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                          id2label=id2label,
                                                          label2id=label2id)

  return model, tokenizer

Writing demos/bert_ner/model.py


In [29]:
%%writefile demos/bert_ner/app.py
import gradio as gr
import os
import re
import torch

from model import create_bertcased_ner
from typing import Tuple, Dict

# Read class names from class_names.txt
with open("class_names.txt", "r") as f:
  class_names = [entity_name.strip() for entity_name in f.readlines()]

# Setup model and tokenizer
model, tokenizer = create_bertcased_ner(class_names)

# Load state dict from model
model.load_state_dict(
    torch.load(
        f="bertbasecased_finetuned_conll.pth",
        map_location=torch.device("cpu")
    ))

# Predict function
def predict(new_text:str):
  """
  Function for named entity recognition on new text.
  Args:
    new_text(str): A new sentence to classify entities on.
  Returns:
    List of dicts with words and entities in text.
  """

  new_text_tokens = new_text.split(' ')
  tokenized_sample = tokenizer(new_text_tokens, is_split_into_words=True)
  input_to_model = {k: torch.tensor(v).unsqueeze(dim=0) for k, v in tokenized_sample.items()}
  outputs = model(**input_to_model)
  preds = torch.argmax(outputs.logits, dim=-1)
  preds_list = preds.squeeze(dim=0).tolist()

  # Remove CLS and SEP tokens from all lists
  tokenized_tokens = tokenized_sample.tokens()[1:-1]
  word_ids = tokenized_sample.word_ids()[1:-1]
  preds_list = preds_list[1:-1]

  # Remove pred = 0 from tokens and word ids (0's are non-entities)
  ix_remove = []
  for i, pred in enumerate(preds_list):
    if pred == 0:
      ix_remove.append(i)
  filtered_tokens = [tokenized_tokens[t] for t in range(len(tokenized_tokens)) if t not in ix_remove]
  filtered_wordids = [word_ids[w] for w in range(len(word_ids)) if w not in ix_remove]
  filtered_preds = [preds_list[p] for p in range(len(preds_list)) if p not in ix_remove]

  # Create list with words from original text and predictions
  current_word = None
  results_list = []
  for i, word in enumerate(filtered_wordids):
    if word != current_word:
      if filtered_preds[i] % 2 == 1:
        results_dict = {}
        results_dict["word"] = re.sub(r'[^\w\s]', '', new_text_tokens[word])
        results_dict["pred"] = filtered_preds[i]
        results_list.append(results_dict)
        current_word = word
      else:
        tmp_dict = results_list[-1]
        tmp_dict["word"] = new_text_tokens[word-1] + ' ' + new_text_tokens[word]

  # Finally convert predictions to entity categories
  # Person, Organization, Location and Miscellaneous
  for pred in results_list:
    if pred["pred"] <= 2:
      pred["pred"] = "Person"
    elif pred["pred"] <= 4:
      pred["pred"] = "Organisation"
    elif pred["pred"] <= 6:
      pred["pred"] = "Location"
    else:
      pred["pred"] = "Miscellaneous"

  # Convert list of dicts to list of tuples for gradio HighlightedText component
  results_tuples_list = []
  results_tuples_list = [(d['word'], d['pred']) for d in results_list]
  return results_tuples_list

# Create examples list
examples_list = ["Barack Obama was the 44th President of the United States",
                "The United Nations is headquartered in New York"]

# Create Gradio app
title = "Named Entity Recognition 🔎"
description = "Bert-base-cased model finetuned for named entity recognition using the conll2003 dataset."

demo = gr.Interface(fn=predict,
                    inputs=gr.inputs.Textbox(label="Input",
                                             placeholder="Enter sentence here..."),
                    outputs=gr.HighlightedText(),
                    examples=examples_list,
                    title=title,
                    description=description)

# Launch gradio
demo.launch()

Writing demos/bert_ner/app.py


In [30]:
%%writefile demos/bert_ner/requirements.txt
torch==1.12.0
gradio==3.1.4
transformers==transformers==4.33.1

Writing demos/bert_ner/requirements.txt


In [31]:
!cd demos/bert_ner && zip -r ../bert_ner.zip *

  adding: app.py (deflated 60%)
  adding: bertbasecased_finetuned_conll.pth (deflated 7%)
  adding: class_names.txt (deflated 27%)
  adding: model.py (deflated 59%)
  adding: requirements.txt (deflated 17%)


In [55]:
try:
  from google.colab import files
  files.download("demos/bert_ner.zip")
except:
  print(f"Download failed!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

For further instructions on uploading to hugging face, refer here: https://www.learnpytorch.io/09_pytorch_model_deployment/#117-deploying-our-foodvision-big-app-to-huggingface-spaces