# Notebook Overview

This notebook is used to fine-tune a BERT-based named entity recognition (NER) model using user-authored scenarios describing how users interact with a screen in a mobile app. The notebook covers loading, formatting and splitting the data for training, configuration of the algorithm to train the model, and training the model.

In [1]:
! pip install transformers datasets tokenizers evaluate
! pip install transformers[sentencepiece]
! pip install torch
! pip install tensorflow
! pip install spacy
! pip install seqeval
! pip install ipywidgets
! pip install "ray[tune]" scipy scikit-learn

Collecting protobuf<=3.20.2; extra == "sentencepiece"
  Using cached protobuf-3.20.2-cp38-cp38-macosx_10_9_x86_64.whl (982 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.21.12
    Uninstalling protobuf-4.21.12:
      Successfully uninstalled protobuf-4.21.12
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

tensorflow 2.11.0 requires protobuf<3.20,>=3.9.2, but you'll have protobuf 3.20.2 which is incompatible.
cached-path 1.1.2 requires huggingface-hub<0.6.0,>=0.0.12, but you'll have huggingface-hub 0.12.0 which is incompatible.
allennlp 2.9.3 requires torch<1.12.0,>=1.6.0, but you'll have torch 1.13.1 which is incompatible.
allennlp 2.9.3 requires transformers<4.19,>

Collecting protobuf<3.20,>=3.9.2
  Using cached protobuf-3.19.6-cp38-cp38-macosx_10_9_x86_64.whl (980 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.2
    Uninstalling protobuf-3.20.2:
      Successfully uninstalled protobuf-3.20.2
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

cached-path 1.1.2 requires huggingface-hub<0.6.0,>=0.0.12, but you'll have huggingface-hub 0.12.0 which is incompatible.
allennlp 2.9.3 requires torch<1.12.0,>=1.6.0, but you'll have torch 1.13.1 which is incompatible.
allennlp 2.9.3 requires transformers<4.19,>=4.1, but you'll have transformers 4.26.0 which is incompatible.
allennlp 2.9.3 requires typer>=0.4.1, but you'll have typer 0.3.







In [2]:
! pip install numpy==1.21



In [1]:
import json, datasets

def create_dataset(sentences):
    res = {'tokens': [], 'ner_tags': []}
    for sent in sentences:
        res['tokens'].append([word for word, pos, tag in sent['tokens']])
        res['ner_tags'].append([tag for word, pos, tag in sent['tokens']])
    
    dataset = datasets.Dataset.from_dict(res, features=datasets.Features({
        "tokens": datasets.Sequence(datasets.Value("string")),
        "ner_tags": datasets.Sequence(
            datasets.features.ClassLabel(
                names=['O', 'B-SIM', 'I-SIM', 'B-COM', 'I-COM', 'B-QUE', 'I-QUE']
            )
        ),
    }))
    return dataset

dataset = json.load(open('../datasets/scenarios-training-new.json'))
raw_datasets = datasets.dataset_dict.DatasetDict({
    'train': create_dataset(dataset['train']),
    'validation': create_dataset(dataset['validation']),
    'test': create_dataset(dataset['test'])
})

In [2]:
# tokenize and align dataset
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

def tokenize_and_align_labels(examples, **kwargs):
    '''
    Input: a row of Dataset
    Output: use dataset.map() method to map this function for each row.
            This will tokenize each row and align the original labels.
    '''
    tokenizer = kwargs['tokenizer']
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
from seqeval.metrics import classification_report

# compute evaluation for precision, recall, f1 and accuracy
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    MultiLabelBinarizer().fit_transform(true_labels)
    MultiLabelBinarizer().fit_transform(true_predictions)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    
    # return all_metrics
    print(classification_report(true_labels,true_predictions ))
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [4]:
# load the dataset from the given dataset path
#DATASET_PATH = '../datasets/scenarios-labeled.json'
#raw_datasets = load_dataset(DATASET_PATH, [0.8, 0.1, 0.1])

ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names

In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
import evaluate
from transformers import Trainer, TrainingArguments

# tokenize the data for fine-tuning the bert-base-NER model
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
    fn_kwargs={'tokenizer': tokenizer}
)

# setup collator and evalation metric
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# define the model initialization
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

def model_init():
    return AutoModelForTokenClassification.from_pretrained(
            model_checkpoint,
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
    
# train the model
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
)
trainer = Trainer(
    # model=model,
    model_init = model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# set dependencies for the train() function to complete
import numpy as np
import os
os.environ["WANDB_DISABLED"] = "true"
metric = evaluate.load("seqeval")

trainer.train()

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /Users/breaux/.cache/huggingface/hub/models--dslim--bert-base-NER/snapshots/f7c2808a659015eeb8828f3f809a2f1be67a2446/config.json
Model config BertConfig {
  "_name_or_path": "dslim/bert-base-NER",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-SIM",
    "2": "I-SIM",
    "3": "B-COM",
    "4": "I-COM",
    "5": "B-QUE",
    "6": "I-QUE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-COM": 3,
    "B-QUE": 5,
    "B-SIM": 1,
    "I-COM": 4,
    "I-QUE": 6,
    "I-SIM": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_e

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.31599,0.593684,0.648276,0.61978,0.893866
2,0.316900,0.270683,0.531092,0.726437,0.613592,0.899883
3,0.316900,0.303888,0.608534,0.754023,0.673511,0.905566
4,0.117300,0.406298,0.613936,0.749425,0.674948,0.90707
5,0.117300,0.38697,0.611855,0.735632,0.668058,0.908909
6,0.040400,0.487344,0.622824,0.74023,0.676471,0.907237
7,0.040400,0.479942,0.640232,0.76092,0.695378,0.913923
8,0.015700,0.537268,0.635815,0.726437,0.678112,0.911416
9,0.015700,0.572689,0.634051,0.744828,0.684989,0.909744
10,0.006700,0.569418,0.629845,0.747126,0.683491,0.909911


***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.00      0.00      0.00        33
         QUE       0.30      0.41      0.35        41
         SIM       0.64      0.73      0.68       361

   micro avg       0.59      0.65      0.62       435
   macro avg       0.31      0.38      0.34       435
weighted avg       0.56      0.65      0.60       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-267
Configuration saved in bert-finetuned-ner/checkpoint-267/config.json
Model weights saved in bert-finetuned-ner/checkpoint-267/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-267/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-267/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.01      0.03      0.02        33
         QUE       0.29      0.44      0.35        41
         SIM       0.65      0.82      0.73       361

   micro avg       0.53      0.73      0.61       435
   macro avg       0.32      0.43      0.37       435
weighted avg       0.57      0.73      0.64       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-534
Configuration saved in bert-finetuned-ner/checkpoint-534/config.json
Model weights saved in bert-finetuned-ner/checkpoint-534/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-534/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-534/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.03      0.06      0.04        33
         QUE       0.45      0.51      0.48        41
         SIM       0.70      0.84      0.77       361

   micro avg       0.61      0.75      0.67       435
   macro avg       0.40      0.47      0.43       435
weighted avg       0.63      0.75      0.69       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-801
Configuration saved in bert-finetuned-ner/checkpoint-801/config.json
Model weights saved in bert-finetuned-ner/checkpoint-801/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-801/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-801/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.02      0.03      0.02        33
         QUE       0.40      0.44      0.42        41
         SIM       0.70      0.85      0.77       361

   micro avg       0.61      0.75      0.67       435
   macro avg       0.37      0.44      0.40       435
weighted avg       0.62      0.75      0.68       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-1068
Configuration saved in bert-finetuned-ner/checkpoint-1068/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1068/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1068/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1068/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.02      0.03      0.02        33
         QUE       0.49      0.59      0.53        41
         SIM       0.71      0.82      0.76       361

   micro avg       0.61      0.74      0.67       435
   macro avg       0.41      0.48      0.44       435
weighted avg       0.64      0.74      0.68       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-1335
Configuration saved in bert-finetuned-ner/checkpoint-1335/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1335/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1335/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1335/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.06      0.09      0.07        33
         QUE       0.43      0.49      0.46        41
         SIM       0.72      0.83      0.77       361

   micro avg       0.62      0.74      0.68       435
   macro avg       0.40      0.47      0.43       435
weighted avg       0.64      0.74      0.69       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-1602
Configuration saved in bert-finetuned-ner/checkpoint-1602/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1602/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1602/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1602/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.09      0.15      0.11        33
         QUE       0.53      0.56      0.55        41
         SIM       0.72      0.84      0.78       361

   micro avg       0.64      0.76      0.70       435
   macro avg       0.45      0.52      0.48       435
weighted avg       0.66      0.76      0.70       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-1869
Configuration saved in bert-finetuned-ner/checkpoint-1869/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1869/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1869/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1869/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.04      0.06      0.05        33
         QUE       0.50      0.54      0.52        41
         SIM       0.73      0.81      0.77       361

   micro avg       0.64      0.73      0.68       435
   macro avg       0.42      0.47      0.44       435
weighted avg       0.65      0.73      0.69       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-2136
Configuration saved in bert-finetuned-ner/checkpoint-2136/config.json
Model weights saved in bert-finetuned-ner/checkpoint-2136/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-2136/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-2136/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.06      0.09      0.07        33
         QUE       0.45      0.51      0.48        41
         SIM       0.72      0.83      0.77       361

   micro avg       0.63      0.74      0.68       435
   macro avg       0.41      0.48      0.44       435
weighted avg       0.65      0.74      0.69       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-2403
Configuration saved in bert-finetuned-ner/checkpoint-2403/config.json
Model weights saved in bert-finetuned-ner/checkpoint-2403/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-2403/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-2403/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 267
  Batch size = 8


              precision    recall  f1-score   support

         COM       0.08      0.12      0.10        33
         QUE       0.40      0.51      0.45        41
         SIM       0.72      0.83      0.77       361

   micro avg       0.63      0.75      0.68       435
   macro avg       0.40      0.49      0.44       435
weighted avg       0.64      0.75      0.69       435



Saving model checkpoint to bert-finetuned-ner/checkpoint-2670
Configuration saved in bert-finetuned-ner/checkpoint-2670/config.json
Model weights saved in bert-finetuned-ner/checkpoint-2670/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-2670/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-2670/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2670, training_loss=0.09332507275017013, metrics={'train_runtime': 6792.3328, 'train_samples_per_second': 3.137, 'train_steps_per_second': 0.393, 'total_flos': 482182188006354.0, 'train_loss': 0.09332507275017013, 'epoch': 10.0})

In [49]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# instantiate the tokenizer and model, setup pipeline
model_path = './bert-finetuned-ner/checkpoint-300'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path, ignore_mismatched_sizes=True)
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='first')

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./bert-finetuned-ner/checkpoint-300/config.json
Model config BertConfig {
  "_name_or_path": "./bert-finetuned-ner/checkpoint-300",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-SIM",
    "2": "I-SIM",
    "3": "B-COM",
    "4": "I-COM",
    "5": "B-QUE",
    "6": "I-QUE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-COM": 3,
    "B-QUE": 5,
    "B-SIM": 1,
    "I-COM": 4,
    "I-QUE": 6,
    "I-SIM": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers"

In [52]:
from spacy.training import offsets_to_biluo_tags, biluo_to_iob
import spacy, json

def create_labels(text, entities):
    doc = nlp_parser(text)
    labels = []
    last_label = 'O'
    for token in doc:
        label = 'O'
        for entity in entities:
            if entity['start'] <= token.idx and token.idx + len(token.text) <= entity['end']:
                label = entity['entity_group']
                if last_label == label:
                    label = 'I-' + label
                else:
                    label = 'B-' + label
                last_label = label
            else:
                last_label = 'O'
        labels.append(label)
    return labels

# setup the spaCy English tokenizer
nlp_parser = spacy.load("en_core_web_sm")

# load the scenario data to obtain untokenized text
dataset = json.load(open('../datasets/scenarios-training-new.json', 'r'))
 
y_true = []
y_pred = []
count = 0
for sentence in datasets['test']:
    y_true.extend([t for w, p, t in sentence['tokens']])
        
    # predict the named entities from the test scenario
    entities = nlp(sentence['text'])
    print(entities)
    labels = create_labels(sentence['text'], entities)
    y_pred.extend(labels)
print(count)
    
print('Created y_true length = %i' % len(y_true))
print('Created y_pred length = %i' % len(y_pred))

[]
[{'entity_group': 'COM', 'score': 0.49818853, 'word': 'celebrities', 'start': 35, 'end': 46}, {'entity_group': 'COM', 'score': 0.6673409, 'word': 'follow', 'start': 54, 'end': 60}, {'entity_group': 'SIM', 'score': 0.9049134, 'word': 'profile', 'start': 82, 'end': 89}]
[{'entity_group': 'SIM', 'score': 0.8745841, 'word': 'goods', 'start': 54, 'end': 59}, {'entity_group': 'SIM', 'score': 0.76690686, 'word': 'trading cards', 'start': 84, 'end': 97}]


TypeError: 'NoneType' object is not iterable

In [47]:
from sklearn.metrics import classification_report
    
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       B-COM       0.04      0.39      0.08        28
       B-QUE       0.13      0.92      0.23        24
       B-SIM       0.63      0.75      0.68       338
       I-COM       0.00      0.00      0.00       155
       I-QUE       0.00      0.00      0.00       132
       I-SIM       0.00      0.00      0.00       105
           O       0.97      0.96      0.97      4762

    accuracy                           0.88      5544
   macro avg       0.25      0.43      0.28      5544
weighted avg       0.87      0.88      0.87      5544



  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
from seqeval.metrics import classification_report

print(classification_report([y_true], [y_pred]))

              precision    recall  f1-score   support

         COM       0.00      0.00      0.00        28
         QUE       0.00      0.00      0.00        25
         SIM       0.43      0.51      0.47       338

   micro avg       0.21      0.44      0.29       391
   macro avg       0.14      0.17      0.16       391
weighted avg       0.37      0.44      0.40       391

