# Model Testing

Importing libraries.

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, DataCollatorForTokenClassification, Trainer
from sklearn.preprocessing import LabelEncoder
import warnings, evaluate, pickle, json
from tqdm import tqdm
import numpy as np
warnings.filterwarnings("ignore")
from datasets import disable_caching
disable_caching()
import torch

  from .autonotebook import tqdm as notebook_tqdm
2024-05-24 22:06:51.389575: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-24 22:06:51.440437: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Set `DATA_DIR` to the folder where your data is stored.

In [None]:
# Folder holding the notebook's input files; test_data.json is read from here.
DATA_DIR = "data"

The testing pipeline is the same for both models:

1. Load model and tokenizer from locally saved fine-tuned model.
2. Load only the test dataset. Define the same function from the training pipeline to tokenize and align the labels accordingly.
3. Map the function to the test dataset and define the data collator.
4. Load the label encoder and create mappings between labels and IDs. This is to compute metrics.
5. Load the seqeval library for evaluation and define the function to compute evaluation metrics, and the function to preprocess logits for metrics.
6. Define a Trainer instance to make evaluation easy. In the training arguments, it is important to set do_train to False so that the weights are not updated.
7. Evaluate the test dataset and print results.

## BERT on testing

In [2]:
# Name/path of the saved fine-tuned BERT checkpoint; the tokenizer and the
# token-classification model are both loaded from it.
model_name = "bert_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [3]:
# Load only the "test" split from the JSON file stored under DATA_DIR.
test_dataset = load_dataset("json", data_files={"test": f"{DATA_DIR}/test_data.json"})["test"]

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to the resulting tokens.

    Args:
        examples: Batch with "modified_words" (one list of words per example)
            and "ner_tags" (one list of tag ids per example, indexed by word).

    Returns:
        The tokenizer output with an added "labels" entry. For every example,
        the first token of each word carries that word's tag; tokens that map
        to no word and the remaining tokens of a word are marked with -100.
    """
    encoding = tokenizer(examples["modified_words"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_word_ids = encoding.word_ids(batch_index=row)
        # Pair each token's word id with the word id of the token before it
        # (None for the very first token) to detect where a new word starts.
        preceding = [None, *row_word_ids[:-1]]
        aligned.append([
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(row_word_ids, preceding)
        ])

    encoding["labels"] = aligned
    return encoding

In [4]:
# Apply tokenization and label alignment to the whole test split, batch by batch.
tokenized_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)
# Collator handed to the Trainer below to assemble evaluation batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Map: 100%|████████████████████| 117120/117120 [00:09<00:00, 11926.88 examples/s]


In [5]:
# Load the fitted label encoder from DATA_DIR (not a hard-coded "data/" path)
# so that changing DATA_DIR in the configuration cell is enough.
# NOTE: pickle.load can execute arbitrary code; only open a file you created yourself.
with open(f"{DATA_DIR}/labelencoder.pkl", "rb") as f:
    le = pickle.load(f)

# Two-way mapping between integer class ids and tag names, used when computing metrics.
id2label = dict(enumerate(le.classes_))
label2id = {label: idx for idx, label in id2label.items()}

In [6]:
# Sequence-labelling metric loaded through `evaluate`; compute_metrics calls its compute().
seqeval = evaluate.load("seqeval")
def compute_metrics(p):
    """Score predicted tag ids against gold tag ids with seqeval.

    Args:
        p: Object exposing `predictions` and `label_ids`, each a sequence of
            per-example id sequences of matching length.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    pred_tags, gold_tags = [], []
    for pred_row, gold_row in zip(p.predictions, p.label_ids):
        # Positions whose gold id is -100 are excluded from scoring.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        pred_tags.append([id2label[pred_id] for pred_id, _ in kept])
        gold_tags.append([id2label[gold_id] for _, gold_id in kept])

    scores = seqeval.compute(predictions=pred_tags, references=gold_tags)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short: scores[full] for short, full in reported}

def preprocess_logits_for_metrics(logits, labels):
    """Reduce one batch of logits to predicted class ids before they are accumulated.

    Args:
        logits: Tensor whose last axis holds the per-class scores, or a
            tuple/list whose first element is that tensor.
        labels: Gold ids for the batch; unused here, kept for the expected
            two-argument signature.

    Returns:
        Tensor of argmax class ids over the last axis, on the same device as
        the logits.
    """
    if isinstance(logits, (tuple, list)):
        # Some models return extra outputs alongside the logits; the logits come first.
        logits = logits[0]
    # Take the argmax with torch directly instead of copying the full logits
    # to the host and routing them through NumPy on every evaluation step.
    return torch.argmax(logits, dim=-1)

In [7]:
# Evaluation-only configuration. No training loop runs, so no in-training
# evaluation schedule is set; evaluation is triggered explicitly by
# trainer.evaluate() below.
training_args = TrainingArguments(
    output_dir="./",
    per_device_eval_batch_size=32,
    do_train=False,
    do_eval=True,
)

# The Trainer is used only as an evaluation harness around the loaded model.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Run the model over the tokenized test split and show the resulting metrics.
eval_result = trainer.evaluate(eval_dataset=tokenized_dataset)
print(eval_result)

{'eval_loss': 0.18253612518310547, 'eval_precision': 0.8892020121861458, 'eval_recall': 0.8640431102871964, 'eval_f1': 0.8764420473037796, 'eval_accuracy': 0.9542758773490764, 'eval_runtime': 557.8663, 'eval_samples_per_second': 209.943, 'eval_steps_per_second': 6.561}


---

## RoBERTa on testing

In [2]:
# Name/path of the saved fine-tuned RoBERTa checkpoint; the tokenizer and the
# token-classification model are both loaded from it.
model_name = "roberta_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [3]:
# Load only the "test" split from the JSON file stored under DATA_DIR.
test_dataset = load_dataset("json", data_files={"test": f"{DATA_DIR}/test_data.json"})["test"]

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to the resulting tokens.

    Args:
        examples: Batch with "modified_words" (one list of words per example)
            and "ner_tags" (one list of tag ids per example, indexed by word).

    Returns:
        The tokenizer output with an added "labels" entry. For every example,
        the first token of each word carries that word's tag; tokens that map
        to no word and the remaining tokens of a word are marked with -100.
    """
    encoding = tokenizer(examples["modified_words"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_word_ids = encoding.word_ids(batch_index=row)
        # Pair each token's word id with the word id of the token before it
        # (None for the very first token) to detect where a new word starts.
        preceding = [None, *row_word_ids[:-1]]
        aligned.append([
            word_tags[current] if current is not None and current != before else -100
            for current, before in zip(row_word_ids, preceding)
        ])

    encoding["labels"] = aligned
    return encoding

In [4]:
# Apply tokenization and label alignment to the whole test split, batch by batch.
tokenized_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)
# Collator handed to the Trainer below to assemble evaluation batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Map: 100%|████████████████████| 117120/117120 [00:09<00:00, 12681.89 examples/s]


In [5]:
# Load the fitted label encoder from DATA_DIR (not a hard-coded "data/" path)
# so that changing DATA_DIR in the configuration cell is enough.
# NOTE: pickle.load can execute arbitrary code; only open a file you created yourself.
with open(f"{DATA_DIR}/labelencoder.pkl", "rb") as f:
    le = pickle.load(f)

# Two-way mapping between integer class ids and tag names, used when computing metrics.
id2label = dict(enumerate(le.classes_))
label2id = {label: idx for idx, label in id2label.items()}

In [6]:
# Sequence-labelling metric loaded through `evaluate`; compute_metrics calls its compute().
seqeval = evaluate.load("seqeval")
def compute_metrics(p):
    """Score predicted tag ids against gold tag ids with seqeval.

    Args:
        p: Object exposing `predictions` and `label_ids`, each a sequence of
            per-example id sequences of matching length.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    pred_tags, gold_tags = [], []
    for pred_row, gold_row in zip(p.predictions, p.label_ids):
        # Positions whose gold id is -100 are excluded from scoring.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        pred_tags.append([id2label[pred_id] for pred_id, _ in kept])
        gold_tags.append([id2label[gold_id] for _, gold_id in kept])

    scores = seqeval.compute(predictions=pred_tags, references=gold_tags)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short: scores[full] for short, full in reported}

def preprocess_logits_for_metrics(logits, labels):
    """Reduce one batch of logits to predicted class ids before they are accumulated.

    Args:
        logits: Tensor whose last axis holds the per-class scores, or a
            tuple/list whose first element is that tensor.
        labels: Gold ids for the batch; unused here, kept for the expected
            two-argument signature.

    Returns:
        Tensor of argmax class ids over the last axis, on the same device as
        the logits.
    """
    if isinstance(logits, (tuple, list)):
        # Some models return extra outputs alongside the logits; the logits come first.
        logits = logits[0]
    # Take the argmax with torch directly instead of copying the full logits
    # to the host and routing them through NumPy on every evaluation step.
    return torch.argmax(logits, dim=-1)

In [7]:
# Evaluation-only configuration. No training loop runs, so no in-training
# evaluation schedule is set; evaluation is triggered explicitly by
# trainer.evaluate() below.
training_args = TrainingArguments(
    output_dir="./",
    per_device_eval_batch_size=32,
    do_train=False,
    do_eval=True,
)

# The Trainer is used only as an evaluation harness around the loaded model.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Run the model over the tokenized test split and show the resulting metrics.
eval_result = trainer.evaluate(eval_dataset=tokenized_dataset)
print(eval_result)

{'eval_loss': 0.17509829998016357, 'eval_precision': 0.8947445331927776, 'eval_recall': 0.870805817412943, 'eval_f1': 0.8826128853650539, 'eval_accuracy': 0.9565493488181205, 'eval_runtime': 849.9891, 'eval_samples_per_second': 137.79, 'eval_steps_per_second': 4.306}
