# Prediction and Evaluation of BERT-based System

This notebook tests the performance of the fine-tuned BERT model and generates a file that combines the dev/test dataset with the corresponding predictions.

For faster processing, we suggest running this notebook in Google Colab.

This step utilises code from the following GitHub repository:

[bert4srl](https://github.com/angel-daza/bert4srl)


In [92]:
# Install the OS package for Python 3.10's `venv` support; the next cell
# relies on it when it runs `python -m venv`.
! apt install python3.10-venv

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3.10-venv is already the newest version (3.10.12-1~22.04.3).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [93]:
# Create a virtual environment in ./myvenve and try to activate it.
# NOTE(review): each `!` line appears to run in its own shell, so the
# activation below probably does not carry over to later cells or to the
# running kernel - confirm whether this venv is actually used by the notebook.
! python -m venv myvenve
! source myvenve/bin/activate

In [94]:
%%writefile requirements.txt
keras==2.8.0
seqeval==1.2.2
tabulate==0.8.9
tensorflow==2.8.4
torch==1.11.0
transformers==4.17.0

Overwriting requirements.txt


In [95]:
! pip install -r requirements.txt



In [96]:
from collections import defaultdict
import pandas as pd
import torch
import numpy as np
import argparse
from transformers import BertTokenizer
from transformers import BertForTokenClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

In [97]:
! pip install seqeval



In [98]:
# Give this Colab session access to the bert4srl checkout stored on Google Drive
import sys

from google.colab import drive

# Mount Google Drive; the dataset and saved models are read from below this mount point
drive.mount('/content/drive')

# Put the repository folder on the module search path so its modules can be imported
sys.path.append('/content/drive/MyDrive/Colab Notebooks/bert4srl-master')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [99]:
# Our code behind the scenes!
import utils_srl

In [113]:
# -------------------------------------------------------------------------------------
#   RUN CONFIGURATION
# -------------------------------------------------------------------------------------

# --- Locations ---
# JSONL file to be evaluated (change this path to switch dataset split)
test_path = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/data/annotation_test.jsonl'
# Directory where the fine-tuned model is saved (change this path for another model)
model_dir = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/saved_models_tuned/TRIAL_BERT_SRL'

# --- Checkpoint selection ---
epoch = 7    # epoch with the best performance; selects the EPOCH_<n> sub-directory to load
lang = "EN"  # language tag; becomes part of the metrics file name

# --- Evaluation switches ---
gold_labels = False
eval_preds = 'True'  # textual flag, converted to a bool below

# --- Batching ---
batch_size, seq_max_len = 16, 256

# --- Derived settings used by the cells below ---
EVALUATE_PREDICATES = utils_srl.get_bool_value(eval_preds)
device, USE_CUDA = utils_srl.get_torch_device()
file_has_gold = gold_labels
BATCH_SIZE = batch_size
SEQ_MAX_LEN = seq_max_len

No GPU available, using the CPU instead.


In [114]:
# Restore the fine-tuned model and its tokenizer from the selected epoch checkpoint
checkpoint_dir = f"{model_dir}/EPOCH_{epoch}"
model, tokenizer = utils_srl.load_model(BertForTokenClassification, BertTokenizer, checkpoint_dir)

# Label vocabulary stored next to the model, plus the inverse (index -> label) lookup
label2index = utils_srl.load_label_dict(f"{model_dir}/label2index.json")
index2label = dict((index, label) for label, index in label2index.items())

# Encode the file to predict on; the first returned value is not needed here
(
    _,
    prediction_inputs,
    prediction_masks,
    gold_labels,
    seq_lens,
    gold_predicates,
) = utils_srl.load_srl_dataset(
    test_path,
    tokenizer,
    include_labels=True,
    max_len=SEQ_MAX_LEN,
    label2index=label2index,
)

In [115]:
# Bundle the encoded inputs into a dataset and serve it in its original order,
# BATCH_SIZE sentences at a time
prediction_data = TensorDataset(
    prediction_inputs,
    prediction_masks,
    gold_labels,
    seq_lens,
    gold_predicates,
)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    prediction_data,
    batch_size=BATCH_SIZE,
    sampler=prediction_sampler,
)

print(f'Predicting labels for {len(prediction_inputs):,} test sentences...')

Predicting labels for 478 test sentences...


In [116]:
# Switch the model to evaluation mode before predicting
model.eval()

# Flat accumulators of predicted / gold labels over all sentences
predictions = []
true_labels = []

# Running count of processed sentences
total_sents = 0

# gold label -> labels predicted at the same position
confusion_dict = defaultdict(list)

# Per-label counters, filled by the prediction loop and read by the metrics cell
arg_excess = defaultdict(int)
arg_missed = defaultdict(int)
arg_match = defaultdict(int)

In [117]:
# Predict
# Runs the model over every batch, updates the evaluation counters and builds
# the flat list of predicted labels that is later attached to the TSV file.
pred_label_list = []            # one list of predicted labels per sentence
pred_label_list_processed = []  # all sentences concatenated, first/last label and 'X' removed

for batch in prediction_dataloader:
    # Move every tensor of the batch to the selected device
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_lengths, b_preds = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=b_preds, attention_mask=b_input_mask)

    logits = outputs[0]  # [B, S, V]
    class_probabilities = torch.softmax(logits, dim=-1)

    # Move class_probabilities and labels to CPU
    class_probabilities = class_probabilities.detach().cpu().numpy()
    argmax_indices = np.argmax(class_probabilities, axis=-1)

    label_ids = b_labels.to('cpu').numpy()
    seq_lengths = b_lengths.to('cpu').numpy()

    for ix in range(len(label_ids)):
        total_sents += 1
        # Store predictions and true labels.
        # Index 0 is dropped from both sequences (presumably the padding id - TODO confirm).
        pred_labels = [index2label[p] for p in argmax_indices[ix][:seq_lengths[ix]] if p != 0]
        # Use a loop-local name: reusing `gold_labels` here would overwrite the label
        # tensor loaded earlier and break a re-run of the DataLoader cell.
        sent_gold_labels = [index2label[g] for g in label_ids[ix] if g != 0]
        predictions += pred_labels[:len(sent_gold_labels)]
        true_labels += sent_gold_labels
        # We have to evaluate ONLY the labels that belong to a Start WordPiece (not contain "##")
        eval_metrics = utils_srl.evaluate_tagset(sent_gold_labels, pred_labels, ignore_verb_label=EVALUATE_PREDICATES)
        arg_excess, arg_missed, arg_match = utils_srl.add_to_eval_dicts(eval_metrics, arg_excess, arg_missed, arg_match)

        # Record, per gold label, what was predicted at the same position
        for j, gold in enumerate(sent_gold_labels):
            if j < len(pred_labels):
                confusion_dict[gold].append(pred_labels[j])

        pred_label_list.append(pred_labels)

        # Drop the first and last label and every 'X' label.
        # NOTE(review): 'X' is filtered by the *predicted* label, independently for
        # predictions and gold; if the model predicts 'X' where gold does not (or the
        # reverse) the flat list may stop lining up with the rows it is later
        # attached to - confirm.
        pred_list_processed = [element for element in pred_labels[1:-1] if element != 'X']
        gold_list_processed = [element for element in sent_gold_labels[1:-1] if element != 'X']

        pred_label_list_processed.extend(pred_list_processed)

        print(f"\n----- {total_sents} -----\n{pred_list_processed}\n{gold_list_processed}")


----- 1 -----
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

----- 2 -----
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-event', 'I-event', 'B-time', 'I-time', 'I-time', 'I-time', 'I-time', 'O']
['O', 'O', 'O', 'O', 'O', '[UNK]', 'O', 'B-event', 'I-event', 'B-time', 'I-time', 'I-time', 'I-time', 'I-time', 'O']

----- 3 -----
['O', 'O', 'O', 'O', 'O', 'O', 'B-time', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-time', 'O']

----- 4 -----
['O', 'B-event', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['[UNK]', 'B-event', 'I-event', 'I-event', 'I-event', 'I-event', 'I-event', 'I-event', 'I-event', 'I-event', 'I-event', 'O']

----- 5 -----
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

----- 6 -----
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-event', 'I-event', 'I-event', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [118]:
# Overall Metrics
# The results file lives in the model directory and is named after language and epoch
metrics_file = f"{model_dir}/F1_Results_{lang}_{epoch}.txt"
utils_srl.get_overall_metrics(
    arg_excess,
    arg_missed,
    arg_match,
    save_to_file=metrics_file,
    print_metrics=True,
)


--- OVERALL ---
Correct: 895	Excess: 400	Missed: 499
Precision: 69.11		Recall: 64.20
F1: 66.57

                  corr.    excess    missed    prec.    rec.     F1
--------------  -------  --------  --------  -------  ------  -----
I-event             371       216       173    63.20   68.20  65.61
I-participants        0        11         0     0.00    0.00   0.00
I-place              22        17        20    56.41   52.38  54.32
I-time              260        40        28    86.67   90.28  88.44
[UNK]                 0         0       170     0.00    0.00   0.00
event               130        65        63    66.67   67.36  67.01
participants          0        18         0     0.00    0.00   0.00
place                 4        14        18    22.22   18.18  20.00
time                108        19        27    85.04   80.00  82.44


(69.1119691119691, 64.20373027259684, 66.56749721085906)

In [120]:
# Dev split: the tab-separated file to read and the file the predictions will be written to
file_path = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/data/annotation_dev_processed.tsv'
output_path = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/data/bert_dev_predictions.tsv'

# Load the processed dev file into a DataFrame
data = pd.read_csv(file_path, sep='\t', encoding='utf-8')

In [121]:
# Predictions of test data (uncomment to run)

# # Read test files
# file_path = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/data/annotation_test_processed.tsv'
# output_path = '/content/drive/MyDrive/Colab Notebooks/bert4srl-master/data/bert_test_predictions.tsv'
# data = pd.read_csv(file_path, delimiter='\t', encoding='utf-8')

In [124]:
# Attach the flat list of predicted labels as a new column and save the result as TSV.
# The predictions were produced from `test_path` while the rows were read from
# `file_path`; fail early with an actionable message if the two do not have the
# same number of entries (e.g. predictions on the test split, rows from the dev split).
if len(pred_label_list_processed) != len(data):
    raise ValueError(
        f"Cannot add predictions: got {len(pred_label_list_processed)} predicted labels "
        f"for {len(data)} rows. Check that test_path and file_path refer to the same split."
    )

# Add a new column in data
data['prediction'] = pred_label_list_processed

# save the dataframe to the output file
data.to_csv(output_path, sep='\t', index=False, encoding='utf-8')
print("New column added and file saved successfully.")

New column added and file saved successfully.
