In [1]:
# Define the gpu on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=7

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=7


In [2]:
import os
import pandas as pd
import json
from sklearn.metrics import f1_score

In [None]:
# Based on the dataset that is being evaluated, extract a list of true labels and a list of unique labels

def extract_true_label(dataset, dataset_dict):
    """The function takes the dataset name and dataset dictionary and returns the list of true labels from the test split."""
    # Load the json file
    with open(dataset_dict[dataset], "r") as file:
        json_dict = json.load(file)

    # Open the test dictionary as DataFrame
    test_df = pd.DataFrame(json_dict["test"])

    # Extract a list of unique labels
    labels = list(test_df.labels.unique())

    # Extract label list
    y_true = test_df.labels.to_list()

    return [y_true, labels]

#Calculate the scores
def testing(true, pred, labels):
    """
    This function takes the list of true labels and list of predictions and evaluates the model based on comparing them.
    It calculates micro and macro F1 scores.
    
    Args:
    - y_true: list of true labels
    - y_pred: list of predicted labels
    - show_matrix: defaults to False - whether the confusion matrix is printed

    The function returns a dictionary with accuracy, micro and macro F1.
    """
    y_true = true
    y_pred = pred
    LABELS = labels

    # Calculate the scores
    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS,  average="micro")
    print(f"Macro f1: {macro:0.3}, Micro f1: {micro:0.3}")
    
    return {"Micro F1":micro, "Macro F1": macro}

In [4]:
# Get all the datasets in the data/datasets directory
dataset_path = "data/datasets/"

datasets = os.listdir("data/datasets/")

# Add the path to the datasets
dataset_paths = [dataset_path + x for x in datasets]

dataset_dict = {x[0]:x[1] for x in list(zip(datasets, dataset_paths))}

print("Evaluated datasets: {}".format(datasets))

Evaluated datasets: ['hr500k.json', 'reldi-normtagner-hr.json', 'reldi-normtagner-sr.json', 'set.sr.plus.json']


In [9]:
model_list = ['csebert', 'xlm-r-base', 'xlm-r-large', 'bertic', 'xlm-r-bertic', 'xlm-r-slobertic']

In [47]:
# Open the submission to be evaluated
path = "/home/tajak/NER-recognition/benchich/ner/systems/hugging-face-models/submissions"

with open("{}/submission-{}-{}.json".format(path,model, dataset), "r") as sub_file:
    results = json.load(sub_file)

# Extract predictions
args = results["args"]

epochs = results["args"]["num_train_epochs"]
lr = results["args"]["learning_rate"]

In [80]:
dataset_name = results["predictions"][0]["test"]
dataset = dataset_name.split(" ")[0]
dataset

'set.sr.plus.json'

In [49]:
lr

4e-05

In [50]:
result_list = []

for model in model_list:
    for dataset in datasets:
        test_file = extract_true_label(dataset, dataset_dict)
        y_true = test_file[0]
        labels = test_file[1]

        # Open the submission to be evaluated
        path = "/home/tajak/NER-recognition/benchich/ner/systems/hugging-face-models/submissions"

        with open("{}/submission-{}-{}.json".format(path,model, dataset), "r") as sub_file:
            results = json.load(sub_file)

        # Extract information on arguments
        epochs = results["args"]["num_train_epochs"]
        lr = results["args"]["learning_rate"]

        # Extract predictions
        y_pred = results["predictions"][0]["predictions"]

        print("Evaluation: {} on {}".format(model, dataset))

        current_scores = testing(y_true, y_pred, labels)

        current_res_dict = {"Model": model, "Test Dataset": dataset, "Macro F1": current_scores["Macro F1"], "Micro F1": current_scores["Micro F1"], "Epochs": epochs, "Learning Rate": lr}

        result_list.append(current_res_dict)

        print("\n----------------------\n")

Evaluation: csebert on hr500k.json
Macro f1: 0.627, Micro f1: 0.959

----------------------

Evaluation: csebert on reldi-normtagner-hr.json
Macro f1: 0.517, Micro f1: 0.956

----------------------

Evaluation: csebert on reldi-normtagner-sr.json
Macro f1: 0.512, Micro f1: 0.973

----------------------

Evaluation: csebert on set.sr.plus.json
Macro f1: 0.612, Micro f1: 0.953

----------------------

Evaluation: xlm-r-base on hr500k.json
Macro f1: 0.568, Micro f1: 0.955

----------------------

Evaluation: xlm-r-base on reldi-normtagner-hr.json
Macro f1: 0.404, Micro f1: 0.956

----------------------

Evaluation: xlm-r-base on reldi-normtagner-sr.json
Macro f1: 0.491, Micro f1: 0.972

----------------------

Evaluation: xlm-r-base on set.sr.plus.json
Macro f1: 0.604, Micro f1: 0.953

----------------------

Evaluation: xlm-r-large on hr500k.json
Macro f1: 0.0957, Micro f1: 0.917

----------------------

Evaluation: xlm-r-large on reldi-normtagner-hr.json
Macro f1: 0.0957, Micro f1: 0.91

In [51]:
# Create a dataframe

result_df = pd.DataFrame(result_list)

# Save the df

result_df.to_csv("hugging-face-models-results.csv")

In [56]:
# For each dataset, create a table with results

def results_table(result_df, datasets):
    dataset_df = result_df[result_df["Test Dataset"] == dataset]

    # Sort values based on highes Macro F1
    dataset_df = dataset_df.sort_values(by="Macro F1", ascending=False)

    # Round scores to 3 decimal places
    dataset_df["Macro F1"] = dataset_df["Macro F1"].round(3)

    dataset_df["Micro F1"] = dataset_df["Micro F1"].round(3)

    print(dataset_df.to_markdown(index=False))
    print("\n")

    return dataset_df

In [64]:
current_df = results_table(result_df, datasets[0])

type(current_df.to_markdown(index=False))

| Model           | Test Dataset     |   Macro F1 |   Micro F1 |   Epochs |   Learning Rate |
|:----------------|:-----------------|-----------:|-----------:|---------:|----------------:|
| bertic          | set.sr.plus.json |      0.618 |      0.954 |       10 |           4e-05 |
| csebert         | set.sr.plus.json |      0.612 |      0.953 |        9 |           4e-05 |
| xlm-r-base      | set.sr.plus.json |      0.604 |      0.953 |        6 |           4e-05 |
| xlm-r-large     | set.sr.plus.json |      0.597 |      0.952 |       13 |           4e-05 |
| xlm-r-bertic    | set.sr.plus.json |      0.59  |      0.953 |       13 |           4e-05 |
| xlm-r-slobertic | set.sr.plus.json |      0.094 |      0.881 |       13 |           4e-05 |




str

In [65]:
for dataset in datasets:
    current_df = results_table(result_df, datasets)
    print(current_df.to_markdown(index=False))

    # Save the table in markdown
    with open("systems/hugging-face-models/results-{}.md".format(dataset), "w") as result_file:
        result_file.write(current_df.to_markdown(index=False))

| Model           | Test Dataset   |   Macro F1 |   Micro F1 |   Epochs |   Learning Rate |
|:----------------|:---------------|-----------:|-----------:|---------:|----------------:|
| csebert         | hr500k.json    |      0.627 |      0.959 |        4 |           4e-05 |
| bertic          | hr500k.json    |      0.596 |      0.957 |        9 |           4e-05 |
| xlm-r-base      | hr500k.json    |      0.568 |      0.955 |        5 |           4e-05 |
| xlm-r-slobertic | hr500k.json    |      0.158 |      0.922 |        7 |           4e-05 |
| xlm-r-large     | hr500k.json    |      0.096 |      0.917 |        7 |           4e-05 |
| xlm-r-bertic    | hr500k.json    |      0.096 |      0.917 |        7 |           4e-05 |


| Model           | Test Dataset   |   Macro F1 |   Micro F1 |   Epochs |   Learning Rate |
|:----------------|:---------------|-----------:|-----------:|---------:|----------------:|
| csebert         | hr500k.json    |      0.627 |      0.959 |        4 |     