# Evaluation

In [1]:
# Locations of the shared helper modules and of the Cantemist corpus
utils_path = "../utils/"
corpus_path = "../datasets/cantemist_v6/"

# Sub-task selected for this notebook and the corpus sub-folder derived from it
subtask = "norm"
sub_task_path = "".join(["cantemist-", subtask, "/"])

# Folder holding the gold-standard test annotations of the selected sub-task
test_gs_path = "".join([corpus_path, "test-set/", sub_task_path])

In [2]:
import tensorflow as tf

# Auxiliary components
import sys
sys.path.insert(0, utils_path)
from nlp_utils import *

# Directory the prediction files are read from (and the F1 files are written to)
RES_DIR = "../results/Cantemist/final_exec/"

# Gold-standard (GS) test annotations, formatted for the selected sub-task
df_test_gs = format_ner_gs(
    test_gs_path,
    subtask=subtask
)

2022-09-14 08:40:02.951142: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Evaluation

In [3]:
def check_ner_norm_performance(model_name, arr_execs):
    """
    Sanity check: print the NER and NORM test metrics of every execution of a model.

    model_name: model identifier used inside the prediction file names.
    arr_execs: iterable with the identifiers of the executions to check.

    For each execution, the NER and NORM prediction files are read from RES_DIR
    (tab-separated, first row used as header), formatted with `format_ner_pred_df`
    and scored against `df_test_gs` with `calculate_ner_metrics`.
    The NORM evaluation uses the notebook-level `subtask` value.
    """
    def _score(file_tag, task, exec_id):
        # Load the predictions of one task for one execution and compute its metrics
        preds_file = RES_DIR + "df_test_preds_" + file_tag + "_c_multi_task_" + \
                str(model_name) + "_" + str(exec_id) + ".csv"
        df_preds = pd.read_csv(preds_file, header=0, sep='\t')
        df_formatted = format_ner_pred_df(
            gs_path=test_gs_path, 
            df_preds=df_preds, 
            subtask=task
        )
        return calculate_ner_metrics(gs=df_test_gs, pred=df_formatted, subtask=task)

    for exec_id in arr_execs:
        print("Exec " + str(exec_id) + ":")
        print("NER performance:", _score("ner", 'ner', exec_id))
        # Blank line after each execution to keep the printed report readable
        print("NORM performance:", _score("norm", subtask, exec_id), end="\n\n")

In [4]:
def model_performance(dict_names_execs, subtask='norm', 
                      df_gs=df_test_gs, path_gs=test_gs_path,
                      round_n=3):
    """
    Summarize the performance of each model over its executions as a pd.DataFrame.

    dict_names_execs: each key is a string with the model name, and 
                      each value is a list with the random execs of the corresponding model.
    subtask: sub-task name, used both in the prediction file names and in the evaluation.
    df_gs: gold-standard data passed to `calculate_ner_metrics`.
    path_gs: gold-standard path passed to `format_ner_pred_df`.
    round_n: number of decimals the reported statistics are rounded to.

    Returns a pd.DataFrame with one row per model and the columns
    P_avg, P_std, P_max, R_avg, R_std, R_max, F1_avg, F1_std, F1_max.
    """
    metric_names = ("P", "R", "F1")
    # (column suffix, key in the pd.Series.describe() output), in column order
    stat_names = (("avg", "mean"), ("std", "std"), ("max", "max"))
    col_names = [metric + "_" + suffix
                 for metric in metric_names
                 for suffix, _ in stat_names]

    summary = {}
    for name, exec_ids in dict_names_execs.items():
        # Metric values collected across the executions of the current model
        scores = {metric: [] for metric in metric_names}
        for exec_id in exec_ids:
            preds_file = RES_DIR + "df_test_preds_" + subtask + "_c_multi_task_" + \
                    str(name) + "_" + str(exec_id) + ".csv"
            df_preds = pd.read_csv(preds_file, header=0, sep='\t')
            df_formatted = format_ner_pred_df(
                gs_path=path_gs, 
                df_preds=df_preds, 
                subtask=subtask
            )
            precision, recall, f1 = calculate_ner_metrics(
                gs=df_gs, 
                pred=df_formatted, 
                subtask=subtask
            )
            scores["P"].append(precision)
            scores["R"].append(recall)
            scores["F1"].append(f1)

        model_row = {}
        for metric in metric_names:
            stats = pd.Series(scores[metric]).describe()
            for suffix, stat_key in stat_names:
                model_row[metric + "_" + suffix] = round(stats[stat_key], round_n)
        summary[name] = model_row

    # Models are the columns of the intermediate frame; transpose to get one row per model
    return pd.DataFrame(summary, index=col_names).transpose()

In [5]:
def format_df_paper(df_res, round_n=3):
    """
    Build the paper-ready table from a DataFrame of performance statistics.

    df_res: pd.DataFrame containing, for each metric in P, R and F1, the numeric
            columns `<metric>_avg`, `<metric>_std` and `<metric>_max`.
    round_n: number of decimals shown for every value (default 3).

    Returns a new pd.DataFrame with the string columns `<metric>_avg_std`
    (formatted as "avg ± std") and `<metric>_max` for each metric, in that order.
    The input DataFrame is not modified.

    Values are printed with a fixed number of decimals and without the leading
    zero (0.82 -> ".820"); values with a non-zero integer part keep it
    (1.0 -> "1.000").
    """
    def _fmt(value):
        # Fixed-width decimals keep trailing zeros, so every cell has the same precision
        text = "{:.{}f}".format(float(value), round_n)
        # Strip only a leading "0"; never discard a non-zero integer part
        if text.startswith("0."):
            return text[1:]
        return text

    arr_metrics = ["P", "R", "F1"]
    arr_cols = []
    # Work on a copy so the caller's DataFrame keeps its numeric columns
    df_out = df_res.copy()
    for metric in arr_metrics:
        df_out[metric + '_avg_std'] = [
            _fmt(avg) + " ± " + _fmt(std)
            for avg, std in zip(df_res[metric + '_avg'], df_res[metric + '_std'])
        ]
        df_out[metric + '_max'] = [_fmt(val) for val in df_res[metric + '_max']]
        arr_cols += [metric + '_avg_std', metric + '_max']
    return df_out[arr_cols]

In [None]:
# Sanity check

In [6]:
# Per-execution NER/NORM metrics of the "mbert" model (executions 1 to 5)
m_name = "mbert"
execs = list(range(1, 6))

check_ner_norm_performance(arr_execs=execs, model_name=m_name)

Exec 1:
NER performance: (0.855, 0.8615, 0.8582)
NORM performance: (0.8044, 0.8106, 0.8075)

Exec 2:
NER performance: (0.8559, 0.8566, 0.8562)
NORM performance: (0.8058, 0.8065, 0.8062)

Exec 3:
NER performance: (0.8631, 0.8643, 0.8637)
NORM performance: (0.8114, 0.8126, 0.812)

Exec 4:
NER performance: (0.8597, 0.8516, 0.8556)
NORM performance: (0.8083, 0.8007, 0.8045)

Exec 5:
NER performance: (0.8435, 0.8632, 0.8532)
NORM performance: (0.7972, 0.8159, 0.8064)



In [7]:
# Per-execution NER/NORM metrics of the "mbert_galen" model (executions 1 to 5)
m_name = "mbert_galen"
execs = list(range(1, 6))

check_ner_norm_performance(arr_execs=execs, model_name=m_name)

Exec 1:
NER performance: (0.8653, 0.8643, 0.8648)
NORM performance: (0.8101, 0.8092, 0.8097)

Exec 2:
NER performance: (0.8822, 0.8593, 0.8706)
NORM performance: (0.8302, 0.8087, 0.8193)

Exec 3:
NER performance: (0.8529, 0.8748, 0.8637)
NORM performance: (0.8011, 0.8216, 0.8113)

Exec 4:
NER performance: (0.8694, 0.8665, 0.8679)
NORM performance: (0.8202, 0.8175, 0.8189)

Exec 5:
NER performance: (0.864, 0.8637, 0.8639)
NORM performance: (0.8136, 0.8134, 0.8135)



## Paper

### NORM

In [8]:
# NORM statistics over executions 1 to 5 of every model
model_performance(
    {
        name: [1, 2, 3, 4, 5]
        for name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
    },
    subtask='norm'
)

Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
beto,0.802,0.011,0.82,0.797,0.005,0.805,0.799,0.004,0.804
beto_galen,0.805,0.005,0.81,0.806,0.007,0.816,0.805,0.004,0.813
mbert,0.805,0.005,0.811,0.809,0.006,0.816,0.807,0.003,0.812
mbert_galen,0.815,0.011,0.83,0.814,0.006,0.822,0.815,0.004,0.819
xlmr,0.802,0.007,0.814,0.806,0.006,0.816,0.804,0.005,0.81
xlmr_galen,0.812,0.008,0.826,0.812,0.003,0.817,0.812,0.004,0.818


In [9]:
# NORM statistics of every model, formatted for the paper
format_df_paper(
    model_performance(
        {
            name: [1, 2, 3, 4, 5]
            for name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
        },
        subtask='norm'
    )
)

Unnamed: 0,P_avg_std,P_max,R_avg_std,R_max,F1_avg_std,F1_max
beto,.802 ± .011,0.82,.797 ± .005,0.805,.799 ± .004,0.804
beto_galen,.805 ± .005,0.81,.806 ± .007,0.816,.805 ± .004,0.813
mbert,.805 ± .005,0.811,.809 ± .006,0.816,.807 ± .003,0.812
mbert_galen,.815 ± .011,0.83,.814 ± .006,0.822,.815 ± .004,0.819
xlmr,.802 ± .007,0.814,.806 ± .006,0.816,.804 ± .005,0.81
xlmr_galen,.812 ± .008,0.826,.812 ± .003,0.817,.812 ± .004,0.818


### NER

In [10]:
# NER statistics over executions 1 to 5 of every model
model_performance(
    {
        name: [1, 2, 3, 4, 5]
        for name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
    },
    subtask='ner'
)

Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
beto,0.858,0.012,0.878,0.852,0.007,0.862,0.855,0.004,0.861
beto_galen,0.864,0.006,0.871,0.865,0.005,0.871,0.865,0.003,0.868
mbert,0.855,0.007,0.863,0.859,0.005,0.864,0.857,0.004,0.864
mbert_galen,0.867,0.011,0.882,0.866,0.006,0.875,0.866,0.003,0.871
xlmr,0.853,0.005,0.859,0.857,0.007,0.868,0.855,0.003,0.859
xlmr_galen,0.865,0.008,0.878,0.866,0.004,0.87,0.865,0.003,0.869


In [11]:
# NER statistics of every model, formatted for the paper
format_df_paper(
    model_performance(
        {
            name: [1, 2, 3, 4, 5]
            for name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
        },
        subtask='ner'
    )
)

Unnamed: 0,P_avg_std,P_max,R_avg_std,R_max,F1_avg_std,F1_max
beto,.858 ± .012,0.878,.852 ± .007,0.862,.855 ± .004,0.861
beto_galen,.864 ± .006,0.871,.865 ± .005,0.871,.865 ± .003,0.868
mbert,.855 ± .007,0.863,.859 ± .005,0.864,.857 ± .004,0.864
mbert_galen,.867 ± .011,0.882,.866 ± .006,0.875,.866 ± .003,0.871
xlmr,.853 ± .005,0.859,.857 ± .007,0.868,.855 ± .003,0.859
xlmr_galen,.865 ± .008,0.878,.866 ± .004,0.87,.865 ± .003,0.869


Save the (F1) performance of all executions of all models

In [7]:
def model_f1_values(dict_names_execs, subtask='norm', 
                    df_gs=df_test_gs, path_gs=test_gs_path):
    """
    Collect the F1 value of every execution of every model into a flat list.

    dict_names_execs: each key is a string with the model name, and 
                      each value is a list with the random execs of the corresponding model.
    subtask: sub-task name, used both in the prediction file names and in the evaluation.
    df_gs: gold-standard data passed to `calculate_ner_metrics`.
    path_gs: gold-standard path passed to `format_ner_pred_df`.

    Returns a list with one F1 value per (model, execution) pair, following the
    iteration order of `dict_names_execs` and, within each model, of its executions.
    """
    def _exec_f1(name, exec_id):
        # F1 of a single execution: load its predictions, format them and evaluate
        preds_file = RES_DIR + "df_test_preds_" + subtask + "_c_multi_task_" + \
                str(name) + "_" + str(exec_id) + ".csv"
        df_preds = pd.read_csv(preds_file, header=0, sep='\t')
        df_formatted = format_ner_pred_df(
            gs_path=path_gs, 
            df_preds=df_preds, 
            subtask=subtask
        )
        _precision, _recall, f1 = calculate_ner_metrics(
            gs=df_gs, 
            pred=df_formatted, 
            subtask=subtask
        )
        return f1

    return [
        _exec_f1(name, exec_id)
        for name, exec_ids in dict_names_execs.items()
        for exec_id in exec_ids
    ]

In [None]:
# NER

In [8]:
# NER F1 of executions 1 to 5 of every model, in the listed model order
arr_val = model_f1_values(
    {
        name: [1, 2, 3, 4, 5]
        for name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
    },
    subtask='ner'
)

In [17]:
# Store the values currently held in `arr_val` as a tab-separated file, without index or header
pd.DataFrame(arr_val).to_csv(
    RES_DIR + "ner_f1_exec_c_multi_task.csv",
    sep='\t',
    header=False,
    index=False
)

In [None]:
# NORM

In [18]:
# NORM F1 of executions 1 to 5 of every model, in the listed model order
arr_val = model_f1_values(
    {
        name: [1, 2, 3, 4, 5]
        for name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
    },
    subtask='norm'
)

In [19]:
# Store the values currently held in `arr_val` as a tab-separated file, without index or header
pd.DataFrame(arr_val).to_csv(
    RES_DIR + "norm_f1_exec_c_multi_task.csv",
    sep='\t',
    header=False,
    index=False
)