# Evaluation

In [1]:
utils_path = "../utils/"
corpus_path = "../datasets/final_dataset_v4_to_publish/"
codes_d_path = "../datasets/final_dataset_v4_to_publish/codiesp_codes/codiesp-P_codes.tsv"
test_gs_path = corpus_path + "test/testX.tsv"

In [2]:
import tensorflow as tf

# Auxiliary components
import sys
sys.path.insert(0, utils_path)
from nlp_utils import *

RES_DIR = "../results/CodiEsp/final_exec/"

TYPE_ANN = "PROCEDIMIENTO"
TYPE_TASK = TYPE_ANN[0].lower()

# GS data
df_test_gs = format_codiesp_x_gs(test_gs_path)

valid_codes = set(pd.read_csv(codes_d_path, sep='\t', header=None, 
                                  usecols=[0])[0].tolist())
valid_codes = set([x.lower() for x in valid_codes])

2022-09-15 08:05:20.608571: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Evaluation

In [3]:
def check_ner_norm_performance(model_name, arr_execs):
    """
    Sanity-check procedure that prints the NER-NORM performance of each single model execution.
    """
    for i_exec in arr_execs:
        print("Exec " + str(i_exec) + ":")
        df_test_preds_ner = pd.read_csv(RES_DIR + "df_test_preds_ner_" + TYPE_TASK + "_multi_task_" + \
                str(model_name) + "_" + str(i_exec) + ".csv", header=0, sep='\t')
        # Adapt to CodiEsp format
        df_test_preds_ner['label_pred'] = TYPE_ANN
        df_test_preds_ner['pos_pred'] = [str(row['start']) + ' ' + str(row['end']) for index, row in df_test_preds_ner.iterrows()]
        df_test_preds_ner['code'] = 'n23' if TYPE_ANN == 'DIAGNOSTICO' else 'bn20'
        df_test_preds_ner = df_test_preds_ner[['clinical_case', 'pos_pred', 'label_pred', 'code']]
        print("NER performance:", calculate_codiesp_ner_metrics(
            df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN], 
            df_pred=format_codiesp_x_pred_df(
                df_run=df_test_preds_ner,
                valid_codes=valid_codes
            )
        ))
        df_test_preds_norm = pd.read_csv(RES_DIR + "df_test_preds_norm_" + TYPE_TASK + "_multi_task_" + \
                str(model_name) + "_" + str(i_exec) + ".csv", header=0, sep='\t')
        print("NORM performance:", calculate_codiesp_x_metrics(
            df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN], 
            df_pred=format_codiesp_x_pred_df(
                df_run=df_test_preds_norm,
                valid_codes=valid_codes
            )
        ), end="\n\n")

In [4]:
def model_performance(dict_names_execs, subtask='norm',
                      round_n=3):
    """
    Generate a pd.DataFrame with the statistics of the performance of each model.
    
    dict_names_seeds: each key is a string with the model name, and 
                      each value is a list with the seeds of the corresponding model.
    """
    res_dict = {}
    for model_name in dict_names_execs:
        p_res, r_res, f1_res = [], [], []
        for i_exec in dict_names_execs[model_name]:
            df_test_preds = pd.read_csv(RES_DIR + "df_test_preds_" + subtask + "_" + TYPE_TASK + "_multi_task_" + \
                    str(model_name) + "_" + str(i_exec) + ".csv", header=0, sep='\t')
            if subtask == 'ner':
                # Adapt to CodiEsp format
                df_test_preds['label_pred'] = TYPE_ANN
                df_test_preds['pos_pred'] = [str(row['start']) + ' ' + str(row['end']) for index, row in df_test_preds.iterrows()]
                df_test_preds['code'] = 'n23' if TYPE_ANN == 'DIAGNOSTICO' else 'bn20'
                df_test_preds = df_test_preds[['clinical_case', 'pos_pred', 'label_pred', 'code']]
                p, r, f1 = calculate_codiesp_ner_metrics(
                    df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN], 
                    df_pred=format_codiesp_x_pred_df(
                        df_run=df_test_preds,
                        valid_codes=valid_codes
                    )
                )
            else:
                p, r, f1 = calculate_codiesp_x_metrics(
                    df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN], 
                    df_pred=format_codiesp_x_pred_df(
                        df_run=df_test_preds,
                        valid_codes=valid_codes
                    )
                )
            p_res.append(p)
            r_res.append(r)
            f1_res.append(f1)
        p_res_stat = pd.Series(p_res).describe()
        r_res_stat = pd.Series(r_res).describe()
        f1_res_stat = pd.Series(f1_res).describe()
        res_dict[model_name] = {"P_avg": round(p_res_stat['mean'], round_n), "P_std": round(p_res_stat['std'], round_n), 
                                "P_max": round(p_res_stat['max'], round_n),
                                "R_avg": round(r_res_stat['mean'], round_n), "R_std": round(r_res_stat['std'], round_n), 
                                "R_max": round(r_res_stat['max'], round_n),
                                "F1_avg": round(f1_res_stat['mean'], round_n), "F1_std": round(f1_res_stat['std'], round_n), 
                                "F1_max": round(f1_res_stat['max'], round_n)}
    return pd.DataFrame(res_dict, index=["P_avg", "P_std", "P_max", 
                                         "R_avg", "R_std", "R_max", 
                                         "F1_avg", "F1_std", "F1_max"]).transpose()    

In [5]:
def format_df_paper(df_res):
    arr_metrics = ["P", "R", "F1"]
    arr_cols = []
    for metric in arr_metrics:
        df_res[metric + '_avg_std'] = df_res.apply(
            lambda x: "." + str(x[metric + '_avg']).split('.')[-1] + " ± " + \
                "." + str(x[metric + '_std']).split('.')[-1], 
            axis=1
        )
        df_res[metric + '_max'] = df_res[metric + '_max'].apply(
            lambda x: "." + str(x).split('.')[-1]
        )
        arr_cols += [metric + '_avg_std', metric + '_max']
    return df_res[arr_cols]

In [6]:
# Sanity check

In [6]:
m_name = "xlmr"
execs = [1, 2, 3, 4, 5]

check_ner_norm_performance(model_name=m_name, arr_execs=execs)

Exec 1:
NER performance: (0.7389, 0.4643, 0.5703)
NORM performance: (0.6241, 0.4118, 0.4962)

Exec 2:




NER performance: (0.7015, 0.4921, 0.5784)
NORM performance: (0.5785, 0.4232, 0.4888)

Exec 3:
NER performance: (0.7115, 0.4801, 0.5733)
NORM performance: (0.5854, 0.4096, 0.4819)

Exec 4:




NER performance: (0.7105, 0.4662, 0.563)
NORM performance: (0.5943, 0.405, 0.4817)

Exec 5:
NER performance: (0.7233, 0.4773, 0.5751)
NORM performance: (0.5836, 0.413, 0.4837)





In [7]:
m_name = "xlmr_galen"
execs = [1, 2, 3, 4, 5]

check_ner_norm_performance(model_name=m_name, arr_execs=execs)

Exec 1:
NER performance: (0.7039, 0.4958, 0.5818)
NORM performance: (0.5887, 0.4266, 0.4947)

Exec 2:




NER performance: (0.7106, 0.4893, 0.5796)
NORM performance: (0.5946, 0.4255, 0.496)

Exec 3:
NER performance: (0.7518, 0.4829, 0.588)
NORM performance: (0.6452, 0.4221, 0.5103)

Exec 4:




NER performance: (0.7265, 0.4875, 0.5835)
NORM performance: (0.6049, 0.4198, 0.4956)

Exec 5:
NER performance: (0.7545, 0.4699, 0.5791)
NORM performance: (0.6319, 0.4061, 0.4945)





## Paper

### NORM

In [8]:
model_performance(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }, 
    subtask='norm'
)



Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
beto,0.62,0.008,0.634,0.401,0.005,0.407,0.487,0.005,0.493
beto_galen,0.615,0.028,0.647,0.423,0.008,0.436,0.501,0.007,0.51
mbert,0.607,0.005,0.612,0.408,0.009,0.416,0.488,0.007,0.494
mbert_galen,0.621,0.014,0.642,0.418,0.005,0.423,0.499,0.007,0.51
xlmr,0.593,0.018,0.624,0.413,0.007,0.423,0.486,0.006,0.496
xlmr_galen,0.613,0.024,0.645,0.42,0.008,0.427,0.498,0.007,0.51


In [9]:
format_df_paper(
    model_performance(
        {
            'beto': [1, 2, 3, 4, 5], 
            'beto_galen': [1, 2, 3, 4, 5],
            'mbert': [1, 2, 3, 4, 5], 
            'mbert_galen': [1, 2, 3, 4, 5],
            'xlmr': [1, 2, 3, 4, 5], 
            'xlmr_galen': [1, 2, 3, 4, 5]
        }, 
        subtask='norm'
    )
)



Unnamed: 0,P_avg_std,P_max,R_avg_std,R_max,F1_avg_std,F1_max
beto,.62 ± .008,0.634,.401 ± .005,0.407,.487 ± .005,0.493
beto_galen,.615 ± .028,0.647,.423 ± .008,0.436,.501 ± .007,0.51
mbert,.607 ± .005,0.612,.408 ± .009,0.416,.488 ± .007,0.494
mbert_galen,.621 ± .014,0.642,.418 ± .005,0.423,.499 ± .007,0.51
xlmr,.593 ± .018,0.624,.413 ± .007,0.423,.486 ± .006,0.496
xlmr_galen,.613 ± .024,0.645,.42 ± .008,0.427,.498 ± .007,0.51


### NER

In [10]:
model_performance(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }, 
    subtask='ner'
)



Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
beto,0.749,0.012,0.762,0.463,0.006,0.468,0.572,0.007,0.578
beto_galen,0.729,0.024,0.764,0.483,0.015,0.508,0.58,0.007,0.588
mbert,0.725,0.009,0.736,0.472,0.012,0.485,0.572,0.01,0.583
mbert_galen,0.726,0.02,0.75,0.475,0.01,0.486,0.575,0.012,0.586
xlmr,0.717,0.014,0.739,0.476,0.011,0.492,0.572,0.006,0.578
xlmr_galen,0.729,0.023,0.754,0.485,0.01,0.496,0.582,0.004,0.588


In [11]:
format_df_paper(
    model_performance(
        {
            'beto': [1, 2, 3, 4, 5], 
            'beto_galen': [1, 2, 3, 4, 5],
            'mbert': [1, 2, 3, 4, 5], 
            'mbert_galen': [1, 2, 3, 4, 5],
            'xlmr': [1, 2, 3, 4, 5], 
            'xlmr_galen': [1, 2, 3, 4, 5]
        }, 
        subtask='ner'
    )
)



Unnamed: 0,P_avg_std,P_max,R_avg_std,R_max,F1_avg_std,F1_max
beto,.749 ± .012,0.762,.463 ± .006,0.468,.572 ± .007,0.578
beto_galen,.729 ± .024,0.764,.483 ± .015,0.508,.58 ± .007,0.588
mbert,.725 ± .009,0.736,.472 ± .012,0.485,.572 ± .01,0.583
mbert_galen,.726 ± .02,0.75,.475 ± .01,0.486,.575 ± .012,0.586
xlmr,.717 ± .014,0.739,.476 ± .011,0.492,.572 ± .006,0.578
xlmr_galen,.729 ± .023,0.754,.485 ± .01,0.496,.582 ± .004,0.588


Save the (F1) performance of all executions of all models

In [12]:
def model_f1_values(dict_names_execs, subtask='norm'):
    """
    Generate a vector containing the F1 performance of all executions of all models, in the given order.
    
    dict_names_execs: each key is a string with the model name, and 
                      each value is a list with the random execs of the corresponding model.
    """
    arr_values = []
    for model_name in dict_names_execs:
        for i_exec in dict_names_execs[model_name]:
            df_test_preds = pd.read_csv(RES_DIR + "df_test_preds_" + subtask + "_" + TYPE_TASK + "_multi_task_" + \
                    str(model_name) + "_" + str(i_exec) + ".csv", header=0, sep='\t')
            if subtask == 'ner':
                # Adapt to CodiEsp format
                df_test_preds['label_pred'] = TYPE_ANN
                df_test_preds['pos_pred'] = [str(row['start']) + ' ' + str(row['end']) for index, row in df_test_preds.iterrows()]
                df_test_preds['code'] = 'n23' if TYPE_ANN == 'DIAGNOSTICO' else 'bn20'
                df_test_preds = df_test_preds[['clinical_case', 'pos_pred', 'label_pred', 'code']]
                _, _, f1 = calculate_codiesp_ner_metrics(
                    df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN], 
                    df_pred=format_codiesp_x_pred_df(
                        df_run=df_test_preds,
                        valid_codes=valid_codes
                    )
                )
            else:
                _, _, f1 = calculate_codiesp_x_metrics(
                    df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN], 
                    df_pred=format_codiesp_x_pred_df(
                        df_run=df_test_preds,
                        valid_codes=valid_codes
                    )
                )
            arr_values.append(f1)
    return arr_values

In [None]:
# NER

In [13]:
arr_val = model_f1_values(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }, 
    subtask='ner'
)



In [7]:
pd.DataFrame(arr_val).to_csv(RES_DIR + "ner_f1_exec_" + TYPE_TASK + "_multi_task.csv", index=False, header=False, sep = '\t')

In [None]:
# NORM

In [8]:
arr_val = model_f1_values(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }, 
    subtask='norm'
)

In [9]:
pd.DataFrame(arr_val).to_csv(RES_DIR + "norm_f1_exec_" + TYPE_TASK + "_multi_task.csv", index=False, header=False, sep = '\t')