# Evaluation

In [1]:
# Folder with the shared helper modules (added to sys.path further down)
utils_path = "../utils/"
# Root folder of the corpus; the two paths below are located inside it
corpus_path = "../datasets/final_dataset_v4_to_publish/"
codes_d_path = corpus_path + "codiesp_codes/codiesp-D_codes.tsv"
test_gs_path = corpus_path + "test/testX.tsv"

In [2]:
import tensorflow as tf

# Auxiliary components
import sys
sys.path.insert(0, utils_path)
from nlp_utils import *

# Folder from which the per-execution prediction files are read
RES_DIR = "../results/CodiEsp/final_exec/"

# Annotation type under evaluation; its lower-cased initial is part of the result file names
TYPE_ANN = "DIAGNOSTICO"
TYPE_TASK = TYPE_ANN[0].lower()

# Gold-standard annotations of the test set
df_test_gs = format_codiesp_x_gs(test_gs_path)

# Lower-cased set of the codes found in the first column of the codes file
valid_codes = {
    code.lower()
    for code in pd.read_csv(codes_d_path, sep='\t', header=None, usecols=[0])[0].tolist()
}

2022-09-15 08:18:13.036705: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Evaluation

In [3]:
def check_ner_norm_performance(model_name, arr_execs):
    """
    Sanity-check procedure that prints the NER performance of each single model execution.

    model_name: string identifying the model; it is part of the predictions file name.
    arr_execs: iterable with the identifiers of the executions to evaluate.

    Nothing is returned: for every execution, the value returned by
    `calculate_codiesp_ner_metrics` is printed.
    """
    for i_exec in arr_execs:
        print("Exec " + str(i_exec) + ":")
        df_test_preds = pd.read_csv(
            RES_DIR + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" +
            str(model_name) + "_" + str(i_exec) + ".csv", header=0, sep='\t'
        )
        # Adapt to CodiEsp format
        df_test_preds['label_pred'] = TYPE_ANN
        # Column-wise concatenation instead of a row-by-row iterrows() loop: it is faster and
        # is not exposed to the per-row dtype upcasting that iterrows() performs
        df_test_preds['pos_pred'] = (
            df_test_preds['start'].astype(str) + ' ' + df_test_preds['end'].astype(str)
        )
        # NOTE(review): the same fixed code is assigned to every prediction; presumably a
        # placeholder so that only the spans are scored -- confirm
        df_test_preds['code'] = 'n23' if TYPE_ANN == 'DIAGNOSTICO' else 'bn20'
        df_test_preds = df_test_preds[['clinical_case', 'pos_pred', 'label_pred', 'code']]
        print("NER performance:", calculate_codiesp_ner_metrics(
            # Only the gold-standard rows of the evaluated annotation type are compared
            df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN],
            df_pred=format_codiesp_x_pred_df(
                df_run=df_test_preds,
                valid_codes=valid_codes
            )
        ), end="\n\n")

In [4]:
def model_performance(dict_names_execs,
                      round_n=3):
    """
    Generate a pd.DataFrame with the statistics of the performance of each model.

    dict_names_execs: each key is a string with the model name, and
                      each value is a list with the execs of the corresponding model.
    round_n: number of decimal places kept in every reported statistic.

    Returns a pd.DataFrame with one row per model and the columns
    P_avg, P_std, P_max, R_avg, R_std, R_max, F1_avg, F1_std, F1_max.
    The std is the pandas default (sample std, ddof=1), hence NaN for a model with a
    single exec. A model with an empty exec list gets NaN in every column.
    """
    stat_cols = ["P_avg", "P_std", "P_max",
                 "R_avg", "R_std", "R_max",
                 "F1_avg", "F1_std", "F1_max"]
    res_dict = {}
    for model_name in dict_names_execs:
        scores = {"P": [], "R": [], "F1": []}
        for i_exec in dict_names_execs[model_name]:
            df_test_preds = pd.read_csv(
                RES_DIR + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" +
                str(model_name) + "_" + str(i_exec) + ".csv", header=0, sep='\t'
            )
            # Adapt to CodiEsp format
            df_test_preds['label_pred'] = TYPE_ANN
            # Column-wise concatenation instead of a row-by-row iterrows() loop
            df_test_preds['pos_pred'] = (
                df_test_preds['start'].astype(str) + ' ' + df_test_preds['end'].astype(str)
            )
            # NOTE(review): the same fixed code is assigned to every prediction; presumably a
            # placeholder so that only the spans are scored -- confirm
            df_test_preds['code'] = 'n23' if TYPE_ANN == 'DIAGNOSTICO' else 'bn20'
            df_test_preds = df_test_preds[['clinical_case', 'pos_pred', 'label_pred', 'code']]
            p, r, f1 = calculate_codiesp_ner_metrics(
                df_gs=df_test_gs[df_test_gs['label_gs'] == TYPE_ANN],
                df_pred=format_codiesp_x_pred_df(
                    df_run=df_test_preds,
                    valid_codes=valid_codes
                )
            )
            scores["P"].append(p)
            scores["R"].append(r)
            scores["F1"].append(f1)
        model_stats = {}
        for metric, values in scores.items():
            # Explicit float dtype: an empty list would otherwise produce an object Series,
            # whose summary has no mean/std/max and made the lookup fail with KeyError
            ser = pd.Series(values, dtype=float)
            model_stats[metric + "_avg"] = round(ser.mean(), round_n)
            model_stats[metric + "_std"] = round(ser.std(), round_n)
            model_stats[metric + "_max"] = round(ser.max(), round_n)
        res_dict[model_name] = model_stats
    # Models become rows after the transpose; stat_cols fixes the column order
    return pd.DataFrame(res_dict, index=stat_cols).transpose()

In [5]:
def format_df_paper(df_res, n_decimals=3):
    """
    Format the statistics table as strings ready to be pasted into the paper.

    df_res: pd.DataFrame with the columns <M>_avg, <M>_std and <M>_max for each
            metric M in P, R, F1 (the layout returned by `model_performance`).
            It is not modified.
    n_decimals: number of decimals shown for every value (default 3, matching the
                default `round_n` of `model_performance`).

    Returns a new pd.DataFrame with the same index and the string columns
    P_avg_std, P_max, R_avg_std, R_max, F1_avg_std, F1_max, where <M>_avg_std
    is "<avg> ± <std>". Values are written without the leading zero (".630").
    """
    def _fmt(value):
        # Fixed-point formatting keeps trailing zeros (0.63 -> ".630") and never produces
        # scientific notation, unlike slicing str(value) after the decimal point
        text = "{:.{prec}f}".format(float(value), prec=n_decimals)
        if text.startswith("0."):
            return text[1:]
        if text.startswith("-0."):
            return "-" + text[2:]
        # Values with a non-zero integer part (e.g. 1.0) and NaN are kept in full
        return text

    arr_metrics = ["P", "R", "F1"]
    # Build a fresh frame so that the caller's table keeps its numeric columns
    df_paper = pd.DataFrame(index=df_res.index)
    for metric in arr_metrics:
        df_paper[metric + '_avg_std'] = [
            _fmt(avg) + " ± " + _fmt(std)
            for avg, std in zip(df_res[metric + '_avg'], df_res[metric + '_std'])
        ]
        df_paper[metric + '_max'] = [_fmt(val) for val in df_res[metric + '_max']]
    return df_paper

In [None]:
# Sanity check

In [6]:
m_name = "mbert"
execs = [1, 2, 3, 4, 5]

check_ner_norm_performance(model_name=m_name, arr_execs=execs)

Exec 1:




NER performance: (0.7918, 0.6296, 0.7015)

Exec 2:




NER performance: (0.7767, 0.6244, 0.6923)

Exec 3:




NER performance: (0.7758, 0.628, 0.6941)

Exec 4:




NER performance: (0.7676, 0.6349, 0.695)

Exec 5:




NER performance: (0.7793, 0.6241, 0.6931)



In [7]:
m_name = "xlmr_galen"
execs = [1, 2, 3, 4, 5]

check_ner_norm_performance(model_name=m_name, arr_execs=execs)

Exec 1:




NER performance: (0.7917, 0.6478, 0.7125)

Exec 2:




NER performance: (0.7733, 0.6461, 0.704)

Exec 3:




NER performance: (0.7643, 0.658, 0.7072)

Exec 4:




NER performance: (0.7669, 0.6514, 0.7044)

Exec 5:




NER performance: (0.7859, 0.6398, 0.7054)



## Paper

### NER

In [8]:
model_performance(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }
)



Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
beto,0.785,0.008,0.793,0.63,0.007,0.638,0.699,0.003,0.703
beto_galen,0.779,0.008,0.789,0.649,0.005,0.652,0.708,0.004,0.714
mbert,0.778,0.009,0.792,0.628,0.004,0.635,0.695,0.004,0.702
mbert_galen,0.781,0.006,0.789,0.644,0.005,0.65,0.706,0.004,0.712
xlmr,0.772,0.011,0.781,0.638,0.007,0.647,0.699,0.003,0.704
xlmr_galen,0.776,0.012,0.792,0.649,0.007,0.658,0.707,0.003,0.712


In [9]:
format_df_paper(
    model_performance(
        {
            'beto': [1, 2, 3, 4, 5], 
            'beto_galen': [1, 2, 3, 4, 5],
            'mbert': [1, 2, 3, 4, 5], 
            'mbert_galen': [1, 2, 3, 4, 5],
            'xlmr': [1, 2, 3, 4, 5], 
            'xlmr_galen': [1, 2, 3, 4, 5]
        }
    )
)



Unnamed: 0,P_avg_std,P_max,R_avg_std,R_max,F1_avg_std,F1_max
beto,.785 ± .008,0.793,.63 ± .007,0.638,.699 ± .003,0.703
beto_galen,.779 ± .008,0.789,.649 ± .005,0.652,.708 ± .004,0.714
mbert,.778 ± .009,0.792,.628 ± .004,0.635,.695 ± .004,0.702
mbert_galen,.781 ± .006,0.789,.644 ± .005,0.65,.706 ± .004,0.712
xlmr,.772 ± .011,0.781,.638 ± .007,0.647,.699 ± .003,0.704
xlmr_galen,.776 ± .012,0.792,.649 ± .007,0.658,.707 ± .003,0.712


Save the (F1) performance of all executions of all models

In [10]:
def model_f1_values(dict_names_execs, df_gs=df_test_gs):
    """
    Generate a vector containing the F1 performance of all executions of all models, in the given order.

    dict_names_execs: each key is a string with the model name, and
                      each value is a list with the random execs of the corresponding model.
    df_gs: gold-standard frame; only the rows whose 'label_gs' equals TYPE_ANN are evaluated.
           The default is the value `df_test_gs` had when this function was defined.

    Returns a flat list with one F1 value per (model, exec) pair: models follow the
    iteration order of the dict and, within a model, execs follow the order of its list.
    """
    arr_values = []
    for model_name in dict_names_execs:
        for i_exec in dict_names_execs[model_name]:
            df_test_preds = pd.read_csv(
                RES_DIR + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" +
                str(model_name) + "_" + str(i_exec) + ".csv", header=0, sep='\t'
            )
            # Adapt to CodiEsp format
            df_test_preds['label_pred'] = TYPE_ANN
            # Column-wise concatenation instead of a row-by-row iterrows() loop: it is faster
            # and is not exposed to the per-row dtype upcasting that iterrows() performs
            df_test_preds['pos_pred'] = (
                df_test_preds['start'].astype(str) + ' ' + df_test_preds['end'].astype(str)
            )
            # NOTE(review): the same fixed code is assigned to every prediction; presumably a
            # placeholder so that only the spans are scored -- confirm
            df_test_preds['code'] = 'n23' if TYPE_ANN == 'DIAGNOSTICO' else 'bn20'
            df_test_preds = df_test_preds[['clinical_case', 'pos_pred', 'label_pred', 'code']]
            # Precision and recall are discarded; only the third returned value is kept
            _, _, f1 = calculate_codiesp_ner_metrics(
                df_gs=df_gs[df_gs['label_gs'] == TYPE_ANN],
                df_pred=format_codiesp_x_pred_df(
                    df_run=df_test_preds,
                    valid_codes=valid_codes
                )
            )
            arr_values.append(f1)
    return arr_values

In [None]:
# NER

In [11]:
arr_val = model_f1_values(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }
)



In [5]:
pd.DataFrame(arr_val).to_csv(RES_DIR + "ner_f1_exec_" + TYPE_TASK + "_hier_task.csv", index=False, header=False, sep = '\t')

## Ensemble

In [13]:
# Load texts from test corpus
test_path = corpus_path + "test/text_files/"
# Sorted so that the row order of df_text_test does not depend on the arbitrary
# order in which os.listdir returns the directory entries
test_files = sorted(
    f for f in os.listdir(test_path)
    if os.path.isfile(test_path + f) and os.path.splitext(f)[1] == '.txt'
)
test_data = load_text_files(test_files, test_path)
# splitext strips only the final extension; split('.txt')[0] would cut the name at the
# first ".txt" occurring anywhere in it
df_text_test = pd.DataFrame({'doc_id': [os.path.splitext(s)[0] for s in test_files], 'raw_text': test_data})

# Load test doc list
test_doc_list = sorted(set(df_text_test["doc_id"]))

# Maps the integer class predicted for each word to its IOB tag
iob_lab_decoder = {0: "B", 1: "I", 2: "O"}

# Name of the column of df_text_test that holds the document text
text_col = "raw_text"

subtask = 'norm'
subtask_ann = subtask + '-iob_cont_disc'

# Strategy name handed to the ensemble helper as `ens_eval_strat`
ENS_EVAL_STRAT = 'sum'
RES_DIR_ENS = RES_DIR + "ensemble/"

# Every model was run with the same five executions (the list object is shared, not copied)
arr_exec = [1, 2, 3, 4, 5]
model_exec = {
    'mbert': arr_exec,
    'mbert_galen': arr_exec,
    'beto': arr_exec,
    'beto_galen': arr_exec,
    'xlmr': arr_exec,
    'xlmr_galen': arr_exec
}

In [14]:
import pickle

def ens_performance(dict_names_execs, df_gs=df_test_gs, 
                        doc_list=test_doc_list, lab_decoder_list=[iob_lab_decoder],
                        df_text=df_text_test, text_col=text_col, 
                        ens_eval_strategy=ENS_EVAL_STRAT):
    """
    Evaluate the ensemble formed by all the given executions of the given models.

    dict_names_execs: each key is a string with the model name, and
                      each value is a list with the execs of the corresponding model.
    df_gs: gold-standard frame; only the rows whose 'label_gs' equals TYPE_ANN are evaluated.
    doc_list, lab_decoder_list, df_text, text_col: forwarded unchanged to
                      `ens_ner_preds_brat_format`.
    ens_eval_strategy: forwarded to `ens_ner_preds_brat_format` as `ens_eval_strat`.

    All defaults are the values the corresponding globals had when the function was defined.

    Returns a tuple (df_ens_ann, metrics): the annotations frame produced by
    `ens_ner_preds_brat_format` and the value returned by `calculate_codiesp_ner_metrics`.
    """
    ens_preds, ens_start_end = [], []
    for model_name in dict_names_execs:
        # Word start-end pairs (one file per model, shared by all of its execs)
        # NOTE: pickle.load can execute arbitrary code; only load files written by trusted runs
        with open(RES_DIR_ENS + "test_word_start_end_" + TYPE_TASK + "_hier_task_iob_" + str(model_name) + ".pck", "rb") as f:
            word_start_end = pickle.load(f)
        for i_exec in dict_names_execs[model_name]:
            # Word predictions
            with open(RES_DIR_ENS + "test_word_preds_" + TYPE_TASK + "_hier_task_iob_" + str(model_name) +
                      "_" + str(i_exec) + ".pck", "rb") as f:
                ens_preds.append(pickle.load(f))

            # Keep both lists aligned: one start-end entry per loaded prediction
            ens_start_end.append(word_start_end)

    df_ens_ann = ens_ner_preds_brat_format(
        doc_list=doc_list,
        ens_doc_word_preds=ens_preds,
        ens_doc_word_start_end=ens_start_end,
        lab_decoder_list=lab_decoder_list,
        df_text=df_text,
        text_col=text_col,
        ens_eval_strat=ens_eval_strategy,
        subtask=subtask_ann
    )
    # Adapt to CodiEsp format; assign() returns a new frame, so df_ens_ann is returned untouched
    df_ens_ann_format = df_ens_ann.assign(
        label_pred=TYPE_ANN,
        # Column-wise concatenation instead of a row-by-row iterrows() loop
        pos_pred=df_ens_ann['start'].astype(str) + ' ' + df_ens_ann['end'].astype(str),
        # NOTE(review): the same fixed code is assigned to every prediction; presumably a
        # placeholder so that only the spans are scored -- confirm
        code='n23' if TYPE_ANN == 'DIAGNOSTICO' else 'bn20'
    )[['clinical_case', 'pos_pred', 'label_pred', 'code']]

    return df_ens_ann, calculate_codiesp_ner_metrics(
        df_gs=df_gs[df_gs['label_gs'] == TYPE_ANN],
        df_pred=format_codiesp_x_pred_df(
            df_run=df_ens_ann_format,
            valid_codes=valid_codes
        )
    )

In [15]:
dic_ens_res = {}

### BETO

In [16]:
arr_model_name = ['beto']

In [17]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8096, 0.6395, 0.7146)


In [18]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [7]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### BETO-Galén

In [19]:
arr_model_name = ['beto_galen']

In [20]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.7954, 0.6533, 0.7174)


In [21]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [10]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### mBERT

In [22]:
arr_model_name = ['mbert']

In [23]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8067, 0.6357, 0.7111)


In [24]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [13]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### mBERT-Galén

In [25]:
arr_model_name = ['mbert_galen']

In [26]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8007, 0.6519, 0.7187)


In [27]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [16]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### XLM-R

In [28]:
arr_model_name = ['xlmr']

In [29]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.7933, 0.6428, 0.7102)


In [30]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [19]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### XLM-R-Galén

In [31]:
arr_model_name = ['xlmr_galen']

In [32]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.7979, 0.6558, 0.7199)


In [33]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [22]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### BETO + BETO-Galén

In [34]:
arr_model_name = ['beto', 'beto_galen']

In [35]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8155, 0.6538, 0.7258)


In [36]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [25]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### mBERT + mBERT-Galén

In [37]:
arr_model_name = ['mbert', 'mbert_galen']

In [38]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8215, 0.6492, 0.7252)


In [39]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [28]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### XLM-R + XLM-R-Galén

In [40]:
arr_model_name = ['xlmr', 'xlmr_galen']

In [41]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8145, 0.6558, 0.7266)


In [42]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [31]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### BETO + mBERT + XLM-R

In [43]:
arr_model_name = ['beto', 'mbert', 'xlmr']

In [44]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8187, 0.6442, 0.721)


In [45]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [34]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### BETO-Galén + mBERT-Galén + XLM-R-Galén

In [46]:
arr_model_name = ['beto_galen', 'mbert_galen', 'xlmr_galen']

In [47]:
df_ens_ann, res_metrics = ens_performance(dict_names_execs={x: model_exec[x] for x in arr_model_name})
print(res_metrics)



(0.8175, 0.6637, 0.7326)


In [48]:
dic_ens_res['_'.join(arr_model_name)] = res_metrics

In [37]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')