# Evaluation

In [1]:
# Relative paths to the auxiliary code and to the corpus
utils_path = "../utils/"
corpus_path = "../datasets/final_dataset_v4_to_publish/"
# Gold-standard file of the test split, located inside the corpus directory
test_gs_path = f"{corpus_path}test/testX.tsv"

In [2]:
import tensorflow as tf

# Auxiliary components
import sys
sys.path.insert(0, utils_path)
from nlp_utils import *

# Directory from which the predictions files of the model executions are read
RES_DIR = "../results/CodiEsp/final_exec/"

# Annotation type to evaluate; the task identifier is its first letter, lower-cased
TYPE_ANN = "DIAGNOSTICO"
TYPE_TASK = TYPE_ANN[:1].lower()

# GS data
df_test_gs = format_codiesp_x_gs(test_gs_path)

# Valid codes: first column of the codes file of the selected task, lower-cased
codes_d_path = corpus_path + "codiesp_codes/codiesp-" + TYPE_TASK.upper() + "_codes.tsv"
valid_codes = {
    code.lower()
    for code in pd.read_csv(codes_d_path, sep='\t', header=None, usecols=[0])[0].tolist()
}

2022-09-15 08:30:28.812239: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Evaluation

In [3]:
def check_ner_norm_performance(model_name, arr_execs):
    """
    Sanity check: print the NORM metrics obtained by every execution of a single model.

    model_name: name of the model, as it appears in the predictions file names.
    arr_execs: iterable with the identifiers of the executions to check.

    For each execution, the predictions file is read from RES_DIR and evaluated against 
    the rows of the test gold standard (df_test_gs) whose 'label_gs' equals TYPE_ANN.
    """
    # Every execution of the model shares this part of the file path
    file_prefix = RES_DIR + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + str(model_name) + "_"
    for exec_id in arr_execs:
        print("Exec " + str(exec_id) + ":")
        df_run_preds = pd.read_csv(file_prefix + str(exec_id) + ".csv", header=0, sep='\t')
        df_gs_type = df_test_gs[df_test_gs['label_gs'] == TYPE_ANN]
        df_pred_fmt = format_codiesp_x_pred_df(df_run=df_run_preds, valid_codes=valid_codes)
        run_metrics = calculate_codiesp_x_metrics(df_gs=df_gs_type, df_pred=df_pred_fmt)
        print("NORM performance:", run_metrics, end="\n\n")

In [4]:
def model_performance(dict_names_execs, 
                      df_gs=df_test_gs,
                      round_n=3, multi_task=False):
    """
    Generate a pd.DataFrame with the statistics of the performance of each model.
    
    dict_names_execs: each key is a string with the model name, and 
                      each value is a list with the execs of the corresponding model.
    df_gs: gold-standard DataFrame; only the rows whose 'label_gs' equals TYPE_ANN are evaluated.
    round_n: number of decimals kept in the reported statistics.
    multi_task: if True, the predictions files named 
                "df_test_preds_multi_task_ner_<exec>_..." are read instead of the regular ones.
    
    Returns a DataFrame with one row per model and the columns 
    P_avg, P_std, P_max, R_avg, R_std, R_max, F1_avg, F1_std, F1_max.
    The std is the one reported by pd.Series.describe(), so it is NaN for a model with 
    a single execution; a model with an empty list of executions gets NaN in every column.
    """
    metric_names = ["P", "R", "F1"]
    # Reported statistic suffix -> entry of pd.Series.describe()
    stat_names = {"avg": "mean", "std": "std", "max": "max"}
    res_dict = {}
    for model_name in dict_names_execs:
        scores = {metric: [] for metric in metric_names}
        for i_exec in dict_names_execs[model_name]:
            # Both settings share the tail of the file name; multi-task only adds a prefix
            file_name = TYPE_TASK + "_hier_task_cls_" + str(model_name) + "_" + str(i_exec) + ".csv"
            if multi_task:
                file_name = "multi_task_ner_" + str(i_exec) + "_" + file_name
            df_test_preds = pd.read_csv(RES_DIR + "df_test_preds_" + file_name, header=0, sep='\t')
            p, r, f1 = calculate_codiesp_x_metrics(
                df_gs=df_gs[df_gs['label_gs'] == TYPE_ANN], 
                df_pred=format_codiesp_x_pred_df(
                    df_run=df_test_preds,
                    valid_codes=valid_codes
                )
            )
            for metric, value in zip(metric_names, (p, r, f1)):
                scores[metric].append(value)
        model_stats = {}
        for metric in metric_names:
            # Explicit float dtype: depending on the pandas version, an empty list would 
            # otherwise produce an object Series, whose describe() has no 'mean'/'std'/'max'
            metric_stat = pd.Series(scores[metric], dtype=float).describe()
            for stat_label, stat_key in stat_names.items():
                model_stats[metric + "_" + stat_label] = round(metric_stat[stat_key], round_n)
        res_dict[model_name] = model_stats
    col_names = [metric + "_" + stat_label for metric in metric_names for stat_label in stat_names]
    return pd.DataFrame(res_dict, index=col_names).transpose()

In [5]:
def format_df_paper(df_res, round_n=3):
    """
    Format the statistics produced by model_performance as strings.

    df_res: DataFrame with the columns <M>_avg, <M>_std and <M>_max for M in P, R, F1.
            It is not modified.
    round_n: number of decimals shown for every value.

    Returns a new DataFrame, with the same index as df_res, and the columns 
    <M>_avg_std ("avg ± std") and <M>_max for M in P, R, F1, in that order.
    Every value is written with exactly round_n decimals, and the leading zero 
    of values below 1 is dropped (e.g. 0.01 -> ".010").
    """
    def _fmt(value):
        # Fixed-precision formatting keeps trailing zeros, which str() drops (0.01 -> "0.01")
        text = "{:.{prec}f}".format(float(value), prec=round_n)
        # Only strip the zero of the integer part; values such as 1.0 or NaN are kept whole
        return text[1:] if text.startswith("0.") else text

    arr_metrics = ["P", "R", "F1"]
    # Build a fresh frame so that the caller's DataFrame is left untouched
    df_paper = pd.DataFrame(index=df_res.index)
    for metric in arr_metrics:
        df_paper[metric + '_avg_std'] = [
            _fmt(avg) + " ± " + _fmt(std) 
            for avg, std in zip(df_res[metric + '_avg'], df_res[metric + '_std'])
        ]
        df_paper[metric + '_max'] = [_fmt(max_val) for max_val in df_res[metric + '_max']]
    return df_paper

In [5]:
# Sanity check

In [6]:
m_name = "xlmr"
execs = [1, 2, 3, 4, 5]

check_ner_norm_performance(model_name=m_name, arr_execs=execs)

Exec 1:
NORM performance: (0.6685, 0.5662, 0.6131)

Exec 2:
NORM performance: (0.6641, 0.5676, 0.612)

Exec 3:




NORM performance: (0.6912, 0.5654, 0.622)

Exec 4:
NORM performance: (0.6881, 0.5598, 0.6174)

Exec 5:
NORM performance: (0.6795, 0.5609, 0.6145)





In [7]:
m_name = "mbert_galen"
execs = [1, 2, 3, 4, 5]

check_ner_norm_performance(model_name=m_name, arr_execs=execs)

Exec 1:
NORM performance: (0.688, 0.5781, 0.6283)

Exec 2:
NORM performance: (0.6901, 0.576, 0.6279)

Exec 3:




NORM performance: (0.69, 0.5679, 0.623)

Exec 4:
NORM performance: (0.7036, 0.5764, 0.6337)

Exec 5:
NORM performance: (0.6903, 0.5693, 0.624)





## Paper

In [8]:
model_performance(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }
)



Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
beto,0.694,0.005,0.702,0.559,0.005,0.565,0.619,0.001,0.622
beto_galen,0.684,0.009,0.695,0.576,0.004,0.578,0.625,0.004,0.631
mbert,0.694,0.007,0.703,0.564,0.003,0.568,0.622,0.003,0.627
mbert_galen,0.692,0.006,0.704,0.574,0.005,0.578,0.627,0.004,0.634
xlmr,0.678,0.012,0.691,0.564,0.003,0.568,0.616,0.004,0.622
xlmr_galen,0.686,0.01,0.695,0.575,0.004,0.582,0.626,0.004,0.629


In [9]:
format_df_paper(
    model_performance(
        {
            'beto': [1, 2, 3, 4, 5], 
            'beto_galen': [1, 2, 3, 4, 5],
            'mbert': [1, 2, 3, 4, 5], 
            'mbert_galen': [1, 2, 3, 4, 5],
            'xlmr': [1, 2, 3, 4, 5], 
            'xlmr_galen': [1, 2, 3, 4, 5]
        }
    )
)



Unnamed: 0,P_avg_std,P_max,R_avg_std,R_max,F1_avg_std,F1_max
beto,.694 ± .005,0.702,.559 ± .005,0.565,.619 ± .001,0.622
beto_galen,.684 ± .009,0.695,.576 ± .004,0.578,.625 ± .004,0.631
mbert,.694 ± .007,0.703,.564 ± .003,0.568,.622 ± .003,0.627
mbert_galen,.692 ± .006,0.704,.574 ± .005,0.578,.627 ± .004,0.634
xlmr,.678 ± .012,0.691,.564 ± .003,0.568,.616 ± .004,0.622
xlmr_galen,.686 ± .01,0.695,.575 ± .004,0.582,.626 ± .004,0.629


Save the (F1) performance of all executions of all models

In [10]:
def model_f1_values(dict_names_execs, df_gs=df_test_gs):
    """
    Collect the F1 value of every execution of every model in a single flat list.
    
    dict_names_execs: each key is a string with the model name, and 
                      each value is a list with the random execs of the corresponding model.
    df_gs: gold-standard DataFrame; only the rows whose 'label_gs' equals TYPE_ANN are evaluated.
    
    The values follow the iteration order of dict_names_execs and, within each model, 
    the order of its list of execs.
    """
    # Flatten the (model, exec) pairs first, keeping the given order
    runs = [(name, exec_id) for name in dict_names_execs for exec_id in dict_names_execs[name]]
    f1_scores = []
    for name, exec_id in runs:
        preds_path = RES_DIR + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + \
            str(name) + "_" + str(exec_id) + ".csv"
        df_run_preds = pd.read_csv(preds_path, header=0, sep='\t')
        df_gs_type = df_gs[df_gs['label_gs'] == TYPE_ANN]
        df_pred_fmt = format_codiesp_x_pred_df(df_run=df_run_preds, valid_codes=valid_codes)
        _p, _r, f1_score = calculate_codiesp_x_metrics(df_gs=df_gs_type, df_pred=df_pred_fmt)
        f1_scores.append(f1_score)
    return f1_scores

In [None]:
# NORM

In [11]:
arr_val = model_f1_values(
    {
        'beto': [1, 2, 3, 4, 5], 
        'beto_galen': [1, 2, 3, 4, 5],
        'mbert': [1, 2, 3, 4, 5], 
        'mbert_galen': [1, 2, 3, 4, 5],
        'xlmr': [1, 2, 3, 4, 5], 
        'xlmr_galen': [1, 2, 3, 4, 5]
    }
)



In [5]:
pd.DataFrame(arr_val).to_csv(RES_DIR + "norm_f1_exec_" + TYPE_TASK + "_hier_task.csv", index=False, header=False, sep = '\t')

## Ensemble

In [12]:
ss_corpus_path = "../datasets/CodiEsp-SSplit-text/"

### Load data

In [13]:
# Load the text of every "txt" file found in the train directory
train_path = corpus_path + "train/text_files/"
train_files = [
    file_name 
    for file_name in os.listdir(train_path) 
    if os.path.isfile(train_path + file_name) and file_name.rsplit('.', 1)[-1] == "txt"
]
train_data = load_text_files(train_files, train_path)
# The document id is the part of the file name that precedes ".txt"
df_text_train = pd.DataFrame({
    'doc_id': [file_name.partition('.txt')[0] for file_name in train_files], 
    'raw_text': train_data
})

In [14]:
# Load the text of every "txt" file found in the dev directory
dev_path = corpus_path + "dev/text_files/"
dev_files = [
    file_name 
    for file_name in os.listdir(dev_path) 
    if os.path.isfile(dev_path + file_name) and file_name.rsplit('.', 1)[-1] == "txt"
]
dev_data = load_text_files(dev_files, dev_path)
# The document id is the part of the file name that precedes ".txt"
df_text_dev = pd.DataFrame({
    'doc_id': [file_name.partition('.txt')[0] for file_name in dev_files], 
    'raw_text': dev_data
})

In [15]:
# Load the text of every "txt" file found in the test directory
test_path = corpus_path + "test/text_files/"
test_files = [
    file_name 
    for file_name in os.listdir(test_path) 
    if os.path.isfile(test_path + file_name) and file_name.rsplit('.', 1)[-1] == 'txt'
]
test_data = load_text_files(test_files, test_path)
# The document id is the part of the file name that precedes ".txt"
df_text_test = pd.DataFrame({
    'doc_id': [file_name.partition('.txt')[0] for file_name in test_files], 
    'raw_text': test_data
})

In [16]:
df_codes_train_ner = pd.read_table(corpus_path + "train/trainX.tsv", sep='\t', header=None)
df_codes_train_ner.columns = ["doc_id", "type", "code", "word", "location"]
df_codes_train_ner = df_codes_train_ner[~df_codes_train_ner[['doc_id', 'type', 'location']].duplicated(keep='first')]
df_codes_train_ner['disc'] = df_codes_train_ner['location'].apply(lambda x: ';' in x)

Select one type of annotations:

In [17]:
df_codes_train_ner = df_codes_train_ner[df_codes_train_ner['type'] == TYPE_ANN]

Split discontinuous annotations:

In [18]:
df_codes_train_ner_final = process_labels_norm_prueba(df_ann=df_codes_train_ner[["doc_id", "type", "code", "word", "location"]])

Remove annotations of zero length:

In [19]:
# Length of each annotation (end - start), computed column-wise: 
# same values as the row-wise apply, without a Python-level call per row
df_codes_train_ner_final['length'] = df_codes_train_ner_final['end'] - df_codes_train_ner_final['start']
# Keep only the annotations with a strictly positive length
df_codes_train_ner_final = df_codes_train_ner_final[df_codes_train_ner_final['length'] > 0]

Separate continuous and discontinuous annotations:

In [20]:
# Continuous
df_codes_train_ner_final_cont = df_codes_train_ner_final[df_codes_train_ner_final['disc'] == 0].copy()
df_codes_train_ner_final_cont['disc'] = df_codes_train_ner_final_cont['disc'].astype(bool)

In [21]:
# Discontinuous
df_codes_train_ner_final_disc = restore_disc_ann(df_ann=df_codes_train_ner[df_codes_train_ner['disc']], 
                    df_ann_final=df_codes_train_ner_final[df_codes_train_ner_final['disc'] == 1])

In [22]:
df_codes_train_ner_final_disc['start'] = df_codes_train_ner_final_disc['location'].apply(lambda x: int(x.split(' ')[0]))
df_codes_train_ner_final_disc['end'] = df_codes_train_ner_final_disc['location'].apply(lambda x: int(x.split(' ')[-1]))

Concatenate continuous and discontinuous annotations:

In [23]:
# Concat
cols_concat = ['doc_id', 'type', 'code', 'word', 'location', 'start', 'end', 'disc']
df_codes_train_ner_final = pd.concat([df_codes_train_ner_final_cont[cols_concat], 
                                      df_codes_train_ner_final_disc[cols_concat]])

Now, we remove the right-to-left (text-wise) discontinuous annotations:

In [24]:
df_codes_train_ner_final['direction'] = df_codes_train_ner_final.apply(check_ann_left_right_direction, axis=1)

In [25]:
df_codes_train_ner_final = df_codes_train_ner_final[df_codes_train_ner_final['direction']]

We only select the annotations fully contained in a single sentence:

In [26]:
# Sentence-Split data
ss_sub_corpus_path = ss_corpus_path + "train/"
ss_files = [f for f in os.listdir(ss_sub_corpus_path) if os.path.isfile(ss_sub_corpus_path + f)]
ss_dict_train = load_ss_files(ss_files, ss_sub_corpus_path)

In [27]:
df_mult_sent_train, df_one_sent_train, df_no_sent_train = check_ann_span_sent(df_ann=df_codes_train_ner_final, 
                                                                             ss_dict=ss_dict_train)

In [28]:
df_codes_train_ner_final = df_one_sent_train.copy()

In [29]:
print(df_codes_train_ner_final.disc.value_counts())

False    6134
True      723
Name: disc, dtype: int64


In [30]:
df_codes_train_ner_final.sort_values(['doc_id', 'start', 'end'], inplace=True)

In [22]:
# Code splitting

In [31]:
# Split every code into a prefix ("code_pre") and a suffix ("code_suf", None when there is none)
if TYPE_TASK != 'd':
    # Positional split: first 4 characters, then characters 4 to 6 when the code has at least 7
    df_codes_train_ner_final["code_pre"] = df_codes_train_ner_final["code"].apply(lambda code: code[:4])
    df_codes_train_ner_final["code_suf"] = df_codes_train_ner_final["code"].apply(
        lambda code: code[4:7] if len(code) >= 7 else None
    )
else:
    # Split on '.': the text before the first dot, and the text between the first and second dot
    df_codes_train_ner_final["code_pre"] = df_codes_train_ner_final["code"].apply(
        lambda code: code.split('.')[0]
    )
    df_codes_train_ner_final["code_suf"] = df_codes_train_ner_final["code"].apply(
        lambda code: code.split('.')[1] if '.' in code else None
    )

In [32]:
df_codes_dev_ner = pd.read_table(corpus_path + "dev/devX.tsv", sep='\t', header=None)
df_codes_dev_ner.columns = ["doc_id", "type", "code", "word", "location"]
df_codes_dev_ner = df_codes_dev_ner[~df_codes_dev_ner[['doc_id', 'type', 'location']].duplicated(keep='first')]
df_codes_dev_ner['disc'] = df_codes_dev_ner['location'].apply(lambda x: ';' in x)

Select one type of annotations:

In [33]:
df_codes_dev_ner = df_codes_dev_ner[df_codes_dev_ner['type'] == TYPE_ANN]

Split discontinuous annotations:

In [34]:
df_codes_dev_ner_final = process_labels_norm_prueba(df_ann=df_codes_dev_ner[["doc_id", "type", "code", "word", "location"]])

Remove annotations of zero length:

In [35]:
# Length of each annotation (end - start), computed column-wise: 
# same values as the row-wise apply, without a Python-level call per row
df_codes_dev_ner_final['length'] = df_codes_dev_ner_final['end'] - df_codes_dev_ner_final['start']
# Keep only the annotations with a strictly positive length
df_codes_dev_ner_final = df_codes_dev_ner_final[df_codes_dev_ner_final['length'] > 0]

Separate continuous and discontinuous annotations:

In [36]:
# Continuous
df_codes_dev_ner_final_cont = df_codes_dev_ner_final[df_codes_dev_ner_final['disc'] == 0].copy()
df_codes_dev_ner_final_cont['disc'] = df_codes_dev_ner_final_cont['disc'].astype(bool)

In [37]:
# Discontinuous
df_codes_dev_ner_final_disc = restore_disc_ann(df_ann=df_codes_dev_ner[df_codes_dev_ner['disc']], 
                    df_ann_final=df_codes_dev_ner_final[df_codes_dev_ner_final['disc'] == 1])

In [38]:
df_codes_dev_ner_final_disc['start'] = df_codes_dev_ner_final_disc['location'].apply(lambda x: int(x.split(' ')[0]))
df_codes_dev_ner_final_disc['end'] = df_codes_dev_ner_final_disc['location'].apply(lambda x: int(x.split(' ')[-1]))

Concatenate continuous and discontinuous annotations:

In [39]:
# Concat
cols_concat = ['doc_id', 'type', 'code', 'word', 'location', 'start', 'end', 'disc']
df_codes_dev_ner_final = pd.concat([df_codes_dev_ner_final_cont[cols_concat], 
                                      df_codes_dev_ner_final_disc[cols_concat]])

Now, we remove the right-to-left (text-wise) discontinuous annotations:

In [40]:
df_codes_dev_ner_final['direction'] = df_codes_dev_ner_final.apply(check_ann_left_right_direction, axis=1)

In [41]:
df_codes_dev_ner_final = df_codes_dev_ner_final[df_codes_dev_ner_final['direction']]

We only select the annotations fully contained in a single sentence:

In [42]:
# Sentence-Split data
ss_sub_corpus_path = ss_corpus_path + "dev/"
ss_files = [f for f in os.listdir(ss_sub_corpus_path) if os.path.isfile(ss_sub_corpus_path + f)]
ss_dict_dev = load_ss_files(ss_files, ss_sub_corpus_path)

In [43]:
df_mult_sent_dev, df_one_sent_dev, df_no_sent_dev = check_ann_span_sent(df_ann=df_codes_dev_ner_final, 
                                                                             ss_dict=ss_dict_dev)

In [44]:
df_codes_dev_ner_final = df_one_sent_dev.copy()

In [45]:
print(df_codes_dev_ner_final.disc.value_counts())

False    2910
True      362
Name: disc, dtype: int64


In [46]:
df_codes_dev_ner_final.sort_values(['doc_id', 'start', 'end'], inplace=True)

In [39]:
# Code splitting

In [47]:
# Split every code into a prefix ("code_pre") and a suffix ("code_suf", None when there is none)
if TYPE_TASK != 'd':
    # Positional split: first 4 characters, then characters 4 to 6 when the code has at least 7
    df_codes_dev_ner_final["code_pre"] = df_codes_dev_ner_final["code"].apply(lambda code: code[:4])
    df_codes_dev_ner_final["code_suf"] = df_codes_dev_ner_final["code"].apply(
        lambda code: code[4:7] if len(code) >= 7 else None
    )
else:
    # Split on '.': the text before the first dot, and the text between the first and second dot
    df_codes_dev_ner_final["code_pre"] = df_codes_dev_ner_final["code"].apply(
        lambda code: code.split('.')[0]
    )
    df_codes_dev_ner_final["code_suf"] = df_codes_dev_ner_final["code"].apply(
        lambda code: code.split('.')[1] if '.' in code else None
    )

In [48]:
# Sorted list of the distinct code prefixes found in either the train or the dev annotations
train_dev_codes_pre = sorted(
    set(df_codes_train_ner_final["code_pre"].values) | set(df_codes_dev_ner_final["code_pre"].values)
)

In [49]:
len(train_dev_codes_pre)

901

In [50]:
# Sorted list of the distinct code suffixes found in either the dev or the train annotations, 
# ignoring the codes that have no suffix (None)
train_dev_codes_suf = sorted({
    code_suf
    for df_split in (df_codes_dev_ner_final, df_codes_train_ner_final)
    for code_suf in df_split["code_suf"].values
    if code_suf is not None
})

In [51]:
len(train_dev_codes_suf)

305

In [52]:
# Create IOB-2 and Clinical-Coding label encoders as dict (more computationally efficient)
iob_lab_decoder = {0: "B", 1: "I", 2: "O"}
iob_lab_encoder = {label: idx for idx, label in iob_lab_decoder.items()}

# Code-pre: one index per prefix, in list order, followed by a final "O" label
code_pre_lab_decoder = dict(enumerate(list(train_dev_codes_pre) + ["O"]))
code_pre_lab_encoder = {label: idx for idx, label in code_pre_lab_decoder.items()}

# Code-suf: one index per suffix, in list order, followed by a final "O" label, 
# since some codes do not have suffix
code_suf_lab_decoder = dict(enumerate(list(train_dev_codes_suf) + ["O"]))
code_suf_lab_encoder = {label: idx for idx, label in code_suf_lab_decoder.items()}

In [53]:
print(len(iob_lab_encoder), len(iob_lab_decoder))

3 3


In [54]:
print(len(code_pre_lab_encoder), len(code_pre_lab_decoder))

902 902


In [55]:
print(len(code_suf_lab_encoder), len(code_suf_lab_decoder))

306 306


### Evaluation

In [56]:
# How ens_performance combines the predictions: 'sum' adds them, any other value multiplies them
ENS_EVAL_STRAT = 'sum'
# Sub-directory of RES_DIR used for the ensemble files
RES_DIR_ENS = f"{RES_DIR}ensemble/"

subtask = 'norm'
subtask_ann = f"{subtask}-iob_code_suf"

# Separator placed between code prefix and suffix: a dot only for DIAGNOSTICO annotations
CODE_SEP = '' if TYPE_ANN != 'DIAGNOSTICO' else '.'

# Executions considered for every model
arr_exec = list(range(1, 6))

In [57]:
def ens_performance(ens_name, ner_model_name, arr_model_name, 
                    res_dir=RES_DIR_ENS, prefix_name='', arr_exec=arr_exec, 
                    df_gs=df_test_gs, 
                    code_pre_lab_decoder=code_pre_lab_decoder, 
                    code_suf_lab_decoder=code_suf_lab_decoder, 
                    ens_eval_strategy=ENS_EVAL_STRAT, 
                    subtask_ann=subtask_ann):
    """
    Evaluate an ensemble built from the stored code-pre and code-suf predictions of several executions.

    ens_name: ensemble name, as it appears in the stored file names.
    ner_model_name: model whose NER annotations file ("..._ann.csv") is loaded.
    arr_model_name: models whose executions are combined.
    res_dir: directory containing the annotations file and the .npy predictions.
    prefix_name: prefix of the .npy file names (it is not applied to the annotations file).
    arr_exec: executions loaded for every model.
    df_gs: gold-standard DataFrame; only the rows whose 'label_gs' equals TYPE_ANN are evaluated.
    code_pre_lab_decoder, code_suf_lab_decoder: decoders given to cls_code_norm_preds_brat_format.
    ens_eval_strategy: 'sum' adds the loaded arrays element-wise; any other value multiplies them.
    subtask_ann: subtask identifier given to cls_code_norm_preds_brat_format.

    Returns a tuple with the ensemble predictions DataFrame (columns clinical_case, pos_pred, 
    label_pred, code) and the value returned by calculate_codiesp_x_metrics.
    """
    ner_ann_path = res_dir + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + \
        ner_model_name + "_ann.csv"
    df_ens_ner = pd.read_csv(ner_ann_path, header=0, sep='\t')

    # Load the code-pre and code-suf predictions of every (model, execution) pair
    pre_runs, suf_runs = [], []
    for name in arr_model_name:
        for exec_id in arr_exec:
            # Part of the file name shared by the code-pre and code-suf arrays of this run
            run_suffix = ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + name + "_" + str(exec_id) + ".npy"
            pre_runs.append(np.load(file=res_dir + prefix_name + "test_preds_code_pre_ens_ner_" + run_suffix))
            suf_runs.append(np.load(file=res_dir + prefix_name + "test_preds_code_suf_ens_ner_" + run_suffix))

    # Sanity check: all preds array have the same shape
    assert len(pre_runs) == len(suf_runs)
    for pre_a, pre_b, suf_a, suf_b in zip(pre_runs, pre_runs[1:], suf_runs, suf_runs[1:]):
        assert pre_a.shape == pre_b.shape
        assert suf_a.shape == suf_b.shape
    
    print("Nº executions:", len(pre_runs))

    # 'sum' adds the arrays of all runs; any other strategy falls back to their product
    combine = np.sum if ens_eval_strategy == 'sum' else np.prod
    ens_code_pre = combine(pre_runs, axis=0)
    ens_code_suf = combine(suf_runs, axis=0)

    df_ens_preds = cls_code_norm_preds_brat_format(
        y_pred_cls=[ens_code_pre, ens_code_suf], 
        df_pred_ner=df_ens_ner, 
        code_decoder_list=[code_pre_lab_decoder, code_suf_lab_decoder],
        subtask=subtask_ann,
        code_sep=CODE_SEP,
        codes_pre_o_mask=None,
        codes_pre_suf_mask=None
    )
    
    # Adapt to CodiEsp format
    df_ens_preds['label_pred'] = TYPE_ANN
    df_ens_preds['pos_pred'] = [
        str(ann['start']) + ' ' + str(ann['end']) for _, ann in df_ens_preds.iterrows()
    ]
    df_ens_preds = df_ens_preds.rename(columns={'code_pred': 'code'})
    df_ens_preds = df_ens_preds[['clinical_case', 'pos_pred', 'label_pred', 'code']]

    df_gs_type = df_gs[df_gs['label_gs'] == TYPE_ANN]
    df_pred_fmt = format_codiesp_x_pred_df(df_run=df_ens_preds, valid_codes=valid_codes)
    return df_ens_preds, calculate_codiesp_x_metrics(df_gs=df_gs_type, df_pred=df_pred_fmt)

In [58]:
dic_ens_res = {}

### BETO

In [59]:
ens_name = 'beto'
ner_model_name = ens_name
arr_model_name = [ens_name]

In [60]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 5
(0.7185, 0.5711, 0.6363)




In [61]:
dic_ens_res[ens_name] = res_metrics

In [65]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### BETO-Galén

In [62]:
ens_name = 'beto_galen'
ner_model_name = 'beto_galen'
arr_model_name = [ens_name]

In [63]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 5
(0.7051, 0.5848, 0.6394)




In [64]:
dic_ens_res[ens_name] = res_metrics

In [68]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### mBERT

In [65]:
ens_name = 'mbert'
ner_model_name = 'mbert'
arr_model_name = [ens_name]

In [66]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 5
(0.7219, 0.5735, 0.6392)




In [67]:
dic_ens_res[ens_name] = res_metrics

In [71]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### mBERT-Galén

In [68]:
ens_name = 'mbert_galen'
ner_model_name = 'mbert_galen'
arr_model_name = [ens_name]

In [69]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 5
(0.7136, 0.5795, 0.6396)




In [70]:
dic_ens_res[ens_name] = res_metrics

In [80]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### XLM-R

In [71]:
ens_name = 'xlmr'
ner_model_name = 'xlmr'
arr_model_name = [ens_name]

In [72]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 5
(0.7031, 0.5742, 0.6322)




In [73]:
dic_ens_res[ens_name] = res_metrics

In [77]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### XLM-R-Galén

In [74]:
ens_name = 'xlmr_galen'
ner_model_name = 'xlmr_galen'
arr_model_name = [ens_name]

In [75]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 5
(0.7047, 0.5862, 0.64)




In [76]:
dic_ens_res[ens_name] = res_metrics

In [83]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### BETO + BETO-Galén

In [77]:
arr_model_name = ['beto', 'beto_galen']
ens_name = '_'.join(arr_model_name)
ner_model_name = 'beto'

In [78]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 10
(0.7212, 0.5844, 0.6457)




In [79]:
dic_ens_res[ens_name] = res_metrics

In [86]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### mBERT + mBERT-Galén

In [80]:
arr_model_name = ['mbert', 'mbert_galen']
ens_name = '_'.join(arr_model_name)
ner_model_name = 'mbert'

In [81]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 10
(0.7276, 0.5788, 0.6447)




In [82]:
dic_ens_res[ens_name] = res_metrics

In [89]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

### XLM-R + XLM-R-Galén

In [83]:
arr_model_name = ['xlmr', 'xlmr_galen']
ens_name = '_'.join(arr_model_name)
ner_model_name = 'xlmr'

In [84]:
df_ens_ann, res_metrics = ens_performance(
    ens_name=ens_name, 
    ner_model_name=ner_model_name, 
    arr_model_name=arr_model_name
)
print(res_metrics)

Nº executions: 10
(0.7231, 0.5862, 0.6475)




In [85]:
dic_ens_res[ens_name] = res_metrics

In [92]:
df_ens_ann.to_csv(RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + \
                  ".csv", index=False, header=True, sep = '\t')

In [86]:
MULTI_ENS_PREF = "multi_model_"

In [87]:
def add_empty_ner_preds(ens_name, ner_model_name, arr_model_name, 
                        res_dir=RES_DIR_ENS, empty_value=0, 
                        prefix_name=MULTI_ENS_PREF, arr_execs=None):
    """
    Align the code-pre and code-suf predictions of every model with a reference set of NER predictions.

    Taking the NER predictions of `ner_model_name` as reference, rows filled with `empty_value` are inserted
    in the code-pre and code-suf prediction arrays of each model at the positions of the reference NER samples
    (identified by 'clinical_case' and 'location') that the model does not predict. The resulting arrays are
    saved under the original file name preceded by `prefix_name`; the arrays of a model that misses no
    sample are saved unchanged.

    ens_name: string with the name of the ensemble, as it appears in the file names.
    ner_model_name: string with the name of the model whose NER predictions are used as reference.
    arr_model_name: list with the names of the models whose predictions are aligned.
    res_dir: string with the directory the files are read from and written to.
    empty_value: value used to fill the inserted rows.
    prefix_name: string prepended to the file name of the saved arrays.
    arr_execs: list with the executions to process; if None, the notebook-level `arr_exec` is used
               (behaviour of the original implementation).
    """
    if arr_execs is None:
        # Backward-compatible fallback to the executions defined at notebook level
        arr_execs = arr_exec

    def _ner_keys(model_name):
        # One "clinical_case|location" key per row of the NER predictions of the given model
        df_ner = pd.read_csv(res_dir + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + \
                             model_name + "_ann.csv", header=0, sep='\t')
        return (df_ner['clinical_case'].map(str) + '|' + df_ner['location'].map(str)).tolist()

    def _preds_path(kind, model_name, i_exec, prefix=""):
        # Path of the code-prefix ("pre") or code-suffix ("suf") predictions of a single execution
        return res_dir + prefix + "test_preds_code_" + kind + "_ens_ner_" + ens_name + \
               "_" + TYPE_TASK + "_hier_task_cls_" + model_name + "_" + str(i_exec) + ".npy"

    # Load reference NER samples and record the position of the first row holding each key.
    # Matching on the key itself avoids splitting it back and comparing against raw column values,
    # which fails when a column is not parsed as string or contains the '|' separator.
    ref_keys = _ner_keys(ner_model_name)
    ref_first_pos = {}
    for pos, key in enumerate(ref_keys):
        ref_first_pos.setdefault(key, pos)

    for model_name in arr_model_name:
        # Load NER samples predicted by the current model
        model_keys = _ner_keys(model_name)
        assert len(model_keys) <= len(ref_keys)
        arr_i_insert = []
        if len(model_keys) < len(ref_keys):
            # Reference NER samples that are not predicted by the current model
            absent_samples = set(ref_keys) - set(model_keys)
            assert len(absent_samples) == len(ref_keys) - len(model_keys)
            # Sorted positions of the absent samples within the reference
            arr_i_absent = sorted(ref_first_pos[key] for key in absent_samples)
            assert len(arr_i_absent) > 0
            # np.insert interprets every index w.r.t. the array *before* any insertion, so the i-th
            # absent sample (in sorted order) is inserted at its reference position minus the i rows
            # that are inserted before it.
            arr_i_insert = [pos - i for i, pos in enumerate(arr_i_absent)]
        ## Insert empty values for the absent samples (if any)
        for i_exec in arr_execs:
            for kind in ("pre", "suf"):
                preds_code = np.load(file=_preds_path(kind, model_name, i_exec))
                preds_code = np.insert(arr=preds_code, obj=arr_i_insert, values=empty_value, axis=0)
                # Save predictions after insertion
                np.save(file=_preds_path(kind, model_name, i_exec, prefix=prefix_name), arr=preds_code)

### BETO + mBERT + XLM-R

In [88]:
# Members of the three-model ensemble and the name used for it in the result files
arr_model_name = ["beto", "mbert", "xlmr"]
ens_name = "beto_mbert_xlmr"

In [89]:
# IOB-level predictions of the ensemble
df_ens_iob = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + ens_name + ".csv",
    sep='\t',
    header=0,
)

In [90]:
# Display the (rows, columns) shape of the ensemble IOB predictions
df_ens_iob.shape

(2869, 5)

In [91]:
# NER annotations stored for the first member of the ensemble
df_ens_ner_beto = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + arr_model_name[0] + "_ann.csv",
    sep='\t',
    header=0,
)

In [92]:
# Display the (rows, columns) shape of the NER annotations of the first ensemble member
df_ens_ner_beto.shape

(2867, 5)

In [93]:
# Distinct "clinical_case|location" keys among the NER annotations of the first ensemble member
set_ens_ner_beto = set(
    df_ens_ner_beto.apply(
        lambda row: str(row['clinical_case']) + '|' + str(row['location']),
        axis=1,
    )
)

In [94]:
# Number of distinct keys; a value below the row count means some keys are repeated
len(set_ens_ner_beto)

2861

In [95]:
# NER annotations stored for the second member of the ensemble
df_ens_ner_mbert = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + arr_model_name[1] + "_ann.csv",
    sep='\t',
    header=0,
)

In [96]:
# Display the (rows, columns) shape of the NER annotations of the second ensemble member
df_ens_ner_mbert.shape

(2867, 5)

In [97]:
# Distinct "clinical_case|location" keys among the NER annotations of the second ensemble member
set_ens_ner_mbert = set(
    df_ens_ner_mbert.apply(
        lambda row: str(row['clinical_case']) + '|' + str(row['location']),
        axis=1,
    )
)

In [98]:
# Number of distinct keys; a value below the row count means some keys are repeated
len(set_ens_ner_mbert)

2861

In [99]:
# NER annotations stored for the third member of the ensemble
df_ens_ner_xlmr = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + arr_model_name[2] + "_ann.csv",
    sep='\t',
    header=0,
)

In [100]:
# Display the (rows, columns) shape of the NER annotations of the third ensemble member
df_ens_ner_xlmr.shape

(2866, 5)

In [101]:
# Distinct "clinical_case|location" keys among the NER annotations of the third ensemble member
set_ens_ner_xlmr = set(
    df_ens_ner_xlmr.apply(
        lambda row: str(row['clinical_case']) + '|' + str(row['location']),
        axis=1,
    )
)

In [102]:
# Number of distinct keys; a value below the row count means some keys are repeated
len(set_ens_ner_xlmr)

2860

In [103]:
# The first two ensemble members must cover exactly the same set of NER samples
assert set_ens_ner_beto == set_ens_ner_mbert

In [104]:
# Number of NER samples present for the third member but missing from the first one
len(set_ens_ner_xlmr - set_ens_ner_beto)

0

In [105]:
# Number of NER samples present for the first member but missing from the third one
len(set_ens_ner_beto - set_ens_ner_xlmr)

1

Both the BETO and mBERT models could be employed as the NER reference.

In [106]:
# Model whose NER predictions act as reference when padding the predictions of the other models
ner_model_name = "beto"

In [107]:
# Pad the predictions of every ensemble member so they align with the reference NER samples
add_empty_ner_preds(
    ens_name=ens_name, ner_model_name=ner_model_name, arr_model_name=arr_model_name, empty_value=0
)

In [108]:
# Evaluate the ensemble using the prediction files saved with the MULTI_ENS_PREF prefix
df_ens_ann, res_metrics = ens_performance(
    prefix_name=MULTI_ENS_PREF,
    arr_model_name=arr_model_name, ner_model_name=ner_model_name, ens_name=ens_name
)
print(res_metrics)

Nº executions: 15
(0.7332, 0.5802, 0.6478)




In [109]:
# Register the metrics of this ensemble under its name (dic_ens_res later feeds the summary table)
dic_ens_res[ens_name] = res_metrics

In [119]:
# Persist the ensemble annotations as a tab-separated file named after the joined model names
df_ens_ann.to_csv(
    RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + ".csv",
    sep='\t',
    header=True,
    index=False,
)

### BETO-Galén + mBERT-Galén + XLM-R-Galén

In [110]:
# Members of the three-model Galén ensemble and the name used for it in the result files
arr_model_name = ["beto_galen", "mbert_galen", "xlmr_galen"]
ens_name = "beto_galen_mbert_galen_xlmr_galen"

In [111]:
# IOB-level predictions of the ensemble
df_ens_iob = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_iob_" + ens_name + ".csv",
    sep='\t',
    header=0,
)

In [112]:
# Display the (rows, columns) shape of the ensemble IOB predictions
df_ens_iob.shape

(2957, 5)

In [113]:
# NER annotations stored for the first member of the ensemble (here 'beto_galen', despite the variable name)
df_ens_ner_beto = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + arr_model_name[0] + "_ann.csv",
    sep='\t',
    header=0,
)

In [114]:
# Display the (rows, columns) shape of the NER annotations of the first ensemble member
df_ens_ner_beto.shape

(2951, 5)

In [115]:
# Distinct "clinical_case|location" keys among the NER annotations of the first ensemble member
set_ens_ner_beto = set(
    df_ens_ner_beto.apply(
        lambda row: str(row['clinical_case']) + '|' + str(row['location']),
        axis=1,
    )
)

In [116]:
# Number of distinct keys; a value below the row count means some keys are repeated
len(set_ens_ner_beto)

2948

In [117]:
# NER annotations stored for the second member of the ensemble (here 'mbert_galen', despite the variable name)
df_ens_ner_mbert = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + arr_model_name[1] + "_ann.csv",
    sep='\t',
    header=0,
)

In [118]:
# Display the (rows, columns) shape of the NER annotations of the second ensemble member
df_ens_ner_mbert.shape

(2951, 5)

In [119]:
# Distinct "clinical_case|location" keys among the NER annotations of the second ensemble member
set_ens_ner_mbert = set(
    df_ens_ner_mbert.apply(
        lambda row: str(row['clinical_case']) + '|' + str(row['location']),
        axis=1,
    )
)

In [120]:
# Number of distinct keys; a value below the row count means some keys are repeated
len(set_ens_ner_mbert)

2948

In [121]:
# NER annotations stored for the third member of the ensemble (here 'xlmr_galen', despite the variable name)
df_ens_ner_xlmr = pd.read_csv(
    RES_DIR_ENS + "df_test_preds_ens_ner_" + ens_name + "_" + TYPE_TASK + "_hier_task_cls_" + arr_model_name[2] + "_ann.csv",
    sep='\t',
    header=0,
)

In [122]:
# Display the (rows, columns) shape of the NER annotations of the third ensemble member
df_ens_ner_xlmr.shape

(2950, 5)

In [123]:
# Distinct "clinical_case|location" keys among the NER annotations of the third ensemble member
set_ens_ner_xlmr = set(
    df_ens_ner_xlmr.apply(
        lambda row: str(row['clinical_case']) + '|' + str(row['location']),
        axis=1,
    )
)

In [124]:
# Number of distinct keys; a value below the row count means some keys are repeated
len(set_ens_ner_xlmr)

2947

In [125]:
# The first two ensemble members must cover exactly the same set of NER samples
assert set_ens_ner_beto == set_ens_ner_mbert

In [126]:
# Number of NER samples present for the third member but missing from the first one
len(set_ens_ner_xlmr - set_ens_ner_beto)

0

In [127]:
# Number of NER samples present for the first member but missing from the third one
len(set_ens_ner_beto - set_ens_ner_xlmr)

1

Both the BETO-Galén and mBERT-Galén models could be employed as the NER reference.

In [128]:
# Model whose NER predictions act as reference when padding the predictions of the other models
ner_model_name = "beto_galen"

In [129]:
# Pad the predictions of every ensemble member so they align with the reference NER samples
add_empty_ner_preds(
    ens_name=ens_name, ner_model_name=ner_model_name, arr_model_name=arr_model_name, empty_value=0
)

In [130]:
# Evaluate the ensemble using the prediction files saved with the MULTI_ENS_PREF prefix
df_ens_ann, res_metrics = ens_performance(
    prefix_name=MULTI_ENS_PREF,
    arr_model_name=arr_model_name, ner_model_name=ner_model_name, ens_name=ens_name
)
print(res_metrics)

Nº executions: 15
(0.7277, 0.5932, 0.6536)




In [131]:
# Register the metrics of this ensemble under its name (dic_ens_res later feeds the summary table)
dic_ens_res[ens_name] = res_metrics

In [141]:
# Persist the ensemble annotations as a tab-separated file named after the joined model names
df_ens_ann.to_csv(
    RES_DIR_ENS + "df_test_preds_" + TYPE_TASK + "_hier_task_cls_" + '_'.join(arr_model_name) + ".csv",
    sep='\t',
    header=True,
    index=False,
)

In [132]:
# Summary table: one row per entry of dic_ens_res, with columns P, R and F1
df = pd.DataFrame(dic_ens_res, index=["P", "R", "F1"]).T

In [133]:
# Display the summary table built from dic_ens_res
df

Unnamed: 0,P,R,F1
beto,0.7185,0.5711,0.6363
beto_galen,0.7051,0.5848,0.6394
mbert,0.7219,0.5735,0.6392
mbert_galen,0.7136,0.5795,0.6396
xlmr,0.7031,0.5742,0.6322
xlmr_galen,0.7047,0.5862,0.64
beto_beto_galen,0.7212,0.5844,0.6457
mbert_mbert_galen,0.7276,0.5788,0.6447
xlmr_xlmr_galen,0.7231,0.5862,0.6475
beto_mbert_xlmr,0.7332,0.5802,0.6478


## Multi-task NER

In [134]:
# Performance statistics of the multi-task runs: executions 1 to 5 of each of the six models
df = format_df_paper(
    model_performance(
        {
            model_name: [1, 2, 3, 4, 5]
            for model_name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
        },
        multi_task=True,
    )
)



In [135]:
# Display the table returned by format_df_paper for the multi-task runs
df

Unnamed: 0,P_avg_std,P_max,R_avg_std,R_max,F1_avg_std,F1_max
beto,.71 ± .019,0.736,.544 ± .01,0.553,.616 ± .005,0.623
beto_galen,.703 ± .019,0.735,.561 ± .002,0.563,.624 ± .008,0.637
mbert,.688 ± .017,0.703,.559 ± .007,0.564,.617 ± .006,0.621
mbert_galen,.701 ± .005,0.707,.566 ± .004,0.572,.627 ± .003,0.63
xlmr,.667 ± .015,0.688,.564 ± .003,0.569,.611 ± .007,0.62
xlmr_galen,.694 ± .009,0.704,.565 ± .004,0.569,.623 ± .006,0.628


Save the (F1) performance of all executions of all models

In [136]:
def multi_model_f1_values(dict_names_execs, df_gs=df_test_gs):
    """
    Collect, in a flat list, the F1 value obtained by every execution of every model.

    Models are visited in the iteration order of `dict_names_execs` and, for each model,
    executions are visited in the order of its list.

    dict_names_execs: each key is a string with the model name, and 
                      each value is a list with the random execs of the corresponding model.
    df_gs: gold-standard DataFrame; only the rows whose 'label_gs' equals TYPE_ANN are evaluated.
    """
    f1_scores = []
    for name, execs in dict_names_execs.items():
        for run in execs:
            # Predictions file of the current (model, execution) pair
            preds_path = RES_DIR + "df_test_preds_multi_task_ner_" + str(run) + "_" + TYPE_TASK + \
                    "_hier_task_cls_" + str(name) + "_" + str(run) + ".csv"
            df_run_preds = pd.read_csv(preds_path, header=0, sep='\t')
            gs_subset = df_gs[df_gs['label_gs'] == TYPE_ANN]
            formatted_preds = format_codiesp_x_pred_df(
                df_run=df_run_preds,
                valid_codes=valid_codes
            )
            # Only the third returned metric (F1) is kept
            _, _, f1_value = calculate_codiesp_x_metrics(df_gs=gs_subset, df_pred=formatted_preds)
            f1_scores.append(f1_value)
    return f1_scores

In [None]:
# NORM

In [137]:
# F1 of executions 1 to 5 of each of the six models, in the listed model order
arr_val = multi_model_f1_values(
    {
        model_name: [1, 2, 3, 4, 5]
        for model_name in ('beto', 'beto_galen', 'mbert', 'mbert_galen', 'xlmr', 'xlmr_galen')
    }
)



In [5]:
# Persist the collected F1 values without header or index
pd.DataFrame(arr_val).to_csv(
    RES_DIR + "norm_f1_exec_" + TYPE_TASK + "_multi_task_ner_hier_task.csv",
    sep='\t',
    header=False,
    index=False,
)