# Model evaluation

This notebook evaluates the clinical coding performance of the transformer-based models analyzed in this work on the CodiEsp-D task.

In [1]:
# Auxiliary components
import sys
sys.path.append("../../")
from nlp_utils import *

In [2]:
# Root directory of the CodiEsp corpus; the test gold-standard path is built from it further below.
corpus_path = "../../datasets/codiesp_v4/"

# Both arrays were saved in CodiEsp-D_XLM-R_Fine-Tuning.ipynb
# test_doc_list: passed to the evaluation helpers as `docs` / `doc_list`
#                (presumably the test document identifiers -- confirm against that notebook).
# codes: passed to the evaluation helpers as `codes_labels` / `label_encoder_classes`.
test_doc_list = np.load("test_docs.npy")
codes = np.load("classes.npy")

## Load models predictions

We load the coding predictions made by the models at the sentence level and then convert them to document-level predictions.

In [3]:
# Directory prefix of the per-seed test prediction files loaded in the cells below.
preds_dir = "predictions/"
# Seeds 0-4: one prediction file per seed is loaded for every model.
rnd_seeds = list(range(5))

### mBERT

In [4]:
model_name = "mbert"
# Fragment information of the test set for this model; max_fragment() uses it
# to turn fragment-level predictions into document-level ones.
mbert_test_frags = np.load(f"{model_name}_test_frags.npy")
# One document-level prediction array per random seed.
mbert_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=mbert_test_frags,
    )
    for seed in rnd_seeds
]

### mBERT-Galén

In [5]:
model_name = "mbert_galen"
# Reuses `mbert_test_frags` loaded in the mBERT cell above (no separate
# fragment file is loaded for this model).
# One document-level prediction array per random seed.
mbert_galen_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=mbert_test_frags,
    )
    for seed in rnd_seeds
]

### BETO

In [6]:
model_name = "beto"
# Fragment information of the test set for this model; max_fragment() uses it
# to turn fragment-level predictions into document-level ones.
beto_test_frags = np.load(f"{model_name}_test_frags.npy")
# One document-level prediction array per random seed.
beto_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=beto_test_frags,
    )
    for seed in rnd_seeds
]

### BETO-Galén

In [7]:
model_name = "beto_galen"
# Reuses `beto_test_frags` loaded in the BETO cell above (no separate
# fragment file is loaded for this model).
# One document-level prediction array per random seed.
beto_galen_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=beto_test_frags,
    )
    for seed in rnd_seeds
]

### XLM-R

In [8]:
model_name = "xlm_r"
# Fragment information of the test set for this model; max_fragment() uses it
# to turn fragment-level predictions into document-level ones.
xlmr_test_frags = np.load(f"{model_name}_test_frags.npy")
# One document-level prediction array per random seed.
xlmr_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=xlmr_test_frags,
    )
    for seed in rnd_seeds
]

### XLM-R-Galén

In [9]:
model_name = "xlm_r_galen"
# Reuses `xlmr_test_frags` loaded in the XLM-R cell above (no separate
# fragment file is loaded for this model).
# One document-level prediction array per random seed.
xlmr_galen_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=xlmr_test_frags,
    )
    for seed in rnd_seeds
]

## MAP

Firstly, we evaluate the models according to the main evaluation metric of the task, the Mean Average Precision (MAP) score.

In [10]:
# Valid codes: first column of the TSV shipped with the evaluation script,
# lower-cased and de-duplicated.
codes_path = "../../resources/codiesp-evaluation-script/codiesp_codes/codiesp-D_codes.tsv"
valid_codes = {
    code.lower()
    for code in pd.read_csv(codes_path, sep='\t', header=None, usecols=[0])[0].tolist()
}

# Gold Standard (GS) annotations: write the test annotations to an
# intermediate file in the format consumed by the MAP computation below.
test_gs_path = corpus_path + "test/testD.tsv"
test_gs_out_path = "./test_intermediate_gs_file.txt"
format_gs(filepath=test_gs_path, output_path=test_gs_out_path)

In [11]:
# Per-seed document-level predictions of every model, keyed by the label shown
# in the results table. Insertion order determines the row order of the table.
map_models = {
    "mBERT": mbert_doc_preds,
    "mBERT-Galén": mbert_galen_doc_preds,
    "BETO": beto_doc_preds,
    "BETO-Galén": beto_galen_doc_preds,
    "XLM-R": xlmr_doc_preds,
    "XLM-R-Galén": xlmr_galen_doc_preds,
}

# Score every model with exactly the same settings. Building the dict in one
# expression (instead of one copy-pasted cell per model) keeps the calls in sync
# and makes this cell idempotent when re-run.
map_res_dict = {
    model_label: compute_map_avg(doc_preds_arr=model_doc_preds, codes_labels=codes, docs=test_doc_list,
                                 valid_codes=valid_codes, gs_out_path=test_gs_out_path)
    for model_label, model_doc_preds in map_models.items()
}

In [18]:
# One row per model, one column per MAP statistic.
pd.DataFrame(map_res_dict).T

Unnamed: 0,MAP,MAP_std,MAP_max
mBERT,0.594,0.005,0.602
mBERT-Galén,0.609,0.006,0.616
BETO,0.584,0.002,0.586
BETO-Galén,0.616,0.002,0.619
XLM-R,0.595,0.004,0.601
XLM-R-Galén,0.611,0.003,0.615


### Ensemble

In [19]:
# MAP of every ensemble, keyed by ensemble name; filled in by the single- and
# multiple-model cells below and rendered as a table at the end of this section.
map_ens_res_dict = {}

#### Single-model

In [20]:
# Single-model ensembles: for each model, the per-seed document-level prediction
# arrays are summed element-wise and the summed scores are evaluated with MAP
# (rounded to 3 decimals). One loop replaces six copy-pasted cells so every model
# is scored with exactly the same call.
for model_label, seed_doc_preds in [
    ("mBERT", mbert_doc_preds),
    ("mBERT-Galén", mbert_galen_doc_preds),
    ("BETO", beto_doc_preds),
    ("BETO-Galén", beto_galen_doc_preds),
    ("XLM-R", xlmr_doc_preds),
    ("XLM-R-Galén", xlmr_galen_doc_preds),
]:
    ens_pred = prob_codiesp_prediction_format(y_pred=sum(seed_doc_preds), label_encoder_classes=codes,
                                              doc_list=test_doc_list)
    ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
    map_ens_res_dict[model_label] = pd.Series({"MAP": round(ens_map, 3)})

#### Multiple-model

In [26]:
# Multiple-model ensembles: each member model contributes the element-wise sum
# of its per-seed document-level predictions; the member sums are then added
# together and evaluated with MAP (rounded to 3 decimals). One loop replaces
# five copy-pasted cells so every ensemble is scored with exactly the same call.
for ens_label, member_doc_preds in [
    ("mBERT + mBERT-Galén", [mbert_doc_preds, mbert_galen_doc_preds]),
    ("BETO + BETO-Galén", [beto_doc_preds, beto_galen_doc_preds]),
    ("XLM-R + XLM-R-Galén", [xlmr_doc_preds, xlmr_galen_doc_preds]),
    ("mBERT + BETO + XLM-R", [mbert_doc_preds, beto_doc_preds, xlmr_doc_preds]),
    ("mBERT-Galén + BETO-Galén + XLM-R-Galén",
     [mbert_galen_doc_preds, beto_galen_doc_preds, xlmr_galen_doc_preds]),
]:
    # Same summation order as summing each model first and then the models.
    ens_scores = sum([sum(seed_doc_preds) for seed_doc_preds in member_doc_preds])
    ens_pred = prob_codiesp_prediction_format(y_pred=ens_scores, label_encoder_classes=codes,
                                              doc_list=test_doc_list)
    ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
    map_ens_res_dict[ens_label] = pd.Series({"MAP": round(ens_map, 3)})

In [31]:
# One row per ensemble with its MAP score.
pd.DataFrame(map_ens_res_dict).T

Unnamed: 0,MAP
mBERT,0.633
mBERT-Galén,0.64
BETO,0.625
BETO-Galén,0.648
XLM-R,0.629
XLM-R-Galén,0.645
mBERT + mBERT-Galén,0.65
BETO + BETO-Galén,0.649
XLM-R + XLM-R-Galén,0.651
mBERT + BETO + XLM-R,0.647


## Additional metrics

We also evaluate the models according to the micro-averaged precision, recall and F1-score. For this purpose, we use binary classification thresholds previously estimated on the development set of the CodiEsp-D corpus.

In [32]:
# Per-model P/R/F1 results, keyed by model name; filled in by the cells below
# and rendered as a table at the end of this section.
add_res_dict = {}

In [33]:
# Scratch file for the thresholded predictions that are scored against the gold
# standard; every evaluation below writes to this same path.
test_pred_file_path = "./test_intermediate_pred_file.txt"

In [34]:
# Binary classification threshold passed as `thr` when scoring mBERT predictions.
mbert_thr = 0.16

In [35]:
%%time
# P/R/F1 of mBERT over its per-seed document-level predictions.
add_res_dict["mBERT"] = compute_metrics_avg(
    thr=mbert_thr,
    doc_preds_arr=mbert_doc_preds,
    codes_labels=codes,
    docs=list(test_doc_list),
    pred_file_path=test_pred_file_path,
    gs_path=test_gs_path,
    valid_codes_path=codes_path,
)

CPU times: user 4.8 s, sys: 20 ms, total: 4.82 s
Wall time: 4.82 s


In [36]:
# Binary classification threshold passed as `thr` when scoring mBERT-Galén predictions.
mbert_galen_thr = 0.23

In [37]:
%%time
# P/R/F1 of mBERT-Galén over its per-seed document-level predictions.
add_res_dict["mBERT-Galén"] = compute_metrics_avg(
    thr=mbert_galen_thr,
    doc_preds_arr=mbert_galen_doc_preds,
    codes_labels=codes,
    docs=list(test_doc_list),
    pred_file_path=test_pred_file_path,
    gs_path=test_gs_path,
    valid_codes_path=codes_path,
)

CPU times: user 4.84 s, sys: 8.61 ms, total: 4.84 s
Wall time: 4.84 s


In [38]:
# Binary classification threshold passed as `thr` when scoring BETO predictions.
beto_thr = 0.19

In [39]:
%%time
# P/R/F1 of BETO over its per-seed document-level predictions.
add_res_dict["BETO"] = compute_metrics_avg(
    thr=beto_thr,
    doc_preds_arr=beto_doc_preds,
    codes_labels=codes,
    docs=list(test_doc_list),
    pred_file_path=test_pred_file_path,
    gs_path=test_gs_path,
    valid_codes_path=codes_path,
)

CPU times: user 4.78 s, sys: 8.62 ms, total: 4.79 s
Wall time: 4.79 s


In [40]:
# Binary classification threshold passed as `thr` when scoring BETO-Galén predictions.
beto_galen_thr = 0.31

In [41]:
%%time
# P/R/F1 of BETO-Galén over its per-seed document-level predictions.
add_res_dict["BETO-Galén"] = compute_metrics_avg(
    thr=beto_galen_thr,
    doc_preds_arr=beto_galen_doc_preds,
    codes_labels=codes,
    docs=list(test_doc_list),
    pred_file_path=test_pred_file_path,
    gs_path=test_gs_path,
    valid_codes_path=codes_path,
)

CPU times: user 4.8 s, sys: 11.5 ms, total: 4.82 s
Wall time: 4.82 s


In [42]:
# Binary classification threshold passed as `thr` when scoring XLM-R predictions.
xlmr_thr = 0.16

In [43]:
%%time
# P/R/F1 of XLM-R over its per-seed document-level predictions.
add_res_dict["XLM-R"] = compute_metrics_avg(
    thr=xlmr_thr,
    doc_preds_arr=xlmr_doc_preds,
    codes_labels=codes,
    docs=list(test_doc_list),
    pred_file_path=test_pred_file_path,
    gs_path=test_gs_path,
    valid_codes_path=codes_path,
)

CPU times: user 4.85 s, sys: 18.5 ms, total: 4.86 s
Wall time: 4.85 s


In [44]:
# Binary classification threshold passed as `thr` when scoring XLM-R-Galén predictions.
xlmr_galen_thr = 0.27

In [45]:
%%time
# P/R/F1 of XLM-R-Galén over its per-seed document-level predictions.
add_res_dict["XLM-R-Galén"] = compute_metrics_avg(
    thr=xlmr_galen_thr,
    doc_preds_arr=xlmr_galen_doc_preds,
    codes_labels=codes,
    docs=list(test_doc_list),
    pred_file_path=test_pred_file_path,
    gs_path=test_gs_path,
    valid_codes_path=codes_path,
)

CPU times: user 4.82 s, sys: 12.2 ms, total: 4.83 s
Wall time: 4.83 s


In [47]:
# One row per model, one column per precision/recall/F1 statistic.
pd.DataFrame(add_res_dict).T

Unnamed: 0,P,P_std,P_max,R,R_std,R_max,F1,F1_std,F1_max
mBERT,0.685,0.01,0.692,0.576,0.007,0.586,0.626,0.004,0.634
mBERT-Galén,0.719,0.017,0.742,0.565,0.009,0.577,0.633,0.01,0.646
BETO,0.696,0.014,0.717,0.553,0.007,0.561,0.616,0.003,0.621
BETO-Galén,0.754,0.011,0.766,0.559,0.007,0.569,0.642,0.003,0.646
XLM-R,0.666,0.023,0.691,0.582,0.006,0.594,0.62,0.012,0.635
XLM-R-Galén,0.71,0.013,0.726,0.574,0.007,0.583,0.635,0.007,0.642


### Ensemble

In [48]:
# P/R/F1 of every ensemble, keyed by ensemble name; filled in by the single- and
# multiple-model cells below and rendered as a table at the end of this section.
add_ens_res_dict = {}

#### Single-model

In [49]:
# Single-model ensembles: for each model, the per-seed document-level predictions
# are averaged, thresholded with that model's threshold, written to the scratch
# prediction file and scored against the gold standard. One loop replaces six
# copy-pasted cells so every model goes through exactly the same steps.
for model_label, seed_doc_preds, model_thr in [
    ("mBERT", mbert_doc_preds, mbert_thr),
    ("mBERT-Galén", mbert_galen_doc_preds, mbert_galen_thr),
    ("BETO", beto_doc_preds, beto_thr),
    ("BETO-Galén", beto_galen_doc_preds, beto_galen_thr),
    ("XLM-R", xlmr_doc_preds, xlmr_thr),
    ("XLM-R-Galén", xlmr_galen_doc_preds, xlmr_galen_thr),
]:
    # Divide by the number of arrays actually summed (one per seed in `rnd_seeds`),
    # so the mean stays correct even if `rnd_seeds` is changed after loading.
    df_test_pred = thr_codiesp_prediction_format(y_pred=sum(seed_doc_preds)/len(seed_doc_preds),
                                                 label_encoder_classes=codes,
                                                 doc_list=test_doc_list, thr=model_thr)
    # The scorer reads predictions from disk, so each ensemble overwrites the scratch file.
    df_test_pred[["doc_id", "code"]].to_csv(path_or_buf=test_pred_file_path, sep="\t", header=False, index=False)
    p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
    add_ens_res_dict[model_label] = pd.Series({"P": p, "R": r, "F1": f1})

#### Multiple-model

In [55]:
# Multiple-model ensembles: the per-seed document-level predictions of all member
# models are averaged, thresholded with the mean of the members' thresholds,
# written to the scratch prediction file and scored against the gold standard.
# One loop replaces five copy-pasted cells; the divisors are derived from the
# member list instead of hard-coded 2s and 3s.
for ens_label, members in [
    ("mBERT + mBERT-Galén", [(mbert_doc_preds, mbert_thr), (mbert_galen_doc_preds, mbert_galen_thr)]),
    ("BETO + BETO-Galén", [(beto_doc_preds, beto_thr), (beto_galen_doc_preds, beto_galen_thr)]),
    ("XLM-R + XLM-R-Galén", [(xlmr_doc_preds, xlmr_thr), (xlmr_galen_doc_preds, xlmr_galen_thr)]),
    ("mBERT + BETO + XLM-R", [(mbert_doc_preds, mbert_thr), (beto_doc_preds, beto_thr),
                              (xlmr_doc_preds, xlmr_thr)]),
    ("mBERT-Galén + BETO-Galén + XLM-R-Galén", [(mbert_galen_doc_preds, mbert_galen_thr),
                                                (beto_galen_doc_preds, beto_galen_thr),
                                                (xlmr_galen_doc_preds, xlmr_galen_thr)]),
]:
    # Sum each model's runs first, then the models (same order as the original cells).
    ens_total = sum([sum(seed_doc_preds) for seed_doc_preds, _ in members])
    # Total number of prediction arrays that went into `ens_total`.
    n_runs = sum(len(seed_doc_preds) for seed_doc_preds, _ in members)
    # Ensemble threshold: arithmetic mean of the member models' thresholds.
    ens_thr = sum([model_thr for _, model_thr in members])/len(members)

    df_test_pred = thr_codiesp_prediction_format(y_pred=ens_total/n_runs,
                                                 label_encoder_classes=codes, doc_list=test_doc_list,
                                                 thr=ens_thr)
    # The scorer reads predictions from disk, so each ensemble overwrites the scratch file.
    df_test_pred[["doc_id", "code"]].to_csv(path_or_buf=test_pred_file_path, sep="\t", header=False, index=False)
    p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
    add_ens_res_dict[ens_label] = pd.Series({"P": p, "R": r, "F1": f1})

In [60]:
# One row per ensemble with its precision, recall and F1.
pd.DataFrame(add_ens_res_dict).T

Unnamed: 0,P,R,F1
mBERT,0.673,0.622,0.647
mBERT-Galén,0.732,0.6,0.659
BETO,0.703,0.595,0.645
BETO-Galén,0.781,0.588,0.671
XLM-R,0.649,0.627,0.638
XLM-R-Galén,0.732,0.603,0.662
mBERT + mBERT-Galén,0.72,0.619,0.665
BETO + BETO-Galén,0.766,0.599,0.672
XLM-R + XLM-R-Galén,0.717,0.626,0.669
mBERT + BETO + XLM-R,0.703,0.624,0.661
