# Model evaluation

In this notebook, the transformer-based models analyzed in this work are evaluated on their clinical coding performance in the Cantemist-Coding task.

In [1]:
# Auxiliary components
# Add the directory two levels up to the module search path so that
# `nlp_utils` can be imported (presumably that is where it lives -- confirm).
import sys
sys.path.append("../../")
# NOTE(review): the cells below use `np`, `pd` and helpers such as
# `max_fragment` or `compute_map` without importing them explicitly, so they
# are presumably provided by this star import -- confirm against nlp_utils.
from nlp_utils import *

In [2]:
# Location of the sub-task data inside the Cantemist corpus
sub_task_path = "cantemist-coding/"
corpus_path = "../../datasets/cantemist_v6/"

# Arrays previously saved in Cantemist-Coding_XLM-R_Fine-Tuning.ipynb:
# the test document list and the code classes
codes = np.load("classes.npy")
test_doc_list = np.load("test_docs.npy")

## Load model predictions

We load the coding predictions made by the models at the sentence level and then convert them to document-level predictions.

In [3]:
# Seed indices (0-4) that appear in the prediction file names
rnd_seeds = [*range(5)]
# Directory holding the stored per-seed prediction arrays
preds_dir = "predictions/"

### mBERT

In [4]:
model_name = "mbert"
# Fragment information of the test set for this model, passed to max_fragment
mbert_test_frags = np.load(f"{model_name}_test_frags.npy")
# One document-level prediction array per seed
mbert_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=mbert_test_frags,
    )
    for seed in rnd_seeds
]

### mBERT-Galén

In [5]:
model_name = "mbert_galen"
# One document-level prediction array per seed; the fragment information
# loaded for mBERT (mbert_test_frags) is reused here
mbert_galen_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=mbert_test_frags,
    )
    for seed in rnd_seeds
]

### BETO

In [6]:
model_name = "beto"
# Fragment information of the test set for this model, passed to max_fragment
beto_test_frags = np.load(f"{model_name}_test_frags.npy")
# One document-level prediction array per seed
beto_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=beto_test_frags,
    )
    for seed in rnd_seeds
]

### BETO-Galén

In [7]:
model_name = "beto_galen"
# One document-level prediction array per seed; the fragment information
# loaded for BETO (beto_test_frags) is reused here
beto_galen_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=beto_test_frags,
    )
    for seed in rnd_seeds
]

### XLM-R

In [8]:
model_name = "xlm_r"
# Fragment information of the test set for this model, passed to max_fragment
xlmr_test_frags = np.load(f"{model_name}_test_frags.npy")
# One document-level prediction array per seed
xlmr_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=xlmr_test_frags,
    )
    for seed in rnd_seeds
]

### XLM-R-Galén

In [9]:
model_name = "xlm_r_galen"
# One document-level prediction array per seed; the fragment information
# loaded for XLM-R (xlmr_test_frags) is reused here
xlmr_galen_doc_preds = [
    max_fragment(
        y_frag_pred=np.load(f"{preds_dir}{model_name}_seed_{seed}_test_preds.npy"),
        n_fragments=xlmr_test_frags,
    )
    for seed in rnd_seeds
]

## MAP

Firstly, we evaluate the models according to the main evaluation metric of the task, the Mean Average Precision (MAP) score.

In [10]:
# Valid codes: first column of the header-less TSV, lower-cased and de-duplicated
codes_path = "../../resources/cantemist-evaluation-library/valid-codes.tsv"
valid_codes = {
    raw_code.lower()
    for raw_code in pd.read_csv(codes_path, sep="\t", header=None, usecols=[0])[0].tolist()
}

# Gold Standard (GS) annotations: format_gs reads the test-set TSV and writes
# its output to an intermediate file
test_gs_path = f"{corpus_path}test-set/{sub_task_path}test-coding.tsv"
test_gs_out_path = "./test_intermediate_gs_file.txt"
format_gs(filepath=test_gs_path, output_path=test_gs_out_path)

In [11]:
# Per-model MAP results, keyed by model name
map_res_dict = dict()

In [12]:
# MAP results for mBERT, computed from its per-seed document-level predictions
map_res_dict["mBERT"] = compute_map_avg(
    gs_out_path=test_gs_out_path,
    valid_codes=valid_codes,
    docs=test_doc_list,
    codes_labels=codes,
    doc_preds_arr=mbert_doc_preds,
)

In [13]:
# MAP results for mBERT-Galén, computed from its per-seed document-level predictions
map_res_dict["mBERT-Galén"] = compute_map_avg(
    gs_out_path=test_gs_out_path,
    valid_codes=valid_codes,
    docs=test_doc_list,
    codes_labels=codes,
    doc_preds_arr=mbert_galen_doc_preds,
)

In [14]:
# MAP results for BETO, computed from its per-seed document-level predictions
map_res_dict["BETO"] = compute_map_avg(
    gs_out_path=test_gs_out_path,
    valid_codes=valid_codes,
    docs=test_doc_list,
    codes_labels=codes,
    doc_preds_arr=beto_doc_preds,
)

In [15]:
# MAP results for BETO-Galén, computed from its per-seed document-level predictions
map_res_dict["BETO-Galén"] = compute_map_avg(
    gs_out_path=test_gs_out_path,
    valid_codes=valid_codes,
    docs=test_doc_list,
    codes_labels=codes,
    doc_preds_arr=beto_galen_doc_preds,
)

In [16]:
# MAP results for XLM-R, computed from its per-seed document-level predictions
map_res_dict["XLM-R"] = compute_map_avg(
    gs_out_path=test_gs_out_path,
    valid_codes=valid_codes,
    docs=test_doc_list,
    codes_labels=codes,
    doc_preds_arr=xlmr_doc_preds,
)

In [17]:
# MAP results for XLM-R-Galén, computed from its per-seed document-level predictions
map_res_dict["XLM-R-Galén"] = compute_map_avg(
    gs_out_path=test_gs_out_path,
    valid_codes=valid_codes,
    docs=test_doc_list,
    codes_labels=codes,
    doc_preds_arr=xlmr_galen_doc_preds,
)

In [18]:
# One row per model, one column per MAP statistic
pd.DataFrame(map_res_dict).T

Unnamed: 0,MAP,MAP_std,MAP_max
mBERT,0.834,0.004,0.838
mBERT-Galén,0.858,0.002,0.86
BETO,0.818,0.005,0.824
BETO-Galén,0.862,0.001,0.864
XLM-R,0.836,0.004,0.841
XLM-R-Galén,0.859,0.001,0.861


### Ensemble

In [19]:
# MAP results of the ensembles, keyed by ensemble name
map_ens_res_dict = dict()

#### Single-model

In [20]:
# Ensemble of the mBERT runs: the per-seed predictions are summed and scored with MAP
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(mbert_doc_preds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["mBERT"] = pd.Series({"MAP": round(ens_map, 3)})

In [21]:
# Ensemble of the mBERT-Galén runs: the per-seed predictions are summed and scored with MAP
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(mbert_galen_doc_preds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["mBERT-Galén"] = pd.Series({"MAP": round(ens_map, 3)})

In [22]:
# Ensemble of the BETO runs: the per-seed predictions are summed and scored with MAP
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(beto_doc_preds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["BETO"] = pd.Series({"MAP": round(ens_map, 3)})

In [23]:
# Ensemble of the BETO-Galén runs: the per-seed predictions are summed and scored with MAP
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(beto_galen_doc_preds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["BETO-Galén"] = pd.Series({"MAP": round(ens_map, 3)})

In [24]:
# Ensemble of the XLM-R runs: the per-seed predictions are summed and scored with MAP
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(xlmr_doc_preds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["XLM-R"] = pd.Series({"MAP": round(ens_map, 3)})

In [25]:
# Ensemble of the XLM-R-Galén runs: the per-seed predictions are summed and scored with MAP
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(xlmr_galen_doc_preds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["XLM-R-Galén"] = pd.Series({"MAP": round(ens_map, 3)})

#### Multiple-model

In [26]:
# Two-model ensemble: summed per-seed predictions of mBERT and mBERT-Galén
ens_members = [sum(mbert_doc_preds), sum(mbert_galen_doc_preds)]
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(ens_members),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["mBERT + mBERT-Galén"] = pd.Series({"MAP": round(ens_map, 3)})

In [27]:
# Two-model ensemble: summed per-seed predictions of BETO and BETO-Galén
ens_members = [sum(beto_doc_preds), sum(beto_galen_doc_preds)]
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(ens_members),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["BETO + BETO-Galén"] = pd.Series({"MAP": round(ens_map, 3)})

In [28]:
# Two-model ensemble: summed per-seed predictions of XLM-R and XLM-R-Galén
ens_members = [sum(xlmr_doc_preds), sum(xlmr_galen_doc_preds)]
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(ens_members),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["XLM-R + XLM-R-Galén"] = pd.Series({"MAP": round(ens_map, 3)})

In [29]:
# Three-model ensemble: summed per-seed predictions of mBERT, BETO and XLM-R
ens_members = [sum(mbert_doc_preds), sum(beto_doc_preds), sum(xlmr_doc_preds)]
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(ens_members),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["mBERT + BETO + XLM-R"] = pd.Series({"MAP": round(ens_map, 3)})

In [30]:
# Three-model ensemble: summed per-seed predictions of the three Galén models
ens_members = [
    sum(mbert_galen_doc_preds),
    sum(beto_galen_doc_preds),
    sum(xlmr_galen_doc_preds),
]
ens_pred = prob_codiesp_prediction_format(
    y_pred=sum(ens_members),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
)
ens_map = compute_map(valid_codes=valid_codes, pred=ens_pred, gs_out_path=test_gs_out_path)
map_ens_res_dict["mBERT-Galén + BETO-Galén + XLM-R-Galén"] = pd.Series({"MAP": round(ens_map, 3)})

In [31]:
# One row per ensemble with its MAP score
pd.DataFrame(map_ens_res_dict).T

Unnamed: 0,MAP
mBERT,0.861
mBERT-Galén,0.876
BETO,0.853
BETO-Galén,0.88
XLM-R,0.862
XLM-R-Galén,0.875
mBERT + mBERT-Galén,0.874
BETO + BETO-Galén,0.874
XLM-R + XLM-R-Galén,0.874
mBERT + BETO + XLM-R,0.87


## Additional metrics

We also evaluate the models according to the micro-averaged precision, recall and F1-score. For this purpose, we use binary classification thresholds previously estimated on the development set of the Cantemist-Coding corpus.

In [32]:
# Per-model precision/recall/F1 results, keyed by model name
add_res_dict = dict()

In [33]:
# Intermediate file for predictions; later cells pass it to compute_metrics_avg
# and also write thresholded predictions to it before calling compute_p_r_f1.
test_pred_file_path = "./test_intermediate_pred_file.txt"

In [34]:
# Classification threshold for mBERT (the notebook text states that the
# thresholds were estimated on the development set).
mbert_thr = 0.66

In [35]:
%%time
add_res_dict["mBERT"] = compute_metrics_avg(thr=mbert_thr, doc_preds_arr=mbert_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 3.05 s, sys: 8.94 ms, total: 3.06 s
Wall time: 3.06 s


In [36]:
# Classification threshold for mBERT-Galén (the notebook text states that the
# thresholds were estimated on the development set).
mbert_galen_thr = 0.37

In [37]:
%%time
add_res_dict["mBERT-Galén"] = compute_metrics_avg(thr=mbert_galen_thr, doc_preds_arr=mbert_galen_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 3.05 s, sys: 7.76 ms, total: 3.06 s
Wall time: 3.05 s


In [38]:
# Classification threshold for BETO (the notebook text states that the
# thresholds were estimated on the development set).
beto_thr = 0.57

In [39]:
%%time
add_res_dict["BETO"] = compute_metrics_avg(thr=beto_thr, doc_preds_arr=beto_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 3.18 s, sys: 5.24 ms, total: 3.19 s
Wall time: 3.18 s


In [40]:
# Classification threshold for BETO-Galén (the notebook text states that the
# thresholds were estimated on the development set).
beto_galen_thr = 0.59

In [41]:
%%time
add_res_dict["BETO-Galén"] = compute_metrics_avg(thr=beto_galen_thr, doc_preds_arr=beto_galen_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 3.26 s, sys: 7.1 ms, total: 3.27 s
Wall time: 3.27 s


In [42]:
# Classification threshold for XLM-R (the notebook text states that the
# thresholds were estimated on the development set).
xlmr_thr = 0.42

In [43]:
%%time
add_res_dict["XLM-R"] = compute_metrics_avg(thr=xlmr_thr, doc_preds_arr=xlmr_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 3.34 s, sys: 4.39 ms, total: 3.35 s
Wall time: 3.34 s


In [44]:
# Classification threshold for XLM-R-Galén (the notebook text states that the
# thresholds were estimated on the development set).
xlmr_galen_thr = 0.45

In [45]:
%%time
add_res_dict["XLM-R-Galén"] = compute_metrics_avg(thr=xlmr_galen_thr, doc_preds_arr=xlmr_galen_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 3.05 s, sys: 1.38 ms, total: 3.05 s
Wall time: 3.04 s


In [46]:
# One row per model, one column per precision/recall/F1 statistic
pd.DataFrame(add_res_dict).T

Unnamed: 0,P,P_std,P_max,R,R_std,R_max,F1,F1_std,F1_max
mBERT,0.871,0.012,0.882,0.736,0.004,0.742,0.798,0.005,0.805
mBERT-Galén,0.878,0.008,0.891,0.763,0.007,0.776,0.817,0.001,0.819
BETO,0.869,0.016,0.888,0.711,0.014,0.728,0.782,0.005,0.791
BETO-Galén,0.905,0.007,0.917,0.761,0.004,0.768,0.827,0.003,0.832
XLM-R,0.852,0.012,0.868,0.751,0.006,0.762,0.799,0.006,0.805
XLM-R-Galén,0.875,0.008,0.889,0.768,0.003,0.771,0.818,0.003,0.823


### Ensemble

In [47]:
# Precision/recall/F1 results of the ensembles, keyed by ensemble name
add_ens_res_dict = dict()

#### Single-model

In [48]:
# Average the per-seed mBERT predictions and format them with the mBERT threshold
ens_avg_pred = sum(mbert_doc_preds) / len(rnd_seeds)
df_test_pred = thr_codiesp_prediction_format(
    thr=mbert_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["mBERT"] = pd.Series(dict(P=p, R=r, F1=f1))

In [49]:
# Average the per-seed mBERT-Galén predictions and format them with the mBERT-Galén threshold
ens_avg_pred = sum(mbert_galen_doc_preds) / len(rnd_seeds)
df_test_pred = thr_codiesp_prediction_format(
    thr=mbert_galen_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["mBERT-Galén"] = pd.Series(dict(P=p, R=r, F1=f1))

In [50]:
# Average the per-seed BETO predictions and format them with the BETO threshold
ens_avg_pred = sum(beto_doc_preds) / len(rnd_seeds)
df_test_pred = thr_codiesp_prediction_format(
    thr=beto_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["BETO"] = pd.Series(dict(P=p, R=r, F1=f1))

In [51]:
# Average the per-seed BETO-Galén predictions and format them with the BETO-Galén threshold
ens_avg_pred = sum(beto_galen_doc_preds) / len(rnd_seeds)
df_test_pred = thr_codiesp_prediction_format(
    thr=beto_galen_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["BETO-Galén"] = pd.Series(dict(P=p, R=r, F1=f1))

In [52]:
# Average the per-seed XLM-R predictions and format them with the XLM-R threshold
ens_avg_pred = sum(xlmr_doc_preds) / len(rnd_seeds)
df_test_pred = thr_codiesp_prediction_format(
    thr=xlmr_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["XLM-R"] = pd.Series(dict(P=p, R=r, F1=f1))

In [53]:
# Average the per-seed XLM-R-Galén predictions and format them with the XLM-R-Galén threshold
ens_avg_pred = sum(xlmr_galen_doc_preds) / len(rnd_seeds)
df_test_pred = thr_codiesp_prediction_format(
    thr=xlmr_galen_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["XLM-R-Galén"] = pd.Series(dict(P=p, R=r, F1=f1))

#### Multiple-model

In [54]:
# mBERT + mBERT-Galén: average over all runs of both models, and use the
# mean of the two models' thresholds
ens_members = [sum(mbert_doc_preds), sum(mbert_galen_doc_preds)]
ens_avg_pred = sum(ens_members) / (len(rnd_seeds) * 2)
ens_thr = (mbert_thr + mbert_galen_thr) / 2
df_test_pred = thr_codiesp_prediction_format(
    thr=ens_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["mBERT + mBERT-Galén"] = pd.Series(dict(P=p, R=r, F1=f1))

In [55]:
# BETO + BETO-Galén: average over all runs of both models, and use the
# mean of the two models' thresholds
ens_members = [sum(beto_doc_preds), sum(beto_galen_doc_preds)]
ens_avg_pred = sum(ens_members) / (len(rnd_seeds) * 2)
ens_thr = (beto_thr + beto_galen_thr) / 2
df_test_pred = thr_codiesp_prediction_format(
    thr=ens_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["BETO + BETO-Galén"] = pd.Series(dict(P=p, R=r, F1=f1))

In [56]:
# XLM-R + XLM-R-Galén: average over all runs of both models, and use the
# mean of the two models' thresholds
ens_members = [sum(xlmr_doc_preds), sum(xlmr_galen_doc_preds)]
ens_avg_pred = sum(ens_members) / (len(rnd_seeds) * 2)
ens_thr = (xlmr_thr + xlmr_galen_thr) / 2
df_test_pred = thr_codiesp_prediction_format(
    thr=ens_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["XLM-R + XLM-R-Galén"] = pd.Series(dict(P=p, R=r, F1=f1))

In [57]:
# mBERT + BETO + XLM-R: average over all runs of the three models, and use
# the mean of the three models' thresholds
ens_members = [sum(mbert_doc_preds), sum(beto_doc_preds), sum(xlmr_doc_preds)]
ens_avg_pred = sum(ens_members) / (len(rnd_seeds) * 3)
ens_thr = (mbert_thr + beto_thr + xlmr_thr) / 3
df_test_pred = thr_codiesp_prediction_format(
    thr=ens_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["mBERT + BETO + XLM-R"] = pd.Series(dict(P=p, R=r, F1=f1))

In [58]:
# Three Galén models: average over all runs of the three models, and use
# the mean of the three models' thresholds
ens_members = [
    sum(mbert_galen_doc_preds),
    sum(beto_galen_doc_preds),
    sum(xlmr_galen_doc_preds),
]
ens_avg_pred = sum(ens_members) / (len(rnd_seeds) * 3)
ens_thr = (mbert_galen_thr + beto_galen_thr + xlmr_galen_thr) / 3
df_test_pred = thr_codiesp_prediction_format(
    thr=ens_thr,
    doc_list=test_doc_list,
    label_encoder_classes=codes,
    y_pred=ens_avg_pred,
)
# Write the (doc_id, code) columns as a header-less TSV, then score the file
df_test_pred.to_csv(
    path_or_buf=test_pred_file_path,
    columns=["doc_id", "code"],
    sep="\t",
    header=False,
    index=False,
)
p, r, f1 = compute_p_r_f1(gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path)
add_ens_res_dict["mBERT-Galén + BETO-Galén + XLM-R-Galén"] = pd.Series(dict(P=p, R=r, F1=f1))

In [59]:
# One row per ensemble with its precision, recall and F1
pd.DataFrame(add_ens_res_dict).T

Unnamed: 0,P,R,F1
mBERT,0.936,0.729,0.819
mBERT-Galén,0.895,0.78,0.833
BETO,0.927,0.712,0.806
BETO-Galén,0.935,0.755,0.835
XLM-R,0.895,0.761,0.822
XLM-R-Galén,0.902,0.776,0.834
mBERT + mBERT-Galén,0.925,0.756,0.832
BETO + BETO-Galén,0.946,0.738,0.829
XLM-R + XLM-R-Galén,0.907,0.776,0.836
mBERT + BETO + XLM-R,0.939,0.736,0.825
