# Model evaluation

In this notebook, the transformer-based models analyzed in this work are evaluated on their clinical coding performance in the CodiEsp-P task.

In [1]:
# Auxiliary components
# Add the directory two levels up to the module search path so that nlp_utils can be imported.
import sys
sys.path.append("../../")
# NOTE(review): wildcard import. np, pd and the helper functions used in the cells below
# are never imported explicitly in this notebook, so presumably they all come from
# nlp_utils — confirm, and consider importing them by name.
from nlp_utils import *

In [2]:
# Base directory of the CodiEsp corpus; the gold-standard test file is resolved against it
corpus_path = "../../datasets/codiesp_v4/"

# The two .npy files below were written by CodiEsp-P_XLM-R_Fine-Tuning.ipynb
test_doc_list = np.load(file="test_docs.npy")
codes = np.load(file="classes.npy")

## Load model predictions

We load the coding predictions made by the models at sentence level and then convert them to document-level predictions.

In [3]:
# Directory holding the per-seed prediction files of every model
preds_dir = "predictions/"
# Seed indices (0..4) used to build the names of the per-seed prediction files
rnd_seeds = [*range(5)]

### mBERT

In [4]:
model_name = "mbert"
# Fragment information of the test set for this model, passed to max_fragment as n_fragments
mbert_test_frags = np.load(model_name + "_test_frags.npy")

# One document-level prediction array per seed: every seed's fragment-level
# predictions are loaded from disk and converted with max_fragment
mbert_doc_preds = [
    max_fragment(y_frag_pred=np.load(seed_preds_path), n_fragments=mbert_test_frags)
    for seed_preds_path in (
        "".join([preds_dir, model_name, "_seed_", str(seed), "_test_preds.npy"])
        for seed in rnd_seeds
    )
]

### mBERT-Galén

In [5]:
model_name = "mbert_galen"
# One document-level prediction array per seed, built with max_fragment.
# The fragment information is the one loaded for mBERT (mbert_test_frags).
# NOTE(review): presumably both models split the test set into the same fragments — confirm.
mbert_galen_doc_preds = [
    max_fragment(y_frag_pred=np.load(seed_preds_path), n_fragments=mbert_test_frags)
    for seed_preds_path in (
        "".join([preds_dir, model_name, "_seed_", str(seed), "_test_preds.npy"])
        for seed in rnd_seeds
    )
]

### XLM-R

In [6]:
model_name = "xlm_r"
# Fragment information of the test set for this model, passed to max_fragment as n_fragments
xlmr_test_frags = np.load(model_name + "_test_frags.npy")

# One document-level prediction array per seed: every seed's fragment-level
# predictions are loaded from disk and converted with max_fragment
xlmr_doc_preds = [
    max_fragment(y_frag_pred=np.load(seed_preds_path), n_fragments=xlmr_test_frags)
    for seed_preds_path in (
        "".join([preds_dir, model_name, "_seed_", str(seed), "_test_preds.npy"])
        for seed in rnd_seeds
    )
]

### XLM-R-Galén

In [7]:
model_name = "xlm_r_galen"
# One document-level prediction array per seed, built with max_fragment.
# The fragment information is the one loaded for XLM-R (xlmr_test_frags).
# NOTE(review): presumably both models split the test set into the same fragments — confirm.
xlmr_galen_doc_preds = [
    max_fragment(y_frag_pred=np.load(seed_preds_path), n_fragments=xlmr_test_frags)
    for seed_preds_path in (
        "".join([preds_dir, model_name, "_seed_", str(seed), "_test_preds.npy"])
        for seed in rnd_seeds
    )
]

## MAP

Firstly, we evaluate the models according to the main evaluation metric of the task, the Mean Average Precision (MAP) score.

In [8]:
# Load valid codes: first column of the TSV, lower-cased and de-duplicated.
# dtype=str forces every code to be read as text. Without it, pandas type inference
# may parse all-digit codes as integers, which drops leading zeros and makes the
# .lower() call fail.
codes_path = "../../resources/codiesp-evaluation-script/codiesp_codes/codiesp-P_codes.tsv"
valid_codes = {
    code.lower()
    for code in pd.read_csv(codes_path, sep='\t', header=None,
                            usecols=[0], dtype=str)[0]
}

# Format Gold Standard (GS) annotations: format_gs reads the test GS file and writes
# its output to the intermediate file that is later passed to the MAP computations
test_gs_path = corpus_path + "test/testP.tsv"
test_gs_out_path = "./test_intermediate_gs_file.txt"
format_gs(filepath=test_gs_path, output_path=test_gs_out_path)

In [9]:
# MAP results of the individual models, keyed by model name
map_res_dict = dict()

In [10]:
# MAP of the mBERT runs, computed from the per-seed document-level predictions
map_res_dict["mBERT"] = compute_map_avg(
    doc_preds_arr=mbert_doc_preds,
    codes_labels=codes,
    docs=test_doc_list,
    valid_codes=valid_codes,
    gs_out_path=test_gs_out_path,
)

In [11]:
# MAP of the mBERT-Galén runs, computed from the per-seed document-level predictions
map_res_dict["mBERT-Galén"] = compute_map_avg(
    doc_preds_arr=mbert_galen_doc_preds,
    codes_labels=codes,
    docs=test_doc_list,
    valid_codes=valid_codes,
    gs_out_path=test_gs_out_path,
)

In [12]:
# MAP of the XLM-R runs, computed from the per-seed document-level predictions
map_res_dict["XLM-R"] = compute_map_avg(
    doc_preds_arr=xlmr_doc_preds,
    codes_labels=codes,
    docs=test_doc_list,
    valid_codes=valid_codes,
    gs_out_path=test_gs_out_path,
)

In [13]:
# MAP of the XLM-R-Galén runs, computed from the per-seed document-level predictions
map_res_dict["XLM-R-Galén"] = compute_map_avg(
    doc_preds_arr=xlmr_galen_doc_preds,
    codes_labels=codes,
    docs=test_doc_list,
    valid_codes=valid_codes,
    gs_out_path=test_gs_out_path,
)

In [14]:
# One row per model, one column per MAP statistic
pd.DataFrame(map_res_dict).T

Unnamed: 0,MAP,MAP_std,MAP_max
mBERT,0.469,0.003,0.475
mBERT-Galén,0.495,0.007,0.508
XLM-R,0.469,0.006,0.477
XLM-R-Galén,0.493,0.003,0.498


### Ensemble

In [15]:
# MAP results of the ensembles, keyed by ensemble name
map_ens_res_dict = dict()

#### Single-model

In [16]:
# Single-model ensemble: the per-seed mBERT predictions are summed element-wise
# before being formatted and scored (MAP rounded to 3 decimals)
map_ens_res_dict["mBERT"] = pd.Series({
    "MAP": round(
        compute_map(
            valid_codes=valid_codes,
            pred=prob_codiesp_prediction_format(
                y_pred=sum(mbert_doc_preds),
                label_encoder_classes=codes,
                doc_list=test_doc_list,
            ),
            gs_out_path=test_gs_out_path,
        ),
        3,
    )
})

In [17]:
# Single-model ensemble: the per-seed mBERT-Galén predictions are summed element-wise
# before being formatted and scored (MAP rounded to 3 decimals)
map_ens_res_dict["mBERT-Galén"] = pd.Series({
    "MAP": round(
        compute_map(
            valid_codes=valid_codes,
            pred=prob_codiesp_prediction_format(
                y_pred=sum(mbert_galen_doc_preds),
                label_encoder_classes=codes,
                doc_list=test_doc_list,
            ),
            gs_out_path=test_gs_out_path,
        ),
        3,
    )
})

In [18]:
# Single-model ensemble: the per-seed XLM-R predictions are summed element-wise
# before being formatted and scored (MAP rounded to 3 decimals)
map_ens_res_dict["XLM-R"] = pd.Series({
    "MAP": round(
        compute_map(
            valid_codes=valid_codes,
            pred=prob_codiesp_prediction_format(
                y_pred=sum(xlmr_doc_preds),
                label_encoder_classes=codes,
                doc_list=test_doc_list,
            ),
            gs_out_path=test_gs_out_path,
        ),
        3,
    )
})

In [19]:
# Single-model ensemble: the per-seed XLM-R-Galén predictions are summed element-wise
# before being formatted and scored (MAP rounded to 3 decimals)
map_ens_res_dict["XLM-R-Galén"] = pd.Series({
    "MAP": round(
        compute_map(
            valid_codes=valid_codes,
            pred=prob_codiesp_prediction_format(
                y_pred=sum(xlmr_galen_doc_preds),
                label_encoder_classes=codes,
                doc_list=test_doc_list,
            ),
            gs_out_path=test_gs_out_path,
        ),
        3,
    )
})

#### Multiple-model

In [20]:
# Multi-model ensemble: element-wise sum of every mBERT and mBERT-Galén run,
# formatted and scored (MAP rounded to 3 decimals)
map_ens_res_dict["mBERT + mBERT-Galén"] = pd.Series({
    "MAP": round(
        compute_map(
            valid_codes=valid_codes,
            pred=prob_codiesp_prediction_format(
                y_pred=sum(mbert_doc_preds) + sum(mbert_galen_doc_preds),
                label_encoder_classes=codes,
                doc_list=test_doc_list,
            ),
            gs_out_path=test_gs_out_path,
        ),
        3,
    )
})

In [21]:
# Multi-model ensemble: element-wise sum of every XLM-R and XLM-R-Galén run,
# formatted and scored (MAP rounded to 3 decimals)
map_ens_res_dict["XLM-R + XLM-R-Galén"] = pd.Series({
    "MAP": round(
        compute_map(
            valid_codes=valid_codes,
            pred=prob_codiesp_prediction_format(
                y_pred=sum(xlmr_doc_preds) + sum(xlmr_galen_doc_preds),
                label_encoder_classes=codes,
                doc_list=test_doc_list,
            ),
            gs_out_path=test_gs_out_path,
        ),
        3,
    )
})

In [22]:
# One row per ensemble, with its MAP score
pd.DataFrame(map_ens_res_dict).T

Unnamed: 0,MAP
mBERT,0.508
mBERT-Galén,0.521
XLM-R,0.501
XLM-R-Galén,0.526
mBERT + mBERT-Galén,0.528
XLM-R + XLM-R-Galén,0.524


## Additional metrics

We also evaluate the models according to micro-averaged precision, recall and F1-score. For this purpose, we use binary classification thresholds previously estimated on the development set of the CodiEsp-P corpus.

In [23]:
# Precision/recall/F1 results of the individual models, keyed by model name
add_res_dict = dict()

In [24]:
# Intermediate prediction file: passed to compute_metrics_avg as pred_file_path, and
# overwritten by each ensemble cell (to_csv) before compute_p_r_f1 reads it back.
test_pred_file_path = "./test_intermediate_pred_file.txt"

In [25]:
# Binary classification threshold used for the mBERT predictions; according to the
# section text, thresholds were estimated on the CodiEsp-P development set.
mbert_thr = 0.52

In [26]:
%%time
add_res_dict["mBERT"] = compute_metrics_avg(thr=mbert_thr, doc_preds_arr=mbert_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 2.61 s, sys: 40.9 ms, total: 2.65 s
Wall time: 2.65 s


In [27]:
# Binary classification threshold used for the mBERT-Galén predictions; according to the
# section text, thresholds were estimated on the CodiEsp-P development set.
mbert_galen_thr = 0.12

In [28]:
%%time
add_res_dict["mBERT-Galén"] = compute_metrics_avg(thr=mbert_galen_thr, doc_preds_arr=mbert_galen_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 2.64 s, sys: 40.9 ms, total: 2.69 s
Wall time: 2.68 s


In [None]:
# NOTE(review): this cell has no execution count and beto_thr is not referenced anywhere
# else in the visible notebook — presumably a leftover from a BETO experiment; confirm
# before removing.
beto_thr = 0.53

In [None]:
# NOTE(review): this cell has no execution count and beto_galen_thr is not referenced
# anywhere else in the visible notebook — presumably a leftover from a BETO-Galén
# experiment; confirm before removing.
beto_galen_thr = 0.31

In [29]:
# Binary classification threshold used for the XLM-R predictions; according to the
# section text, thresholds were estimated on the CodiEsp-P development set.
xlmr_thr = 0.43

In [30]:
%%time
add_res_dict["XLM-R"] = compute_metrics_avg(thr=xlmr_thr, doc_preds_arr=xlmr_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 2.68 s, sys: 39.7 ms, total: 2.72 s
Wall time: 2.72 s


In [31]:
# Binary classification threshold used for the XLM-R-Galén predictions; according to the
# section text, thresholds were estimated on the CodiEsp-P development set.
xlmr_galen_thr = 0.33

In [32]:
%%time
add_res_dict["XLM-R-Galén"] = compute_metrics_avg(thr=xlmr_galen_thr, doc_preds_arr=xlmr_galen_doc_preds, codes_labels=codes, 
                                            docs=list(test_doc_list), pred_file_path=test_pred_file_path, 
                                            gs_path=test_gs_path, valid_codes_path=codes_path)

CPU times: user 2.63 s, sys: 43.9 ms, total: 2.68 s
Wall time: 2.68 s


In [33]:
# One row per model, one column per precision/recall/F1 statistic
pd.DataFrame(add_res_dict).T

Unnamed: 0,P,P_std,P_max,R,R_std,R_max,F1,F1_std,F1_max
mBERT,0.655,0.013,0.669,0.438,0.013,0.452,0.525,0.012,0.537
mBERT-Galén,0.582,0.014,0.603,0.501,0.009,0.51,0.538,0.009,0.552
XLM-R,0.608,0.024,0.638,0.461,0.018,0.487,0.524,0.003,0.527
XLM-R-Galén,0.6,0.012,0.612,0.477,0.01,0.49,0.532,0.008,0.543


### Ensemble

In [34]:
# Precision/recall/F1 results of the ensembles, keyed by ensemble name
add_ens_res_dict = dict()

#### Single-model

In [42]:
# Single-model ensemble: mean of the per-seed mBERT predictions, binarised at mbert_thr
df_test_pred = thr_codiesp_prediction_format(
    y_pred=sum(mbert_doc_preds) / len(rnd_seeds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
    thr=mbert_thr,
)
# Write the (doc_id, code) pairs as a header-less TSV, which the scorer then reads back
df_test_pred.loc[:, ["doc_id", "code"]].to_csv(
    path_or_buf=test_pred_file_path, sep="\t", header=False, index=False
)
p, r, f1 = compute_p_r_f1(
    gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path
)
add_ens_res_dict["mBERT"] = pd.Series({"P": p, "R": r, "F1": f1})

In [43]:
# Single-model ensemble: mean of the per-seed mBERT-Galén predictions,
# binarised at mbert_galen_thr
df_test_pred = thr_codiesp_prediction_format(
    y_pred=sum(mbert_galen_doc_preds) / len(rnd_seeds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
    thr=mbert_galen_thr,
)
# Write the (doc_id, code) pairs as a header-less TSV, which the scorer then reads back
df_test_pred.loc[:, ["doc_id", "code"]].to_csv(
    path_or_buf=test_pred_file_path, sep="\t", header=False, index=False
)
p, r, f1 = compute_p_r_f1(
    gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path
)
add_ens_res_dict["mBERT-Galén"] = pd.Series({"P": p, "R": r, "F1": f1})

In [37]:
# Single-model ensemble: mean of the per-seed XLM-R predictions, binarised at xlmr_thr
df_test_pred = thr_codiesp_prediction_format(
    y_pred=sum(xlmr_doc_preds) / len(rnd_seeds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
    thr=xlmr_thr,
)
# Write the (doc_id, code) pairs as a header-less TSV, which the scorer then reads back
df_test_pred.loc[:, ["doc_id", "code"]].to_csv(
    path_or_buf=test_pred_file_path, sep="\t", header=False, index=False
)
p, r, f1 = compute_p_r_f1(
    gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path
)
add_ens_res_dict["XLM-R"] = pd.Series({"P": p, "R": r, "F1": f1})

In [38]:
# Single-model ensemble: mean of the per-seed XLM-R-Galén predictions,
# binarised at xlmr_galen_thr
df_test_pred = thr_codiesp_prediction_format(
    y_pred=sum(xlmr_galen_doc_preds) / len(rnd_seeds),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
    thr=xlmr_galen_thr,
)
# Write the (doc_id, code) pairs as a header-less TSV, which the scorer then reads back
df_test_pred.loc[:, ["doc_id", "code"]].to_csv(
    path_or_buf=test_pred_file_path, sep="\t", header=False, index=False
)
p, r, f1 = compute_p_r_f1(
    gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path
)
add_ens_res_dict["XLM-R-Galén"] = pd.Series({"P": p, "R": r, "F1": f1})

#### Multiple-model

In [39]:
# Multi-model ensemble: mean over all mBERT and mBERT-Galén runs, binarised with
# the mean of the two models' thresholds
df_test_pred = thr_codiesp_prediction_format(
    y_pred=(sum(mbert_doc_preds) + sum(mbert_galen_doc_preds)) / (2 * len(rnd_seeds)),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
    thr=(mbert_thr + mbert_galen_thr) / 2,
)
# Write the (doc_id, code) pairs as a header-less TSV, which the scorer then reads back
df_test_pred.loc[:, ["doc_id", "code"]].to_csv(
    path_or_buf=test_pred_file_path, sep="\t", header=False, index=False
)
p, r, f1 = compute_p_r_f1(
    gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path
)
add_ens_res_dict["mBERT + mBERT-Galén"] = pd.Series({"P": p, "R": r, "F1": f1})

In [40]:
# Multi-model ensemble: mean over all XLM-R and XLM-R-Galén runs, binarised with
# the mean of the two models' thresholds
df_test_pred = thr_codiesp_prediction_format(
    y_pred=(sum(xlmr_doc_preds) + sum(xlmr_galen_doc_preds)) / (2 * len(rnd_seeds)),
    label_encoder_classes=codes,
    doc_list=test_doc_list,
    thr=(xlmr_thr + xlmr_galen_thr) / 2,
)
# Write the (doc_id, code) pairs as a header-less TSV, which the scorer then reads back
df_test_pred.loc[:, ["doc_id", "code"]].to_csv(
    path_or_buf=test_pred_file_path, sep="\t", header=False, index=False
)
p, r, f1 = compute_p_r_f1(
    gs_path=test_gs_path, pred_path=test_pred_file_path, codes_path=codes_path
)
add_ens_res_dict["XLM-R + XLM-R-Galén"] = pd.Series({"P": p, "R": r, "F1": f1})

In [44]:
# One row per ensemble, with its precision, recall and F1
pd.DataFrame(add_ens_res_dict).T

Unnamed: 0,P,R,F1
mBERT,0.736,0.433,0.545
mBERT-Galén,0.568,0.573,0.571
XLM-R,0.699,0.468,0.56
XLM-R-Galén,0.646,0.503,0.566
mBERT + mBERT-Galén,0.692,0.482,0.568
XLM-R + XLM-R-Galén,0.681,0.484,0.566
