# Model evaluation

In this notebook, the transformer-based models analyzed in this work are evaluated on the test corpus of the Cantemist-NER task, according to their performance in detecting tumor morphology mentions.

In [1]:
import tensorflow as tf

# Auxiliary components
import sys
sys.path.append("../")
from nlp_utils import *

# Hyper-parameters
# Name of the DataFrame column that holds the raw text of each document
text_col = "raw_text"

## Load text

In [2]:
# Root folder of the Cantemist corpus, relative to this notebook
corpus_path = "../datasets/cantemist_v6/"
# Sub-folder (inside each corpus split) with the files of the NER sub-task
sub_task_path = "cantemist-ner/"

### Test corpus

In [3]:
%%time
test_path = corpus_path + "test-set/" + sub_task_path
test_files = [f for f in os.listdir(test_path) if os.path.isfile(test_path + f) and f.split('.')[-1] == 'txt']
test_data = load_text_files(test_files, test_path)
df_text_test = pd.DataFrame({'doc_id': [s.split('.txt')[0] for s in test_files], 'raw_text': test_data})

CPU times: user 8.14 ms, sys: 40 µs, total: 8.18 ms
Wall time: 7.97 ms


In [4]:
# One row per loaded test document; two columns (doc_id and raw_text)
df_text_test.shape

(300, 2)

In [5]:
# Preview of the first loaded documents
df_text_test.head()

Unnamed: 0,doc_id,raw_text
0,cc_onco877,"Anamnesis\nMujer de 59 años, alérgica a penici..."
1,cc_onco1075,"Anamnesis\nMujer de 52 años, sin alergias cono..."
2,cc_onco1450,"Anamnesis\nMujer de 51 años de edad, sin antec..."
3,cc_onco1165,Anamnesis\nPaciente varón de 75 años sin hábit...
4,cc_onco1298,"Anamnesis\nMujer de 60 años, exfumadora de 20 ..."


In [6]:
# Number of distinct document ids (should equal the number of rows if ids are unique)
df_text_test['doc_id'].nunique(dropna=False)

300

In [7]:
# Full text of the document stored under index label 0
df_text_test.loc[0, "raw_text"]

'Anamnesis\nMujer de 59 años, alérgica a penicilina y procaína. Fumadora activa (IPA: 43).\nAntecedentes familiares: abuelo materno diagnosticado de carcinoma colon a los 70 años; madre diagnosticada de carcinoma de mama bilateral a los 50 años; padre fallecido de carcinoma gástrico a los 47 años; tres tías maternas diagnosticadas de carcinoma de mama a los 55, 56 y 57 años respectivamente; y tres primas afectas de cáncer de mama.\nAntecedentes personales: bronquitis crónica, poliposis colónica, carcinoma ductal infiltrante clásico mama pT2pN0M0 G2 subtipo tumoral luminal a (RH: +, HER-2: negativo) intervenido en agosto de 2013 mediante tumorectomía mama izquierda (patrón round block) + biopsia selectiva ganglio centinela (negativo) y posterior QT adyuvante con esquema TC (paclitaxel-ciclofosfamida) x 4 ciclos.\nAcude en noviembre de 2013 a visita de seguimiento tras finalizar tratamiento adyuvante. Asintomática.\n\nExploración física\nTemperatura axilar 36,5ºC, tensión arterial 130/83

In [8]:
from sklearn.preprocessing import LabelEncoder

lab_encoder = LabelEncoder()
# IOB-2 format: B = beginning of a mention, I = inside a mention, O = outside any mention
# NOTE(review): scikit-learn keeps `classes_` sorted, i.e. ["B", "I", "O"], so predicted class
# indices 0/1/2 are decoded as B/I/O — confirm this matches the label order used at training time
lab_encoder.fit(["B", "I", "O"])

LabelEncoder()

### Test corpus

All test texts are considered:

In [9]:
# Sorted list of the unique test document ids. The pickled predictions are indexed by position in this list.
# NOTE(review): this order presumably has to match the one used when the predictions were generated — confirm
test_doc_list = sorted(set(df_text_test["doc_id"]))

In [10]:
# Number of documents that will be evaluated
len(test_doc_list)

300

In [11]:
# Folder with the gold-standard annotations of the test set (same folder the test texts are read from)
test_gs_path = f"{corpus_path}test-set/{sub_task_path}"

In [12]:
# Folder with the pickled test predictions of every model run
# (files named test_doc_word_preds_<run>.pck and test_doc_word_start_end_<run>.pck)
exec_path = "predictions/"

In [13]:
import pickle

def ner_avg_metrics(model_names, model_preds_dict, subtask='ner'):
    """
    Evaluate every run of every model on the test set and summarize the scores per model.

    For each run name, the pickled word-level predictions and word start-end pairs are read from
    `exec_path`, decoded into IOB-2 labels, converted into annotations with
    `extract_seq_preds_iob2` and scored with `calculate_ner_metrics` against the gold standard
    parsed from `test_gs_path`.

    Relies on the notebook-level names `exec_path`, `test_doc_list`, `lab_encoder`,
    `df_text_test`, `text_col` and `test_gs_path`.

    Parameters
    ----------
    model_names : iterable of str
        Model identifiers; each one becomes a row of the returned table.
    model_preds_dict : dict
        Maps each model identifier to the list of run names whose prediction files are evaluated.
    subtask : str, default 'ner'
        Forwarded to `format_ner_gs`.

    Returns
    -------
    pandas.DataFrame
        One row per model with the mean, standard deviation and maximum (rounded to 3 decimals)
        of precision, recall and F1 over the runs of that model. Columns: `P_avg`, `P_std`,
        `P_max`, `R_avg`, `R_std`, `R_max`, `F1_avg`, `F1_std`, `F1_max`.
    """
    # The gold standard does not depend on the evaluated run, so it is parsed once (on first use)
    # and reused afterwards.
    # NOTE(review): assumes `calculate_ner_metrics` does not modify its `gs` argument in place — confirm
    gs = None

    res_dict = {}
    for model_name in model_names:
        scores = {"P": [], "R": [], "F1": []}
        for mod_name in model_preds_dict[model_name]:
            # NOTE: pickle.load can execute arbitrary code; only load prediction files from a trusted source
            # Word predictions
            with open(exec_path + "test_doc_word_preds_" + mod_name + ".pck", "rb") as f:
                arr_doc_seq_preds = pickle.load(f)

            # Word start-end pairs
            with open(exec_path + "test_doc_word_start_end_" + mod_name + ".pck", "rb") as f:
                arr_doc_seq_start_end = pickle.load(f)

            ann_res = []
            # Predictions are aligned with `test_doc_list` by position
            for d, doc in enumerate(tqdm(test_doc_list)):
                arr_seq_preds = arr_doc_seq_preds[d]
                arr_seq_start_end = arr_doc_seq_start_end[d]
                # argmax over axis 1 picks one class index per row, decoded through the label encoder
                arr_seq_iob_preds = [lab_encoder.classes_[pred] for pred in np.argmax(arr_seq_preds, axis=1)]
                ann_res.extend(extract_seq_preds_iob2(doc_id=doc, iob_seq_preds=arr_seq_iob_preds,
                                                      seq_start_end=arr_seq_start_end, df_text=df_text_test,
                                                      text_col=text_col))

            ann_res = pd.DataFrame(ann_res)
            if gs is None:
                gs = format_ner_gs(test_gs_path, subtask=subtask)
            p, r, f1 = calculate_ner_metrics(gs=gs,
                                             pred=format_ner_pred_df(gs_path=test_gs_path, df_preds=ann_res))
            scores["P"].append(p)
            scores["R"].append(r)
            scores["F1"].append(f1)

        model_res = {}
        for metric, values in scores.items():
            # An explicit float dtype keeps `describe()` numeric even when a model has no runs
            desc = pd.Series(values, dtype=float).describe()
            model_res[metric + "_avg"] = round(desc['mean'], 3)
            model_res[metric + "_std"] = round(desc['std'], 3)
            model_res[metric + "_max"] = round(desc['max'], 3)
        res_dict[model_name] = model_res

    return pd.DataFrame(res_dict).transpose()

## General-domain

In [14]:
# Names of the five mBERT runs whose prediction files are evaluated
mbert_scr_names = [f"mbert_{run}" for run in range(5)]

In [15]:
# Names of the five BETO runs whose prediction files are evaluated
beto_scr_names = [f"beto_{run}" for run in range(5)]

In [16]:
# Names of the five XLM-R runs whose prediction files are evaluated
xlmr_scr_names = [f"xlmr_{run}" for run in range(5)]

In [17]:
scr_model_names = ['mBERT', 'BETO', 'XLM-R']
# Map each model label to the names of its individual runs
scr_model_dict = dict(zip(scr_model_names, (mbert_scr_names, beto_scr_names, xlmr_scr_names)))
scr_model_perf = ner_avg_metrics(
    model_names=scr_model_names,
    model_preds_dict=scr_model_dict,
    subtask='ner',
)

100%|██████████| 300/300 [00:01<00:00, 260.88it/s]
100%|██████████| 300/300 [00:01<00:00, 253.67it/s]
100%|██████████| 300/300 [00:01<00:00, 260.57it/s]
100%|██████████| 300/300 [00:01<00:00, 261.40it/s]
100%|██████████| 300/300 [00:01<00:00, 255.56it/s]
100%|██████████| 300/300 [00:01<00:00, 251.70it/s]
100%|██████████| 300/300 [00:01<00:00, 255.75it/s]
100%|██████████| 300/300 [00:01<00:00, 258.63it/s]
100%|██████████| 300/300 [00:01<00:00, 260.66it/s]
100%|██████████| 300/300 [00:01<00:00, 257.34it/s]
100%|██████████| 300/300 [00:01<00:00, 252.25it/s]
100%|██████████| 300/300 [00:01<00:00, 259.25it/s]
100%|██████████| 300/300 [00:01<00:00, 249.39it/s]
100%|██████████| 300/300 [00:01<00:00, 249.28it/s]
100%|██████████| 300/300 [00:01<00:00, 261.80it/s]


In [18]:
# Per-model mean, standard deviation and maximum of precision, recall and F1 over the evaluated runs
scr_model_perf

Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
mBERT,0.85,0.009,0.861,0.854,0.007,0.862,0.852,0.004,0.858
BETO,0.85,0.006,0.859,0.858,0.008,0.869,0.854,0.004,0.856
XLM-R,0.846,0.014,0.861,0.858,0.006,0.863,0.852,0.005,0.858


## Galén

In [19]:
# Names of the five mBERT-Galén runs whose prediction files are evaluated
mbert_gal_names = [f"mbert_galen_{run}" for run in range(5)]

In [20]:
# Names of the five BETO-Galén runs whose prediction files are evaluated
beto_gal_names = [f"beto_galen_{run}" for run in range(5)]

In [21]:
# Names of the five XLM-R-Galén runs whose prediction files are evaluated
xlmr_gal_names = [f"xlmr_galen_{run}" for run in range(5)]

In [22]:
gal_model_names = ['mBERT-Galén', 'BETO-Galén', 'XLM-R-Galén']
# Map each model label to the names of its individual runs
gal_model_dict = dict(zip(gal_model_names, (mbert_gal_names, beto_gal_names, xlmr_gal_names)))
gal_model_perf = ner_avg_metrics(
    model_names=gal_model_names,
    model_preds_dict=gal_model_dict,
    subtask='ner',
)

100%|██████████| 300/300 [00:01<00:00, 261.56it/s]
100%|██████████| 300/300 [00:01<00:00, 257.43it/s]
100%|██████████| 300/300 [00:01<00:00, 254.28it/s]
100%|██████████| 300/300 [00:01<00:00, 253.72it/s]
100%|██████████| 300/300 [00:01<00:00, 254.50it/s]
100%|██████████| 300/300 [00:01<00:00, 253.82it/s]
100%|██████████| 300/300 [00:01<00:00, 257.80it/s]
100%|██████████| 300/300 [00:01<00:00, 260.10it/s]
100%|██████████| 300/300 [00:01<00:00, 255.03it/s]
100%|██████████| 300/300 [00:01<00:00, 252.95it/s]
100%|██████████| 300/300 [00:01<00:00, 251.08it/s]
100%|██████████| 300/300 [00:01<00:00, 254.50it/s]
100%|██████████| 300/300 [00:01<00:00, 258.60it/s]
100%|██████████| 300/300 [00:01<00:00, 249.32it/s]
100%|██████████| 300/300 [00:01<00:00, 255.51it/s]


In [23]:
# Per-model mean, standard deviation and maximum of precision, recall and F1 over the evaluated runs
gal_model_perf

Unnamed: 0,P_avg,P_std,P_max,R_avg,R_std,R_max,F1_avg,F1_std,F1_max
mBERT-Galén,0.867,0.008,0.876,0.869,0.007,0.877,0.868,0.004,0.876
BETO-Galén,0.872,0.008,0.88,0.865,0.004,0.869,0.868,0.002,0.87
XLM-R-Galén,0.867,0.009,0.881,0.869,0.006,0.878,0.868,0.003,0.874


## Ensemble

In [24]:
# Strategy used to combine the predictions of the ensembled models; it is forwarded as `strategy`
# to `ens_ner_preds_brat_format`. NOTE(review): 'prod' presumably multiplies the model outputs — confirm
eval_strat = 'prod'

In [25]:
def compute_metrics_ens(arr_models, doc_list, lb_encoder, df_text, text_col, gs_path, ens_eval_strategy='max', subtask='ner'):
    """
    Combine the predictions of several model runs and score the resulting ensemble.

    The pickled word predictions and word start-end pairs of every run in `arr_models` are read
    from the notebook-level `exec_path` folder and handed to `ens_ner_preds_brat_format`,
    together with `ens_eval_strategy` as the combination strategy.

    Returns a 2-tuple: the ensemble annotations produced by `ens_ner_preds_brat_format`, and the
    value returned by `calculate_ner_metrics` for those annotations against the gold standard
    found in `gs_path`.
    """
    def read_pickle(prefix, run_name):
        # Load one pickled prediction artifact of a run from `exec_path`
        with open(exec_path + prefix + run_name + ".pck", "rb") as handle:
            return pickle.load(handle)

    word_preds, word_spans = [], []
    for run_name in arr_models:
        # Word predictions
        word_preds.append(read_pickle("test_doc_word_preds_", run_name))
        # Word start-end pairs
        word_spans.append(read_pickle("test_doc_word_start_end_", run_name))

    df_ens_ann = ens_ner_preds_brat_format(
        doc_list=doc_list,
        ens_doc_word_preds=word_preds,
        ens_doc_word_start_end=word_spans,
        lb_encoder=lb_encoder,
        df_text=df_text,
        text_col=text_col,
        strategy=ens_eval_strategy,
    )

    gold = format_ner_gs(gs_path, subtask=subtask)
    pred = format_ner_pred_df(gs_path=gs_path, df_preds=df_ens_ann)
    return df_ens_ann, calculate_ner_metrics(gs=gold, pred=pred)

### mBERT

In [26]:
# Ensemble of the mBERT runs
ens_models = mbert_scr_names
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 214.50it/s]


(0.873, 0.872, 0.872)


### mBERT-Galén

In [27]:
# Ensemble of the mBERT-Galén runs
ens_models = mbert_gal_names
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 211.13it/s]


(0.885, 0.881, 0.883)


### BETO

In [28]:
# Ensemble of the BETO runs
ens_models = beto_scr_names
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 213.40it/s]


(0.876, 0.873, 0.875)


### BETO-Galén

In [29]:
# Ensemble of the BETO-Galén runs
ens_models = beto_gal_names
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 215.42it/s]


(0.883, 0.873, 0.878)


### XLM-R

In [30]:
# Ensemble of the XLM-R runs
ens_models = xlmr_scr_names
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 209.93it/s]


(0.868, 0.874, 0.871)


### XLM-R-Galén

In [31]:
# Ensemble of the XLM-R-Galén runs
ens_models = xlmr_gal_names
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 213.75it/s]


(0.887, 0.879, 0.883)


### mBERT + mBERT-Galén

In [32]:
# Ensemble of the mBERT and mBERT-Galén runs
ens_models = [*mbert_scr_names, *mbert_gal_names]
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 173.62it/s]


(0.881, 0.876, 0.879)


### BETO + BETO-Galén

In [33]:
# Ensemble of the BETO and BETO-Galén runs
ens_models = [*beto_scr_names, *beto_gal_names]
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 175.03it/s]


(0.887, 0.878, 0.882)


### XLM-R + XLM-R-Galén

In [34]:
# Ensemble of the XLM-R and XLM-R-Galén runs
ens_models = [*xlmr_scr_names, *xlmr_gal_names]
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 174.13it/s]


(0.883, 0.88, 0.882)


### mBERT + BETO + XLM-R

In [35]:
# Ensemble of all general-domain runs (mBERT, BETO and XLM-R)
ens_models = [*mbert_scr_names, *beto_scr_names, *xlmr_scr_names]
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:02<00:00, 149.29it/s]


(0.882, 0.876, 0.879)


### mBERT-Galén + BETO-Galén + XLM-R-Galén

In [36]:
# Ensemble of all Galén runs (mBERT-Galén, BETO-Galén and XLM-R-Galén)
ens_models = [*mbert_gal_names, *beto_gal_names, *xlmr_gal_names]
df_ens_ann, res_metrics = compute_metrics_ens(
    arr_models=ens_models,
    doc_list=test_doc_list,
    lb_encoder=lab_encoder,
    df_text=df_text_test,
    text_col=text_col,
    gs_path=test_gs_path,
    ens_eval_strategy=eval_strat,
    subtask='ner',
)
print(res_metrics)

100%|██████████| 300/300 [00:01<00:00, 151.99it/s]


(0.893, 0.887, 0.89)


In [None]:
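# Sanity check: write the annotations of the last evaluated ensemble to disk and re-score them with the Cantemist evaluation library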

In [37]:
# Output folder for the ensemble annotations written by `write_ner_ann`
out_test_path = "ens_test_preds/"

In [38]:
# `df_ens_ann` holds the annotations of the most recently executed ensemble cell, i.e. the
# mBERT-Galén + BETO-Galén + XLM-R-Galén ensemble when the notebook is run top to bottom
write_ner_ann(df_pred_ann=df_ens_ann, out_path=out_test_path)

In [39]:
%%time
!python ../resources/cantemist-evaluation-library/src/main.py -g ../datasets/cantemist_v6/test-set/cantemist-ner/ -p ./ens_test_preds/ -s ner 


-----------------------------------------------------
Clinical case name			Precision
-----------------------------------------------------
cc_onco1006.ann		1.0
-----------------------------------------------------
cc_onco1023.ann		1.0
-----------------------------------------------------
cc_onco1027.ann		0.667
-----------------------------------------------------
cc_onco1038.ann		0.769
-----------------------------------------------------
cc_onco104.ann		0.917
-----------------------------------------------------
cc_onco1040.ann		0.938
-----------------------------------------------------
cc_onco1044.ann		0.889
-----------------------------------------------------
cc_onco1046.ann		1.0
-----------------------------------------------------
cc_onco1051.ann		1.0
-----------------------------------------------------
cc_onco1056.ann		1.0
-----------------------------------------------------
cc_onco1057.ann		0.812
-----------------------------------------------------

CPU times: user 15 ms, sys: 16.2 ms, total: 31.3 ms
Wall time: 1.14 s
