In [1]:
# Locations (relative to this notebook) of the helper modules and of the corpus.
utils_path = "../utils/"
corpus_path = "../datasets/final_dataset_v4_to_publish/"
# Gold-standard annotation file of the test split.
test_gs_path = f"{corpus_path}test/testX.tsv"

In [2]:
import tensorflow as tf

# Auxiliary components
import sys
sys.path.insert(0, utils_path)
from nlp_utils import *

# Directory containing the prediction files evaluated further below.
RES_DIR = "../results/CodiEsp/final_exec/"

# Annotation type under evaluation; its lower-cased initial is used in file names.
TYPE_ANN = "PROCEDIMIENTO"
TYPE_TASK = TYPE_ANN[0].lower()

# Number of decimals used when reporting metrics.
round_n = 3

# GS data: gold-standard test annotations restricted to the selected annotation type.
# NOTE(review): `pd` (and `os`, used later) are not imported explicitly in this
# notebook; presumably they come from the star import of nlp_utils — confirm.
df_test_gs = format_codiesp_x_gs(test_gs_path)
is_selected_type = df_test_gs['label_gs'] == TYPE_ANN
df_test_gs = df_test_gs[is_selected_type]

# Lower-cased set of the codes listed in the first column of the task's code file.
codes_d_path = f"{corpus_path}codiesp_codes/codiesp-{TYPE_TASK.upper()}_codes.tsv"
df_valid_codes = pd.read_csv(codes_d_path, sep='\t', header=None, usecols=[0])
valid_codes = {code.lower() for code in df_valid_codes[0].tolist()}

2023-01-11 10:02:43.366503: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# Root directory of the sentence-split corpus files (sub-folders such as "train/" are appended later).
ss_corpus_path = "../datasets/CodiEsp-SSplit-text/"

# Load data

In [4]:
# Gather the regular files with a "txt" extension from the training split and load them.
train_path = corpus_path + "train/text_files/"
train_files = []
for fname in os.listdir(train_path):
    if os.path.isfile(train_path + fname) and fname.split('.')[-1] == "txt":
        train_files.append(fname)
train_data = load_text_files(train_files, train_path)
# One row per document: identifier (file name up to ".txt") and whatever load_text_files returned for it.
df_text_train = pd.DataFrame({
    'doc_id': [fname.split('.txt')[0] for fname in train_files],
    'raw_text': train_data,
})

In [5]:
# Gather the regular files with a "txt" extension from the test split and load them.
test_path = corpus_path + "test/text_files/"
test_files = []
for fname in os.listdir(test_path):
    if os.path.isfile(test_path + fname) and fname.split('.')[-1] == 'txt':
        test_files.append(fname)
test_data = load_text_files(test_files, test_path)
# One row per document: identifier (file name up to ".txt") and whatever load_text_files returned for it.
df_text_test = pd.DataFrame({
    'doc_id': [fname.split('.txt')[0] for fname in test_files],
    'raw_text': test_data,
})

In [6]:
# Training annotations: header-less TSV whose columns are named here.
ann_columns = ["doc_id", "type", "code", "word", "location"]
df_codes_train_ner = pd.read_table(corpus_path + "train/trainX.tsv", sep='\t', header=None)
df_codes_train_ner.columns = ann_columns
# Keep only the first row for every (doc_id, type, location) triple.
is_repeated_ann = df_codes_train_ner[['doc_id', 'type', 'location']].duplicated(keep='first')
df_codes_train_ner = df_codes_train_ner[~is_repeated_ann]
# An annotation is flagged as discontinuous when its location string contains ';'.
df_codes_train_ner['disc'] = df_codes_train_ner['location'].map(lambda loc: ';' in loc)

Select one type of annotations:

In [7]:
# Restrict the training annotations to the annotation type selected in the configuration.
is_selected_ann_type = df_codes_train_ner['type'] == TYPE_ANN
df_codes_train_ner = df_codes_train_ner[is_selected_ann_type]

Split discontinuous annotations:

In [8]:
# Hand only the five original annotation columns (i.e. without 'disc') to the nlp_utils helper.
# NOTE(review): the helper is not shown here; the later cells rely on its output
# having 'start', 'end' and 'disc' columns — confirm.
ann_input_cols = ["doc_id", "type", "code", "word", "location"]
df_codes_train_ner_final = process_labels_norm_prueba(
    df_ann=df_codes_train_ner[ann_input_cols]
)

Remove annotations of zero length:

In [9]:
# Span length of every annotation, computed column-wise. The previous row-wise
# DataFrame.apply(axis=1) was slow and, on an empty frame, would return a DataFrame
# instead of a Series.
df_codes_train_ner_final['length'] = (
    df_codes_train_ner_final['end'] - df_codes_train_ner_final['start']
)
# Drop annotations whose span is empty (or inverted).
df_codes_train_ner_final = df_codes_train_ner_final[df_codes_train_ner_final['length'] > 0]

Separate continuous and discontinuous annotations:

In [10]:
# Continuous annotations: rows whose 'disc' flag equals 0; the flag is stored as an explicit boolean.
is_continuous = df_codes_train_ner_final['disc'] == 0
df_codes_train_ner_final_cont = df_codes_train_ner_final[is_continuous].assign(
    disc=lambda frame: frame['disc'].astype(bool)
)

In [11]:
# Discontinuous
# Discontinuous annotations: pass both the original rows flagged as discontinuous and
# their processed counterparts (disc == 1) to the nlp_utils helper.
disc_ann_original = df_codes_train_ner[df_codes_train_ner['disc']]
disc_ann_processed = df_codes_train_ner_final[df_codes_train_ner_final['disc'] == 1]
df_codes_train_ner_final_disc = restore_disc_ann(
    df_ann=disc_ann_original,
    df_ann_final=disc_ann_processed,
)

In [12]:
# Overall span of a discontinuous annotation: the first and the last space-separated
# token of its location string, converted to int.
disc_loc_tokens = df_codes_train_ner_final_disc['location'].apply(lambda loc: loc.split(' '))
df_codes_train_ner_final_disc['start'] = disc_loc_tokens.apply(lambda tokens: int(tokens[0]))
df_codes_train_ner_final_disc['end'] = disc_loc_tokens.apply(lambda tokens: int(tokens[-1]))

Concatenate continuous and discontinuous annotations:

In [13]:
# Concat
# Stack continuous and discontinuous annotations, keeping the same columns in the same order.
cols_concat = ['doc_id', 'type', 'code', 'word', 'location', 'start', 'end', 'disc']
ann_parts = [
    df_codes_train_ner_final_cont[cols_concat],
    df_codes_train_ner_final_disc[cols_concat],
]
df_codes_train_ner_final = pd.concat(ann_parts)

Now, we remove the discontinuous annotations that run right-to-left (text-wise):

In [14]:
# Row-wise flag computed by check_ann_left_right_direction (from nlp_utils, not shown here);
# the next cell keeps only the rows where this flag is truthy.
df_codes_train_ner_final['direction'] = df_codes_train_ner_final.apply(check_ann_left_right_direction, axis=1)

In [15]:
# Keep only the annotations whose 'direction' flag is set.
keeps_direction = df_codes_train_ner_final['direction']
df_codes_train_ner_final = df_codes_train_ner_final[keeps_direction]

We only select the annotations fully contained in a single sentence:

In [16]:
# Sentence-Split data
# Load every regular file found in the "train/" folder of the sentence-split corpus.
ss_sub_corpus_path = ss_corpus_path + "train/"
ss_files = []
for fname in os.listdir(ss_sub_corpus_path):
    if os.path.isfile(ss_sub_corpus_path + fname):
        ss_files.append(fname)
ss_dict_train = load_ss_files(ss_files, ss_sub_corpus_path)

In [17]:
# The nlp_utils helper returns three frames; judging by the names used here they hold the
# annotations spanning several sentences, exactly one sentence, and no sentence — confirm.
ann_by_sentence_span = check_ann_span_sent(
    df_ann=df_codes_train_ner_final,
    ss_dict=ss_dict_train,
)
df_mult_sent_train, df_one_sent_train, df_no_sent_train = ann_by_sentence_span

In [18]:
# Continue only with the single-sentence annotations; .copy() keeps later changes
# to this frame from affecting df_one_sent_train.
df_codes_train_ner_final = df_one_sent_train.copy()

In [19]:
# Number of continuous (False) and discontinuous (True) annotations that remain.
print(df_codes_train_ner_final.disc.value_counts())

False    1205
True      355
Name: disc, dtype: int64


In [20]:
# Order the annotations by document and then by span boundaries.
df_codes_train_ner_final = df_codes_train_ner_final.sort_values(['doc_id', 'start', 'end'])

In [21]:
# Code splitting

In [22]:
# Split every training code into a prefix and an optional suffix. The splitting rule
# depends on the task: 'd' codes are split at the dot, the others by character position.
if TYPE_TASK == 'd':
    def _code_pre(code):
        """Part of the code before the first dot."""
        return code.split('.')[0]

    def _code_suf(code):
        """Part of the code right after the first dot, or None when there is no dot."""
        return code.split('.')[1] if '.' in code else None
else:
    def _code_pre(code):
        """First four characters of the code."""
        return code[:4]

    def _code_suf(code):
        """Characters 5 to 7 of the code, or None when it has fewer than seven characters."""
        return code[4:7] if len(code) >= 7 else None

df_codes_train_ner_final["code_pre"] = df_codes_train_ner_final["code"].apply(_code_pre)
df_codes_train_ner_final["code_suf"] = df_codes_train_ner_final["code"].apply(_code_suf)

In [23]:
# Sorted vocabulary of the distinct code prefixes found in the training annotations.
train_dev_codes_pre = list(set(df_codes_train_ner_final["code_pre"].values))
train_dev_codes_pre.sort()

In [24]:
# Number of distinct code prefixes found in the training annotations.
len(train_dev_codes_pre)

354

In [25]:
# Sorted vocabulary of the distinct code suffixes, ignoring annotations without a suffix (None).
has_code_suf = df_codes_train_ner_final['code_suf'].apply(lambda suffix: suffix is not None)
codes_with_suf = df_codes_train_ner_final[has_code_suf]
train_dev_codes_suf = sorted(set(codes_with_suf["code_suf"].values))

In [26]:
# Number of distinct code suffixes found in the training annotations.
len(train_dev_codes_suf)

53

In [27]:
# Identifiers of the sub-task and of its annotation scheme.
subtask = 'norm'
subtask_ann = f"{subtask}-iob_code_suf"

# Separator between code prefix and suffix: a dot for diagnoses, nothing otherwise.
if TYPE_ANN == 'DIAGNOSTICO':
    CODE_SEP = '.'
else:
    CODE_SEP = ''

# Evaluation

In [28]:
# Result containers, filled per evaluation setting further below:
# res_stat collects corpus statistics, res_eval the formatted accuracy figures.
res_stat = dict()
res_eval = dict()

## Full

Considering all mentions that are present in the test set.

In [29]:
# All gold-standard test mentions, ordered by document and span position.
gs_full_sort_keys = ["clinical_case", "start_pos_gs", "end_pos_gs"]
df_test_gs_full = df_test_gs.sort_values(by=gs_full_sort_keys)

In [30]:
# (rows, columns) of the gold-standard test set for the selected annotation type.
df_test_gs_full.shape

(1112, 7)

In [31]:
# Accuracy over ALL gold-standard test mentions, one value per prediction file (runs 1..5).
# A gold mention without a row in the prediction file gets the placeholder "-",
# which is presumably never a valid code and therefore counts as an error.
arr_res_eval = []
gs_codes_full = pd.Series(df_test_gs_full["code"].values)
for run_id in range(1, 6):
    df_pred_full = pd.read_csv(
        RES_DIR + "df_test_full_preds_" + TYPE_TASK + "_hier_task_cls_train_mbert_galen_" + str(run_id) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Predictions are matched to gold mentions through the shared index labels.
    code_pred_full = [
        df_pred_full["code"].loc[gs_idx] if gs_idx in df_pred_full.index else "-"
        for gs_idx in df_test_gs_full.index
    ]
    assert len(code_pred_full) == df_test_gs_full.shape[0]
    # The mean of the boolean matches is the accuracy. Unlike
    # value_counts(normalize=True)[True] it does not raise KeyError when a run
    # has no correct prediction at all.
    arr_res_eval.append((gs_codes_full == pd.Series(code_pred_full)).mean())
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-point formatting keeps trailing zeros (".020" rather than ".02") and copes with
# values such as 1.0 or 1e-05; lstrip("0") only removes the leading zero.
res_full_mean_std = (
    f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " +
    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
)
res_full_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

### Additional analysis

We only select the test samples whose code-pre and code-suf have both already been seen in train-dev (otherwise the sample could not possibly be annotated correctly):

In [32]:
# Work on a copy so the helper columns added next do not end up in df_test_gs_full.
df_test_gs_full_add = df_test_gs_full.copy()

In [33]:
# For every gold test mention, record whether its code prefix and its code suffix belong
# to the vocabularies collected from the training annotations. Note that here the
# "code_pre"/"code_suf" columns hold booleans, not the code parts themselves.
gs_full_codes = df_test_gs_full_add.code
if TYPE_TASK == 'd':
    gs_full_pre = gs_full_codes.apply(lambda code: code.split('.')[0])
    gs_full_suf = gs_full_codes.apply(lambda code: code.split('.')[1] if '.' in code else None)
else:
    gs_full_pre = gs_full_codes.apply(lambda code: code[:4])
    gs_full_suf = gs_full_codes.apply(lambda code: code[4:7] if len(code) >= 7 else None)

# A missing suffix (None) is never part of the suffix vocabulary, so it yields False.
df_test_gs_full_add["code_pre"] = gs_full_pre.apply(lambda pre: pre in train_dev_codes_pre)
df_test_gs_full_add["code_suf"] = gs_full_suf.apply(lambda suf: suf in train_dev_codes_suf)

In [34]:
# Boolean mask: True where both the code prefix and the code suffix were seen in training.
# Computed column-wise instead of with a row-wise apply and a chained `== True`
# comparison; this is faster and still yields a 1-D mask when the frame is empty.
bool_filt_samples = (
    df_test_gs_full_add["code_pre"].astype(bool)
    & df_test_gs_full_add["code_suf"].astype(bool)
).values

In [35]:
# Apply the mask to the original gold frame (so the boolean helper columns are not carried over).
df_test_gs_full_add = df_test_gs_full.loc[bool_filt_samples]

In [36]:
# Size after keeping only mentions whose code prefix and suffix were both seen in training.
df_test_gs_full_add.shape

(437, 7)

Finally, we only select the test samples considered by the model at prediction time, i.e. test annotations for which a CLS sample can be produced:

In [37]:
# Index labels present both in the filtered gold set and in the prediction frame.
# NOTE: df_pred_full is the frame left over from the last iteration of the evaluation
# loop above, so this assumes every run covers the same index labels.
gs_full_add_labels = set(df_test_gs_full_add.index.values)
pred_full_labels = set(df_pred_full.index)
ind_pred_full_add = sorted(gs_full_add_labels & pred_full_labels)

In [38]:
# Gold mentions that passed the code-vocabulary filter and also have a row in the prediction frame.
df_test_gs_full_add_pos = df_test_gs_full_add.loc[ind_pred_full_add]

In [39]:
# Size of the subset of mentions that the model could have predicted correctly.
df_test_gs_full_add_pos.shape

(413, 7)

In [40]:
# Accuracy restricted to the gold mentions in df_test_gs_full_add_pos, one value per
# prediction file (runs 1..5). The "-" placeholder is kept for index labels that are
# missing from a given run's prediction file.
arr_res_eval = []
gs_codes_full_add_pos = pd.Series(df_test_gs_full_add_pos["code"].values)
for run_id in range(1, 6):
    df_pred_full = pd.read_csv(
        RES_DIR + "df_test_full_preds_" + TYPE_TASK + "_hier_task_cls_train_mbert_galen_" + str(run_id) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Predictions are matched to gold mentions through the shared index labels.
    code_pred_full_add_pos = [
        df_pred_full["code"].loc[gs_idx] if gs_idx in df_pred_full.index else "-"
        for gs_idx in df_test_gs_full_add_pos.index
    ]
    assert len(code_pred_full_add_pos) == df_test_gs_full_add_pos.shape[0]
    # The mean of the boolean matches is the accuracy. Unlike
    # value_counts(normalize=True)[True] it does not raise KeyError when a run
    # has no correct prediction at all.
    arr_res_eval.append((gs_codes_full_add_pos == pd.Series(code_pred_full_add_pos)).mean())
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-point formatting keeps trailing zeros (".020" rather than ".02") and copes with
# values such as 1.0 or 1e-05; lstrip("0") only removes the leading zero.
res_full_add_pos_mean_std = (
    f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " +
    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
)
res_full_add_pos_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

In [41]:
# Formatted accuracy figures of the "Full" setting: all mentions vs. the predictable subset.
res_eval["Full"] = dict([
    ("All acc avg", res_full_mean_std),
    ("All acc max", res_full_max),
    ("Pos acc avg", res_full_add_pos_mean_std),
    ("Pos acc max", res_full_add_pos_max),
])

In [42]:
# Statistics of the "Full" setting: distinct codes and number of mentions,
# for all gold mentions and for the predictable subset.
n_codes_full_all = len(set(df_test_gs_full.code))
n_ment_full_all = df_test_gs_full.shape[0]
n_codes_full_pos = len(set(df_test_gs_full_add_pos.code))
n_ment_full_pos = df_test_gs_full_add_pos.shape[0]
res_stat["Full"] = {
    "All codes": n_codes_full_all,
    "All ment": n_ment_full_all,
    "Pos codes": n_codes_full_pos,
    "Pos ment": n_ment_full_pos,
}

## Filtering

Considering only the test mentions whose text (compared case-insensitively) does not appear among the training mentions.

In [43]:
# Re-read the training annotations and keep every row of the selected type
# (no de-duplication or sentence filtering is applied here).
train_ann_columns = ["doc_id", "type", "code", "word", "location"]
df_codes_train_ner_all = pd.read_table(corpus_path + "train/trainX.tsv", sep='\t', header=None)
df_codes_train_ner_all.columns = train_ann_columns
is_selected_train_type = df_codes_train_ner_all['type'] == TYPE_ANN
df_codes_train_ner_all = df_codes_train_ner_all[is_selected_train_type]

In [44]:
# (rows, columns) of the training annotations of the selected type.
df_codes_train_ner_all.shape

(1972, 5)

In [45]:
# Sorted list of the distinct, lower-cased mention strings annotated in the training set.
train_words = set(df_codes_train_ner_all["word"].values)
train_mentions = sorted({word.lower() for word in train_words})

In [46]:
# Number of distinct lower-cased mention strings in the training annotations.
len(train_mentions)

1096

In [47]:
# Keep only the gold test mentions whose lower-cased text never occurs among the
# training mentions, ordered by document and span position.
# Membership is tested against a set: the sorted list was scanned once per test row.
train_mentions_lookup = set(train_mentions)
is_unseen_mention = df_test_gs.ref.apply(
    lambda mention: mention.lower() not in train_mentions_lookup
)
df_test_gs_filt = df_test_gs[is_unseen_mention].sort_values(
    by=["clinical_case", "start_pos_gs", "end_pos_gs"]
)

In [48]:
# (rows, columns) of the gold test mentions whose text was not seen in training.
df_test_gs_filt.shape

(586, 7)

In [49]:
# Accuracy over the gold test mentions unseen in training, one value per prediction
# file (runs 1..5). A gold mention without a row in the prediction file gets the
# placeholder "-", which is presumably never a valid code and therefore counts as an error.
arr_res_eval = []
gs_codes_filt = pd.Series(df_test_gs_filt["code"].values)
for run_id in range(1, 6):
    df_pred_filt = pd.read_csv(
        RES_DIR + "df_test_filt_preds_" + TYPE_TASK + "_hier_task_cls_train_mbert_galen_" + str(run_id) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Predictions are matched to gold mentions through the shared index labels.
    code_pred_filt = [
        df_pred_filt["code"].loc[gs_idx] if gs_idx in df_pred_filt.index else "-"
        for gs_idx in df_test_gs_filt.index
    ]
    assert len(code_pred_filt) == df_test_gs_filt.shape[0]
    # The mean of the boolean matches is the accuracy. Unlike
    # value_counts(normalize=True)[True] it does not raise KeyError when a run
    # has no correct prediction at all.
    arr_res_eval.append((gs_codes_filt == pd.Series(code_pred_filt)).mean())
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-point formatting keeps trailing zeros (".020" rather than ".02") and copes with
# values such as 1.0 or 1e-05; lstrip("0") only removes the leading zero.
res_filt_mean_std = (
    f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " +
    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
)
res_filt_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

### Additional analysis

We only select the test samples whose code-pre and code-suf have both already been seen in train-dev (otherwise the sample could not possibly be annotated correctly):

In [50]:
# Work on a copy so the helper columns added next do not end up in df_test_gs_filt.
df_test_gs_filt_add = df_test_gs_filt.copy()

In [51]:
# For every filtered gold test mention, record whether its code prefix and its code suffix
# belong to the vocabularies collected from the training annotations. Note that here the
# "code_pre"/"code_suf" columns hold booleans, not the code parts themselves.
gs_filt_codes = df_test_gs_filt_add.code
if TYPE_TASK == 'd':
    gs_filt_pre = gs_filt_codes.apply(lambda code: code.split('.')[0])
    gs_filt_suf = gs_filt_codes.apply(lambda code: code.split('.')[1] if '.' in code else None)
else:
    gs_filt_pre = gs_filt_codes.apply(lambda code: code[:4])
    gs_filt_suf = gs_filt_codes.apply(lambda code: code[4:7] if len(code) >= 7 else None)

# A missing suffix (None) is never part of the suffix vocabulary, so it yields False.
df_test_gs_filt_add["code_pre"] = gs_filt_pre.apply(lambda pre: pre in train_dev_codes_pre)
df_test_gs_filt_add["code_suf"] = gs_filt_suf.apply(lambda suf: suf in train_dev_codes_suf)

In [52]:
# Boolean mask: True where both the code prefix and the code suffix were seen in training.
# Computed column-wise instead of with a row-wise apply and a chained `== True`
# comparison; this is faster and still yields a 1-D mask when the frame is empty.
bool_filt_samples = (
    df_test_gs_filt_add["code_pre"].astype(bool)
    & df_test_gs_filt_add["code_suf"].astype(bool)
).values

In [53]:
# Apply the mask to the original filtered gold frame (so the boolean helper columns are not carried over).
df_test_gs_filt_add = df_test_gs_filt.loc[bool_filt_samples]

In [54]:
# Size after keeping only mentions whose code prefix and suffix were both seen in training.
df_test_gs_filt_add.shape

(124, 7)

Finally, we only select the test samples considered by the model at prediction time, i.e. test annotations for which a CLS sample can be produced:

In [55]:
# Index labels present both in the filtered gold set and in the prediction frame.
# NOTE: df_pred_filt is the frame left over from the last iteration of the evaluation
# loop above, so this assumes every run covers the same index labels.
gs_filt_add_labels = set(df_test_gs_filt_add.index.values)
pred_filt_labels = set(df_pred_filt.index)
ind_pred_filt_add = sorted(gs_filt_add_labels & pred_filt_labels)

In [56]:
# Gold mentions that passed the code-vocabulary filter and also have a row in the prediction frame.
df_test_gs_filt_add_pos = df_test_gs_filt_add.loc[ind_pred_filt_add]

In [57]:
# Size of the subset of unseen mentions that the model could have predicted correctly.
df_test_gs_filt_add_pos.shape

(109, 7)

In [58]:
# Accuracy restricted to the gold mentions in df_test_gs_filt_add_pos, one value per
# prediction file (runs 1..5). The "-" placeholder is kept for index labels that are
# missing from a given run's prediction file.
arr_res_eval = []
gs_codes_filt_add_pos = pd.Series(df_test_gs_filt_add_pos["code"].values)
for run_id in range(1, 6):
    df_pred_filt = pd.read_csv(
        RES_DIR + "df_test_filt_preds_" + TYPE_TASK + "_hier_task_cls_train_mbert_galen_" + str(run_id) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Predictions are matched to gold mentions through the shared index labels.
    code_pred_filt_add_pos = [
        df_pred_filt["code"].loc[gs_idx] if gs_idx in df_pred_filt.index else "-"
        for gs_idx in df_test_gs_filt_add_pos.index
    ]
    assert len(code_pred_filt_add_pos) == df_test_gs_filt_add_pos.shape[0]
    # The mean of the boolean matches is the accuracy. Unlike
    # value_counts(normalize=True)[True] it does not raise KeyError when a run
    # has no correct prediction at all.
    arr_res_eval.append((gs_codes_filt_add_pos == pd.Series(code_pred_filt_add_pos)).mean())
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-point formatting keeps trailing zeros (".020" rather than ".02") and copes with
# values such as 1.0 or 1e-05; lstrip("0") only removes the leading zero.
res_filt_add_pos_mean_std = (
    f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " +
    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
)
res_filt_add_pos_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

In [59]:
# Formatted accuracy figures of the "Filtering" setting: all unseen mentions vs. the predictable subset.
res_eval["Filtering"] = dict([
    ("All acc avg", res_filt_mean_std),
    ("All acc max", res_filt_max),
    ("Pos acc avg", res_filt_add_pos_mean_std),
    ("Pos acc max", res_filt_add_pos_max),
])

In [60]:
# Statistics of the "Filtering" setting: distinct codes and number of mentions,
# for all unseen gold mentions and for the predictable subset.
n_codes_filt_all = len(set(df_test_gs_filt.code))
n_ment_filt_all = df_test_gs_filt.shape[0]
n_codes_filt_pos = len(set(df_test_gs_filt_add_pos.code))
n_ment_filt_pos = df_test_gs_filt_add_pos.shape[0]
res_stat["Filtering"] = {
    "All codes": n_codes_filt_all,
    "All ment": n_ment_filt_all,
    "Pos codes": n_codes_filt_pos,
    "Pos ment": n_ment_filt_pos,
}

In [61]:
# Accuracy summary: one row per evaluation setting, one column per metric.
pd.DataFrame(res_eval).transpose()

Unnamed: 0,All acc avg,All acc max,Pos acc avg,Pos acc max
Full,.546 ± .006,0.556,.798 ± .009,0.809
Filtering,.278 ± .008,0.288,.552 ± .02,0.578


In [62]:
# Corpus statistics: one row per evaluation setting, one column per count.
pd.DataFrame(res_stat).transpose()

Unnamed: 0,All codes,All ment,Pos codes,Pos ment
Full,371,1112,81,413
Filtering,325,586,57,109
