In [1]:
# Locations of the shared helper modules and of the corpus release.
utils_path = "../utils/"
corpus_path = "../datasets/final_dataset_v4_to_publish/"
# Gold-standard (GS) annotation file of the test split.
test_gs_path = f"{corpus_path}test/testX.tsv"

In [2]:
import tensorflow as tf

# Auxiliary components
import sys
sys.path.insert(0, utils_path)
from nlp_utils import *

# Directory holding the per-run prediction files evaluated in this notebook.
RES_DIR = "../results/CodiEsp/final_exec_rev/"

# Annotation type under evaluation; its lower-cased initial is the task tag
# used in file names.
TYPE_ANN = "PROCEDIMIENTO"
TYPE_TASK = TYPE_ANN[0].lower()

# Number of decimals kept when reporting accuracies.
round_n = 3

# GS data: keep only the gold-standard test annotations of the selected type.
df_test_gs = format_codiesp_x_gs(test_gs_path)
df_test_gs = df_test_gs[df_test_gs['label_gs'] == TYPE_ANN]

# Valid codes of the selected task, lower-cased.
# dtype=str keeps every code as text: without it pandas may infer a numeric
# dtype for purely numeric codes, which would drop leading zeros and make
# `.lower()` fail.
codes_d_path = corpus_path + "codiesp_codes/codiesp-" + TYPE_TASK.upper() + "_codes.tsv"
valid_codes = {
    code.lower()
    for code in pd.read_csv(codes_d_path, sep='\t', header=None,
                            usecols=[0], dtype=str)[0]
}

2022-12-23 11:17:10.533101: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# Root directory of the sentence-split corpus; its "train/" and "dev/"
# sub-folders are read further down in this notebook.
ss_corpus_path = "../datasets/CodiEsp-SSplit-text/"

# Load data

In [4]:
# Text files of the training split: every regular file whose last
# dot-separated component is "txt".
train_path = corpus_path + "train/text_files/"
train_files = [
    name for name in os.listdir(train_path)
    if os.path.isfile(train_path + name) and name.rsplit('.', 1)[-1] == "txt"
]
train_data = load_text_files(train_files, train_path)
# doc_id is the file name up to its first ".txt".
df_text_train = pd.DataFrame({
    'doc_id': [name.partition('.txt')[0] for name in train_files],
    'raw_text': train_data,
})

In [5]:
# Text files of the development split: every regular file whose last
# dot-separated component is "txt".
dev_path = corpus_path + "dev/text_files/"
dev_files = [
    name for name in os.listdir(dev_path)
    if os.path.isfile(dev_path + name) and name.rsplit('.', 1)[-1] == "txt"
]
dev_data = load_text_files(dev_files, dev_path)
# doc_id is the file name up to its first ".txt".
df_text_dev = pd.DataFrame({
    'doc_id': [name.partition('.txt')[0] for name in dev_files],
    'raw_text': dev_data,
})

In [6]:
# Text files of the test split: every regular file whose last
# dot-separated component is "txt".
test_path = corpus_path + "test/text_files/"
test_files = [
    name for name in os.listdir(test_path)
    if os.path.isfile(test_path + name) and name.rsplit('.', 1)[-1] == 'txt'
]
test_data = load_text_files(test_files, test_path)
# doc_id is the file name up to its first ".txt".
df_text_test = pd.DataFrame({
    'doc_id': [name.partition('.txt')[0] for name in test_files],
    'raw_text': test_data,
})

In [7]:
# Training annotations (tab-separated, no header row).
df_codes_train_ner = pd.read_table(corpus_path + "train/trainX.tsv", sep='\t', header=None)
df_codes_train_ner.columns = ["doc_id", "type", "code", "word", "location"]
# Keep only the first annotation of each (document, type, location) triple.
df_codes_train_ner = df_codes_train_ner.drop_duplicates(
    subset=['doc_id', 'type', 'location'], keep='first'
)
# A ';' inside the location string marks a discontinuous annotation.
df_codes_train_ner['disc'] = df_codes_train_ner['location'].map(lambda loc: ';' in loc)

Select one type of annotations:

In [8]:
# Restrict the training annotations to the type under study.
df_codes_train_ner = df_codes_train_ner.loc[df_codes_train_ner['type'] == TYPE_ANN]

Split discontinuous annotations:

In [9]:
# Hand the five annotation columns to the project helper. The following cells
# rely on its output exposing 'start', 'end' and 'disc' columns.
# NOTE(review): the helper's exact splitting logic is not visible here — confirm in its module.
df_codes_train_ner_final = process_labels_norm_prueba(df_ann=df_codes_train_ner[["doc_id", "type", "code", "word", "location"]])

Remove annotations of zero length:

In [10]:
# Annotation length as end - start, computed column-wise: a row-wise
# `apply(axis=1)` is slow and does not yield a Series on an empty frame.
df_codes_train_ner_final['length'] = (
    df_codes_train_ner_final['end'] - df_codes_train_ner_final['start']
)
# Discard annotations whose length is not positive.
df_codes_train_ner_final = df_codes_train_ner_final[df_codes_train_ner_final['length'] > 0]

Separate continuous and discontinuous annotations:

In [11]:
# Continuous: rows whose 'disc' flag equals 0, with the flag cast to bool.
df_codes_train_ner_final_cont = (
    df_codes_train_ner_final
    .loc[df_codes_train_ner_final['disc'] == 0]
    .assign(disc=lambda part: part['disc'].astype(bool))
)

In [12]:
# Discontinuous
# Rebuild the discontinuous annotations with the project helper, from the
# original discontinuous rows (df_ann) and their processed counterparts with
# disc == 1 (df_ann_final).
# NOTE(review): the reconstruction itself is not visible here — confirm in the helper's module.
df_codes_train_ner_final_disc = restore_disc_ann(df_ann=df_codes_train_ner[df_codes_train_ner['disc']], 
                    df_ann_final=df_codes_train_ner_final[df_codes_train_ner_final['disc'] == 1])

In [13]:
# Overall span of each discontinuous annotation: the first and the last
# space-separated token of 'location', parsed as integers.
df_codes_train_ner_final_disc['start'] = df_codes_train_ner_final_disc['location'].map(
    lambda loc: int(loc.split(' ', 1)[0])
)
df_codes_train_ner_final_disc['end'] = df_codes_train_ner_final_disc['location'].map(
    lambda loc: int(loc.rsplit(' ', 1)[-1])
)

Concatenate continuous and discontinuous annotations:

In [14]:
# Concat
# Stack continuous and discontinuous annotations on a shared set of columns.
cols_concat = ['doc_id', 'type', 'code', 'word', 'location', 'start', 'end', 'disc']
df_codes_train_ner_final = pd.concat(
    [part[cols_concat]
     for part in (df_codes_train_ner_final_cont, df_codes_train_ner_final_disc)]
)

Now, we remove the right-to-left (text wise) discontinuous annotations:

In [15]:
# Row-wise flag computed by the project helper; it is used as a boolean mask
# in the next cell.
# NOTE(review): presumably True for left-to-right annotations — confirm in the helper.
df_codes_train_ner_final['direction'] = df_codes_train_ner_final.apply(check_ann_left_right_direction, axis=1)

In [16]:
# Keep only the rows whose 'direction' flag is set.
df_codes_train_ner_final = df_codes_train_ner_final.loc[df_codes_train_ner_final['direction']]

We only select the annotations fully contained in a single sentence:

In [17]:
# Sentence-Split data
ss_sub_corpus_path = ss_corpus_path + "train/"
# Only regular files are passed on; sub-directories are skipped.
ss_files = [
    entry for entry in os.listdir(ss_sub_corpus_path)
    if os.path.isfile(ss_sub_corpus_path + entry)
]
ss_dict_train = load_ss_files(ss_files, ss_sub_corpus_path)

In [18]:
# The project helper returns three frames; only df_one_sent_train is used in
# the following cells.
# NOTE(review): presumably annotations spanning several / exactly one / no
# sentence, in that order — confirm in the helper.
df_mult_sent_train, df_one_sent_train, df_no_sent_train = check_ann_span_sent(df_ann=df_codes_train_ner_final, 
                                                                             ss_dict=ss_dict_train)

In [19]:
# Work on an independent copy, so the columns added below do not touch
# df_one_sent_train.
df_codes_train_ner_final = df_one_sent_train.copy()

In [20]:
print(df_codes_train_ner_final.disc.value_counts())

False    1205
True      355
Name: disc, dtype: int64


In [21]:
# Order the annotations by document and then by span position.
df_codes_train_ner_final = df_codes_train_ner_final.sort_values(['doc_id', 'start', 'end'])

In [22]:
# Code splitting

In [23]:
# Split every code into a prefix and an optional suffix (None when absent).
if TYPE_TASK == 'd':
    # Prefix: text before the first '.'; suffix: the segment right after it.
    df_codes_train_ner_final["code_pre"] = df_codes_train_ner_final["code"].map(
        lambda code: code.partition('.')[0]
    )
    df_codes_train_ner_final["code_suf"] = df_codes_train_ner_final["code"].map(
        lambda code: code.split('.')[1] if '.' in code else None
    )
else:
    # Prefix: first 4 characters; suffix: characters 5-7, only for codes of
    # at least 7 characters.
    df_codes_train_ner_final["code_pre"] = df_codes_train_ner_final["code"].map(
        lambda code: code[:4]
    )
    df_codes_train_ner_final["code_suf"] = df_codes_train_ner_final["code"].map(
        lambda code: code[4:7] if len(code) >= 7 else None
    )

In [24]:
# Development annotations (tab-separated, no header row).
df_codes_dev_ner = pd.read_table(corpus_path + "dev/devX.tsv", sep='\t', header=None)
df_codes_dev_ner.columns = ["doc_id", "type", "code", "word", "location"]
# Keep only the first annotation of each (document, type, location) triple.
df_codes_dev_ner = df_codes_dev_ner.drop_duplicates(
    subset=['doc_id', 'type', 'location'], keep='first'
)
# A ';' inside the location string marks a discontinuous annotation.
df_codes_dev_ner['disc'] = df_codes_dev_ner['location'].map(lambda loc: ';' in loc)

Select one type of annotations:

In [25]:
# Restrict the development annotations to the type under study.
df_codes_dev_ner = df_codes_dev_ner.loc[df_codes_dev_ner['type'] == TYPE_ANN]

Split discontinuous annotations:

In [26]:
# Hand the five annotation columns to the project helper. The following cells
# rely on its output exposing 'start', 'end' and 'disc' columns.
# NOTE(review): the helper's exact splitting logic is not visible here — confirm in its module.
df_codes_dev_ner_final = process_labels_norm_prueba(df_ann=df_codes_dev_ner[["doc_id", "type", "code", "word", "location"]])

Remove annotations of zero length:

In [27]:
# Annotation length as end - start, computed column-wise: a row-wise
# `apply(axis=1)` is slow and does not yield a Series on an empty frame.
df_codes_dev_ner_final['length'] = (
    df_codes_dev_ner_final['end'] - df_codes_dev_ner_final['start']
)
# Discard annotations whose length is not positive.
df_codes_dev_ner_final = df_codes_dev_ner_final[df_codes_dev_ner_final['length'] > 0]

Separate continuous and discontinuous annotations:

In [28]:
# Continuous: rows whose 'disc' flag equals 0, with the flag cast to bool.
df_codes_dev_ner_final_cont = (
    df_codes_dev_ner_final
    .loc[df_codes_dev_ner_final['disc'] == 0]
    .assign(disc=lambda part: part['disc'].astype(bool))
)

In [29]:
# Discontinuous
# Rebuild the discontinuous annotations with the project helper, from the
# original discontinuous rows (df_ann) and their processed counterparts with
# disc == 1 (df_ann_final).
# NOTE(review): the reconstruction itself is not visible here — confirm in the helper's module.
df_codes_dev_ner_final_disc = restore_disc_ann(df_ann=df_codes_dev_ner[df_codes_dev_ner['disc']], 
                    df_ann_final=df_codes_dev_ner_final[df_codes_dev_ner_final['disc'] == 1])

In [30]:
# Overall span of each discontinuous annotation: the first and the last
# space-separated token of 'location', parsed as integers.
df_codes_dev_ner_final_disc['start'] = df_codes_dev_ner_final_disc['location'].map(
    lambda loc: int(loc.split(' ', 1)[0])
)
df_codes_dev_ner_final_disc['end'] = df_codes_dev_ner_final_disc['location'].map(
    lambda loc: int(loc.rsplit(' ', 1)[-1])
)

Concatenate continuous and discontinuous annotations:

In [31]:
# Concat
# Stack continuous and discontinuous annotations on a shared set of columns.
cols_concat = ['doc_id', 'type', 'code', 'word', 'location', 'start', 'end', 'disc']
df_codes_dev_ner_final = pd.concat(
    [part[cols_concat]
     for part in (df_codes_dev_ner_final_cont, df_codes_dev_ner_final_disc)]
)

Now, we remove the right-to-left (text wise) discontinuous annotations:

In [32]:
# Row-wise flag computed by the project helper; it is used as a boolean mask
# in the next cell.
# NOTE(review): presumably True for left-to-right annotations — confirm in the helper.
df_codes_dev_ner_final['direction'] = df_codes_dev_ner_final.apply(check_ann_left_right_direction, axis=1)

In [33]:
# Keep only the rows whose 'direction' flag is set.
df_codes_dev_ner_final = df_codes_dev_ner_final.loc[df_codes_dev_ner_final['direction']]

We only select the annotations fully contained in a single sentence:

In [34]:
# Sentence-Split data
ss_sub_corpus_path = ss_corpus_path + "dev/"
# Only regular files are passed on; sub-directories are skipped.
ss_files = [
    entry for entry in os.listdir(ss_sub_corpus_path)
    if os.path.isfile(ss_sub_corpus_path + entry)
]
ss_dict_dev = load_ss_files(ss_files, ss_sub_corpus_path)

In [35]:
# The project helper returns three frames; only df_one_sent_dev is used in
# the following cells.
# NOTE(review): presumably annotations spanning several / exactly one / no
# sentence, in that order — confirm in the helper.
df_mult_sent_dev, df_one_sent_dev, df_no_sent_dev = check_ann_span_sent(df_ann=df_codes_dev_ner_final, 
                                                                             ss_dict=ss_dict_dev)

In [36]:
# Work on an independent copy, so the columns added below do not touch
# df_one_sent_dev.
df_codes_dev_ner_final = df_one_sent_dev.copy()

In [37]:
print(df_codes_dev_ner_final.disc.value_counts())

False    575
True     202
Name: disc, dtype: int64


In [38]:
# Order the annotations by document and then by span position.
df_codes_dev_ner_final = df_codes_dev_ner_final.sort_values(['doc_id', 'start', 'end'])

In [39]:
# Code splitting

In [40]:
# Split every code into a prefix and an optional suffix (None when absent).
if TYPE_TASK == 'd':
    # Prefix: text before the first '.'; suffix: the segment right after it.
    df_codes_dev_ner_final["code_pre"] = df_codes_dev_ner_final["code"].map(
        lambda code: code.partition('.')[0]
    )
    df_codes_dev_ner_final["code_suf"] = df_codes_dev_ner_final["code"].map(
        lambda code: code.split('.')[1] if '.' in code else None
    )
else:
    # Prefix: first 4 characters; suffix: characters 5-7, only for codes of
    # at least 7 characters.
    df_codes_dev_ner_final["code_pre"] = df_codes_dev_ner_final["code"].map(
        lambda code: code[:4]
    )
    df_codes_dev_ner_final["code_suf"] = df_codes_dev_ner_final["code"].map(
        lambda code: code[4:7] if len(code) >= 7 else None
    )

In [41]:
# Sorted vocabulary of the code prefixes observed in train or dev.
train_dev_codes_pre = sorted(
    set(df_codes_dev_ner_final["code_pre"].values)
    | set(df_codes_train_ner_final["code_pre"].values)
)

In [42]:
len(train_dev_codes_pre)

446

In [43]:
# Sorted vocabulary of the code suffixes observed in train or dev; codes
# without a suffix (None) are left out at this point.
train_dev_codes_suf = sorted(
    {suf for suf in df_codes_dev_ner_final["code_suf"].values if suf is not None}
    | {suf for suf in df_codes_train_ner_final["code_suf"].values if suf is not None}
)

In [44]:
len(train_dev_codes_suf)

63

In [45]:
# Sub-task identifiers.
subtask = 'norm'
subtask_ann = f"{subtask}-iob_code_suf"

# Separator between code prefix and suffix: '.' for diagnoses, none otherwise.
if TYPE_ANN == 'DIAGNOSTICO':
    CODE_SEP = '.'
else:
    CODE_SEP = ''

# Evaluation

In [46]:
# Sorted list of the full codes observed in train or dev.
train_dev_codes = sorted(
    set(df_codes_dev_ner_final["code"].values)
    | set(df_codes_train_ner_final["code"].values)
)

In [47]:
len(train_dev_codes)

529

In [48]:
# Sorted list of the distinct codes in the gold-standard test annotations.
test_codes = sorted(df_test_gs["code"].drop_duplicates())

In [49]:
len(test_codes)

371

In [50]:
# Result containers, filled once per scenario ("Zero-shot", "Few-shot"):
# res_stat holds code/mention counts, res_eval the formatted accuracies.
res_stat = dict()
res_eval = dict()

## Zero-shot

Considering codes that are only present in the test set.

In [51]:
# Codes that occur in the test gold standard but never in train-dev.
zero_test_codes = set(test_codes).difference(train_dev_codes)

In [52]:
len(zero_test_codes)

168

In [53]:
# Gold-standard test mentions annotated with a zero-shot code, ordered by
# clinical case and position.
df_test_gs_zero = (
    df_test_gs[df_test_gs["code"].isin(zero_test_codes)]
    .sort_values(by=["clinical_case", "pos_gs"])
)

In [54]:
df_test_gs_zero.shape

(219, 7)

In [55]:
# Accuracy over all zero-shot test mentions, one value per model run (1..5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_zero = pd.read_csv(
        RES_DIR + "df_test_zero_preds_" + TYPE_TASK + "_hier_task_cls_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Align predictions with the gold mentions through the index labels; "-" is
    # the placeholder for mentions that have no prediction.
    # (The comprehension variable is named `idx` so it no longer shadows the run counter `i`.)
    code_pred_zero = [
        df_pred_zero["code"].loc[idx] if idx in df_pred_zero.index else "-"
        for idx in df_test_gs_zero.index
    ]
    assert len(code_pred_zero) == df_test_gs_zero.shape[0]
    # Fraction of exact code matches. The mean of the boolean vector replaces
    # `value_counts(normalize=True)[True]`, which raises KeyError when a run has
    # no correct prediction at all.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_zero["code"].values) == pd.Series(code_pred_zero)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# NOTE(review): this formatting keeps only the text after the decimal point of
# the rounded value, so trailing zeros and the integer part are lost
# (0.59 -> ".59", 1.0 -> ".0").
res_zero_mean_std = "." + str(round(dist_res_eval['mean'], round_n)).split('.')[-1] + " ± " + \
                    "." + str(round(dist_res_eval['std'], round_n)).split('.')[-1]
res_zero_max = "." + str(round(dist_res_eval['max'], round_n)).split('.')[-1]

### Additional analysis

We only select the test samples whose code-pre and code-suf have both been seen in train-dev (otherwise the model could not possibly predict the correct code):

In [56]:
# Some codes do not have a suffix, so None is a legitimate suffix value.
# The guard keeps this cell idempotent: re-running it does not append None again.
if None not in train_dev_codes_suf:
    train_dev_codes_suf.append(None)

In [57]:
# Scratch copy of the zero-shot mentions; helper flag columns are added to it
# below without modifying df_test_gs_zero.
df_test_gs_zero_add = df_test_gs_zero.copy()

In [58]:
# Flag, for every zero-shot mention, whether its code prefix / suffix was seen
# in train-dev. Here "code_pre" and "code_suf" hold booleans, not code parts.
if TYPE_TASK == 'd':
    df_test_gs_zero_add["code_pre"] = df_test_gs_zero_add["code"].map(
        lambda code: code.partition('.')[0] in train_dev_codes_pre
    )
    df_test_gs_zero_add["code_suf"] = df_test_gs_zero_add["code"].map(
        lambda code: (code.split('.')[1] if '.' in code else None) in train_dev_codes_suf
    )
else:
    df_test_gs_zero_add["code_pre"] = df_test_gs_zero_add["code"].map(
        lambda code: code[:4] in train_dev_codes_pre
    )
    df_test_gs_zero_add["code_suf"] = df_test_gs_zero_add["code"].map(
        lambda code: (code[4:7] if len(code) >= 7 else None) in train_dev_codes_suf
    )

In [59]:
# Boolean mask of the mentions whose prefix AND suffix were both seen in
# train-dev. The column-wise AND replaces the row-wise
# `apply(lambda x: x["code_pre"] == x["code_suf"] == True, axis=1)`: same
# result on the boolean flags, faster, and still a 1-D mask on an empty frame.
bool_filt_samples = (
    df_test_gs_zero_add["code_pre"] & df_test_gs_zero_add["code_suf"]
).values

In [60]:
# The mask is applied to the original zero-shot frame, not to the scratch copy,
# so the helper flag columns are not carried over; the name is rebound to the
# filtered result.
df_test_gs_zero_add = df_test_gs_zero[bool_filt_samples]

In [61]:
df_test_gs_zero_add.shape

(28, 7)

Finally, we only select the test samples considered by the model at the time of prediction, i.e. test annotations for which a CLS sample can be produced: 

In [62]:
# Index labels present both in the filtered gold mentions and in the predictions.
# NOTE(review): df_pred_zero is whatever the zero-shot evaluation loop loaded
# last (run 5); this assumes every run shares the same index — TODO confirm.
ind_pred_zero_add = sorted(
    set(df_test_gs_zero_add.index.values) & set(df_pred_zero.index)
)

In [63]:
# Label-based selection; rows come out in ascending index order because
# ind_pred_zero_add is sorted.
df_test_gs_zero_add_pos = df_test_gs_zero_add.loc[ind_pred_zero_add]

In [64]:
df_test_gs_zero_add_pos.shape

(22, 7)

In [65]:
# Accuracy over the filtered ("Pos") zero-shot mentions, one value per model run (1..5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_zero = pd.read_csv(
        RES_DIR + "df_test_zero_preds_" + TYPE_TASK + "_hier_task_cls_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Align predictions with the gold mentions through the index labels; "-" is
    # the placeholder for mentions that have no prediction.
    # (The comprehension variable is named `idx` so it no longer shadows the run counter `i`.)
    code_pred_zero_add_pos = [
        df_pred_zero["code"].loc[idx] if idx in df_pred_zero.index else "-"
        for idx in df_test_gs_zero_add_pos.index
    ]
    assert len(code_pred_zero_add_pos) == df_test_gs_zero_add_pos.shape[0]
    # Fraction of exact code matches. The mean of the boolean vector replaces
    # `value_counts(normalize=True)[True]`, which raises KeyError when a run has
    # no correct prediction at all.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_zero_add_pos["code"].values) == pd.Series(code_pred_zero_add_pos)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# NOTE(review): this formatting keeps only the text after the decimal point of
# the rounded value, so trailing zeros and the integer part are lost
# (0.59 -> ".59", 1.0 -> ".0").
res_zero_add_pos_mean_std = "." + str(round(dist_res_eval['mean'], round_n)).split('.')[-1] + " ± " + \
                    "." + str(round(dist_res_eval['std'], round_n)).split('.')[-1]
res_zero_add_pos_max = "." + str(round(dist_res_eval['max'], round_n)).split('.')[-1]

In [66]:
# Zero-shot accuracies: "All" covers every zero-shot mention, "Pos" the
# filtered subset evaluated in the additional analysis.
res_eval["Zero-shot"] = dict(zip(
    ("All acc avg", "All acc max", "Pos acc avg", "Pos acc max"),
    (res_zero_mean_std, res_zero_max, res_zero_add_pos_mean_std, res_zero_add_pos_max),
))

In [67]:
# Zero-shot statistics: number of distinct codes and of mentions, for all
# zero-shot mentions and for the filtered ("Pos") subset.
res_stat["Zero-shot"] = {
    "All codes": len(set(df_test_gs_zero["code"])),
    "All ment": len(df_test_gs_zero),
    "Pos codes": len(set(df_test_gs_zero_add_pos["code"])),
    "Pos ment": len(df_test_gs_zero_add_pos),
}

## Few-shots

Considering codes with absolute frequency <= 5 in the train-dev set; the zero-shot codes (frequency 0) are included as well.

In [68]:
# Absolute frequency of every code over the train and dev annotations together.
dist_train_dev_codes = pd.concat(
    [df_codes_train_ner_final["code"], df_codes_dev_ner_final["code"]]
).value_counts()

In [69]:
# Codes seen at most 5 times in train-dev, sorted.
few_train_dev_codes = sorted(
    set(dist_train_dev_codes.loc[dist_train_dev_codes <= 5].index.values)
)

In [70]:
# Few-shot codes: the rare train-dev codes plus the zero-shot test codes.
few_codes = set(few_train_dev_codes) | set(zero_test_codes)

In [71]:
len(few_codes)

616

In [72]:
# Gold-standard test mentions annotated with a few-shot code, ordered by
# clinical case and span position.
df_test_gs_few = (
    df_test_gs[df_test_gs["code"].isin(few_codes)]
    .sort_values(by=["clinical_case", "start_pos_gs", "end_pos_gs"])
)

In [73]:
df_test_gs_few.shape

(488, 7)

In [74]:
# Accuracy over all few-shot test mentions, one value per model run (1..5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_few = pd.read_csv(
        RES_DIR + "df_test_few_preds_" + TYPE_TASK + "_hier_task_cls_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Align predictions with the gold mentions through the index labels; "-" is
    # the placeholder for mentions that have no prediction.
    # (The comprehension variable is named `idx` so it no longer shadows the run counter `i`.)
    code_pred_few = [
        df_pred_few["code"].loc[idx] if idx in df_pred_few.index else "-"
        for idx in df_test_gs_few.index
    ]
    assert len(code_pred_few) == df_test_gs_few.shape[0]
    # Fraction of exact code matches. The mean of the boolean vector replaces
    # `value_counts(normalize=True)[True]`, which raises KeyError when a run has
    # no correct prediction at all.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_few["code"].values) == pd.Series(code_pred_few)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# NOTE(review): this formatting keeps only the text after the decimal point of
# the rounded value, so trailing zeros and the integer part are lost
# (0.59 -> ".59", 1.0 -> ".0").
res_few_mean_std = "." + str(round(dist_res_eval['mean'], round_n)).split('.')[-1] + " ± " + \
                    "." + str(round(dist_res_eval['std'], round_n)).split('.')[-1]
res_few_max = "." + str(round(dist_res_eval['max'], round_n)).split('.')[-1]

### Additional analysis

We only select the test samples whose code-pre and code-suf have both been seen in train-dev (otherwise the model could not possibly predict the correct code):

In [75]:
# Scratch copy of the few-shot mentions; helper flag columns are added to it
# below without modifying df_test_gs_few.
df_test_gs_few_add = df_test_gs_few.copy()

In [76]:
# Flag, for every few-shot mention, whether its code prefix / suffix was seen
# in train-dev. Here "code_pre" and "code_suf" hold booleans, not code parts.
# The suffix check relies on None having been appended to train_dev_codes_suf
# in the zero-shot section.
if TYPE_TASK == 'd':
    df_test_gs_few_add["code_pre"] = df_test_gs_few_add["code"].map(
        lambda code: code.partition('.')[0] in train_dev_codes_pre
    )
    df_test_gs_few_add["code_suf"] = df_test_gs_few_add["code"].map(
        lambda code: (code.split('.')[1] if '.' in code else None) in train_dev_codes_suf
    )
else:
    df_test_gs_few_add["code_pre"] = df_test_gs_few_add["code"].map(
        lambda code: code[:4] in train_dev_codes_pre
    )
    df_test_gs_few_add["code_suf"] = df_test_gs_few_add["code"].map(
        lambda code: (code[4:7] if len(code) >= 7 else None) in train_dev_codes_suf
    )

In [77]:
# Boolean mask of the mentions whose prefix AND suffix were both seen in
# train-dev. The column-wise AND replaces the row-wise
# `apply(lambda x: x["code_pre"] == x["code_suf"] == True, axis=1)`: same
# result on the boolean flags, faster, and still a 1-D mask on an empty frame.
bool_filt_samples = (
    df_test_gs_few_add["code_pre"] & df_test_gs_few_add["code_suf"]
).values

In [78]:
# The mask is applied to the original few-shot frame, not to the scratch copy,
# so the helper flag columns are not carried over; the name is rebound to the
# filtered result.
df_test_gs_few_add = df_test_gs_few[bool_filt_samples]

In [79]:
df_test_gs_few_add.shape

(297, 7)

Finally, we only select the test samples considered by the model at the time of prediction, i.e. test annotations for which a CLS sample can be produced: 

In [80]:
# Index labels present both in the filtered gold mentions and in the predictions.
# NOTE(review): df_pred_few is whatever the few-shot evaluation loop loaded
# last (run 5); this assumes every run shares the same index — TODO confirm.
ind_pred_few_add = sorted(
    set(df_test_gs_few_add.index.values) & set(df_pred_few.index)
)

In [81]:
# Label-based selection; rows come out in ascending index order because
# ind_pred_few_add is sorted.
df_test_gs_few_add_pos = df_test_gs_few_add.loc[ind_pred_few_add]

In [82]:
df_test_gs_few_add_pos.shape

(236, 7)

In [83]:
# Accuracy over the filtered ("Pos") few-shot mentions, one value per model run (1..5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_few = pd.read_csv(
        RES_DIR + "df_test_few_preds_" + TYPE_TASK + "_hier_task_cls_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "pos_pred"])
    # Align predictions with the gold mentions through the index labels; "-" is
    # the placeholder for mentions that have no prediction.
    # (The comprehension variable is named `idx` so it no longer shadows the run counter `i`.)
    code_pred_few_add_pos = [
        df_pred_few["code"].loc[idx] if idx in df_pred_few.index else "-"
        for idx in df_test_gs_few_add_pos.index
    ]
    assert len(code_pred_few_add_pos) == df_test_gs_few_add_pos.shape[0]
    # Fraction of exact code matches. The mean of the boolean vector replaces
    # `value_counts(normalize=True)[True]`, which raises KeyError when a run has
    # no correct prediction at all.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_few_add_pos["code"].values) == pd.Series(code_pred_few_add_pos)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# NOTE(review): this formatting keeps only the text after the decimal point of
# the rounded value, so trailing zeros and the integer part are lost
# (0.59 -> ".59", 1.0 -> ".0").
res_few_add_pos_mean_std = "." + str(round(dist_res_eval['mean'], round_n)).split('.')[-1] + " ± " + \
                    "." + str(round(dist_res_eval['std'], round_n)).split('.')[-1]
res_few_add_pos_max = "." + str(round(dist_res_eval['max'], round_n)).split('.')[-1]

In [84]:
# Few-shot accuracies: "All" covers every few-shot mention, "Pos" the
# filtered subset evaluated in the additional analysis.
res_eval["Few-shot"] = dict(zip(
    ("All acc avg", "All acc max", "Pos acc avg", "Pos acc max"),
    (res_few_mean_std, res_few_max, res_few_add_pos_mean_std, res_few_add_pos_max),
))

In [85]:
# Few-shot statistics: number of distinct codes and of mentions, for all
# few-shot mentions and for the filtered ("Pos") subset.
res_stat["Few-shot"] = {
    "All codes": len(set(df_test_gs_few["code"])),
    "All ment": len(df_test_gs_few),
    "Pos codes": len(set(df_test_gs_few_add_pos["code"])),
    "Pos ment": len(df_test_gs_few_add_pos),
}

In [86]:
pd.DataFrame(res_eval).transpose()

Unnamed: 0,All acc avg,All acc max,Pos acc avg,Pos acc max
Zero-shot,.023 ± .003,0.027,.227 ± .032,0.273
Few-shot,.285 ± .007,0.295,.59 ± .014,0.61


In [87]:
pd.DataFrame(res_stat).transpose()

Unnamed: 0,All codes,All ment,Pos codes,Pos ment
Zero-shot,168,219,18,22
Few-shot,296,488,127,236
