In [1]:
# Locations of the helper modules and of the Cantemist corpus (normalization sub-task).
utils_path = "../utils/"
corpus_path = "../datasets/cantemist_v6/"
sub_task_path = "cantemist-norm/"
# Folder holding the test-set files used as gold standard.
test_gs_path = f"{corpus_path}test-set/{sub_task_path}"

In [2]:
# Standard library
import os
import sys

# Third-party. `os` and `pd` are used throughout the notebook; importing them
# explicitly means they no longer depend on leaking out of the star import below.
import pandas as pd
import tensorflow as tf

# Auxiliary components (project-local helpers found under `utils_path`).
sys.path.insert(0, utils_path)
# NOTE(review): presumably the source of format_ner_gs, load_text_files and
# process_brat_norm, which are used below without any other import. Kept as a
# star import because other cells may rely on further names from it.
from nlp_utils import *

# Folder with the prediction files read in the evaluation section.
RES_DIR = "../results/Cantemist/final_exec/"
subtask = "norm"

# Number of decimals used when formatting the reported accuracies.
round_n = 3

# GS data
df_test_gs = format_ner_gs(test_gs_path, subtask=subtask)

2023-01-11 09:50:15.551329: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


# Load data

In [3]:
# Training documents are the union of the "train-set" and "dev-set1" folders.
train_path = corpus_path + "train-set/" + sub_task_path
dev1_path = corpus_path + "dev-set1/" + sub_task_path

# Plain-text files of the train split (regular files whose last extension is "txt").
train_files = [
    fname for fname in os.listdir(train_path)
    if os.path.isfile(train_path + fname) and fname.rsplit('.', 1)[-1] == "txt"
]
n_train_files = len(train_files)
train_data = load_text_files(train_files, train_path)

# Same selection for dev-set1; names and texts are appended in the same order
# so that `train_files` and `train_data` stay aligned position by position.
dev1_files = [
    fname for fname in os.listdir(dev1_path)
    if os.path.isfile(dev1_path + fname) and fname.rsplit('.', 1)[-1] == "txt"
]
train_files.extend(dev1_files)
train_data.extend(load_text_files(dev1_files, dev1_path))

# One row per document: id (file name up to ".txt") and its loaded content.
df_text_train = pd.DataFrame(dict(
    doc_id=[fname.split('.txt')[0] for fname in train_files],
    raw_text=train_data,
))

In [4]:
# Development documents come from the "dev-set2" folder.
dev_path = corpus_path + "dev-set2/" + sub_task_path
dev_files = [
    fname for fname in os.listdir(dev_path)
    if os.path.isfile(dev_path + fname) and fname.rsplit('.', 1)[-1] == "txt"
]
dev_data = load_text_files(dev_files, dev_path)
# One row per document: id (file name up to ".txt") and its loaded content.
df_text_dev = pd.DataFrame(dict(
    doc_id=[fname.split('.txt')[0] for fname in dev_files],
    raw_text=dev_data,
))

In [5]:
# Test documents come from the "test-set" folder.
test_path = corpus_path + "test-set/" + sub_task_path
test_files = [
    fname for fname in os.listdir(test_path)
    if os.path.isfile(test_path + fname) and fname.rsplit('.', 1)[-1] == 'txt'
]
test_data = load_text_files(test_files, test_path)
# One row per document: id (file name up to ".txt") and its loaded content.
df_text_test = pd.DataFrame(dict(
    doc_id=[fname.split('.txt')[0] for fname in test_files],
    raw_text=test_data,
))

In [6]:
# Paths of every ".ann" file in the train split followed by those in dev-set1.
train_ann_files = [
    ann_dir + fname
    for ann_dir in (train_path, dev1_path)
    for fname in os.listdir(ann_dir)
    if fname.rsplit('.', 1)[-1] == "ann"
]

In [7]:
# Load the train annotations and order them by document and span position.
df_codes_train_ner = process_brat_norm(train_ann_files)
df_codes_train_ner = df_codes_train_ner.sort_values(by=["doc_id", "start", "end"])

In [8]:
# Split every code at its first '/': the part before it is the prefix and
# everything after it (empty when there is no '/') is the suffix.
df_codes_train_ner["code_pre"] = df_codes_train_ner["code"].map(lambda code: code.partition('/')[0])
df_codes_train_ner["code_suf"] = df_codes_train_ner["code"].map(lambda code: code.partition('/')[2])

In [9]:
# Sanity check: no two train annotations share the same (doc_id, start, end) span.
# `not` is used instead of `~`: bitwise inversion only acts as logical negation
# on NumPy booleans, while on a plain Python bool `~True` is -2 (truthy) and the
# assertion could never fail.
assert not df_codes_train_ner[["doc_id", "start", "end"]].duplicated().any(), \
    "duplicated (doc_id, start, end) spans in the train annotations"

In [10]:
# Independent copy of the train annotations used by the rest of the notebook.
df_codes_train_ner_final = df_codes_train_ner.copy(deep=True)

In [11]:
# Number of train annotations (rows).
print(len(df_codes_train_ner_final.index))

9737


In [12]:
# Paths of every ".ann" file in the dev-set2 folder.
dev_ann_files = [
    dev_path + fname
    for fname in os.listdir(dev_path)
    if fname.rsplit('.', 1)[-1] == "ann"
]

In [13]:
# Load the dev annotations and order them by document and span position.
df_codes_dev_ner = process_brat_norm(dev_ann_files)
df_codes_dev_ner = df_codes_dev_ner.sort_values(by=["doc_id", "start", "end"])

In [14]:
# Split every code at its first '/': the part before it is the prefix and
# everything after it (empty when there is no '/') is the suffix.
df_codes_dev_ner["code_pre"] = df_codes_dev_ner["code"].map(lambda code: code.partition('/')[0])
df_codes_dev_ner["code_suf"] = df_codes_dev_ner["code"].map(lambda code: code.partition('/')[2])

In [15]:
# Sanity check: no two dev annotations share the same (doc_id, start, end) span.
# `not` is used instead of `~`: bitwise inversion only acts as logical negation
# on NumPy booleans, while on a plain Python bool `~True` is -2 (truthy) and the
# assertion could never fail.
assert not df_codes_dev_ner[["doc_id", "start", "end"]].duplicated().any(), \
    "duplicated (doc_id, start, end) spans in the dev annotations"

In [16]:
# Independent copy of the dev annotations used by the rest of the notebook.
df_codes_dev_ner_final = df_codes_dev_ner.copy(deep=True)

In [17]:
# Number of dev annotations (rows).
print(len(df_codes_dev_ner_final.index))

2660


In [18]:
# Sorted list of the distinct code prefixes found in dev or train.
train_dev_codes_pre = sorted(
    set(df_codes_dev_ner_final["code_pre"]) | set(df_codes_train_ner_final["code_pre"])
)

In [19]:
# Number of distinct code prefixes found in train + dev.
len(train_dev_codes_pre)

307

In [20]:
# Sorted list of the distinct code suffixes found in dev or train.
train_dev_codes_suf = sorted(
    set(df_codes_dev_ner_final["code_suf"]) | set(df_codes_train_ner_final["code_suf"])
)

In [21]:
# Number of distinct code suffixes found in train + dev.
print(len(train_dev_codes_suf))

35


# Evaluation

In [22]:
# Sorted list of the distinct full codes found in dev or train.
train_dev_codes = sorted(
    set(df_codes_dev_ner_final["code"]) | set(df_codes_train_ner_final["code"])
)

In [23]:
# Number of distinct full codes found in train + dev.
len(train_dev_codes)

743

In [24]:
# Sorted list of the distinct gold-standard codes of the test set.
test_codes = sorted(df_test_gs["code_gs"].unique())

In [25]:
# Number of distinct gold-standard codes in the test set.
len(test_codes)

386

In [28]:
# Containers filled per scenario below: res_stat holds code/mention counts,
# res_eval holds the formatted accuracy strings.
res_stat, res_eval = dict(), dict()

## Zero-shot

Considering codes that appear in the test set but never in the train-dev set.

In [29]:
# Test codes that never occur in train + dev.
zero_test_codes = set(test_codes).difference(train_dev_codes)

In [30]:
# Number of test codes unseen in train + dev.
len(zero_test_codes)

107

In [31]:
# Gold-standard test mentions whose code is a zero-shot code, ordered by
# clinical case and span so they can be compared positionally with predictions.
df_test_gs_zero = df_test_gs[
    df_test_gs["code_gs"].isin(zero_test_codes)
].sort_values(by=["clinical_case", "start_pos_gs", "end_pos_gs"])

In [32]:
# (mentions, columns) of the zero-shot gold-standard subset.
df_test_gs_zero.shape

(148, 8)

In [33]:
# Accuracy on all zero-shot mentions, one value per prediction file (suffixes 1-5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_zero = pd.read_csv(RES_DIR + "df_test_zero_preds_c_hier_task_cls_mbert_galen_" + str(i) + ".csv", 
                    index_col=0, header=0, sep='\t').sort_values(by=["clinical_case", "start", "end"])
    # Gold and predicted rows are compared by position after sorting both by
    # case and span, so both frames must contain the same number of rows.
    assert df_pred_zero.shape[0] == df_test_gs_zero.shape[0]
    # Fraction of exact code matches. `.mean()` on the boolean Series yields 0.0
    # when nothing matches, whereas `value_counts(normalize=True)[True]` raises
    # KeyError in that situation.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_zero["code_gs"].values) == pd.Series(df_pred_zero["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed number of decimals with the leading zero stripped (".170", "1.000").
# Formatting via str(round(...)) dropped trailing zeros (".17") and turned 1.0 into ".0".
res_zero_mean_std = f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " + \
                    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
res_zero_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

### Additional analysis

We only select the test samples whose code-pre and code-suf have both already been seen in train-dev (otherwise the sample could not possibly be annotated correctly):

In [34]:
# Working copy of the zero-shot subset; helper columns are added to it next.
df_test_gs_zero_add = df_test_gs_zero.copy(deep=True)

In [35]:
# Despite their names, both columns hold booleans: whether the gold code's
# prefix (text before the first '/') and suffix (text after it) were each
# seen in train + dev.
df_test_gs_zero_add["code_pre"] = df_test_gs_zero_add["code_gs"].map(
    lambda code: code.partition('/')[0] in train_dev_codes_pre
)
df_test_gs_zero_add["code_suf"] = df_test_gs_zero_add["code_gs"].map(
    lambda code: code.partition('/')[2] in train_dev_codes_suf
)

In [36]:
# Boolean mask (one entry per zero-shot mention): True when both the prefix
# and the suffix flags are True. Computed as a vectorised AND instead of a
# row-wise `apply` with a chained `== True` comparison.
bool_filt_samples = (
    df_test_gs_zero_add["code_pre"].astype(bool) & df_test_gs_zero_add["code_suf"].astype(bool)
).values

In [37]:
# Keep only the zero-shot mentions selected by the mask (rows are taken from
# the original subset, so the helper flag columns are not carried over).
df_test_gs_zero_add = df_test_gs_zero.loc[bool_filt_samples]

In [38]:
# (mentions, columns) of the filtered zero-shot subset.
df_test_gs_zero_add.shape

(83, 8)

In [39]:
# Accuracy on the filtered zero-shot mentions, one value per prediction file (suffixes 1-5).
# NOTE: `bool_filt_samples` must still be the zero-shot mask here; the few-shot
# section later rebinds the same name.
arr_res_eval = []
for i in range(1, 6):
    df_pred_zero = pd.read_csv(RES_DIR + "df_test_zero_preds_c_hier_task_cls_mbert_galen_" + str(i) + ".csv", 
                    index_col=0, header=0, sep='\t').sort_values(by=["clinical_case", "start", "end"])
    # The mask is positional, so predictions must be sorted like the gold rows.
    df_pred_zero_add = df_pred_zero[bool_filt_samples]
    assert df_pred_zero_add.shape[0] == df_test_gs_zero_add.shape[0]
    # Fraction of exact code matches. `.mean()` on the boolean Series yields 0.0
    # when nothing matches, whereas `value_counts(normalize=True)[True]` raises
    # KeyError in that situation.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_zero_add["code_gs"].values) == pd.Series(df_pred_zero_add["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed number of decimals with the leading zero stripped (".170", "1.000").
# Formatting via str(round(...)) dropped trailing zeros (".17") and turned 1.0 into ".0".
res_zero_add_mean_std = f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " + \
                    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
res_zero_add_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

In [40]:
# Formatted zero-shot accuracies: "All" = every zero-shot mention,
# "Pos" = the filtered subset from the additional analysis.
res_eval["Zero-shot"] = dict(zip(
    ("All acc avg", "All acc max", "Pos acc avg", "Pos acc max"),
    (res_zero_mean_std, res_zero_max, res_zero_add_mean_std, res_zero_add_max),
))

In [41]:
# Distinct codes and mention counts for the full and filtered zero-shot subsets.
res_stat["Zero-shot"] = {
    "All codes": len(df_test_gs_zero["code_gs"].unique()),
    "All ment": len(df_test_gs_zero.index),
    "Pos codes": len(df_test_gs_zero_add["code_gs"].unique()),
    "Pos ment": len(df_test_gs_zero_add.index),
}

## Few-shot

Considering codes with absolute frequency <= 5 in the train-dev set, together with the zero-shot codes (frequency 0 in train-dev).

In [42]:
# Absolute frequency of every code over the train and dev annotations together.
dist_train_dev_codes = pd.concat(
    [df_codes_train_ner_final["code"], df_codes_dev_ner_final["code"]]
).value_counts()

In [43]:
# Sorted codes that occur at most 5 times in train + dev.
few_train_dev_codes = sorted(set(
    dist_train_dev_codes.loc[dist_train_dev_codes.le(5)].index
))

In [44]:
# Few-shot codes: the rare train + dev codes plus the zero-shot test codes.
few_codes = {*few_train_dev_codes, *zero_test_codes}

In [45]:
# Number of codes considered few-shot (rare in train + dev, or unseen there).
len(few_codes)

692

In [46]:
# Gold-standard test mentions whose code is a few-shot code, ordered by
# clinical case and span so they can be compared positionally with predictions.
df_test_gs_few = df_test_gs[
    df_test_gs["code_gs"].isin(few_codes)
].sort_values(by=["clinical_case", "start_pos_gs", "end_pos_gs"])

In [47]:
# (mentions, columns) of the few-shot gold-standard subset.
df_test_gs_few.shape

(445, 8)

In [48]:
# Accuracy on all few-shot mentions, one value per prediction file (suffixes 1-5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_few = pd.read_csv(RES_DIR + "df_test_few_preds_c_hier_task_cls_mbert_galen_" + str(i) + ".csv", 
                    index_col=0, header=0, sep='\t').sort_values(by=["clinical_case", "start", "end"])
    # Gold and predicted rows are compared by position after sorting both by
    # case and span, so both frames must contain the same number of rows.
    assert df_pred_few.shape[0] == df_test_gs_few.shape[0]
    # Fraction of exact code matches. `.mean()` on the boolean Series yields 0.0
    # when nothing matches, whereas `value_counts(normalize=True)[True]` raises
    # KeyError in that situation.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_few["code_gs"].values) == pd.Series(df_pred_few["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed number of decimals with the leading zero stripped (".170", "1.000").
# Formatting via str(round(...)) dropped trailing zeros (".17") and turned 1.0 into ".0".
res_few_mean_std = f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " + \
                    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
res_few_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

### Additional analysis

We only select the test samples whose code-pre and code-suf have both already been seen in train-dev (otherwise the sample could not possibly be annotated correctly):

In [49]:
# Working copy of the few-shot subset; helper columns are added to it next.
df_test_gs_few_add = df_test_gs_few.copy(deep=True)

In [50]:
# Despite their names, both columns hold booleans: whether the gold code's
# prefix (text before the first '/') and suffix (text after it) were each
# seen in train + dev.
df_test_gs_few_add["code_pre"] = df_test_gs_few_add["code_gs"].map(
    lambda code: code.partition('/')[0] in train_dev_codes_pre
)
df_test_gs_few_add["code_suf"] = df_test_gs_few_add["code_gs"].map(
    lambda code: code.partition('/')[2] in train_dev_codes_suf
)

In [51]:
# Boolean mask (one entry per few-shot mention): True when both the prefix
# and the suffix flags are True. Computed as a vectorised AND instead of a
# row-wise `apply` with a chained `== True` comparison.
# NOTE: this rebinds `bool_filt_samples`, replacing the zero-shot mask.
bool_filt_samples = (
    df_test_gs_few_add["code_pre"].astype(bool) & df_test_gs_few_add["code_suf"].astype(bool)
).values

In [52]:
# Keep only the few-shot mentions selected by the mask (rows are taken from
# the original subset, so the helper flag columns are not carried over).
df_test_gs_few_add = df_test_gs_few.loc[bool_filt_samples]

In [53]:
# (mentions, columns) of the filtered few-shot subset.
df_test_gs_few_add.shape

(380, 8)

In [54]:
# Accuracy on the filtered few-shot mentions, one value per prediction file (suffixes 1-5).
# NOTE: `bool_filt_samples` must be the few-shot mask here; the zero-shot
# section uses the same name for its own mask.
arr_res_eval = []
for i in range(1, 6):
    df_pred_few = pd.read_csv(RES_DIR + "df_test_few_preds_c_hier_task_cls_mbert_galen_" + str(i) + ".csv", 
                    index_col=0, header=0, sep='\t').sort_values(by=["clinical_case", "start", "end"])
    # The mask is positional, so predictions must be sorted like the gold rows.
    df_pred_few_add = df_pred_few[bool_filt_samples]
    assert df_pred_few_add.shape[0] == df_test_gs_few_add.shape[0]
    # Fraction of exact code matches. `.mean()` on the boolean Series yields 0.0
    # when nothing matches, whereas `value_counts(normalize=True)[True]` raises
    # KeyError in that situation.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_few_add["code_gs"].values) == pd.Series(df_pred_few_add["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed number of decimals with the leading zero stripped (".170", "1.000").
# Formatting via str(round(...)) dropped trailing zeros (".17") and turned 1.0 into ".0".
res_few_add_mean_std = f"{dist_res_eval['mean']:.{round_n}f}".lstrip("0") + " ± " + \
                    f"{dist_res_eval['std']:.{round_n}f}".lstrip("0")
res_few_add_max = f"{dist_res_eval['max']:.{round_n}f}".lstrip("0")

In [55]:
# Formatted few-shot accuracies: "All" = every few-shot mention,
# "Pos" = the filtered subset from the additional analysis.
res_eval["Few-shot"] = dict(zip(
    ("All acc avg", "All acc max", "Pos acc avg", "Pos acc max"),
    (res_few_mean_std, res_few_max, res_few_add_mean_std, res_few_add_max),
))

In [56]:
# Distinct codes and mention counts for the full and filtered few-shot subsets.
res_stat["Few-shot"] = {
    "All codes": len(df_test_gs_few["code_gs"].unique()),
    "All ment": len(df_test_gs_few.index),
    "Pos codes": len(df_test_gs_few_add["code_gs"].unique()),
    "Pos ment": len(df_test_gs_few_add.index),
}

In [57]:
# Summary table of code/mention counts: one row per scenario.
pd.DataFrame(res_stat).T

Unnamed: 0,All codes,All ment,Pos codes,Pos ment
Zero-shot,107,148,66,83
Few-shot,254,445,213,380


In [58]:
# Summary table of formatted accuracies: one row per scenario.
pd.DataFrame(res_eval).T

Unnamed: 0,All acc avg,All acc max,Pos acc avg,Pos acc max
Zero-shot,.17 ± .013,0.182,.304 ± .023,0.325
Few-shot,.544 ± .014,0.562,.637 ± .016,0.658
