In [1]:
# Locations of the helper modules and of the corpus, relative to this notebook.
utils_path = "../utils/"
corpus_path = "../datasets/cantemist_v6/"
sub_task_path = "cantemist-norm/"
# Test-set folder of the selected sub-task (passed to format_ner_gs below).
test_gs_path = "".join((corpus_path, "test-set/", sub_task_path))

In [2]:
import os
import sys

import pandas as pd
import tensorflow as tf

# Auxiliary components
sys.path.insert(0, utils_path)
from nlp_utils import *

# Directory holding the prediction CSV files evaluated in this notebook.
RES_DIR = "../results/Cantemist/final_exec/"
# Sub-task identifier forwarded to format_ner_gs.
subtask = "norm"

# Number of decimals kept when the accuracies are rounded for reporting.
round_n = 3

# GS data: reference annotations of the test set.
# NOTE(review): "GS" presumably stands for gold standard; format_ner_gs comes
# from nlp_utils and is not visible here. The evaluation cells read its
# clinical_case, start_pos_gs, end_pos_gs, span and code_gs columns.
df_test_gs = format_ner_gs(test_gs_path, subtask=subtask)

2023-01-11 09:11:03.051646: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


# Load data

In [3]:
# Collect every ".txt" file of the training split and load it into a
# (doc_id, raw_text) frame.
train_path = corpus_path + "train-set/" + sub_task_path
train_files = [
    fname
    for fname in os.listdir(train_path)
    if fname.split('.')[-1] == "txt" and os.path.isfile(train_path + fname)
]
n_train_files = len(train_files)
train_data = load_text_files(train_files, train_path)
# The document id is the part of the file name preceding ".txt".
df_text_train = pd.DataFrame({
    'doc_id': [fname.split('.txt')[0] for fname in train_files],
    'raw_text': train_data,
})

In [4]:
# Same procedure as for the training split, applied to the test split.
test_path = corpus_path + "test-set/" + sub_task_path
test_files = [
    fname
    for fname in os.listdir(test_path)
    if fname.split('.')[-1] == 'txt' and os.path.isfile(test_path + fname)
]
test_data = load_text_files(test_files, test_path)
# The document id is the part of the file name preceding ".txt".
df_text_test = pd.DataFrame({
    'doc_id': [fname.split('.txt')[0] for fname in test_files],
    'raw_text': test_data,
})

In [5]:
# Full paths of every ".ann" annotation file found in the training folder.
train_ann_files = [
    train_path + fname
    for fname in os.listdir(train_path)
    if fname.split('.')[-1] == "ann"
]

In [6]:
# Training annotations, ordered by document and character span.
# NOTE(review): process_brat_norm comes from nlp_utils and is not visible here;
# presumably it yields one row per annotation. Later cells rely on its
# doc_id, start, end, code and text_ref columns.
df_codes_train_ner = process_brat_norm(train_ann_files).sort_values(["doc_id", "start", "end"])

In [7]:
# Split every code at its first '/': "code_pre" is the part before it,
# "code_suf" everything after it (empty string when the code has no '/').
df_codes_train_ner["code_pre"] = df_codes_train_ner["code"].apply(lambda code: code.partition('/')[0])
df_codes_train_ner["code_suf"] = df_codes_train_ner["code"].apply(lambda code: code.partition('/')[2])

In [8]:
# No two training annotations may share the same (doc_id, start, end) span.
# `not` is used instead of `~`: bitwise inversion only acts as a logical
# negation on NumPy booleans; on a plain Python bool `~True` is -2 (truthy),
# which would make this assertion impossible to fail.
assert not df_codes_train_ner[["doc_id", "start", "end"]].duplicated().any()

In [9]:
# Independent copy of the training annotations, used as the reference for the
# codes and mentions seen during training.
df_codes_train_ner_final = df_codes_train_ner.copy()

In [10]:
# Number of training annotations.
print(len(df_codes_train_ner_final))

6396


In [11]:
# Sorted list of the distinct code prefixes present in the training annotations.
train_codes_pre = sorted(df_codes_train_ner_final["code_pre"].unique())

In [12]:
# Number of distinct code prefixes seen in training.
len(train_codes_pre)

224

In [13]:
# Sorted list of the distinct code suffixes present in the training annotations.
train_codes_suf = sorted(df_codes_train_ner_final["code_suf"].unique())

In [14]:
# Number of distinct code suffixes seen in training.
print(len(train_codes_suf))

34


# Evaluation

In [15]:
# Result containers, keyed by evaluation setting ("Full", "Filtering"):
# res_stat receives code/mention counts, res_eval the formatted accuracies.
res_stat, res_eval = dict(), dict()

## Full

In [16]:
# Sort the reference annotations by document and span: the prediction files
# are sorted by the equivalent keys and then compared to this frame by position.
df_test_gs_full = df_test_gs.sort_values(by=["clinical_case", "start_pos_gs", "end_pos_gs"])

In [17]:
# (number of test mentions, number of columns)
df_test_gs_full.shape

(3635, 8)

In [18]:
# Accuracy on the full test set, one value per prediction file (numbered 1 to 5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_full = pd.read_csv(
        RES_DIR + "df_test_full_preds_c_hier_task_cls_train_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "start", "end"])
    # Reference and predictions are compared row by row, so their lengths must match.
    # NOTE(review): only the length is checked; the row alignment itself relies on
    # both frames being sorted by (document, start, end).
    assert df_pred_full.shape[0] == df_test_gs_full.shape[0]
    # The mean of the boolean match vector is the accuracy. Unlike
    # value_counts(normalize=True)[True], it does not raise KeyError when a
    # file contains no correct prediction.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_full["code_gs"].values) == pd.Series(df_pred_full["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-precision formatting keeps trailing zeros (".640" rather than ".64")
# and stays correct for values such as 1.0 or 1e-05; only the leading zero
# is stripped.
res_full_mean_std = "{:.{prec}f}".format(dist_res_eval['mean'], prec=round_n).lstrip("0") + " ± " + \
                    "{:.{prec}f}".format(dist_res_eval['std'], prec=round_n).lstrip("0")
res_full_max = "{:.{prec}f}".format(dist_res_eval['max'], prec=round_n).lstrip("0")

### Additional analysis

We keep only the test samples whose code-pre and code-suf were both already seen in train-dev (otherwise the sample could not possibly be annotated correctly):

In [19]:
# Work on a copy so the helper columns added next do not touch df_test_gs_full.
df_test_gs_full_add = df_test_gs_full.copy()

In [20]:
# Boolean flags: is the prefix (text before the first '/') / the suffix
# (text after the first '/') of the reference code among those seen in training?
df_test_gs_full_add["code_pre"] = df_test_gs_full_add["code_gs"].apply(
    lambda code: code.partition('/')[0] in train_codes_pre
)
df_test_gs_full_add["code_suf"] = df_test_gs_full_add["code_gs"].apply(
    lambda code: code.partition('/')[2] in train_codes_suf
)

In [21]:
# Positional mask of the samples whose code prefix AND suffix were both seen
# in training. A vectorised AND of the two boolean columns replaces the
# row-wise apply and always yields a 1-D array, even for an empty frame.
bool_filt_samples = (
    df_test_gs_full_add["code_pre"] & df_test_gs_full_add["code_suf"]
).values

In [22]:
# Keep only the masked rows of the sorted reference frame; rebinding the name
# discards the temporary code_pre / code_suf helper columns.
df_test_gs_full_add = df_test_gs_full[bool_filt_samples]

In [23]:
# (number of test mentions whose code parts were seen in training, number of columns)
df_test_gs_full_add.shape

(3504, 8)

In [24]:
# Accuracy restricted to the samples selected by bool_filt_samples, one value
# per prediction file (numbered 1 to 5).
# NOTE(review): bool_filt_samples is reassigned in the "Filtering" section, so
# this cell must run while it still holds the mask built from the full test set.
arr_res_eval = []
for i in range(1, 6):
    df_pred_full = pd.read_csv(
        RES_DIR + "df_test_full_preds_c_hier_task_cls_train_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "start", "end"])
    # The mask was built on the sorted reference frame and is applied by
    # position to the identically sorted predictions.
    df_pred_full_add = df_pred_full[bool_filt_samples]
    assert df_pred_full_add.shape[0] == df_test_gs_full_add.shape[0]
    # The mean of the boolean match vector is the accuracy. Unlike
    # value_counts(normalize=True)[True], it does not raise KeyError when a
    # file contains no correct prediction.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_full_add["code_gs"].values) == pd.Series(df_pred_full_add["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-precision formatting keeps trailing zeros (".640" rather than ".64")
# and stays correct for values such as 1.0 or 1e-05; only the leading zero
# is stripped.
res_full_add_mean_std = "{:.{prec}f}".format(dist_res_eval['mean'], prec=round_n).lstrip("0") + " ± " + \
                    "{:.{prec}f}".format(dist_res_eval['std'], prec=round_n).lstrip("0")
res_full_add_max = "{:.{prec}f}".format(dist_res_eval['max'], prec=round_n).lstrip("0")

In [25]:
# Formatted accuracies for the full test set: "All" covers every test mention,
# "Pos" only the mentions whose code prefix and suffix were seen in training.
res_eval["Full"] = {
    "All acc avg": res_full_mean_std, 
    "All acc max": res_full_max, 
    "Pos acc avg": res_full_add_mean_std,
    "Pos acc max": res_full_add_max
}

In [26]:
# Distinct reference codes and mention counts, for all test mentions ("All")
# and for the subset whose code parts were seen in training ("Pos").
res_stat["Full"] = {
    "All codes": len(set(df_test_gs_full["code_gs"])),
    "All ment": len(df_test_gs_full),
    "Pos codes": len(set(df_test_gs_full_add["code_gs"])),
    "Pos ment": len(df_test_gs_full_add),
}

## Filtering

We consider only the test mentions whose text does not appear (case-insensitively) among the training mentions.

In [27]:
# Sorted, lower-cased, de-duplicated mention texts of the training annotations.
train_mentions = sorted({
    mention.lower() for mention in df_codes_train_ner_final["text_ref"].values
})

In [28]:
# Number of distinct (lower-cased) training mentions.
len(train_mentions)

1978

In [29]:
# Test mentions whose lower-cased text never occurs among the training
# mentions, sorted by document and span like the full reference frame.
df_test_gs_filt = (
    df_test_gs
    .loc[df_test_gs["span"].apply(lambda mention: mention.lower() not in train_mentions)]
    .sort_values(by=["clinical_case", "start_pos_gs", "end_pos_gs"])
)

In [30]:
# (number of test mentions unseen in training, number of columns)
df_test_gs_filt.shape

(998, 8)

In [31]:
# Accuracy on the filtered test set (mentions unseen in training), one value
# per prediction file (numbered 1 to 5).
arr_res_eval = []
for i in range(1, 6):
    df_pred_filt = pd.read_csv(
        RES_DIR + "df_test_filt_preds_c_hier_task_cls_train_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "start", "end"])
    # Reference and predictions are compared row by row, so their lengths must match.
    # NOTE(review): only the length is checked; the row alignment itself relies on
    # both frames being sorted by (document, start, end).
    assert df_pred_filt.shape[0] == df_test_gs_filt.shape[0]
    # The mean of the boolean match vector is the accuracy. Unlike
    # value_counts(normalize=True)[True], it does not raise KeyError when a
    # file contains no correct prediction.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_filt["code_gs"].values) == pd.Series(df_pred_filt["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-precision formatting keeps trailing zeros (".640" rather than ".64")
# and stays correct for values such as 1.0 or 1e-05; only the leading zero
# is stripped.
res_filt_mean_std = "{:.{prec}f}".format(dist_res_eval['mean'], prec=round_n).lstrip("0") + " ± " + \
                    "{:.{prec}f}".format(dist_res_eval['std'], prec=round_n).lstrip("0")
res_filt_max = "{:.{prec}f}".format(dist_res_eval['max'], prec=round_n).lstrip("0")

### Additional analysis

We keep only the test samples whose code-pre and code-suf were both already seen in train-dev (otherwise the sample could not possibly be annotated correctly):

In [32]:
# Work on a copy so the helper columns added next do not touch df_test_gs_filt.
df_test_gs_filt_add = df_test_gs_filt.copy()

In [33]:
# Boolean flags: is the prefix (text before the first '/') / the suffix
# (text after the first '/') of the reference code among those seen in training?
df_test_gs_filt_add["code_pre"] = df_test_gs_filt_add["code_gs"].apply(
    lambda code: code.partition('/')[0] in train_codes_pre
)
df_test_gs_filt_add["code_suf"] = df_test_gs_filt_add["code_gs"].apply(
    lambda code: code.partition('/')[2] in train_codes_suf
)

In [34]:
# Positional mask of the filtered samples whose code prefix AND suffix were
# both seen in training. A vectorised AND of the two boolean columns replaces
# the row-wise apply and always yields a 1-D array, even for an empty frame.
# This rebinds bool_filt_samples, previously holding the full-test-set mask.
bool_filt_samples = (
    df_test_gs_filt_add["code_pre"] & df_test_gs_filt_add["code_suf"]
).values

In [35]:
# Keep only the masked rows of the filtered reference frame; rebinding the
# name discards the temporary code_pre / code_suf helper columns.
df_test_gs_filt_add = df_test_gs_filt[bool_filt_samples]

In [36]:
# (number of unseen test mentions whose code parts were seen in training, number of columns)
df_test_gs_filt_add.shape

(868, 8)

In [37]:
# Accuracy on the filtered test set restricted to the samples selected by
# bool_filt_samples, one value per prediction file (numbered 1 to 5).
# NOTE(review): bool_filt_samples is also assigned in the "Full" section, so
# this cell must run while it holds the mask built from the filtered test set.
arr_res_eval = []
for i in range(1, 6):
    df_pred_filt = pd.read_csv(
        RES_DIR + "df_test_filt_preds_c_hier_task_cls_train_mbert_galen_" + str(i) + ".csv",
        index_col=0, header=0, sep='\t'
    ).sort_values(by=["clinical_case", "start", "end"])
    # The mask was built on the sorted reference frame and is applied by
    # position to the identically sorted predictions.
    df_pred_filt_add = df_pred_filt[bool_filt_samples]
    assert df_pred_filt_add.shape[0] == df_test_gs_filt_add.shape[0]
    # The mean of the boolean match vector is the accuracy. Unlike
    # value_counts(normalize=True)[True], it does not raise KeyError when a
    # file contains no correct prediction.
    arr_res_eval.append(
        (
            pd.Series(df_test_gs_filt_add["code_gs"].values) == pd.Series(df_pred_filt_add["code_pred"].values)
        ).mean()
    )
dist_res_eval = pd.Series(arr_res_eval).describe()
# Fixed-precision formatting keeps trailing zeros (".640" rather than ".64")
# and stays correct for values such as 1.0 or 1e-05; only the leading zero
# is stripped.
res_filt_add_mean_std = "{:.{prec}f}".format(dist_res_eval['mean'], prec=round_n).lstrip("0") + " ± " + \
                    "{:.{prec}f}".format(dist_res_eval['std'], prec=round_n).lstrip("0")
res_filt_add_max = "{:.{prec}f}".format(dist_res_eval['max'], prec=round_n).lstrip("0")

In [38]:
# Formatted accuracies for the filtered test set: "All" covers every unseen
# mention, "Pos" only those whose code prefix and suffix were seen in training.
res_eval["Filtering"] = {
    "All acc avg": res_filt_mean_std, 
    "All acc max": res_filt_max, 
    "Pos acc avg": res_filt_add_mean_std,
    "Pos acc max": res_filt_add_max
}

In [39]:
# Distinct reference codes and mention counts, for all filtered test mentions
# ("All") and for the subset whose code parts were seen in training ("Pos").
res_stat["Filtering"] = {
    "All codes": len(set(df_test_gs_filt["code_gs"])),
    "All ment": len(df_test_gs_filt),
    "Pos codes": len(set(df_test_gs_filt_add["code_gs"])),
    "Pos ment": len(df_test_gs_filt_add),
}

In [40]:
# One row per evaluation setting, one column per statistic.
pd.DataFrame(res_stat).T

Unnamed: 0,All codes,All ment,Pos codes,Pos ment
Full,386,3635,307,3504
Filtering,332,998,254,868


In [41]:
# One row per evaluation setting, one column per reported accuracy.
pd.DataFrame(res_eval).T

Unnamed: 0,All acc avg,All acc max,Pos acc avg,Pos acc max
Full,.894 ± .002,0.896,.927 ± .002,0.929
Filtering,.64 ± .004,0.645,.735 ± .005,0.742
