In [1]:
MODEL_NAME = "bsc-bio-ehr-es"
SEQ_LEN = 128
LR = 3e-5

In [2]:
import os

In [3]:
ROOT_PATH = "../"
model_root_path = os.path.join(
    ROOT_PATH,
    "models"
)

text_path = "../data/text.txt"

In [4]:
from transformers import BertTokenizerFast, XLMRobertaTokenizerFast, RobertaTokenizerFast

# All variables that depend on model_name

    
if MODEL_NAME == 'bsc-bio-ehr-es':
    model_path = os.path.join(
        model_root_path,
        "RoBERTa/pytorch",
        MODEL_NAME
    )
    tokenizer = RobertaTokenizerFast.from_pretrained(model_path, do_lower_case=False)
    
else:
    raise Exception("ERROR: NO AVAILABLE MODEL!!")

2023-08-24 17:23:09.431339: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [5]:
import tensorflow as tf

import time

import pandas as pd
import numpy as np

# Auxiliary components
import sys
sys.path.append(
    os.path.join(ROOT_PATH, "src")
)

import utils.ner.load_data as load_data
import utils.ner.pre_process as pre_proc
import utils.ner.post_process as post_proc
import utils.tf.loss as tf_loss

# Hyper-parameters
text_col = "raw_text"
GREEDY = True
IGNORE_VALUE = -100
LOGITS = False
ROUND_N = 4

custom_tokenizer = pre_proc.TransformersTokenizer(
    tokenizer=tokenizer, ign_value=IGNORE_VALUE
)

# IOB labels
B_VAL, I_VAL, EMPTY_VAL = "B", "I", "O"
ALLOW_IN_AS_BEGIN = False

custom_tokenizer = pre_proc.TransformersTokenizer(
    tokenizer=tokenizer, ign_value=IGNORE_VALUE
)

random_seed = 0
tf.random.set_seed(random_seed)

## 1. Load text

### Test

In [6]:
arr_text = []
with open(text_path, "r") as file:
    arr_text.append(file.read())
df_text = pd.DataFrame({'doc_id': ["text"], 'raw_text': arr_text})

## 2. Data pre-processing

We generate the valid inputs to the model.

In [10]:
# Create label encoders as dict (more computationally efficient)
lab_encoder = {B_VAL: 0, I_VAL: 1, EMPTY_VAL: 2}
lab_decoder = {0: B_VAL, 1: I_VAL, 2: EMPTY_VAL}

We define the custom pre-processing objects:

In [11]:
custom_annotator = pre_proc.AnnotatorContinuous(
    labeler=pre_proc.LabelerIOB(
        empty_val=EMPTY_VAL,
        begin_val=B_VAL,
        inside_val=I_VAL
    )
)

sub_lab_converter = pre_proc.AllSubLabel()

### Test

In [13]:
doc_list = sorted(set(df_text["doc_id"]))

In [17]:
df_empty = pd.DataFrame({
    "doc_id": []
})

In [18]:
start_time = time.time()

In [19]:
text_tok_dict, text_y, text_frag, text_start_end_frag, \
                text_word_id = pre_proc.create_input_data(df_text=df_text, text_col=text_col, 
                                    df_ann=df_empty,
                                    arr_doc=doc_list, ss_dict=None,
                                    tokenizer=custom_tokenizer, 
                                    arr_lab_encoder=[lab_encoder], 
                                    seq_len=SEQ_LEN,
                                    annotator=custom_annotator,
                                    sub_lab_converter=sub_lab_converter,
                                    greedy=GREEDY)

In [20]:
end_time = time.time()

In [21]:
print("\n1. Exec time of pre-processing documents (in mins):", (end_time - start_time) / 60, "\n")


1. Exec time of pre-processing documents (in mins): 0.020047342777252196 



In [22]:
text_ind, text_att = text_tok_dict['input_ids'], text_tok_dict['attention_mask']

In [None]:
# here guille now: create for loop for different clinical entities

In [27]:
arr_ent_type = ["enfermedad", "procedimiento"]

In [28]:
entity_type = arr_ent_type[0]

## 3. Model Loading

In [23]:
from transformers import TFBertForTokenClassification, TFXLMRobertaForTokenClassification, TFRobertaForTokenClassification 

if MODEL_NAME.split('_')[0] in ('beto', 'mbert'):
    model = TFBertForTokenClassification.from_pretrained(model_path, from_pt=True)
    
elif MODEL_NAME.split('_')[0] == 'xlmr':
    model = TFXLMRobertaForTokenClassification.from_pretrained(model_path, from_pt=True)

elif MODEL_NAME in ('bsc-bio-ehr-es', 'roberta-base-bne', 'roberta-large-bne'):
    model = TFRobertaForTokenClassification.from_pretrained(model_path, from_pt=True)

2023-08-24 17:26:10.398659: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-08-24 17:26:10.430654: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-24 17:26:10.431141: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.6575GHz coreCount: 28 deviceMemorySize: 10.91GiB deviceMemoryBandwidth: 451.17GiB/s
2023-08-24 17:26:10.431163: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-08-24 17:26:10.434079: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-08-24 17:26:10.434121: I tensorflow/stream_executo

In [24]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.initializers import GlorotUniform

iob_num_labels = len(lab_encoder)

input_ids = Input(shape=(SEQ_LEN,), name='input_ids', dtype='int64')
attention_mask = Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int64')

out_seq = model.layers[0](input_ids=input_ids, attention_mask=attention_mask)[0] # take the output sub-token sequence 

# IOB-2
out_iob = Dense(units=iob_num_labels, kernel_initializer=GlorotUniform(seed=random_seed))(out_seq) # Multi-class classification 
out_iob_model = Activation(activation='softmax', name='iob_output')(out_iob)

model = Model(inputs=[input_ids, attention_mask], outputs=out_iob_model)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [25]:
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
roberta (TFRobertaMainLayer)    TFBaseModelOutputWit 124052736   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 128, 3)       2307        roberta[0][0]                

In [31]:
import tensorflow_addons as tfa
"""
optimizer = tfa.optimizers.RectifiedAdam(learning_rate=LR)
loss = {'iob_output': tf_loss.TokenClassificationLoss(
    from_logits=LOGITS, ignore_val=IGNORE_VALUE
)}
loss_weights = {'iob_output': 1}
model.compile(optimizer=optimizer, loss=loss, loss_weights=loss_weights)
"""
# Load model weights
model.load_weights(
    os.path.join(
        model_root_path,
        "model_checkpoints",
        entity_type
    )
)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa66074eac0>

## 4. Model predictions

### Test

In [32]:
start_time = time.time()

In [34]:
text_preds = model.predict({'input_ids': text_ind, 'attention_mask': text_att})

2023-08-24 17:35:17.608131: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-08-24 17:35:17.628702: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3199980000 Hz


In [26]:
end_time = time.time()

In [27]:
print("\n2. Exec time of making predictions (in mins):", (end_time - start_time) / 60, "\n")

2. Exec time of making predictions (in mins): 0.09016862710316977


## 5. Data post-processing

We post-process the models predictions to generate valid annotations.

We define the custom post-processing objects:

In [28]:
word_preds_converter = post_proc.ProdWordPreds()

custom_ann_extractor = post_proc.AnnExtractorContinuous(
    lab_extractor=post_proc.LabExtractorIOB(
        arr_lab_decoder=[lab_decoder],
        empty_val=EMPTY_VAL,
        begin_val=B_VAL,
        inside_val=I_VAL
    ), 
    allow_inside_as_begin=ALLOW_IN_AS_BEGIN
)

custom_preds_frag_tok = post_proc.NeuralPredsFragTok(
    tokenizer=custom_tokenizer
)

In [29]:
valid_codes = set(map(lambda k: k.split('\t')[0], open(codes_path).readlines()))

In [30]:
subtask = 'ner'

In [None]:
start_time = time.time()

In [31]:
df_pred_test = post_proc.extract_annotations_from_model_preds(arr_doc=test_doc_list, arr_frags=test_frag,
                                      arr_preds=[test_preds],
                                      arr_start_end=test_start_end_frag, arr_word_id=test_word_id,
                                      arr_preds_pos_tok=custom_preds_frag_tok.calculate_pos_tok(
                                          arr_len=test_start_end_frag
                                      ),
                                      ann_extractor=custom_ann_extractor,
                                      word_preds_converter=word_preds_converter)

In [32]:
df_pred_test = metrics.format_distemist_preds(
    df_preds=df_pred_test,
    df_text=df_text_test,
    text_col=text_col
)

In [33]:
df_pred_test = metrics.format_distemist_df(df=df_pred_test, valid_codes=valid_codes)


According to file headers, you are on subtask ner, predictions file


In [34]:
# Save preds
df_pred_test.to_csv(
    RES_DIR + "df_pred_test.tsv", header=True, index=False, sep='\t'
)

In [None]:
end_time = time.time()

In [None]:
print("\n3. Exec time of post-processing predictions (in mins):", (end_time - start_time) / 60, "\n")

In [35]:
df_test_gs = metrics.format_distemist_df(
    df=pd.read_csv(
        corpus_path + "test_annotated/" + subtask_path.replace('training', 'test'), 
        header=0, sep="\t"
    ),
    valid_codes=valid_codes
)


According to file headers, you are on subtask ner, GS file


In [36]:
p, r, f1 = metrics.calculate_distemist_metrics(gs=df_test_gs, pred=df_pred_test, subtask=subtask)

In [38]:
print("\nmicro-avg P = ", round(p, ROUND_N), " | micro-avg R = ", round(r, ROUND_N), 
      " | micro-avg F1 = ", round(f1, ROUND_N), "\n", sep="")

micro-avg P = 0.8049 | micro-avg R = 0.764 | micro-avg F1 = 0.784
