In [1]:
#Import statements
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import gc

# Token sequence length used for every model input layer defined in this notebook.
max_length = 200

# Report how many GPUs TensorFlow can see before any model is built.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


2024-07-31 17:54:46.167809: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 17:54:46.181039: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 17:54:46.181058: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-31 17:54:46.190495: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


2024-07-31 17:54:49.110217: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-31 17:54:49.119014: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-31 17:54:49.125000: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [2]:
# Checkpoint names: one for the encoder weights, one for the tokenizer vocabulary.
model_checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer_checkpoint = 'bert-base-cased'

bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

# Size of the tokenizer vocabulary. The model's loss and accuracy functions
# subtract this value from the labels before comparing them with predictions.
offset = bert_tokenizer.vocab_size


In [3]:
# Load the label vocabulary files.
# Each file is read exactly once; the original cell parsed
# reverse_dictionary.json twice through a copy-pasted duplicate block.

# label_dict: JSON keys -> values coerced to int.
with open('../vocabulary/dictionary.json', 'r') as f:
    label_dict = {key: int(value) for key, value in json.load(f).items()}

# reverse_label_dict: JSON object keys are always strings, so they are cast
# back to int to allow lookups by integer id.
with open('../vocabulary/reverse_dictionary.json', 'r') as f:
    reverse_label_dict = {int(key): value for key, value in json.load(f).items()}

# label_names: used as-is; its length is the default number of output classes
# and it is indexed by the predicted class index later in the notebook.
with open('../vocabulary/label_names.json', 'r') as f:
    label_names = json.load(f)
    

In [4]:
class ExtractAbbreviationHiddenStates(tf.keras.layers.Layer):
    """
    Custom layer that keeps only the abbreviation span of the BERT
    hidden states and zeroes out every other sequence position.

    ``inputs`` is a 3-element sequence:
    ``[last_hidden_state, start_positions, end_positions]``.
    A sequence position ``p`` is kept when ``start <= p < end`` (half-open
    span); all other positions are replaced by zeros. The result has the
    same shape as ``last_hidden_state``.

    In this notebook the position inputs are int32 tensors of shape
    (batch, 1), which broadcast against the (batch, sequence) position grid.
    """
    def call(self, inputs):
        hidden_states, span_starts, span_ends = inputs

        dims = tf.shape(hidden_states)
        n_rows, n_positions = dims[0], dims[1]

        # Grid of position indices: every row holds 0 .. n_positions - 1.
        position_ids = tf.tile(tf.expand_dims(tf.range(n_positions), axis=0), [n_rows, 1])

        # True exactly where the position lies inside [start, end).
        inside_span = tf.logical_and(
            tf.greater_equal(position_ids, span_starts),
            tf.less(position_ids, span_ends),
        )

        # Add a trailing axis so the mask broadcasts over the hidden dimension.
        return tf.where(inside_span[..., tf.newaxis], hidden_states, tf.zeros_like(hidden_states))

def create_bert_multiclass_model(checkpoint = model_checkpoint,
                                 num_classes = len(label_names),
                                 learning_rate=0.00005):
    """
    Build and compile a BERT-based multiclass classifier that predicts from
    the hidden states of the abbreviation span only.

    Architecture, in order:
      1. Five int32 inputs: input_ids, token_type_ids and attention_mask of
         shape (max_length,), plus the abbreviation start and end token
         positions of shape (1,).
      2. A TFBertModel loaded from ``checkpoint`` (with ``from_pt=True``),
         marked trainable.
      3. ExtractAbbreviationHiddenStates, which zeroes every position
         outside [start, end).
      4. A mean over the sequence axis of the masked hidden states.
      5. A softmax Dense layer with ``num_classes`` units.

    Args:
        checkpoint: Name or path passed to ``TFBertModel.from_pretrained``.
        num_classes: Number of output classes. The default is evaluated once,
            when this ``def`` statement runs, from ``len(label_names)``.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` with a single softmax output.

    Notes:
        * ``tf.keras.backend.clear_session()`` is called first, so building a
          model resets the global Keras state.
        * Labels passed to the loss/metric are shifted down by the tokenizer
          vocabulary size before being compared with the predictions.
        * NOTE(review): the weights are restored from an HDF5 file later in
          the notebook; that presumably depends on the layer creation order
          here. Confirm before reordering or renaming layers.
    """
    tf.keras.backend.clear_session()

    # Model inputs: three token-level sequences and two scalar span boundaries.
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')
    start_abbrev_token_positions = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name='start_abbreviation_token_positions_layer')
    end_abbrev_token_positions = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name='end_abbreviation_token_positions_layer')

    # Only the three token-level inputs are fed to BERT.
    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # from_pt=True asks transformers to load the checkpoint's PyTorch weights.
    bert_model = TFBertModel.from_pretrained(checkpoint, from_pt=True)
    bert_model.trainable = True

    bert_out = bert_model(bert_inputs)

    last_hidden_state = bert_out.last_hidden_state

    # Zero out every position that is not part of the abbreviation span.
    span_hidden_states = ExtractAbbreviationHiddenStates()([last_hidden_state, start_abbrev_token_positions, end_abbrev_token_positions])

    # The mean runs over ALL sequence positions, including the zeroed ones, so
    # this is the span sum divided by the sequence length, not by the span length.
    pooled_output = tf.reduce_mean(span_hidden_states, axis=1)

    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(pooled_output)

    classification_model = tf.keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask, start_abbrev_token_positions, end_abbrev_token_positions],
        outputs=[classification],
    )

    # The default `offset` is bound to the enclosing scope's `offset` when this
    # inner function is defined.
    def custom_loss(y_true, y_pred, offset=offset):
        # Shift labels down by the offset before computing the sparse loss.
        y_true = y_true - offset
        # from_logits=False because the output layer already applies softmax.
        return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=False)
    
    def custom_accuracy(y_true, y_pred):
        # Same label shift as custom_loss, but the offset is read from the
        # tokenizer on every call rather than taken from a default argument.
        offset = bert_tokenizer.vocab_size
        y_true_adjusted = y_true - offset
        return tf.keras.metrics.sparse_categorical_accuracy(y_true_adjusted, y_pred)
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=custom_loss,
                                 metrics=[custom_accuracy])

    return classification_model


In [5]:
# Rebuild the architecture with its default arguments, then restore the
# fine-tuned weights from disk and print the layer summary.
weights_path = '../models/20240729_bio_bert_ft_weights.hdf5'

model = create_bert_multiclass_model()
model.load_weights(weights_path)
model.summary()


2024-07-31 17:54:49.328591: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-31 17:54:49.338414: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-31 17:54:49.346416: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 200)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 200)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 200)]                0         []                            
 tLayer)                                                                                      

In [6]:
# Load the pre-tokenized test arrays (one .npy file per model input, plus labels).
test_input_ids = np.load('../tokenized_medal_inputs/test_input_ids.npy')
test_token_type_ids = np.load('../tokenized_medal_inputs/test_token_type_ids.npy')
test_attention_masks = np.load('../tokenized_medal_inputs/test_attention_masks.npy')
test_start_positions = np.load('../tokenized_medal_inputs/test_start_positions.npy')
test_end_positions = np.load('../tokenized_medal_inputs/test_end_positions.npy')
test_labels = np.load('../tokenized_medal_inputs/test_labels.npy')

# Evaluate on a random subset of at most 1000 examples.
# - A seeded generator makes the subset identical on every Restart & Run All.
# - replace=False guarantees no example is drawn (and scored) more than once;
#   np.random.choice defaults to sampling WITH replacement.
# - The size is clamped so the cell also works on test sets under 1000 rows.
sample_rng = np.random.default_rng(42)
n_samples = min(1000, test_input_ids.shape[0])
samples = sample_rng.choice(test_input_ids.shape[0], size=n_samples, replace=False)

# Apply the same row selection to every array so they stay aligned.
test_input_ids = test_input_ids[samples]
test_token_type_ids = test_token_type_ids[samples]
test_attention_masks = test_attention_masks[samples]
test_start_positions = test_start_positions[samples]
test_end_positions = test_end_positions[samples]
test_labels = test_labels[samples]

# Order must match the `inputs=[...]` list of the Keras model.
test_inputs = [
    test_input_ids,
    test_token_type_ids,
    test_attention_masks,
    test_start_positions,
    test_end_positions,
]

predictions = model.predict(test_inputs)
# Index of the highest-probability class for each example.
y_pred = np.argmax(predictions, axis=-1)
# NOTE(review): labels are kept exactly as stored. The model's loss/metric
# subtract the tokenizer vocab size from labels, so y_true is presumably in
# that offset id space and not directly comparable to y_pred — confirm.
y_true = test_labels




In [7]:
# Inspect one randomly chosen example from the evaluated subset:
# the decoded input text, the stored label, and the model's prediction.
sample = np.random.randint(0, test_labels.shape[0])
print("Sample:", sample)

# Turn the token ids back into a readable string (padding tokens included).
sample_tokens = bert_tokenizer.convert_ids_to_tokens(test_inputs[0][sample])
sample_text = bert_tokenizer.convert_tokens_to_string(sample_tokens)
print("First X Test:\n", sample_text)

# The true label is looked up by its stored id; the prediction is looked up
# by class index, so the two use different lookup tables.
gold_label_id = y_true[sample]
print(gold_label_id)
print("First Y True:\n", reverse_label_dict[gold_label_id])
print("First Y Pred:\n", label_names[y_pred[sample]])


Sample: 783
First X Test:
 [CLS] a comparison was made between penicillin and bicucullineinduced EA aqueous solutions of either penicillin or BIC were applied to striate SC of rb pups and the electroencephalogram was monitored applications were made twice daily for consecutive days beginning on PN day during this time period interictal spikes generated by bicuculline and penicillin displayed similar properties in most respects however the morphology of spikes induced by each convulsant was different [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA