# Import Statements and Loading Data

In [1]:
#Import statements
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import gc
import random

# Sequence length, in tokens, that `tokenize` pads/truncates to by default and
# that the model's input layers are declared with.
max_length = 200
# Seed handed to `random.seed` and `tf.random.set_seed` in later cells.
RANDOM_SEED = 42

# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


2024-07-19 15:34:54.901069: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-19 15:34:54.914823: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-19 15:34:54.914845: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-19 15:34:54.925118: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


2024-07-19 15:34:57.758395: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 15:34:57.764883: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 15:34:57.770005: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

# BERT Base Model


In [2]:
# Hugging Face checkpoint name shared by the tokenizer and the encoder.
model_checkpoint = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): none of the cells visible here read this `bert_model`;
# `create_base_bert_model` loads its own copy of the checkpoint. Confirm
# whether this extra load is still needed.
bert_model = TFBertModel.from_pretrained(model_checkpoint)


2024-07-19 15:34:57.942195: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 15:34:57.948436: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-19 15:34:57.955763: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [3]:
# Load the pre-split train / validation frames.
train_df = pd.read_parquet('data/train_df.parquet')
val_df = pd.read_parquet('data/val_df.parquet')

# Draw a reproducible random sample of `n` abbreviations from the training split.
n = 10
random.seed(RANDOM_SEED)
known_abbreviations = list(train_df['abbreviation'].unique())
abbreviation_subset = random.sample(known_abbreviations, n)

# Keep only the rows of both splits that belong to the sampled abbreviations.
train_keep = train_df['abbreviation'].isin(abbreviation_subset)
val_keep = val_df['abbreviation'].isin(abbreviation_subset)
train_df = train_df.loc[train_keep]
val_df = val_df.loc[val_keep]

# Every distinct label seen in either split, in sorted order.
label_names = sorted(set(train_df.label) | set(val_df.label))

# Label ids start right after the tokenizer's vocabulary ids.
offset = bert_tokenizer.vocab_size
label_dict = {name: offset + position for position, name in enumerate(label_names)}
reverse_label_dict = {label_id: name for name, label_id in label_dict.items()}

print(f"Size: {len(label_names)}", label_names)
print(f"Dict Item 1: {list(label_dict.items())[0]}")
print(f"Reverse Dict Item 1: {list(reverse_label_dict.items())[0]}")

print("Train dataset length:", len(train_df))
print("Validation dataset length:", len(val_df))


Size: 37 ['active wakefulness', 'active waking', 'anterior wall', 'ash weight', 'desimipramine', 'desmethylimipramine', 'diabetic muscle infarction', 'dibutylnitrosamine', 'doppler myocardial imaging', 'downbeat nystagmus', 'dry matter intake', 'ecarin clotting time', 'electrochemotherapy', 'electroconvulsive therapy', 'electroconvulsive treatment', 'endocurietherapy', 'glomerular volume', 'graft', 'interaural level differences', 'interstitial lung diseases', 'narcissistic personality disorder', 'nasal potential difference', 'nndibutylnitrosamine', 'nocturnal paroxysmal dystonia', 'normal protein diet', 'paired filtration dialysis', 'pelvic floor dysfunction', 'perfluorodecalin', 'photon flux density', 'proteinfree diet', 'sulfated cholecystokinin octapeptide', 'uridine', 'urinary retention', 'utilization review', 'vein grafts', 'ventriculography', 'vestibular ganglion']
Dict Item 1: ('active wakefulness', 28996)
Reverse Dict Item 1: (28996, 'active wakefulness')
Train dataset length: 

In [4]:
def tokenize(dataset, tokenizer=bert_tokenizer, max_len=max_length, label_dict=label_dict):
    """
    Encode every row of ``dataset`` for BERT and locate its abbreviation span.

    Args:
        dataset: Table with ``text``, ``location``, ``abbreviation`` and
            ``label`` columns. ``location`` is the index of the abbreviation
            among the whitespace-separated words of ``text``.
        tokenizer: Tokenizer providing ``tokenize``, ``encode_plus`` and
            ``decode``.
        max_len: Length every encoded sequence is padded/truncated to. It is
            also the bound used to decide whether a row is kept.
        label_dict: Mapping from label string to integer label id.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_masks, start_positions,
        end_positions, label_ids)`` of ``int32`` arrays. The first three have
        shape ``(num_kept, max_len)``, the others ``(num_kept,)``. Rows whose
        abbreviation span does not end before ``max_len`` are dropped.
    """
    input_ids = []
    token_type_ids = []
    start_positions = []
    end_positions = []
    attention_masks = []
    label_ids = []
    # Positional index (in `dataset`) of the first row that was kept, so the
    # sanity prints below describe the same example as `input_ids[0]`.
    first_kept = None

    rows = zip(dataset['text'], dataset['location'], dataset['abbreviation'], dataset['label'])
    for row_idx, (text, loc, abbreviation, label) in enumerate(rows):
        # Token offset of the abbreviation: number of word pieces in the words
        # that precede it, plus one for the special token added at position 0.
        pre_tokens = tokenizer.tokenize(' '.join(text.split()[:loc]))
        adjusted_loc_start = len(pre_tokens) + 1
        adjusted_loc_end = adjusted_loc_start + len(tokenizer.tokenize(abbreviation))

        # Skip rows whose span would be cut off by truncation. Compare against
        # the `max_len` argument (not the module-level constant) and do it
        # before encoding so dropped rows cost nothing.
        if adjusted_loc_end >= max_len:
            continue

        encoded_input = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf'
        )

        if first_kept is None:
            first_kept = row_idx
        input_ids.append(encoded_input['input_ids'])
        token_type_ids.append(encoded_input['token_type_ids'])
        start_positions.append(adjusted_loc_start)
        end_positions.append(adjusted_loc_end)
        attention_masks.append(encoded_input['attention_mask'])
        label_ids.append(label_dict[label])

    # Each encoding has shape (1, max_len). Reshape explicitly instead of
    # squeeze(): squeeze() would also drop the batch axis when exactly one row
    # was kept, and reshape also yields a well-formed (0, max_len) array when
    # nothing was kept.
    input_ids = np.array(input_ids, dtype=np.int32).reshape(-1, max_len)
    token_type_ids = np.array(token_type_ids, dtype=np.int32).reshape(-1, max_len)
    attention_masks = np.array(attention_masks, dtype=np.int32).reshape(-1, max_len)
    start_positions = np.array(start_positions, dtype=np.int32)
    end_positions = np.array(end_positions, dtype=np.int32)
    label_ids = np.array(label_ids, dtype=np.int32)

    # Sanity prints for the first retained example (skipped if none was kept).
    if first_kept is not None:
        # Invert the mapping that was actually used for encoding the labels.
        id_to_label = {label_id: name for name, label_id in label_dict.items()}
        first_text = dataset['text'].iloc[first_kept]
        first_location = dataset['location'].iloc[first_kept]
        print("First text:\n", first_text)
        print("First location:", first_location)
        print("First acronym:", first_text.split()[first_location])
        print("First expansion:", dataset['label'].iloc[first_kept])
        print("First text decoded:\n", tokenizer.decode(input_ids[0]))
        print("Confirm adjusted location accuracy: \n",
              tokenizer.decode(input_ids[0][start_positions[0]:end_positions[0]]))
        print("Confirm label:", id_to_label[int(label_ids[0])])

    return input_ids, token_type_ids, attention_masks, start_positions, end_positions, label_ids

In [5]:
class ExtractAbbreviationHiddenStates(tf.keras.layers.Layer):
    """
    Keep only the hidden states that belong to the abbreviation span.

    ``inputs`` is a 3-item sequence
    ``(last_hidden_state, start_positions, end_positions)``. Every sequence
    position ``p`` with ``start <= p < end`` keeps its hidden state; all other
    positions are replaced with zeros. The result has the same shape as
    ``last_hidden_state``.
    """
    def call(self, inputs):
        hidden_states, span_starts, span_ends = inputs

        dims = tf.shape(hidden_states)
        num_examples = dims[0]
        seq_len = dims[1]

        # Grid of position indices: one identical row per example.
        position_row = tf.range(seq_len)[tf.newaxis, :]
        positions = tf.tile(position_row, [num_examples, 1])

        # True where a position falls inside the half-open span [start, end).
        at_or_after_start = positions >= span_starts
        before_end = positions < span_ends
        in_span = tf.logical_and(at_or_after_start, before_end)

        # Broadcast the mask across the hidden dimension and zero out
        # everything that lies outside the span.
        zeros = tf.zeros_like(hidden_states)
        return tf.where(in_span[..., tf.newaxis], hidden_states, zeros)

def create_base_bert_model(checkpoint = model_checkpoint):
    """
    Build the BERT feature extractor used for abbreviation classification.

    The model takes token ids, token type ids, an attention mask and the
    start/end token positions of the abbreviation, and outputs the mean of the
    last-layer hidden states of the abbreviation tokens. No classification
    layer is included; it is attached by the training code.

    Args:
        checkpoint: Name or path of the pretrained BERT checkpoint to load.

    Returns:
        A ``tf.keras.Model`` with five inputs (in the order input ids, token
        type ids, attention mask, span start, span end) and a single pooled
        output of shape ``(batch, hidden_size)``.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')
    start_abbrev_token_positions = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name='start_abbreviation_token_positions_layer')
    end_abbrev_token_positions = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name='end_abbreviation_token_positions_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # The encoder is fine-tuned together with whatever head is attached later.
    bert_model = TFBertModel.from_pretrained(checkpoint, from_pt=True)
    bert_model.trainable = True

    bert_out = bert_model(bert_inputs)

    last_hidden_state = bert_out.last_hidden_state

    # Hidden states outside the abbreviation span are zeroed by this layer.
    span_hidden_states = ExtractAbbreviationHiddenStates()([last_hidden_state, start_abbrev_token_positions, end_abbrev_token_positions])

    # Average over the span tokens only. A plain reduce_mean over axis 1 would
    # divide by the full sequence length and shrink the pooled vector by
    # span_len / max_length, because all non-span positions are zeros.
    span_length = tf.cast(end_abbrev_token_positions - start_abbrev_token_positions, span_hidden_states.dtype)
    # Guard against a zero-length span so the division stays finite.
    span_length = tf.maximum(span_length, 1.0)
    pooled_output = tf.reduce_sum(span_hidden_states, axis=1) / span_length

    base_model = tf.keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask, start_abbrev_token_positions, end_abbrev_token_positions],
        outputs=[pooled_output],
    )

    return base_model


In [6]:
# Reset Keras' global state before building a fresh model in this kernel.
tf.keras.backend.clear_session()
# Seed TensorFlow before the model is constructed.
tf.random.set_seed(RANDOM_SEED)
base_model = create_base_bert_model()
base_model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 200)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 200)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 200)]                0         []                            
 tLayer)                                                                                      

In [7]:
# Number of chunks the tokenized train/validation arrays are split into below.
# NOTE(review): this rebinds `n`, which the data-loading cell uses for the
# number of sampled abbreviations.
n = 5

def split_arrays(arrays, n):
    """Split each array into n parts along axis 0 and regroup them by chunk.

    Args:
        arrays: Sequence of arrays that share the same first-axis length.
        n: Number of chunks to produce.

    Returns:
        A list of ``n`` tuples. Tuple ``k`` holds the ``k``-th part of every
        input array, in the order of ``arrays``. Parts are produced by
        ``np.array_split``, so their sizes differ by at most one when the
        length is not divisible by ``n``. An empty ``arrays`` gives ``[]``.
    """
    # np.array_split splits along the first axis whatever the rank, so 1-D and
    # 2-D inputs need no separate handling.
    per_array_chunks = [np.array_split(array, n) for array in arrays]
    # Transpose "per array -> chunks" into "per chunk -> arrays".
    return list(zip(*per_array_chunks))

# Tokenize both splits; `tokenize` also prints a sanity check for each split.
print("Train-------------------------------------------------------------")
train_input_ids, train_token_type_ids, train_attention_masks, train_start_positions, train_end_positions, train_labels = tokenize(train_df)
print("Val---------------------------------------------------------------")
val_input_ids, val_token_type_ids, val_attention_masks, val_start_positions, val_end_positions, val_labels = tokenize(val_df)

# The frames are no longer needed once encoded; release them.
# NOTE(review): after this, re-running this cell alone fails until the
# data-loading cell is run again.
del train_df
del val_df
gc.collect()

# Model inputs, in the same order as the inputs of `create_base_bert_model`.
train_inputs = [
    train_input_ids,
    train_token_type_ids,
    train_attention_masks,
    train_start_positions,
    train_end_positions,
]
val_inputs = [
    val_input_ids,
    val_token_type_ids,
    val_attention_masks,
    val_start_positions,
    val_end_positions,
]

# Split inputs and labels into `n` aligned chunks (chunk k of the inputs
# corresponds to chunk k of the labels).
train_chunks = split_arrays(train_inputs, n)
train_label_chunks = np.array_split(train_labels, n)
val_chunks = split_arrays(val_inputs, n)
val_label_chunks = np.array_split(val_labels, n)


Train-------------------------------------------------------------
First text:
 this study investigated the relationships between renal allograft interstitial fibrosis renal function and graft survival a total of consecutive renal Tx immunosuppressed with cyclosporin were studied needle core transplant biopsies were performed before operation and at and months Tx allograft fibrosis was assessed by histomorphometric analysis of VG interstitial volume fraction renal function was measured by isotopic glomerular filtration rate gfr measurement at the same time points ISV fraction was already high in preperfusion biopsies significantly increased with time but stabilized at months after transplantation gfr correlated negatively with ISV fraction at months p ISV fraction at month was not a useful predictor of subsequent VG survival but for allografts surviving to months an ISV fraction above per cent predicted significantly poorer survival p it provides an objective measure of chronic allogra

In [9]:
# Number of full passes over all training chunks made by the loop below.
epochs = 2

def custom_loss(y_true, y_pred, offset=offset):
    """
    Sparse categorical cross-entropy for labels stored with an id offset.

    ``offset`` is subtracted from ``y_true`` before the loss is computed;
    ``y_pred`` is treated as probabilities (``from_logits=False``).
    """
    class_indices = y_true - offset
    return tf.keras.losses.sparse_categorical_crossentropy(
        class_indices,
        y_pred,
        from_logits=False,
    )
    
def custom_accuracy(y_true, y_pred):
    """
    Sparse categorical accuracy for labels stored with an id offset.

    The tokenizer's vocabulary size is subtracted from ``y_true`` before it is
    compared with ``y_pred``.
    """
    class_indices = y_true - bert_tokenizer.vocab_size
    return tf.keras.metrics.sparse_categorical_accuracy(class_indices, y_pred)

# Fine-tune the shared BERT encoder chunk by chunk, `epochs` passes in total.
for epoch in range(epochs):
    chunked_data = zip(train_chunks, train_label_chunks, val_chunks, val_label_chunks)
    for i, (train_chunk, label_chunk, val_chunk, val_label_chunk) in enumerate(chunked_data):
        # Number of distinct labels present in this chunk (reported only).
        num_classes = len(np.unique(label_chunk))
        print(f"Training on chunk {i+1}/{len(train_chunks)} with {num_classes} classes")

        # The head must cover the whole label set, not just the labels seen in
        # this chunk: after subtracting the offset, label indices range over
        # every entry of `label_dict`, so a head sized from the chunk's unique
        # count can receive out-of-range indices.
        total_classes = len(label_dict)

        # NOTE(review): a fresh, randomly initialised head is attached for
        # every chunk, so only the encoder carries learning across chunks.
        pooled_output = base_model.output
        classification = tf.keras.layers.Dense(total_classes, activation='softmax', name=f'classification_layer_{i}')(pooled_output)
        classification_model = tf.keras.Model(inputs=base_model.inputs, outputs=classification)

        classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
                                    loss=custom_loss,
                                    metrics=[custom_accuracy])

        history = classification_model.fit(
            train_chunk,
            label_chunk,
            validation_data=(val_chunk, val_label_chunk),
            batch_size=16,
            shuffle=True,
            verbose=1,
            epochs=1,
        )

        # No weight copy back into `base_model` is needed: `classification_model`
        # is built on `base_model.output`, so both models share the same encoder
        # layers and `base_model` already holds the updated weights.


#base_model.save('models/20240718_base_bert_ft')

Training on chunk 1/5 with 36 classes


I0000 00:00:1721418074.096547 3011991 service.cc:145] XLA service 0x7e34740d5e90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1721418074.096587 3011991 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2024-07-19 15:41:14.103186: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-19 15:41:14.127941: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8902
I0000 00:00:1721418074.167372 3011991 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Training on chunk 2/5 with 37 classes
Training on chunk 3/5 with 36 classes
Training on chunk 4/5 with 37 classes
Training on chunk 5/5 with 36 classes
Training on chunk 1/5 with 36 classes
Training on chunk 2/5 with 37 classes
Training on chunk 3/5 with 36 classes
Training on chunk 4/5 with 37 classes
Training on chunk 5/5 with 36 classes
