# Import Statements and Loading Data

In [1]:
#Import statements
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import gc
import random

# Notebook-wide configuration.
RANDOM_SEED = 42   # passed to random.seed() and tf.random.set_seed() in later cells
max_length = 200   # token length used for encoding and for the model's input shapes

# Report how many GPUs TensorFlow can see before any model is loaded.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


2024-07-17 19:37:35.573939: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-17 19:37:35.588346: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-17 19:37:35.588373: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-17 19:37:35.598753: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


2024-07-17 19:37:38.619106: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-17 19:37:38.623219: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-17 19:37:38.626104: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

# BERT Base Model


In [2]:
# Checkpoint shared by the tokenizer and every encoder built in this notebook.
model_checkpoint = 'bert-base-cased'

# NOTE(review): in the cells visible here only `bert_tokenizer` is referenced later;
# create_bert_multiclass_model() loads its own encoder, so this module-level
# `bert_model` may be an unnecessary copy -- confirm before removing.
bert_tokenizer, bert_model = (
    BertTokenizer.from_pretrained(model_checkpoint),
    TFBertModel.from_pretrained(model_checkpoint),
)


2024-07-17 19:37:38.836903: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-17 19:37:38.839501: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-17 19:37:38.842389: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [3]:
# Read the three pre-split datasets.
train_df, val_df, test_df = (
    pd.read_parquet('data/train_df.parquet'),
    pd.read_parquet('data/val_df.parquet'),
    pd.read_parquet('data/test_df.parquet'),
)

# Label vocabulary: every distinct expansion seen in any split, in sorted order.
all_labels = [label for frame in (train_df, val_df, test_df) for label in frame.label]
label_names = sorted(np.unique(np.array(all_labels)).tolist())

# Label ids start right after the tokenizer vocabulary (offset = vocab size);
# the model's loss and metric subtract the same offset again.
offset = bert_tokenizer.vocab_size
label_dict = dict(zip(label_names, range(offset, offset + len(label_names))))
reverse_label_dict = dict(zip(label_dict.values(), label_dict.keys()))
print(f"Size: {len(label_names)}", label_names)
print(f"Dict Item 1: {list(label_dict.items())[0]}")
print(f"Reverse Dict Item 1: {list(reverse_label_dict.items())[0]}")

# Draw a reproducible sample of `n` abbreviations from the training split and
# restrict all three splits to rows for those abbreviations.
n = 15
random.seed(RANDOM_SEED)
unique_abbreviations = list(train_df['abbreviation'].unique())
abbreviation_subset = random.sample(unique_abbreviations, n)

train_df, val_df, test_df = [
    frame.loc[frame['abbreviation'].isin(abbreviation_subset)]
    for frame in (train_df, val_df, test_df)
]

for split_title, frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_title} dataset length:", len(frame))


Dict Item 1: ('a receptor antagonist cyclopentyldipropylxanthine', 28996)
Reverse Dict Item 1: (28996, 'a receptor antagonist cyclopentyldipropylxanthine')
Train dataset length: 9379
Validation dataset length: 3091
Test dataset length: 3130


In [4]:
def tokenize(dataset, tokenizer=bert_tokenizer, max_len=max_length, label_dict=label_dict):
    """
    Encode every row of `dataset` and locate its abbreviation as a token span.

    The full text of each row is encoded with special tokens and padded or
    truncated to `max_len`. The abbreviation's word index (`location`, an index
    into `text.split()`) is converted into a half-open token span [start, end)
    by tokenizing the words that precede it. Rows whose span does not end
    before `max_len` are dropped, because the abbreviation would be cut off by
    truncation.

    As a sanity check, the first kept row is printed together with its decoded
    encoding, decoded span and label.

    Args:
        dataset: DataFrame with 'text', 'location', 'abbreviation' and 'label' columns.
        tokenizer: tokenizer providing `tokenize`, `encode_plus` and `decode`.
        max_len: fixed length of every encoded sequence; also the span cut-off.
        label_dict: mapping from label string to integer label id.

    Returns:
        Tuple (input_ids, token_type_ids, attention_masks, start_positions,
        end_positions, label_ids) of int32 NumPy arrays. The first three have
        shape (kept_rows, max_len); the last three have shape (kept_rows,).
        The leading axis is preserved even when one or zero rows are kept.

    Raises:
        KeyError: if a kept row's label is missing from `label_dict`.
    """
    input_ids = []
    token_type_ids = []
    start_positions = []
    end_positions = []
    attention_masks = []
    label_ids = []
    first_kept_row = None  # positional index in `dataset` of the first row that was kept

    rows = zip(dataset['text'], dataset['location'], dataset['abbreviation'], dataset['label'])
    for row_idx, (text, loc, abbreviation, label) in enumerate(rows):
        pre_tokens = tokenizer.tokenize(' '.join(text.split()[:loc]))
        # +1 skips the special token that encode_plus prepends ([CLS] for BERT).
        adjusted_loc_start = len(pre_tokens) + 1
        adjusted_loc_end = adjusted_loc_start + len(tokenizer.tokenize(abbreviation))

        # Compare against the `max_len` argument (not the notebook global) and
        # skip the row before paying for an encoding that would be discarded.
        if adjusted_loc_end >= max_len:
            continue

        encoded_input = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            # NumPy output avoids creating a TF tensor per row only to convert it back.
            return_tensors='np'
        )

        if first_kept_row is None:
            first_kept_row = row_idx
        input_ids.append(encoded_input['input_ids'])
        token_type_ids.append(encoded_input['token_type_ids'])
        start_positions.append(adjusted_loc_start)
        end_positions.append(adjusted_loc_end)
        attention_masks.append(encoded_input['attention_mask'])
        label_ids.append(label_dict[label])

    # reshape(-1, max_len) rather than squeeze(): squeeze() would also remove the
    # batch axis when exactly one row is kept.
    input_ids = np.array(input_ids, dtype=np.int32).reshape(-1, max_len)
    token_type_ids = np.array(token_type_ids, dtype=np.int32).reshape(-1, max_len)
    attention_masks = np.array(attention_masks, dtype=np.int32).reshape(-1, max_len)
    start_positions = np.array(start_positions, dtype=np.int32)
    end_positions = np.array(end_positions, dtype=np.int32)
    label_ids = np.array(label_ids, dtype=np.int32)

    if first_kept_row is None:
        print("No rows were kept: every abbreviation span ends at or beyond max_len.")
    else:
        # Report the first *kept* row so the raw text and the decoded encoding
        # always describe the same example.
        first_text = dataset['text'].iloc[first_kept_row]
        first_loc = dataset['location'].iloc[first_kept_row]
        # Invert the mapping that was actually used, not a notebook-level global.
        id_to_label = {label_id: name for name, label_id in label_dict.items()}

        print("First text:\n", first_text)
        print("First location:", first_loc)
        print("First acronym:", first_text.split()[first_loc])
        print("First expansion:", dataset['label'].iloc[first_kept_row])
        print("First text decoded:\n", tokenizer.decode(input_ids[0]))
        print("Confirm adjusted location accuracy: \n",
              tokenizer.decode(input_ids[0][start_positions[0]:end_positions[0]]))
        print("Confirm label:", id_to_label[int(label_ids[0])])

    return input_ids, token_type_ids, attention_masks, start_positions, end_positions, label_ids

In [5]:
# Encode each split; tokenize() prints a sanity check for the first example of each.
print("Train-------------------------------------------------------------")
(train_input_ids, train_token_type_ids, train_attention_masks,
 train_start_positions, train_end_positions, train_labels) = tokenize(train_df)

print("Val---------------------------------------------------------------")
(val_input_ids, val_token_type_ids, val_attention_masks,
 val_start_positions, val_end_positions, val_labels) = tokenize(val_df)

print("Test--------------------------------------------------------------")
(test_input_ids, test_token_type_ids, test_attention_masks,
 test_start_positions, test_end_positions, test_labels) = tokenize(test_df)

# The DataFrames are no longer needed once encoded; drop them to free memory.
# Re-running this cell afterwards requires re-running the data-loading cell first.
del train_df, val_df, test_df
gc.collect()


Train-------------------------------------------------------------
First text:
 this study investigated the relationships between renal allograft interstitial fibrosis renal function and graft survival a total of consecutive renal Tx immunosuppressed with cyclosporin were studied needle core transplant biopsies were performed before operation and at and months Tx allograft fibrosis was assessed by histomorphometric analysis of VG interstitial volume fraction renal function was measured by isotopic glomerular filtration rate gfr measurement at the same time points ISV fraction was already high in preperfusion biopsies significantly increased with time but stabilized at months after transplantation gfr correlated negatively with ISV fraction at months p ISV fraction at month was not a useful predictor of subsequent VG survival but for allografts surviving to months an ISV fraction above per cent predicted significantly poorer survival p it provides an objective measure of chronic allogra

0

In [6]:
class ExtractAbbreviationHiddenStates(tf.keras.layers.Layer):
    """
    Custom layer that keeps only the hidden states inside the abbreviation span.

    `inputs` is a 3-element sequence: (last_hidden_state, start positions,
    end positions). Token positions p with start <= p < end keep their hidden
    state; every other position is replaced by zeros. The output has the same
    shape as `last_hidden_state`.
    """
    def call(self, inputs):
        hidden_states, span_start, span_end = inputs

        dims = tf.shape(hidden_states)
        n_examples, n_tokens = dims[0], dims[1]

        # Token index grid: one row of 0..n_tokens-1 per example.
        token_index = tf.tile(tf.expand_dims(tf.range(n_tokens), 0), [n_examples, 1])

        # The model feeds positions with shape (batch, 1), so each comparison
        # broadcasts across the token axis.
        after_start = token_index >= span_start
        before_end = token_index < span_end
        in_span = tf.logical_and(after_start, before_end)

        # Add a trailing axis so the mask selects whole hidden vectors.
        return tf.where(in_span[..., tf.newaxis], hidden_states, tf.zeros_like(hidden_states))

def create_bert_multiclass_model(checkpoint = model_checkpoint,
                                 num_classes = len(label_names),
                                 learning_rate=0.00005):
    """
    Build and compile a BERT classifier over the abbreviation's token span.

    The encoder's last hidden state is masked down to the abbreviation span,
    mean-pooled over the span tokens, and passed to a softmax layer.

    Labels are expected as offset ids (class index + `offset`, as built in
    `label_dict`); the loss and the accuracy metric subtract `offset` before
    comparing with the softmax output.

    Args:
        checkpoint: name or path of the pretrained BERT checkpoint to fine-tune.
        num_classes: size of the softmax output.
        learning_rate: Adam learning rate.

    Returns:
        A compiled `tf.keras.Model` taking, in order, input ids, token type
        ids, attention mask, span start positions and span end positions.
    """
    tf.keras.backend.clear_session()
    tf.random.set_seed(RANDOM_SEED)

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')
    start_abbrev_token_positions = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name='start_abbreviation_token_positions_layer')
    end_abbrev_token_positions = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name='end_abbreviation_token_positions_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Local encoder instance: the whole encoder is fine-tuned along with the head.
    bert_model = TFBertModel.from_pretrained(checkpoint, from_pt=True)
    bert_model.trainable = True

    bert_out = bert_model(bert_inputs)

    last_hidden_state = bert_out.last_hidden_state

    span_hidden_states = ExtractAbbreviationHiddenStates()([last_hidden_state, start_abbrev_token_positions, end_abbrev_token_positions])

    # Mean over the span tokens only. Positions outside the span are zero, so a
    # plain reduce_mean over the sequence axis would divide by max_length and
    # scale the embedding by span_len / max_length. Clamp the divisor to 1 so
    # an empty span yields zeros instead of NaN.
    span_length = tf.cast(end_abbrev_token_positions - start_abbrev_token_positions,
                          span_hidden_states.dtype)
    span_length = tf.maximum(span_length, 1.0)
    pooled_output = tf.reduce_sum(span_hidden_states, axis=1) / span_length

    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(pooled_output)

    classification_model = tf.keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask, start_abbrev_token_positions, end_abbrev_token_positions],
        outputs=[classification],
    )

    # Both the loss and the metric take the offset from the same place: the
    # notebook-level `offset` that was used to build `label_dict`.
    label_offset = offset

    def custom_loss(y_true, y_pred, offset=label_offset):
        """Sparse categorical cross-entropy on labels shifted back to 0-based class indices."""
        y_true = y_true - offset
        return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=False)

    def custom_accuracy(y_true, y_pred):
        """Sparse categorical accuracy on labels shifted back to 0-based class indices."""
        y_true_adjusted = y_true - label_offset
        return tf.keras.metrics.sparse_categorical_accuracy(y_true_adjusted, y_pred)

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=custom_loss,
                                 metrics=[custom_accuracy])

    return classification_model


In [7]:
# Build the compiled classifier with the default checkpoint, class count and
# learning rate, then print its layer table.
model = create_bert_multiclass_model()
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 200)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 200)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 200)]                0         []                            
 tLayer)                                                                                      

In [8]:
# Feature arrays, listed in the same order as the model's `inputs`.
train_inputs = [train_input_ids, train_token_type_ids, train_attention_masks,
                train_start_positions, train_end_positions]
valid_inputs = [val_input_ids, val_token_type_ids, val_attention_masks,
                val_start_positions, val_end_positions]

# Training hyperparameters, gathered in one place.
fit_options = dict(batch_size=16, shuffle=True, verbose=1, epochs=2)

history = model.fit(
    train_inputs,
    np.array(train_labels),
    validation_data=(valid_inputs, np.array(val_labels)),
    **fit_options,
)

# Uncomment to persist the fine-tuned model:
#model.save('models/20240717_base_bert_ft')


Epoch 1/2


I0000 00:00:1721259556.297119  266204 service.cc:145] XLA service 0x747e105a1bf0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1721259556.297144  266204 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2024-07-17 19:39:16.301066: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-17 19:39:16.317435: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8902
I0000 00:00:1721259556.356418  266204 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2
