

Using Colab to fine-tune the model on a GPU

*  Edit 🡒 Notebook Settings 🡒 Hardware accelerator 🡒 (GPU)






---


To use the GPU for torch, identify and specify the GPU as the device.



---


Install Transformers for a pytorch interface.

Install wget to download our training data into the Colab instance's file system.

In [None]:
!pip install transformers

In [None]:
!pip install wget

In [None]:
import os
import wget

# Remote location of the training archive.
url = 'https://bionlp.nlm.nih.gov/tac2019druginteractions/trainingFiles2018.zip'

print('Downloading Dataset...')

# Only fetch the archive when it is not already in the working directory.
archive_missing = not os.path.exists('./trainingFiles2018.zip')
if archive_missing:
    wget.download(url, './trainingFiles2018.zip')


Downloading Dataset...


In [None]:
# unzip
if not os.path.exists('./trainingFiles2018.zip'):
  !unzip trainingFiles2018.zip

Setting up the Tokenizer

In [None]:
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the BERT tokenizer for the 'bert-base-uncased' checkpoint and keep it in the
# module-level name `tokenizer`.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



---

Next, parse the data.

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import os

# Path to the XML files.
# The other cells of this notebook read './training2018/' (the folder extracted in the
# working directory); the previous Drive path was used here before Drive is mounted.
# NOTE(review): assumes the archive unpacks to ./training2018 — confirm.
folder_path = './training2018/'

# Number of label files to preview before stopping.
max_preview_files = 3


def _stripped_text(element):
    """Return ``element.text`` stripped of surrounding whitespace.

    Returns '' when the element is missing or has no text (for example an empty
    ``<SentenceText/>``), instead of raising AttributeError on ``None.strip()``.
    """
    if element is None or element.text is None:
        return ''
    return element.text.strip()


# Counter for limiting output to the first few files
output_counter = 0

# sorted() makes the previewed files the same on every run (os.listdir returns
# entries in arbitrary order).
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".xml"):
        continue

    file_path = os.path.join(folder_path, filename)

    # Read and parse the XML file
    with open(file_path, 'r', encoding='utf-8') as file:
        xml_data = file.read()
    root = ET.fromstring(xml_data)

    # The drug name is an attribute of the root element
    drug_name = root.get('drug', 'Unknown Drug')

    # Concatenate every <Section> under <Text> into one printable string
    text_content = ''
    text_section = root.find('.//Text')
    if text_section is not None:
        for section in text_section.findall('.//Section'):
            section_name = section.get('name', 'Unknown Section')
            section_text = _stripped_text(section)
            text_content += f'Section: {section_name}\n{section_text}\n\n'

    # Collect the id and text of every <Sentence> under <Sentences>
    sentences_list = []
    sentences_section = root.find('.//Sentences')
    if sentences_section is not None:
        for sentence in sentences_section.findall('.//Sentence'):
            sentence_id = sentence.get('id', 'Unknown ID')
            sentence_text = _stripped_text(sentence.find('.//SentenceText'))
            sentences_list.append({'id': sentence_id, 'text': sentence_text})

    # Collect the attributes of every <LabelInteraction> under <LabelInteractions>
    interactions_list = []
    label_interactions = root.find('.//LabelInteractions')
    if label_interactions is not None:
        for label_interaction in label_interactions.findall('.//LabelInteraction'):
            interactions_list.append({
                'type': label_interaction.get('type'),
                'precipitant': label_interaction.get('precipitant'),
                'precipitantCode': label_interaction.get('precipitantCode'),
                'effect': label_interaction.get('effect', '')
            })

    # Print the extracted information for inspection
    print(f'Drug: {drug_name}')
    print('Text:', text_content)
    print('Sentences:', sentences_list)
    print('Label Interactions:', interactions_list)
    print('\n')

    # Stop once enough files have been shown
    output_counter += 1
    if output_counter >= max_preview_files:
        break




---

Code snippet to find the max_length for sentence tokenization.

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Path to the XML files
folder_path = './training2018/'

# Longest tokenized sentence seen so far, counted in token ids with the special
# `[CLS]` / `[SEP]` tokens included.
max_len = 0

# Iterate through each XML file in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith(".xml"):
        continue

    file_path = os.path.join(folder_path, filename)

    # Read and parse the XML file
    with open(file_path, 'r', encoding='utf-8') as file:
        xml_data = file.read()
    root = ET.fromstring(xml_data)

    sentences_section = root.find('.//Sentences')
    if sentences_section is None:
        continue

    for sentence in sentences_section.findall('.//Sentence'):
        sentence_node = sentence.find('.//SentenceText')
        # An empty <SentenceText/> has `.text is None`; treat it as an empty
        # sentence instead of crashing on `None.strip()`.
        if sentence_node is None or sentence_node.text is None:
            sentence_text = ''
        else:
            sentence_text = sentence_node.text.strip()

        # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
        input_ids = tokenizer.encode(sentence_text, add_special_tokens=True)

        # Update the maximum sentence length.
        max_len = max(max_len, len(input_ids))

# Print the maximum sentence length
print('Max sentence length: ', max_len)

In [None]:
import os
import xml.etree.ElementTree as ET

# Define paths
training_path = './training2018/'

def extract_drug_names_numerals(xml_folder_path):
    """Collect alphabetic words and integer tokens from the <Section> text of XML files.

    Every ``*.xml`` file directly inside ``xml_folder_path`` is parsed. The text of
    each ``<Section>`` element is split on whitespace, surrounding punctuation is
    stripped from every token, and the tokens are sorted into two sets.

    Note that no drug lexicon is consulted: the first set holds *every* purely
    alphabetic token, not only drug names.

    Args:
        xml_folder_path: Directory containing the XML label files.

    Returns:
        A ``(drug_names, numerals)`` tuple of sets of strings: tokens for which
        ``str.isalpha()`` is true, and tokens for which ``str.isdigit()`` is true.

    Raises:
        FileNotFoundError: If ``xml_folder_path`` does not exist.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    # Local import: the notebook never imports `string` at module level, so the
    # previous reference to `string.punctuation` raised NameError on a fresh kernel.
    import string

    drug_names = set()
    numerals = set()

    for filename in os.listdir(xml_folder_path):
        if not filename.endswith(".xml"):
            continue

        file_path = os.path.join(xml_folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            xml_data = file.read()

        root = ET.fromstring(xml_data)

        for section in root.findall('.//Section'):
            section_text = section.text
            if not section_text:
                continue

            # Strip punctuation BEFORE classifying. Previously the isalpha()/isdigit()
            # filter ran on the raw token, so "aspirin," and "60." were discarded and
            # the strip() call could never change anything.
            tokens = [word.strip(string.punctuation) for word in section_text.split()]
            drug_names.update(token for token in tokens if token.isalpha())
            numerals.update(token for token in tokens if token.isdigit())

    return drug_names, numerals

def map_to_placeholders(unique_items, prefix=None):
    """Assign a numbered placeholder string to every item.

    Args:
        unique_items: Iterable of hashable items. Sets are sorted first so the
            numbering is reproducible between runs; any other iterable keeps the
            order it was given in.
        prefix: Placeholder prefix, e.g. ``'drug'`` gives ``drug1, drug2, ...``.
            When omitted, the legacy behaviour is kept for backward compatibility:
            the prefix alternates between ``'drug'`` and ``'num'`` by position.

    Returns:
        Dict mapping each item to ``f"{prefix}{position}"`` with 1-based positions.
    """
    legacy_prefixes = ['drug', 'num']

    # Iteration order of a set of strings changes between interpreter runs; sorting
    # keeps the item -> placeholder mapping stable.
    if isinstance(unique_items, (set, frozenset)):
        ordered_items = sorted(unique_items)
    else:
        ordered_items = list(unique_items)

    placeholders = {}
    for idx, item in enumerate(ordered_items):
        item_prefix = legacy_prefixes[idx % 2] if prefix is None else prefix
        placeholders[item] = f"{item_prefix}{idx + 1}"

    return placeholders

# Extract drug names and numerals from training files
drug_names, numerals = extract_drug_names_numerals(training_path)

# Map drug names and numerals to placeholders. The prefix is passed explicitly:
# the alternating default labelled numerals as 'drugN' and words as 'numN'.
drug_placeholders = map_to_placeholders(drug_names, prefix='drug')
num_placeholders = map_to_placeholders(numerals, prefix='num')

# Print mappings for verification
print("Drug Name Placeholders:")
print(drug_placeholders)
print("\nNumeral Placeholders:")
print(num_placeholders)

Drug Name Placeholders:

Numeral Placeholders:
{'60': 'drug1', '0': 'num2', '607': 'drug3', '1000': 'num4', '130': 'drug5', '35': 'num6', '24': 'drug7', '720': 'num8', '51': 'drug9', '320': 'num10', '3': 'drug11', '200': 'num12', '325': 'drug13', '100': 'num14', '9': 'drug15', '64': 'num16', '19': 'drug17', '1836': 'num18', '230': 'drug19', '6': 'num20', '25': 'drug21', '5': 'num22', '20': 'drug23', '33': 'num24', '90': 'drug25', '70': 'num26', '14': 'drug27', '85': 'num28', '94': 'drug29', '12': 'num30', '180': 'drug31', '45': 'num32', '15': 'drug33', '500': 'num34', '76': 'drug35', '106': 'num36', '160': 'drug37', '2100': 'num38', '65': 'drug39', '400': 'num40', '18': 'drug41', '300': 'num42', '8': 'drug43', '72': 'num44', '288': 'drug45', '7': 'num46', '36': 'drug47', '40': 'num48', '540': 'drug49', '375': 'num50', '17': 'drug51', '32': 'num52', '11': 'drug53', '13': 'num54', '658': 'drug55', '1': 'num56', '50': 'drug57', '16': 'num58', '250': 'drug59', '4': 'num60', '80': 'drug61',

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Any name other than the expected one is treated as "no GPU" and aborts the cell.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0




---

Here we continue with the tokenization used previously, set up the correct input formatting, the DataLoader and the trainer, and then start training.

In [None]:
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!pip install install --upgrade transformers

!pip install torch transformers

!pip install accelerate>=0.20.1
!pip install --upgrade torch

PREPROCESSING SCRIPT

In [None]:
# Mount Google Drive at /content/drive; the cells below read and write files under
# 'drive/MyDrive/ddi_training/'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install keras-bert

In [None]:
# This is used to use checkpoints from bioBERT's model download page and create the model file
from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
import torch

# All inputs and the output live in the BioBERT checkpoint directory on Drive.
pretrained_path = 'drive/MyDrive/ddi_training/biobert_v1.0_pubmed_pmc/'
config_file = f'{pretrained_path}config.json'
tf_checkpoint_path = f'{pretrained_path}biobert_model.ckpt'
pytorch_dump_path = f'{pretrained_path}pytorch_model.bin'

# Build an untrained BertForPreTraining from the JSON config, then copy the
# TensorFlow checkpoint weights into it.
config = BertConfig.from_json_file(config_file)
model = load_tf_weights_in_bert(BertForPreTraining(config), config, tf_checkpoint_path)

# Persist the converted weights as a PyTorch state dict.
torch.save(model.state_dict(), pytorch_dump_path)

In [None]:
from transformers import BertTokenizer, BertModel, BertConfig
import torch
import os
import pickle

# Define paths
pretrained_path = 'drive/MyDrive/ddi_training/biobert_v1.0_pubmed_pmc/'
vocab_path = os.path.join(pretrained_path, 'vocab.txt')
training_path = 'drive/MyDrive/ddi_training/trainsentence_token.txt'
testing_path = 'drive/MyDrive/ddi_training/testsentence_token.txt'
save_train_pkl = 'drive/MyDrive/ddi_training/trainsentence_embeddings.pkl'
save_test_pkl = 'drive/MyDrive/ddi_training/testsentence_embeddings.pkl'
config = BertConfig.from_json_file(os.path.join(pretrained_path, 'config.json'))
model_path = os.path.join(pretrained_path, 'pytorch_model.bin')

# map_location='cpu' lets the checkpoint load on a runtime without a GPU.
state_dict = torch.load(model_path, map_location='cpu')

# The conversion cell saved the weights from a BertForPreTraining model, so encoder
# parameters are named 'bert.<...>'. BertModel expects the same names without that
# prefix. Only a leading 'bert.' is removed (not every occurrence in the key).
adjusted_state_dict = {
    (key[len('bert.'):] if key.startswith('bert.') else key): value
    for key, value in state_dict.items()
}

# Load BioBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained(vocab_path)
model = BertModel(config)

# Load the ADJUSTED dict. Loading the raw `state_dict` with strict=False matched no
# parameter names and silently left the encoder randomly initialised.
# strict=False is still needed because the checkpoint also holds the pre-training
# heads, which BertModel does not have.
load_result = model.load_state_dict(adjusted_state_dict, strict=False)
if load_result.missing_keys:
    print('Warning: encoder weights missing from checkpoint:', load_result.missing_keys[:10])

# Inference only: switch off dropout so the embeddings are deterministic.
model.eval()

def generate_embeddings_and_process_file(file_path, save_path):
    """Embed every line of a text file with the module-level BERT model and pickle the result.

    Each line is tokenized with the module-level ``tokenizer`` (padded or truncated to
    250 tokens), passed through the module-level ``model``, and reduced to one vector
    by averaging the last hidden states over the non-padding positions.

    Args:
        file_path: UTF-8 text file with one sentence per line.
        save_path: Destination of the pickle file. It holds a dict mapping the
            0-based line index to a 1-D NumPy array.

    Returns:
        None. The result is written to ``save_path``.
    """
    sentences_embeddings = {}

    # Make sure dropout is disabled even if the caller left the model in train mode.
    model.eval()

    with open(file_path, 'rt', encoding='utf-8') as readf:
        # The line index is the unique identifier of each sentence.
        for uid, line in enumerate(readf):
            newline = line.replace('drug1', '##1').replace('drug2', '##2').replace('drug0', '##0')
            encoded = tokenizer(
                newline,
                add_special_tokens=True,
                max_length=250,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            attention_mask = encoded['attention_mask']

            with torch.no_grad():
                # Pass the attention mask so padding positions are not attended to.
                outputs = model(input_ids=encoded['input_ids'], attention_mask=attention_mask)

            # Mean pooling over real tokens only: without the mask, the padding
            # positions dominated the average for short sentences.
            hidden = outputs.last_hidden_state
            weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_count = weights.sum(dim=1).clamp(min=1)
            embeddings = ((hidden * weights).sum(dim=1) / token_count).squeeze(0).numpy()

            sentences_embeddings[uid] = embeddings

    with open(save_path, 'wb') as writef:
        pickle.dump(sentences_embeddings, writef)

# Process training and testing data
generate_embeddings_and_process_file(training_path, save_train_pkl)
generate_embeddings_and_process_file(testing_path, save_test_pkl)


MODEL TRAINING SCRIPT


In [None]:
from transformers import AutoTokenizer, get_linear_schedule_with_warmup, BertConfig
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import numpy as np
import pickle
import random
import time
import datetime
from transformers import BertModel, BertPreTrainedModel
from transformers import BertConfig
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import math

# Set seed for reproducibility
def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with one value."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

set_seed(42)

# Set up 'device': CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

print('Loading BERT tokenizer...')
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")

# Token files produced by the preprocessing step (one sentence per line).
training_path = 'drive/MyDrive/ddi_training/trainsentence_token.txt'
testing_path = 'drive/MyDrive/ddi_training/testsentence_token.txt'

# Sentence embeddings pickled by the preprocessing cell.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created.
with open('drive/MyDrive/ddi_training/trainsentence_embeddings.pkl', 'rb') as f:
    train_sentence_embeddings = pickle.load(f)
with open('drive/MyDrive/ddi_training/testsentence_embeddings.pkl', 'rb') as f:
    test_sentence_embeddings = pickle.load(f)

def preprocess_text_with_custom_mask(text, tokenizer):
    """Tokenize ``text`` and build two entity masks from the resulting tokens.

    The text is encoded to 250 positions (padded or truncated). Each token id is
    converted back to its token string; positions whose token ends with ``'##1'``
    are flagged in the first mask, positions whose token ends with ``'##2'`` in the
    second.

    Returns:
        ``(input_ids, attention_mask, mask1, mask2)``: the first two exactly as
        returned by ``tokenizer.encode_plus``; the masks are 1-D integer tensors
        with one entry per token position.
    """
    encode_kwargs = dict(
        add_special_tokens=True,
        max_length=250,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded_dict = tokenizer.encode_plus(text, **encode_kwargs)

    token_ids = encoded_dict['input_ids'].squeeze().tolist()
    tokens = [tokenizer.convert_ids_to_tokens(token_id) for token_id in token_ids]

    # A token cannot end with both suffixes, so the two masks never overlap.
    mask1 = [int(token.endswith('##1')) for token in tokens]
    mask2 = [int(token.endswith('##2')) for token in tokens]

    return (
        encoded_dict['input_ids'],
        encoded_dict['attention_mask'],
        torch.tensor(mask1),
        torch.tensor(mask2),
    )

def read_data(file_path, tokenizer):
    """Read one sentence per line and build the per-sentence training records.

    Args:
        file_path: UTF-8 text file with one sentence per line.
        tokenizer: Tokenizer passed on to ``preprocess_text_with_custom_mask``.

    Returns:
        A list of dicts with the keys ``uid`` (0-based line index in the file, the
        same numbering the embedding pickles use), ``text``, ``input_ids``,
        ``attention_mask``, ``mask1``, ``mask2`` and ``label``.
    """
    data_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for uid, line in enumerate(file):
            sentence = line.strip()
            # NOTE(review): the label is derived from the presence of both
            # placeholders in the sentence, not from a gold annotation — confirm
            # that this is the intended target.
            label = 1 if 'drug1' in sentence and 'drug2' in sentence else 0
            input_ids, attention_mask, mask1, mask2 = preprocess_text_with_custom_mask(sentence, tokenizer)
            data_list.append({
                'uid': uid,
                'text': sentence,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'mask1': mask1,
                'mask2': mask2,
                'label': label
            })
    return data_list

# Assume tokenizer is already defined
training_data_list = read_data(training_path, tokenizer)
labels = [data['label'] for data in training_data_list]

################################################################################
# SPLIT THE DATA!
# 60% train / 40% held out, stratified on the label.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)

# Get the indices for splitting
train_indices, test_indices = next(sss.split(np.zeros(len(labels)), labels))

# Use the indices to create stratified train and held-out sets
train_data = [training_data_list[i] for i in train_indices]
holdout_data = [training_data_list[i] for i in test_indices]

# Split the held-out part in half into validation and test sets. Stratify when
# every class has at least two samples, so both halves contain both classes
# (AUC-ROC is undefined otherwise).
holdout_labels = [data['label'] for data in holdout_data]
can_stratify = len(holdout_labels) > 0 and int(np.bincount(holdout_labels, minlength=2).min()) >= 2
validation_data, test_data = train_test_split(
    holdout_data,
    test_size=0.5,
    random_state=42,
    stratify=holdout_labels if can_stratify else None,
)

################################################################################
# Define your dataset class
class DrugInteractionDataset(Dataset):
    """Dataset pairing tokenized sentence records with their precomputed embeddings.

    Args:
        data_list: Records as produced by ``read_data``.
        embeddings: Mapping from sentence uid (line index in the source file) to an
            embedding vector.
        device: Device the embedding tensor is moved to in ``__getitem__``.
    """

    def __init__(self, data_list, embeddings, device):
        self.data = data_list
        self.embeddings = embeddings
        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Look the embedding up by the sentence's original line index. After the
        # shuffled split, `idx` (the position inside this subset) no longer matches
        # the key the embedding was stored under. Records without a 'uid' fall back
        # to the positional index.
        embedding_key = item.get('uid', idx)
        embedding = torch.tensor(self.embeddings[embedding_key], dtype=torch.float).to(self.device)
        return {
            'input_ids': item['input_ids'],
            'attention_mask': item['attention_mask'],
            'labels': torch.tensor(item['label'], dtype=torch.long),
            'mask1': item['mask1'],
            'mask2': item['mask2'],
            'embedding': embedding
        }

# Instantiate the datasets and loaders from the split data. All three subsets come
# from the training file, so all three use the training-file embeddings;
# `test_sentence_embeddings` belongs to the separate test file.
train_dataset = DrugInteractionDataset(train_data, train_sentence_embeddings, device)
validation_dataset = DrugInteractionDataset(validation_data, train_sentence_embeddings, device)
test_dataset = DrugInteractionDataset(test_data, train_sentence_embeddings, device)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Define your Device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def calculate_class_weights(training_data_list):
    """Compute inverse-frequency class weights from the ``'label'`` of each record.

    The weight of a class is ``n_samples / (n_classes * class_count)``, with at least
    two classes assumed. A class with no samples is counted as one sample, which
    avoids a division by zero.

    Returns:
        A float tensor with one weight per class, placed on the module-level ``device``.
    """
    observed_labels = [record['label'] for record in training_data_list]
    per_class_counts = np.bincount(observed_labels, minlength=2)
    n_samples = len(training_data_list)
    n_classes = len(per_class_counts)

    weights = []
    for class_count in per_class_counts:
        weights.append(n_samples / (n_classes * max(class_count, 1)))

    return torch.tensor(weights, dtype=torch.float).to(device)


# Calculate class weights based on training data
# NOTE(review): this uses the full `training_data_list` (before the
# train/validation/test split), not `train_data` — confirm that is intended.
class_weights = calculate_class_weights(training_data_list)

class BiGRULayer(nn.Module):
    """Bidirectional, batch-first GRU that returns only the per-step outputs.

    Args:
        input_dim: Feature size of each input step.
        hidden_dim: Hidden size per direction; the output feature size is
            ``2 * hidden_dim``.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(BiGRULayer, self).__init__()
        self.bigru = nn.GRU(input_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)

    def forward(self, x):
        """Run the GRU over ``x`` and return its output sequence (final hidden state dropped)."""
        # The previous `batch_size = input_ids.size(0)` read a name that is not
        # defined in this scope (NameError unless a global happened to exist) and
        # the value was never used.
        outputs, _ = self.bigru(x)
        return outputs

class CustomAttention(nn.Module):
    """Single-head scaled dot-product self-attention with learned Q/K/V projections.

    Args:
        hidden_dim: Feature size of the inputs and of the projected queries, keys
            and values.
    """

    def __init__(self, hidden_dim):
        super(CustomAttention, self).__init__()
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, inputs, mask=None):
        """Attend over the sequence dimension of ``inputs``.

        Args:
            inputs: Tensor whose last two dimensions are (sequence, hidden_dim).
            mask: Optional tensor shaped (batch, sequence); non-zero marks positions
                that may be attended to. Masked positions receive zero attention
                weight. When omitted, every position is attended to.

        Returns:
            ``(output, attention_weights)``: the attended values and the softmax
            weights over key positions.
        """
        # The previous `batch_size = input_ids.size(0)` depended on a global that
        # only exists after the training loop has started, and was never used.
        Q = self.query(inputs)
        K = self.key(inputs)
        V = self.value(inputs)

        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(K.size(-1))

        if mask is not None:
            # Broadcast the key mask over the query dimension. The most negative
            # finite value is used instead of -inf so a fully masked row yields
            # uniform weights rather than NaN.
            blocked = (mask == 0).unsqueeze(1)
            attention_scores = attention_scores.masked_fill(
                blocked, torch.finfo(attention_scores.dtype).min
            )

        attention_weights = F.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_weights, V)

        return output, attention_weights

class EntityVectorExtraction(nn.Module):
    """Sum the sequence vectors at the positions selected by an entity mask."""

    def __init__(self):
        super().__init__()

    def forward(self, sequences, entity_mask):
        """Multiply each position of ``sequences`` by its mask value and sum over dim 1."""
        position_weights = entity_mask.float()[..., None]
        return (sequences * position_weights).sum(dim=1)

class DrugInteractionModel(BertPreTrainedModel):
    """BERT encoder followed by a BiGRU, self-attention and entity pooling, with a linear classifier.

    The classifier input is the concatenation of the mean attention output, the two
    entity vectors (GRU outputs summed over the positions flagged by ``mask1`` and
    ``mask2``) and a linear projection of a precomputed sentence embedding.

    Args:
        config: BERT configuration; ``config.num_labels`` sets the number of classes.
        gru_hidden_dim: Total width of the bidirectional GRU output (half per direction).
        embedding_dim: Accepted for interface compatibility; not used. The sentence
            embedding is expected to have 768 features.
        transformed_embedding_dim: Width of the projected sentence embedding.
    """

    def __init__(self, config, gru_hidden_dim=768, embedding_dim=200, transformed_embedding_dim=256):
        super(DrugInteractionModel, self).__init__(config)
        self.bert = BertModel(config)
        self.gru = nn.GRU(config.hidden_size, gru_hidden_dim // 2, bidirectional=True, batch_first=True)

        # Actual GRU output width: two directions of gru_hidden_dim // 2 each.
        gru_output_dim = 2 * (gru_hidden_dim // 2)
        self.custom_attention = CustomAttention(gru_output_dim)
        self.entity_vector_extraction = EntityVectorExtraction()

        # Transformation layer for the precomputed sentence embeddings.
        self.transformed_embedding_dim = transformed_embedding_dim
        self.embedding_transformation = nn.Linear(768, transformed_embedding_dim)

        # Classifier input = attention mean + e1 vector + e2 vector + transformed
        # embedding. With the defaults this is 3 * 768 + 256 = 2560, the value that
        # used to be hard-coded.
        classifier_input_dim = 3 * gru_output_dim + transformed_embedding_dim
        self.classifier = nn.Linear(classifier_input_dim, config.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask, mask1, mask2, embedding=None):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token ids, shaped (batch, sequence).
            attention_mask: BERT attention mask, same shape as ``input_ids``.
            mask1: Mask selecting the positions of the first entity.
            mask2: Mask selecting the positions of the second entity.
            embedding: Optional precomputed sentence embeddings, shaped (batch, 768).
                When omitted, zeros stand in for the projected embedding so the
                classifier input keeps its size.

        Returns:
            Logits shaped (batch, config.num_labels).
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]

        gru_output, _ = self.gru(sequence_output)
        attention_output, _ = self.custom_attention(gru_output)

        e1_vector = self.entity_vector_extraction(gru_output, mask1)
        e2_vector = self.entity_vector_extraction(gru_output, mask2)

        if embedding is not None:
            transformed_embedding = self.embedding_transformation(embedding)
        else:
            # Previously this path concatenated only three parts and then failed
            # with a shape mismatch in the fixed-width classifier.
            transformed_embedding = gru_output.new_zeros(
                (gru_output.size(0), self.transformed_embedding_dim)
            )

        combined_output = torch.cat(
            (attention_output.mean(dim=1), e1_vector, e2_vector, transformed_embedding),
            dim=-1,
        )
        logits = self.classifier(combined_output)

        return logits

# Define your model
config = BertConfig.from_pretrained("dmis-lab/biobert-v1.1", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BioBERT encoder weights into `model.bert`. Constructing the
# model with `DrugInteractionModel(config)` only applies the architecture, leaving
# the encoder randomly initialised — and the encoder is frozen further down, so it
# would never have learned anything. The GRU, attention and classifier layers are
# not in the checkpoint and are newly initialised (transformers warns about this).
# With an explicit `config`, the remaining keyword arguments are passed on to
# `DrugInteractionModel.__init__`.
model = DrugInteractionModel.from_pretrained(
    "dmis-lab/biobert-v1.1",
    config=config,
    gru_hidden_dim=768,
    embedding_dim=200,
)
model.to(device)

# Define optimizer & epochs
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3

# Calculate the total steps for the linear scheduler
total_steps = len(train_loader) * epochs

# Create the scheduler: linear decay to zero over all training steps, no warm-up
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

def evaluate_model_auc(model, dataloader, device):
    """Evaluate a binary classifier on a dataloader.

    The model is put in eval mode and is NOT switched back; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: Callable as ``model(input_ids, attention_mask, mask1, mask2, embeddings)``
            returning logits shaped (batch, 2).
        dataloader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask``, ``labels``, ``mask1``, ``mask2`` and optionally
            ``embedding``.
        device: Device the batch tensors are moved to.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``auc_score``. ``auc_score`` is ``None`` when fewer than two classes are
        present in the labels.
    """
    # Decision threshold on the positive-class probability. Defined before the loop
    # so it exists even when the dataloader yields no batches.
    threshold = 0.5

    model.eval()
    positive_probs, actuals = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device).squeeze(1)
            attention_mask = batch['attention_mask'].to(device).squeeze(1)
            labels = batch['labels'].to(device)
            mask1 = batch['mask1'].to(device)
            mask2 = batch['mask2'].to(device)
            embeddings = batch.get('embedding', None)
            if embeddings is not None:
                embeddings = embeddings.to(device)

            # Obtain model outputs
            logits = model(input_ids, attention_mask, mask1, mask2, embeddings)
            probs = torch.softmax(logits, dim=1)[:, 1]

            # Keep the probabilities, not the arg-max labels: AUC-ROC needs a
            # continuous score to rank examples.
            positive_probs.extend(probs.cpu().numpy().tolist())
            actuals.extend(labels.cpu().numpy().tolist())

    # Convert probabilities to binary predictions for the threshold-based metrics
    binary_predictions = [1 if prob > threshold else 0 for prob in positive_probs]

    accuracy = accuracy_score(actuals, binary_predictions)
    precision = precision_score(actuals, binary_predictions, average='binary', zero_division=1)
    recall = recall_score(actuals, binary_predictions, average='binary', zero_division=1)
    fscore = f1_score(actuals, binary_predictions, average='binary', zero_division=1)
    auc_score = roc_auc_score(actuals, positive_probs) if len(set(actuals)) > 1 else None

    # Return the calculated metrics including accuracy
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': fscore,
        'auc_score': auc_score
    }


# Layer freezing -- retain the learned embeddings while training only the top layers.
# Hopefully helps prevent overfitting and leverages the pretrained model's knowledge.
for param in model.bert.parameters():
    param.requires_grad = False

# One summary line instead of printing the device of every parameter tensor
# (leftover debugging that produced hundreds of output lines).
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f'Trainable parameters: {trainable_params:,} of {total_params:,}')


# Training loop
# Weighted cross-entropy: class weights compensate for label imbalance.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))
best_val_accuracy = 0
best_val_auc = 0


def _format_metrics(metrics):
    """Render the metric dict returned by evaluate_model_auc as a one-line summary."""
    auc = metrics['auc_score']
    auc_text = f"{auc:.4f}" if auc is not None else "n/a (only one class present)"
    return (
        f"F1 Score: {metrics['f1_score']:.4f}, Accuracy: {metrics['accuracy']:.4f}, "
        f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, "
        f"AUC-ROC: {auc_text}"
    )


for epoch in range(epochs):
    print(f'\n======== Epoch {epoch + 1} / {epochs} ========')
    t0 = time.time()
    total_train_loss = 0

    # evaluate_model_auc leaves the model in eval mode, so switch back every epoch.
    model.train()

    for step, batch in enumerate(train_loader, start=1):
        batch_size = batch['input_ids'].size(0)

        # Reshape input_ids and attention_mask to remove the middle dimension
        input_ids = batch['input_ids'].view(batch_size, -1).to(device)
        attention_mask = batch['attention_mask'].view(batch_size, -1).to(device)
        embeddings = batch['embedding'].to(device)
        mask1 = batch['mask1'].to(device)
        mask2 = batch['mask2'].to(device)
        labels = batch['labels'].to(device)

        model.zero_grad()

        # Single forward pass per batch (the model used to be called twice, with
        # the first result thrown away).
        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            mask1=mask1,
            mask2=mask2,
            embedding=embeddings,
        )
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_train_loss += loss.item()

        # Progress report every 100 batches
        if step % 100 == 0:
            elapsed = format_time(time.time() - t0)
            print(f'Batch {step} of {len(train_loader)}. Elapsed: {elapsed}.')

    avg_train_loss = total_train_loss / max(len(train_loader), 1)
    print(f"Average training loss: {avg_train_loss:.2f}")
    print(f"Training epoch took: {format_time(time.time() - t0)}")

    # Evaluation phase: one pass over the validation set per epoch
    print("\nRunning Validation...")
    validation_metrics = evaluate_model_auc(model, validation_loader, device)
    print(f"Validation Metrics: {_format_metrics(validation_metrics)}")

    # Keep the best checkpoint by validation accuracy
    if validation_metrics['accuracy'] > best_val_accuracy:
        best_val_accuracy = validation_metrics['accuracy']
        print(f"New best accuracy: {best_val_accuracy:.4f}, saving model...")
        torch.save(model.state_dict(), 'best_model_state_acc.bin')

    # Keep the best checkpoint by validation AUC-ROC (None when only one class is present)
    val_auc = validation_metrics['auc_score']
    if val_auc is not None and val_auc > best_val_auc:
        best_val_auc = val_auc
        print(f"New best AUC-ROC: {best_val_auc:.4f}, saving model...")
        torch.save(model.state_dict(), 'best_model_state_auc.bin')

    # One pass over the test set per epoch, for reporting only
    print("\nRunning Test...")
    test_metrics = evaluate_model_auc(model, test_loader, device)
    print(f"Test Metrics: {_format_metrics(test_metrics)}")

    # Save/Export the model after every epoch
    model.save_pretrained("drive/MyDrive/ddi_training/")
    tokenizer.save_pretrained("drive/MyDrive/ddi_training/")



---

# DEBUGGING SPACE

In [None]:
### Debugging Scripts
import pickle

# Define paths to newly processed data
training_path = 'drive/MyDrive/ddi_training/trainsentence_token.txt'
testing_path = 'drive/MyDrive/ddi_training/testsentence_token.txt'

# Load the pickled embedding dictionaries for both files
with open('drive/MyDrive/ddi_training/trainsentence_embeddings.pkl', 'rb') as f:
    train_embedding_matrix = pickle.load(f)

with open('drive/MyDrive/ddi_training/testsentence_embeddings.pkl', 'rb') as f:
    test_embedding_matrix = pickle.load(f)

# Function to verify that every line of a sentence file has an entry in the embeddings
def verify_embeddings_with_data(file_path, embeddings):
    """Return the 0-based line indices of ``file_path`` that are not keys of ``embeddings``."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [line_no for line_no, _ in enumerate(handle) if line_no not in embeddings]

# Find the sentence indices that have no embedding, for each of the two files
missing_keys_training = verify_embeddings_with_data(training_path, train_embedding_matrix)
missing_keys_testing = verify_embeddings_with_data(testing_path, test_embedding_matrix)

# Show only the first 10 of each for brevity
for heading, missing in (
    ("Missing Keys in Training Data:", missing_keys_training),
    ("Missing Keys in Testing Data:", missing_keys_testing),
):
    print(heading, missing[:10])


Missing Keys in Training Data: []
Missing Keys in Testing Data: []
