In [1]:
# Installing necessary libraries (if not already installed)
!pip install -q transformers


In [4]:
# Importing necessary libraries and modules
import logging
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast
from google.colab import drive
import os

# Mount Google Drive so that files stored there are reachable under /content/drive
drive.mount('/content/drive')

# Make the project folder on Drive the working directory, so the relative paths used
# later in the notebook (the preprocessed CSV, the training log) resolve inside it
os.chdir('/content/drive/My Drive/Dataspeak')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Route training diagnostics to a log file in the current working directory
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="bert_training_log.log",
)

# Checkpoint to start from: the tokenizer and the QA model are loaded from the same name
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Read the preprocessed question/context/answer-span table
preprocessed_data_path = "preprocessed_data.csv"
data = pd.read_csv(preprocessed_data_path)

# Every row must carry both answer-span labels; abort early if any is missing
for label_column in ("Start_Positions", "End_Positions"):
    assert data[label_column].notna().all()

# Hold out 10% of the rows for validation (fixed seed keeps the split reproducible)
train_data, val_data = train_test_split(data, random_state=42, test_size=0.1)

# Gradient scaler used by the mixed-precision training loop
scaler = GradScaler()

class QADataset(Dataset):
    """Extractive question-answering dataset that tokenizes one (question, context) pair per item.

    Args:
        questions: Indexable collection of question strings.
        contexts: Indexable collection of context strings, aligned with ``questions``.
        start_positions: Indexable collection of integer answer-start labels.
        end_positions: Indexable collection of integer answer-end labels.
        tokenizer: Optional callable used to encode each pair. When ``None`` (the default),
            the notebook-level ``tokenizer`` global is looked up at item-access time.
        max_length: Length every encoded pair is truncated/padded to (default 512).

    ``__getitem__`` returns ``None`` for an item whose question or context is not a ``str``
    or whose tokenization raises; the failure is written to the log. Consumers (for example
    a collate function) are expected to drop such ``None`` items.
    """

    def __init__(self, questions, contexts, start_positions, end_positions,
                 tokenizer=None, max_length=512):
        self.questions = questions
        self.contexts = contexts
        self.start_positions = start_positions
        self.end_positions = end_positions
        self._tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (question, context) pairs."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Encode item ``idx`` and return a dict of tensors, or ``None`` if it is unusable.

        The dict holds ``input_ids`` and ``attention_mask`` (each 1-D, ``max_length`` long),
        scalar ``start_positions`` / ``end_positions`` labels, and ``token_type_ids`` whenever
        the tokenizer produces them.
        """
        question = self.questions[idx]
        context = self.contexts[idx]
        if not isinstance(question, str) or not isinstance(context, str):
            logging.error(
                f"Invalid types - Question: {type(question)}, Context: {type(context)} at index {idx}"
            )
            return None

        # Fall back to the module-level `tokenizer` lazily so the class keeps working
        # exactly as before when no tokenizer is injected.
        encode = self._tokenizer if self._tokenizer is not None else tokenizer
        try:
            inputs = encode(
                question,
                context,
                return_tensors="pt",
                max_length=self.max_length,
                truncation=True,
                padding="max_length",  # Ensure sequences are padded to max_length
            )
        except Exception as e:
            # Best effort: a single bad row is logged and skipped rather than aborting training.
            logging.error(f"Error in tokenization at index {idx}: {e}")
            return None

        start_position = self.start_positions[idx]
        end_position = self.end_positions[idx]

        # These are ground-truth labels (not model predictions); if a row has them reversed,
        # swap them so that start <= end.
        if start_position > end_position:
            start_position, end_position = end_position, start_position

        # squeeze(0) drops only the leading batch dimension added by return_tensors="pt".
        item = {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "start_positions": torch.tensor(start_position, dtype=torch.long),
            "end_positions": torch.tensor(end_position, dtype=torch.long),
        }
        # Keep the segment ids so the model can tell question tokens from context tokens;
        # without them every token is treated as belonging to the first segment.
        if "token_type_ids" in inputs:
            item["token_type_ids"] = inputs["token_type_ids"].squeeze(0)
        return item


# Collate function that tolerates samples which are None
def custom_collate(batch):
    """Collate a batch after discarding every entry that is ``None``.

    Returns ``None`` when no entry survives the filtering; otherwise returns the
    result of PyTorch's default collate applied to the remaining entries.
    """
    usable = [sample for sample in batch if sample is not None]
    if not usable:
        return None  # nothing left to collate in this batch
    return torch.utils.data.dataloader.default_collate(usable)


def _qa_dataset_from(frame):
    """Wrap one split of the preprocessed table in a QADataset."""
    return QADataset(
        questions=frame["Cleaned_Title"].tolist(),
        contexts=frame["BERT_Context"].tolist(),
        start_positions=frame["Start_Positions"].astype(int).tolist(),
        end_positions=frame["End_Positions"].astype(int).tolist(),
    )


# The training and validation splits use the same column-to-argument mapping
train_dataset = _qa_dataset_from(train_data)
val_dataset = _qa_dataset_from(val_data)

# Both loaders go through custom_collate; only the training loader shuffles
batch_size = 5
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate
)
val_dataloader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate
)

# Setup the optimizer.
# Fine-tuning a pretrained BERT needs a very small step size (values around 2e-5 to 5e-5
# are the usual range); the previous lr=0.9 would wreck the pretrained weights on the
# first update.
learning_rate = 3e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Fine-tune the model
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
model.to(device)

# Initialize the progress bar: one tick per training batch across all epochs
progress_bar = tqdm(total=num_epochs * len(train_dataloader), desc="Training Progress")

# Number of batches whose gradients are accumulated before each optimizer step
gradient_accumulation_steps = 4  # Adjust if necessary

# Early-stopping state
best_val_loss = float('inf')
patience = 2  # Number of epochs with no improvement to wait
no_improve_epoch = 0

# Training / validation loop with mixed precision and gradient accumulation.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    train_batches = 0  # batches that actually contributed to total_loss
    pending_micro_batches = 0  # backward passes since the last optimizer step
    optimizer.zero_grad()
    for step, batch in enumerate(train_dataloader):
        progress_bar.update(1)

        # custom_collate returns None when every sample of a batch was filtered out.
        if batch is None:
            logging.warning(
                f"Skipping empty training batch at step {step + 1} in epoch {epoch + 1}"
            )
            continue
        batch = {k: v.to(device) for k, v in batch.items()}

        # Use autocast to enable mixed precision training
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # Backward on the loss normalised for accumulation and scaled by GradScaler ...
        scaler.scale(loss / gradient_accumulation_steps).backward()
        # ... but track the plain per-batch loss so the logged value is meaningful.
        total_loss += loss.item()
        train_batches += 1
        pending_micro_batches += 1

        if pending_micro_batches == gradient_accumulation_steps:
            # Perform a step using GradScaler
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            pending_micro_batches = 0
            logging.info(
                f"Epoch {epoch + 1}, step {step + 1}: running training loss {total_loss / train_batches}"
            )

    # Apply gradients left over from a trailing, incomplete accumulation group instead of
    # discarding them (they stay normalised by the full accumulation count).
    if pending_micro_batches > 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    avg_train_loss = total_loss / train_batches if train_batches else float("nan")
    logging.info(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

    model.eval()
    val_loss = 0.0
    val_batches = 0
    with torch.no_grad():
        for batch in val_dataloader:
            if batch is None:  # all samples of this batch were invalid
                continue
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            val_batches += 1

    avg_val_loss = val_loss / val_batches if val_batches else float("nan")
    logging.info(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss}")

    # Early stopping: stop once validation loss has failed to improve for `patience`
    # consecutive epochs (a NaN average counts as "no improvement").
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improve_epoch = 0
    else:
        no_improve_epoch += 1
        if no_improve_epoch >= patience:
            logging.info(f"Early stopping after epoch {epoch + 1}")
            break

progress_bar.close()

# Persist the model as it stands after the last completed epoch, together with its tokenizer
model.save_pretrained("/content/drive/My Drive/Dataspeak/fine_tuned_bert")
tokenizer.save_pretrained("/content/drive/My Drive/Dataspeak/fine_tuned_bert")

logging.info("Training completed")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training Progress:   5%|▍         | 8248/174393 [1:10:19<23:49:28,  1.94it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Training Progress:   5%|▍         | 8252/174393 [1:10:21<23:40:19,  1.95it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with