In [1]:
import torch

def check_cuda_availability():
    """
    Checks if CUDA is available o the system.

    Returns:
        bool: True if CUDA is available, False otherwise.
    """
    return torch.cuda.is_available()

check_cuda_availability()
torch.cuda.is_available()

True

In [13]:
import warnings
from transformers import logging

warnings.filterwarnings("ignore")
logging.set_verbosity_error()
"""
This script disables warnings and sets the logging verbosity to error level.

Warnings are filtered out to avoid cluttering the output with non-critical messages.
The logging verbosity is set to error level to only display error messages.

This script is typically used in situations where warnings are not relevant or can be safely ignored.
"""


In [14]:
import csv
import json
import pandas as pd
import numpy as np
import torch

In [15]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertForQuestionAnswering, BertTokenizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.cuda.amp import autocast, GradScaler
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

In [16]:
batch_size = 8
epochs = 3
learning_rate = 2e-5
# create a dataframe with sample data
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35], 'Gender': ['F', 'M', 'M']})
print(df)


In [17]:
class SQuAD_Data(Dataset):
    def __init__(self, file_path):
        """
        Initializes an instance of the class.

        Parameters:
        - file_path (str): The path to the file containing the data.

        Attributes:
        - context_list (list): A list to store the context strings.
        - question_list (list): A list to store the question strings.
        - answer_list (list): A list to store the answer strings.
        - start_pos_list (list): A list to store the start positions of the answers.
        - end_pos_list (list): A list to store the end positions of the answers.
        """
        self.context_list = []
        self.question_list = []
        self.answer_list = []
        self.start_pos_list = []
        self.end_pos_list = []
        with open(file_path, 'r', encoding='cp1252') as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                self.context_list.append(row[0])
                self.question_list.append(row[1])
                self.answer_list.append(row[2])
                self.start_pos_list.append(int(row[3]))
                self.end_pos_list.append(int(row[4]))

    def __len__(self):
        """
        Returns the length of the dataset.

        Returns:
        - int: The number of samples in the dataset.
        """
        return len(self.context_list)

    def __getitem__(self, idx):
        """
        Returns the sample at the given index.

        Parameters:
        - idx (int): The index of the sample to retrieve.

        Returns:
        - dict: A dictionary containing the context, question, answer, start position, and end position of the sample.
        """
        return {
            'context': self.context_list[idx],
            'question': self.question_list[idx],
            'answer': self.answer_list[idx],
            'start_pos': self.start_pos_list[idx],
            'end_pos': self.end_pos_list[idx]
        }


In [38]:
def load_data():
    """
    Loads the SQuAD dataset and creates data loaders for training and testing.

    Returns:
        train_dl (DataLoader): DataLoader for the training dataset.
        test_dl (DataLoader): DataLoader for the testing dataset.
        ds_size (int): Size of the training dataset.
        dl_size (int): Size of the training data loader.
        ds_test_size (int): Size of the testing dataset.
        dl_test_size (int): Size of the testing data loader.
    """
    train_ds = SQuAD_Data('squad_train_data.csv')
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    ds_size = len(train_ds)
    dl_size = len(train_dl)

    test_ds = SQuAD_Data('squad_test_data.csv')
    test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True)
    ds_test_size = len(test_ds)
    dl_test_size = len(test_dl)

    return train_dl, test_dl, ds_size, dl_size, ds_test_size, dl_test_size

train_dl, test_dl, ds_size, dl_size, ds_test_size, dl_test_size = load_data()

print(f'Training dataset size: {ds_size}')
print(f'Training loader size: {dl_size}')
print(f'Testing dataset size: {ds_test_size}')
print(f'Testing loader size: {dl_test_size}')

Training dataset size: 37079
Training loader size: 4635
Testing dataset size: 5351
Testing loader size: 669


In [25]:
def train_fn(train_model, data_ld, optim, device, acc_steps, tokenizer):
    """
    Trains the model on the given data loader.

    Args:
        train_model (nn.Module): The model to be trained.
        data_ld (DataLoader): The data loader containing the training data.
        optim (Optimizer): The optimizer used for training.
        device (str): The device to be used for training (e.g., 'cuda', 'cpu').
        acc_steps (int): The number of gradient accumulation steps.
        tokenizer (Tokenizer): The tokenizer used for encoding the input data.

    Returns:
        float: The average training loss per data point.
    """
    train_model.train()
    train_model.to(device)
    t_loss = 0
    scaler_object = GradScaler() 
    batch_count = 0
    for data in data_ld:
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        start_pos = data['start_pos'].to(device)
        end_pos = data['end_pos'].to(device)
        optim.zero_grad()
        with autocast(): 
            outputs = train_model(**inputs, start_positions=start_pos, end_positions=end_pos)
            loss = outputs.loss
        scaler_object.scale(loss).backward()
        batch_count += 1
        if batch_count % acc_steps == 0:
            scaler_object.step(optim)  
            scaler_object.update()  
            optim.zero_grad()   
        loss_val = loss.item()
        if str(loss_val) == 'nan':
            loss_val = 0
        t_loss += loss_val
    if batch_count % acc_steps != 0:
        scaler_object.step(optim)  
        scaler_object.update() 
        optim.zero_grad() 
    return t_loss / len(data_ld)


def test_fn(test_model, data_ld, optim, device, tokenizer):
    """
    Function to evaluate the performance of a model on test data.

    Args:
        test_model (torch.nn.Module): The model to be evaluated.
        data_ld (torch.utils.data.DataLoader): The data loader containing the test data.
        optim (torch.optim.Optimizer): The optimizer used for training the model.
        device (torch.device): The device on which the model and data should be loaded.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for tokenizing the input data.

    Returns:
        float: The average loss value over the test data.

    """
    t_loss = 0.0
    test_model.eval()
    test_model.to(device)
    scaler_object = GradScaler() 
    for data in data_ld:
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        start_pos = data['start_pos'].to(device)
        end_pos = data['end_pos'].to(device)
        optim.zero_grad()
        with autocast():  
            outputs = test_model(**inputs, start_positions=start_pos, end_positions=end_pos)
            loss = outputs.loss
        loss_val = loss.item()
        if str(loss_val) == 'nan':
            loss_val = 0
        t_loss += loss_val
    return t_loss / len(data_ld)


In [34]:
"""
This code snippet trains a BERT-based Question Answering model using the SQuAD dataset.
It initializes the device based on the availability of CUDA, prints the device type,
loads the BERT tokenizer and model, sets up the optimizer and scheduler, and moves the model to the device.
Then, it trains the model for the specified number of epochs, printing the training and testing loss for each epoch.
The learning rate is reduced on a plateau using the testing loss as a metric.
"""

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_length=512)
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
model = model.to(device)

for epoch in range(epochs):
    train_loss = train_fn(model, train_dl, optimizer, device, acc_steps=2, tokenizer=tokenizer)
    test_loss = test_fn(model, test_dl, optimizer, device, tokenizer=tokenizer)
    print(f'Epoch {epoch+1}, train loss {train_loss}, test loss {test_loss}')
    scheduler.step(test_loss)


cuda
Epoch 1 , train loss 5.6710381880402565, test loss 5.692981338500976
Epoch 2 , train loss 5.431648567318916, test loss 5.71663028717041
Epoch 3 , train loss 5.215241692960262, test loss 5.727792015075684


In [35]:
def predicting_answer(model, tokenizer, context, question):
    """
    Predicts the answer to a given question based on the context using a pre-trained model.

    Args:
        model (torch.nn.Module): The pre-trained model used for prediction.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to tokenize the input.
        context (str): The context in which the question is asked.
        question (str): The question to be answered.

    Returns:
        str: The predicted answer to the question.

    """
    inputs = tokenizer(question, context, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    input_ids = inputs['input_ids'].squeeze()
    output = model(**inputs)
    start_index = torch.argmax(output.start_logits)
    end_index = torch.argmax(output.end_logits) + 1
    if end_index < start_index:
        start_index, end_index = end_index, start_index
    predicted_answer = tokenizer.decode(input_ids[start_index:end_index])
    return predicted_answer


In [37]:
para = "super bowl fifty was an american football game to determine the champion of the national football league nfl for the twenty fifteen season. the american football conference a f c c champion denver broncos defeated the national football conference n f c c champion carolina panthers twenty four to ten to earn their third super bowl title. the game was played on february seventh twenty sixteen and levis stadium in the san francisco bay area santa clara california. as this was the fiftieth super bowl the league emphasized the golden anniversary with various goldsteins initiatives as well as temporarily suspending the tradition of naming each super bowl game with roman numerals under which they gain would have been known as super bowl l sell that the logo could prominently featured the arabic numerals fifty."
ques = "What does AFC stand for?"
predicted_answer = predicting_answer(model, tokenizer, para, ques)
print(predicted_answer)

american football conference


In [28]:
"""
This code snippet trains a DistilBERT model for question answering using the SQuAD dataset.
It initializes a DistilBertTokenizer and a DistilBertForQuestionAnswering model.
The optimizer used is AdamW with a specified learning rate.
The model is then moved to the specified device (e.g., GPU) for training.
The training loop runs for the specified number of epochs.
In each epoch, the model is trained using the train_fn function and evaluated using the test_fn function.
The train and test losses are printed for each epoch.
The learning rate scheduler is updated based on the test loss.
"""

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased', max_length=512)
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

model = model.to(device)
for epoch in range(epochs):
    train_loss = train_fn(model, train_dl, optimizer, device, acc_steps=2)
    test_loss = test_fn(model, test_dl, optimizer, device)
    print(f'Epoch {epoch+1}, train loss {train_loss}, test loss {test_loss}')
    scheduler.step(test_loss)


Epoch 1 , train loss 5.75785543769598, test loss 5.828500843048095
Epoch 2 , train loss 5.645928509533405, test loss 5.828914890289306
Epoch 3 , train loss 5.3587766364216805, test loss 5.79552303314209


In [29]:
def predict_answer(model, tokenizer, context, question):
    """
    Predicts the answer to a given question based on the context using a pre-trained model.

    Args:
        model (torch.nn.Module): The pre-trained model used for prediction.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to tokenize the input.
        context (str): The context in which the question is asked.
        question (str): The question to be answered.

    Returns:
        str: The predicted answer to the question.
    """
    inputs = tokenizer(question, context, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    input_ids = inputs['input_ids'].squeeze()
    output = model(**inputs)
    start_index = torch.argmax(output.start_logits)
    end_index = torch.argmax(output.end_logits) + 1
    if end_index < start_index:
        start_index, end_index = end_index, start_index
    predicted_answer = tokenizer.decode(input_ids[start_index:end_index])
    return predicted_answer


In [33]:
para = "super bowl fifty was an american football game to determine the champion of the national football league nfl for the twenty fifteen season. the american football conference a f c c champion denver broncos defeated the national football conference n f c c champion carolina panthers twenty four to ten to earn their third super bowl title. the game was played on february seventh twenty sixteen and levis stadium in the san francisco bay area santa clara california. as this was the fiftieth super bowl the league emphasized the golden anniversary with various goldsteins initiatives as well as temporarily suspending the tradition of naming each super bowl game with roman numerals under which they gain would have been known as super bowl l sell that the logo could prominently featured the arabic numerals fifty."
ques = "What does AFC stand for?"
paragraph = "super bowl fifty was an american football game to determine the champion of the national football league nfl for the twenty fifteen season. the american football conference a f c c champion denver broncos defeated the national football conference n f c c champion carolina panthers twenty four to ten to earn their third super bowl title. the game was played on february seventh twenty sixteen and levis stadium in the san francisco bay area santa clara california. as this was the fiftieth super bowl the league emphasized the golden anniversary with various goldsteins initiatives as well as temporarily suspending the tradition of naming each super bowl game with roman numerals under which they gain would have been known as super bowl l sell that the logo could prominently featured the arabic numerals fifty."
question = "What does AFC stand for?"

predicted_answer = predicting_answer(model, tokenizer, paragraph, question)
print(predicted_answer)

american football conference
