<a href="https://colab.research.google.com/github/clementlemon02/extractive-question-answering/blob/main/RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# `json` is imported here because this is the first cell that needs it; relying
# on an import from a later cell breaks Restart & Run All.
import json

# Parse the SQuAD training file. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open('train-v1.1.json', 'rb') as squad_file:
    squad_data = json.load(squad_file)

# Dataset overview
print(f"Dataset version: {squad_data['version']}")
print(f"Number of articles: {len(squad_data['data'])}")

# Sample entry: the first article, its title, and the first question-answer pair
print(f"\nSample article title: {squad_data['data'][0]['title']}")
print(f"Number of paragraphs in the first article: {len(squad_data['data'][0]['paragraphs'])}")

# Sample paragraph and QA pair
first_paragraph = squad_data['data'][0]['paragraphs'][0]
print(f"\nFirst paragraph context: {first_paragraph['context'][:500]}")  # Displaying part of the context
print(f"Number of Q&A pairs in this paragraph: {len(first_paragraph['qas'])}")

# Sample Question-Answer pair
sample_qa = first_paragraph['qas'][0]
print(f"\nSample question: {sample_qa['question']}")
print(f"Sample answer: {sample_qa['answers'][0]['text']}")


Dataset version: 1.1
Number of articles: 442

Sample article title: University_of_Notre_Dame
Number of paragraphs in the first article: 55

First paragraph context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputed
Number of Q&A pairs in this paragraph: 5

Sample question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Sample answer: Saint Bernadette Soubirous


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import pandas as pd
import json
import numpy as np
from tqdm import tqdm

class SQuADDataset(Dataset):
    """Torch dataset that flattens SQuAD-style JSON into one item per answer.

    Every (question, answer) pair found under ``data['data']`` becomes one
    entry, so a question with several reference answers appears several times.
    Tokenization happens lazily in ``__getitem__``.

    Args:
        data: Parsed SQuAD JSON with the ``data -> paragraphs -> qas -> answers``
            nesting.
        tokenizer: Callable tokenizer that supports ``return_offsets_mapping``
            and whose result exposes ``sequence_ids()`` (Hugging Face "fast"
            tokenizers do).
        max_length: Length every encoded example is padded/truncated to.
        stride: Overlap between consecutive windows when the context overflows.
    """

    def __init__(self, data, tokenizer, max_length=512, stride=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

        self.dataset = []

        for article in data['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    question = qa['question']
                    question_id = qa['id']

                    # One entry per reference answer; questions with an empty
                    # answer list contribute nothing.
                    for answer in qa['answers']:
                        self.dataset.append({
                            'question_id': question_id,
                            'question': question,
                            'context': context,
                            'answer_start': answer['answer_start'],
                            'answer_text': answer['text']
                        })

        print(f"Dataset initialized with {len(self.dataset)} items")

    def __len__(self):
        """Return the number of flattened (question, answer) items."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode item ``idx`` and locate its answer span in token space.

        Only the first window produced by the sliding-window tokenization is
        returned. If the answer is not fully contained in that window, both
        labels point at the CLS token.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of the
            first window), scalar long tensors ``start_positions`` and
            ``end_positions``, and the string ``question_id``.
        """
        item = self.dataset[idx]

        answer_start = item['answer_start']
        # Exclusive character index of the end of the answer in the context.
        answer_end = answer_start + len(item['answer_text'])

        # Tokenize the question and context with a sliding window
        tokenized_item = self.tokenizer(
            item['question'],
            item['context'],
            max_length=self.max_length,
            truncation='only_second',
            stride=self.stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding='max_length',
            return_tensors='pt'
        )

        input_ids = tokenized_item['input_ids'][0]
        offsets = tokenized_item['offset_mapping'][0].tolist()
        # 0 = question token, 1 = context token, None = special/padding token.
        sequence_ids = tokenized_item.sequence_ids(0)

        # The fallback label must be the *position* of the CLS token in the
        # sequence, not its vocabulary id (the two only coincide by accident).
        cls_id = self.tokenizer.cls_token_id
        id_list = input_ids.tolist()
        cls_position = id_list.index(cls_id) if cls_id in id_list else 0

        start_position = None
        end_position = None
        for token_idx, (char_start, char_end) in enumerate(offsets):
            # Question offsets are relative to the question string, so they
            # must not be compared against character positions in the context.
            if sequence_ids[token_idx] != 1:
                continue
            if start_position is None and char_start <= answer_start < char_end:
                start_position = token_idx
            if end_position is None and char_start < answer_end <= char_end:
                end_position = token_idx

        # Answer missing or cut off by truncation: label the example with CLS
        # for both ends so the span is never inverted or half-defined.
        if start_position is None or end_position is None or end_position < start_position:
            start_position = end_position = cls_position

        return {
            'input_ids': input_ids,
            'attention_mask': tokenized_item['attention_mask'][0],
            'start_positions': torch.tensor(start_position, dtype=torch.long),
            'end_positions': torch.tensor(end_position, dtype=torch.long),
            'question_id': item['question_id']
        }

    def export_to_csv(self, filename):
        """Write the flattened items to ``filename`` as CSV (no index column)."""
        # Each item already holds exactly the exported columns, in order.
        df = pd.DataFrame(self.dataset)
        df.to_csv(filename, index=False)
        print(f"Dataset exported to {filename}")

    def export_to_squad_format(self, filename):
        """Write the items back out as SQuAD-style JSON, one article per context.

        Each distinct context becomes its own single-paragraph article titled
        "Generated Article"; original article titles and groupings are not kept.
        """
        # NOTE(review): the version label is hard-coded regardless of the
        # version of the data that was loaded; confirm this is intended.
        squad_format = {
            "version": "v2.0",
            "data": []
        }

        # Group items by context (dict preserves first-seen order)
        context_groups = {}
        for item in self.dataset:
            context_groups.setdefault(item['context'], []).append(item)

        # Build the SQuAD format structure
        for context, items in context_groups.items():
            qas = [
                {
                    "question": item['question'],
                    "id": item['question_id'],
                    "answers": [{
                        "text": item['answer_text'],
                        "answer_start": item['answer_start']
                    }]
                }
                for item in items
            ]
            squad_format['data'].append({
                "title": "Generated Article",
                "paragraphs": [{
                    "context": context,
                    "qas": qas
                }]
            })

        # Write to JSON file
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(squad_format, f, ensure_ascii=False, indent=2)

        print(f"Dataset exported in SQuAD format to {filename}")


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint to start from: RoBERTa model fine-tuned on SQuAD 2.0
model_name = 'deepset/roberta-base-squad2'

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=True,
)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Quick confirmation of what was actually instantiated
print("Model loaded: {}".format(model.__class__.__name__))
print("Model name: {}".format(model.name_or_path))


Model loaded: RobertaForQuestionAnswering
Model name: deepset/roberta-base-squad2


In [None]:
# Build the dataset from the class defined above and inspect a few items
dataset = SQuADDataset(squad_data, tokenizer)

print("\nTesting final dataset access:")
for i in range(3):  # first 3 items only
    sample = dataset[i]
    print(f"\nSample {i}:")
    for k, v in sample.items():
        # Tensors are summarised by shape, everything else is shown as-is
        shown = v.shape if isinstance(v, torch.Tensor) else v
        print(f"  {k}: {shown}")

    # Labelled answer span in token space
    start_pos = sample['start_positions'].item()
    end_pos = sample['end_positions'].item()
    print(f"  Start position: {start_pos}")
    print(f"  End position: {end_pos}")

    # Decoding the labelled tokens should reproduce the reference answer
    decoded_answer = tokenizer.decode(sample['input_ids'][start_pos:end_pos+1])
    print(f"  Decoded answer: {decoded_answer}")

    # Beginning of the full encoded input (first 200 characters)
    full_input = tokenizer.decode(sample['input_ids'])
    print(f"  Full input: {full_input[:200]}...")

# Range checks: both labels must index into the input and be ordered
print("\nVerifying start and end positions:")
for i in range(3):
    sample = dataset[i]
    input_length = sample['input_ids'].shape[0]
    start_pos = sample['start_positions'].item()
    end_pos = sample['end_positions'].item()
    print(f"Sample {i}:")
    print(f"  Input length: {input_length}")
    print(f"  Start position: {start_pos}")
    print(f"  End position: {end_pos}")
    assert 0 <= start_pos < input_length, f"Start position {start_pos} out of range"
    assert 0 <= end_pos < input_length, f"End position {end_pos} out of range"
    assert start_pos <= end_pos, f"Start position {start_pos} greater than end position {end_pos}"

Dataset initialized with 87599 items

Testing final dataset access:

Sample 0:
  input_ids: torch.Size([512])
  attention_mask: torch.Size([512])
  start_positions: torch.Size([])
  end_positions: torch.Size([])
  question_id: 5733be284776f41900661182
  Start position: 135
  End position: 142
  Decoded answer:  Saint Bernadette Soubirous
  Full input: <s>To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?</s></s>Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Vir...

Sample 1:
  input_ids: torch.Size([512])
  attention_mask: torch.Size([512])
  start_positions: torch.Size([])
  end_positions: torch.Size([])
  question_id: 5733be284776f4190066117f
  Start position: 54
  End position: 58
  Decoded answer:  a copper statue of Christ
  Full input: <s>What is in front of the Notre Dame Main Building?</s></s>Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golde

In [None]:
import json

# Load the dataset from local file
file_path = 'train-v1.1.json'

# Open and load the JSON file. The encoding is given explicitly (as for the dev
# split below) so decoding does not depend on the platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file:
    squad_data = json.load(file)

with open('dev-v1.1.json', 'r', encoding='utf-8') as f:
    val_squad_data = json.load(f)

# Create the training and validation datasets with identical tokenization settings
train_dataset = SQuADDataset(squad_data, tokenizer, max_length=512, stride=128)
val_dataset = SQuADDataset(val_squad_data, tokenizer, max_length=512, stride=128)


Dataset initialized with 87599 items
Dataset initialized with 34726 items


In [None]:
from torch.utils.data import DataLoader

# Wrap the training dataset (created in the previous cell) in a shuffling loader
batch_size = 16  # You can adjust this based on your GPU memory
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Smoke test: pull a single batch and describe each field
for batch in dataloader:
    print("Batch shapes:")
    for k, v in batch.items():
        # Tensors report their shape; other values report type and length
        summary = v.shape if isinstance(v, torch.Tensor) else f"{type(v)} (length: {len(v)})"
        print(f"  {k}: {summary}")
    break  # one batch is enough

Batch shapes:
  input_ids: torch.Size([16, 512])
  attention_mask: torch.Size([16, 512])
  start_positions: torch.Size([16])
  end_positions: torch.Size([16])
  question_id: <class 'list'> (length: 16)


In [None]:
from transformers import default_data_collator
from torch.utils.data import DataLoader
from transformers import AutoModelForQuestionAnswering, AdamW, get_scheduler
import torch
from tqdm.auto import tqdm

def train_model(
    train_dataset,
    val_dataset,
    model_name="bert-base-uncased",
    batch_size=8,
    epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=0,
    output_dir="model_output"
):
    """Fine-tune a question-answering model and checkpoint it after every epoch.

    A fresh model and tokenizer are loaded from ``model_name`` inside this
    function; any model object the caller already holds is not modified. The
    tokenizer is only used to be saved next to each checkpoint, so the datasets
    must already be tokenized compatibly with ``model_name``.

    Args:
        train_dataset: Dataset yielding ``input_ids``, ``attention_mask``,
            ``start_positions`` and ``end_positions`` tensors.
        val_dataset: Dataset of the same form, used for the per-epoch loss.
        model_name: Hub id or local path of the checkpoint to fine-tune.
        batch_size: Batch size for both loaders.
        epochs: Number of passes over ``train_dataset``.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        warmup_steps: Warm-up steps of the linear schedule.
        output_dir: Directory receiving ``checkpoint_epoch_<n>`` sub-folders.

    Returns:
        The fine-tuned model, on the device it was trained on and in training
        mode (call ``.eval()`` before inference).
    """
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Set up the DataLoader for training and validation datasets
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=default_data_collator
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=default_data_collator
    )

    # Move model to GPU if available
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    # Set up the optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
    )

    # Set up a progress bar spanning all epochs
    progress_bar = tqdm(range(num_training_steps))

    # Training loop
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        total_loss = 0

        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            # The model computes the span loss itself because the batch
            # carries start_positions / end_positions.
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()

            # Backpropagation
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.update(1)

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss}")

        # Evaluate on the validation set
        eval_loss = evaluate(model, val_dataloader, device)
        print(f"Validation loss: {eval_loss}")

        # Save the model at the end of each epoch
        checkpoint_dir = f"{output_dir}/checkpoint_epoch_{epoch+1}"
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)

    progress_bar.close()
    print("Training complete!")

    # Hand the trained weights back; without this the only way to reach them
    # is to reload a checkpoint from disk.
    return model


def evaluate(model, val_dataloader, device):
    """Return the mean per-batch loss of ``model`` over ``val_dataloader``.

    Args:
        model: Module that is called as ``model(**batch)`` and returns an
            object with a ``loss`` attribute.
        val_dataloader: Iterable of dict batches whose values have ``.to()``.
        device: Device every batch value is moved to before the forward pass.

    Returns:
        The average loss as a float, or ``nan`` when the loader yields no
        batches (instead of raising ZeroDivisionError).
    """
    # Remember the caller's mode so evaluation does not silently switch a model
    # that was in eval mode over to training mode.
    was_training = model.training
    model.eval()

    total_loss = 0.0
    num_batches = 0
    try:
        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                total_loss += outputs.loss.item()
                num_batches += 1
    finally:
        model.train(was_training)

    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches


In [None]:
from transformers import default_data_collator

def custom_collator(features):
    """Collate ``features`` with the default collator and attach question ids.

    Returns the batch produced by ``default_data_collator`` with an extra
    ``'question_id'`` entry: a list of each feature's ``'question_id'`` in
    input order.
    """
    # Collect the ids up front, before the features are handed to the collator
    ids = [example['question_id'] for example in features]

    collated = default_data_collator(features)
    collated['question_id'] = ids
    return collated


In [None]:
def predict(model, val_dataset, tokenizer, batch_size=8, device=None):
    """Predict one answer string per dataset item.

    Args:
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``.
        val_dataset: Dataset whose items carry tensors for the model plus a
            ``'question_id'`` entry.
        tokenizer: Tokenizer used to turn the predicted token ids into text.
        batch_size: Number of items per forward pass.
        device: Target device; defaults to CUDA when available, else CPU.

    Returns:
        List of ``{"question_id": ..., "answer": ...}`` dicts, one per item, in
        dataset order. The model is left in eval mode on ``device``.
    """
    # Set model to evaluation mode
    model.eval()

    # Set device (use GPU if available)
    if device is None:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    model.to(device)

    # Set up DataLoader for validation dataset, using the custom collator
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collator)

    predictions = []

    # Create a progress bar based on the number of batches
    progress_bar = tqdm(val_dataloader, desc="Predicting", leave=True)

    # No gradients needed during evaluation
    with torch.no_grad():
        for batch in progress_bar:
            # Move tensors to device, ignore `question_id`
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'question_id'}

            # Get model outputs
            outputs = model(**inputs)

            # Get start and end logits
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Upper-triangular mask: entry (s, e) is True only when s <= e.
            seq_len = start_logits.size(1)
            valid_span = torch.ones(seq_len, seq_len, device=start_logits.device).triu().bool()

            # Iterate over each example in the batch
            for i in range(start_logits.size(0)):
                # Pick the best-scoring (start, end) pair with start <= end.
                # Taking the two argmaxes independently can give end < start,
                # which silently yields an empty answer; when the independent
                # argmaxes are already ordered this selects the same pair.
                span_scores = start_logits[i].unsqueeze(1) + end_logits[i].unsqueeze(0)
                span_scores = span_scores.masked_fill(~valid_span, float('-inf'))
                start_idx, end_idx = divmod(torch.argmax(span_scores).item(), seq_len)

                # Get input IDs (tokens) and convert to words
                input_ids = batch['input_ids'][i]
                tokens = tokenizer.convert_ids_to_tokens(input_ids[start_idx:end_idx+1])

                # Convert tokens back to string
                answer = tokenizer.convert_tokens_to_string(tokens)

                # Retrieve the question ID from the batch (included via the collator)
                question_id = batch['question_id'][i]

                # Save the question ID and predicted answer
                predictions.append({"question_id": question_id, "answer": answer})

    return predictions


In [None]:
# Fine-tune starting from `model_name`; every hyperparameter is spelled out so
# the run configuration is visible in one place.
train_model(
    train_dataset,
    val_dataset,
    model_name=model_name,
    batch_size=8,  # adjust to the available hardware
    epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=0,
    output_dir="model_output",
)



  0%|          | 0/32850 [00:00<?, ?it/s]

Epoch 1/3


In [None]:
def save_predictions(predictions, filename="predictions.json"):
    """Write predictions to ``filename`` as a ``{question_id: answer}`` JSON object.

    ``predictions`` is an iterable of dicts with ``'question_id'`` and
    ``'answer'`` keys; when an id occurs more than once the last answer wins.
    """
    # Re-key the list of records by question id
    formatted_predictions = {}
    for pred in predictions:
        formatted_predictions[pred['question_id']] = pred['answer']

    # Serialise with two-space indentation
    with open(filename, 'w') as f:
        json.dump(formatted_predictions, f, indent=2)

    print(f"Predictions saved to {filename}")


In [None]:
import os

# `train_model` fine-tunes its own copy of the network and only writes it to
# model_output/checkpoint_epoch_<n>; the global `model` is still the checkpoint
# loaded from the hub. Pick up the newest fine-tuned checkpoint when one exists
# so the predictions reflect the training that was just run.
checkpoint_root = "model_output"
saved_epochs = []
if os.path.isdir(checkpoint_root):
    for entry in os.listdir(checkpoint_root):
        prefix, _, suffix = entry.rpartition("_")
        if prefix == "checkpoint_epoch" and suffix.isdigit():
            saved_epochs.append(int(suffix))

if saved_epochs:
    latest_checkpoint = os.path.join(checkpoint_root, f"checkpoint_epoch_{max(saved_epochs)}")
    model = AutoModelForQuestionAnswering.from_pretrained(latest_checkpoint)
    print(f"Loaded fine-tuned checkpoint: {latest_checkpoint}")
else:
    print(f"No fine-tuned checkpoint found in {checkpoint_root}; predicting with the in-memory model")

# Run the prediction process
predictions = predict(model, val_dataset, tokenizer)

# Save the predictions in the required format
save_predictions(predictions, "predictions.json")


In [None]:
import subprocess
import sys

# Run the evaluation script with the interpreter that runs this kernel, so it
# sees the same environment; a bare 'python' may be missing from PATH or point
# at a different installation.
result = subprocess.run(
    [sys.executable, 'evaluate-v2.0.py', 'dev-v1.1.json', 'predictions.json'],
    capture_output=True,
    text=True,
)

# Print the output
print(result.stdout)

# Surface failures explicitly: a non-zero exit status means the metrics above
# (if any) cannot be trusted.
if result.returncode != 0:
    print(f"Evaluation script exited with status {result.returncode}")

# If there were any errors, print them
if result.stderr:
    print("Errors:")
    print(result.stderr)