In [1]:
!pip install torch torchvision torchtext




In [3]:
!pip install portalocker>=2.0.0


In [5]:
import portalocker
print(portalocker.__version__)  # This should print the version number if all is well


2.8.2


In [13]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
!pip install torch transformers



In [None]:
def extract_qa_pairs(squad_data, num_samples=30):
    qna_pairs = []
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                if qa['is_impossible']:
                    answer = 'No answer'
                else:
                    answers = qa['answers']
                    answer = answers[0]['text'] if answers else 'No answer'
                qna_pairs.append({'question': question, 'answer': answer, 'context': context})
                if len(qna_pairs) >= num_samples:
                    return qna_pairs
    return qna_pairs


In [41]:
import json
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Assuming the SQuAD dataset JSON file is stored locally in the same directory as this script
def load_squad_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        squad_json = json.load(file)
    return squad_json

def extract_qa_pairs(squad_data):
    qna_pairs = []
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                if qa['is_impossible']:
                    answer = 'No answer'
                else:
                    answers = qa['answers']
                    answer = answers[0]['text'] if answers else 'No answer'
                qna_pairs.append({'question': question, 'answer': answer, 'context': context})
    return qna_pairs


# Path to the downloaded SQuAD 2.0 dataset
file_path = '/content/drive/MyDrive/DL_HW10/train-v2.0.json'
squad_data = load_squad_data(file_path)
qna_pairs = extract_qa_pairs(squad_data)




In [42]:
# Display extracted Q&A pairs
i=0
for i, qa in enumerate(qna_pairs):
  if i< 5:
    print(f"Q&A Pair {i+1}:")
    print("Question:", qa['question'])
    print("Answer:", qa['answer'])
    print("Context:", qa['context'])
    print("-" * 80)
    i=i+1
  else:
    break

Q&A Pair 1:
Question: When did Beyonce start becoming popular?
Answer: in the late 1990s
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
--------------------------------------------------------------------------------
Q&A Pair 2:
Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing
Context: Beyoncé 

In [43]:
from transformers import BertTokenizer

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_data_for_bert(qna_pairs):
    processed_data = []
    for qa in qna_pairs:
        question, text = qa['question'], qa['context']
        # Encode the question and text as input ids and attention masks
        # The encode_plus method manages tokenization, adding special tokens, and creating attention masks
        encoded_dict = tokenizer.encode_plus(
            text=question,                # Sentence to encode.
            text_pair=text,               # Second sentence to encode.
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=512,               # Pad & truncate all sentences.
            pad_to_max_length=True,       # Pad to max_length or not.
            return_attention_mask=True,   # Return attention mask.
            return_tensors='pt',          # Return pytorch tensors.
        )

        # Extract outputs for input to BERT
        input_ids = encoded_dict['input_ids']
        attention_masks = encoded_dict['attention_mask']
        token_type_ids = encoded_dict['token_type_ids']

        # Extract start and end positions of answer in the context
        start_positions = tokenizer.encode(qa['answer'], add_special_tokens=False)
        start_position = text.find(qa['answer'])
        end_position = start_position + len(start_positions) - 1

        processed_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_masks,
            'token_type_ids': token_type_ids,
            'start_position': start_position,
            'end_position': end_position
        })
    return processed_data

# Example usage
processed_qna_pairs = preprocess_data_for_bert(qna_pairs)
for data in processed_qna_pairs[:3]:  # Display the first 3 processed entries
    print("Input IDs:", data['input_ids'])
    print("Attention Masks:", data['attention_mask'])
    print("Token Type IDs:", data['token_type_ids'])
    print("Start Position:", data['start_position'])
    print("End Position:", data['end_position'])
    print("-" * 50)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will alwa

Input IDs: tensor([[  101,  2043,  2106, 20773,  2707,  3352,  2759,  1029,   102, 20773,
         21025, 19358, 22815,  1011,  5708,  1006,  1013, 12170, 23432, 29715,
          3501, 29678, 12325, 29685,  1013, 10506,  1011, 10930,  2078,  1011,
          2360,  1007,  1006,  2141,  2244,  1018,  1010,  3261,  1007,  2003,
          2019,  2137,  3220,  1010,  6009,  1010,  2501,  3135,  1998,  3883,
          1012,  2141,  1998,  2992,  1999,  5395,  1010,  3146,  1010,  2016,
          2864,  1999,  2536,  4823,  1998,  5613,  6479,  2004,  1037,  2775,
          1010,  1998,  3123,  2000,  4476,  1999,  1996,  2397,  4134,  2004,
          2599,  3220,  1997,  1054,  1004,  1038,  2611,  1011,  2177, 10461,
          1005,  1055,  2775,  1012,  3266,  2011,  2014,  2269,  1010, 25436,
         22815,  1010,  1996,  2177,  2150,  2028,  1997,  1996,  2088,  1005,
          1055,  2190,  1011,  4855,  2611,  2967,  1997,  2035,  2051,  1012,
          2037, 14221,  2387,  1996,  271

In [44]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Assuming processed_qna_pairs is your dataset already processed
train_size = int(0.9 * len(processed_qna_pairs))
train_dataset = processed_qna_pairs[:train_size]
val_dataset = processed_qna_pairs[train_size:]

# Convert to PyTorch DataLoaders
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=8)


In [45]:
from transformers import BertForQuestionAnswering, AdamW

model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

num_epochs = 8
# Setup the optimizer and the learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
total_steps = len(train_dataloader) * num_epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

# Setup the training loop
training_stats = []

for epoch_i in range(0, num_epochs):
    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_epochs))
    print('Training...')

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch['input_ids'].squeeze(1).to(device)
        b_attention_mask = batch['attention_mask'].squeeze(1).to(device)
        b_token_type_ids = batch['token_type_ids'].squeeze(1).to(device)
        b_start_positions = batch['start_position'].to(device)
        b_end_positions = batch['end_position'].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=b_token_type_ids,
                        attention_mask=b_attention_mask,
                        start_positions=b_start_positions,
                        end_positions=b_end_positions)

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))



Training...


In [36]:
# ========================================
#               Evaluation
# ========================================
print("")
print("Running Validation...")

model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in validation_dataloader:
    b_input_ids = batch['input_ids'].squeeze(1).to(device)
    b_attention_mask = batch['attention_mask'].squeeze(1).to(device)
    b_token_type_ids = batch['token_type_ids'].squeeze(1).to(device)
    b_start_positions = batch['start_position'].to(device)
    b_end_positions = batch['end_position'].to(device)

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=b_token_type_ids,
                        attention_mask=b_attention_mask,
                        start_positions=b_start_positions,
                        end_positions=b_end_positions)

    loss = outputs.loss
    eval_loss += loss.item()

    logits = outputs.start_logits, outputs.end_logits
    start_logits, end_logits = logits
    start_preds = torch.argmax(start_logits, dim=-1)
    end_preds = torch.argmax(end_logits, dim=-1)

    # Calculate accuracy for this batch
    correct = (start_preds == b_start_positions) & (end_preds == b_end_positions)
    accuracy = correct.cpu().numpy().mean() * 100
    eval_accuracy += accuracy

    nb_eval_steps += 1

print("  Validation Loss: {0:.2f}".format(eval_loss/nb_eval_steps))
print("  Validation Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))



Running Validation...
  Validation Loss: 9.78
  Validation Accuracy: 0.00


In [40]:
import numpy as np

# Pick a sample from the validation set
sample = next(iter(validation_dataloader))
sample_batch = {k: v.to(device) for k, v in sample.items() if k in ['input_ids', 'attention_mask', 'token_type_ids']}

with torch.no_grad():
    output = model(input_ids=sample_batch['input_ids'].squeeze(1),
                   attention_mask=sample_batch['attention_mask'].squeeze(1),
                   token_type_ids=sample_batch['token_type_ids'].squeeze(1))

start_logits = output.start_logits
end_logits = output.end_logits

# Decode the predicted and actual answers
for i in range(sample_batch['input_ids'].size(0)):
    start_pred = torch.argmax(start_logits[i]).item()
    end_pred = torch.argmax(end_logits[i]).item()
    if end_pred < start_pred:
        start_pred, end_pred = end_pred, start_pred  # Ensure start is not greater than end
    pred_answer = tokenizer.decode(sample_batch['input_ids'][i][start_pred:end_pred+1], skip_special_tokens=True)

    actual_start = sample['start_position'][i].item()  # Assuming these are available for comparison
    actual_end = sample['end_position'][i].item()
    if actual_end < actual_start:
        actual_start, actual_end = actual_end, actual_start  # Ensure start is not greater than end
    actual_answer = tokenizer.decode(sample_batch['input_ids'][i][actual_start:actual_end+1], skip_special_tokens=True)

    print(f"Sample {i+1}")
    print("Predicted Answer:", pred_answer)
    print("Actual Answer:", actual_answer)
    print("-" * 100)


Sample 1
Predicted Answer: 
Actual Answer: 
----------------------------------------------------------------------------------------------------
Sample 2
Predicted Answer: 
Actual Answer: 
----------------------------------------------------------------------------------------------------
Sample 3
Predicted Answer: 
Actual Answer: 
----------------------------------------------------------------------------------------------------
