In [58]:
import requests
import json
import torch
import torch.nn as nn
import os
from tqdm import tqdm
import transformers
from evaluate import load
from transformers import BertTokenizerFast, AdamW, BertForQuestionAnswering, get_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ExponentialLR
import matplotlib.pyplot as plt
import numpy as np

In [59]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [60]:
BASE_PATH = "/scratch/gpuligu/Spoken-SQuAD-master"

spoken_train = "spoken_train-v1.1.json"
spoken_test = "spoken_test-v1.1.json"
spoken_test_WER44 = "spoken_test-v1.1_WER44.json"
spoken_test_WER54 = "spoken_test-v1.1_WER54.json"

In [61]:
def read_json_data(path):
    contexts = []
    questions = []
    answers = []
    
    with open(path, 'rb') as file:
        raw_data = json.load(file)

    for group in raw_data['data']:
        for paragraph in group['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context.lower())
                    questions.append(question.lower())
                    answers.append(answer)

    return contexts, questions, answers


In [62]:
train_data_path = os.path.join(BASE_PATH, spoken_train) 
test_data_path = os.path.join(BASE_PATH, spoken_test)
test_data_path_WER44 = os.path.join(BASE_PATH, spoken_test_WER44)
test_data_path_WER54 = os.path.join(BASE_PATH, spoken_test_WER54)


# loading training data
train_contexts, train_questions, train_answers = read_json_data(train_data_path)
print(f"Training Data: {train_questions[0]}, {train_answers[0]}")

# loading testing data
valid_contexts, valid_questions, valid_answers = read_json_data(test_data_path)
print(f"Testing Data: {valid_questions[0]}, {valid_answers[0]}")

# loading testing WER 44 data
valid_contexts_44, valid_questions_44, valid_answers_44 = read_json_data(test_data_path_WER44)
print(f"Testing Data_44: {valid_questions_44[0]}, {valid_answers_44[0]}")

# loading testing WER 54 data
valid_contexts_54, valid_questions_54, valid_answers_54 = read_json_data(test_data_path_WER54)
print(f"Testing Data_54: {valid_questions_54[0]}, {valid_answers_54[0]}")

Training Data: what is in front of the notre dame main building?, {'answer_start': 187, 'text': 'a copper statue of christ'}
Testing Data: which nfl team represented the afc at super bowl 50?, {'answer_start': 190, 'text': 'denver broncos'}
Testing Data_44: which nfl team represented the afc at super bowl 50?, {'answer_start': 177, 'text': 'Denver Broncos'}
Testing Data_54: which nfl team represented the afc at super bowl 50?, {'answer_start': 177, 'text': 'Denver Broncos'}


In [63]:
def add_answer_end_index(answers, contexts):
    for answer, context in zip(answers, contexts):
        expected_answer = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(expected_answer)

        if context[start_idx:end_idx] == expected_answer:
            answer['answer_end'] = end_idx
        else:
            for offset in [1, 2]:
                if context[start_idx - offset:end_idx - offset] == expected_answer:
                    answer['answer_start'] = start_idx - offset
                    answer['answer_end'] = end_idx - offset
                    break


In [64]:
add_answer_end_index(train_answers, train_contexts)
add_answer_end_index(valid_answers, valid_contexts)
add_answer_end_index(valid_answers_44, valid_contexts_44)
add_answer_end_index(valid_answers_44, valid_contexts_44)

In [65]:
train_contexts_trunc = []
MAX_LENGTH = 512

for i in range(len(train_contexts)):
    if len(train_contexts[i]) > MAX_LENGTH:
        answer_start = train_answers[i]['answer_start']
        answer_end = train_answers[i]['answer_start'] + len(train_answers[i]['text'])
        mid = (answer_start + answer_end) // 2
        para_start = max(0, min(mid - MAX_LENGTH // 2, len(train_contexts[i]) - MAX_LENGTH))
        para_end = para_start + MAX_LENGTH
        train_contexts_trunc.append(train_contexts[i][para_start:para_end])
        train_answers[i]['answer_start'] = max(0, ((MAX_LENGTH // 2) - len(train_answers[i]['text'])))

    else:
        train_contexts_trunc.append(train_contexts[i])

In [66]:
MAX_LENGTH = 512
MODEL = "bert-base-uncased"
doc_stride = 128

# initialize the tokenizer
tokenizerFast = BertTokenizerFast.from_pretrained(MODEL)

# tokenize
train_encodings = tokenizerFast(train_questions, train_contexts, max_length=MAX_LENGTH, truncation=True, padding=True, stride=doc_stride)
valid_encodings = tokenizerFast(valid_questions,valid_contexts, max_length=MAX_LENGTH, truncation=True, padding=True, stride=doc_stride)
valid_encodings_44 = tokenizerFast(valid_questions_44,valid_contexts_44, max_length=MAX_LENGTH, truncation=True, padding=True, stride=doc_stride)
valid_encodings_54 = tokenizerFast(valid_questions_54,valid_contexts_54, max_length=MAX_LENGTH, truncation=True, padding=True, stride=doc_stride)

In [67]:
def find_answer_positions(encodings, answers, tokenizer):
    start_positions = []
    end_positions = []

    for idx in range(len(encodings['input_ids'])):
        answer_text = answers[idx]['text']
        answer_encoding = tokenizer(answer_text, max_length=MAX_LENGTH, truncation=True, padding=True)

        found_start = False
        for a in range(len(encodings['input_ids'][idx]) - len(answer_encoding['input_ids'])):
            match = True
            for i in range(1, len(answer_encoding['input_ids'])-1):
                if answer_encoding['input_ids'][i] != encodings['input_ids'][idx][a + i]:
                    match = False
                    break
            if match:
                start_positions.append(a)
                end_positions.append(a + len(answer_encoding['input_ids']) - 1)
                found_start = True
                break
        if not found_start:
            start_positions.append(0)
            end_positions.append(0)

    return start_positions, end_positions

In [68]:
# Update train encodings with start and end positions
train_start_positions, train_end_positions = find_answer_positions(train_encodings, train_answers, tokenizerFast)
train_encodings.update({'start_positions': train_start_positions, 'end_positions': train_end_positions})

# Update test encodings with start and end positions
valid_start_positions, valid_end_positions = find_answer_positions(valid_encodings, valid_answers, tokenizerFast)
valid_encodings.update({'start_positions': valid_start_positions, 'end_positions': valid_end_positions})

# Update test 44 encodings with start and end positions
valid_start_positions_44, valid_end_positions_44 = find_answer_positions(valid_encodings_44, valid_answers_44, tokenizerFast)
valid_encodings_44.update({'start_positions': valid_start_positions_44, 'end_positions': valid_end_positions_44})

# Update test 54 encodings with start and end positions
valid_start_positions_54, valid_end_positions_54 = find_answer_positions(valid_encodings_54, valid_answers_54, tokenizerFast)
valid_encodings_54.update({'start_positions': valid_start_positions_54, 'end_positions': valid_end_positions_54})

In [69]:
class SquadDataset(Dataset):
    def __init__(self, encodings):
        self.input_ids = torch.tensor(encodings['input_ids'])
        self.token_type_ids = torch.tensor(encodings['token_type_ids'])
        self.attention_mask = torch.tensor(encodings['attention_mask'])
        self.start_positions = torch.tensor(encodings['start_positions'])
        self.end_positions = torch.tensor(encodings['end_positions'])
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'token_type_ids': self.token_type_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'start_positions': self.start_positions[idx],
            'end_positions': self.end_positions[idx]
        }
    
    def __len__(self):
        return len(self.input_ids)


In [70]:
# build datasets
train_dataset = SquadDataset(train_encodings)
valid_dataset = SquadDataset(valid_encodings)
valid_dataset_44 = SquadDataset(valid_encodings_44)
valid_dataset_54 = SquadDataset(valid_encodings_54)

In [71]:
# Load the pre-trained model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
print(model)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [72]:
def train_model(model, train_dataloader, num_epochs=1):
    model.to(device)
    model.train()
    
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=2e-2)
    training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=training_steps,
    )
    
    loss_arr = []
    acc_arr = []
    
    for epoch in range(num_epochs):
        losses = []
        accuracies = []
        with tqdm(train_dataloader, desc=f'Epoch {epoch+1}') as t:
            
            for batch in t:
                optimizer.zero_grad()
                
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                
                outputs = model(input_ids, attention_mask=attention_mask,
                                start_positions=start_positions,
                                token_type_ids=token_type_ids,
                                end_positions=end_positions)
                
                loss = outputs[0]
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                
                # Calculate accuracy
                start_logits, end_logits = outputs[1], outputs[2]
                start_pred = start_logits.argmax(dim=1)
                end_pred = end_logits.argmax(dim=1)
                acc = ((start_pred == start_positions).float().mean() + 
                       (end_pred == end_positions).float().mean()) / 2
                
                losses.append(loss.item())
                accuracies.append(acc.item())
                
                t.set_postfix_str(f'Loss: {loss.item():.4f}, Acc: {acc.item():.4f}')
            
        loss_arr.append(np.mean(losses))
        acc_arr.append(np.mean(accuracies))
    
    return loss_arr, acc_arr

In [73]:
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
loss_arr, acc_arr = train_model(model, train_data_loader, 2)

Epoch 1:   2%|▏         | 55/2320 [00:26<18:10,  2.08it/s, Loss: 4.1377, Acc: 0.0312]


KeyboardInterrupt: 

In [44]:
from collections import Counter
def calculate_f1_score(predictions, references):
    f1_scores = []
    for pred, ref in zip(predictions, references):
        common = Counter(pred) & Counter(ref)
        num_same = sum(common.values())
        if num_same == 0:
            f1_scores.append(0)
            continue
        precision = num_same / len(pred)
        recall = num_same / len(ref)
        f1 = (2 * precision * recall) / (precision + recall)
        f1_scores.append(f1)
    avg_f1_score = sum(f1_scores) / len(f1_scores)
    return avg_f1_score



In [57]:
max_answer_length=30

def evaluate_model(model, test_dataloader):
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            token_type_ids=token_type_ids,
                            end_positions=end_positions)

            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            start_pred = torch.argmax(start_logits, dim=1)
            end_pred = torch.argmax(end_logits, dim=1)

            for i in range(len(start_pred)):
                start = start_pred[i].item()
                end = end_pred[i].item()
                
                # Skip answers where start index > end index or length is greater than max_answer_length
                if start > end or end - start + 1 > max_answer_length:
                    predicted_answer = ""
                else:
                    predicted_answer = tokenizerFast.decode(batch['input_ids'][i][start:end+1], skip_special_tokens=True)
                
                predictions.append(predicted_answer)

                reference_start = batch['start_positions'][i].item()
                reference_end = batch['end_positions'][i].item()
                reference_answer = tokenizerFast.decode(batch['input_ids'][i][reference_start:reference_end+1], skip_special_tokens=True)
                references.append(reference_answer)
            
    avg_f1_score = calculate_f1_score(predictions, references)
    return avg_f1_score



valid_data_loader = DataLoader(valid_dataset, batch_size=16)
valid_data_loader_44 = DataLoader(valid_dataset_44, batch_size=16)
valid_data_loader_54 = DataLoader(valid_dataset_54, batch_size=16)

f1_score = evaluate_model(model, valid_data_loader)
print(f"F1 Score on Test Data: {f1_score}")

f1_score = evaluate_model(model, valid_data_loader_44)
print(f"F1 Score on Test Data 44: {f1_score}")

f1_score = evaluate_model(model, valid_data_loader_54)
print(f"F1 Score on Test Data 54: {f1_score}")


F1 Score on Test Data: 0.7542423885356532
F1 Score on Test Data 44: 0.42468614972543284
F1 Score on Test Data 54: 0.3041936139479789
