In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, default_data_collator, get_scheduler
from datasets import load_dataset
from accelerate import Accelerator, notebook_launcher
from huggingface_hub import Repository, get_full_repo_name
import evaluate
from tqdm.auto import tqdm
import numpy as np
import collections
import json

In [2]:
# Spoken-SQuAD source files (names follow the SQuAD v1.1 convention).
# The WER44 / WER54 files are the two noisier versions of the test set that
# are evaluated separately further down.
(
    spoken_train,
    spoken_test,
    spoken_test_WER44,
    spoken_test_WER54,
) = (
    'spoken_train-v1.1.json',
    'spoken_test-v1.1.json',
    'spoken_test-v1.1_WER44.json',
    'spoken_test-v1.1_WER54.json',
)

# Reformat the json data 
def reformat_json(json_file):
    """Flatten a SQuAD-style JSON file into one record per question.

    The input is read as
    ``{"data": [{"title": ..., "paragraphs": [{"context": ..., "qas": [...]}]}]}``.
    Every entry of every ``qas`` list becomes one flat record with the keys
    ``id``, ``title``, ``context``, ``question`` and ``answers``, where
    ``answers`` is ``{"answer_start": [...], "text": [...]}``. Title, context
    and question are whitespace-stripped. The records are written as
    ``{"data": [...]}``.

    Args:
        json_file: Path of the source JSON file (may include directories).

    Returns:
        Path of the written file: the input's file name prefixed with
        ``out_``, placed in the same directory as the input.
    """
    # Local import: the notebook's import cell does not bring in ``os``.
    import os

    with open(json_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    examples = []
    # iterate over json 'data' list
    for elem in json_data['data']:
        title = elem['title']
        # iterate over paragraphs
        for paragraph in elem['paragraphs']:
            context = paragraph['context']
            # iterate over question-answers for this paragraph
            for qa in paragraph['qas']:
                examples.append({
                    'id': qa['id'],
                    'title': title.strip(),
                    'context': context.strip(),
                    'question': qa['question'].strip(),
                    'answers': {
                        'answer_start': [answer['answer_start'] for answer in qa['answers']],
                        'text': [answer['text'] for answer in qa['answers']],
                    },
                })

    # Prefix only the file name. Prefixing the whole path would turn
    # "dir/file.json" into "out_dir/file.json", which points at a directory
    # that does not exist.
    directory, file_name = os.path.split(json_file)
    output_json_file = os.path.join(directory, 'out_' + file_name)
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump({'data': examples}, f)
    return output_json_file


print("Loading spoken squad datasets...")
# Flatten every raw file in turn; each name is rebound to the path of its
# flattened copy as returned by reformat_json.
spoken_train, spoken_test, spoken_test_WER44, spoken_test_WER54 = (
    reformat_json(raw_file)
    for raw_file in (spoken_train, spoken_test, spoken_test_WER44, spoken_test_WER54)
)

# One named split per flattened file. field='data' points the JSON loader at
# the top-level "data" key that reformat_json writes.
spoken_squad_dataset = load_dataset(
    'json',
    field='data',
    data_files={
        'train': spoken_train,
        'validation': spoken_test,
        'test_WER44': spoken_test_WER44,
        'test_WER54': spoken_test_WER54,
    },
)

Loading spoken squad datasets...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test_WER44 split: 0 examples [00:00, ? examples/s]

Generating test_WER54 split: 0 examples [00:00, ? examples/s]

In [3]:
#Use the model bert-base-uncased from huggingface.co
model_name = "bert-base-uncased"
print("Initializing Model and Tokenizer...")

# Question-answering model for this checkpoint. The recorded run reports that
# the qa_outputs weights are newly initialized, i.e. they still need to be
# fine-tuned before predictions are meaningful.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Dump the module tree for inspection.
print(model)

Initializing Model and Tokenizer...


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [4]:
# Tokenizer windowing settings shared by the training and evaluation
# preprocessing: max_length is passed as the tokenizer's max_length and stride
# as its stride.
max_length, stride = 384, 64

# Preprocessing the training examples:
#   (1) tokenize examples into question-context token sequences of the form
#       [CLS] question [SEP] context [SEP] ...
#   (2) apply windowing with the given stride
#   (3) compute output labels (start_index, end_index)
#   (4) if the answer is not fully within the windowed context, set the label
#       to (0, 0)

def _answer_token_span(offsets, sequence_ids, answer):
    """Return (start_token, end_token) of the first answer inside one window, or (0, 0)."""
    # No annotated answer: nothing to locate, use the (0, 0) label.
    if not answer['answer_start'] or not answer['text']:
        return 0, 0

    # Only the first annotated answer is used as the training target.
    start_char = answer['answer_start'][0]
    end_char = start_char + len(answer['text'][0])

    # Find the first and last token of the context (sequence id 1). The scans
    # are bounded so a window without context tokens, or one whose context runs
    # to the very last position, cannot index past the end.
    n_tokens = len(sequence_ids)
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        return 0, 0
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # if answer not fully inside context, label is (0, 0)
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer's first character.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token (scanning from the right) that ends at or after the
    # answer's last character.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_training_examples(examples):
    """Tokenize a batch of training examples and label every window with its answer span.

    Questions are stripped and paired with their contexts. Only the context is
    truncated; overflowing context is emitted as additional windows that
    overlap according to ``stride``, and every window is padded to
    ``max_length``.

    Args:
        examples: Batch of columns containing ``'question'``, ``'context'`` and
            ``'answers'``; each answers entry holds ``'answer_start'`` and
            ``'text'`` lists.

    Returns:
        The tokenizer output with ``'offset_mapping'`` and
        ``'overflow_to_sample_mapping'`` removed and ``'start_positions'`` /
        ``'end_positions'`` added (one entry per window). A window that does
        not fully contain the first answer, or whose example has no answer,
        is labelled (0, 0).
    """
    #tokenize question-context
    questions = [question.strip() for question in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    offset_mapping = inputs.pop('offset_mapping')
    sample_map = inputs.pop('overflow_to_sample_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # sample_map[i] is the index of the example this window was cut from.
        answer = answers[sample_map[i]]
        start_token, end_token = _answer_token_span(offset, inputs.sequence_ids(i), answer)
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs


print("Preprocessing training data...")
# Batched map over the training split. The original per-example columns are
# dropped so that only the tokenizer outputs and the span labels remain.
train_dataset = spoken_squad_dataset['train'].map(
    function=preprocess_training_examples,
    remove_columns=spoken_squad_dataset['train'].column_names,
    batched=True,
)

Preprocessing training data...


Map:   0%|          | 0/37111 [00:00<?, ? examples/s]

In [5]:
# function to preprocess validation/test examples (performs tokenization, windowing)
def process_validation_examples(examples):
    """Tokenize a batch of evaluation examples into overlapping windows.

    Uses the same tokenizer settings as the training preprocessing but keeps
    the information needed to turn logits back into text.

    Args:
        examples: Batch of columns containing ``'id'``, ``'question'`` and
            ``'context'``.

    Returns:
        The tokenizer output without ``'overflow_to_sample_mapping'``, with an
        added ``'example_id'`` list (id of the example each window was cut
        from) and with ``'offset_mapping'`` entries set to ``None`` for every
        token that is not part of the context.
    """
    stripped_questions = [text.strip() for text in examples['question']]
    encoded = tokenizer(
        stripped_questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    window_to_example = encoded.pop('overflow_to_sample_mapping')
    owner_ids = []

    window_count = len(encoded['input_ids'])
    for window_idx in range(window_count):
        # Remember which example this window belongs to.
        owner_ids.append(examples["id"][window_to_example[window_idx]])

        # Blank out offsets of question / special / padding tokens so that
        # only context tokens (sequence id 1) keep a character span.
        seq_ids = encoded.sequence_ids(window_idx)
        encoded["offset_mapping"][window_idx] = [
            None if seq_ids[token_idx] != 1 else span
            for token_idx, span in enumerate(encoded['offset_mapping'][window_idx])
        ]

    encoded['example_id'] = owner_ids
    return encoded


In [6]:
#Preprocess the three evaluation data sets

def _window_eval_split(split_name, banner):
    """Print `banner`, then tokenize/window one split of spoken_squad_dataset for evaluation."""
    print(banner)
    split = spoken_squad_dataset[split_name]
    return split.map(
        process_validation_examples,
        batched=True,
        remove_columns=split.column_names,
    )


validation_dataset = _window_eval_split(
    'validation', "Preprocessing test data (NO NOISE: 22.73% WER)..."
)
test_WER44_dataset = _window_eval_split(
    'test_WER44', "Preprocessing V1 noise test data (44.22% WER)..."
)
test_WER54_dataset = _window_eval_split(
    'test_WER54', "Preprocessing V2 noise test data (54.82% WER)..."
)

Preprocessing test data (NO NOISE: 22.73% WER)...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing V1 noise test data (44.22% WER)...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing V2 noise test data (54.82% WER)...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

In [7]:
#Define a compute_metric() function 
# Scorer used by compute_metrics (loaded under the name "squad").
metric = evaluate.load("squad")

# Post-processing limits read by compute_metrics:
#   n_best            - how many top start / end logits are combined per window
#   max_answer_length - longest accepted answer, counted in tokens
n_best, max_answer_length = 10, 32

def compute_metrics(start_logits, end_logits, features, examples):
    """Turn per-window logits into one text answer per example and score them.

    Args:
        start_logits: Start logits, one row per entry of ``features``.
        end_logits: End logits, one row per entry of ``features``.
        features: Windowed features; each provides ``"example_id"`` and an
            ``"offset_mapping"`` whose non-context positions are ``None``.
        examples: Original examples providing ``"id"``, ``"context"`` and
            ``"answers"``.

    Returns:
        Whatever ``metric.compute`` returns for the predictions and references
        (the notebook reads ``'exact_match'`` and ``'f1'`` from it).
    """
    # example id -> positions in `features` of the windows cut from it
    windows_by_example = {}
    for position, window in enumerate(features):
        windows_by_example.setdefault(window["example_id"], []).append(position)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []  # (score, text) pairs in discovery order

        # gather candidate spans from every window of this example
        for position in windows_by_example.get(example_id, ()):
            window_start = start_logits[position]
            window_end = end_logits[position]
            offsets = features[position]["offset_mapping"]

            # indices of the n_best highest logits, best first
            top_starts = np.argsort(window_start)[::-1][:n_best].tolist()
            top_ends = np.argsort(window_end)[::-1][:n_best].tolist()
            for start_index in top_starts:
                # a start outside the context can never yield a valid span
                if offsets[start_index] is None:
                    continue
                for end_index in top_ends:
                    if offsets[end_index] is None:
                        continue
                    span_tokens = end_index - start_index + 1
                    # keep only forward spans of at most max_answer_length tokens
                    if start_index <= end_index and span_tokens <= max_answer_length:
                        text = context[offsets[start_index][0] : offsets[end_index][1]]
                        score = window_start[start_index] + window_end[end_index]
                        candidates.append((score, text))

        # highest summed logit wins; no candidate at all means an empty prediction
        if candidates:
            prediction_text = max(candidates, key=lambda pair: pair[0])[1]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    reference_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    #return 'exact matches' and 'f1' score
    return metric.compute(predictions=predicted_answers, references=reference_answers)

In [8]:
def _model_input_view(windowed_dataset):
    """Return the dataset without the post-processing-only columns, formatted as torch."""
    tensor_view = windowed_dataset.remove_columns(["example_id", "offset_mapping"])
    tensor_view.set_format("torch")
    return tensor_view


def _batched_loader(dataset, shuffle=False):
    """Wrap a dataset in a DataLoader using the default collator and batches of 8."""
    return DataLoader(dataset, shuffle=shuffle, collate_fn=default_data_collator, batch_size=8)


train_dataset.set_format("torch")
# The *_dataset objects keep example_id / offset_mapping for compute_metrics;
# the *_set views below are what is actually fed to the model.
validation_set = _model_input_view(validation_dataset)
test_WER44_set = _model_input_view(test_WER44_dataset)
test_WER54_set = _model_input_view(test_WER54_dataset)

print("Creating dataloader for all datasets...")
# Only the training loader shuffles; evaluation order must stay aligned with
# the feature order used when scoring.
train_dataloader = _batched_loader(train_dataset, shuffle=True)
eval_dataloader = _batched_loader(validation_set)
test_WER44_dataloader = _batched_loader(test_WER44_set)
test_WER54_dataloader = _batched_loader(test_WER54_set)
print("Dataloader creatred...")


Creating dataloader for all datasets...
Dataloader creatred...


In [9]:
### Function to evaluate the model accuracy on a given dataset 
def evaluate_model(model, dataloader, dataset, dataset_before_preprocessing, accelerator=None):
    """Run the model over a dataloader and score its answers with compute_metrics.

    Args:
        model: Question-answering model; called as ``model(**batch)`` and
            expected to return ``start_logits`` and ``end_logits``.
        dataloader: Yields the model-input batches for ``dataset``.
        dataset: Windowed features (with ``example_id`` / ``offset_mapping``)
            in the same order as the dataloader's rows.
        dataset_before_preprocessing: The original examples the features were
            built from.
        accelerator: Accelerator that already prepared ``model`` and
            ``dataloader``. When ``None``, a new one is created here and used
            to prepare both.

    Returns:
        The result of ``compute_metrics`` for the collected logits.
    """
    if accelerator is None:
        # fp16 mixed precision needs a CUDA device; without one, run in full
        # precision instead of failing when the Accelerator is constructed.
        precision = 'fp16' if torch.cuda.is_available() else 'no'
        accelerator = Accelerator(mixed_precision=precision)
        model, dataloader = accelerator.prepare(model, dataloader)

    model.eval()
    start_logits = []
    end_logits = []
    for batch in tqdm(dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    # Stack the per-batch arrays and keep exactly one row per feature; any
    # surplus rows (presumably padding added when gathering) are dropped.
    start_logits = np.concatenate(start_logits)[: len(dataset)]
    end_logits = np.concatenate(end_logits)[: len(dataset)]

    return compute_metrics(start_logits, end_logits, dataset, dataset_before_preprocessing)

In [10]:
### Training the model
def train_model(model=model, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, epochs = 1):
    """Fine-tune the model, evaluate after every epoch and save it to "./".

    Args:
        model: Model to fine-tune. Defaults to the notebook's ``model`` as it
            was when this function was defined.
        train_dataloader: Training batches (default bound at definition time).
        eval_dataloader: Validation batches (default bound at definition time).
        epochs: Number of passes over the training data.

    Side effects:
        Prints the validation metrics after each epoch and writes the trained
        model to the current directory with ``save_pretrained``.
    """
    # The accelerator must exist before it is used below. fp16 mixed precision
    # needs a CUDA device, so fall back to full precision without one.
    precision = 'fp16' if torch.cuda.is_available() else 'no'
    accelerator = Accelerator(mixed_precision=precision)

    optimizer = AdamW(model.parameters(), lr = 2e-5)
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

    # Count steps on the prepared dataloader, whose length is the number of
    # batches this process really sees (identical to the raw length when
    # running on a single process).
    training_steps = epochs * len(train_dataloader)

    #Using scheduler to ramp down the learning rate linearly
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0,num_training_steps=training_steps)

    for epoch in range(epochs):
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # evaluate after each epoch, reusing this accelerator so the model and
        # dataloader are not prepared a second time
        accelerator.print("Evaluation...")
        metrics = evaluate_model(model, eval_dataloader, validation_dataset, spoken_squad_dataset['validation'], accelerator)
        accelerator.print(f"epoch {epoch}:", metrics)

    # save the trained model (unwrapped from any accelerator wrapper)
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained("./", save_function=accelerator.save)

In [11]:
### UNCOMMENT TO TRAIN THE MODEL
#print("Fine-tuning the model...")
#train_model()

In [12]:
# Load the fine-tuned weights that train_model saved to the current directory
finetuned_model_name = "./"
finetuned_model = AutoModelForQuestionAnswering.from_pretrained(finetuned_model_name)
print(finetuned_model)

# Score the fine-tuned model on the validation split and on the two noisier
# test splits. No accelerator is passed, so evaluate_model creates its own on
# every call.
print("Evaluating model on Validation dataset...")
test_metrics = evaluate_model(
    model=finetuned_model,
    dataloader=eval_dataloader,
    dataset=validation_dataset,
    dataset_before_preprocessing=spoken_squad_dataset['validation'],
)
print("Evaluating model on V1 Noise dataset...")
test_v1_metrics = evaluate_model(
    model=finetuned_model,
    dataloader=test_WER44_dataloader,
    dataset=test_WER44_dataset,
    dataset_before_preprocessing=spoken_squad_dataset['test_WER44'],
)
print("Evaluating model on V2 Noise dataset...")
test_v2_metrics = evaluate_model(
    model=finetuned_model,
    dataloader=test_WER54_dataloader,
    dataset=test_WER54_dataset,
    dataset_before_preprocessing=spoken_squad_dataset['test_WER54'],
)

# Summary: exact match and F1 per dataset (sep="" keeps the pieces glued
# together exactly as string concatenation would).
print("============= RESULTS =============")
print("Validation dataset    (NO NOISE - WER = 22.73%) - exact match: ", test_metrics['exact_match'],
      ", F1 score: ", test_metrics['f1'], sep="")
print("V1 Noise dataset (WER = 44.22%) - exact match: ", test_v1_metrics['exact_match'],
      ", F1 score: ", test_v1_metrics['f1'], sep="")
print("V2 Noise dataset (WER = 54.82%) - exact match: ", test_v2_metrics['exact_match'],
      ", F1 score: ", test_v2_metrics['f1'], sep="")
print("===================================")



Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

  0%|          | 0/678 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Evaluating model on V1 Noise dataset...


  0%|          | 0/679 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Evaluating model on V2 Noise dataset...


  0%|          | 0/679 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Validation dataset    (NO NOISE - WER = 22.73%) - exact match: 61.166137170622314, F1 score: 71.89843641916406
V1 Noise dataset (WER = 44.22%) - exact match: 38.74042235096244, F1 score: 53.667193882586005
V2 Noise dataset (WER = 54.82%) - exact match: 26.966922070641, F1 score: 40.71879797236333
