### Variables

In [1]:
# Notebook-wide configuration.
# Hugging Face checkpoint id; the tokenizer is loaded from it further down.
model_checkpoint = "google/electra-base-discriminator"
# Folder name of the locally saved fine-tuned copy of the checkpoint.
model_name = model_checkpoint + " finetuned"
# Machine-specific absolute path the model weights are loaded from.
model_path = r"C:\Users\tanch\Documents\GitHub\Covid-19-QA-System\URECA Research\models"+f"\\{model_name}"
# NOTE(review): in the cells shown, batch_size only appears in commented-out TrainingArguments lines.
batch_size = 5
# This flag is the difference between SQUAD v1 or 2 (if you're using another dataset, it indicates if impossible
# answers are allowed or not).
squad_v2 = False

### 1. Load dataset

In [2]:
from datasets import load_dataset, load_metric, load_from_disk
from transformers import AutoTokenizer
import transformers
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
import numpy as np

In [3]:
import json
def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Args:
        path: Location of the JSON file (``str`` or path-like). The file must
            have the layout ``data -> paragraphs -> qas -> answers``.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists with
        one entry per answer. A question with several answers repeats its
        context and question once per answer; a question with no answers
        contributes nothing.
    """
    # Imported locally so the function works on a fresh kernel: no other cell
    # of the notebook brings `Path` into scope.
    from pathlib import Path

    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers
# remove some irrelevant keys - they seem to cause some issues
# convert 'answer_start': 177 into 'answer_start': [177]
# convert 'text': "Singapore Airlines" into 'text': ["Singapore Airlines"]
# convert ids to string - 1 to '1'
def clean_up(example):
    """Reshape one covidQA row into the layout used by the rest of the notebook.

    The row is modified in place and also returned:
      * the metadata keys ``answer_category``, ``answer_id``, ``document_id``
        and ``question_id`` are removed from ``example['answers']``;
      * ``answer_start`` and ``text`` are each wrapped in a one-element list;
      * ``id`` is converted to ``str``.

    Args:
        example: Mapping with an ``id`` and an ``answers`` dict holding scalar
            ``answer_start`` and ``text`` values.

    Returns:
        The same ``example`` object after modification.
    """
    answer = example['answers']
    # pop(..., None) rather than del: SQuAD-style files that lack these
    # optional metadata keys should not abort the whole map() with a KeyError.
    for key in ('answer_category', 'answer_id', 'document_id', 'question_id'):
        answer.pop(key, None)
    answer['answer_start'] = [answer['answer_start']]
    answer['text'] = [answer['text']]
    example['id'] = str(example['id'])
    return example

In [4]:
# load covidQA - singapore covid 19 related question answer pairs
from datasets import Dataset
# Machine-specific absolute path to the SQuAD-formatted covidQA file.
path = r"C:\Users\tanch\Documents\GitHub\Covid-19-QA-System\QA System\SQUAD formatted data\covidQA.json"
# Three parallel lists with one entry per answer found in the file.
train_contexts, train_questions, train_answers = read_squad(path)
# Row ids are simply the positions 0..N-1; clean_up turns them into strings below.
covidQA = Dataset.from_dict({"id":range(len(train_contexts)),
                                      'context': train_contexts,
                                      "question": train_questions,
                                      "answers": train_answers})
# Apply clean_up to every row (drops metadata keys, wraps answer fields in lists).
covidQA = covidQA.map(clean_up)
# Bare expression so the cell displays the dataset summary.
covidQA

<IPython.core.display.Javascript object>

HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))




Dataset({
    features: ['answers', 'context', 'id', 'question'],
    num_rows: 265
})

### 2. Load fine-tuned Model

In [5]:
# Load the fine-tuned question-answering model saved locally under model_path.
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

In [6]:
import torch
import gc

# Release unreferenced Python objects first so their GPU memory can be reclaimed.
gc.collect()

# Use GPU 0 when CUDA is available and fall back to the CPU otherwise, so the
# notebook does not crash on a machine without a GPU.
if torch.cuda.is_available():
    torch.cuda.empty_cache()          # empty the cache so CUDA memory can be reused
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

model.to(device)
model.device                          # current device

device(type='cuda', index=0)

### 3. Evaluate on varying max_length

In [7]:
def prepare_validation_features(examples):
    """Tokenize a batch of examples into fixed-length evaluation features.

    Relies on the module-level ``tokenizer``, ``pad_on_right``, ``max_length``
    and ``doc_stride``. A long context is split into several overlapping
    features, so the returned batch can contain more rows than ``examples``.

    Args:
        examples: Batch with ``"id"``, ``"question"`` and ``"context"`` columns.

    Returns:
        The tokenizer output with two changes: an ``"example_id"`` entry naming
        the source example of every feature, and ``"offset_mapping"`` entries
        set to ``None`` for every token that is not part of the context.
    """
    # Column order and truncation side follow the tokenizer's padding side.
    if pad_on_right:
        first_column, second_column, truncation_mode = "question", "context", "only_second"
    else:
        first_column, second_column, truncation_mode = "context", "question", "only_first"

    # Truncate only the context, keep the overflow as extra features that
    # overlap by `doc_stride` tokens, and pad every feature to `max_length`.
    tokenized_examples = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature index -> index of the example it was cut from
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Sequence id that marks context tokens in `sequence_ids`.
    context_index = 1 if pad_on_right else 0
    feature_count = len(tokenized_examples["input_ids"])

    # Remember which example each feature belongs to.
    tokenized_examples["example_id"] = [
        examples["id"][sample_mapping[feature_idx]] for feature_idx in range(feature_count)
    ]

    for feature_idx in range(feature_count):
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)
        # Blank out offsets of non-context tokens so a later step can tell
        # whether a token position lies inside the context.
        context_only_offsets = []
        for token_idx, offsets in enumerate(tokenized_examples["offset_mapping"][feature_idx]):
            if sequence_ids[token_idx] == context_index:
                context_only_offsets.append(offsets)
            else:
                context_only_offsets.append(None)
        tokenized_examples["offset_mapping"][feature_idx] = context_only_offsets

    return tokenized_examples

In [8]:
from tqdm.auto import tqdm
import collections
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn per-feature start/end logits into one answer string per example.

    Relies on the module-level ``tokenizer`` (for ``cls_token_id``) and the
    ``squad_v2`` flag.

    Args:
        examples: The original dataset; rows provide ``"id"`` and ``"context"``.
        features: The tokenized features; rows provide ``"example_id"``,
            ``"input_ids"`` and ``"offset_mapping"`` (``None`` for non-context
            tokens).
        raw_predictions: Pair ``(all_start_logits, all_end_logits)`` indexed by
            feature.
        n_best_size: Number of top start and top end positions combined per
            feature.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        ``collections.OrderedDict`` mapping example id to predicted answer
        text. When ``squad_v2`` is true the text is ``""`` unless the best span
        scores above the null (CLS) score.
    """
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []

        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # Fetch the feature row once instead of once per field.
            feature = features[feature_index]
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map the positions in our logits to spans of text in the original
            # context.
            offset_mapping = feature["offset_mapping"]

            # Update minimum null prediction: keep the SMALLEST CLS score seen across the example's
            # features (the previous `<` comparison kept the largest one).
            cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first highest-scoring candidate, the same pick as a stable descending sort.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}

        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # min_null_score stays None when the example has no features; comparing against None would raise.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            predictions[example["id"]] = best_answer["text"] if beats_null else ""

    return predictions

In [9]:
# max_length values to sweep: 100, 130, ..., 400 (step 30, upper bound included).
max_lengths_list = range(100,400+1,30)
# Filled by the evaluation loop with one metric dict per swept max_length, in sweep order.
max_lengths_results = []

In [10]:
# Preview the swept values from the single source of truth rather than a
# second copy of the range literal, so the preview cannot drift from the sweep.
print(*max_lengths_list)

100 130 160 190 220 250 280 310 340 370 400


In [11]:

from transformers import default_data_collator

# --- Set-up that does not depend on max_length: built once, outside the sweep ---

# instantiate the tokenizer
# note that different models require different tokenizers
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
pad_on_right = tokenizer.padding_side == "right"

# check that the tokenizer we instantiated is a fast tokenizer because we need its special features
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# The Trainer is only used for prediction here; batch sizes are left at the
# TrainingArguments defaults.
args = TrainingArguments(
    "test-squad",
    learning_rate=2e-5,
    weight_decay=0.01
)

data_collator = default_data_collator

trainer = Trainer(
    model,
    args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

metric = load_metric("squad_v2" if squad_v2 else "squad")
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in covidQA]
max_answer_length = 30

# Start from an empty result list so that re-running this cell does not stack a
# second sweep on top of the first (which would break the results table below).
max_lengths_results.clear()

# --- Sweep: re-tokenize, predict and score once per max_length ---
for max_length in max_lengths_list:
    # Overlap between consecutive windows of a long context: half the window.
    doc_stride = max_length // 2

    # prepare_validation_features reads max_length / doc_stride / tokenizer / pad_on_right
    # from the module namespace, so they must be set before this call.
    validation_features = covidQA.map(
        prepare_validation_features,
        batched=True,
        remove_columns=covidQA.column_names
    )

    raw_predictions = trainer.predict(validation_features)
    # Make every column visible again before post-processing reads example_id / offset_mapping.
    validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))
    final_predictions = postprocess_qa_predictions(
        covidQA,
        validation_features,
        raw_predictions.predictions,
        max_answer_length=max_answer_length,
    )

    if squad_v2:
        formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]

    EM_f1 = metric.compute(predictions=formatted_predictions, references=references)
    print(max_length, EM_f1)
    max_lengths_results.append(EM_f1)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 7290 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


100 {'exact_match': 44.15094339622642, 'f1': 53.548015995759705}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 5072 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


130 {'exact_match': 43.77358490566038, 'f1': 55.55124080117947}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 3884 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


160 {'exact_match': 46.0377358490566, 'f1': 56.28710688808826}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 3135 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


190 {'exact_match': 44.528301886792455, 'f1': 57.66610619427552}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 2617 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


220 {'exact_match': 44.15094339622642, 'f1': 55.34739478108319}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 2257 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


250 {'exact_match': 47.924528301886795, 'f1': 59.251209655465175}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 1960 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


280 {'exact_match': 48.301886792452834, 'f1': 61.30299976853839}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 1738 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


310 {'exact_match': 49.81132075471698, 'f1': 61.93029076324671}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 1541 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


340 {'exact_match': 49.81132075471698, 'f1': 61.769702875129916}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 1402 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


370 {'exact_match': 49.056603773584904, 'f1': 61.58015556840471}


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 265 example predictions split into 1299 features.


HBox(children=(FloatProgress(value=0.0, max=265.0), HTML(value='')))


400 {'exact_match': 47.924528301886795, 'f1': 60.88761305812137}


In [12]:
import os

import pandas as pd

# Machine-specific absolute path of the CSV that accumulates results across models.
path = r"C:\Users\tanch\Documents\GitHub\Covid-19-QA-System\URECA Research\RAM\max_length performance on covidQA.csv"

# One result per swept max_length is required; anything else means the sweep
# was interrupted or run more than once without resetting the result list.
if len(max_lengths_results) != len(max_lengths_list):
    raise ValueError(
        f"expected {len(max_lengths_list)} results (one per max_length), "
        f"got {len(max_lengths_results)}"
    )

output_df = pd.DataFrame({"model": [model_checkpoint] * len(max_lengths_list),
                          "max_length": max_lengths_list,
                          "EM": [result['exact_match'] for result in max_lengths_results],
                          "f1": [result['f1'] for result in max_lengths_results]})

if os.path.exists(path):
    temp_df = pd.read_csv(path)
    # Drop stray index columns ("Unnamed: 0") left behind by an earlier save that wrote the index.
    temp_df = temp_df.loc[:, ~temp_df.columns.str.startswith("Unnamed")]
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    temp_df = pd.concat([temp_df, output_df], ignore_index=True)
else:
    # First run: there is no previous table to extend.
    temp_df = output_df

# Re-running the cell replaces a model's earlier rows instead of duplicating them.
temp_df = temp_df.drop_duplicates(subset=["model", "max_length"], keep="last").reset_index(drop=True)
temp_df.to_csv(path, index=False)
temp_df

Unnamed: 0,model,max_length,EM,f1
0,bert-base-uncased,100,40.377358,48.378431
1,bert-base-uncased,130,40.754717,50.033348
2,bert-base-uncased,160,39.622642,50.045857
3,bert-base-uncased,190,38.113208,48.354075
4,bert-base-uncased,220,40.0,50.1846
5,bert-base-uncased,250,40.377358,51.687932
6,bert-base-uncased,280,37.735849,48.400679
7,bert-base-uncased,310,40.0,50.82078
8,bert-base-uncased,340,38.490566,48.13585
9,bert-base-uncased,370,40.0,50.068434
