In [None]:
# Colab setup: install/upgrade wandb quietly, then authenticate against the Hugging Face Hub and Weights & Biases.
!pip install -qqq -U wandb --progress-bar off
import wandb
from huggingface_hub import login
from google.colab import userdata

# Credentials are read from Colab's `userdata` store instead of being hard-coded in the notebook.
login(userdata.get('HF_TOKEN'))

# The W&B API key is looked up under the name 'wandb'.
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

In [None]:
# Install transformers from the GitHub repository (unpinned), then datasets and evaluate from PyPI.
!pip install -q -U git+https://github.com/huggingface/transformers.git
# Disabled: installing accelerate from source as well.
#!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install datasets evaluate

In [None]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to build the evaluation features.
# Alternative noted by the author: "FacebookAI/roberta-base"
base_model_id = "VMware/roberta-base-mrqa"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

In [None]:
from datasets import load_dataset
import evaluate

# Dataset split to evaluate on; the author's note lists "validation" as the other option.
split = "test"
# `mrqa_eval` is the raw (non-tokenized) evaluation data used throughout the rest of the notebook.
mrqa_eval = load_dataset("enriquesaou/mrqa-squadded-sample", split=split)

In [None]:
# Tokenization window settings, read as globals by preprocess_validation.
max_length = 512  # passed as the tokenizer's `max_length` (inputs are padded/truncated to this size)
stride = 128  # passed as the tokenizer's `stride` when a long context overflows into several features

In [None]:
# source: https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering
def preprocess_validation(examples):
    """Tokenize a batch of question/context pairs into evaluation features.

    A context that does not fit in the window is split into several features
    (overflowing tokens are returned by the tokenizer). Each feature records
    the id of the example it came from under ``example_id``, and its
    ``offset_mapping`` is filtered so that only context tokens keep their
    character offsets; every other position becomes ``None``.

    Relies on the module-level ``tokenizer``, ``max_length`` and ``stride``.

    Args:
        examples: batch exposing the "id", "question" and "context" columns.
            The "question" column is overwritten with the stripped questions.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", extended
        with "example_id" and the filtered "offset_mapping".
    """
    # leading/trailing whitespace in the question is dropped before tokenizing
    examples["question"] = [question.strip() for question in examples["question"]]

    features = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # one example may yield several features: feature position -> example position
    feature_to_example = features.pop("overflow_to_sample_mapping")

    # the context is the second sequence handed to the tokenizer
    context_sequence = 1
    num_features = len(features["input_ids"])

    # id of the source example for every feature, needed to regroup predictions later
    features["example_id"] = [
        examples["id"][feature_to_example[idx]] for idx in range(num_features)
    ]

    # keep character offsets only for context tokens, so spans can be mapped back to the context text
    for idx in range(num_features):
        seq_ids = features.sequence_ids(idx)
        filtered = []
        for position, offsets in enumerate(features["offset_mapping"][idx]):
            filtered.append(offsets if seq_ids[position] == context_sequence else None)
        features["offset_mapping"][idx] = filtered

    return features

In [None]:
import collections
import json
import logging
import os
from typing import Optional, Tuple

import numpy as np
from tqdm.auto import tqdm


# Module-level logger; postprocess_qa_predictions sets its level and logs through it.
logger = logging.getLogger(__name__)

# source: https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering
def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    version_2_with_negative: bool = False,
    n_best_size: int = 20,
    max_answer_length: int = 30,
    null_score_diff_threshold: float = 0.0,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
    log_level: Optional[int] = logging.WARNING,
):
    """
    Convert the start/end logits of a question-answering model into one answer string per example.

    For every example, the `n_best_size` highest start and end logits of each of its features are combined into
    candidate spans. Candidates that are not inside the context, that end before they start, or that span more than
    `max_answer_length` tokens are discarded. The remaining ones are scored with `start_logit + end_logit` and mapped
    back to a substring of the example's context through the feature's offset mapping.

    Args:
        examples: The non-tokenized dataset. It must support `examples["id"]`, `len()` and iteration over rows
            exposing the `"id"` and `"context"` keys.
        features: The tokenized dataset, indexable by position. Every feature exposes `"example_id"` and
            `"offset_mapping"` (with `None` at positions that are not part of the context) and may expose
            `"token_is_max_context"`.
        predictions: A pair `(start_logits, end_logits)`; the first dimension of each must equal `len(features)`.
        version_2_with_negative: Whether the empty answer is a valid prediction.
        n_best_size: Number of start/end logits considered per feature, and number of candidates kept per example.
        max_answer_length: Maximum length of a candidate, in tokens.
        null_score_diff_threshold: Only used with `version_2_with_negative`: the empty answer is selected when the
            null score minus the logits of the best non-empty candidate is greater than this value.
        output_dir: If given, an existing directory in which the predictions, the n-best candidates and (with
            `version_2_with_negative`) the null-score differences are saved as JSON files.
        prefix: Optional prefix for the names of the saved files.
        log_level: Level applied to the module logger. `None` leaves the logger's level untouched.

    Returns:
        An ordered dictionary mapping every example id to its predicted answer text.

    Raises:
        ValueError: If `predictions` does not have two elements or its length does not match `features`.
        EnvironmentError: If `output_dir` is given but is not a directory.
    """
    if len(predictions) != 2:
        raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
    all_start_logits, all_end_logits = predictions

    if len(predictions[0]) != len(features):
        raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")

    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    if version_2_with_negative:
        scores_diff_json = collections.OrderedDict()

    # Logging. `log_level` is declared Optional, and `setLevel(None)` raises, so only apply an actual level.
    if log_level is not None:
        logger.setLevel(log_level)
    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_prediction = None
        # Reset for every example. Without this, an example that has no feature (hence no null candidate) would
        # either hit an unbound variable or reuse the null score of the previous example. 0.0 matches the logits of
        # the placeholder prediction that is created below for such an example.
        null_score = 0.0
        prelim_predictions = []

        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]
            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
            # available in the current feature.
            token_is_max_context = features[feature_index].get("token_is_max_context", None)

            # Update minimum null prediction (position 0 of the logits stands for "no answer").
            feature_null_score = start_logits[0] + end_logits[0]
            if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or len(offset_mapping[start_index]) < 2
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[end_index]) < 2
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Don't consider answer that don't have the maximum context available (if such information is
                    # provided).
                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
                        continue

                    prelim_predictions.append(
                        {
                            "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
                            "score": start_logits[start_index] + end_logits[end_index],
                            "start_logit": start_logits[start_index],
                            "end_logit": end_logits[end_index],
                        }
                    )
        if version_2_with_negative and min_null_prediction is not None:
            # Add the minimum null prediction
            prelim_predictions.append(min_null_prediction)
            null_score = min_null_prediction["score"]

        # Only keep the best `n_best_size` predictions. (Named `n_best` so the `predictions` argument is not shadowed.)
        n_best = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]

        # Add back the minimum null prediction if it was removed because of its low score.
        if (
            version_2_with_negative
            and min_null_prediction is not None
            and not any(p["offsets"] == (0, 0) for p in n_best)
        ):
            n_best.append(min_null_prediction)

        # Use the offsets to gather the answer text in the original context.
        context = example["context"]
        for pred in n_best:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]

        # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
        # failure.
        if len(n_best) == 0 or (len(n_best) == 1 and n_best[0]["text"] == ""):
            n_best.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})

        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
        # the LogSumExp trick).
        scores = np.array([pred.pop("score") for pred in n_best])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()

        # Include the probabilities in our predictions.
        for prob, pred in zip(probs, n_best):
            pred["probability"] = prob

        # Pick the best prediction. If the null answer is not possible, this is easy.
        if not version_2_with_negative:
            all_predictions[example["id"]] = n_best[0]["text"]
        else:
            # Otherwise we first need to find the best non-empty prediction.
            # NOTE: this assumes at least one candidate has a non-empty text; the placeholder inserted above
            # guarantees it when there are zero candidates or only the null one.
            i = 0
            while n_best[i]["text"] == "":
                i += 1
            best_non_null_pred = n_best[i]

            # Then we compare to the null prediction using the threshold.
            score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
            scores_diff_json[example["id"]] = float(score_diff)  # To be JSON-serializable.
            if score_diff > null_score_diff_threshold:
                all_predictions[example["id"]] = ""
            else:
                all_predictions[example["id"]] = best_non_null_pred["text"]

        # Make the candidates JSON-serializable by casting any numpy float back to a Python float.
        all_nbest_json[example["id"]] = [
            {k: (float(v) if isinstance(v, np.floating) else v) for k, v in pred.items()}
            for pred in n_best
        ]

    # If we have an output_dir, let's save all those dicts.
    if output_dir is not None:
        if not os.path.isdir(output_dir):
            raise EnvironmentError(f"{output_dir} is not a directory.")

        prediction_file = os.path.join(
            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
        )
        nbest_file = os.path.join(
            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
            )

        logger.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")
        logger.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
        if version_2_with_negative:
            logger.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w") as writer:
                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    return all_predictions

In [None]:
# Tokenize the evaluation split into features. The original columns are removed, so `eval_set` only holds what
# preprocess_validation returns (tokenizer outputs plus example_id and offset_mapping).
eval_set = mrqa_eval.map(
    preprocess_validation,
    batched=True,
    remove_columns=mrqa_eval.column_names,
)

# Copy without the two bookkeeping columns, formatted as torch tensors; `eval_set` itself is kept because
# post-processing needs example_id and offset_mapping.
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

In [None]:
# Show the model-ready dataset as the cell output (quick check of its columns and size).
eval_set_for_model

In [None]:
# source: https://github.com/mrqa/MRQA-Shared-Task-2019/blob/master/mrqa_official_eval.py

import string
import re
import json
import gzip
from collections import Counter

def normalize_answer(s):
    """Normalize an answer string before it is compared.

    Applied in this order: lowercase the text, delete every character of
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs into single spaces.
    """
    lowered = s.lower()
    # translation table that deletes every ASCII punctuation character
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and one ground-truth answer.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace; tokens are matched as multisets. Returns 0 when no token is
    shared, otherwise the harmonic mean of precision and recall.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()

    # multiset intersection: each token counts at most as often as it occurs on both sides
    shared = sum((Counter(predicted) & Counter(expected)).values())
    if not shared:
        return 0

    precision = shared / len(predicted)
    recall = shared / len(expected)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are identical after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score a prediction against every ground truth and return the best score.

    ``metric_fn`` is called as ``metric_fn(prediction, ground_truth)`` once per
    ground truth. An empty ``ground_truths`` makes ``max`` raise ``ValueError``.
    """
    return max([metric_fn(prediction, truth) for truth in ground_truths])


def read_predictions(prediction_file):
    """Load a predictions file and return the decoded JSON content.

    Args:
        prediction_file: Path to a JSON file.

    Returns:
        Whatever the JSON document contains; evaluate_predictions expects a
        mapping from question id to predicted answer.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding, so files that contain
    # non-ASCII answers load the same way on every system.
    with open(prediction_file, encoding="utf-8") as f:
        return json.load(f)


def read_answers(gold_file):
    """Read gold answers from a gzip-compressed file with one JSON object per line.

    The first line is skipped when it contains a 'header' key. Every other
    line must hold a 'qas' list whose items provide 'id' and
    ``['answers']['text']``.

    Returns:
        A dict mapping each question id to its ``answers['text']`` value.
    """
    gold = {}
    with gzip.open(gold_file, 'rb') as handle:
        for line_number, raw_line in enumerate(handle):
            record = json.loads(raw_line)
            is_header = line_number == 0 and 'header' in record
            if not is_header:
                gold.update((qa['id'], qa['answers']['text']) for qa in record['qas'])
    return gold


def evaluate_predictions(answers, predictions, skip_no_answer=False):
    """Compute exact-match and F1 percentages over a set of questions.

    Args:
        answers: Mapping from question id to its list of ground-truth answers.
        predictions: Mapping from question id to the predicted answer string.
        skip_no_answer: When False, a question missing from ``predictions`` is
            reported on stdout and counted with a score of 0. When True, such
            questions are left out of the totals.

    Returns:
        ``{'exact_match': ..., 'f1': ...}`` as percentages of the counted
        questions. Both are 0.0 when no question was counted.
    """
    f1 = exact_match = total = 0
    for qid, ground_truths in answers.items():
        if qid not in predictions:
            if not skip_no_answer:
                message = 'Unanswered question %s will receive score 0.' % qid
                print(message)
                total += 1
            continue
        total += 1
        prediction = predictions[qid]
        # a prediction gets the best score it reaches against any of the ground truths
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    # Nothing was counted (empty `answers`, or every question skipped): avoid dividing by zero.
    if total == 0:
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [None]:
# Hub ids of the checkpoints to evaluate. Alternative lists left by the author:
#   ["enriquesaou/roberta-vmw-mrqa-plus-checkpoint-1000", "enriquesaou/roberta-vmw-mrqa-plus-checkpoint-1500",
#    "enriquesaou/roberta-vmw-mrqa-plus-checkpoint-2000", "enriquesaou/roberta-vmw-mrqa-plus-checkpoint-2500"]
#   ["enriquesaou/roberta-mrqa-plus"]
#   ["enriquesaou/roberta-vmw-mrqa-s"]
#   ["enriquesaou/roberta-vmw-mrqa", "VMware/roberta-base-mrqa"]
models_to_evaluate = ["enriquesaou/roberta-vmw-mrqa-waw"]
# Root directory; the evaluation loop writes each model's prediction files under "<output_dir>/<model_id>".
output_dir = "predictions/"

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering
import os
import numpy as np
from tqdm import tqdm

def split_batch(batch, chunk_size):
    """Yield consecutive chunks of a batch along the first dimension.

    Args:
        batch: Mapping from name to a value that supports ``.size(0)`` and
            slicing. The row count is taken from the first key only.
        chunk_size: Maximum number of rows per chunk.

    Yields:
        Dicts with the same keys as ``batch``, each holding the matching slice
        of every value; the last chunk may be shorter.
    """
    first_key = [*batch.keys()][0]
    row_count = batch[first_key].size(0)
    bounds = ((lo, min(lo + chunk_size, row_count)) for lo in range(0, row_count, chunk_size))
    for lo, hi in bounds:
        yield {name: values[lo:hi] for name, values in batch.items()}

def accumulate_predictions(predictions, start_logits_chunk, end_logits_chunk):
    """Append one chunk of start and end logits to the running lists.

    ``predictions`` must hold lists under 'start_logits' and 'end_logits'; it
    is modified in place and also returned.
    """
    new_chunks = (('start_logits', start_logits_chunk), ('end_logits', end_logits_chunk))
    for key, chunk in new_chunks:
        predictions[key].append(chunk)
    return predictions

# prepare batches
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# The whole tokenized evaluation set is moved to the device up front; only the forward passes are chunked.
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

chunk_size = 8  # adjust to memory
# split_batch is a generator without a length, so the number of chunks is computed here for the progress bar.
num_chunks = (len(eval_set_for_model) + chunk_size - 1) // chunk_size

for model_id in models_to_evaluate:
    model_for_eval = AutoModelForQuestionAnswering.from_pretrained(model_id).to(device)

    all_start_logits = []
    all_end_logits = []

    # chunk processing: run the model on a few features at a time and bring the logits back to the CPU
    for sub_batch in tqdm(split_batch(batch, chunk_size), total=num_chunks, desc=model_id):
        with torch.no_grad():
            outputs = model_for_eval(**sub_batch)

        all_start_logits.append(outputs.start_logits.cpu().numpy())
        all_end_logits.append(outputs.end_logits.cpu().numpy())

    # one row of logits per feature, in the same order as `eval_set`
    start_logits = np.concatenate(all_start_logits, axis=0)
    end_logits = np.concatenate(all_end_logits, axis=0)

    # for inspection: per-model directory that receives the prediction files
    outdir = f"{output_dir}/{model_id}"
    # exist_ok avoids the check-then-create race and makes re-running the cell safe
    os.makedirs(outdir, exist_ok=True)

    # postprocess the predictions (logits to actual answers)
    all_predictions = postprocess_qa_predictions(
        examples=mrqa_eval,
        features=eval_set,
        predictions=(start_logits, end_logits),
        version_2_with_negative=False,  # non squad-v2
        output_dir=outdir,
    )

    # compute metrics: question id -> list of gold answer texts
    answers = {qid: aws['text'] for qid, aws in zip(mrqa_eval['id'], mrqa_eval['answers'])}
    metrics = evaluate_predictions(answers, predictions=all_predictions)

    print(model_id, split, json.dumps(metrics))

# Deprecated: earlier single-batch version of the evaluation loop, kept below for reference only

In [None]:
"""
import torch
from transformers import AutoModelForQuestionAnswering
import os


# prepare batches. Use cuda for faster computation
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

for model_id in models_to_evaluate:
    # load model and evaluate
    model_for_eval = AutoModelForQuestionAnswering.from_pretrained(model_id).to(device)
    with torch.no_grad():
        outputs = model_for_eval(**batch)

    # retrieve results
    start_logits = outputs.start_logits.cpu().numpy()
    end_logits = outputs.end_logits.cpu().numpy()

    # output dir for inspection
    outdir = f"{output_dir}/{model_id}"
    if not os.path.exists(outdir): os.makedirs(outdir)

    # postprocess the predictions (logits to actual answers)
    all_predictions = postprocess_qa_predictions(
        examples=mrqa_eval,
        features=eval_set,
        predictions=(start_logits, end_logits),
        version_2_with_negative=False, # non squad-v2
        output_dir=outdir,
    )

    # compute metrics
    #answers = mrqa_eval.remove_columns(["subset", "context", "context_tokens", "question", "question_tokens", "detected_answers"]).to_dict()
    answers = {qid: aws for qid, aws in zip(mrqa_eval['id'], mrqa_eval['answers']['text'])}
    metrics = evaluate_predictions(answers, predictions=all_predictions)

    print(model_id, json.dumps(metrics))


    for k in answers.keys():
        print(all_predictions[k], answers[k])
        metrics = evaluate_predictions({k: answers[k]}, {k: all_predictions[k]})
        print(json.dumps(metrics))
"""