In [None]:
# Authenticate this runtime with the Hugging Face Hub. The token is read from
# Colab's `userdata` store under the key 'HF_TOKEN', so it never appears in the notebook.
from huggingface_hub import login
from google.colab import userdata

login(userdata.get('HF_TOKEN'))

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git --progress-bar off
!pip install -q -U git+https://github.com/huggingface/accelerate.git --progress-bar off
!pip install datasets evaluate --progress-bar off
!pip install -q -U bitsandbytes --progress-bar off
!pip install -q -U git+https://github.com/huggingface/peft.git --progress-bar off

In [None]:
# source: https://github.com/mrqa/MRQA-Shared-Task-2019/blob/master/mrqa_official_eval.py

import string
import re
import json
import gzip
from collections import Counter

def normalize_answer(s):
    """Canonicalise an answer string before it is compared.

    The steps run in this order: lowercase, drop every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs into single spaces.
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punctuation = ''.join(char for char in lowered if char not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and one reference answer.

    Both strings are normalised with ``normalize_answer`` and split on
    whitespace; the overlap is the multiset intersection of the two token
    lists. Returns ``0`` when no token is shared.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are identical after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against every reference and keep the best score.

    ``metric_fn`` is called as ``metric_fn(prediction, ground_truth)`` once per
    entry of ``ground_truths``. An empty ``ground_truths`` raises ``ValueError``
    (from ``max``).
    """
    return max([metric_fn(prediction, reference) for reference in ground_truths])


def read_predictions(prediction_file):
    """Load the predictions stored as JSON in ``prediction_file``.

    The file is decoded as UTF-8 explicitly so that answers containing
    non-ASCII characters load the same way regardless of the platform's
    default encoding.

    Returns whatever object the JSON document decodes to.
    """
    with open(prediction_file, encoding='utf-8') as f:
        return json.load(f)


def read_answers(gold_file):
    """Collect gold answers from a gzip-compressed JSON-lines file.

    Every line is parsed as one JSON object. The first line is skipped when it
    contains a 'header' key; every other object contributes one entry per item
    of its 'qas' list, mapping ``qa['qid']`` to ``qa['answers']``.
    """
    gold = {}
    with gzip.open(gold_file, 'rb') as stream:
        for line_no, raw_line in enumerate(stream):
            record = json.loads(raw_line)
            is_header = line_no == 0 and 'header' in record
            if not is_header:
                gold.update((qa['qid'], qa['answers']) for qa in record['qas'])
    return gold


def evaluate_predictions(answers, predictions, skip_no_answer=False):
    """Compute exact-match and F1 percentages over a set of questions.

    Args:
        answers: mapping from question id to the list of gold answer strings.
        predictions: mapping from question id to the predicted answer string.
        skip_no_answer: when False, a question without a prediction is
            reported and counted with score 0; when True it is left out of
            the totals entirely.

    Returns:
        ``{'exact_match': float, 'f1': float}``, both on a 0-100 scale. When
        no question is scored at all, both values are 0.0.
    """
    f1 = exact_match = total = 0

    for qid, ground_truths in answers.items():
        if qid not in predictions:
            if not skip_no_answer:
                message = 'Unanswered question %s will receive score 0.' % qid
                print(message)
                total += 1
            continue
        total += 1
        prediction = predictions[qid]
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    if total == 0:
        # Nothing was scored (no gold answers, or every question was skipped):
        # report zeros rather than dividing by zero.
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [None]:
# Sample question tagged with the "HotpotQA" subset. Every example cell rebinds
# `example` and `example_answer`, so the cell executed last decides which sample
# the model sections below are run on.
example = {'subset': "HotpotQA",
           'id': "62b2c06aa9b04b1a804341a512564410",
           'question': "While at the University of Michigan, Louis Smith played with what American jazz trumpeter, bandleader, and composer?",
           'answers': { "answer_start": [399], "text": ["Miles Dewey Davis III"] },
           'context': "[PAR] [TLE] Louis Smith (musician) [SEP] While studying at the University of Michigan, he played with visiting musicians such as Dizzy Gillespie, Miles Davis, Thad Jones and Billy Mitchell, before going on to play with Sonny Stitt, Count Basie and Al McKibbon, Cannonball Adderley, Percy Heath, Philly Joe Jones, Lou Donaldson, Donald Byrd, Kenny Dorham and Zoot Sims. [PAR] [TLE] Miles Davis [SEP] Miles Dewey Davis III (May 26, 1926September 28, 1991) was an American jazz trumpeter, bandleader, and composer. He is among the most influential and acclaimed figures in the history of jazz and 20th century music. Davis adopted a variety of musical directions in his five-decade career which kept him at the forefront of a number of major stylistic developments in jazz.",
           }

# Gold answers keyed by question id: the mapping shape evaluate_predictions() takes as `answers`.
example_answer = {example['id']: example['answers']['text']}

In [None]:
# Sample question tagged with the "RelationExtraction" subset. Every example cell
# rebinds `example` and `example_answer`, so the cell executed last decides which
# sample the model sections below are run on.
example = {'subset': "RelationExtraction",
           'id': "efc04fd3b51042c7a6e68dd1dd0af8ba",
           'question': "What year did 52nd government of Turkey start?",
           'answers': { "answer_start": [ 42 ], "text": [ "1995" ] },
           'context': "The 52nd government of Turkey (30 October 1995 -- 6 March 1996) was a caretaker coalition government formed by True Path Party (DYP) and Republican People's Party (CHP).",
           }

# Gold answers keyed by question id: the mapping shape evaluate_predictions() takes as `answers`.
example_answer = {example['id']: example['answers']['text']}

In [None]:
# Sample question tagged with the "SQuAD" subset. Every example cell rebinds
# `example` and `example_answer`, so the cell executed last decides which sample
# the model sections below are run on.
example = {'subset': "SQuAD",
           'id': "2654518b7c4d411ba2e45daa9c9e870e",
           'question': "Who opened the new Parliament building on October 9, 2004?",
           'answers': { "answer_start": [743], "text": [ "Queen Elizabeth II"] },
           'context': "Since September 2004, the official home of the Scottish Parliament has been a new Scottish Parliament Building, in the Holyrood area of Edinburgh. The Scottish Parliament building was designed by Spanish architect Enric Miralles in partnership with local Edinburgh Architecture firm RMJM which was led by Design Principal Tony Kettle. Some of the principal features of the complex include leaf-shaped buildings, a grass-roofed branch merging into adjacent parkland and gabion walls formed from the stones of previous buildings. Throughout the building there are many repeated motifs, such as shapes based on Raeburn's Skating Minister. Crow-stepped gables and the upturned boat skylights of the Garden Lobby, complete the unique architecture. Queen Elizabeth II opened the new building on 9 October 2004.",
           }

# Gold answers keyed by question id: the mapping shape evaluate_predictions() takes as `answers`.
example_answer = {example['id']: example['answers']['text']}

In [None]:
# Sample question tagged with the "TextbookQA" subset. Every example cell rebinds
# `example` and `example_answer`, so the cell executed last decides which sample
# the model sections below are run on.
example = {'subset': "TextbookQA",
           'id': "ede8e61b8ea94f4d99a0263b45e2a6b7",
           'question': "hydrocarbons in gas form are called __________________.",
           'answers': { "answer_start": [ 1357 ], "text": [ "natural gas" ] },
           'context': "Can you name some fossils? How about dinosaur bones or dinosaur footprints? Animal skeletons, teeth, shells, coprolites (otherwise known as feces), or any other remains or traces from a living creature that becomes rock is a fossil. The same processes that formed these fossils also created some of our most important energy resources, fossil fuels. Coal, oil, and natural gas are fossil fuels. Fossil fuels come from living matter starting about 500 million years ago. Millions of years ago, plants used energy from the Sun to form sugars, carbohydrates, and other energy-rich carbon compounds. As plants and animals died, their remains settled on the ground on land and in swamps, lakes, and seas (Figure 1.1). Over time, layer upon layer of these remains accumulated. Eventually, the layers were buried so deeply that they were crushed by an enormous mass of earth. The weight of this earth pressing down on these plant and animal remains created intense heat and pressure. After millions of years of heat and pressure, the material in these layers turned into chemicals called hydrocarbons (Figure 1.2). Hydrocarbons are made of carbon and hydrogen atoms. This molecule with one carbon and four hydrogen atoms is methane. Hydrocarbons can be solid, liquid, or gaseous. The solid form is what we know as coal. The liquid form is petroleum, or crude oil. Natural gas is the gaseous form. The solar energy stored in fossil fuels is a rich source of energy. Although fossil fuels provide very high quality energy, they are non-renewable. Click image to the left or use the URL below. URL:",
           }

# Gold answers keyed by question id: the mapping shape evaluate_predictions() takes as `answers`.
example_answer = {example['id']: example['answers']['text']}

In [None]:
# Sample question tagged with the "DROP" subset. Every example cell rebinds
# `example` and `example_answer`, so the cell executed last decides which sample
# the model sections below are run on.
example = {'subset': "DROP",
           'id': "21c3bcfcfd0b4f49b0663f8325b852a9",
           'question': "What happened first, the end of the rebellion of the city of Danzig or Maximilian's II death?",
           'answers': { "answer_start": [ 446, 446, 446, 446 ], "text": [ "Maximilian's II death", "Maximilian's II death", "Maximilian's II death", "Maximilian's II death" ] },
           'context': "The rebellion of the city of Danzig was a revolt from December 1575 to December 1577 of the city against the outcome of the Polish-Lithuanian royal election, 1576. The Polish throne was contested by Stephen Báthory and the Holy Roman Emperor Maximillian II. It began on 12 December 1575 when Emperor Maximillian was chosen as monarch by the Polish Senate, while the majority of the szlachta had voted for Bathory. It ended on 16 December 1577. Maximilian's II death in fall of 1576 weakened Danzig's position and made the conflict less about the recognition of the ruler than about Danzig's privileges. With neither side being able to defeat the other militarily, a compromise was reached, with economic as well as religious privileges of the city being restored and recognized, in return for a large reparation and recognition of Bathory as the king.",
           }

# Gold answers keyed by question id: the mapping shape evaluate_predictions() takes as `answers`.
example_answer = {example['id']: example['answers']['text']}

# Phi-3

In [None]:
# Checkpoint that supplies the tokenizer and the base weights for this section.
base_model_id = "microsoft/Phi-3-mini-4k-instruct"
# Repository passed to PeftModel.from_pretrained() and applied on top of the base model.
model_id = "enriquesaou/phi-3-mrqa"

In [None]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer is loaded from the base checkpoint, not from the adapter repository.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
    use_fast=False,
)

# Base weights, loaded in float16 with automatic device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16, # to fit into T4
)

# Wrap the base model with the PEFT weights stored under `model_id`.
# Note that base_model may be modified in place by this call.
model = PeftModel.from_pretrained(base_model, model_id)

In [None]:
def format_cqa(context, question):
    """Build the instruction prompt: fixed header, context, question, then the 'Answer: ' cue."""
    segments = (
        "Answer the question extracting from the context below.\nContext: ",
        context,
        "\nQuestion: ",
        question,
        "\nAnswer: ",
    )
    return "".join(segments)

In [None]:
def tokenize_and_generate(test_model, prompt, new_tokens=16):
    """Generate a completion for ``prompt`` and return the decoded text.

    Args:
        test_model: model exposing ``generate`` and ``device``.
        prompt: text to encode with the module-level ``tokenizer``.
        new_tokens: upper bound on the number of generated tokens.

    Returns:
        The first decoded sequence with special tokens removed. It is decoded
        from the full output of ``generate``, i.e. not trimmed to the newly
        generated part.
    """
    # Move the inputs to wherever the model lives instead of assuming 'cuda';
    # the model may have been placed elsewhere (e.g. by device_map="auto").
    # The attention mask is kept so generate() does not have to infer it,
    # which is ambiguous when the pad token id equals the EOS token id.
    inputs = tokenizer(prompt, return_tensors="pt").to(test_model.device)
    with torch.no_grad():
        outputs = test_model.generate(
            **inputs,
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=new_tokens,
        )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
# Maximum number of tokens the model may generate for the answer.
new_tok = 5

prompt = format_cqa(example['context'], example['question'])
outs = tokenize_and_generate(model, prompt, new_tokens=new_tok)
# Remove the prompt from the decoded text; if it did not match verbatim,
# fall back to the text following the 'Answer:' marker.
outs = outs.replace(prompt, '')
outs = outs.split('Answer:')[1] if 'Answer:' in outs else outs
# str.strip() returns a new string, so the result must be assigned back.
outs = outs.strip()

# compute metrics
metrics = evaluate_predictions(example_answer,
                               predictions={example['id']: outs})

print(model_id)
print("=== Original Answer ====")
print(' | '.join(example['answers']['text']))
print("====== Prediction ======")
print(outs)
print("========================")
print(json.dumps(metrics))

# T5

In [None]:
# Checkpoint whose tokenizer is loaded for the T5 section.
base_model_id = "google/flan-t5-base"
# Checkpoint whose weights are loaded for generation in the T5 section.
model_id = "enriquesaou/flan-t5-base-mrqa"

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

# These assignments rebind `model` and `tokenizer`, replacing the Phi-3 objects.
# Weights come from `model_id`; the tokenizer comes from `base_model_id`.
model = T5ForConditionalGeneration.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# Register "<s>" as the separator token; generate_input() inserts it between the prompt fields.
# NOTE(review): presumably the checkpoint under `model_id` was trained with this token
# already added to its vocabulary — confirm.
tokenizer.add_special_tokens({'sep_token': "<s>"})

In [None]:
import numpy as np

# Passed as `max_length` to both tokenizer calls in preprocess_validation().
max_length = 512
# Passed as `stride` to both tokenizer calls in preprocess_validation().
stride = 128

def generate_input(_question, _context):
    """Build the seq2seq prompt: 'question: … <sep> context: … <sep> answer:'.

    The question and context are stripped of surrounding whitespace and all
    pieces are joined with single spaces; ``<sep>`` is ``tokenizer.sep_token``.
    """
    question_part = ["question:", _question.strip(), tokenizer.sep_token]
    context_part = ["context:", _context.strip(), tokenizer.sep_token]
    return " ".join(question_part + context_part + ["answer:"])

def preprocess_mrqa_batch(examples):
    """Turn a batch of examples into model input strings and target strings.

    Args:
        examples: mapping with the keys "question", "context" and "answers",
            each holding one entry per example. Every entry of "answers" is a
            mapping with a "text" list.

    Returns:
        ``(inputs, targets)``: ``inputs`` holds one prompt per
        question/context pair, built with ``generate_input``; ``targets``
        holds the first answer text of each example, or "" when the example
        has no answer text.
    """
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answers"]

    inputs = [generate_input(question, context) for question, context in zip(questions, contexts)]
    # Test the "text" list itself: len(answer) would count the keys of the
    # answer dict, which is never 0, so an example without answer text would
    # raise IndexError on ["text"][0].
    targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers]
    return inputs, targets


# validation preprocessing
def preprocess_validation(examples):
    """Tokenize a batch for validation, one feature per overflow window.

    Inputs are tokenized with overflow enabled, so a single example can yield
    several features. Each feature receives the id of the example it came from
    (``example_id``) and that example's label ids (``labels``), in which pad
    token ids are replaced by -100.
    """
    inputs, targets = preprocess_mrqa_batch(examples)

    # Settings shared by the input and the target tokenization.
    shared_kwargs = dict(
        max_length=max_length,
        stride=stride,
        padding="max_length",
        truncation=True,
    )
    model_inputs = tokenizer(
        inputs,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        **shared_kwargs,
    )
    labels = tokenizer(text_target=targets, **shared_kwargs)

    # Padding positions in the labels become -100 so the loss ignores them.
    masked_labels = [
        [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]

    # For every feature, the index of the example it was cut from.
    sample_mapping = model_inputs.pop("overflow_to_sample_mapping")
    feature_count = len(model_inputs["input_ids"])
    owners = [sample_mapping[feature_index] for feature_index in range(feature_count)]

    # Each window gets the id and the labels of its source example.
    model_inputs["example_id"] = [examples["id"][owner] for owner in owners]
    model_inputs["labels"] = [masked_labels[owner] for owner in owners]
    return model_inputs


def postprocess_qa_predictions(examples, features, predictions):
    """Decode generated ids and map each example id to its decoded prediction.

    When ``predictions`` is a tuple only its first element is used. Ids equal
    to -100 are replaced by the pad token id before decoding. When several
    features share one example, the last of them provides the prediction.
    """
    token_ids = predictions[0] if isinstance(predictions, tuple) else predictions
    # -100 cannot be decoded, so swap it for the pad token id first.
    decodable = [np.where(seq != -100, seq, tokenizer.pad_token_id) for seq in token_ids]
    decoded_preds = tokenizer.batch_decode(decodable, skip_special_tokens=True)

    # example id -> example position, then example position -> feature position.
    position_of_example = {example_id: pos for pos, example_id in enumerate(examples["id"])}
    feature_for_example = {}
    for feature_pos, feature in enumerate(features):
        feature_for_example[position_of_example[feature["example_id"]]] = feature_pos

    return {
        example["id"]: decoded_preds[feature_for_example[example_pos]]
        for example_pos, example in enumerate(examples)
    }

In [None]:
from transformers import T5ForConditionalGeneration
import torch  # imported here so this section also runs when the Phi-3 cells were skipped


# Wrap every field in a one-element list so the single example looks like a batch.
# Named `t5_features` rather than `input` to avoid shadowing the builtin.
t5_features = preprocess_validation({k: [v] for k, v in example.items()})

# Inference only: no gradients needed. Tensors are moved to the model's device.
with torch.no_grad():
    outs = model.generate(input_ids=torch.tensor(t5_features['input_ids']).to(model.device),
                          attention_mask=torch.tensor(t5_features['attention_mask']).to(model.device),
                          max_new_tokens=16)


# postprocess the prediction
# A long context is split into several windows and generate() returns one
# sequence per window; only the first window's output is decoded.
decoded_pred = tokenizer.decode(outs[0].tolist(), skip_special_tokens=True)

# compute metrics
metrics = evaluate_predictions(example_answer,
                               predictions={example['id']: decoded_pred})

print(model_id)
print("=== Original Answer ====")
print(' | '.join(example['answers']['text']))
print("====== Prediction ======")
print(decoded_pred)
print("========================")
print(json.dumps(metrics))

# RoBERTa

In [None]:
# NOTE(review): base_model_id is not referenced by the RoBERTa cell that follows —
# presumably kept to record the starting checkpoint; confirm.
base_model_id = "VMware/roberta-base"
# Checkpoint loaded by the question-answering pipeline.
model_id = "enriquesaou/roberta-vmw-mrqa"

In [None]:
from transformers import pipeline

# Question-answering pipeline built from `model_id`, run on the selected example.
qa_pipeline = pipeline('question-answering', model=model_id, device_map="auto")
outs = qa_pipeline(question=example['question'], context=example['context'])

# compute metrics
metrics = evaluate_predictions(
    example_answer,
    predictions={example['id']: outs['answer']},
)

# Report: model id, gold answers, prediction, metrics — one item per line.
for report_line in (
    model_id,
    "=== Original Answer ====",
    ' | '.join(example['answers']['text']),
    "====== Prediction ======",
    outs['answer'],
    "========================",
    json.dumps(metrics),
):
    print(report_line)