# Testing Canine on SQuAD dataset

Notebook adapted from [Hugging Face’s QA guide](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)

# Setup

In [None]:
!pip install datasets transformers

In [None]:
import numpy as np

In [None]:
# Hub identifier of the pretrained checkpoint; its last path component is
# reused later to name the training output directory.
model_checkpoint = "google/canine-s"

# Dataset

## Loading the dataset

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# Load the SQuAD dataset; its "train" and "validation" splits are used below.
datasets = load_dataset("squad")

**Display the dataset:**

In [None]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Columns whose feature type is a `ClassLabel` (or a `Sequence` of
    `ClassLabel`) are shown with their label names instead of integer ids.

    Args:
        dataset: Dataset supporting `len()`, selection by a list of row
            indices (`dataset[[i, j, ...]]`) and a `features` mapping from
            column name to feature type.
        num_examples: Number of distinct rows to display; must not exceed
            `len(dataset)`.

    Raises:
        AssertionError: If `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the picks are distinct
    # without the retry loop (and its linear membership test) per draw.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Replace integer class ids by their human-readable names.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Same conversion, applied element-wise to sequences of class ids.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(datasets["train"])

## Preprocessing the training data

In [None]:
from transformers import CanineTokenizer, CanineForQuestionAnswering
import transformers
    
# Reuse model_checkpoint instead of repeating the literal, so the checkpoint
# only has to be changed in one place.
tokenizer = CanineTokenizer.from_pretrained(model_checkpoint)

In [None]:
max_length = 2048 # Maximum length of a feature (question and context combined).
doc_stride = 128 # Authorized overlap between two parts of the context when it needs to be split. Not really needed here.

**Check the output of the tokenizer:**

In [None]:
tokenizer("What is your name?", "My name is Sylvain.")

In [None]:
example = datasets["train"][0]

In [None]:
# Tokenize one question/context pair. With truncation="only_second", only the
# context (the second sequence) is truncated when the pair exceeds max_length.
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length,
    truncation="only_second",
    return_overflowing_tokens=True,
    stride=doc_stride
)

In [None]:
tokenized_example.keys()

**Check the link between character and token positions:**

In [None]:
answers = example["answers"]
# Character span of the first reference answer inside the context
# (end_char is exclusive).
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])

# This arithmetic relies on the tokenizer emitting one token per character, so
# a context character index becomes a token index by a constant shift.
# NOTE(review): the +2 presumably accounts for the [CLS] token before the
# question and the [SEP] token after it -- confirm against the tokenizer docs.
offset = len(example["question"])+2  #+2 for the two special tokens surrounding the question
start_position, end_position = start_char+offset, end_char+offset

In [None]:
print(example["context"][start_char: end_char])
print(tokenizer.decode(tokenized_example["input_ids"][start_position: end_position]))
print(answers["text"][0])

**Add the start and end token positions of the answer in the text:**

In [None]:
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of examples and attach the answer's start/end token positions.

    The questions in `examples` are stripped of leading whitespace in place.
    Each question/context pair is tokenized and padded to `max_length`; the
    positions are derived from the first reference answer by shifting its
    character span by the question length plus two.

    Args:
        examples: Batch with "question", "context" and "answers" columns, where
            each answers entry holds "answer_start" and "text" lists.

    Returns:
        The tokenizer output with added "start_positions" and "end_positions"
        lists (the end position is exclusive).
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    truncation_mode = "only_second" if pad_on_right else "only_first"
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        padding="max_length",
    )

    # No overflowing tokens are requested above, so features and examples are
    # walked in lockstep.
    start_positions = []
    end_positions = []
    for answers, question in zip(examples["answers"], examples["question"]):
        answer_start = answers["answer_start"][0]
        answer_end = answer_start + len(answers["text"][0])

        # Shift from context character index to position in the encoded pair.
        shift = len(question) + 2
        start_positions.append(answer_start + shift)
        end_positions.append(answer_end + shift)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

In [None]:
# Sanity check on a handful of training rows: the tokens between the computed
# start/end positions should decode back to the reference answers.
examples = datasets["train"][:5]
print(examples["answers"][:])
features = prepare_train_features(examples)
print(features["start_positions"])
for ids, span_start, span_end in zip(
    features["input_ids"], features["start_positions"], features["end_positions"]
):
    print(tokenizer.decode(ids[span_start:span_end]))

In [None]:
# Apply prepare_train_features to every split in batches, dropping the original
# columns so only the tokenizer outputs and the answer positions remain.
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)

# Fine-tuning the model

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Reuse model_checkpoint instead of repeating the literal, so the model and the
# output directory name always refer to the same checkpoint.
model = CanineForQuestionAnswering.from_pretrained(model_checkpoint)
# Per-device batch size used for both training and evaluation.
batch_size = 4

In [None]:
# Name the output directory after the last path component of the checkpoint.
model_name = model_checkpoint.split("/")[-1]
# Training configuration: one epoch, evaluation once per epoch, same batch size
# for training and evaluation, nothing pushed to the Hub.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
)

In [None]:
from transformers import default_data_collator

data_collator = default_data_collator

In [None]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# train/validation splits, the data collator and the tokenizer.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("test-squad-trained")

# Evaluation

In [None]:
import torch

# Fetch only the first evaluation batch; next(iter(...)) states that intent
# directly instead of entering a loop just to break out of it.
batch = next(iter(trainer.get_eval_dataloader()))
# Move every tensor of the batch to the device the trainer runs on.
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
# Inference only: no gradients are needed.
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()

In [None]:
n_best_size = 20

**Tokenize the validation dataset:**

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of validation examples for prediction.

    Leading whitespace is removed from the questions (in place on `examples`):
    it carries no information and would eat into the room left for the
    context. The side holding the context is the one that gets truncated, and
    every feature is padded to `max_length`.

    Args:
        examples: Batch with "question" and "context" columns.

    Returns:
        The tokenizer output for the batch.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The column order and the truncated side both follow the padding side, so
    # that it is always the context that gets truncated.
    if pad_on_right:
        first_column, second_column = "question", "context"
        truncation_mode = "only_second"
    else:
        first_column, second_column = "context", "question"
        truncation_mode = "only_first"

    return tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        padding="max_length",
    )

In [None]:
# Tokenize the validation split in batches, dropping its original columns so
# only the tokenizer outputs remain.
validation_features = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

**Apply the model to the validation dataset:**

In [None]:
raw_predictions = trainer.predict(validation_features)

In [None]:
# Keep the current format type but expose every column of the features again.
# NOTE(review): presumably needed because predicting hides the columns the
# model does not consume -- confirm against the Trainer documentation.
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

**Check if the model works on one example:**

In [None]:
max_answer_length = 30

In [None]:
 datasets["validation"][1]["question"]

In [None]:
# Get the best answers of the model for this example

start_logits = output.start_logits[1].cpu().numpy()
end_logits = output.end_logits[1].cpu().numpy()
# Read the dataset row once instead of re-fetching it inside the nested loop.
validation_example = datasets["validation"][1]
context = validation_example["context"]
# Shift between a position in the encoded pair and a character index in the
# context. The question is left-stripped before tokenization, so the stripped
# length is the one that matches the encoded input.
context_offset = len(validation_example["question"].lstrip()) + 2

# Gather the indices of the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        start_char = start_index - context_offset
        end_char = end_index - context_offset
        # Only keep spans lying inside the context: a negative start would
        # otherwise slice from the end of the string, and an end past the
        # context would be silently clipped.
        if start_char < 0 or end_char > len(context):
            continue
        valid_answers.append(
            {
                "score": start_logits[start_index] + end_logits[end_index],
                "text": context[start_char: end_char]
            }
        )

valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers

In [None]:
datasets["validation"][1]["answers"]

**Compute the answers of the questions:**

In [None]:
import collections

examples = datasets["validation"]
features = validation_features



In [None]:
from tqdm.auto import tqdm

#Get the predictions to text format
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    """Convert raw start/end logits into one predicted answer string per example.

    For each example, the `n_best_size` highest start and end logits are
    combined into candidate spans. Spans that are reversed, longer than
    `max_answer_length`, or not fully inside the context are discarded, and
    the text of the highest-scoring remaining span is kept.

    Args:
        examples: Iterable of examples providing "id", "question" and
            "context".
        features: Tokenized features, assumed to be aligned one-to-one with
            `examples`. Not read by this function; kept so existing callers
            keep working.
        raw_predictions: Pair `(all_start_logits, all_end_logits)`, each
            indexable by example position.
        n_best_size: Number of top start and top end positions to combine.
        max_answer_length: Maximum span length, in positions, of a candidate.

    Returns:
        An `OrderedDict` mapping each example id to its predicted answer text
        (the empty string when no candidate span is valid).
    """
    all_start_logits, all_end_logits = raw_predictions

    predictions = collections.OrderedDict()

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        # Shift between a position in the encoded pair and a character index
        # in the context. Questions are left-stripped before tokenization, so
        # the stripped length is the one that matches the encoded input.
        offset = len(example["question"].lstrip()) + 2

        start_logits = all_start_logits[example_index]
        end_logits = all_end_logits[example_index]

        # Go through all possibilities for the `n_best_size` greatest start and end logits.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

        valid_answers = []
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Don't consider answers with a length that is either < 0 or > max_answer_length.
                if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue

                start_char = start_index - offset
                end_char = end_index - offset
                # Only keep spans lying inside the context: a negative start
                # would otherwise slice from the end of the string, and an end
                # past the context would be silently clipped.
                if start_char < 0 or end_char > len(context):
                    continue

                valid_answers.append(
                    {
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char:end_char],
                    }
                )

        if valid_answers:
            # max() keeps the first of equally scored candidates, exactly like
            # taking element 0 of a stable descending sort.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # No valid candidate at all: fall back to an empty prediction.
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions

In [None]:
final_predictions = postprocess_qa_predictions(datasets["validation"], validation_features, raw_predictions.predictions)

**Compute the metrics:**

In [None]:
metric = load_metric("squad")

In [None]:
# Pair each example id with its predicted text, and each validation example id
# with its reference answers, then score the predictions against the references.
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets["validation"]]
metric.compute(predictions=formatted_predictions, references=references)