# Github setup

In [1]:
!rm -rf canine_mva
!git clone https://{YOUR_SECRET_TOKEN}@github.com/chloeskt/canine_mva.git

Cloning into 'canine_mva'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 43 (delta 13), reused 40 (delta 10), pack-reused 0[K
Unpacking objects: 100% (43/43), done.


In [2]:
!pip install --quiet transformers datasets

In [3]:
import sys
import os
from pathlib import Path

sys.path.append(Path.cwd().joinpath("canine_mva").as_posix())

%load_ext autoreload
%autoreload 2

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
!nvidia-smi

Tue Mar 29 10:47:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports

We will heavily use the frameworks proposed by Hugging Face.

In [6]:
from datasets import load_dataset, load_metric, Dataset
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    CanineModel,
    CanineForQuestionAnswering,
    CanineTokenizer,
    default_data_collator,
    AdamW ,
    get_linear_schedule_with_warmup
)
import torch
import numpy as np
from torch.utils.data import DataLoader

In [7]:
from source.qa import (
    TokenizedDataset,
    Preprocessor,
    set_seed,
    train,
    QADataset,
    to_pandas,
    remove_examples_longer_than_threshold,
    postprocess_qa_predictions,
    compute_metrics,
)

In [8]:
seed = 0
set_seed(seed)

# Load and prepare data

In [9]:
squad_v2 = True

language = "en"
datasets = load_dataset("squad_v2" if squad_v2 else "squad")
datasets

Reusing dataset squad_v2 (/root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

Preprocess the dataset: add `answer_end`

In [10]:
preprocessor = Preprocessor(datasets)
datasets = preprocessor.preprocess()

Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-e5992ea5dfb97a07.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-9618f4ac47729f81.arrow


Check that the answer end character position has been computed as expected:

In [11]:
max_length = 2048

# canine tokenizer
model_checkpoint = "google/canine-c"
tokenizer = CanineTokenizer.from_pretrained(model_checkpoint)

for i, example in enumerate(datasets["train"]):
    if len(tokenizer(example["question"], example["context"])["input_ids"]) > max_length:
        break
example = datasets["train"][i]
i

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Token indices sequence length is longer than the specified maximum sequence length for this model (2219 > 2048). Running this sequence through the model will result in indexing errors


1629

In [12]:
idx = 764

print(datasets["train"]["context"][idx])
print(len(datasets["train"]["context"][idx]))
print()
print(datasets["train"]["question"][idx])
print()
print(datasets["train"]["answers"][idx])
print()
rep = datasets["train"]["context"][idx][datasets["train"]["answers"][idx]["answer_start"][0]:datasets["train"]["answers"][idx]["answer_end"][0]]
print(f">{rep}<")

Frédéric François Chopin (/ˈʃoʊpæn/; French pronunciation: ​[fʁe.de.ʁik fʁɑ̃.swa ʃɔ.pɛ̃]; 22 February or 1 March 1810 – 17 October 1849), born Fryderyk Franciszek Chopin,[n 1] was a Polish and French (by citizenship and birth of father) composer and a virtuoso pianist of the Romantic era, who wrote primarily for the solo piano. He gained and has maintained renown worldwide as one of the leading musicians of his era, whose "poetic genius was based on a professional technique that was without equal in his generation." Chopin was born in what was then the Duchy of Warsaw, and grew up in Warsaw, which after 1815 became part of Congress Poland. A child prodigy, he completed his musical education and composed his earlier works in Warsaw before leaving Poland at the age of 20, less than a month before the outbreak of the November 1830 Uprising.
849

What was Chopin's full name?

{'answer_end': [169], 'answer_start': [143], 'text': ['Fryderyk Franciszek Chopin']}

>Fryderyk Franciszek Chopin<


In [13]:
idx = i

print(datasets["train"]["context"][idx])
print(len(datasets["train"]["context"][idx]))
print()
print(datasets["train"]["question"][idx])
print()
print(datasets["train"]["answers"][idx])
print()
rep = datasets["train"]["context"][idx][datasets["train"]["answers"][idx]["answer_start"][0]:datasets["train"]["answers"][idx]["answer_end"][0]]
print(f">{rep}<")

Van Praag states that the Ming court established diplomatic delegations with Tibet merely to secure urgently needed horses. Wang and Nyima argue that these were not diplomatic delegations at all, that Tibetan areas were ruled by the Ming since Tibetan leaders were granted positions as Ming officials, that horses were collected from Tibet as a mandatory "corvée" tax, and therefore Tibetans were "undertaking domestic affairs, not foreign diplomacy". Sperling writes that the Ming simultaneously bought horses in the Kham region while fighting Tibetan tribes in Amdo and receiving Tibetan embassies in Nanjing. He also argues that the embassies of Tibetan lamas visiting the Ming court were for the most part efforts to promote commercial transactions between the lamas' large, wealthy entourage and Ming Chinese merchants and officials. Kolmaš writes that while the Ming maintained a laissez-faire policy towards Tibet and limited the numbers of the Tibetan retinues, the Tibetans sought to maintai

# Resizing the SQUAD dataset

In [14]:
max_length = 2048 # The maximum length of a feature (question and context)
doc_stride = 512 # The authorized overlap between two part of the context when splitting it is needed.

In [15]:
df_train = to_pandas(datasets["train"])
df_validation = to_pandas(datasets["validation"])

In [16]:
print(df_train.shape, df_validation.shape)

(130319, 5) (11873, 5)


### Remove examples with total length larger than a threshold

We decided to keep only examples for which we know that we won't encounter more than 1 overflow. This means that `row["length_context"] + 2 * row["length_question"] + 6 + doc_stride` must be lower than `2 * max_length`.

On Squadv2 we loose 16 examples on the training set and 12 on the validation set.

Note: the following cell takes quite some time to run, approximately 5 minutes.

In [17]:
df_train = remove_examples_longer_than_threshold(df_train, max_length=max_length*2, doc_stride=doc_stride)
df_validation = remove_examples_longer_than_threshold(df_validation, max_length=max_length*2, doc_stride=doc_stride)

In [18]:
print(df_train.shape, df_validation.shape)

(130303, 7) (11861, 7)


### Select random examples to reduce the size of the datasets

In [19]:
subsample = True
nb_train = 200
nb_val = 100

if subsample:
    # select random indices
    random_indices = np.random.choice(range(df_train.shape[0]), nb_train, replace=False)
    df_train = df_train.loc[random_indices]

    random_indices = np.random.choice(range(df_validation.shape[0]), nb_val, replace=False)
    df_validation = df_validation.loc[random_indices]

In [20]:
print(df_train.shape, df_validation.shape)

(200, 7) (100, 7)


In [21]:
# remove unnecessary columns

df_train = df_train.drop(columns=["length_context", "length_context"])
df_validation = df_validation.drop(columns=["length_context", "length_context"])

In [22]:
# convert back to huggingface dataset
datasets["train"] = Dataset.from_pandas(df_train)
datasets["validation"] = Dataset.from_pandas(df_validation)

In [23]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'length_question', '__index_level_0__'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'length_question', '__index_level_0__'],
        num_rows: 100
    })
})

In [24]:
del df_train, df_validation

# Tokenize dataset and prepare for training

In [25]:
# canine tokenizer
model_checkpoint = "google/canine-c"
tokenizer = CanineTokenizer.from_pretrained(model_checkpoint)

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [26]:
datasets["validation"], datasets["train"]

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers', 'length_question', '__index_level_0__'],
     num_rows: 100
 }), Dataset({
     features: ['id', 'title', 'context', 'question', 'answers', 'length_question', '__index_level_0__'],
     num_rows: 200
 }))

In [27]:
tokenizer_dataset = TokenizedDataset(tokenizer, max_length, doc_stride, squad_v2=squad_v2, language=language)
tokenized_datasets = datasets.map(tokenizer_dataset.tokenize, batched=True, remove_columns=datasets["train"].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [28]:
tokenized_datasets["train"] = tokenized_datasets["train"].remove_columns(["example_id"])

In [29]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions', 'example_id'],
        num_rows: 101
    })
})

# Prepare validation features

In [30]:
validation_examples = tokenized_datasets["validation"]
tokenized_datasets["validation"] = tokenized_datasets["validation"].remove_columns(["example_id"])

# Load metric for SQUAD

In [31]:
metric = load_metric("squad_v2" if squad_v2 else "squad")

# Datasets

In [32]:
tokenized_datasets["train"].set_format("torch")
tokenized_datasets["validation"].set_format("torch")

# Dataloaders

In [33]:
batch_size = 6
# initialize data loader for training data
train_loader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, shuffle=True, drop_last=True, num_workers=2)
# initialize validation set data loader
val_loader = DataLoader(tokenized_datasets["validation"], batch_size=batch_size, shuffle=False, drop_last=True, num_workers=2)

# Fine-tuning the model

We can use the `AutoModelForQuestionAnswering` class to finetune CANINE.

In [34]:
model = CanineForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
learning_rate = 5e-5
num_train_epochs = 3
weight_decay = 0.01
model_name = model_checkpoint.split("/")[-1]
output_dir = f"models/{model_name}"

In [36]:
# Freeze canine layers and only train the last layer (qa_outputs)
freeze = False

if freeze:
    optimizer = torch.optim.AdamW(model.qa_outputs.parameters(), lr=learning_rate, weight_decay=weight_decay, eps=1e-8)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay, eps=1e-8)

In [37]:
lr_scheduler = True

if lr_scheduler:
    lr_scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=500, #0.1
            num_training_steps=len(train_loader) * num_train_epochs
        )

In [38]:
max_answer_length = 256
n_best_size = 20

print("Start training")
train_losses, val_losses = train(
    model=model,
    num_epochs=num_train_epochs,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    val_dataset=datasets["validation"],
    features_val_dataset=validation_examples,
    tokenizer=tokenizer,
    metric=metric,
    batch_size=batch_size,
    learning_rate=learning_rate,
    max_answer_length=max_answer_length,
    n_best_size=n_best_size,
    best_f1=68.,
    lr_scheduler=lr_scheduler,
    drive=True,
    squad_v2=squad_v2,
    clipping=False
)

Start training


Epoch 0: 100%|██████████| 33/33 [00:25<00:00,  1.31it/s, loss=7.51]
  0%|          | 0/16 [00:00<?, ?it/s, loss=7.39]

  0%|          | 0/1 [00:00<?, ?ba/s]

  6%|▋         | 1/16 [00:02<00:31,  2.12s/it, loss=7.45]

  0%|          | 0/1 [00:00<?, ?ba/s]

 12%|█▎        | 2/16 [00:04<00:27,  1.96s/it, loss=7.41]

  0%|          | 0/1 [00:00<?, ?ba/s]

 19%|█▉        | 3/16 [00:06<00:24,  1.88s/it, loss=7.42]

  0%|          | 0/1 [00:00<?, ?ba/s]

 25%|██▌       | 4/16 [00:07<00:22,  1.84s/it, loss=7.37]

  0%|          | 0/1 [00:00<?, ?ba/s]

 31%|███▏      | 5/16 [00:09<00:21,  1.93s/it, loss=7.46]

  0%|          | 0/1 [00:00<?, ?ba/s]

 38%|███▊      | 6/16 [00:11<00:19,  1.91s/it, loss=7.43]

  0%|          | 0/1 [00:00<?, ?ba/s]

 44%|████▍     | 7/16 [00:13<00:17,  1.93s/it, loss=7.44]

  0%|          | 0/1 [00:00<?, ?ba/s]

 50%|█████     | 8/16 [00:15<00:14,  1.85s/it, loss=7.41]

  0%|          | 0/1 [00:00<?, ?ba/s]

 56%|█████▋    | 9/16 [00:17<00:13,  1.86s/it, loss=7.42]

  0%|          | 0/1 [00:00<?, ?ba/s]

 62%|██████▎   | 10/16 [00:18<00:10,  1.75s/it, loss=7.39]

  0%|          | 0/1 [00:00<?, ?ba/s]

 69%|██████▉   | 11/16 [00:20<00:09,  1.84s/it, loss=7.4]

  0%|          | 0/1 [00:00<?, ?ba/s]

 75%|███████▌  | 12/16 [00:22<00:07,  1.91s/it, loss=7.4]

  0%|          | 0/1 [00:00<?, ?ba/s]

 81%|████████▏ | 13/16 [00:24<00:05,  1.85s/it, loss=7.37]

  0%|          | 0/1 [00:00<?, ?ba/s]

 88%|████████▊ | 14/16 [00:26<00:03,  1.90s/it, loss=7.42]

  0%|          | 0/1 [00:00<?, ?ba/s]

 94%|█████████▍| 15/16 [00:28<00:01,  1.86s/it, loss=7.35]

  0%|          | 0/1 [00:00<?, ?ba/s]

100%|██████████| 16/16 [00:30<00:00,  1.88s/it, f1=3.33]



Epoch 0 complete! Training Loss: 7.602179917422208, Validation Loss : 7.4092020988464355, Validation Accuracy: 0.0052083334885537624
F1-score: 8.27392689851488, Exact match: 3.125


Epoch 1: 100%|██████████| 33/33 [00:24<00:00,  1.33it/s, loss=6.78]
  0%|          | 0/16 [00:00<?, ?it/s, loss=6.56]

  0%|          | 0/1 [00:00<?, ?ba/s]

  6%|▋         | 1/16 [00:01<00:23,  1.58s/it, loss=6.4]

  0%|          | 0/1 [00:00<?, ?ba/s]

 12%|█▎        | 2/16 [00:03<00:19,  1.43s/it, loss=6.59]

  0%|          | 0/1 [00:00<?, ?ba/s]

 19%|█▉        | 3/16 [00:04<00:15,  1.23s/it, loss=6.39]

  0%|          | 0/1 [00:00<?, ?ba/s]

 25%|██▌       | 4/16 [00:05<00:15,  1.26s/it, loss=6.45]

  0%|          | 0/1 [00:00<?, ?ba/s]

 31%|███▏      | 5/16 [00:06<00:13,  1.20s/it, loss=6.71]

  0%|          | 0/1 [00:00<?, ?ba/s]

 38%|███▊      | 6/16 [00:07<00:11,  1.15s/it, loss=6.23]

  0%|          | 0/1 [00:00<?, ?ba/s]

 44%|████▍     | 7/16 [00:08<00:10,  1.19s/it, loss=6.33]

  0%|          | 0/1 [00:00<?, ?ba/s]

 50%|█████     | 8/16 [00:10<00:09,  1.18s/it, loss=6.39]

  0%|          | 0/1 [00:00<?, ?ba/s]

 56%|█████▋    | 9/16 [00:11<00:08,  1.17s/it, loss=6.52]

  0%|          | 0/1 [00:00<?, ?ba/s]

 62%|██████▎   | 10/16 [00:12<00:07,  1.21s/it, loss=6.16]

  0%|          | 0/1 [00:00<?, ?ba/s]

 69%|██████▉   | 11/16 [00:13<00:05,  1.16s/it, loss=6.18]

  0%|          | 0/1 [00:00<?, ?ba/s]

 75%|███████▌  | 12/16 [00:14<00:04,  1.18s/it, loss=6.12]

  0%|          | 0/1 [00:00<?, ?ba/s]

 81%|████████▏ | 13/16 [00:15<00:03,  1.14s/it, loss=6.55]

  0%|          | 0/1 [00:00<?, ?ba/s]

 88%|████████▊ | 14/16 [00:16<00:02,  1.12s/it, loss=6.58]

  0%|          | 0/1 [00:00<?, ?ba/s]

 94%|█████████▍| 15/16 [00:18<00:01,  1.23s/it, loss=6.29]

  0%|          | 0/1 [00:00<?, ?ba/s]

100%|██████████| 16/16 [00:19<00:00,  1.21s/it, f1=50]



Epoch 1 complete! Training Loss: 7.137305274154201, Validation Loss : 6.403667211532593, Validation Accuracy: 0.42708334419876337
F1-score: 45.695684523809526, Exact match: 45.41666666666667


Epoch 2: 100%|██████████| 33/33 [00:24<00:00,  1.33it/s, loss=5.46]
  0%|          | 0/16 [00:00<?, ?it/s, loss=5.38]

  0%|          | 0/1 [00:00<?, ?ba/s]

  6%|▋         | 1/16 [00:01<00:18,  1.24s/it, loss=4.65]

  0%|          | 0/1 [00:00<?, ?ba/s]

 12%|█▎        | 2/16 [00:02<00:15,  1.12s/it, loss=5.56]

  0%|          | 0/1 [00:00<?, ?ba/s]

 19%|█▉        | 3/16 [00:03<00:13,  1.03s/it, loss=4.95]

  0%|          | 0/1 [00:00<?, ?ba/s]

 25%|██▌       | 4/16 [00:04<00:12,  1.01s/it, loss=5.24]

  0%|          | 0/1 [00:00<?, ?ba/s]

 31%|███▏      | 5/16 [00:05<00:10,  1.06it/s, loss=5.84]

  0%|          | 0/1 [00:00<?, ?ba/s]

 38%|███▊      | 6/16 [00:06<00:09,  1.09it/s, loss=4.13]

  0%|          | 0/1 [00:00<?, ?ba/s]

 44%|████▍     | 7/16 [00:07<00:08,  1.07it/s, loss=5.03]

  0%|          | 0/1 [00:00<?, ?ba/s]

 50%|█████     | 8/16 [00:08<00:07,  1.06it/s, loss=4.91]

  0%|          | 0/1 [00:00<?, ?ba/s]

 56%|█████▋    | 9/16 [00:09<00:06,  1.05it/s, loss=4.63]

  0%|          | 0/1 [00:00<?, ?ba/s]

 62%|██████▎   | 10/16 [00:09<00:05,  1.07it/s, loss=4.51]

  0%|          | 0/1 [00:00<?, ?ba/s]

 69%|██████▉   | 11/16 [00:10<00:04,  1.08it/s, loss=4.53]

  0%|          | 0/1 [00:00<?, ?ba/s]

 75%|███████▌  | 12/16 [00:11<00:03,  1.14it/s, loss=4.01]

  0%|          | 0/1 [00:00<?, ?ba/s]

 81%|████████▏ | 13/16 [00:12<00:02,  1.19it/s, loss=5.28]

  0%|          | 0/1 [00:00<?, ?ba/s]

 88%|████████▊ | 14/16 [00:13<00:01,  1.20it/s, loss=5.16]

  0%|          | 0/1 [00:00<?, ?ba/s]

 94%|█████████▍| 15/16 [00:14<00:00,  1.11it/s, loss=4.56]

  0%|          | 0/1 [00:00<?, ?ba/s]

100%|██████████| 16/16 [00:14<00:00,  1.08it/s, f1=50]


Epoch 2 complete! Training Loss: 6.121442433559533, Validation Loss : 4.898667186498642, Validation Accuracy: 0.46354168001562357
F1-score: 48.02083333333333, Exact match: 47.5





In [39]:
train_losses, val_losses

([7.602179917422208, 7.137305274154201, 6.121442433559533],
 [7.4092020988464355, 6.403667211532593, 4.898667186498642])

In [40]:
# Load best model:
model.load_state_dict(torch.load("/content/drive/MyDrive/models/canine-c/CANINE_lr_5e-05_val_loss_1.2391124304863597_f1_70.8831228680206_acc_0.6422987520776525_ep_2.pt"))
model.eval()

# take subset of validation set to compute f1-score
random_indices = np.random.choice(range(len(datasets["validation"])), 20, replace=False)
small_eval_set = datasets["validation"].select(random_indices)

# preprocess this subset to get features
eval_set = small_eval_set.map(
    tokenizer_dataset.tokenize,
    batched=True,
    remove_columns=small_eval_set.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [41]:
eval_set_features = eval_set.remove_columns('example_id')
eval_set_features.set_format("torch")

In [42]:
device = torch.device("cpu")
batch = {k: eval_set_features[k].to(device) for k in eval_set_features.column_names}
model = model.to(device)

with torch.no_grad():
    outputs = model(**batch)

In [43]:
final_predictions, data = postprocess_qa_predictions(
    data=small_eval_set,
    features=eval_set,
    raw_predictions=outputs,
    tokenizer=tokenizer,
    n_best_size=20,
    max_answer_length=max_answer_length,
    squad_v2=squad_v2,
)

In [44]:
metrics = compute_metrics(
            metric,
            small_eval_set,
            final_predictions,
            squad_v2,
        )

  0%|          | 0/1 [00:00<?, ?ba/s]

In [45]:
metrics["f1"], metrics["exact"]

(75.0, 75.0)