In [1]:
import os

import torch
from transformers import (
    CanineForQuestionAnswering,
    default_data_collator,
    get_linear_schedule_with_warmup,
    CanineTokenizer,
)
from datasets import load_dataset, DatasetDict, load_metric, Dataset
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm

# Register tqdm's pandas integration (`progress_apply` and friends) so row-wise
# pandas operations can display a progress bar.
tqdm.pandas()

# Load the autoreload extension before enabling it; mode 2 reloads imported modules
# before each cell runs, so edits to the local `question_answering` package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from question_answering import (
    Preprocessor,
    CanineDatasetTokenizer,
    CanineCTrainer,
    TrainerArguments,
    DataArguments,
    set_seed,
    to_pandas,
    remove_examples_longer_than_threshold,
)

In [3]:
# Fix the random seed before any stochastic step so the run is repeatable.
# NOTE(review): `set_seed` is a project helper; presumably it seeds Python, NumPy and
# torch — confirm, since the subsampling cell relies on NumPy's global RNG.
seed = 0
set_seed(seed)

In [4]:
# Toggle between SQuAD v2 and SQuAD v1.1; the same flag is reused later for the
# tokenizer, the metric and the trainer's data arguments.
squad_v2 = True

dataset_name = "squad_v2" if squad_v2 else "squad"
datasets = load_dataset(dataset_name)
datasets

Reusing dataset squad_v2 (/home/kaliayev/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [5]:
# Run the project's preprocessing over every split.
# NOTE(review): `datasets` is rebound to the preprocessed result, so re-running this
# cell feeds already-preprocessed data back into `Preprocessor` — re-run the loading
# cell first if that is not idempotent (cannot be confirmed from this notebook).
preprocessor = Preprocessor(datasets)
datasets = preprocessor.preprocess()

Loading cached processed dataset at /home/kaliayev/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-e1f546e6593e44d5.arrow
Loading cached processed dataset at /home/kaliayev/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-acfc79c6c4e7149f.arrow


In [6]:
# CANINE tokenizer, loaded from the same checkpoint name that is later used to load
# the question-answering model.
pretrained_model_name = "google/canine-c"
tokenizer = CanineTokenizer.from_pretrained(pretrained_model_name)

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [7]:
# NOTE(review): CANINE works on characters, so these lengths are presumably counted
# in characters rather than word pieces — confirm against CanineDatasetTokenizer.
max_length = 2048  # Maximum length of a single feature (question + context).
doc_stride = 512  # Allowed overlap between two consecutive parts of a context when it has to be split.

In [8]:
# Convert both splits to pandas DataFrames (train first, then validation) so they
# can be filtered and subsampled with pandas.
df_train, df_validation = (
    to_pandas(datasets[split_name]) for split_name in ("train", "validation")
)

In [9]:
# Shapes of the train / validation frames before the length filter.
print(*(frame.shape for frame in (df_train, df_validation)))

(130319, 5) (11873, 5)


In [10]:
# Both splits are filtered with the same settings, so define them once.
# NOTE(review): the threshold passed here is twice `max_length`, not `max_length`
# itself — confirm this is intentional.
length_filter_kwargs = {"max_length": max_length * 2, "doc_stride": doc_stride}

df_train = remove_examples_longer_than_threshold(df_train, **length_filter_kwargs)
df_validation = remove_examples_longer_than_threshold(
    df_validation, **length_filter_kwargs
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 130319/130319 [01:59<00:00, 1087.66it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11873/11873 [00:10<00:00, 1098.52it/s]


In [11]:
# Shapes of the train / validation frames after the length filter.
print(f"{df_train.shape} {df_validation.shape}")

(130303, 7) (11861, 7)


In [12]:
# Optionally shrink both splits to a small random sample (100 train / 20 validation
# rows) for a quick end-to-end run. Set to False to use the full filtered data.
reduce_size = True

if reduce_size:
    # The sampled values are row *positions* in [0, n), so they must be used with the
    # position-based `.iloc`. The length filter dropped rows, so the index labels may
    # no longer be a contiguous 0..n-1 range; label-based `.loc` could then raise a
    # KeyError or never select the rows whose labels are >= n.
    random_indices = np.random.choice(df_train.shape[0], 100, replace=False)
    df_train = df_train.iloc[random_indices]

    random_indices = np.random.choice(df_validation.shape[0], 20, replace=False)
    df_validation = df_validation.iloc[random_indices]

    print(df_train.shape, df_validation.shape)

(100, 7) (20, 7)


In [13]:
# Replace both splits with Dataset objects built from the filtered (and optionally
# subsampled) DataFrames.
datasets["train"] = Dataset.from_pandas(df_train)
datasets["validation"] = Dataset.from_pandas(df_validation)

# The DataFrames are not used after this point; drop the names so the memory can be reclaimed.
del df_train, df_validation

In [14]:
# Training-mode tokenisation: applied to every split, producing model inputs together
# with `start_positions` / `end_positions` labels.
tokenizer_dataset = CanineDatasetTokenizer(
    tokenizer, max_length, doc_stride, train=True, squad_v2=squad_v2, language="en"
)

# Drop all raw columns so only the tokenizer's outputs remain in the mapped datasets.
raw_columns = datasets["train"].column_names
tokenized_datasets = datasets.map(
    tokenizer_dataset.tokenize, batched=True, remove_columns=raw_columns
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
# Inspect the tokenised splits (feature names and row counts).
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions'],
        num_rows: 20
    })
})

In [16]:
# Evaluation-mode tokenisation of the validation split only: no answer labels, but an
# `example_id` column is kept alongside the model inputs.
# Note that this rebinds `tokenizer_dataset` from the train-mode instance to an
# eval-mode one.
tokenizer_dataset = CanineDatasetTokenizer(
    tokenizer, max_length, doc_stride, train=False, squad_v2=squad_v2, language="en"
)

validation_split = datasets["validation"]
validation_features = validation_split.map(
    tokenizer_dataset.tokenize,
    batched=True,
    remove_columns=validation_split.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
# Inspect the evaluation-mode validation features.
validation_features

Dataset({
    features: ['input_ids', 'attention_mask', 'token_type_ids', 'example_id'],
    num_rows: 20
})

In [18]:
# Batches are assembled with the library's default collator.
data_collator = default_data_collator

# The metric must match the dataset variant selected by `squad_v2`.
metric_name = "squad_v2" if squad_v2 else "squad"
metric = load_metric(metric_name)

In [19]:
# Load the pretrained CANINE encoder with a question-answering head. As the warning
# below reports, the `qa_outputs` weights are newly initialised and need fine-tuning.
model = CanineForQuestionAnswering.from_pretrained(pretrained_model_name)

Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Short model name without the hub organisation prefix: "google/canine-c" -> "canine-c".
model_name = pretrained_model_name.rsplit("/", 1)[-1]

# Single root directory for checkpoints and for the best-model file.
models_dir = "/mnt/hdd/dl_ensae/models"
# NOTE(review): the file name says "data2vec" although the model is CANINE — it looks
# like a leftover from another experiment; kept unchanged so existing paths still work.
best_model_path = os.path.join(
    models_dir, f"{model_name}-finetuned", "best_data2vec.pt"
)

trainer_args = TrainerArguments(
    # Model and evaluation tooling.
    model=model,
    metric=metric,
    data_collator=data_collator,
    # Optimisation schedule.
    learning_rate=5e-5,
    lr_scheduler="constant_with_warmup",
    warmup_ratio=0.1,
    weight_decay=0.001,
    epochs=1,
    early_stopping_patience=3,
    # Checkpointing / evaluation cadence.
    save_strategy="steps",
    save_steps=10,
    evaluation_strategy="steps",
    output_dir=models_dir,
    model_save_path=best_model_path,
    # NOTE(review): the device is hard-coded to CPU.
    device="cpu",
)

data_args = DataArguments(
    # Raw, train-mode tokenised and eval-mode tokenised views of the data.
    datasets=datasets,
    tokenized_datasets=tokenized_datasets,
    validation_features=validation_features,
    tokenizer=tokenizer,
    squad_v2=squad_v2,
    batch_size=4,
    # Answer post-processing settings.
    n_best_size=20,
    max_answer_length=256,
)

In [21]:
# Pick the trainer class that matches the checkpoint variant.
if model_name == "canine-c":
    trainer = CanineCTrainer(trainer_args, data_args)
elif model_name == "canine-s":
    # `CanineSTrainer` is not among the names imported at the top of the notebook, so
    # this branch used to fail with a NameError. Import it here, where it is needed.
    # NOTE(review): assumes `question_answering` exports `CanineSTrainer` next to
    # `CanineCTrainer` — confirm.
    from question_answering import CanineSTrainer

    trainer = CanineSTrainer(trainer_args, data_args)
else:
    # Fail loudly instead of silently treating any other checkpoint as CANINE-S.
    raise ValueError(
        f"Unsupported model {model_name!r}: expected 'canine-c' or 'canine-s'."
    )

In [22]:
# Fine-tune the model. With the arguments above, checkpoints are saved every 10 steps
# and the log below reports training and validation loss at those steps.
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 25


Step,Training Loss,Validation Loss
10,7.27,6.391757
20,6.1033,5.187766


***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
Saving model checkpoint to /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-10
Configuration saved in /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-10/config.json
Model weights saved in /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-10/pytorch_model.bin
tokenizer config file saved in /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-10/tokenizer_config.json
Special tokens file saved in /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-10/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4
Saving model checkpoint to /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-20
Configuration saved in /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-20/config.json
Model weights saved in /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-20/pytorch_model.bin
tokenizer config file saved in /mnt/hdd/dl_ensae/models/CANINE-C-finetuned/checkpoint-20

In [23]:
# Score the fine-tuned model on the validation features; `evaluate` returns the F1
# score and the exact-match score, in that order.
# NOTE(review): with `reduce_size` enabled this covers only 20 examples, so the
# numbers are a smoke test rather than a meaningful benchmark.
f1, exact_match = trainer.evaluate(mode="val")
print("Obtained F1-score: ", f1, "Obtained Exact Match: ", exact_match)

The following columns in the test set  don't have a corresponding argument in `CanineForQuestionAnswering.forward` and have been ignored: example_id. If example_id are not expected by `CanineForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 20
  Batch size = 4


  0%|          | 0/1 [00:00<?, ?ba/s]

Obtained F1-score:  40.0 Obtained Exact Match:  40.0
