**References**

[Tutorial: Working with Hugging Face Models and Datasets](https://github.com/anyuanay/medium/blob/main/src/working_huggingface/Working_with_HuggingFace_ch4_Fine_Tuning_Pretrained_Model_for_Question_Answering.ipynb)

[Fine-Tuning-for-Question-Answering-SQuAD-IndoQA](https://github.com/PrasetyoWidyantoro/Fine-Tuning-for-Question-Answering-SQuAD-IndoQA/blob/master/fine-tune-squad-dataset.ipynb)

# Import libraries

In [7]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.auto import tqdm
import evaluate
import numpy as np
import os
import json

# Global variables

In [8]:
# Seed passed to the dataset shuffle.
random_seed = 42

# Hub checkpoint to fine-tune, and the short tag used when naming result files.
model_name, model_short_name = (
    "distilbert-base-multilingual-cased",
    "distilbert-multilingual",
)

# (path, configuration) pair handed to `load_dataset`, and its short tag for result files.
dataset_name = ("xquad", "xquad.ro")
dataset_short_name = "xquad"

# Import model and tokenizer

In [9]:
# Both objects come from the same checkpoint so the vocabulary matches the weights.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Split the dataset into train, validation and test

In [10]:
# Only the "validation" split is used: it is re-split 80/10/10 into train/validation/test.
ds = load_dataset(dataset_name[0], dataset_name[1])
ds = ds.shuffle(seed=random_seed)

# `train_test_split` shuffles again by default, so it needs the seed too;
# otherwise the partitions differ on every kernel restart.
ds = ds['validation'].train_test_split(test_size=0.2, seed=random_seed)
ds_train = ds['train']

# Halve the held-out 20% into validation and test.
ds_held_out = ds['test'].train_test_split(test_size=0.5, seed=random_seed)
ds_val = ds_held_out['train']
ds_test = ds_held_out['test']

print(len(ds_train), len(ds_val), len(ds_test))

952 119 119


# Process the dataset

## Train dataset

In [11]:
def standardize(text: str):
    """Return `text` unchanged.

    Placeholder for text normalisation. The cedilla -> comma-below mapping below
    is intentionally switched off; it is kept for reference only:
        text.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")
    """
    normalized = text
    return normalized

def process_and_tokenize(dataset: Dataset, tokenizer: AutoTokenizer, max_length: int = 384, stride: int = 128) -> dict:
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Args:
        dataset: Batch with "question", "context" and "answers" columns; each
            answer holds parallel "text" and "answer_start" lists, of which only
            the first entry is used.
        tokenizer: Tokenizer called on the (question, context) pairs. It must
            support offset mappings and overflowing tokens.
        max_length: Length every feature is truncated/padded to.
        stride: Value forwarded as the tokenizer's ``stride`` for overflowing contexts.

    Returns:
        The tokenizer output with "start_positions" and "end_positions" added
        and the "overflow_to_sample_mapping" / "offset_mapping" entries removed.
        Features that do not contain the whole answer (or whose example has no
        answer) are labelled (0, 0).
    """
    # Extract from the dataset and standardize where possible
    questions = [standardize(q).strip() for q in dataset["question"]]
    contexts = [standardize(c) for c in dataset["context"]]
    answers = dataset["answers"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One example can produce several features; this maps feature -> example index.
    sample_mapping = inputs["overflow_to_sample_mapping"]
    offset_mapping = inputs["offset_mapping"]

    start_pos = []
    end_pos = []

    for i, ofs in enumerate(offset_mapping):
        answer = answers[sample_mapping[i]]

        # An example without any answer cannot be located in the context.
        if not answer["text"] or not answer["answer_start"]:
            start_pos.append(0)
            end_pos.append(0)
            continue

        # Character span of the (first) answer inside the context
        start = answer["answer_start"][0]
        end = start + len(answer["text"][0])

        # Token range occupied by the context (sequence id 1) in this feature
        seq_ids = inputs.sequence_ids(i)
        start_context = seq_ids.index(1)
        end_context = next(
            (j - 1 for j in range(start_context, len(seq_ids)) if seq_ids[j] != 1),
            len(seq_ids) - 1,
        )

        if ofs[start_context][0] > start or ofs[end_context][1] < end:
            # The answer is not fully inside this window
            start_pos.append(0)
            end_pos.append(0)
        else:
            # Last context token that starts at or before the answer start
            start_pos.append(next((j - 1 for j in range(start_context, end_context + 1) if ofs[j][0] > start), end_context))
            # First context token (scanning backwards) that ends at or after the answer end
            end_pos.append(next((j + 1 for j in range(end_context, start_context - 1, -1) if ofs[j][1] < end), start_context))

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos

    # Bookkeeping fields were only needed to build the labels.
    inputs.pop("overflow_to_sample_mapping")
    inputs.pop("offset_mapping")

    return inputs

In [12]:
# Tokenize the training split in batches of 64, dropping the raw columns afterwards.
ds_tok_train = ds_train.map(
    process_and_tokenize,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    batch_size=64,
    remove_columns=ds_train.column_names,
)

print(len(ds_tok_train))

Map:   0%|          | 0/952 [00:00<?, ? examples/s]

1082


## Validation and test datasets

In [13]:
def process_and_tokenize_val_test(dataset: Dataset, tokenizer: AutoTokenizer, max_length: int = 384, stride: int = 128) -> dict:
    """Tokenize a batch of question/context pairs for evaluation.

    Args:
        dataset: Batch with "id", "question" and "context" columns.
        tokenizer: Tokenizer called on the (question, context) pairs. It must
            support offset mappings and overflowing tokens.
        max_length: Length every feature is truncated/padded to.
        stride: Value forwarded as the tokenizer's ``stride`` for overflowing contexts.

    Returns:
        The tokenizer output with an added "example_id" entry (the id of the
        example each feature came from) and an "offset_mapping" in which every
        token outside the context is replaced by ``None``. The
        "overflow_to_sample_mapping" entry is removed.
    """
    # Extract from the dataset and standardize where possible
    questions = [standardize(q).strip() for q in dataset["question"]]
    contexts = [standardize(c) for c in dataset["context"]]
    # Read the id column once instead of once per feature.
    ids = dataset["id"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One example can produce several features; this maps feature -> example index.
    sample_mapping = inputs["overflow_to_sample_mapping"]
    example_ids = []

    for i, offsets in enumerate(inputs["offset_mapping"]):
        example_ids.append(ids[sample_mapping[i]])

        # Keep offsets only for context tokens (sequence id 1) so that question
        # and special tokens can be recognised and skipped later.
        seq_ids = inputs.sequence_ids(i)
        inputs["offset_mapping"][i] = [
            ofs if seq_ids[j] == 1 else None for j, ofs in enumerate(offsets)
        ]

    inputs["example_id"] = example_ids

    inputs.pop("overflow_to_sample_mapping")

    return inputs

In [14]:
# Same batched tokenization for both held-out splits.
ds_tok_val = ds_val.map(
    process_and_tokenize_val_test,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    batch_size=64,
    remove_columns=ds_val.column_names,
)
ds_tok_test = ds_test.map(
    process_and_tokenize_val_test,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    batch_size=64,
    remove_columns=ds_test.column_names,
)

print(len(ds_tok_val), len(ds_tok_test))

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

138 143


# Make metrics

In [15]:
# Load the three scorers that `compute_metrics` reports.
squad_metric, bleu_metric, rouge_metric = (
    evaluate.load(metric_id) for metric_id in ("squad", "bleu", "rouge")
)

In [16]:
def compute_metrics(start_logits: list, end_logits: list, tok: dict, dataset: Dataset, max_answer_length: int = 80) -> dict:
    """Turn span logits into answer strings and score them with SQuAD, BLEU and ROUGE.

    Args:
        start_logits: Per-feature start scores, indexed like ``tok``.
        end_logits: Per-feature end scores, indexed like ``tok``.
        tok: Tokenized features; each must expose "example_id" and an
            "offset_mapping" whose non-context entries are ``None``.
        dataset: Original examples with "id", "context" and "answers".
        max_answer_length: Largest allowed span, counted in tokens with both
            ends included. Non-integer values are floored.

    Returns:
        Dict holding the result of each metric under "squad", "bleu" and "rouge".
    """
    # Spans are integer token counts, so `span <= max_answer_length` is the
    # same as `span <= floor(max_answer_length)`.
    max_span = int(max_answer_length // 1)

    # Group feature indices by the example they were cut from
    features_by_example = {}
    for i, feature in enumerate(tok):
        features_by_example.setdefault(feature["example_id"], []).append(i)

    golds_squad = [{"id": data["id"], "answers": data["answers"]} for data in dataset]
    preds_squad = []

    golds_bleu = []
    preds_bleu = []

    for data in tqdm(dataset):
        # Best-scoring span seen so far; stays "" when no admissible span exists.
        best_text = ""
        best_score = None

        # An example without features simply keeps the empty prediction.
        for i in features_by_example.get(data["id"], []):
            start_logit = start_logits[i]
            end_logit = end_logits[i]
            offs = tok[i]["offset_mapping"]

            for start_i in range(len(start_logit)):
                if offs[start_i] is None:
                    continue

                # Only visit ends with start_i <= end_i and span <= max_span.
                last_end = min(start_i + max_span, len(end_logit))
                for end_i in range(start_i, last_end):
                    if offs[end_i] is None:
                        continue

                    score = start_logit[start_i] + end_logit[end_i]
                    # Strict '>' keeps the first of equally scored spans.
                    if best_score is None or score > best_score:
                        best_score = score
                        best_text = data["context"][offs[start_i][0] : offs[end_i][1]]

        preds_squad.append({"id": data['id'], "prediction_text": best_text})
        preds_bleu.append(best_text)

        # One list of reference answers per example. Unpacking them into
        # `append` raised TypeError unless there was exactly one answer.
        golds_bleu.append(list(data["answers"]['text']))

    return {
        "squad": squad_metric.compute(predictions=preds_squad, references=golds_squad),
        "bleu": bleu_metric.compute(predictions=preds_bleu, references=golds_bleu),
        "rouge": rouge_metric.compute(predictions=preds_bleu, references=golds_bleu)
    }

# Training, validation and testing loop over the hyperparameter grid

In [17]:
def output_metrics_to_file(metrics: dict, metric_type: str = None, lr: float = None, epoch: int = None):
    """Write `metrics` as indented JSON into the "results" directory.

    Args:
        metrics: JSON-serialisable metrics to store.
        metric_type: Label embedded in the file name (e.g. the evaluated split).
        lr: Learning rate embedded in the file name.
        epoch: Epoch count embedded in the file name, zero-padded to two digits.

    The detailed file name is used only when `metric_type`, `lr` and `epoch`
    are all given; otherwise the file is "<model>-<dataset>.json".
    """
    stem = f"{model_short_name}-{dataset_short_name}"
    if lr is not None and epoch is not None and metric_type is not None:
        stem += f"-type_{metric_type}-lr_{lr}-epoch_{epoch:02}"

    # Create the directory on demand so the helper works even if nothing else has made it yet.
    os.makedirs("results", exist_ok=True)

    filename = os.path.join("results", f"{stem}.json")
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4)
        
def save_model(model: AutoModelForQuestionAnswering, lr: float, epoch: int):
    """Save `model` under results/model/ in a folder named after the model, dataset, `lr` and `epoch`."""
    run_folder = f"{model_short_name}-{dataset_short_name}-{lr}-{epoch}"
    target_dir = os.path.join("results", "model", run_folder)
    model.save_pretrained(target_dir)
    
def load_model(lr: float, epoch: int) -> AutoModelForQuestionAnswering:
    """Load the model stored under results/model/ for the given `lr` and `epoch`."""
    run_folder = f"{model_short_name}-{dataset_short_name}-{lr}-{epoch}"
    source_dir = os.path.join("results", "model", run_folder)
    return AutoModelForQuestionAnswering.from_pretrained(source_dir)

## Maximum answer length: mean plus two standard deviations of the training-set answer lengths

In [18]:
# `compute_metrics` compares this limit against spans of token indices, so the
# answer lengths are measured in tokens (without special tokens), not characters.
answer_token_lengths = [
    len(tokenizer(a["text"][0], add_special_tokens=False)["input_ids"])
    for a in ds_train["answers"]
]

mean_answer_length = np.mean(answer_token_lengths)
std_dev_answer_length = np.std(answer_token_lengths)

two_std_devs_above = mean_answer_length + 2 * std_dev_answer_length

In [None]:
def tet_loop(lr_list: list, epochs_list: list) -> None:
    """Train, validate and test one model per (learning rate, epoch count) pair.

    Every configuration starts from a freshly loaded `model_name` checkpoint, so
    a run is not influenced by the runs before it.

    Args:
        lr_list: Learning rates to try.
        epochs_list: Numbers of training epochs to try.

    Side effects:
        For each configuration, writes one validation and one test metrics file
        and saves the trained model. After the grid is finished, writes all
        collected metrics to the default summary file of `output_metrics_to_file`.
    """
    # all_models_eval[split][lr][epochs] -> metrics of that run
    all_models_eval = {
        split: {lr: {epochs: [] for epochs in epochs_list} for lr in lr_list}
        for split in ("validation", "test")
    }

    # (split label, tokenized features, raw examples) evaluated after each training run
    eval_splits = (
        ("validation", ds_tok_val, ds_val),
        ("test", ds_tok_test, ds_test),
    )

    for lr in lr_list:
        for epochs in epochs_list:
            print(f"Training with lr={lr} and epochs={epochs}")
            args = TrainingArguments(
                output_dir="./results",
                eval_strategy="no",
                save_strategy="epoch",
                learning_rate=lr,
                num_train_epochs=epochs,
                weight_decay=0.01,
                per_device_train_batch_size=8,
                seed=random_seed,
            )
            trainer = Trainer(
                # `model_init` makes the Trainer build a new model for this run.
                # Passing the shared global `model` would keep fine-tuning the
                # weights left behind by the previous configuration.
                model_init=lambda: AutoModelForQuestionAnswering.from_pretrained(model_name),
                args=args,
                train_dataset=ds_tok_train,
                eval_dataset=ds_tok_val,
                processing_class=tokenizer
            )
            trainer.train()

            for split, tokenized, raw in eval_splits:
                preds, _, _ = trainer.predict(tokenized)
                start_logits, end_logits = preds
                computed_metrics = compute_metrics(start_logits, end_logits, tokenized, raw, two_std_devs_above)

                output_metrics_to_file(computed_metrics, metric_type=split, lr=lr, epoch=epochs)
                all_models_eval[split][lr][epochs] = computed_metrics

            # Save the model that was actually trained in this run.
            save_model(trainer.model, lr, epochs)

            print(f"Finished training, validating and testing with lr={lr} and epochs={epochs}")

    # Persist the whole grid in one file (default name: "<model>-<dataset>.json").
    output_metrics_to_file(all_models_eval)

In [None]:
# Run the full grid: every learning rate is combined with every epoch count.
tet_loop(lr_list=[1e-5, 1e-4, 1e-3], epochs_list=[2, 4, 8, 16])