**References**

[Tutorial: Working with Hugging Face Models and Datasets](https://github.com/anyuanay/medium/blob/main/src/working_huggingface/Working_with_HuggingFace_ch4_Fine_Tuning_Pretrained_Model_for_Question_Answering.ipynb)

[Fine-Tuning-for-Question-Answering-SQuAD-IndoQA](https://github.com/PrasetyoWidyantoro/Fine-Tuning-for-Question-Answering-SQuAD-IndoQA/blob/master/fine-tune-squad-dataset.ipynb)

# Import libraries

In [1]:
%pip install evaluate sentence-transformers

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.auto import tqdm
import evaluate
import numpy as np
import os
import json
from sentence_transformers import SentenceTransformer, util

# Global variables

In [3]:
random_seed = 42

kaggle = True

model_name = "distilbert-base-multilingual-cased"
model_short_name = "distilbert-multilingual"

dataset_name = ("xquad", "xquad.ro")
dataset_short_name = "xquad"

# Import model and tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



# Split the dataset into train, validation and test

In [5]:
ds = load_dataset(dataset_name[0], dataset_name[1])

ds = ds['validation'].train_test_split(test_size=0.2)
ds_train = ds['train']
ds_test = ds['test']

ds_test = ds_test.train_test_split(test_size=0.5)
ds_val = ds_test['train']
ds_test = ds_test['test']

print(len(ds_train), len(ds_val), len(ds_test))

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/244k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

952 119 119


# Process the dataset

## Train dataset

In [6]:
def standardize(text: str):
    return text
    # return text.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")

def process_and_tokenize(dataset: Dataset, tokenizer: AutoTokenizer) -> dict:
    # Extract from the dataset and standardize where possible
    questions = [standardize(q).strip() for q in dataset["question"]]
    contexts = [standardize(c) for c in dataset["context"]]
    answers = dataset["answers"]
    
    
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = inputs["overflow_to_sample_mapping"]
    offset_mapping = inputs["offset_mapping"]
    
    start_pos = []
    end_pos = []

    for i, ofs in enumerate(offset_mapping):
        # Get the sample
        sample_i = sample_mapping[i]
        print(answers[sample_i]["text"])
        
        # Get the answer for the sample
        max_i = 0
        max_len = len(answers[sample_i]["text"][max_i])
        for i in range(1, len(answers[sample_i]['text'])):
            if len(answer["text"][i]) > max_len:
                max_len = len(answers[sample_i]["text"][i])
                max_i = i
        answer = contexts[sample_i].index(answers[sample_i]["text"][max_i])

        
        # Get the start and end character of the answer
        start = answer
        end = answer + len(answers[sample_i]["text"][0])
        
        # Get the sequence ids
        seq_ids = inputs.sequence_ids(i)

        # Get the start and end token positions
        start_context = seq_ids.index(1)
        end_context = next(j - 1 for j in range(start_context, len(seq_ids)) if seq_ids[j] != 1)

        if ofs[start_context][0] > start or ofs[end_context][1] < end: # If it's impossible
            start_pos.append(0)
            end_pos.append(0)
        else: # Get and append start and end position
            start_pos.append(next((j - 1 for j in range(start_context, end_context + 1) if ofs[j][0] > start), end_context))
            end_pos.append(next((j + 1 for j in range(end_context, start_context - 1, -1) if ofs[j][1] < end), start_context))
            
            # if i < len(dataset['id']) and dataset['id'][i] == '56d6f3500d65d21400198292':
            #     print(start_context, end_context, end_pos[-1], end_pos[-1])

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos
        
    inputs.pop("overflow_to_sample_mapping")
    inputs.pop("offset_mapping")
    
    return inputs

In [7]:
ds_tok_train = ds_train.map(lambda x: process_and_tokenize(dataset=x, tokenizer=tokenizer), batched=True, batch_size=64, remove_columns=ds_train.column_names)
 
print(len(ds_tok_train))

Map:   0%|          | 0/952 [00:00<?, ? examples/s]

['2020']
['1.160.000']
['două']
['două']
['20–18']
['Sud-Vestul Fresno']
['Turnul Magdalen']
['adăpost, asistență educațională, clinici medicale gratuite sau la tarife reduse, asistență legată de locuințe']
['tripartită']
['tripartită']
['unsprezece']
['1,1 × 1011']
['Rinul Mijlociu']
['nouă']
['1986']
['Regele Franței']
['1950']
['America de Nord']
['Lexus']
['Kawann Short']
['Kawann Short']
['traiectoria balistică']
['traiectoria balistică']
['ecartamentul de 1.600 mm (5 ft 3 in)']
['Curții Supreme a Statelor Unite']
['1970']
['1970']
['dovezi']
['pentru viziuni luterane']
['peste jumătate']
['imunoglobulinele și receptorii de celule T']
['iulie']
['An Unearthly Child']
['de jos']
['șapte']
['We Love TV']
['We Love TV']
['Comisie și Consiliu']
['Comisie și Consiliu']
['Comisie și Consiliu']
['Comisie și Consiliu']
['cameră de combustie']
['mai mult de jumătate']
['Gottfried Semper']
['Grupul Operativ al Metodiștilor Uniți privind Avortul și Sexualitatea']
['reducerea la jumătate a să

## Validation and test datasets

In [8]:
def process_and_tokenize_val_test(dataset: Dataset, tokenizer: AutoTokenizer) -> dict:
    # Extract from the dataset and standardize where possible
    questions = [standardize(q).strip() for q in dataset["question"]]
    contexts = [standardize(c) for c in dataset["context"]]
    
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = inputs["overflow_to_sample_mapping"]
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_i = sample_mapping[i]
        example_ids.append(dataset["id"][sample_i])

        seq_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        
        for j, ofs in enumerate(offset):
            inputs['offset_mapping'][i][j] = ofs if seq_ids[j] == 1 else None

    inputs["example_id"] = example_ids
    
    inputs.pop("overflow_to_sample_mapping")
    
    return inputs

In [9]:
ds_tok_val = ds_val.map(lambda x: process_and_tokenize_val_test(dataset=x, tokenizer=tokenizer), batched=True, batch_size=64, remove_columns=ds_val.column_names)
ds_tok_test = ds_test.map(lambda x: process_and_tokenize_val_test(dataset=x, tokenizer=tokenizer), batched=True, batch_size=64, remove_columns=ds_test.column_names)
 
print(len(ds_tok_val), len(ds_tok_test))

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

133 133


# Make metrics

In [10]:
%pip install rouge_score

  pid, fd = os.forkpty()


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=16fc2bc46ecf4882c8c6c7c6ea870af6e224bb1c52113d1e80f1837006ab26cc
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [11]:
squad_metric = evaluate.load("squad")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [12]:
def compute_metrics(start_logits: list, end_logits: list, tok: dict, dataset: Dataset, max_answer_length: int = 30, eval_answer: SentenceTransformer = SentenceTransformer("BlackKakapo/stsb-xlm-r-multilingual-ro")) -> dict:
    extracted_features = {}
    
    # Group extracted features by ids
    for i, feature in enumerate(tok):
        if feature["example_id"] not in extracted_features:
            extracted_features[feature["example_id"]] = [i]
        else:
            extracted_features[feature["example_id"]].append(i)
    
    golds_squad = [{"id": data["id"], "answers": data["answers"]} for data in dataset]
    preds_squad = []
    
    golds_bleu = []
    preds_bleu = []
    
    for data in tqdm(dataset):
        answers = []

        # Iterate over all the extracted features
        for i in extracted_features[data["id"]]:
            start_logit = start_logits[i]
            end_logit = end_logits[i]
            offs = tok[i]["offset_mapping"]

            # Get all combinations of start and end positions
            for start_i in range(len(start_logit)):
                for end_i in range(len(end_logit)):
                    # Continue on wrong answers
                    if offs[start_i] is None \
                        or offs[end_i] is None \
                        or end_i < start_i \
                        or end_i - start_i + 1 > max_answer_length:
                        continue
                    
                    # Add text and score
                    answer = {
                        "answer": data["context"][offs[start_i][0] : offs[end_i][1]],
                        "score": start_logit[start_i] + end_logit[end_i]
                    }
                    answers.append(answer)
                    
        preds_squad.append(
            {"id": data['id'], "prediction_text": max(answers, key=lambda x: x["score"])['answer']} 
                if len(answers) > 0 else 
                {"id": data['id'], "prediction_text": ""}
        )
        
        preds_bleu.append(
            max(answers, key=lambda x: x["score"])['answer']
                if len(answers) > 0 else 
                ""
        )
        
        max_i = 0
        max_len = len(data["answers"]["text"][max_i])
        for i in range(1, len(data["answers"]['text'])):
            if len(data["answers"]["text"][i]) > max_len:
                max_len = len(data["answers"]["text"][i])
                max_i = i
        
        golds_bleu.append(data["answers"]['text'][max_i])
        
    return {
        "squad": squad_metric.compute(predictions=preds_squad, references=golds_squad),
        "bleu": bleu_metric.compute(predictions=preds_bleu, references=golds_bleu),
        "rouge": rouge_metric.compute(predictions=preds_bleu, references=golds_bleu),
        "semantic_similarity": util.pytorch_cos_sim(eval_answer.encode(preds_bleu, convert_to_tensor=True), eval_answer.encode(golds_bleu, convert_to_tensor=True)).mean().item() * 100
    }

# Training, evaluation and testing loop for hyperparameters

In [13]:
def output_metrics_to_file(metrics: dict, metric_type: str = None, lr: float = None, epoch: int = None):
    filename = os.path.join("/kaggle/working/" if kaggle else ".", "results", f"{model_short_name}-{dataset_short_name}-type_{metric_type}-lr_{lr}-epoch_{epoch:02}.json" if lr is not None and epoch is not None and metric_type is not None else f"{model_short_name}-{dataset_short_name}.json")
    with open(filename, "w") as f:
        json.dump(metrics, f, indent=4)
        
def save_model(trainer: Trainer, lr: float, epoch: int):
    trainer.save_model(os.path.join("/kaggle/working/" if kaggle else ".", "results", "models", f"{model_short_name}-{dataset_short_name}-{lr}-{epoch}"))
    
def load_model(lr: float, epoch: int) -> AutoModelForQuestionAnswering:
    return AutoModelForQuestionAnswering.from_pretrained(os.path.join("/kaggle/working/" if kaggle else ".", "results", "models", f"{model_short_name}-{dataset_short_name}-{lr}-{epoch}"))

## Get maximum answer length within 2 standard deviations from the mean of the training dataset

In [14]:
# mean_answer_length = np.mean([len(a["text"][0]) for a in ds_train["answers"]])
# std_dev_answer_length = np.std([len(a["text"][0]) for a in ds_train["answers"]])

# two_std_devs_above = mean_answer_length + 2 * std_dev_answer_length

In [15]:
all_models_eval = None

def tet_loop(lr_list: list, epochs: int, batch_size: int) -> None:
    global all_models_eval
    all_models_eval = {"validation": {lr:{epoch:[] for epoch in range(1, epochs + 1)} for lr in lr_list}, "test": {lr:{epoch:[] for epoch in range(1, epochs + 1)} for lr in lr_list}}
    
    eval_answer = SentenceTransformer("BlackKakapo/stsb-xlm-r-multilingual-ro")
    
    for lr in lr_list:
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        warmup_steps = int(len(ds_tok_train) / batch_size * 1 / 10)
        args = TrainingArguments(
                output_dir="./results",
                eval_strategy="no",
                save_strategy="epoch",
                learning_rate=lr,
                num_train_epochs=1,
                weight_decay=0.01,
                per_device_train_batch_size=batch_size, 
                report_to="none",
                save_total_limit=1,
                warmup_steps=warmup_steps
            )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds_tok_train,
            eval_dataset=ds_tok_val,
            tokenizer=tokenizer
        )
        
        for epoch in range(1, epochs + 1):
            print(f"Training with lr={lr} and epoch={epoch}")
            
            trainer.train()
            
            preds = trainer.predict(ds_tok_val)
            start_logits, end_logits = preds[0][0], preds[0][1]
            computed_metrics = compute_metrics(start_logits, end_logits, ds_tok_val, ds_val, 30, eval_answer)
            
            # output_metrics_to_file(computed_metrics, metric_type='validation', lr=lr, epoch=epoch)
            print(f"Validation for lr {lr} epoch {epoch}: ")
            # print(computed_metrics)
            print(computed_metrics['squad'])
            print(computed_metrics['semantic_similarity'])
            
            all_models_eval['validation'][lr][epoch] = computed_metrics
            
            preds = trainer.predict(ds_tok_test)
            start_logits2, end_logits2 = preds[0][0], preds[0][1]
            computed_metrics = compute_metrics(start_logits2, end_logits2, ds_tok_test, ds_test, 30, eval_answer)
            
            
            # output_metrics_to_file(computed_metrics, metric_type='test', lr=lr, epoch=epoch)
            all_models_eval['test'][lr][epoch] = computed_metrics
            

            print(f"Test for lr {lr} epoch {epoch}: ")
            print(computed_metrics['squad'])
            print(computed_metrics['semantic_similarity'])
            
            # save_model(trainer, lr, epoch)
            
        try:
            del trainer
            del model
            # device = cuda.get_current_device()
            # device.reset()
        except Exception as e:
            print(e)
            
            print(f"Finished training, validating and testing with lr={lr} and epochs={epochs}")

In [16]:
tet_loop(lr_list=[1e-4, 1e-3, 1e-5], epochs=16, batch_size=16)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with lr=0.0001 and epoch=1


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 1: 
{'exact_match': 10.92436974789916, 'f1': 13.943138158412502}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 1: 
{'exact_match': 15.126050420168067, 'f1': 22.646273795986307}
Training with lr=0.0001 and epoch=2


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 2: 
{'exact_match': 21.008403361344538, 'f1': 31.86970592432777}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 2: 
{'exact_match': 21.84873949579832, 'f1': 34.892763632259424}
Training with lr=0.0001 and epoch=3


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 3: 
{'exact_match': 19.327731092436974, 'f1': 31.74855379521456}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 3: 
{'exact_match': 21.84873949579832, 'f1': 33.28839461192401}
Training with lr=0.0001 and epoch=4


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 4: 
{'exact_match': 21.008403361344538, 'f1': 34.87190570007907}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 4: 
{'exact_match': 26.050420168067227, 'f1': 38.10383593997039}
Training with lr=0.0001 and epoch=5


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 5: 
{'exact_match': 20.168067226890756, 'f1': 34.28700507316163}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 5: 
{'exact_match': 24.369747899159663, 'f1': 35.82221302068064}
Training with lr=0.0001 and epoch=6


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 6: 
{'exact_match': 18.48739495798319, 'f1': 30.54283141302159}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 6: 
{'exact_match': 20.168067226890756, 'f1': 31.79310487392544}
Training with lr=0.0001 and epoch=7


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 7: 
{'exact_match': 20.168067226890756, 'f1': 30.94730089312132}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 7: 
{'exact_match': 19.327731092436974, 'f1': 29.110591309058922}
Training with lr=0.0001 and epoch=8


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 8: 
{'exact_match': 15.126050420168067, 'f1': 27.736749711539627}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 8: 
{'exact_match': 20.168067226890756, 'f1': 32.11933326738072}
Training with lr=0.0001 and epoch=9


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 9: 
{'exact_match': 19.327731092436974, 'f1': 28.764911068095497}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 9: 
{'exact_match': 19.327731092436974, 'f1': 30.332320779676184}
Training with lr=0.0001 and epoch=10


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 10: 
{'exact_match': 17.647058823529413, 'f1': 28.148707096628367}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 10: 
{'exact_match': 20.168067226890756, 'f1': 28.487248741395014}
Training with lr=0.0001 and epoch=11


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 11: 
{'exact_match': 23.529411764705884, 'f1': 34.534230941793965}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 11: 
{'exact_match': 22.689075630252102, 'f1': 32.9592156658166}
Training with lr=0.0001 and epoch=12


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 12: 
{'exact_match': 20.168067226890756, 'f1': 30.40828219399647}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 12: 
{'exact_match': 20.168067226890756, 'f1': 30.287221048691002}
Training with lr=0.0001 and epoch=13


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 13: 
{'exact_match': 19.327731092436974, 'f1': 32.66512380497603}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 13: 
{'exact_match': 23.529411764705884, 'f1': 33.39814550103127}
Training with lr=0.0001 and epoch=14


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 14: 
{'exact_match': 18.48739495798319, 'f1': 28.612407421562654}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 14: 
{'exact_match': 25.210084033613445, 'f1': 33.09062581140579}
Training with lr=0.0001 and epoch=15


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 15: 
{'exact_match': 17.647058823529413, 'f1': 29.92431680069893}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 15: 
{'exact_match': 26.050420168067227, 'f1': 35.93538142927066}
Training with lr=0.0001 and epoch=16


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 16: 
{'exact_match': 19.327731092436974, 'f1': 30.35522695947546}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 16: 
{'exact_match': 25.210084033613445, 'f1': 34.4192411736346}
Training with lr=0.0001 and epoch=17


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 17: 
{'exact_match': 20.168067226890756, 'f1': 30.38634776693157}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 17: 
{'exact_match': 27.73109243697479, 'f1': 36.553829274609264}
Training with lr=0.0001 and epoch=18


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 18: 
{'exact_match': 21.008403361344538, 'f1': 31.171398629381812}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 18: 
{'exact_match': 25.210084033613445, 'f1': 36.44245838592745}
Training with lr=0.0001 and epoch=19


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 19: 
{'exact_match': 18.48739495798319, 'f1': 29.83392424568894}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 19: 
{'exact_match': 22.689075630252102, 'f1': 33.8403739486972}
Training with lr=0.0001 and epoch=20


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 20: 
{'exact_match': 22.689075630252102, 'f1': 35.09863619107316}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 20: 
{'exact_match': 23.529411764705884, 'f1': 34.48886946372757}
Training with lr=0.0001 and epoch=21


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 21: 
{'exact_match': 21.84873949579832, 'f1': 33.54101314185347}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 21: 
{'exact_match': 23.529411764705884, 'f1': 33.71786340601859}
Training with lr=0.0001 and epoch=22


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 22: 
{'exact_match': 20.168067226890756, 'f1': 31.05545481595901}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 22: 
{'exact_match': 23.529411764705884, 'f1': 34.47150713114812}
Training with lr=0.0001 and epoch=23


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 23: 
{'exact_match': 22.689075630252102, 'f1': 34.36497415365433}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 23: 
{'exact_match': 23.529411764705884, 'f1': 34.14139016731883}
Training with lr=0.0001 and epoch=24


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 24: 
{'exact_match': 20.168067226890756, 'f1': 32.78494356888872}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 24: 
{'exact_match': 26.050420168067227, 'f1': 36.94129702737607}
Training with lr=0.0001 and epoch=25


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 25: 
{'exact_match': 22.689075630252102, 'f1': 33.18266390898229}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 25: 
{'exact_match': 23.529411764705884, 'f1': 33.542548820825495}
Training with lr=0.0001 and epoch=26


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 26: 
{'exact_match': 20.168067226890756, 'f1': 31.569194111210923}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 26: 
{'exact_match': 27.73109243697479, 'f1': 36.64124983969458}
Training with lr=0.0001 and epoch=27


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 27: 
{'exact_match': 20.168067226890756, 'f1': 30.890672960748667}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 27: 
{'exact_match': 25.210084033613445, 'f1': 33.773537670788244}
Training with lr=0.0001 and epoch=28


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 28: 
{'exact_match': 18.48739495798319, 'f1': 27.48397469649129}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 28: 
{'exact_match': 27.73109243697479, 'f1': 37.91224347533686}
Training with lr=0.0001 and epoch=29


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 29: 
{'exact_match': 20.168067226890756, 'f1': 29.366995941390403}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 29: 
{'exact_match': 21.84873949579832, 'f1': 31.083118845915642}
Training with lr=0.0001 and epoch=30


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 30: 
{'exact_match': 22.689075630252102, 'f1': 30.864889843084573}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 30: 
{'exact_match': 24.369747899159663, 'f1': 33.03417997212051}
Training with lr=0.0001 and epoch=31


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 31: 
{'exact_match': 18.48739495798319, 'f1': 29.69667476854562}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 31: 
{'exact_match': 30.252100840336134, 'f1': 38.59707179551654}
Training with lr=0.0001 and epoch=32


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 32: 
{'exact_match': 21.84873949579832, 'f1': 30.693843971154887}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 32: 
{'exact_match': 24.369747899159663, 'f1': 33.71635395814235}


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with lr=0.001 and epoch=1


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 1: 
{'exact_match': 0.0, 'f1': 0.4128924296991524}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 1: 
{'exact_match': 0.0, 'f1': 2.856081826670062}
Training with lr=0.001 and epoch=2


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 2: 
{'exact_match': 0.0, 'f1': 3.0762864586394}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 2: 
{'exact_match': 0.0, 'f1': 3.5464356278896756}
Training with lr=0.001 and epoch=3


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 3: 
{'exact_match': 1.680672268907563, 'f1': 2.894491129785247}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 3: 
{'exact_match': 1.680672268907563, 'f1': 2.436974789915966}
Training with lr=0.001 and epoch=4


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 4: 
{'exact_match': 1.680672268907563, 'f1': 2.6687193095861823}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 4: 
{'exact_match': 0.0, 'f1': 1.4565826330532214}
Training with lr=0.001 and epoch=5


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 5: 
{'exact_match': 0.0, 'f1': 0.33613445378151263}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 5: 
{'exact_match': 0.8403361344537815, 'f1': 1.7539323421676358}
Training with lr=0.001 and epoch=6


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 6: 
{'exact_match': 0.0, 'f1': 2.2278125565322795}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 6: 
{'exact_match': 0.0, 'f1': 2.2888352927898157}
Training with lr=0.001 and epoch=7


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 7: 
{'exact_match': 0.0, 'f1': 0.6544925662572721}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 7: 
{'exact_match': 0.8403361344537815, 'f1': 1.5580974864862331}
Training with lr=0.001 and epoch=8


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 8: 
{'exact_match': 0.0, 'f1': 0.8490668994870675}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 8: 
{'exact_match': 0.0, 'f1': 0.642633865140259}
Training with lr=0.001 and epoch=9


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 9: 
{'exact_match': 0.8403361344537815, 'f1': 1.2329142183189066}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 9: 
{'exact_match': 0.0, 'f1': 1.4607073598670237}
Training with lr=0.001 and epoch=10


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 10: 
{'exact_match': 0.0, 'f1': 0.24132730015082954}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 10: 
{'exact_match': 0.0, 'f1': 2.1594092182327476}
Training with lr=0.001 and epoch=11


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 11: 
{'exact_match': 0.0, 'f1': 1.3556991424020588}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 11: 
{'exact_match': 0.0, 'f1': 0.866437484084543}
Training with lr=0.001 and epoch=12


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 12: 
{'exact_match': 0.8403361344537815, 'f1': 1.2905162064825932}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 12: 
{'exact_match': 0.0, 'f1': 0.8793517406962783}
Training with lr=0.001 and epoch=13


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 13: 
{'exact_match': 0.0, 'f1': 1.027077497665733}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 13: 
{'exact_match': 0.0, 'f1': 0.5602240896358542}
Training with lr=0.001 and epoch=14


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 14: 
{'exact_match': 0.0, 'f1': 1.5177345153376312}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 14: 
{'exact_match': 0.0, 'f1': 1.424554977186556}
Training with lr=0.001 and epoch=15


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 15: 
{'exact_match': 0.0, 'f1': 2.0031803667363155}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 15: 
{'exact_match': 0.0, 'f1': 2.7797965163886356}
Training with lr=0.001 and epoch=16


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 16: 
{'exact_match': 0.0, 'f1': 1.5350016326289189}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 16: 
{'exact_match': 0.0, 'f1': 1.5771638062659734}
Training with lr=0.001 and epoch=17


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 17: 
{'exact_match': 0.0, 'f1': 2.077050338240021}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 17: 
{'exact_match': 0.0, 'f1': 2.9024452692954577}
Training with lr=0.001 and epoch=18


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 18: 
{'exact_match': 0.0, 'f1': 2.344802170653564}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 18: 
{'exact_match': 0.0, 'f1': 3.676791681981993}
Training with lr=0.001 and epoch=19


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 19: 
{'exact_match': 0.0, 'f1': 1.6475677619887625}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 19: 
{'exact_match': 0.0, 'f1': 2.389131091032904}
Training with lr=0.001 and epoch=20


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 20: 
{'exact_match': 0.0, 'f1': 2.5710586839019034}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 20: 
{'exact_match': 0.0, 'f1': 3.4091452052915803}
Training with lr=0.001 and epoch=21


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 21: 
{'exact_match': 0.0, 'f1': 1.016888414057808}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 21: 
{'exact_match': 0.0, 'f1': 1.511480190162189}
Training with lr=0.001 and epoch=22


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 22: 
{'exact_match': 0.0, 'f1': 2.728557601647968}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 22: 
{'exact_match': 0.0, 'f1': 2.2048637803865727}
Training with lr=0.001 and epoch=23


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 23: 
{'exact_match': 0.0, 'f1': 0.9318032963536045}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 23: 
{'exact_match': 0.0, 'f1': 1.8748984074061474}
Training with lr=0.001 and epoch=24


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 24: 
{'exact_match': 0.0, 'f1': 2.239408583946399}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 24: 
{'exact_match': 0.0, 'f1': 4.368217165352172}
Training with lr=0.001 and epoch=25


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 25: 
{'exact_match': 0.0, 'f1': 4.2393845650148165}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 25: 
{'exact_match': 0.0, 'f1': 3.6540851324642087}
Training with lr=0.001 and epoch=26


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 26: 
{'exact_match': 0.0, 'f1': 3.0160430738649646}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 26: 
{'exact_match': 0.0, 'f1': 3.856530250268391}
Training with lr=0.001 and epoch=27


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 27: 
{'exact_match': 0.0, 'f1': 4.861762468235399}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 27: 
{'exact_match': 0.0, 'f1': 2.9543674813461287}
Training with lr=0.001 and epoch=28


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 28: 
{'exact_match': 0.0, 'f1': 3.1096608683959417}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 28: 
{'exact_match': 0.0, 'f1': 5.2145027040960645}
Training with lr=0.001 and epoch=29


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 29: 
{'exact_match': 0.0, 'f1': 4.5975299586013225}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 29: 
{'exact_match': 0.0, 'f1': 6.509183994048146}
Training with lr=0.001 and epoch=30


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 30: 
{'exact_match': 0.0, 'f1': 3.0273894348082373}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 30: 
{'exact_match': 0.0, 'f1': 5.08310548818393}
Training with lr=0.001 and epoch=31


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 31: 
{'exact_match': 0.0, 'f1': 3.2050763109242766}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 31: 
{'exact_match': 0.8403361344537815, 'f1': 5.158323954405856}
Training with lr=0.001 and epoch=32


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 32: 
{'exact_match': 0.0, 'f1': 2.9130442195582105}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 0.001 epoch 32: 
{'exact_match': 0.0, 'f1': 4.984082271054872}


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with lr=1e-05 and epoch=1


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 1: 
{'exact_match': 3.361344537815126, 'f1': 9.035478796021458}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 1: 
{'exact_match': 0.8403361344537815, 'f1': 10.312913740541216}
Training with lr=1e-05 and epoch=2


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 2: 
{'exact_match': 8.403361344537815, 'f1': 14.007707978296212}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 2: 
{'exact_match': 3.361344537815126, 'f1': 6.821323935783397}
Training with lr=1e-05 and epoch=3


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 3: 
{'exact_match': 8.403361344537815, 'f1': 13.053839251318243}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 3: 
{'exact_match': 7.563025210084033, 'f1': 12.572711303231372}
Training with lr=1e-05 and epoch=4


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 4: 
{'exact_match': 11.764705882352942, 'f1': 17.662376460330425}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 4: 
{'exact_match': 6.722689075630252, 'f1': 15.649635139657775}
Training with lr=1e-05 and epoch=5


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 5: 
{'exact_match': 13.445378151260504, 'f1': 20.17736536187395}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 5: 
{'exact_match': 8.403361344537815, 'f1': 18.877831850371326}
Training with lr=1e-05 and epoch=6


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 6: 
{'exact_match': 15.966386554621849, 'f1': 24.192317952822155}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 6: 
{'exact_match': 9.243697478991596, 'f1': 20.541405776673873}
Training with lr=1e-05 and epoch=7


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 7: 
{'exact_match': 16.80672268907563, 'f1': 27.073307178349197}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 7: 
{'exact_match': 10.084033613445378, 'f1': 23.9202724748683}
Training with lr=1e-05 and epoch=8


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 8: 
{'exact_match': 15.126050420168067, 'f1': 26.621469100460704}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 8: 
{'exact_match': 14.285714285714286, 'f1': 25.086945057286133}
Training with lr=1e-05 and epoch=9


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 9: 
{'exact_match': 14.285714285714286, 'f1': 26.214518441409197}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 9: 
{'exact_match': 14.285714285714286, 'f1': 26.484551062895516}
Training with lr=1e-05 and epoch=10


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 10: 
{'exact_match': 14.285714285714286, 'f1': 25.882385588267937}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 10: 
{'exact_match': 17.647058823529413, 'f1': 26.987555953916484}
Training with lr=1e-05 and epoch=11


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 11: 
{'exact_match': 14.285714285714286, 'f1': 26.089419450763987}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 11: 
{'exact_match': 16.80672268907563, 'f1': 25.955852929407055}
Training with lr=1e-05 and epoch=12


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 12: 
{'exact_match': 14.285714285714286, 'f1': 25.040333149576846}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 12: 
{'exact_match': 15.966386554621849, 'f1': 25.51634276001422}
Training with lr=1e-05 and epoch=13


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 13: 
{'exact_match': 18.48739495798319, 'f1': 27.536876261542595}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 13: 
{'exact_match': 14.285714285714286, 'f1': 23.947600430546544}
Training with lr=1e-05 and epoch=14


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 14: 
{'exact_match': 17.647058823529413, 'f1': 27.37649838490174}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 14: 
{'exact_match': 17.647058823529413, 'f1': 26.988932828033164}
Training with lr=1e-05 and epoch=15


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 15: 
{'exact_match': 15.966386554621849, 'f1': 25.632528069502857}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 15: 
{'exact_match': 20.168067226890756, 'f1': 29.6963832421289}
Training with lr=1e-05 and epoch=16


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 16: 
{'exact_match': 17.647058823529413, 'f1': 28.017421420782753}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 16: 
{'exact_match': 19.327731092436974, 'f1': 28.34080691099969}
Training with lr=1e-05 and epoch=17


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 17: 
{'exact_match': 17.647058823529413, 'f1': 27.68544247535844}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 17: 
{'exact_match': 18.48739495798319, 'f1': 27.08530471011934}
Training with lr=1e-05 and epoch=18


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 18: 
{'exact_match': 13.445378151260504, 'f1': 23.254285397142546}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 18: 
{'exact_match': 15.126050420168067, 'f1': 23.97072554512}
Training with lr=1e-05 and epoch=19


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 19: 
{'exact_match': 17.647058823529413, 'f1': 28.380769557240143}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 19: 
{'exact_match': 19.327731092436974, 'f1': 28.200584581658752}
Training with lr=1e-05 and epoch=20


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 20: 
{'exact_match': 16.80672268907563, 'f1': 27.01165780997714}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 20: 
{'exact_match': 17.647058823529413, 'f1': 26.855689644278762}
Training with lr=1e-05 and epoch=21


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 21: 
{'exact_match': 17.647058823529413, 'f1': 27.971503286629332}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 21: 
{'exact_match': 16.80672268907563, 'f1': 25.494373187871634}
Training with lr=1e-05 and epoch=22


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 22: 
{'exact_match': 17.647058823529413, 'f1': 27.65383775887977}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 22: 
{'exact_match': 16.80672268907563, 'f1': 27.414246400314514}
Training with lr=1e-05 and epoch=23


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 23: 
{'exact_match': 17.647058823529413, 'f1': 27.736675089616266}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 23: 
{'exact_match': 16.80672268907563, 'f1': 27.48894294559929}
Training with lr=1e-05 and epoch=24


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 24: 
{'exact_match': 15.966386554621849, 'f1': 26.584563895488262}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 24: 
{'exact_match': 19.327731092436974, 'f1': 28.803959148158743}
Training with lr=1e-05 and epoch=25


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 25: 
{'exact_match': 16.80672268907563, 'f1': 28.38186902443639}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 25: 
{'exact_match': 18.48739495798319, 'f1': 29.23624530025914}
Training with lr=1e-05 and epoch=26


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 26: 
{'exact_match': 16.80672268907563, 'f1': 27.5417929409526}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 26: 
{'exact_match': 18.48739495798319, 'f1': 28.089786716825767}
Training with lr=1e-05 and epoch=27


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 27: 
{'exact_match': 17.647058823529413, 'f1': 28.007312482102392}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 27: 
{'exact_match': 17.647058823529413, 'f1': 27.66814679648981}
Training with lr=1e-05 and epoch=28


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 28: 
{'exact_match': 19.327731092436974, 'f1': 28.676974286217973}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 28: 
{'exact_match': 16.80672268907563, 'f1': 28.014800609530184}
Training with lr=1e-05 and epoch=29


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 29: 
{'exact_match': 17.647058823529413, 'f1': 28.659932102412537}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 29: 
{'exact_match': 16.80672268907563, 'f1': 27.489257058776545}
Training with lr=1e-05 and epoch=30


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 30: 
{'exact_match': 17.647058823529413, 'f1': 26.945400537837504}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 30: 
{'exact_match': 14.285714285714286, 'f1': 25.175270858235724}
Training with lr=1e-05 and epoch=31


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 31: 
{'exact_match': 20.168067226890756, 'f1': 30.440814618589176}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 31: 
{'exact_match': 19.327731092436974, 'f1': 29.800514895244472}
Training with lr=1e-05 and epoch=32


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 32: 
{'exact_match': 17.647058823529413, 'f1': 28.317590439439176}


  0%|          | 0/119 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 32: 
{'exact_match': 17.647058823529413, 'f1': 28.303681712210494}


In [17]:
output_metrics_to_file(all_models_eval)

In [18]:
!zip results_e4_2.zip results/*.json

  adding: results/distilbert-multilingual-xquad.json (deflated 86%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_01.json (deflated 52%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_02.json (deflated 53%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_03.json (deflated 52%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_04.json (deflated 52%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_05.json (deflated 52%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_06.json (deflated 53%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_07.json (deflated 52%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_08.json (deflated 52%)
  adding: results/distilbert-multilingual-xquad-type_test-lr_0.0001-epoch_09.json (deflated 52%)
  adding: results/distilbert-multilingual-xquad-type_test-l

  pid, fd = os.forkpty()
