**References**

[Tutorial: Working with Hugging Face Models and Datasets](https://github.com/anyuanay/medium/blob/main/src/working_huggingface/Working_with_HuggingFace_ch4_Fine_Tuning_Pretrained_Model_for_Question_Answering.ipynb)

[Fine-Tuning-for-Question-Answering-SQuAD-IndoQA](https://github.com/PrasetyoWidyantoro/Fine-Tuning-for-Question-Answering-SQuAD-IndoQA/blob/master/fine-tune-squad-dataset.ipynb)

# Import libraries

In [1]:
%pip install evaluate sentence-transformers

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.auto import tqdm
import evaluate
import numpy as np
import os
import json
from sentence_transformers import SentenceTransformer, util

# Global variables

In [3]:
random_seed = 42

kaggle = True

model_name = "readerbench/RoGPT2-medium"
model_short_name = "RoGPT2-medium"

dataset_name = ("xquad", "xquad.ro")
dataset_short_name = "xquad"

# Import model and tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/869 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/985k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/542k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

# Split the dataset into train, validation and test

In [5]:
ds = load_dataset(dataset_name[0], dataset_name[1])

ds = ds['validation'].train_test_split(test_size=0.2)
ds_train = ds['train']
ds_test = ds['test']

ds_test = ds_test.train_test_split(test_size=0.5)
ds_val = ds_test['train']
ds_test = ds_test['test']

print(len(ds_train), len(ds_val), len(ds_test))

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/244k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

952 119 119


# Process the dataset

## Train dataset

In [6]:
def standardize(text: str):
    return text
    # return text.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")

def process_and_tokenize(dataset, tokenizer) -> dict:
    # Standardize and prepare inputs
    questions = [q.strip() for q in dataset["question"]]
    contexts = dataset["context"]
    answers = dataset["answers"]

    # Tokenize with overflow handling and offset mapping
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Create mappings
    sample_mapping = inputs["overflow_to_sample_mapping"]
    offset_mapping = inputs["offset_mapping"]

    # Initialize start and end positions
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Retrieve original sample index
        sample_idx = sample_mapping[i]

        # Get context answer data
        answer_texts = answers[sample_idx]["text"]
        answer_starts = answers[sample_idx]["answer_start"]

        # Choose the answer span to use
        # Here, prioritize the first listed answer for simplicity
        if len(answer_starts) > 1:
            start_char = answer_starts[1]
            end_char = start_char + len(answer_texts[1])
        else:
            start_char = answer_starts[0]
            end_char = start_char + len(answer_texts[0])

        # Token-level sequence IDs
        sequence_ids = inputs.sequence_ids(i)

        # Context span
        context_start = sequence_ids.index(1)
        context_end = next(
            (j - 1 for j in range(context_start, len(sequence_ids)) if sequence_ids[j] != 1),
            len(sequence_ids) - 1  # Default to the last valid token index
        )
        # Check if answer falls outside the current span
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            # Mark as no answer
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Find the token positions corresponding to the character-level answer
            start_token = next(
                (j for j in range(context_start, context_end + 1) if offsets[j][0] <= start_char < offsets[j][1]),
                None,
            )
            end_token = next(
                (j for j in range(context_end, context_start - 1, -1) if offsets[j][0] < end_char <= offsets[j][1]),
                None,
            )

            if start_token is not None and end_token is not None:
                start_positions.append(start_token)
                end_positions.append(end_token)
            else:
                # Fallback if token positions are not found
                start_positions.append(0)
                end_positions.append(0)

    # Update tokenized inputs with labels
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    # Remove unnecessary fields
    inputs.pop("overflow_to_sample_mapping", None)
    inputs.pop("offset_mapping", None)

    return inputs


In [7]:
tokenizer.pad_token = tokenizer.eos_token

In [8]:
ds_tok_train = ds_train.map(lambda x: process_and_tokenize(dataset=x, tokenizer=tokenizer), batched=True, batch_size=64, remove_columns=ds_train.column_names)
 
print(len(ds_tok_train))

Map:   0%|          | 0/952 [00:00<?, ? examples/s]

1007


## Validation and test datasets

In [9]:
def process_and_tokenize_val_test(dataset: Dataset, tokenizer: AutoTokenizer) -> dict:
    # Extract from the dataset and standardize where possible
    questions = [standardize(q).strip() for q in dataset["question"]]
    contexts = [standardize(c) for c in dataset["context"]]
    
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = inputs["overflow_to_sample_mapping"]
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_i = sample_mapping[i]
        example_ids.append(dataset["id"][sample_i])

        seq_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        
        for j, ofs in enumerate(offset):
            inputs['offset_mapping'][i][j] = ofs if seq_ids[j] == 1 else None

    inputs["example_id"] = example_ids
    
    inputs.pop("overflow_to_sample_mapping")
    
    return inputs

In [10]:
ds_tok_val = ds_val.map(lambda x: process_and_tokenize_val_test(dataset=x, tokenizer=tokenizer), batched=True, batch_size=64, remove_columns=ds_val.column_names)
ds_tok_test = ds_test.map(lambda x: process_and_tokenize_val_test(dataset=x, tokenizer=tokenizer), batched=True, batch_size=64, remove_columns=ds_test.column_names)
 
print(len(ds_tok_val), len(ds_tok_test))

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

126 124


# Make metrics

In [11]:
%pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=88b3c1deed049a5782cd1774a1ab7ddfbee714ee46af9dc3a5cf67b0f70b5e7b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
squad_metric = evaluate.load("squad")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [13]:
def compute_metrics(start_logits: list, end_logits: list, tok: dict, dataset: Dataset, max_answer_length: int = 30, eval_answer: SentenceTransformer = SentenceTransformer("BlackKakapo/stsb-xlm-r-multilingual-ro")) -> dict:
    extracted_features = {}
    
    # Group extracted features by ids
    for i, feature in enumerate(tok):
        if feature["example_id"] not in extracted_features:
            extracted_features[feature["example_id"]] = [i]
        else:
            extracted_features[feature["example_id"]].append(i)
    
    golds_squad = [{"id": data["id"], "answers": data["answers"]} for data in dataset]
    preds_squad = []
    
    golds_bleu = []
    preds_bleu = []
    
    for data in tqdm(dataset):
        answers = []

        # Iterate over all the extracted features
        for i in extracted_features[data["id"]]:
            start_logit = start_logits[i]
            end_logit = end_logits[i]
            offs = tok[i]["offset_mapping"]

            # Get all combinations of start and end positions
            for start_i in range(len(start_logit)):
                for end_i in range(len(end_logit)):
                    # Continue on wrong answers
                    if offs[start_i] is None \
                        or offs[end_i] is None \
                        or end_i < start_i \
                        or end_i - start_i + 1 > max_answer_length:
                        continue
                    
                    # Add text and score
                    answer = {
                        "answer": data["context"][offs[start_i][0] : offs[end_i][1]],
                        "score": start_logit[start_i] + end_logit[end_i]
                    }
                    answers.append(answer)
                    
        preds_squad.append(
            {"id": data['id'], "prediction_text": max(answers, key=lambda x: x["score"])['answer']} 
                if len(answers) > 0 else 
                {"id": data['id'], "prediction_text": ""}
        )
        
        preds_bleu.append(
            max(answers, key=lambda x: x["score"])['answer']
                if len(answers) > 0 else 
                ""
        )
        
        max_i = 0
        max_len = len(data["answers"]["text"][max_i])
        for i in range(1, len(data["answers"]['text'])):
            if len(data["answers"]["text"][i]) > max_len:
                max_len = len(data["answers"]["text"][i])
                max_i = i
        
        golds_bleu.append(data["answers"]['text'][max_i])
        
    return {
        "squad": squad_metric.compute(predictions=preds_squad, references=golds_squad),
        "bleu": bleu_metric.compute(predictions=preds_bleu, references=golds_bleu),
        "rouge": rouge_metric.compute(predictions=preds_bleu, references=golds_bleu),
        "semantic_similarity": util.pytorch_cos_sim(eval_answer.encode(preds_bleu, convert_to_tensor=True), eval_answer.encode(golds_bleu, convert_to_tensor=True)).mean().item() * 100
    }

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/748 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Training, evaluation and testing loop for hyperparameters

In [14]:
def output_metrics_to_file(metrics: dict, metric_type: str = None, lr: float = None, epoch: int = None):
    filename = os.path.join("/kaggle/working/" if kaggle else ".", "results", f"{model_short_name}-{dataset_short_name}-type_{metric_type}-lr_{lr}-epoch_{epoch:02}.json" if lr is not None and epoch is not None and metric_type is not None else f"{model_short_name}-{dataset_short_name}.json")
    with open(filename, "w") as f:
        json.dump(metrics, f, indent=4)
        
def save_model(trainer: Trainer, lr: float, epoch: int):
    trainer.save_model(os.path.join("/kaggle/working/" if kaggle else ".", "results", "models", f"{model_short_name}-{dataset_short_name}-{lr}-{epoch}"))
    
def load_model(lr: float, epoch: int) -> AutoModelForQuestionAnswering:
    return AutoModelForQuestionAnswering.from_pretrained(os.path.join("/kaggle/working/" if kaggle else ".", "results", "models", f"{model_short_name}-{dataset_short_name}-{lr}-{epoch}"))

## Get maximum answer length within 2 standard deviations from the mean of the training dataset

In [15]:
# mean_answer_length = np.mean([len(a["text"][0]) for a in ds_train["answers"]])
# std_dev_answer_length = np.std([len(a["text"][0]) for a in ds_train["answers"]])

# two_std_devs_above = mean_answer_length + 2 * std_dev_answer_length

In [16]:
all_models_eval = None

def tet_loop(lr_list: list, epochs: int, batch_size: int) -> None:
    global all_models_eval
    all_models_eval = {"validation": {lr:{epoch:[] for epoch in range(1, epochs + 1)} for lr in lr_list}, "test": {lr:{epoch:[] for epoch in range(1, epochs + 1)} for lr in lr_list}}
    
    eval_answer = SentenceTransformer("BlackKakapo/stsb-xlm-r-multilingual-ro")
    
    for lr in lr_list:
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        warmup_steps = int(len(ds_tok_train) / batch_size * 1 / 10)
        args = TrainingArguments(
                output_dir="./results",
                eval_strategy="no",
                save_strategy="epoch",
                learning_rate=lr,
                num_train_epochs=1,
                weight_decay=0.01,
                per_device_train_batch_size=batch_size, 
                report_to="none",
                save_total_limit=1,
                warmup_steps=warmup_steps
            )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds_tok_train,
            eval_dataset=ds_tok_val,
            tokenizer=tokenizer
        )
        
        for epoch in range(1, epochs + 1):
            print(f"Training with lr={lr} and epoch={epoch}")
            
            trainer.train()
            
            preds = trainer.predict(ds_tok_val)
            start_logits, end_logits = preds[0][0], preds[0][1]
            computed_metrics = compute_metrics(start_logits, end_logits, ds_tok_val, ds_val, 30, eval_answer)
            
            # output_metrics_to_file(computed_metrics, metric_type='validation', lr=lr, epoch=epoch)
            print(f"Validation for lr {lr} epoch {epoch}: ")
            # print(computed_metrics)
            print(computed_metrics['squad'])
            print(computed_metrics['semantic_similarity'])
            
            all_models_eval['validation'][lr][epoch] = computed_metrics
            
            preds = trainer.predict(ds_tok_test)
            start_logits2, end_logits2 = preds[0][0], preds[0][1]
            computed_metrics = compute_metrics(start_logits2, end_logits2, ds_tok_test, ds_test, 30, eval_answer)
            
            
            # output_metrics_to_file(computed_metrics, metric_type='test', lr=lr, epoch=epoch)
            all_models_eval['test'][lr][epoch] = computed_metrics
            

            print(f"Test for lr {lr} epoch {epoch}: ")
            print(computed_metrics['squad'])
            print(computed_metrics['semantic_similarity'])
            
            # save_model(trainer, lr, epoch)
            
        try:
            del trainer
            del model
            # device = cuda.get_current_device()
            # device.reset()
        except Exception as e:
            print(e)
            
            print(f"Finished training, validating and testing with lr={lr} and epochs={epochs}")

In [17]:
tet_loop(lr_list=[1e-4, 1e-3, 1e-5], epochs=16, batch_size=4)

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at readerbench/RoGPT2-medium and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training with lr=0.0001 and epoch=1


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 1: 
{'exact_match': 21.84873949579832, 'f1': 31.926866135123355}
17.484644055366516


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 1: 
{'exact_match': 21.008403361344538, 'f1': 28.455637233100294}
18.41612309217453
Training with lr=0.0001 and epoch=2


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 2: 
{'exact_match': 29.41176470588235, 'f1': 40.75169648278893}
17.41429567337036


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 2: 
{'exact_match': 29.41176470588235, 'f1': 38.26648730361016}
18.349364399909973
Training with lr=0.0001 and epoch=3


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 3: 
{'exact_match': 25.210084033613445, 'f1': 36.57336676702868}
16.782116889953613


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 3: 
{'exact_match': 26.89075630252101, 'f1': 35.108352373057684}
18.518143892288208
Training with lr=0.0001 and epoch=4


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 4: 
{'exact_match': 24.369747899159663, 'f1': 36.99676263083805}
16.760797798633575


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 4: 
{'exact_match': 24.369747899159663, 'f1': 33.69993869941837}
18.647083640098572
Training with lr=0.0001 and epoch=5


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 5: 
{'exact_match': 29.41176470588235, 'f1': 41.81891992633681}
17.018337547779083


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 5: 
{'exact_match': 27.73109243697479, 'f1': 37.34045028740131}
18.867245316505432
Training with lr=0.0001 and epoch=6


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 6: 
{'exact_match': 21.84873949579832, 'f1': 32.62546395308533}
16.528436541557312


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 6: 
{'exact_match': 23.529411764705884, 'f1': 34.52186383048402}
18.060019612312317
Training with lr=0.0001 and epoch=7


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 7: 
{'exact_match': 26.89075630252101, 'f1': 37.27686117786689}
16.831354796886444


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 7: 
{'exact_match': 21.84873949579832, 'f1': 31.923606611353698}
18.305429816246033
Training with lr=0.0001 and epoch=8


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 8: 
{'exact_match': 26.050420168067227, 'f1': 36.49745481716714}
17.505362629890442


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 8: 
{'exact_match': 24.369747899159663, 'f1': 35.05015348630705}
17.94710010290146
Training with lr=0.0001 and epoch=9


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 9: 
{'exact_match': 25.210084033613445, 'f1': 37.3558605492883}
16.984258592128754


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 9: 
{'exact_match': 24.369747899159663, 'f1': 33.172493213729524}
18.6759352684021
Training with lr=0.0001 and epoch=10


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 10: 
{'exact_match': 26.89075630252101, 'f1': 39.25108960441536}
17.075923085212708


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 10: 
{'exact_match': 22.689075630252102, 'f1': 34.2709293262146}
18.514437973499298
Training with lr=0.0001 and epoch=11


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 11: 
{'exact_match': 28.571428571428573, 'f1': 37.875583890914605}
17.15569645166397


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 11: 
{'exact_match': 23.529411764705884, 'f1': 34.32117969512871}
18.252067267894745
Training with lr=0.0001 and epoch=12


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 12: 
{'exact_match': 31.932773109243698, 'f1': 43.05825202281176}
17.336763441562653


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 12: 
{'exact_match': 23.529411764705884, 'f1': 35.33558489179262}
18.308474123477936
Training with lr=0.0001 and epoch=13


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 13: 
{'exact_match': 27.73109243697479, 'f1': 37.72511497042435}
17.064455151557922


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 13: 
{'exact_match': 23.529411764705884, 'f1': 35.55819982055006}
18.622002005577087
Training with lr=0.0001 and epoch=14


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 14: 
{'exact_match': 27.73109243697479, 'f1': 36.43623907452917}
16.998232901096344


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 14: 
{'exact_match': 20.168067226890756, 'f1': 31.575436552072503}
19.2385733127594
Training with lr=0.0001 and epoch=15


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 15: 
{'exact_match': 29.41176470588235, 'f1': 38.52205242084672}
17.21562147140503


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 15: 
{'exact_match': 26.89075630252101, 'f1': 35.846001728210446}
19.07508671283722
Training with lr=0.0001 and epoch=16


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.0001 epoch 16: 
{'exact_match': 28.571428571428573, 'f1': 39.43641497607615}
17.0976385474205


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.0001 epoch 16: 
{'exact_match': 22.689075630252102, 'f1': 33.174399287700446}
18.8753679394722


Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at readerbench/RoGPT2-medium and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training with lr=0.001 and epoch=1


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 1: 
{'exact_match': 1.680672268907563, 'f1': 7.3148577039217955}
17.243389785289764


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 1: 
{'exact_match': 3.361344537815126, 'f1': 8.088120015035996}
16.67899489402771
Training with lr=0.001 and epoch=2


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 2: 
{'exact_match': 1.680672268907563, 'f1': 6.699267059587083}
16.229940950870514


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 2: 
{'exact_match': 2.5210084033613445, 'f1': 6.271936446707532}
16.735970973968506
Training with lr=0.001 and epoch=3


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 3: 
{'exact_match': 0.0, 'f1': 3.7257402069562233}
15.462155640125275


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 3: 
{'exact_match': 1.680672268907563, 'f1': 6.188242619864272}
15.75443148612976
Training with lr=0.001 and epoch=4


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 4: 
{'exact_match': 0.8403361344537815, 'f1': 3.130694857188012}
16.415420174598694


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 4: 
{'exact_match': 1.680672268907563, 'f1': 5.840875452700733}
16.618478298187256
Training with lr=0.001 and epoch=5


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 5: 
{'exact_match': 0.8403361344537815, 'f1': 2.8927658359195685}
15.944918990135193


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 5: 
{'exact_match': 0.8403361344537815, 'f1': 3.700150755104316}
16.43475443124771
Training with lr=0.001 and epoch=6


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 6: 
{'exact_match': 0.8403361344537815, 'f1': 2.8816950852218053}
15.85085541009903


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 6: 
{'exact_match': 0.8403361344537815, 'f1': 3.567647256833458}
16.470277309417725
Training with lr=0.001 and epoch=7


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 7: 
{'exact_match': 2.5210084033613445, 'f1': 4.237577914048503}
15.700988471508026


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 7: 
{'exact_match': 1.680672268907563, 'f1': 5.847751688087821}
17.320378124713898
Training with lr=0.001 and epoch=8


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 8: 
{'exact_match': 1.680672268907563, 'f1': 4.472196378845996}
16.142189502716064


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 8: 
{'exact_match': 0.8403361344537815, 'f1': 4.480414882761581}
16.77170693874359
Training with lr=0.001 and epoch=9


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 9: 
{'exact_match': 2.5210084033613445, 'f1': 5.282620858001124}
16.37265384197235


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 9: 
{'exact_match': 0.8403361344537815, 'f1': 4.583812885319246}
17.187583446502686
Training with lr=0.001 and epoch=10


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 10: 
{'exact_match': 3.361344537815126, 'f1': 6.307752716604913}
17.04636961221695


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 10: 
{'exact_match': 1.680672268907563, 'f1': 5.284834899268909}
18.12499463558197
Training with lr=0.001 and epoch=11


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 11: 
{'exact_match': 2.5210084033613445, 'f1': 5.254535717459237}
17.120662331581116


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 11: 
{'exact_match': 0.8403361344537815, 'f1': 4.788390707557136}
17.69831031560898
Training with lr=0.001 and epoch=12


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 12: 
{'exact_match': 4.201680672268908, 'f1': 6.490911191678455}
17.622722685337067


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 12: 
{'exact_match': 0.8403361344537815, 'f1': 5.166118722095953}
18.032877147197723
Training with lr=0.001 and epoch=13


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 13: 
{'exact_match': 3.361344537815126, 'f1': 5.510843005860828}
16.966837644577026


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 13: 
{'exact_match': 0.8403361344537815, 'f1': 2.517948093395664}
18.191981315612793
Training with lr=0.001 and epoch=14


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 14: 
{'exact_match': 2.5210084033613445, 'f1': 4.707103911600032}
16.599537432193756


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 14: 
{'exact_match': 0.8403361344537815, 'f1': 3.73978098026541}
18.569768965244293
Training with lr=0.001 and epoch=15


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 15: 
{'exact_match': 3.361344537815126, 'f1': 4.885821150129431}
17.504389584064484


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 15: 
{'exact_match': 0.8403361344537815, 'f1': 3.7095381138880894}
17.45026260614395
Training with lr=0.001 and epoch=16


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 0.001 epoch 16: 
{'exact_match': 4.201680672268908, 'f1': 6.294985172244514}
17.841559648513794


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 0.001 epoch 16: 
{'exact_match': 0.8403361344537815, 'f1': 3.982126392849596}
17.96381026506424


Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at readerbench/RoGPT2-medium and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training with lr=1e-05 and epoch=1


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 1: 
{'exact_match': 0.0, 'f1': 7.237376046680751}
14.708644151687622


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 1: 
{'exact_match': 2.5210084033613445, 'f1': 9.608287174303728}
16.25882089138031
Training with lr=1e-05 and epoch=2


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 2: 
{'exact_match': 6.722689075630252, 'f1': 13.1541432749334}
16.223987936973572


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 2: 
{'exact_match': 1.680672268907563, 'f1': 8.957986415911535}
15.993377566337585
Training with lr=1e-05 and epoch=3


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 3: 
{'exact_match': 5.882352941176471, 'f1': 10.873758208474188}
17.36127883195877


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 3: 
{'exact_match': 4.201680672268908, 'f1': 10.973268679591616}
17.162232100963593
Training with lr=1e-05 and epoch=4


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 4: 
{'exact_match': 8.403361344537815, 'f1': 13.779327959885727}
17.591163516044617


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 4: 
{'exact_match': 5.882352941176471, 'f1': 13.448835426642697}
17.546464502811432
Training with lr=1e-05 and epoch=5


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 5: 
{'exact_match': 6.722689075630252, 'f1': 13.699136835947694}
17.34277755022049


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 5: 
{'exact_match': 6.722689075630252, 'f1': 15.276797358786972}
17.835281789302826
Training with lr=1e-05 and epoch=6


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 6: 
{'exact_match': 8.403361344537815, 'f1': 16.519945142808076}
17.73948073387146


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 6: 
{'exact_match': 10.084033613445378, 'f1': 17.60852286401717}
18.0412158370018
Training with lr=1e-05 and epoch=7


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 7: 
{'exact_match': 10.084033613445378, 'f1': 19.708031273238284}
16.57644659280777


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 7: 
{'exact_match': 8.403361344537815, 'f1': 17.257292109828896}
18.32994520664215
Training with lr=1e-05 and epoch=8


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 8: 
{'exact_match': 10.92436974789916, 'f1': 19.683477862130257}
17.02592223882675


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 8: 
{'exact_match': 7.563025210084033, 'f1': 16.73836892283848}
18.278153240680695
Training with lr=1e-05 and epoch=9


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 9: 
{'exact_match': 14.285714285714286, 'f1': 24.090692069211208}
17.313429713249207


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 9: 
{'exact_match': 9.243697478991596, 'f1': 18.477337237437045}
17.834225296974182
Training with lr=1e-05 and epoch=10


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 10: 
{'exact_match': 15.126050420168067, 'f1': 23.354412902813195}
16.88266396522522


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 10: 
{'exact_match': 12.605042016806722, 'f1': 21.851987773105442}
18.25888156890869
Training with lr=1e-05 and epoch=11


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 11: 
{'exact_match': 15.966386554621849, 'f1': 24.878217827081752}
17.2608882188797


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 11: 
{'exact_match': 11.764705882352942, 'f1': 22.3372696051365}
18.0645152926445
Training with lr=1e-05 and epoch=12


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 12: 
{'exact_match': 15.126050420168067, 'f1': 25.882365260226745}
16.834813356399536


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 12: 
{'exact_match': 9.243697478991596, 'f1': 20.207956341369453}
17.805828154087067
Training with lr=1e-05 and epoch=13


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 13: 
{'exact_match': 22.689075630252102, 'f1': 30.334570384700783}
17.382408678531647


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 13: 
{'exact_match': 12.605042016806722, 'f1': 23.538568891223882}
17.890073359012604
Training with lr=1e-05 and epoch=14


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 14: 
{'exact_match': 21.008403361344538, 'f1': 30.485539863401346}
17.49783307313919


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 14: 
{'exact_match': 13.445378151260504, 'f1': 22.22803123861999}
17.970608174800873
Training with lr=1e-05 and epoch=15


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 15: 
{'exact_match': 23.529411764705884, 'f1': 31.163861257835283}
17.160619795322418


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 15: 
{'exact_match': 13.445378151260504, 'f1': 22.22247368857406}
17.537429928779602
Training with lr=1e-05 and epoch=16


Step,Training Loss


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Validation for lr 1e-05 epoch 16: 
{'exact_match': 21.008403361344538, 'f1': 30.96476201029313}
17.062020301818848


  0%|          | 0/119 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Test for lr 1e-05 epoch 16: 
{'exact_match': 15.966386554621849, 'f1': 24.226832989288436}
17.827226221561432


In [18]:
output_metrics_to_file(all_models_eval)

In [19]:
!zip results_ml.zip results/*.json

  adding: results/RoGPT2-medium-xquad.json (deflated 85%)
