# References
- Modified from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_seq2seq_qa.py
- Modified from: https://github.com/LazarusNLP/IndoT5/blob/main/scripts/run_qa.py

# Source code

## Check Dataset
The dataset (https://huggingface.co/datasets/dehanalkautsar/xcopa_gen_id) is our own modified version of XCOPA (https://huggingface.co/datasets/cambridgeltl/xcopa).

In [None]:
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.[0m[31m
[0m

In [None]:
from datasets import load_dataset

# Load every split of the dataset from the Hugging Face Hub so it can be inspected below.
dataset = load_dataset("dehanalkautsar/xcopa_gen_id")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/42.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Show the first training example to see which fields each record carries.
dataset['train'][0]

{'input': 'Berikan sebab dari kalimat berikut: Barang itu dikemas dalam bungkus gelembung.',
 'label': 'Barang itu rapuh.'}

## Import libraries

In [None]:
!pip install datargs -q
!pip install datasets -q
!pip install tokenizers -q
!pip install sentencepiece -q
!pip install transformers -q
!pip install rouge_score -q
!pip install evaluate -q
!pip install accelerate -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/170.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m163.8/170.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.2/170.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from dataclasses import dataclass
from datargs import parse

import math
import time
import evaluate
import numpy as np
from datasets import load_dataset
from transformers.trainer_utils import EvalPrediction
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
)

In [None]:
# QuestionAnsweringSeq2SeqTrainer Class (https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/trainer_seq2seq_qa.py)
from torch.utils.data import Dataset

from transformers import Seq2SeqTrainer, is_torch_xla_available
from transformers.trainer_utils import PredictionOutput, speed_metrics
from typing import Optional, List, Dict

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class QuestionAnsweringSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer that post-processes generated output before computing metrics.

    The evaluation/prediction loop is run with ``compute_metrics`` temporarily
    disabled; the raw loop output is then handed to ``post_process_function``
    and the result of that call is what ``compute_metrics`` receives.
    """

    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        """Store the raw evaluation examples and the post-processing hook.

        Args:
            *args: Forwarded unchanged to ``Seq2SeqTrainer.__init__``.
            eval_examples: Default examples passed to ``post_process_function``
                when ``evaluate`` is called without ``eval_examples``.
            post_process_function: Callable invoked as
                ``f(examples, dataset, loop_output)`` in ``evaluate`` and as
                ``f(examples, dataset, loop_output, "predict")`` in ``predict``.
            **kwargs: Forwarded unchanged to ``Seq2SeqTrainer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function

    # def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
    def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        eval_examples=None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
        **gen_kwargs,
    ) -> Dict[str, float]:
        """Run the evaluation loop, post-process its output and return the metrics.

        Args:
            eval_dataset: Dataset to evaluate; falls back to ``self.eval_dataset``.
            eval_examples: Examples handed to ``post_process_function``; falls
                back to ``self.eval_examples``.
            ignore_keys: Forwarded to the underlying evaluation loop.
            metric_key_prefix: Prefix applied to every metric key that does not
                already start with it.
            **gen_kwargs: Generation options; ``max_length`` and ``num_beams``
                default to the training arguments when not given.

        Returns:
            Dictionary of metrics: the loop's own metrics plus, when
            post-processing ran, the prefixed output of ``compute_metrics``.
        """
        gen_kwargs = gen_kwargs.copy()

        # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the
        # training args
        if gen_kwargs.get("max_length") is None and self.args.generation_max_length is not None:
            gen_kwargs["max_length"] = self.args.generation_max_length
        if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None:
            gen_kwargs["num_beams"] = self.args.generation_num_beams
        self._gen_kwargs = gen_kwargs

        eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        eval_examples = self.eval_examples if eval_examples is None else eval_examples

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        start_time = time.time()
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        try:
            output = eval_loop(
                eval_dataloader,
                description="Evaluation",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
                metric_key_prefix=metric_key_prefix,
            )
        finally:
            # Always restore the metric function, even if the loop raised.
            self.compute_metrics = compute_metrics
        total_batch_size = self.args.eval_batch_size * self.args.world_size
        # Shift the start time by the reported JIT compilation time so it is excluded from the speed metrics.
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save:
            # Only the main node write the results by default
            eval_preds = self.post_process_function(eval_examples, eval_dataset, output)
            metrics = self.compute_metrics(eval_preds)

            # Prefix all keys with metric_key_prefix + '_'
            for key in list(metrics.keys()):
                if not key.startswith(f"{metric_key_prefix}_"):
                    metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

            # Loop metrics (loss, runtime, ...) are merged last and win on key clashes.
            metrics.update(output.metrics)
        else:
            metrics = output.metrics

        if self.args.should_log:
            # Only the main node log the results by default
            self.log(metrics)

        if self.args.tpu_metrics_debug or self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            # NOTE(review): `xm` and `met` are only imported when is_torch_xla_available() is true;
            # reaching this branch without torch_xla would raise NameError.
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
        return metrics

    def predict(
        self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test", **gen_kwargs
    ):
        """Run the prediction loop and, when possible, post-process and score its output.

        Args:
            predict_dataset: Dataset to run prediction on.
            predict_examples: Examples handed to ``post_process_function``.
            ignore_keys: Forwarded to the underlying prediction loop.
            metric_key_prefix: Prefix applied to every metric key that does not
                already start with it.
            **gen_kwargs: Generation options, stored as given (unlike
                ``evaluate``, no defaults are filled in from the training arguments).

        Returns:
            The raw loop output when either ``post_process_function`` or
            ``compute_metrics`` is unset; otherwise a ``PredictionOutput``
            built from the post-processed predictions and the merged metrics.
        """
        self._gen_kwargs = gen_kwargs.copy()

        predict_dataloader = self.get_test_dataloader(predict_dataset)

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        start_time = time.time()
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        try:
            output = eval_loop(
                predict_dataloader,
                description="Prediction",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
                metric_key_prefix=metric_key_prefix,
            )
        finally:
            # Always restore the metric function, even if the loop raised.
            self.compute_metrics = compute_metrics

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        # Shift the start time by the reported JIT compilation time so it is excluded from the speed metrics.
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )
        if self.post_process_function is None or self.compute_metrics is None:
            return output

        predictions = self.post_process_function(predict_examples, predict_dataset, output, "predict")
        metrics = self.compute_metrics(predictions)

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
        metrics.update(output.metrics)
        return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)

## Main: LazarusNLP/IndoT5

In [None]:
@dataclass
class Args:
    """Configuration for one fine-tuning run; every field has a usable default."""

    # --- model and data ---
    model_checkpoint: str = "LazarusNLP/IndoNanoT5-base"
    dataset_name: str = "dehanalkautsar/xcopa_gen_id"
    # Name of the dataset column that holds the reference answer.
    answer_column_name: str = "label"

    # --- sequence lengths and decoding ---
    input_max_length: int = 128
    target_max_length: int = 128
    num_beams: int = 5

    # --- training schedule ---
    output_dir: str = "outputs/indo-nanot5-xcopagenid"
    num_train_epochs: int = 10
    early_stopping_patience: int = 3
    early_stopping_threshold: float = 0.01

    # --- optimisation ---
    optim: str = "adamw_torch_fused"
    learning_rate: float = 1e-5
    weight_decay: float = 0.01
    per_device_train_batch_size: int = 8
    per_device_eval_batch_size: int = 16

In [None]:
def main(args: Args):
    """Fine-tune a seq2seq checkpoint on the configured dataset and print test metrics.

    The dataset must provide ``train``, ``validation`` and ``test`` splits with
    an ``input`` column and a target column named by ``args.answer_column_name``.
    The model is evaluated on the validation split after every epoch, training
    stops early when validation F1 stops improving, the best checkpoint is
    reloaded, and the final metrics on the test split are printed.

    Args:
        args: Run configuration (checkpoint, dataset, sequence lengths,
            optimiser and early-stopping settings).
    """
    # load dataset, tokenizer, model
    dataset = load_dataset(args.dataset_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_checkpoint)

    def preprocess_function(examples):
        """Tokenize a batch of inputs and targets into model features."""
        inputs, targets = examples["input"], examples[args.answer_column_name]

        model_inputs = tokenizer(inputs, max_length=args.input_max_length, truncation=True)
        # Tokenize targets with the `text_target` keyword argument
        labels = tokenizer(text_target=targets, max_length=args.target_max_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    train_dataset = dataset["train"]
    validation_dataset = dataset["validation"]
    test_dataset = dataset["test"]

    # Every split uses the same preprocessing: exactly one feature per example.
    # Post-processing pairs prediction i with example i, so inputs must be
    # truncated rather than split into overflowing windows.
    tokenized_train_dataset = train_dataset.map(
        preprocess_function, batched=True, remove_columns=train_dataset.column_names
    )
    tokenized_validation_dataset = validation_dataset.map(
        preprocess_function, batched=True, remove_columns=validation_dataset.column_names
    )
    tokenized_test_dataset = test_dataset.map(
        preprocess_function, batched=True, remove_columns=test_dataset.column_names
    )

    # prepare s2s collator
    # The collator needs the model object (not the checkpoint name) to build
    # decoder inputs from the labels; -100 keeps padded label positions out of
    # the loss.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

    # SQuAD v2 and BLEU metric for evaluation
    squad_v2 = evaluate.load("squad_v2")
    bleu = evaluate.load("bleu")

    def compute_metrics(p: EvalPrediction):
        """Score post-processed predictions with SQuAD v2 (exact/F1) and BLEU."""
        squad_res = squad_v2.compute(predictions=p.predictions, references=p.label_ids)
        preds = [prediction["prediction_text"] for prediction in p.predictions]
        refs = [reference["answers"]["text"] for reference in p.label_ids]

        if any(text.strip() for text in preds):
            bleu_res = bleu.compute(predictions=preds, references=refs)
            bleu_score, precisions = bleu_res["bleu"], bleu_res["precisions"]
        else:
            # BLEU's brevity penalty is undefined for a zero-length hypothesis
            # corpus (every prediction empty); report zeros instead.
            bleu_score, precisions = 0.0, [0.0, 0.0, 0.0, 0.0]

        return {
            "exact": squad_res["exact"],
            "f1": squad_res["f1"],
            "bleu": bleu_score,
            "bleu1": precisions[0],
            "bleu2": precisions[1],
            "bleu3": precisions[2],
            "bleu4": precisions[3],
        }

    def post_processing_function(examples, features, outputs, stage="eval"):
        """Decode generated ids and pair them with references in SQuAD v2 format.

        ``features`` and ``stage`` are accepted because the trainer passes
        them, but they are not used here.
        """
        # Decode the predicted tokens.
        preds = outputs.predictions
        if isinstance(preds, tuple):
            preds = preds[0]
        # Replace -100s used for padding as we can't decode them
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        if len(decoded_preds) != len(examples):
            raise ValueError(
                f"Got {len(decoded_preds)} predictions for {len(examples)} examples; "
                "predictions and examples must align one-to-one."
            )

        # Format the result to the format the metric expects.
        formatted_predictions = []
        references = []
        for i, (ex, pred_text) in enumerate(zip(examples, decoded_preds)):
            # BLEU is case-sensitive, so both sides are compared in lower case.
            formatted_predictions.append(
                {"id": str(i), "prediction_text": pred_text.lower(), "no_answer_probability": 0.0}
            )
            references.append(
                {"id": str(i), "answers": {"answer_start": [0], "text": [ex[args.answer_column_name].lower()]}}
            )
        # Show a couple of samples so progress can be eyeballed between epochs.
        print(formatted_predictions[:2])
        print(references[:2])
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    callbacks = [
        EarlyStoppingCallback(
            early_stopping_patience=args.early_stopping_patience,
            early_stopping_threshold=args.early_stopping_threshold,
        )
    ]

    # Early stopping tracks `metric_for_best_model` and relies on the best
    # checkpoint being reloaded at the end, which in turn needs checkpoints to
    # be saved on the same schedule as evaluation.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy`; confirm against the installed version.
    training_args = Seq2SeqTrainingArguments(
        output_dir=args.output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        optim=args.optim,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        num_train_epochs=args.num_train_epochs,
        save_total_limit=3,
        predict_with_generate=True,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    trainer = QuestionAnsweringSeq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_validation_dataset,
        eval_examples=validation_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
        post_process_function=post_processing_function,
    )

    trainer.train()

    # NOTE(review): per-epoch validation uses the model's default generation
    # settings, while this final pass decodes with beam search up to
    # `target_max_length`; validation and test scores are therefore produced
    # under different decoding settings.
    result = trainer.evaluate(
        tokenized_test_dataset, test_dataset, max_length=args.target_max_length, num_beams=args.num_beams
    )
    print(result)

    # trainer.push_to_hub()

In [None]:
# `parse(Args)` (datargs) is left disabled -- presumably meant for command-line use;
# in the notebook the run uses the dataclass defaults.
# args = parse(Args)
args = Args()
main(args)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.03M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Exact,F1,Bleu,Bleu Prec
1,No log,8.237881,0.0,12.4504,0.0,"[0.3341584158415842, 0.02147239263803681, 0.0, 0.0]"
2,No log,2.331386,0.0,6.7614,0.0,"[0.38144329896907214, 0.06802721088435375, 0.0, 0.0]"
3,No log,1.86586,0.0,21.90119,0.0,"[0.3943089430894309, 0.05357142857142857, 0.00684931506849315, 0.0]"
4,No log,1.839979,0.0,20.827165,0.0,"[0.3925619834710744, 0.057291666666666664, 0.007042253521126761, 0.0]"
5,No log,1.838346,0.0,18.915981,0.0,"[0.3816631130063966, 0.04336043360433604, 0.0037174721189591076, 0.0]"
6,No log,1.849852,0.0,19.29881,0.029232,"[0.3834745762711864, 0.051075268817204304, 0.011029411764705883, 0.005813953488372093]"


Trainer is attempting to log a value of "[0.3341584158415842, 0.02147239263803681, 0.0, 0.0]" of type <class 'list'> for key "eval/bleu_prec" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


[{'id': '0', 'prediction_text': 'dia tidak ingin pergi ke luar negeri.', 'no_answer_probability': 0.0}, {'id': '1', 'prediction_text': 'yang kaya itu meninggal karena kecelakaan.', 'no_answer_probability': 0.0}]
[{'id': '0', 'answers': {'answer_start': [0], 'text': ['dia bekerja keras.']}}, {'id': '1', 'answers': {'answer_start': [0], 'text': ['putranya mewarisi kekayaannya.']}}]


Trainer is attempting to log a value of "[0.38144329896907214, 0.06802721088435375, 0.0, 0.0]" of type <class 'list'> for key "eval/bleu_prec" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


[{'id': '0', 'prediction_text': '', 'no_answer_probability': 0.0}, {'id': '1', 'prediction_text': '', 'no_answer_probability': 0.0}]
[{'id': '0', 'answers': {'answer_start': [0], 'text': ['dia bekerja keras.']}}, {'id': '1', 'answers': {'answer_start': [0], 'text': ['putranya mewarisi kekayaannya.']}}]


Trainer is attempting to log a value of "[0.3943089430894309, 0.05357142857142857, 0.00684931506849315, 0.0]" of type <class 'list'> for key "eval/bleu_prec" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


[{'id': '0', 'prediction_text': 'dia ingin pergi ke kota itu.', 'no_answer_probability': 0.0}, {'id': '1', 'prediction_text': 'dia meninggal.', 'no_answer_probability': 0.0}]
[{'id': '0', 'answers': {'answer_start': [0], 'text': ['dia bekerja keras.']}}, {'id': '1', 'answers': {'answer_start': [0], 'text': ['putranya mewarisi kekayaannya.']}}]


Trainer is attempting to log a value of "[0.3925619834710744, 0.057291666666666664, 0.007042253521126761, 0.0]" of type <class 'list'> for key "eval/bleu_prec" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


[{'id': '0', 'prediction_text': 'dia ingin pergi ke kota itu.', 'no_answer_probability': 0.0}, {'id': '1', 'prediction_text': 'dia meninggal.', 'no_answer_probability': 0.0}]
[{'id': '0', 'answers': {'answer_start': [0], 'text': ['dia bekerja keras.']}}, {'id': '1', 'answers': {'answer_start': [0], 'text': ['putranya mewarisi kekayaannya.']}}]


Trainer is attempting to log a value of "[0.3816631130063966, 0.04336043360433604, 0.0037174721189591076, 0.0]" of type <class 'list'> for key "eval/bleu_prec" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


[{'id': '0', 'prediction_text': 'dia ingin mencapai tujuan itu.', 'no_answer_probability': 0.0}, {'id': '1', 'prediction_text': 'dia meninggal.', 'no_answer_probability': 0.0}]
[{'id': '0', 'answers': {'answer_start': [0], 'text': ['dia bekerja keras.']}}, {'id': '1', 'answers': {'answer_start': [0], 'text': ['putranya mewarisi kekayaannya.']}}]


Trainer is attempting to log a value of "[0.3834745762711864, 0.051075268817204304, 0.011029411764705883, 0.005813953488372093]" of type <class 'list'> for key "eval/bleu_prec" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


[{'id': '0', 'prediction_text': 'dia ingin mencapai tujuan itu.', 'no_answer_probability': 0.0}, {'id': '1', 'prediction_text': 'dia meninggal karena kecelakaan.', 'no_answer_probability': 0.0}]
[{'id': '0', 'answers': {'answer_start': [0], 'text': ['dia bekerja keras.']}}, {'id': '1', 'answers': {'answer_start': [0], 'text': ['putranya mewarisi kekayaannya.']}}]


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Trainer is attempting to log a value of "[0.3346613545816733, 0.02736318407960199, 0.0033112582781456954, 0.0]" of type <class 'list'> for key "eval/bleu_prec" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


[{'id': '0', 'prediction_text': 'keran itu telah habis.', 'no_answer_probability': 0.0}, {'id': '1', 'prediction_text': 'dia menuangkan sebotol air ke dalam mangkuknya.', 'no_answer_probability': 0.0}]
[{'id': '0', 'answers': {'answer_start': [0], 'text': ['air telah mengalir dari cerat keran tersebut.']}}, {'id': '1', 'answers': {'answer_start': [0], 'text': ['dia telah kehilangan nafsu makan.']}}]
{'eval_exact': 0.0, 'eval_f1': 15.15240315240315, 'eval_bleu': 0.0, 'eval_bleu_prec': [0.3346613545816733, 0.02736318407960199, 0.0033112582781456954, 0.0], 'eval_loss': 2.1066598892211914, 'eval_runtime': 5.9003, 'eval_samples_per_second': 16.948, 'eval_steps_per_second': 1.186, 'epoch': 6.0}
