In [None]:
%pip install accelerate -U -qqq
%pip install transformers[torch] -qqq

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch
import yaml
from distutils.dir_util import copy_tree

In [3]:
import torch
import yaml
from src.data.datamodule import DataManager

In [4]:
import json
import os
import numpy as np
import re

from tqdm import tqdm

from src.data.mt_dataset import MTDataset_HF
from src.data.tokenizers.unif_tokenizers import UNIFTokenizer

In [5]:
data_config = yaml.load(open("configs/data_config.yaml", 'r', encoding='utf-8'), Loader=yaml.Loader)
data_path = lambda x: data_config["path_repository"] + "data/" + data_config["data_language"] + str(x) + data_config["data_name_file"] + ".json"

In [8]:
def prepare_sql_input(kb_id_list: list[str], db2attr_dict: dict[str:list[str]]) -> list[int]:
    input_list = []
    for kb_id in kb_id_list:
        final_input_str = ' [schema] '
        question_relevant_db_attributes = db2attr_dict[kb_id]
        final_input_str = final_input_str + " ".join(question_relevant_db_attributes)
        input_list.append(final_input_str)
    return input_list

In [13]:
config = data_config
device = "cpu"
db2attr_dict = json.load(open("data/table_id2new_attrs_for_parsing.json", 'r', encoding="utf8"))

tokenizer = UNIFTokenizer(path_tok=config["path_repository"] + "data/query_vocab.json",
                               pre_train_name=config["pre_train_tokenizer"],
                               pad_flag=True,
                               max_length=config["max_sent_len"])

def prepare_data(path_data, drop_last=False):

    dev_data = json.load(open(os.path.join(path_data), 'r', encoding="utf-8"))
    target_sentences = []
    source_sentences = []
    kb_id_sentences = []
    for sample in tqdm(dev_data[:config["separate_batch"]], desc="Pars data"):
        target_sentences.append(sample['masked_query'])
        source_sentences.append(sample['question'])
        kb_id_sentences.append(sample['kb_id'])
        
    kb_id_sentences = prepare_sql_input(kb_id_sentences, db2attr_dict)
    source_sentences = [i+j for i, j in zip(source_sentences, kb_id_sentences)]

    # DataLoader

    tokenized_source_sentences = [tokenizer.tkr(i) for i in source_sentences]
    tokenized_target_sentences = [tokenizer.tkr(i) for i in target_sentences]

    dataset = MTDataset_HF(tokenized_source_list=tokenized_source_sentences,
                        tokenized_target_list=tokenized_target_sentences, device=device)
    return dataset

In [14]:
dev_dataloader = prepare_data(path_data=data_path("train"), drop_last=False)
test_dataloader = prepare_data(path_data=data_path("dev"), drop_last=True) # dev

Pars data: 100%|██████████| 56354/56354 [00:00<00:00, 1189806.69it/s]
Pars data: 100%|██████████| 8420/8420 [00:00<00:00, 1213317.75it/s]


In [15]:
from transformers import XLMRobertaForCausalLM, AutoConfig
config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
config.is_decoder = True
model = XLMRobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)#.to('cuda')

In [16]:
sum(p.numel() for p in model.parameters())

124697433

In [17]:
from transformers import Trainer, TrainingArguments

In [18]:
training_args = TrainingArguments(
    f"roberta-base-exp",
    evaluation_strategy = "epoch",
    learning_rate=2e-5, # 2e-5,
    weight_decay=0.01,
    num_train_epochs = 20,
    logging_dir = 'logs',
    save_strategy="no" # "epoch", "no"
)

In [19]:
from transformers import ProgressCallback, PrinterCallback

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dev_dataloader,
    eval_dataset=test_dataloader,
    callbacks = [PrinterCallback]
)

In [21]:
end_train = trainer.train()

  0%|          | 500/140900 [01:08<5:12:25,  7.49it/s]

{'loss': 0.2617, 'learning_rate': 1.992902767920511e-05, 'epoch': 0.07}
{'loss': 0.2617, 'learning_rate': 1.992902767920511e-05, 'epoch': 0.07}


  1%|          | 1000/140900 [02:15<5:12:06,  7.47it/s]

{'loss': 0.1908, 'learning_rate': 1.9858055358410224e-05, 'epoch': 0.14}
{'loss': 0.1908, 'learning_rate': 1.9858055358410224e-05, 'epoch': 0.14}


  1%|          | 1501/140900 [03:22<5:15:43,  7.36it/s]

{'loss': 0.1841, 'learning_rate': 1.978708303761533e-05, 'epoch': 0.21}
{'loss': 0.1841, 'learning_rate': 1.978708303761533e-05, 'epoch': 0.21}


  1%|▏         | 2001/140900 [04:29<5:13:51,  7.38it/s]

{'loss': 0.1749, 'learning_rate': 1.9716110716820442e-05, 'epoch': 0.28}
{'loss': 0.1749, 'learning_rate': 1.9716110716820442e-05, 'epoch': 0.28}


  2%|▏         | 2500/140900 [05:35<5:07:31,  7.50it/s]

{'loss': 0.172, 'learning_rate': 1.9645138396025552e-05, 'epoch': 0.35}
{'loss': 0.172, 'learning_rate': 1.9645138396025552e-05, 'epoch': 0.35}


  2%|▏         | 3000/140900 [06:42<5:06:42,  7.49it/s]

{'loss': 0.1685, 'learning_rate': 1.957416607523066e-05, 'epoch': 0.43}
{'loss': 0.1685, 'learning_rate': 1.957416607523066e-05, 'epoch': 0.43}


  2%|▏         | 3501/140900 [07:49<5:09:31,  7.40it/s]

{'loss': 0.1649, 'learning_rate': 1.950319375443577e-05, 'epoch': 0.5}
{'loss': 0.1649, 'learning_rate': 1.950319375443577e-05, 'epoch': 0.5}


  3%|▎         | 4000/140900 [08:56<5:06:26,  7.45it/s]

{'loss': 0.1682, 'learning_rate': 1.943222143364088e-05, 'epoch': 0.57}
{'loss': 0.1682, 'learning_rate': 1.943222143364088e-05, 'epoch': 0.57}


  3%|▎         | 4500/140900 [10:03<5:04:02,  7.48it/s]

{'loss': 0.1648, 'learning_rate': 1.9361249112845992e-05, 'epoch': 0.64}
{'loss': 0.1648, 'learning_rate': 1.9361249112845992e-05, 'epoch': 0.64}


  4%|▎         | 5000/140900 [11:10<5:02:36,  7.48it/s]

{'loss': 0.1648, 'learning_rate': 1.9290276792051102e-05, 'epoch': 0.71}
{'loss': 0.1648, 'learning_rate': 1.9290276792051102e-05, 'epoch': 0.71}


  4%|▍         | 5501/140900 [12:17<5:05:06,  7.40it/s]

{'loss': 0.1609, 'learning_rate': 1.921930447125621e-05, 'epoch': 0.78}
{'loss': 0.1609, 'learning_rate': 1.921930447125621e-05, 'epoch': 0.78}


  4%|▍         | 6000/140900 [13:24<4:59:21,  7.51it/s]

{'loss': 0.1582, 'learning_rate': 1.914833215046132e-05, 'epoch': 0.85}
{'loss': 0.1582, 'learning_rate': 1.914833215046132e-05, 'epoch': 0.85}


  5%|▍         | 6501/140900 [14:31<5:02:45,  7.40it/s]

{'loss': 0.1552, 'learning_rate': 1.907735982966643e-05, 'epoch': 0.92}
{'loss': 0.1552, 'learning_rate': 1.907735982966643e-05, 'epoch': 0.92}


  5%|▍         | 7000/140900 [15:37<4:57:16,  7.51it/s]

{'loss': 0.1596, 'learning_rate': 1.9006387508871543e-05, 'epoch': 0.99}
{'loss': 0.1596, 'learning_rate': 1.9006387508871543e-05, 'epoch': 0.99}


  5%|▍         | 7044/140900 [15:43<4:59:44,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:33, 31.65it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.39it/s][A
  1%|          | 11/1053 [00:00<00:38, 27.32it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.70it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.29it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.19it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.01it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.97it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.92it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.86it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.79it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.78it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.73it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.64it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.67it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.64it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.72it/s][A
 

{'eval_loss': 0.15903528034687042, 'eval_runtime': 41.1462, 'eval_samples_per_second': 204.636, 'eval_steps_per_second': 25.592, 'epoch': 1.0}
{'eval_loss': 0.15903528034687042, 'eval_runtime': 41.1462, 'eval_samples_per_second': 204.636, 'eval_steps_per_second': 25.592, 'epoch': 1.0}


  5%|▌         | 7500/140900 [17:25<4:58:54,  7.44it/s]  

{'loss': 0.1542, 'learning_rate': 1.8935415188076652e-05, 'epoch': 1.06}
{'loss': 0.1542, 'learning_rate': 1.8935415188076652e-05, 'epoch': 1.06}


  6%|▌         | 8000/140900 [18:32<4:57:00,  7.46it/s]

{'loss': 0.1517, 'learning_rate': 1.886444286728176e-05, 'epoch': 1.14}
{'loss': 0.1517, 'learning_rate': 1.886444286728176e-05, 'epoch': 1.14}


  6%|▌         | 8501/140900 [19:39<4:57:57,  7.41it/s]

{'loss': 0.1538, 'learning_rate': 1.879347054648687e-05, 'epoch': 1.21}
{'loss': 0.1538, 'learning_rate': 1.879347054648687e-05, 'epoch': 1.21}


  6%|▋         | 9001/140900 [20:46<4:58:09,  7.37it/s]

{'loss': 0.15, 'learning_rate': 1.872249822569198e-05, 'epoch': 1.28}
{'loss': 0.15, 'learning_rate': 1.872249822569198e-05, 'epoch': 1.28}


  7%|▋         | 9500/140900 [21:53<4:53:02,  7.47it/s]

{'loss': 0.1517, 'learning_rate': 1.8651525904897093e-05, 'epoch': 1.35}
{'loss': 0.1517, 'learning_rate': 1.8651525904897093e-05, 'epoch': 1.35}


  7%|▋         | 10000/140900 [23:00<4:52:00,  7.47it/s]

{'loss': 0.1496, 'learning_rate': 1.8580553584102202e-05, 'epoch': 1.42}
{'loss': 0.1496, 'learning_rate': 1.8580553584102202e-05, 'epoch': 1.42}


  7%|▋         | 10501/140900 [24:07<4:54:18,  7.38it/s]

{'loss': 0.1488, 'learning_rate': 1.850958126330731e-05, 'epoch': 1.49}
{'loss': 0.1488, 'learning_rate': 1.850958126330731e-05, 'epoch': 1.49}


  8%|▊         | 11000/140900 [25:13<4:49:13,  7.49it/s]

{'loss': 0.1493, 'learning_rate': 1.843860894251242e-05, 'epoch': 1.56}
{'loss': 0.1493, 'learning_rate': 1.843860894251242e-05, 'epoch': 1.56}


  8%|▊         | 11500/140900 [26:20<4:48:23,  7.48it/s]

{'loss': 0.1474, 'learning_rate': 1.836763662171753e-05, 'epoch': 1.63}
{'loss': 0.1474, 'learning_rate': 1.836763662171753e-05, 'epoch': 1.63}


  9%|▊         | 12001/140900 [27:27<4:51:03,  7.38it/s]

{'loss': 0.146, 'learning_rate': 1.8296664300922643e-05, 'epoch': 1.7}
{'loss': 0.146, 'learning_rate': 1.8296664300922643e-05, 'epoch': 1.7}


                                                        

{'loss': 0.1455, 'learning_rate': 1.8225691980127752e-05, 'epoch': 1.77}


  9%|▉         | 12501/140900 [28:34<4:50:10,  7.37it/s]

{'loss': 0.1455, 'learning_rate': 1.8225691980127752e-05, 'epoch': 1.77}


  9%|▉         | 13000/140900 [29:41<4:45:17,  7.47it/s]

{'loss': 0.1472, 'learning_rate': 1.815471965933286e-05, 'epoch': 1.85}
{'loss': 0.1472, 'learning_rate': 1.815471965933286e-05, 'epoch': 1.85}


 10%|▉         | 13501/140900 [30:48<4:47:51,  7.38it/s]

{'loss': 0.1449, 'learning_rate': 1.808374733853797e-05, 'epoch': 1.92}
{'loss': 0.1449, 'learning_rate': 1.808374733853797e-05, 'epoch': 1.92}


 10%|▉         | 14000/140900 [31:55<4:43:21,  7.46it/s]

{'loss': 0.1445, 'learning_rate': 1.8012775017743083e-05, 'epoch': 1.99}
{'loss': 0.1445, 'learning_rate': 1.8012775017743083e-05, 'epoch': 1.99}


 10%|▉         | 14089/140900 [32:07<4:44:49,  7.42it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.68it/s][A
  1%|          | 8/1053 [00:00<00:35, 29.03it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.58it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.82it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.39it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.06it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.98it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.80it/s][A
  3%|▎         | 29/1053 [00:01<00:41, 24.75it/s][A
  3%|▎         | 32/1053 [00:01<00:40, 25.13it/s][A
  3%|▎         | 35/1053 [00:01<00:40, 25.37it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.53it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.56it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.68it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.68it/s][A
  5%|▍         | 50/1053 [00:01<00:38, 25.76it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.82it/s][A


{'eval_loss': 0.15391471982002258, 'eval_runtime': 41.1559, 'eval_samples_per_second': 204.588, 'eval_steps_per_second': 25.586, 'epoch': 2.0}
{'eval_loss': 0.15391471982002258, 'eval_runtime': 41.1559, 'eval_samples_per_second': 204.588, 'eval_steps_per_second': 25.586, 'epoch': 2.0}


 10%|█         | 14500/140900 [33:43<4:40:41,  7.51it/s]  

{'loss': 0.145, 'learning_rate': 1.7941802696948193e-05, 'epoch': 2.06}
{'loss': 0.145, 'learning_rate': 1.7941802696948193e-05, 'epoch': 2.06}


 11%|█         | 15001/140900 [34:50<4:43:43,  7.40it/s]

{'loss': 0.1426, 'learning_rate': 1.7870830376153302e-05, 'epoch': 2.13}
{'loss': 0.1426, 'learning_rate': 1.7870830376153302e-05, 'epoch': 2.13}


 11%|█         | 15500/140900 [35:56<4:40:02,  7.46it/s]

{'loss': 0.1401, 'learning_rate': 1.779985805535841e-05, 'epoch': 2.2}
{'loss': 0.1401, 'learning_rate': 1.779985805535841e-05, 'epoch': 2.2}


 11%|█▏        | 16000/140900 [37:03<4:37:20,  7.51it/s]

{'loss': 0.1385, 'learning_rate': 1.772888573456352e-05, 'epoch': 2.27}
{'loss': 0.1385, 'learning_rate': 1.772888573456352e-05, 'epoch': 2.27}


 12%|█▏        | 16501/140900 [38:10<4:41:09,  7.37it/s]

{'loss': 0.1431, 'learning_rate': 1.7657913413768633e-05, 'epoch': 2.34}
{'loss': 0.1431, 'learning_rate': 1.7657913413768633e-05, 'epoch': 2.34}


 12%|█▏        | 17000/140900 [39:17<4:35:53,  7.49it/s]

{'loss': 0.1414, 'learning_rate': 1.7586941092973743e-05, 'epoch': 2.41}
{'loss': 0.1414, 'learning_rate': 1.7586941092973743e-05, 'epoch': 2.41}


 12%|█▏        | 17500/140900 [40:24<4:34:25,  7.49it/s]

{'loss': 0.14, 'learning_rate': 1.7515968772178852e-05, 'epoch': 2.48}
{'loss': 0.14, 'learning_rate': 1.7515968772178852e-05, 'epoch': 2.48}


 13%|█▎        | 18001/140900 [41:31<4:37:05,  7.39it/s]

{'loss': 0.1379, 'learning_rate': 1.744499645138396e-05, 'epoch': 2.56}
{'loss': 0.1379, 'learning_rate': 1.744499645138396e-05, 'epoch': 2.56}


 13%|█▎        | 18500/140900 [42:38<4:32:39,  7.48it/s]

{'loss': 0.1377, 'learning_rate': 1.737402413058907e-05, 'epoch': 2.63}
{'loss': 0.1377, 'learning_rate': 1.737402413058907e-05, 'epoch': 2.63}


 13%|█▎        | 19000/140900 [43:44<4:31:07,  7.49it/s]

{'loss': 0.1401, 'learning_rate': 1.7303051809794184e-05, 'epoch': 2.7}
{'loss': 0.1401, 'learning_rate': 1.7303051809794184e-05, 'epoch': 2.7}


 14%|█▍        | 19501/140900 [44:51<4:33:13,  7.41it/s]

{'loss': 0.1408, 'learning_rate': 1.7232079488999293e-05, 'epoch': 2.77}
{'loss': 0.1408, 'learning_rate': 1.7232079488999293e-05, 'epoch': 2.77}


 14%|█▍        | 20000/140900 [45:58<4:28:35,  7.50it/s]

{'loss': 0.1389, 'learning_rate': 1.7161107168204402e-05, 'epoch': 2.84}
{'loss': 0.1389, 'learning_rate': 1.7161107168204402e-05, 'epoch': 2.84}


 15%|█▍        | 20500/140900 [47:05<4:30:41,  7.41it/s]

{'loss': 0.1404, 'learning_rate': 1.709013484740951e-05, 'epoch': 2.91}
{'loss': 0.1404, 'learning_rate': 1.709013484740951e-05, 'epoch': 2.91}


 15%|█▍        | 21001/140900 [48:12<4:29:42,  7.41it/s]

{'loss': 0.1394, 'learning_rate': 1.701916252661462e-05, 'epoch': 2.98}
{'loss': 0.1394, 'learning_rate': 1.701916252661462e-05, 'epoch': 2.98}


 15%|█▍        | 21134/140900 [48:30<4:27:58,  7.45it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.05it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.95it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.58it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.90it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.37it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.14it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.91it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.71it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.70it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.59it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.54it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.48it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.42it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.42it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.49it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.54it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.57it/s][A


{'eval_loss': 0.15027588605880737, 'eval_runtime': 41.0803, 'eval_samples_per_second': 204.964, 'eval_steps_per_second': 25.633, 'epoch': 3.0}
{'eval_loss': 0.15027588605880737, 'eval_runtime': 41.0803, 'eval_samples_per_second': 204.964, 'eval_steps_per_second': 25.633, 'epoch': 3.0}


 15%|█▌        | 21500/140900 [50:00<4:26:44,  7.46it/s]  

{'loss': 0.1346, 'learning_rate': 1.6948190205819734e-05, 'epoch': 3.05}
{'loss': 0.1346, 'learning_rate': 1.6948190205819734e-05, 'epoch': 3.05}


 16%|█▌        | 22000/140900 [51:07<4:25:26,  7.47it/s]

{'loss': 0.1347, 'learning_rate': 1.6877217885024843e-05, 'epoch': 3.12}
{'loss': 0.1347, 'learning_rate': 1.6877217885024843e-05, 'epoch': 3.12}


 16%|█▌        | 22501/140900 [52:14<4:26:14,  7.41it/s]

{'loss': 0.134, 'learning_rate': 1.6806245564229952e-05, 'epoch': 3.19}
{'loss': 0.134, 'learning_rate': 1.6806245564229952e-05, 'epoch': 3.19}


 16%|█▋        | 23001/140900 [53:21<4:26:57,  7.36it/s]

{'loss': 0.1338, 'learning_rate': 1.673527324343506e-05, 'epoch': 3.26}
{'loss': 0.1338, 'learning_rate': 1.673527324343506e-05, 'epoch': 3.26}


 17%|█▋        | 23500/140900 [54:27<4:21:26,  7.48it/s]

{'loss': 0.1339, 'learning_rate': 1.666430092264017e-05, 'epoch': 3.34}
{'loss': 0.1339, 'learning_rate': 1.666430092264017e-05, 'epoch': 3.34}


 17%|█▋        | 24001/140900 [55:34<4:22:43,  7.42it/s]

{'loss': 0.134, 'learning_rate': 1.6593328601845284e-05, 'epoch': 3.41}
{'loss': 0.134, 'learning_rate': 1.6593328601845284e-05, 'epoch': 3.41}


 17%|█▋        | 24501/140900 [56:41<4:22:07,  7.40it/s]

{'loss': 0.1333, 'learning_rate': 1.652235628105039e-05, 'epoch': 3.48}
{'loss': 0.1333, 'learning_rate': 1.652235628105039e-05, 'epoch': 3.48}


 18%|█▊        | 25000/140900 [57:48<4:17:58,  7.49it/s]

{'loss': 0.1331, 'learning_rate': 1.6451383960255502e-05, 'epoch': 3.55}
{'loss': 0.1331, 'learning_rate': 1.6451383960255502e-05, 'epoch': 3.55}


 18%|█▊        | 25500/140900 [58:55<4:16:58,  7.48it/s]

{'loss': 0.1343, 'learning_rate': 1.6380411639460612e-05, 'epoch': 3.62}
{'loss': 0.1343, 'learning_rate': 1.6380411639460612e-05, 'epoch': 3.62}


 18%|█▊        | 26001/140900 [1:00:02<4:18:41,  7.40it/s]

{'loss': 0.1347, 'learning_rate': 1.630943931866572e-05, 'epoch': 3.69}
{'loss': 0.1347, 'learning_rate': 1.630943931866572e-05, 'epoch': 3.69}


 19%|█▉        | 26500/140900 [1:01:08<4:14:03,  7.50it/s]

{'loss': 0.1326, 'learning_rate': 1.623846699787083e-05, 'epoch': 3.76}
{'loss': 0.1326, 'learning_rate': 1.623846699787083e-05, 'epoch': 3.76}


 19%|█▉        | 27000/140900 [1:02:15<4:13:26,  7.49it/s]

{'loss': 0.1336, 'learning_rate': 1.616749467707594e-05, 'epoch': 3.83}
{'loss': 0.1336, 'learning_rate': 1.616749467707594e-05, 'epoch': 3.83}


 20%|█▉        | 27500/140900 [1:03:22<4:12:37,  7.48it/s]

{'loss': 0.1347, 'learning_rate': 1.6096522356281052e-05, 'epoch': 3.9}
{'loss': 0.1347, 'learning_rate': 1.6096522356281052e-05, 'epoch': 3.9}


 20%|█▉        | 28000/140900 [1:04:29<4:10:57,  7.50it/s]

{'loss': 0.1355, 'learning_rate': 1.6025550035486162e-05, 'epoch': 3.97}
{'loss': 0.1355, 'learning_rate': 1.6025550035486162e-05, 'epoch': 3.97}


 20%|█▉        | 28179/140900 [1:04:53<4:12:24,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.77it/s][A
  1%|          | 8/1053 [00:00<00:35, 29.21it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.71it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.96it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.51it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.21it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.00it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.84it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.66it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.67it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.71it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.54it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.69it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.70it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.65it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.66it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.60it/s][

{'eval_loss': 0.14804573357105255, 'eval_runtime': 41.1356, 'eval_samples_per_second': 204.689, 'eval_steps_per_second': 25.598, 'epoch': 4.0}


 20%|██        | 28180/140900 [1:05:34<4:12:24,  7.44it/s]
100%|██████████| 1053/1053 [00:41<00:00, 25.67it/s][A
 20%|██        | 28181/140900 [1:05:34<301:02:09,  9.61s/it]

{'eval_loss': 0.14804573357105255, 'eval_runtime': 41.1356, 'eval_samples_per_second': 204.689, 'eval_steps_per_second': 25.598, 'epoch': 4.0}


 20%|██        | 28500/140900 [1:06:17<4:10:47,  7.47it/s]  

{'loss': 0.1323, 'learning_rate': 1.595457771469127e-05, 'epoch': 4.05}
{'loss': 0.1323, 'learning_rate': 1.595457771469127e-05, 'epoch': 4.05}


 21%|██        | 29001/140900 [1:07:24<4:11:57,  7.40it/s]

{'loss': 0.1281, 'learning_rate': 1.588360539389638e-05, 'epoch': 4.12}
{'loss': 0.1281, 'learning_rate': 1.588360539389638e-05, 'epoch': 4.12}


 21%|██        | 29500/140900 [1:08:31<4:07:47,  7.49it/s]

{'loss': 0.1284, 'learning_rate': 1.581263307310149e-05, 'epoch': 4.19}
{'loss': 0.1284, 'learning_rate': 1.581263307310149e-05, 'epoch': 4.19}


 21%|██▏       | 30001/140900 [1:09:38<4:09:28,  7.41it/s]

{'loss': 0.1277, 'learning_rate': 1.5741660752306603e-05, 'epoch': 4.26}
{'loss': 0.1277, 'learning_rate': 1.5741660752306603e-05, 'epoch': 4.26}


 22%|██▏       | 30500/140900 [1:10:44<4:05:25,  7.50it/s]

{'loss': 0.1282, 'learning_rate': 1.5670688431511712e-05, 'epoch': 4.33}
{'loss': 0.1282, 'learning_rate': 1.5670688431511712e-05, 'epoch': 4.33}


 22%|██▏       | 31001/140900 [1:11:51<4:07:07,  7.41it/s]

{'loss': 0.129, 'learning_rate': 1.559971611071682e-05, 'epoch': 4.4}
{'loss': 0.129, 'learning_rate': 1.559971611071682e-05, 'epoch': 4.4}


 22%|██▏       | 31500/140900 [1:12:58<4:03:02,  7.50it/s]

{'loss': 0.1286, 'learning_rate': 1.552874378992193e-05, 'epoch': 4.47}
{'loss': 0.1286, 'learning_rate': 1.552874378992193e-05, 'epoch': 4.47}


 23%|██▎       | 32001/140900 [1:14:05<4:10:20,  7.25it/s]

{'loss': 0.128, 'learning_rate': 1.545777146912704e-05, 'epoch': 4.54}
{'loss': 0.128, 'learning_rate': 1.545777146912704e-05, 'epoch': 4.54}


 23%|██▎       | 32501/140900 [1:15:12<4:07:06,  7.31it/s]

{'loss': 0.1304, 'learning_rate': 1.5386799148332153e-05, 'epoch': 4.61}
{'loss': 0.1304, 'learning_rate': 1.5386799148332153e-05, 'epoch': 4.61}


 23%|██▎       | 33000/140900 [1:16:19<4:00:20,  7.48it/s]

{'loss': 0.1279, 'learning_rate': 1.5315826827537262e-05, 'epoch': 4.68}
{'loss': 0.1279, 'learning_rate': 1.5315826827537262e-05, 'epoch': 4.68}


 24%|██▍       | 33501/140900 [1:17:26<4:02:16,  7.39it/s]

{'loss': 0.1288, 'learning_rate': 1.5244854506742373e-05, 'epoch': 4.76}
{'loss': 0.1288, 'learning_rate': 1.5244854506742373e-05, 'epoch': 4.76}


 24%|██▍       | 34000/140900 [1:18:32<3:57:54,  7.49it/s]

{'loss': 0.1275, 'learning_rate': 1.5173882185947482e-05, 'epoch': 4.83}
{'loss': 0.1275, 'learning_rate': 1.5173882185947482e-05, 'epoch': 4.83}


 24%|██▍       | 34500/140900 [1:19:39<3:56:26,  7.50it/s]

{'loss': 0.1264, 'learning_rate': 1.510290986515259e-05, 'epoch': 4.9}
{'loss': 0.1264, 'learning_rate': 1.510290986515259e-05, 'epoch': 4.9}


 25%|██▍       | 35001/140900 [1:20:46<3:58:32,  7.40it/s]

{'loss': 0.1306, 'learning_rate': 1.5031937544357703e-05, 'epoch': 4.97}
{'loss': 0.1306, 'learning_rate': 1.5031937544357703e-05, 'epoch': 4.97}


 25%|██▍       | 35224/140900 [1:21:16<3:56:53,  7.43it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.58it/s][A
  1%|          | 8/1053 [00:00<00:35, 29.12it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.63it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 27.01it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.46it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.18it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.88it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.80it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.77it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.64it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.61it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.61it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.64it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.58it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.54it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.52it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.49it/s][

{'eval_loss': 0.14785607159137726, 'eval_runtime': 41.1525, 'eval_samples_per_second': 204.605, 'eval_steps_per_second': 25.588, 'epoch': 5.0}
{'eval_loss': 0.14785607159137726, 'eval_runtime': 41.1525, 'eval_samples_per_second': 204.605, 'eval_steps_per_second': 25.588, 'epoch': 5.0}


 25%|██▌       | 35500/140900 [1:22:34<3:54:20,  7.50it/s]  

{'loss': 0.1249, 'learning_rate': 1.496096522356281e-05, 'epoch': 5.04}
{'loss': 0.1249, 'learning_rate': 1.496096522356281e-05, 'epoch': 5.04}


 26%|██▌       | 36000/140900 [1:23:41<3:54:03,  7.47it/s]

{'loss': 0.1224, 'learning_rate': 1.4889992902767923e-05, 'epoch': 5.11}
{'loss': 0.1224, 'learning_rate': 1.4889992902767923e-05, 'epoch': 5.11}


 26%|██▌       | 36500/140900 [1:24:48<3:52:38,  7.48it/s]

{'loss': 0.1217, 'learning_rate': 1.481902058197303e-05, 'epoch': 5.18}
{'loss': 0.1217, 'learning_rate': 1.481902058197303e-05, 'epoch': 5.18}


 26%|██▋       | 37000/140900 [1:25:55<3:51:15,  7.49it/s]

{'loss': 0.1249, 'learning_rate': 1.4748048261178143e-05, 'epoch': 5.25}
{'loss': 0.1249, 'learning_rate': 1.4748048261178143e-05, 'epoch': 5.25}


 27%|██▋       | 37501/140900 [1:27:01<3:52:58,  7.40it/s]

{'loss': 0.1221, 'learning_rate': 1.4677075940383251e-05, 'epoch': 5.32}
{'loss': 0.1221, 'learning_rate': 1.4677075940383251e-05, 'epoch': 5.32}


 27%|██▋       | 38000/140900 [1:28:08<3:48:28,  7.51it/s]

{'loss': 0.123, 'learning_rate': 1.460610361958836e-05, 'epoch': 5.39}
{'loss': 0.123, 'learning_rate': 1.460610361958836e-05, 'epoch': 5.39}


 27%|██▋       | 38501/140900 [1:29:15<3:50:53,  7.39it/s]

{'loss': 0.1222, 'learning_rate': 1.4535131298793471e-05, 'epoch': 5.46}
{'loss': 0.1222, 'learning_rate': 1.4535131298793471e-05, 'epoch': 5.46}


 28%|██▊       | 39000/140900 [1:30:22<3:47:01,  7.48it/s]

{'loss': 0.1229, 'learning_rate': 1.446415897799858e-05, 'epoch': 5.54}
{'loss': 0.1229, 'learning_rate': 1.446415897799858e-05, 'epoch': 5.54}


 28%|██▊       | 39501/140900 [1:31:29<3:48:19,  7.40it/s]

{'loss': 0.1228, 'learning_rate': 1.4393186657203694e-05, 'epoch': 5.61}
{'loss': 0.1228, 'learning_rate': 1.4393186657203694e-05, 'epoch': 5.61}


 28%|██▊       | 40000/140900 [1:32:36<3:43:36,  7.52it/s]

{'loss': 0.1238, 'learning_rate': 1.4322214336408801e-05, 'epoch': 5.68}
{'loss': 0.1238, 'learning_rate': 1.4322214336408801e-05, 'epoch': 5.68}


 29%|██▊       | 40500/140900 [1:33:42<3:44:15,  7.46it/s]

{'loss': 0.1236, 'learning_rate': 1.425124201561391e-05, 'epoch': 5.75}
{'loss': 0.1236, 'learning_rate': 1.425124201561391e-05, 'epoch': 5.75}


 29%|██▉       | 41000/140900 [1:34:49<3:41:52,  7.50it/s]

{'loss': 0.1254, 'learning_rate': 1.4180269694819022e-05, 'epoch': 5.82}
{'loss': 0.1254, 'learning_rate': 1.4180269694819022e-05, 'epoch': 5.82}


 29%|██▉       | 41500/140900 [1:35:56<3:41:47,  7.47it/s]

{'loss': 0.1248, 'learning_rate': 1.4109297374024131e-05, 'epoch': 5.89}
{'loss': 0.1248, 'learning_rate': 1.4109297374024131e-05, 'epoch': 5.89}


 30%|██▉       | 42001/140900 [1:37:03<3:43:59,  7.36it/s]

{'loss': 0.1239, 'learning_rate': 1.4038325053229242e-05, 'epoch': 5.96}
{'loss': 0.1239, 'learning_rate': 1.4038325053229242e-05, 'epoch': 5.96}


 30%|██▉       | 42269/140900 [1:37:39<3:40:18,  7.46it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.54it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.50it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.42it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.79it/s][A
  2%|▏         | 17/1053 [00:00<00:40, 25.81it/s][A
  2%|▏         | 20/1053 [00:00<00:40, 25.62it/s][A
  2%|▏         | 23/1053 [00:00<00:40, 25.61it/s][A
  2%|▏         | 26/1053 [00:00<00:40, 25.62it/s][A
  3%|▎         | 29/1053 [00:01<00:40, 25.54it/s][A
  3%|▎         | 32/1053 [00:01<00:40, 25.50it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.63it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.62it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.61it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.76it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.77it/s][A
  5%|▍         | 50/1053 [00:01<00:38, 25.76it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.68it/s][

{'eval_loss': 0.14957435429096222, 'eval_runtime': 41.1056, 'eval_samples_per_second': 204.838, 'eval_steps_per_second': 25.617, 'epoch': 6.0}



 30%|███       | 42270/140900 [1:38:20<3:40:18,  7.46it/s]
100%|██████████| 1053/1053 [00:41<00:00, 25.87it/s][A
 30%|███       | 42271/140900 [1:38:20<263:13:00,  9.61s/it]

{'eval_loss': 0.14957435429096222, 'eval_runtime': 41.1056, 'eval_samples_per_second': 204.838, 'eval_steps_per_second': 25.617, 'epoch': 6.0}


 30%|███       | 42500/140900 [1:38:51<3:38:48,  7.50it/s]  

{'loss': 0.1213, 'learning_rate': 1.3967352732434351e-05, 'epoch': 6.03}
{'loss': 0.1213, 'learning_rate': 1.3967352732434351e-05, 'epoch': 6.03}


 31%|███       | 43001/140900 [1:39:58<3:40:28,  7.40it/s]

{'loss': 0.1167, 'learning_rate': 1.389638041163946e-05, 'epoch': 6.1}
{'loss': 0.1167, 'learning_rate': 1.389638041163946e-05, 'epoch': 6.1}


 31%|███       | 43500/140900 [1:41:05<3:36:34,  7.50it/s]

{'loss': 0.1197, 'learning_rate': 1.3825408090844572e-05, 'epoch': 6.17}
{'loss': 0.1197, 'learning_rate': 1.3825408090844572e-05, 'epoch': 6.17}


 31%|███       | 44000/140900 [1:42:12<3:39:12,  7.37it/s]

{'loss': 0.119, 'learning_rate': 1.3754435770049681e-05, 'epoch': 6.25}
{'loss': 0.119, 'learning_rate': 1.3754435770049681e-05, 'epoch': 6.25}


 32%|███▏      | 44501/140900 [1:43:19<3:37:52,  7.37it/s]

{'loss': 0.1197, 'learning_rate': 1.3683463449254792e-05, 'epoch': 6.32}
{'loss': 0.1197, 'learning_rate': 1.3683463449254792e-05, 'epoch': 6.32}


 32%|███▏      | 45001/140900 [1:44:25<3:37:47,  7.34it/s]

{'loss': 0.1193, 'learning_rate': 1.3612491128459901e-05, 'epoch': 6.39}
{'loss': 0.1193, 'learning_rate': 1.3612491128459901e-05, 'epoch': 6.39}


 32%|███▏      | 45500/140900 [1:45:32<3:33:39,  7.44it/s]

{'loss': 0.1183, 'learning_rate': 1.3541518807665012e-05, 'epoch': 6.46}
{'loss': 0.1183, 'learning_rate': 1.3541518807665012e-05, 'epoch': 6.46}


 33%|███▎      | 46000/140900 [1:46:39<3:31:12,  7.49it/s]

{'loss': 0.118, 'learning_rate': 1.3470546486870122e-05, 'epoch': 6.53}
{'loss': 0.118, 'learning_rate': 1.3470546486870122e-05, 'epoch': 6.53}


 33%|███▎      | 46500/140900 [1:47:46<3:29:57,  7.49it/s]

{'loss': 0.1193, 'learning_rate': 1.3399574166075231e-05, 'epoch': 6.6}
{'loss': 0.1193, 'learning_rate': 1.3399574166075231e-05, 'epoch': 6.6}


 33%|███▎      | 47000/140900 [1:48:53<3:30:00,  7.45it/s]

{'loss': 0.1177, 'learning_rate': 1.3328601845280342e-05, 'epoch': 6.67}
{'loss': 0.1177, 'learning_rate': 1.3328601845280342e-05, 'epoch': 6.67}


 34%|███▎      | 47500/140900 [1:49:59<3:27:43,  7.49it/s]

{'loss': 0.1186, 'learning_rate': 1.3257629524485451e-05, 'epoch': 6.74}
{'loss': 0.1186, 'learning_rate': 1.3257629524485451e-05, 'epoch': 6.74}


 34%|███▍      | 48000/140900 [1:51:06<3:26:52,  7.48it/s]

{'loss': 0.1169, 'learning_rate': 1.3186657203690562e-05, 'epoch': 6.81}
{'loss': 0.1169, 'learning_rate': 1.3186657203690562e-05, 'epoch': 6.81}


 34%|███▍      | 48500/140900 [1:52:13<3:25:22,  7.50it/s]

{'loss': 0.1189, 'learning_rate': 1.3115684882895672e-05, 'epoch': 6.88}
{'loss': 0.1189, 'learning_rate': 1.3115684882895672e-05, 'epoch': 6.88}


 35%|███▍      | 49001/140900 [1:53:20<3:26:45,  7.41it/s]

{'loss': 0.1206, 'learning_rate': 1.3044712562100781e-05, 'epoch': 6.96}
{'loss': 0.1206, 'learning_rate': 1.3044712562100781e-05, 'epoch': 6.96}


 35%|███▍      | 49314/140900 [1:54:02<3:25:17,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.53it/s][A
  1%|          | 8/1053 [00:00<00:35, 29.12it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.74it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.89it/s][A
  2%|▏         | 17/1053 [00:00<00:38, 26.61it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.26it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.17it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.96it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.81it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.80it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.79it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.80it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.65it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.81it/s][A
  4%|▍         | 47/1053 [00:01<00:38, 25.82it/s][A
  5%|▍         | 50/1053 [00:01<00:38, 25.79it/s][A
  5%|▌         | 53/1053 [00:02<00:40, 24.77it/s][

{'eval_loss': 0.15095998346805573, 'eval_runtime': 41.1377, 'eval_samples_per_second': 204.678, 'eval_steps_per_second': 25.597, 'epoch': 7.0}
{'eval_loss': 0.15095998346805573, 'eval_runtime': 41.1377, 'eval_samples_per_second': 204.678, 'eval_steps_per_second': 25.597, 'epoch': 7.0}


 35%|███▌      | 49501/140900 [1:55:08<3:26:24,  7.38it/s]  

{'loss': 0.1158, 'learning_rate': 1.2973740241305892e-05, 'epoch': 7.03}
{'loss': 0.1158, 'learning_rate': 1.2973740241305892e-05, 'epoch': 7.03}


 35%|███▌      | 50000/140900 [1:56:15<3:22:11,  7.49it/s]

{'loss': 0.1134, 'learning_rate': 1.2902767920511001e-05, 'epoch': 7.1}
{'loss': 0.1134, 'learning_rate': 1.2902767920511001e-05, 'epoch': 7.1}


 36%|███▌      | 50500/140900 [1:57:22<3:21:07,  7.49it/s]

{'loss': 0.1123, 'learning_rate': 1.2831795599716112e-05, 'epoch': 7.17}
{'loss': 0.1123, 'learning_rate': 1.2831795599716112e-05, 'epoch': 7.17}


 36%|███▌      | 51001/140900 [1:58:29<3:22:16,  7.41it/s]

{'loss': 0.1134, 'learning_rate': 1.2760823278921222e-05, 'epoch': 7.24}
{'loss': 0.1134, 'learning_rate': 1.2760823278921222e-05, 'epoch': 7.24}


 37%|███▋      | 51500/140900 [1:59:35<3:19:41,  7.46it/s]

{'loss': 0.113, 'learning_rate': 1.2689850958126331e-05, 'epoch': 7.31}
{'loss': 0.113, 'learning_rate': 1.2689850958126331e-05, 'epoch': 7.31}


 37%|███▋      | 52000/140900 [2:00:42<3:18:16,  7.47it/s]

{'loss': 0.1141, 'learning_rate': 1.2618878637331442e-05, 'epoch': 7.38}
{'loss': 0.1141, 'learning_rate': 1.2618878637331442e-05, 'epoch': 7.38}


 37%|███▋      | 52501/140900 [2:01:49<3:19:41,  7.38it/s]

{'loss': 0.1137, 'learning_rate': 1.2547906316536552e-05, 'epoch': 7.45}
{'loss': 0.1137, 'learning_rate': 1.2547906316536552e-05, 'epoch': 7.45}


 38%|███▊      | 53000/140900 [2:02:56<3:16:22,  7.46it/s]

{'loss': 0.1133, 'learning_rate': 1.2476933995741663e-05, 'epoch': 7.52}
{'loss': 0.1133, 'learning_rate': 1.2476933995741663e-05, 'epoch': 7.52}


 38%|███▊      | 53500/140900 [2:04:03<3:13:42,  7.52it/s]

{'loss': 0.1128, 'learning_rate': 1.2405961674946772e-05, 'epoch': 7.59}
{'loss': 0.1128, 'learning_rate': 1.2405961674946772e-05, 'epoch': 7.59}


 38%|███▊      | 54000/140900 [2:05:10<3:13:30,  7.48it/s]

{'loss': 0.1141, 'learning_rate': 1.2334989354151883e-05, 'epoch': 7.67}
{'loss': 0.1141, 'learning_rate': 1.2334989354151883e-05, 'epoch': 7.67}


 39%|███▊      | 54500/140900 [2:06:17<3:12:43,  7.47it/s]

{'loss': 0.1153, 'learning_rate': 1.2264017033356992e-05, 'epoch': 7.74}
{'loss': 0.1153, 'learning_rate': 1.2264017033356992e-05, 'epoch': 7.74}


 39%|███▉      | 55001/140900 [2:07:24<3:12:56,  7.42it/s]

{'loss': 0.1132, 'learning_rate': 1.2193044712562102e-05, 'epoch': 7.81}
{'loss': 0.1132, 'learning_rate': 1.2193044712562102e-05, 'epoch': 7.81}


 39%|███▉      | 55500/140900 [2:08:30<3:09:30,  7.51it/s]

{'loss': 0.1139, 'learning_rate': 1.2122072391767213e-05, 'epoch': 7.88}
{'loss': 0.1139, 'learning_rate': 1.2122072391767213e-05, 'epoch': 7.88}


 40%|███▉      | 56001/140900 [2:09:37<3:12:03,  7.37it/s]

{'loss': 0.1146, 'learning_rate': 1.2051100070972322e-05, 'epoch': 7.95}
{'loss': 0.1146, 'learning_rate': 1.2051100070972322e-05, 'epoch': 7.95}


 40%|███▉      | 56359/140900 [2:10:25<3:10:14,  7.41it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.87it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.63it/s][A
  1%|          | 11/1053 [00:00<00:38, 27.26it/s][A
  1%|▏         | 14/1053 [00:00<00:39, 26.55it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.21it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 25.84it/s][A
  2%|▏         | 23/1053 [00:00<00:40, 25.69it/s][A
  2%|▏         | 26/1053 [00:00<00:40, 25.55it/s][A
  3%|▎         | 29/1053 [00:01<00:40, 25.56it/s][A
  3%|▎         | 32/1053 [00:01<00:40, 25.47it/s][A
  3%|▎         | 35/1053 [00:01<00:40, 25.44it/s][A
  4%|▎         | 38/1053 [00:01<00:40, 25.37it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.38it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.52it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.51it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.62it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.69it/s][

{'eval_loss': 0.15173861384391785, 'eval_runtime': 41.1329, 'eval_samples_per_second': 204.702, 'eval_steps_per_second': 25.6, 'epoch': 8.0}
{'eval_loss': 0.15173861384391785, 'eval_runtime': 41.1329, 'eval_samples_per_second': 204.702, 'eval_steps_per_second': 25.6, 'epoch': 8.0}


 40%|████      | 56500/140900 [2:11:25<3:08:49,  7.45it/s]  

{'loss': 0.1135, 'learning_rate': 1.1980127750177433e-05, 'epoch': 8.02}
{'loss': 0.1135, 'learning_rate': 1.1980127750177433e-05, 'epoch': 8.02}


 40%|████      | 57000/140900 [2:12:32<3:07:09,  7.47it/s]

{'loss': 0.108, 'learning_rate': 1.1909155429382542e-05, 'epoch': 8.09}
{'loss': 0.108, 'learning_rate': 1.1909155429382542e-05, 'epoch': 8.09}


 41%|████      | 57501/140900 [2:13:39<3:08:19,  7.38it/s]

{'loss': 0.1102, 'learning_rate': 1.1838183108587652e-05, 'epoch': 8.16}
{'loss': 0.1102, 'learning_rate': 1.1838183108587652e-05, 'epoch': 8.16}


 41%|████      | 58000/140900 [2:14:46<3:04:53,  7.47it/s]

{'loss': 0.1093, 'learning_rate': 1.1767210787792763e-05, 'epoch': 8.23}
{'loss': 0.1093, 'learning_rate': 1.1767210787792763e-05, 'epoch': 8.23}


 42%|████▏     | 58500/140900 [2:15:53<3:03:40,  7.48it/s]

{'loss': 0.1087, 'learning_rate': 1.1696238466997872e-05, 'epoch': 8.3}
{'loss': 0.1087, 'learning_rate': 1.1696238466997872e-05, 'epoch': 8.3}


 42%|████▏     | 59001/140900 [2:16:59<3:04:18,  7.41it/s]

{'loss': 0.1095, 'learning_rate': 1.1625266146202983e-05, 'epoch': 8.37}
{'loss': 0.1095, 'learning_rate': 1.1625266146202983e-05, 'epoch': 8.37}


 42%|████▏     | 59500/140900 [2:18:06<3:01:14,  7.49it/s]

{'loss': 0.1106, 'learning_rate': 1.1554293825408092e-05, 'epoch': 8.45}
{'loss': 0.1106, 'learning_rate': 1.1554293825408092e-05, 'epoch': 8.45}


 43%|████▎     | 60000/140900 [2:19:13<3:00:02,  7.49it/s]

{'loss': 0.1075, 'learning_rate': 1.14833215046132e-05, 'epoch': 8.52}
{'loss': 0.1075, 'learning_rate': 1.14833215046132e-05, 'epoch': 8.52}


 43%|████▎     | 60501/140900 [2:20:20<3:00:23,  7.43it/s]

{'loss': 0.1099, 'learning_rate': 1.1412349183818313e-05, 'epoch': 8.59}
{'loss': 0.1099, 'learning_rate': 1.1412349183818313e-05, 'epoch': 8.59}


 43%|████▎     | 61000/140900 [2:21:27<2:57:59,  7.48it/s]

{'loss': 0.1098, 'learning_rate': 1.134137686302342e-05, 'epoch': 8.66}
{'loss': 0.1098, 'learning_rate': 1.134137686302342e-05, 'epoch': 8.66}


 44%|████▎     | 61500/140900 [2:22:34<2:56:30,  7.50it/s]

{'loss': 0.1103, 'learning_rate': 1.1270404542228533e-05, 'epoch': 8.73}
{'loss': 0.1103, 'learning_rate': 1.1270404542228533e-05, 'epoch': 8.73}


 44%|████▍     | 62000/140900 [2:23:41<2:55:44,  7.48it/s]

{'loss': 0.1081, 'learning_rate': 1.119943222143364e-05, 'epoch': 8.8}
{'loss': 0.1081, 'learning_rate': 1.119943222143364e-05, 'epoch': 8.8}


 44%|████▍     | 62500/140900 [2:24:47<2:54:45,  7.48it/s]

{'loss': 0.1087, 'learning_rate': 1.1128459900638754e-05, 'epoch': 8.87}
{'loss': 0.1087, 'learning_rate': 1.1128459900638754e-05, 'epoch': 8.87}


 45%|████▍     | 63001/140900 [2:25:54<2:55:24,  7.40it/s]

{'loss': 0.1101, 'learning_rate': 1.1057487579843861e-05, 'epoch': 8.94}
{'loss': 0.1101, 'learning_rate': 1.1057487579843861e-05, 'epoch': 8.94}


 45%|████▍     | 63404/140900 [2:26:48<2:53:24,  7.45it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.75it/s][A
  1%|          | 8/1053 [00:00<00:35, 29.11it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.66it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 27.07it/s][A
  2%|▏         | 17/1053 [00:00<00:38, 26.62it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.35it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.19it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.97it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.87it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.65it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.58it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.68it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.71it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.57it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.46it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.67it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.75it/s][

{'eval_loss': 0.15465517342090607, 'eval_runtime': 41.1069, 'eval_samples_per_second': 204.832, 'eval_steps_per_second': 25.616, 'epoch': 9.0}
{'eval_loss': 0.15465517342090607, 'eval_runtime': 41.1069, 'eval_samples_per_second': 204.832, 'eval_steps_per_second': 25.616, 'epoch': 9.0}


 45%|████▌     | 63500/140900 [2:27:42<2:51:56,  7.50it/s]  

{'loss': 0.1092, 'learning_rate': 1.098651525904897e-05, 'epoch': 9.01}
{'loss': 0.1092, 'learning_rate': 1.098651525904897e-05, 'epoch': 9.01}


 45%|████▌     | 64001/140900 [2:28:49<2:53:53,  7.37it/s]

{'loss': 0.1032, 'learning_rate': 1.0915542938254082e-05, 'epoch': 9.08}
{'loss': 0.1032, 'learning_rate': 1.0915542938254082e-05, 'epoch': 9.08}


 46%|████▌     | 64500/140900 [2:29:56<2:49:54,  7.49it/s]

{'loss': 0.1029, 'learning_rate': 1.0844570617459191e-05, 'epoch': 9.16}
{'loss': 0.1029, 'learning_rate': 1.0844570617459191e-05, 'epoch': 9.16}


 46%|████▌     | 65000/140900 [2:31:03<2:48:56,  7.49it/s]

{'loss': 0.1053, 'learning_rate': 1.0773598296664302e-05, 'epoch': 9.23}
{'loss': 0.1053, 'learning_rate': 1.0773598296664302e-05, 'epoch': 9.23}


 46%|████▋     | 65500/140900 [2:32:09<2:47:25,  7.51it/s]

{'loss': 0.1047, 'learning_rate': 1.0702625975869411e-05, 'epoch': 9.3}
{'loss': 0.1047, 'learning_rate': 1.0702625975869411e-05, 'epoch': 9.3}


 47%|████▋     | 66001/140900 [2:33:16<2:48:47,  7.40it/s]

{'loss': 0.1051, 'learning_rate': 1.063165365507452e-05, 'epoch': 9.37}
{'loss': 0.1051, 'learning_rate': 1.063165365507452e-05, 'epoch': 9.37}


 47%|████▋     | 66500/140900 [2:34:23<2:45:38,  7.49it/s]

{'loss': 0.1069, 'learning_rate': 1.0560681334279632e-05, 'epoch': 9.44}
{'loss': 0.1069, 'learning_rate': 1.0560681334279632e-05, 'epoch': 9.44}


 48%|████▊     | 67001/140900 [2:35:30<2:46:47,  7.38it/s]

{'loss': 0.1065, 'learning_rate': 1.0489709013484741e-05, 'epoch': 9.51}
{'loss': 0.1065, 'learning_rate': 1.0489709013484741e-05, 'epoch': 9.51}


 48%|████▊     | 67500/140900 [2:36:37<2:43:09,  7.50it/s]

{'loss': 0.1043, 'learning_rate': 1.0418736692689852e-05, 'epoch': 9.58}
{'loss': 0.1043, 'learning_rate': 1.0418736692689852e-05, 'epoch': 9.58}


 48%|████▊     | 68000/140900 [2:37:44<2:42:09,  7.49it/s]

{'loss': 0.1074, 'learning_rate': 1.0347764371894961e-05, 'epoch': 9.65}
{'loss': 0.1074, 'learning_rate': 1.0347764371894961e-05, 'epoch': 9.65}


 49%|████▊     | 68501/140900 [2:38:51<2:42:56,  7.41it/s]

{'loss': 0.1054, 'learning_rate': 1.027679205110007e-05, 'epoch': 9.72}
{'loss': 0.1054, 'learning_rate': 1.027679205110007e-05, 'epoch': 9.72}


 49%|████▉     | 69000/140900 [2:39:57<2:40:16,  7.48it/s]

{'loss': 0.1056, 'learning_rate': 1.0205819730305182e-05, 'epoch': 9.79}
{'loss': 0.1056, 'learning_rate': 1.0205819730305182e-05, 'epoch': 9.79}


 49%|████▉     | 69501/140900 [2:41:04<2:41:17,  7.38it/s]

{'loss': 0.1054, 'learning_rate': 1.0134847409510291e-05, 'epoch': 9.87}
{'loss': 0.1054, 'learning_rate': 1.0134847409510291e-05, 'epoch': 9.87}


 50%|████▉     | 70000/140900 [2:42:11<2:40:06,  7.38it/s]

{'loss': 0.1061, 'learning_rate': 1.0063875088715402e-05, 'epoch': 9.94}
{'loss': 0.1061, 'learning_rate': 1.0063875088715402e-05, 'epoch': 9.94}


 50%|████▉     | 70449/140900 [2:43:11<2:37:55,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.96it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.92it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.50it/s][A
  1%|▏         | 14/1053 [00:00<00:39, 26.61it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.23it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.09it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.99it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.93it/s][A
  3%|▎         | 29/1053 [00:01<00:40, 25.49it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.56it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.63it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.73it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.65it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.55it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.53it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.55it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.55it/s][

{'eval_loss': 0.1602775752544403, 'eval_runtime': 41.0903, 'eval_samples_per_second': 204.915, 'eval_steps_per_second': 25.626, 'epoch': 10.0}
{'eval_loss': 0.1602775752544403, 'eval_runtime': 41.0903, 'eval_samples_per_second': 204.915, 'eval_steps_per_second': 25.626, 'epoch': 10.0}


 50%|█████     | 70500/140900 [2:43:59<2:36:40,  7.49it/s]  

{'loss': 0.1061, 'learning_rate': 9.992902767920511e-06, 'epoch': 10.01}
{'loss': 0.1061, 'learning_rate': 9.992902767920511e-06, 'epoch': 10.01}


 50%|█████     | 71000/140900 [2:45:06<2:36:01,  7.47it/s]

{'loss': 0.1013, 'learning_rate': 9.921930447125622e-06, 'epoch': 10.08}
{'loss': 0.1013, 'learning_rate': 9.921930447125622e-06, 'epoch': 10.08}


 51%|█████     | 71501/140900 [2:46:13<2:36:22,  7.40it/s]

{'loss': 0.0989, 'learning_rate': 9.850958126330732e-06, 'epoch': 10.15}
{'loss': 0.0989, 'learning_rate': 9.850958126330732e-06, 'epoch': 10.15}


 51%|█████     | 72000/140900 [2:47:19<2:33:18,  7.49it/s]

{'loss': 0.1007, 'learning_rate': 9.779985805535843e-06, 'epoch': 10.22}
{'loss': 0.1007, 'learning_rate': 9.779985805535843e-06, 'epoch': 10.22}


 51%|█████▏    | 72501/140900 [2:48:26<2:34:10,  7.39it/s]

{'loss': 0.1002, 'learning_rate': 9.709013484740952e-06, 'epoch': 10.29}
{'loss': 0.1002, 'learning_rate': 9.709013484740952e-06, 'epoch': 10.29}


 52%|█████▏    | 73000/140900 [2:49:33<2:31:37,  7.46it/s]

{'loss': 0.102, 'learning_rate': 9.638041163946061e-06, 'epoch': 10.36}
{'loss': 0.102, 'learning_rate': 9.638041163946061e-06, 'epoch': 10.36}


 52%|█████▏    | 73500/140900 [2:50:40<2:29:58,  7.49it/s]

{'loss': 0.1018, 'learning_rate': 9.56706884315117e-06, 'epoch': 10.43}
{'loss': 0.1018, 'learning_rate': 9.56706884315117e-06, 'epoch': 10.43}


 53%|█████▎    | 74001/140900 [2:51:47<2:30:52,  7.39it/s]

{'loss': 0.1016, 'learning_rate': 9.496096522356282e-06, 'epoch': 10.5}
{'loss': 0.1016, 'learning_rate': 9.496096522356282e-06, 'epoch': 10.5}


 53%|█████▎    | 74500/140900 [2:52:54<2:27:52,  7.48it/s]

{'loss': 0.0991, 'learning_rate': 9.425124201561391e-06, 'epoch': 10.57}
{'loss': 0.0991, 'learning_rate': 9.425124201561391e-06, 'epoch': 10.57}


 53%|█████▎    | 75001/140900 [2:54:01<2:28:41,  7.39it/s]

{'loss': 0.1017, 'learning_rate': 9.354151880766502e-06, 'epoch': 10.65}
{'loss': 0.1017, 'learning_rate': 9.354151880766502e-06, 'epoch': 10.65}


 54%|█████▎    | 75500/140900 [2:55:07<2:25:23,  7.50it/s]

{'loss': 0.1027, 'learning_rate': 9.283179559971612e-06, 'epoch': 10.72}
{'loss': 0.1027, 'learning_rate': 9.283179559971612e-06, 'epoch': 10.72}


 54%|█████▍    | 76000/140900 [2:56:14<2:24:47,  7.47it/s]

{'loss': 0.102, 'learning_rate': 9.212207239176721e-06, 'epoch': 10.79}
{'loss': 0.102, 'learning_rate': 9.212207239176721e-06, 'epoch': 10.79}


 54%|█████▍    | 76501/140900 [2:57:21<2:25:45,  7.36it/s]

{'loss': 0.1035, 'learning_rate': 9.141234918381832e-06, 'epoch': 10.86}
{'loss': 0.1035, 'learning_rate': 9.141234918381832e-06, 'epoch': 10.86}


 55%|█████▍    | 77000/140900 [2:58:28<2:22:51,  7.45it/s]

{'loss': 0.1025, 'learning_rate': 9.070262597586941e-06, 'epoch': 10.93}
{'loss': 0.1025, 'learning_rate': 9.070262597586941e-06, 'epoch': 10.93}


 55%|█████▍    | 77494/140900 [2:59:34<2:22:19,  7.43it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.09it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.86it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.67it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.79it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.49it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.29it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.13it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 26.02it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.89it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.89it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.87it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.81it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.73it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.79it/s][A
  4%|▍         | 47/1053 [00:01<00:38, 25.87it/s][A
  5%|▍         | 50/1053 [00:01<00:38, 25.80it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.70it/s][

{'eval_loss': 0.1622205525636673, 'eval_runtime': 41.0992, 'eval_samples_per_second': 204.87, 'eval_steps_per_second': 25.621, 'epoch': 11.0}
{'eval_loss': 0.1622205525636673, 'eval_runtime': 41.0992, 'eval_samples_per_second': 204.87, 'eval_steps_per_second': 25.621, 'epoch': 11.0}


 55%|█████▌    | 77501/140900 [3:00:16<37:04:37,  2.11s/it] 

{'loss': 0.1016, 'learning_rate': 8.999290276792052e-06, 'epoch': 11.0}
{'loss': 0.1016, 'learning_rate': 8.999290276792052e-06, 'epoch': 11.0}


 55%|█████▌    | 78000/140900 [3:01:23<2:20:15,  7.47it/s] 

{'loss': 0.0979, 'learning_rate': 8.928317955997162e-06, 'epoch': 11.07}
{'loss': 0.0979, 'learning_rate': 8.928317955997162e-06, 'epoch': 11.07}


 56%|█████▌    | 78501/140900 [3:02:30<2:20:38,  7.39it/s]

{'loss': 0.0967, 'learning_rate': 8.857345635202271e-06, 'epoch': 11.14}
{'loss': 0.0967, 'learning_rate': 8.857345635202271e-06, 'epoch': 11.14}


 56%|█████▌    | 79000/140900 [3:03:36<2:17:30,  7.50it/s]

{'loss': 0.0969, 'learning_rate': 8.786373314407382e-06, 'epoch': 11.21}
{'loss': 0.0969, 'learning_rate': 8.786373314407382e-06, 'epoch': 11.21}


 56%|█████▋    | 79500/140900 [3:04:43<2:16:30,  7.50it/s]

{'loss': 0.0973, 'learning_rate': 8.715400993612491e-06, 'epoch': 11.28}
{'loss': 0.0973, 'learning_rate': 8.715400993612491e-06, 'epoch': 11.28}


 57%|█████▋    | 80001/140900 [3:05:50<2:17:30,  7.38it/s]

{'loss': 0.0973, 'learning_rate': 8.644428672817602e-06, 'epoch': 11.36}
{'loss': 0.0973, 'learning_rate': 8.644428672817602e-06, 'epoch': 11.36}


 57%|█████▋    | 80500/140900 [3:06:57<2:14:33,  7.48it/s]

{'loss': 0.0964, 'learning_rate': 8.573456352022712e-06, 'epoch': 11.43}
{'loss': 0.0964, 'learning_rate': 8.573456352022712e-06, 'epoch': 11.43}


 57%|█████▋    | 81001/140900 [3:08:04<2:14:43,  7.41it/s]

{'loss': 0.0976, 'learning_rate': 8.502484031227823e-06, 'epoch': 11.5}
{'loss': 0.0976, 'learning_rate': 8.502484031227823e-06, 'epoch': 11.5}


 58%|█████▊    | 81500/140900 [3:09:11<2:12:12,  7.49it/s]

{'loss': 0.0963, 'learning_rate': 8.431511710432932e-06, 'epoch': 11.57}
{'loss': 0.0963, 'learning_rate': 8.431511710432932e-06, 'epoch': 11.57}


 58%|█████▊    | 82000/140900 [3:10:17<2:11:40,  7.46it/s]

{'loss': 0.1001, 'learning_rate': 8.360539389638041e-06, 'epoch': 11.64}
{'loss': 0.1001, 'learning_rate': 8.360539389638041e-06, 'epoch': 11.64}


 59%|█████▊    | 82501/140900 [3:11:24<2:12:44,  7.33it/s]

{'loss': 0.0984, 'learning_rate': 8.289567068843152e-06, 'epoch': 11.71}
{'loss': 0.0984, 'learning_rate': 8.289567068843152e-06, 'epoch': 11.71}


 59%|█████▉    | 83000/140900 [3:12:31<2:09:42,  7.44it/s]

{'loss': 0.099, 'learning_rate': 8.218594748048262e-06, 'epoch': 11.78}
{'loss': 0.099, 'learning_rate': 8.218594748048262e-06, 'epoch': 11.78}


 59%|█████▉    | 83501/140900 [3:13:38<2:09:46,  7.37it/s]

{'loss': 0.0998, 'learning_rate': 8.147622427253373e-06, 'epoch': 11.85}
{'loss': 0.0998, 'learning_rate': 8.147622427253373e-06, 'epoch': 11.85}


 60%|█████▉    | 84000/140900 [3:14:45<2:06:39,  7.49it/s]

{'loss': 0.0984, 'learning_rate': 8.076650106458482e-06, 'epoch': 11.92}
{'loss': 0.0984, 'learning_rate': 8.076650106458482e-06, 'epoch': 11.92}


 60%|█████▉    | 84500/140900 [3:15:52<2:05:47,  7.47it/s]

{'loss': 0.0979, 'learning_rate': 8.005677785663592e-06, 'epoch': 11.99}
{'loss': 0.0979, 'learning_rate': 8.005677785663592e-06, 'epoch': 11.99}


 60%|█████▉    | 84539/140900 [3:15:57<2:06:40,  7.42it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.88it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.81it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.69it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.97it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.46it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.30it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.12it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.93it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.79it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.82it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.75it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.78it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.70it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.75it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.78it/s][A
  5%|▍         | 50/1053 [00:01<00:38, 25.72it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.70it/s][

{'eval_loss': 0.1675490140914917, 'eval_runtime': 41.1123, 'eval_samples_per_second': 204.805, 'eval_steps_per_second': 25.613, 'epoch': 12.0}
{'eval_loss': 0.1675490140914917, 'eval_runtime': 41.1123, 'eval_samples_per_second': 204.805, 'eval_steps_per_second': 25.613, 'epoch': 12.0}


 60%|██████    | 85001/140900 [3:17:40<2:05:39,  7.41it/s]  

{'loss': 0.0942, 'learning_rate': 7.9347054648687e-06, 'epoch': 12.07}
{'loss': 0.0942, 'learning_rate': 7.9347054648687e-06, 'epoch': 12.07}


 61%|██████    | 85500/140900 [3:18:47<2:03:10,  7.50it/s]

{'loss': 0.0925, 'learning_rate': 7.863733144073812e-06, 'epoch': 12.14}
{'loss': 0.0925, 'learning_rate': 7.863733144073812e-06, 'epoch': 12.14}


 61%|██████    | 86000/140900 [3:19:53<2:02:08,  7.49it/s]

{'loss': 0.0947, 'learning_rate': 7.792760823278921e-06, 'epoch': 12.21}
{'loss': 0.0947, 'learning_rate': 7.792760823278921e-06, 'epoch': 12.21}


 61%|██████▏   | 86501/140900 [3:21:00<2:02:41,  7.39it/s]

{'loss': 0.094, 'learning_rate': 7.721788502484032e-06, 'epoch': 12.28}
{'loss': 0.094, 'learning_rate': 7.721788502484032e-06, 'epoch': 12.28}


 62%|██████▏   | 87000/140900 [3:22:07<2:00:04,  7.48it/s]

{'loss': 0.0933, 'learning_rate': 7.650816181689142e-06, 'epoch': 12.35}
{'loss': 0.0933, 'learning_rate': 7.650816181689142e-06, 'epoch': 12.35}


 62%|██████▏   | 87501/140900 [3:23:14<2:00:49,  7.37it/s]

{'loss': 0.0962, 'learning_rate': 7.579843860894252e-06, 'epoch': 12.42}
{'loss': 0.0962, 'learning_rate': 7.579843860894252e-06, 'epoch': 12.42}


 62%|██████▏   | 88000/140900 [3:24:21<1:57:45,  7.49it/s]

{'loss': 0.0944, 'learning_rate': 7.508871540099362e-06, 'epoch': 12.49}
{'loss': 0.0944, 'learning_rate': 7.508871540099362e-06, 'epoch': 12.49}


 63%|██████▎   | 88501/140900 [3:25:28<1:57:56,  7.41it/s]

{'loss': 0.0945, 'learning_rate': 7.437899219304472e-06, 'epoch': 12.56}
{'loss': 0.0945, 'learning_rate': 7.437899219304472e-06, 'epoch': 12.56}


 63%|██████▎   | 89000/140900 [3:26:34<1:55:56,  7.46it/s]

{'loss': 0.0953, 'learning_rate': 7.366926898509582e-06, 'epoch': 12.63}
{'loss': 0.0953, 'learning_rate': 7.366926898509582e-06, 'epoch': 12.63}


 64%|██████▎   | 89500/140900 [3:27:41<1:54:19,  7.49it/s]

{'loss': 0.0949, 'learning_rate': 7.2959545777146925e-06, 'epoch': 12.7}
{'loss': 0.0949, 'learning_rate': 7.2959545777146925e-06, 'epoch': 12.7}


 64%|██████▍   | 90001/140900 [3:28:48<1:54:48,  7.39it/s]

{'loss': 0.0957, 'learning_rate': 7.224982256919801e-06, 'epoch': 12.78}
{'loss': 0.0957, 'learning_rate': 7.224982256919801e-06, 'epoch': 12.78}


 64%|██████▍   | 90500/140900 [3:29:55<1:52:01,  7.50it/s]

{'loss': 0.0941, 'learning_rate': 7.154009936124911e-06, 'epoch': 12.85}
{'loss': 0.0941, 'learning_rate': 7.154009936124911e-06, 'epoch': 12.85}


 65%|██████▍   | 91001/140900 [3:31:02<1:52:27,  7.40it/s]

{'loss': 0.0961, 'learning_rate': 7.083037615330021e-06, 'epoch': 12.92}
{'loss': 0.0961, 'learning_rate': 7.083037615330021e-06, 'epoch': 12.92}


 65%|██████▍   | 91501/140900 [3:32:09<1:50:42,  7.44it/s]

{'loss': 0.0953, 'learning_rate': 7.0120652945351315e-06, 'epoch': 12.99}
{'loss': 0.0953, 'learning_rate': 7.0120652945351315e-06, 'epoch': 12.99}


 65%|██████▍   | 91584/140900 [3:32:20<1:50:30,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:31, 33.50it/s][A
  1%|          | 8/1053 [00:00<00:37, 28.20it/s][A
  1%|          | 11/1053 [00:00<00:38, 27.08it/s][A
  1%|▏         | 14/1053 [00:00<00:39, 26.60it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.22it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 25.84it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.76it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.80it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.76it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.72it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.76it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.77it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.76it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.75it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.72it/s][A
  5%|▍         | 50/1053 [00:01<00:38, 25.75it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.75it/s][

{'eval_loss': 0.1706051379442215, 'eval_runtime': 41.1057, 'eval_samples_per_second': 204.838, 'eval_steps_per_second': 25.617, 'epoch': 13.0}
{'eval_loss': 0.1706051379442215, 'eval_runtime': 41.1057, 'eval_samples_per_second': 204.838, 'eval_steps_per_second': 25.617, 'epoch': 13.0}


 65%|██████▌   | 92000/140900 [3:33:57<1:48:58,  7.48it/s]  

{'loss': 0.0906, 'learning_rate': 6.9410929737402426e-06, 'epoch': 13.06}
{'loss': 0.0906, 'learning_rate': 6.9410929737402426e-06, 'epoch': 13.06}


 66%|██████▌   | 92500/140900 [3:35:03<1:47:25,  7.51it/s]

{'loss': 0.0912, 'learning_rate': 6.870120652945353e-06, 'epoch': 13.13}
{'loss': 0.0912, 'learning_rate': 6.870120652945353e-06, 'epoch': 13.13}


 66%|██████▌   | 93001/140900 [3:36:11<1:48:10,  7.38it/s]

{'loss': 0.0901, 'learning_rate': 6.799148332150461e-06, 'epoch': 13.2}
{'loss': 0.0901, 'learning_rate': 6.799148332150461e-06, 'epoch': 13.2}


 66%|██████▋   | 93500/140900 [3:37:17<1:45:35,  7.48it/s]

{'loss': 0.0906, 'learning_rate': 6.7281760113555714e-06, 'epoch': 13.27}
{'loss': 0.0906, 'learning_rate': 6.7281760113555714e-06, 'epoch': 13.27}


 67%|██████▋   | 94001/140900 [3:38:24<1:47:25,  7.28it/s]

{'loss': 0.0909, 'learning_rate': 6.657203690560682e-06, 'epoch': 13.34}
{'loss': 0.0909, 'learning_rate': 6.657203690560682e-06, 'epoch': 13.34}


 67%|██████▋   | 94501/140900 [3:39:31<1:44:41,  7.39it/s]

{'loss': 0.0907, 'learning_rate': 6.586231369765792e-06, 'epoch': 13.41}
{'loss': 0.0907, 'learning_rate': 6.586231369765792e-06, 'epoch': 13.41}


 67%|██████▋   | 95000/140900 [3:40:38<1:42:48,  7.44it/s]

{'loss': 0.0922, 'learning_rate': 6.515259048970902e-06, 'epoch': 13.48}
{'loss': 0.0922, 'learning_rate': 6.515259048970902e-06, 'epoch': 13.48}


 68%|██████▊   | 95501/140900 [3:41:45<1:42:22,  7.39it/s]

{'loss': 0.0932, 'learning_rate': 6.444286728176012e-06, 'epoch': 13.56}
{'loss': 0.0932, 'learning_rate': 6.444286728176012e-06, 'epoch': 13.56}


 68%|██████▊   | 96000/140900 [3:42:51<1:39:44,  7.50it/s]

{'loss': 0.0919, 'learning_rate': 6.3733144073811215e-06, 'epoch': 13.63}
{'loss': 0.0919, 'learning_rate': 6.3733144073811215e-06, 'epoch': 13.63}


 68%|██████▊   | 96500/140900 [3:43:58<1:38:33,  7.51it/s]

{'loss': 0.0941, 'learning_rate': 6.302342086586232e-06, 'epoch': 13.7}
{'loss': 0.0941, 'learning_rate': 6.302342086586232e-06, 'epoch': 13.7}


 69%|██████▉   | 97001/140900 [3:45:05<1:38:51,  7.40it/s]

{'loss': 0.0915, 'learning_rate': 6.231369765791342e-06, 'epoch': 13.77}
{'loss': 0.0915, 'learning_rate': 6.231369765791342e-06, 'epoch': 13.77}


 69%|██████▉   | 97500/140900 [3:46:12<1:36:44,  7.48it/s]

{'loss': 0.0921, 'learning_rate': 6.160397444996452e-06, 'epoch': 13.84}
{'loss': 0.0921, 'learning_rate': 6.160397444996452e-06, 'epoch': 13.84}


 70%|██████▉   | 98000/140900 [3:47:19<1:35:36,  7.48it/s]

{'loss': 0.0924, 'learning_rate': 6.089425124201562e-06, 'epoch': 13.91}
{'loss': 0.0924, 'learning_rate': 6.089425124201562e-06, 'epoch': 13.91}


 70%|██████▉   | 98501/140900 [3:48:26<1:35:37,  7.39it/s]

{'loss': 0.0921, 'learning_rate': 6.018452803406672e-06, 'epoch': 13.98}
{'loss': 0.0921, 'learning_rate': 6.018452803406672e-06, 'epoch': 13.98}


 70%|██████▉   | 98629/140900 [3:48:43<1:34:52,  7.43it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.83it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.76it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.50it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.86it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.47it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.23it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.95it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.98it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.95it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.90it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.68it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.69it/s][A
  4%|▍         | 41/1053 [00:01<00:40, 24.97it/s][A
  4%|▍         | 44/1053 [00:01<00:40, 25.16it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.24it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.42it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.50it/s][

{'eval_loss': 0.17268717288970947, 'eval_runtime': 41.0879, 'eval_samples_per_second': 204.926, 'eval_steps_per_second': 25.628, 'epoch': 14.0}
{'eval_loss': 0.17268717288970947, 'eval_runtime': 41.0879, 'eval_samples_per_second': 204.926, 'eval_steps_per_second': 25.628, 'epoch': 14.0}


 70%|███████   | 99000/140900 [3:50:14<1:33:03,  7.50it/s]  

{'loss': 0.09, 'learning_rate': 5.947480482611782e-06, 'epoch': 14.05}
{'loss': 0.09, 'learning_rate': 5.947480482611782e-06, 'epoch': 14.05}


 71%|███████   | 99501/140900 [3:51:21<1:32:57,  7.42it/s]

{'loss': 0.0885, 'learning_rate': 5.876508161816892e-06, 'epoch': 14.12}
{'loss': 0.0885, 'learning_rate': 5.876508161816892e-06, 'epoch': 14.12}


 71%|███████   | 100000/140900 [3:52:27<1:31:17,  7.47it/s]

{'loss': 0.0884, 'learning_rate': 5.805535841022002e-06, 'epoch': 14.19}
{'loss': 0.0884, 'learning_rate': 5.805535841022002e-06, 'epoch': 14.19}


 71%|███████▏  | 100500/140900 [3:53:34<1:29:49,  7.50it/s]

{'loss': 0.0884, 'learning_rate': 5.734563520227112e-06, 'epoch': 14.27}
{'loss': 0.0884, 'learning_rate': 5.734563520227112e-06, 'epoch': 14.27}


 72%|███████▏  | 101000/140900 [3:54:41<1:28:39,  7.50it/s]

{'loss': 0.0906, 'learning_rate': 5.6635911994322225e-06, 'epoch': 14.34}
{'loss': 0.0906, 'learning_rate': 5.6635911994322225e-06, 'epoch': 14.34}


 72%|███████▏  | 101500/140900 [3:55:48<1:27:42,  7.49it/s]

{'loss': 0.0883, 'learning_rate': 5.592618878637332e-06, 'epoch': 14.41}
{'loss': 0.0883, 'learning_rate': 5.592618878637332e-06, 'epoch': 14.41}


 72%|███████▏  | 102000/140900 [3:56:55<1:26:25,  7.50it/s]

{'loss': 0.0892, 'learning_rate': 5.521646557842442e-06, 'epoch': 14.48}
{'loss': 0.0892, 'learning_rate': 5.521646557842442e-06, 'epoch': 14.48}


 73%|███████▎  | 102501/140900 [3:58:02<1:26:26,  7.40it/s]

{'loss': 0.0882, 'learning_rate': 5.450674237047552e-06, 'epoch': 14.55}
{'loss': 0.0882, 'learning_rate': 5.450674237047552e-06, 'epoch': 14.55}


 73%|███████▎  | 103000/140900 [3:59:08<1:24:14,  7.50it/s]

{'loss': 0.0894, 'learning_rate': 5.379701916252662e-06, 'epoch': 14.62}
{'loss': 0.0894, 'learning_rate': 5.379701916252662e-06, 'epoch': 14.62}


 73%|███████▎  | 103501/140900 [4:00:15<1:24:05,  7.41it/s]

{'loss': 0.0896, 'learning_rate': 5.308729595457773e-06, 'epoch': 14.69}
{'loss': 0.0896, 'learning_rate': 5.308729595457773e-06, 'epoch': 14.69}


 74%|███████▍  | 104000/140900 [4:01:22<1:22:05,  7.49it/s]

{'loss': 0.0888, 'learning_rate': 5.237757274662883e-06, 'epoch': 14.76}
{'loss': 0.0888, 'learning_rate': 5.237757274662883e-06, 'epoch': 14.76}


 74%|███████▍  | 104501/140900 [4:02:29<1:21:50,  7.41it/s]

{'loss': 0.0885, 'learning_rate': 5.166784953867991e-06, 'epoch': 14.83}
{'loss': 0.0885, 'learning_rate': 5.166784953867991e-06, 'epoch': 14.83}


 75%|███████▍  | 105000/140900 [4:03:36<1:19:55,  7.49it/s]

{'loss': 0.0888, 'learning_rate': 5.0958126330731014e-06, 'epoch': 14.9}
{'loss': 0.0888, 'learning_rate': 5.0958126330731014e-06, 'epoch': 14.9}


 75%|███████▍  | 105501/140900 [4:04:43<1:19:49,  7.39it/s]

{'loss': 0.0906, 'learning_rate': 5.024840312278212e-06, 'epoch': 14.98}
{'loss': 0.0906, 'learning_rate': 5.024840312278212e-06, 'epoch': 14.98}


 75%|███████▍  | 105674/140900 [4:05:06<1:18:55,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.63it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.95it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.74it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.93it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.53it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.24it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.15it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 26.08it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.95it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.74it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.76it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.79it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.73it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.64it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.72it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.69it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.79it/s]

{'eval_loss': 0.17925691604614258, 'eval_runtime': 41.1237, 'eval_samples_per_second': 204.748, 'eval_steps_per_second': 25.606, 'epoch': 15.0}
{'eval_loss': 0.17925691604614258, 'eval_runtime': 41.1237, 'eval_samples_per_second': 204.748, 'eval_steps_per_second': 25.606, 'epoch': 15.0}


 75%|███████▌  | 106000/140900 [4:06:30<1:17:40,  7.49it/s] 

{'loss': 0.0884, 'learning_rate': 4.953867991483322e-06, 'epoch': 15.05}
{'loss': 0.0884, 'learning_rate': 4.953867991483322e-06, 'epoch': 15.05}


 76%|███████▌  | 106501/140900 [4:07:37<1:17:40,  7.38it/s]

{'loss': 0.0857, 'learning_rate': 4.882895670688432e-06, 'epoch': 15.12}
{'loss': 0.0857, 'learning_rate': 4.882895670688432e-06, 'epoch': 15.12}


 76%|███████▌  | 107001/140900 [4:08:44<1:16:01,  7.43it/s]

{'loss': 0.0868, 'learning_rate': 4.811923349893542e-06, 'epoch': 15.19}
{'loss': 0.0868, 'learning_rate': 4.811923349893542e-06, 'epoch': 15.19}


 76%|███████▋  | 107500/140900 [4:09:51<1:15:35,  7.36it/s]

{'loss': 0.0871, 'learning_rate': 4.740951029098652e-06, 'epoch': 15.26}
{'loss': 0.0871, 'learning_rate': 4.740951029098652e-06, 'epoch': 15.26}


 77%|███████▋  | 108001/140900 [4:10:58<1:15:08,  7.30it/s]

{'loss': 0.0846, 'learning_rate': 4.669978708303762e-06, 'epoch': 15.33}
{'loss': 0.0846, 'learning_rate': 4.669978708303762e-06, 'epoch': 15.33}


 77%|███████▋  | 108500/140900 [4:12:04<1:13:28,  7.35it/s]

{'loss': 0.086, 'learning_rate': 4.599006387508872e-06, 'epoch': 15.4}
{'loss': 0.086, 'learning_rate': 4.599006387508872e-06, 'epoch': 15.4}


 77%|███████▋  | 109000/140900 [4:13:11<1:11:21,  7.45it/s]

{'loss': 0.0878, 'learning_rate': 4.528034066713982e-06, 'epoch': 15.47}
{'loss': 0.0878, 'learning_rate': 4.528034066713982e-06, 'epoch': 15.47}


 78%|███████▊  | 109500/140900 [4:14:18<1:09:57,  7.48it/s]

{'loss': 0.0864, 'learning_rate': 4.457061745919091e-06, 'epoch': 15.54}
{'loss': 0.0864, 'learning_rate': 4.457061745919091e-06, 'epoch': 15.54}


 78%|███████▊  | 110001/140900 [4:15:25<1:09:46,  7.38it/s]

{'loss': 0.0886, 'learning_rate': 4.386089425124202e-06, 'epoch': 15.61}
{'loss': 0.0886, 'learning_rate': 4.386089425124202e-06, 'epoch': 15.61}


 78%|███████▊  | 110500/140900 [4:16:32<1:07:43,  7.48it/s]

{'loss': 0.085, 'learning_rate': 4.315117104329312e-06, 'epoch': 15.68}
{'loss': 0.085, 'learning_rate': 4.315117104329312e-06, 'epoch': 15.68}


 79%|███████▉  | 111000/140900 [4:17:39<1:06:36,  7.48it/s]

{'loss': 0.0872, 'learning_rate': 4.244144783534422e-06, 'epoch': 15.76}
{'loss': 0.0872, 'learning_rate': 4.244144783534422e-06, 'epoch': 15.76}


 79%|███████▉  | 111501/140900 [4:18:46<1:06:10,  7.40it/s]

{'loss': 0.0861, 'learning_rate': 4.173172462739532e-06, 'epoch': 15.83}
{'loss': 0.0861, 'learning_rate': 4.173172462739532e-06, 'epoch': 15.83}


 79%|███████▉  | 112000/140900 [4:19:52<1:04:24,  7.48it/s]

{'loss': 0.0877, 'learning_rate': 4.1022001419446415e-06, 'epoch': 15.9}
{'loss': 0.0877, 'learning_rate': 4.1022001419446415e-06, 'epoch': 15.9}


 80%|███████▉  | 112500/140900 [4:20:59<1:03:10,  7.49it/s]

{'loss': 0.088, 'learning_rate': 4.031227821149752e-06, 'epoch': 15.97}
{'loss': 0.088, 'learning_rate': 4.031227821149752e-06, 'epoch': 15.97}


 80%|███████▉  | 112719/140900 [4:21:28<1:03:10,  7.43it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.13it/s][A
  1%|          | 8/1053 [00:00<00:35, 29.05it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.59it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.89it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.43it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.17it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.92it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.69it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.75it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.79it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.79it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.70it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.79it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.78it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.73it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.62it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.68it/s]

{'eval_loss': 0.18119865655899048, 'eval_runtime': 41.0566, 'eval_samples_per_second': 205.083, 'eval_steps_per_second': 25.648, 'epoch': 16.0}


 80%|████████  | 112720/140900 [4:22:10<1:03:10,  7.43it/s]
100%|██████████| 1053/1053 [00:41<00:00, 25.55it/s][A
 80%|████████  | 112721/140900 [4:22:10<75:07:16,  9.60s/it]

{'eval_loss': 0.18119865655899048, 'eval_runtime': 41.0566, 'eval_samples_per_second': 205.083, 'eval_steps_per_second': 25.648, 'epoch': 16.0}


 80%|████████  | 113000/140900 [4:22:47<1:01:49,  7.52it/s] 

{'loss': 0.0854, 'learning_rate': 3.960255500354862e-06, 'epoch': 16.04}
{'loss': 0.0854, 'learning_rate': 3.960255500354862e-06, 'epoch': 16.04}


 81%|████████  | 113501/140900 [4:23:54<1:01:42,  7.40it/s]

{'loss': 0.0834, 'learning_rate': 3.889283179559972e-06, 'epoch': 16.11}
{'loss': 0.0834, 'learning_rate': 3.889283179559972e-06, 'epoch': 16.11}


 81%|████████  | 114000/140900 [4:25:01<59:50,  7.49it/s]  

{'loss': 0.0846, 'learning_rate': 3.818310858765082e-06, 'epoch': 16.18}
{'loss': 0.0846, 'learning_rate': 3.818310858765082e-06, 'epoch': 16.18}


 81%|████████▏ | 114501/140900 [4:26:08<59:30,  7.39it/s]  

{'loss': 0.0859, 'learning_rate': 3.747338537970192e-06, 'epoch': 16.25}
{'loss': 0.0859, 'learning_rate': 3.747338537970192e-06, 'epoch': 16.25}


 82%|████████▏ | 115000/140900 [4:27:14<57:40,  7.49it/s]

{'loss': 0.0842, 'learning_rate': 3.6763662171753018e-06, 'epoch': 16.32}
{'loss': 0.0842, 'learning_rate': 3.6763662171753018e-06, 'epoch': 16.32}


 82%|████████▏ | 115500/140900 [4:28:21<56:38,  7.47it/s]

{'loss': 0.0845, 'learning_rate': 3.605393896380412e-06, 'epoch': 16.39}
{'loss': 0.0845, 'learning_rate': 3.605393896380412e-06, 'epoch': 16.39}


 82%|████████▏ | 116000/140900 [4:29:28<55:27,  7.48it/s]

{'loss': 0.0851, 'learning_rate': 3.534421575585522e-06, 'epoch': 16.47}
{'loss': 0.0851, 'learning_rate': 3.534421575585522e-06, 'epoch': 16.47}


 83%|████████▎ | 116501/140900 [4:30:35<54:56,  7.40it/s]

{'loss': 0.0853, 'learning_rate': 3.463449254790632e-06, 'epoch': 16.54}
{'loss': 0.0853, 'learning_rate': 3.463449254790632e-06, 'epoch': 16.54}


 83%|████████▎ | 117000/140900 [4:31:42<53:11,  7.49it/s]

{'loss': 0.0846, 'learning_rate': 3.392476933995742e-06, 'epoch': 16.61}
{'loss': 0.0846, 'learning_rate': 3.392476933995742e-06, 'epoch': 16.61}


 83%|████████▎ | 117501/140900 [4:32:49<52:56,  7.37it/s]

{'loss': 0.0845, 'learning_rate': 3.3215046132008523e-06, 'epoch': 16.68}
{'loss': 0.0845, 'learning_rate': 3.3215046132008523e-06, 'epoch': 16.68}


 84%|████████▎ | 118000/140900 [4:33:55<51:04,  7.47it/s]

{'loss': 0.0853, 'learning_rate': 3.2505322924059616e-06, 'epoch': 16.75}
{'loss': 0.0853, 'learning_rate': 3.2505322924059616e-06, 'epoch': 16.75}


 84%|████████▍ | 118501/140900 [4:35:02<50:27,  7.40it/s]

{'loss': 0.0853, 'learning_rate': 3.179559971611072e-06, 'epoch': 16.82}
{'loss': 0.0853, 'learning_rate': 3.179559971611072e-06, 'epoch': 16.82}


 84%|████████▍ | 119000/140900 [4:36:09<48:39,  7.50it/s]

{'loss': 0.0851, 'learning_rate': 3.1085876508161824e-06, 'epoch': 16.89}
{'loss': 0.0851, 'learning_rate': 3.1085876508161824e-06, 'epoch': 16.89}


 85%|████████▍ | 119500/140900 [4:37:16<47:35,  7.50it/s]

{'loss': 0.0845, 'learning_rate': 3.0376153300212917e-06, 'epoch': 16.96}
{'loss': 0.0845, 'learning_rate': 3.0376153300212917e-06, 'epoch': 16.96}


 85%|████████▍ | 119764/140900 [4:37:51<47:20,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.30it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.68it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.57it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.88it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.54it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.29it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.07it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 26.00it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.92it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.73it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.67it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.69it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.69it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.72it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.58it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.46it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.50it/s][A

{'eval_loss': 0.1838558316230774, 'eval_runtime': 41.1266, 'eval_samples_per_second': 204.733, 'eval_steps_per_second': 25.604, 'epoch': 17.0}
{'eval_loss': 0.1838558316230774, 'eval_runtime': 41.1266, 'eval_samples_per_second': 204.733, 'eval_steps_per_second': 25.604, 'epoch': 17.0}


 85%|████████▌ | 120000/140900 [4:39:04<46:26,  7.50it/s]   

{'loss': 0.0839, 'learning_rate': 2.966643009226402e-06, 'epoch': 17.03}
{'loss': 0.0839, 'learning_rate': 2.966643009226402e-06, 'epoch': 17.03}


 86%|████████▌ | 120501/140900 [4:40:11<46:11,  7.36it/s]

{'loss': 0.0826, 'learning_rate': 2.895670688431512e-06, 'epoch': 17.1}
{'loss': 0.0826, 'learning_rate': 2.895670688431512e-06, 'epoch': 17.1}


 86%|████████▌ | 121000/140900 [4:41:17<44:24,  7.47it/s]

{'loss': 0.0815, 'learning_rate': 2.824698367636622e-06, 'epoch': 17.18}
{'loss': 0.0815, 'learning_rate': 2.824698367636622e-06, 'epoch': 17.18}


 86%|████████▌ | 121501/140900 [4:42:24<44:19,  7.29it/s]

{'loss': 0.0836, 'learning_rate': 2.753726046841732e-06, 'epoch': 17.25}
{'loss': 0.0836, 'learning_rate': 2.753726046841732e-06, 'epoch': 17.25}


 87%|████████▋ | 122000/140900 [4:43:31<42:02,  7.49it/s]

{'loss': 0.0831, 'learning_rate': 2.682753726046842e-06, 'epoch': 17.32}
{'loss': 0.0831, 'learning_rate': 2.682753726046842e-06, 'epoch': 17.32}


 87%|████████▋ | 122501/140900 [4:44:38<41:21,  7.41it/s]

{'loss': 0.0841, 'learning_rate': 2.611781405251952e-06, 'epoch': 17.39}
{'loss': 0.0841, 'learning_rate': 2.611781405251952e-06, 'epoch': 17.39}


 87%|████████▋ | 123000/140900 [4:45:45<39:48,  7.49it/s]

{'loss': 0.0822, 'learning_rate': 2.540809084457062e-06, 'epoch': 17.46}
{'loss': 0.0822, 'learning_rate': 2.540809084457062e-06, 'epoch': 17.46}


 88%|████████▊ | 123500/140900 [4:46:52<38:44,  7.48it/s]

{'loss': 0.083, 'learning_rate': 2.469836763662172e-06, 'epoch': 17.53}
{'loss': 0.083, 'learning_rate': 2.469836763662172e-06, 'epoch': 17.53}


 88%|████████▊ | 124001/140900 [4:47:59<37:59,  7.41it/s]

{'loss': 0.0828, 'learning_rate': 2.398864442867282e-06, 'epoch': 17.6}
{'loss': 0.0828, 'learning_rate': 2.398864442867282e-06, 'epoch': 17.6}


 88%|████████▊ | 124500/140900 [4:49:05<36:30,  7.49it/s]

{'loss': 0.0833, 'learning_rate': 2.327892122072392e-06, 'epoch': 17.67}
{'loss': 0.0833, 'learning_rate': 2.327892122072392e-06, 'epoch': 17.67}


 89%|████████▊ | 125000/140900 [4:50:12<35:19,  7.50it/s]

{'loss': 0.0839, 'learning_rate': 2.256919801277502e-06, 'epoch': 17.74}
{'loss': 0.0839, 'learning_rate': 2.256919801277502e-06, 'epoch': 17.74}


 89%|████████▉ | 125501/140900 [4:51:19<34:37,  7.41it/s]

{'loss': 0.0832, 'learning_rate': 2.185947480482612e-06, 'epoch': 17.81}
{'loss': 0.0832, 'learning_rate': 2.185947480482612e-06, 'epoch': 17.81}


 89%|████████▉ | 126000/140900 [4:52:26<33:15,  7.47it/s]

{'loss': 0.0834, 'learning_rate': 2.114975159687722e-06, 'epoch': 17.89}
{'loss': 0.0834, 'learning_rate': 2.114975159687722e-06, 'epoch': 17.89}


 90%|████████▉ | 126501/140900 [4:53:33<32:37,  7.36it/s]

{'loss': 0.0835, 'learning_rate': 2.044002838892832e-06, 'epoch': 17.96}
{'loss': 0.0835, 'learning_rate': 2.044002838892832e-06, 'epoch': 17.96}


 90%|████████▉ | 126809/140900 [4:54:14<31:35,  7.43it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.84it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.71it/s][A
  1%|          | 11/1053 [00:00<00:38, 27.26it/s][A
  1%|▏         | 14/1053 [00:00<00:39, 26.56it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.26it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 25.94it/s][A
  2%|▏         | 23/1053 [00:00<00:40, 25.74it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.82it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.78it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.75it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.63it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.69it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.69it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.73it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.72it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.65it/s][A
  5%|▌         | 53/1053 [00:02<00:38, 25.77it/s][A

{'eval_loss': 0.18864911794662476, 'eval_runtime': 41.0901, 'eval_samples_per_second': 204.915, 'eval_steps_per_second': 25.627, 'epoch': 18.0}
{'eval_loss': 0.18864911794662476, 'eval_runtime': 41.0901, 'eval_samples_per_second': 204.915, 'eval_steps_per_second': 25.627, 'epoch': 18.0}


 90%|█████████ | 127000/140900 [4:55:20<30:51,  7.51it/s]   

{'loss': 0.0813, 'learning_rate': 1.973030518097942e-06, 'epoch': 18.03}
{'loss': 0.0813, 'learning_rate': 1.973030518097942e-06, 'epoch': 18.03}


 90%|█████████ | 127500/140900 [4:56:27<29:49,  7.49it/s]

{'loss': 0.0821, 'learning_rate': 1.902058197303052e-06, 'epoch': 18.1}
{'loss': 0.0821, 'learning_rate': 1.902058197303052e-06, 'epoch': 18.1}


 91%|█████████ | 128000/140900 [4:57:34<28:39,  7.50it/s]

{'loss': 0.0814, 'learning_rate': 1.831085876508162e-06, 'epoch': 18.17}
{'loss': 0.0814, 'learning_rate': 1.831085876508162e-06, 'epoch': 18.17}


 91%|█████████ | 128500/140900 [4:58:41<27:36,  7.49it/s]

{'loss': 0.0805, 'learning_rate': 1.760113555713272e-06, 'epoch': 18.24}
{'loss': 0.0805, 'learning_rate': 1.760113555713272e-06, 'epoch': 18.24}


 92%|█████████▏| 129001/140900 [4:59:48<26:42,  7.42it/s]

{'loss': 0.0821, 'learning_rate': 1.6891412349183818e-06, 'epoch': 18.31}
{'loss': 0.0821, 'learning_rate': 1.6891412349183818e-06, 'epoch': 18.31}


 92%|█████████▏| 129500/140900 [5:00:54<25:23,  7.49it/s]

{'loss': 0.0807, 'learning_rate': 1.6181689141234918e-06, 'epoch': 18.38}
{'loss': 0.0807, 'learning_rate': 1.6181689141234918e-06, 'epoch': 18.38}


 92%|█████████▏| 130001/140900 [5:02:01<24:31,  7.41it/s]

{'loss': 0.0825, 'learning_rate': 1.547196593328602e-06, 'epoch': 18.45}
{'loss': 0.0825, 'learning_rate': 1.547196593328602e-06, 'epoch': 18.45}


 93%|█████████▎| 130500/140900 [5:03:08<23:09,  7.49it/s]

{'loss': 0.0823, 'learning_rate': 1.476224272533712e-06, 'epoch': 18.52}
{'loss': 0.0823, 'learning_rate': 1.476224272533712e-06, 'epoch': 18.52}


 93%|█████████▎| 131001/140900 [5:04:15<22:17,  7.40it/s]

{'loss': 0.0824, 'learning_rate': 1.405251951738822e-06, 'epoch': 18.59}
{'loss': 0.0824, 'learning_rate': 1.405251951738822e-06, 'epoch': 18.59}


 93%|█████████▎| 131500/140900 [5:05:22<20:55,  7.49it/s]

{'loss': 0.0812, 'learning_rate': 1.3342796309439321e-06, 'epoch': 18.67}
{'loss': 0.0812, 'learning_rate': 1.3342796309439321e-06, 'epoch': 18.67}


 94%|█████████▎| 132001/140900 [5:06:29<20:05,  7.38it/s]

{'loss': 0.0816, 'learning_rate': 1.263307310149042e-06, 'epoch': 18.74}
{'loss': 0.0816, 'learning_rate': 1.263307310149042e-06, 'epoch': 18.74}


 94%|█████████▍| 132500/140900 [5:07:35<18:40,  7.49it/s]

{'loss': 0.0817, 'learning_rate': 1.1923349893541519e-06, 'epoch': 18.81}
{'loss': 0.0817, 'learning_rate': 1.1923349893541519e-06, 'epoch': 18.81}


 94%|█████████▍| 133001/140900 [5:08:42<17:51,  7.37it/s]

{'loss': 0.0822, 'learning_rate': 1.121362668559262e-06, 'epoch': 18.88}
{'loss': 0.0822, 'learning_rate': 1.121362668559262e-06, 'epoch': 18.88}


 95%|█████████▍| 133500/140900 [5:09:49<16:24,  7.51it/s]

{'loss': 0.0809, 'learning_rate': 1.050390347764372e-06, 'epoch': 18.95}
{'loss': 0.0809, 'learning_rate': 1.050390347764372e-06, 'epoch': 18.95}


 95%|█████████▍| 133854/140900 [5:10:36<15:45,  7.45it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:29, 35.50it/s][A
  1%|          | 8/1053 [00:00<00:35, 29.19it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.76it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.99it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.50it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.32it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 26.16it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.98it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 26.00it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.94it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.91it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.85it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.75it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.74it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.72it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.60it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.53it/s][A

{'eval_loss': 0.1932285726070404, 'eval_runtime': 41.0879, 'eval_samples_per_second': 204.926, 'eval_steps_per_second': 25.628, 'epoch': 19.0}
{'eval_loss': 0.1932285726070404, 'eval_runtime': 41.0879, 'eval_samples_per_second': 204.926, 'eval_steps_per_second': 25.628, 'epoch': 19.0}


 95%|█████████▌| 134000/140900 [5:11:37<15:23,  7.47it/s]   

{'loss': 0.0816, 'learning_rate': 9.79418026969482e-07, 'epoch': 19.02}
{'loss': 0.0816, 'learning_rate': 9.79418026969482e-07, 'epoch': 19.02}


 95%|█████████▌| 134501/140900 [5:12:44<14:25,  7.39it/s]

{'loss': 0.0811, 'learning_rate': 9.08445706174592e-07, 'epoch': 19.09}
{'loss': 0.0811, 'learning_rate': 9.08445706174592e-07, 'epoch': 19.09}


 96%|█████████▌| 135000/140900 [5:13:51<13:06,  7.51it/s]

{'loss': 0.0811, 'learning_rate': 8.374733853797019e-07, 'epoch': 19.16}
{'loss': 0.0811, 'learning_rate': 8.374733853797019e-07, 'epoch': 19.16}


 96%|█████████▌| 135500/140900 [5:14:58<12:00,  7.49it/s]

{'loss': 0.0799, 'learning_rate': 7.66501064584812e-07, 'epoch': 19.23}
{'loss': 0.0799, 'learning_rate': 7.66501064584812e-07, 'epoch': 19.23}


 97%|█████████▋| 136000/140900 [5:16:04<10:54,  7.49it/s]

{'loss': 0.0804, 'learning_rate': 6.955287437899221e-07, 'epoch': 19.3}
{'loss': 0.0804, 'learning_rate': 6.955287437899221e-07, 'epoch': 19.3}


 97%|█████████▋| 136500/140900 [5:17:11<09:48,  7.48it/s]

{'loss': 0.0796, 'learning_rate': 6.24556422995032e-07, 'epoch': 19.38}
{'loss': 0.0796, 'learning_rate': 6.24556422995032e-07, 'epoch': 19.38}


 97%|█████████▋| 137000/140900 [5:18:18<08:42,  7.46it/s]

{'loss': 0.0803, 'learning_rate': 5.535841022001419e-07, 'epoch': 19.45}
{'loss': 0.0803, 'learning_rate': 5.535841022001419e-07, 'epoch': 19.45}


 98%|█████████▊| 137501/140900 [5:19:25<07:40,  7.38it/s]

{'loss': 0.0804, 'learning_rate': 4.82611781405252e-07, 'epoch': 19.52}
{'loss': 0.0804, 'learning_rate': 4.82611781405252e-07, 'epoch': 19.52}


 98%|█████████▊| 138000/140900 [5:20:32<06:28,  7.46it/s]

{'loss': 0.0802, 'learning_rate': 4.11639460610362e-07, 'epoch': 19.59}
{'loss': 0.0802, 'learning_rate': 4.11639460610362e-07, 'epoch': 19.59}


 98%|█████████▊| 138501/140900 [5:21:39<05:23,  7.41it/s]

{'loss': 0.0805, 'learning_rate': 3.40667139815472e-07, 'epoch': 19.66}
{'loss': 0.0805, 'learning_rate': 3.40667139815472e-07, 'epoch': 19.66}


 99%|█████████▊| 139000/140900 [5:22:45<04:14,  7.47it/s]

{'loss': 0.0811, 'learning_rate': 2.69694819020582e-07, 'epoch': 19.73}
{'loss': 0.0811, 'learning_rate': 2.69694819020582e-07, 'epoch': 19.73}


 99%|█████████▉| 139501/140900 [5:23:52<03:09,  7.38it/s]

{'loss': 0.0804, 'learning_rate': 1.98722498225692e-07, 'epoch': 19.8}
{'loss': 0.0804, 'learning_rate': 1.98722498225692e-07, 'epoch': 19.8}


 99%|█████████▉| 140000/140900 [5:24:59<01:59,  7.51it/s]

{'loss': 0.0813, 'learning_rate': 1.27750177430802e-07, 'epoch': 19.87}
{'loss': 0.0813, 'learning_rate': 1.27750177430802e-07, 'epoch': 19.87}


100%|█████████▉| 140501/140900 [5:26:06<00:53,  7.42it/s]

{'loss': 0.0797, 'learning_rate': 5.6777856635912e-08, 'epoch': 19.94}
{'loss': 0.0797, 'learning_rate': 5.6777856635912e-08, 'epoch': 19.94}


100%|█████████▉| 140899/140900 [5:26:59<00:00,  7.44it/s]
  0%|          | 0/1053 [00:00<?, ?it/s][A
  0%|          | 4/1053 [00:00<00:30, 34.86it/s][A
  1%|          | 8/1053 [00:00<00:36, 28.65it/s][A
  1%|          | 11/1053 [00:00<00:37, 27.43it/s][A
  1%|▏         | 14/1053 [00:00<00:38, 26.77it/s][A
  2%|▏         | 17/1053 [00:00<00:39, 26.30it/s][A
  2%|▏         | 20/1053 [00:00<00:39, 26.11it/s][A
  2%|▏         | 23/1053 [00:00<00:39, 25.85it/s][A
  2%|▏         | 26/1053 [00:00<00:39, 25.82it/s][A
  3%|▎         | 29/1053 [00:01<00:39, 25.67it/s][A
  3%|▎         | 32/1053 [00:01<00:39, 25.68it/s][A
  3%|▎         | 35/1053 [00:01<00:39, 25.64it/s][A
  4%|▎         | 38/1053 [00:01<00:39, 25.77it/s][A
  4%|▍         | 41/1053 [00:01<00:39, 25.45it/s][A
  4%|▍         | 44/1053 [00:01<00:39, 25.62it/s][A
  4%|▍         | 47/1053 [00:01<00:39, 25.51it/s][A
  5%|▍         | 50/1053 [00:01<00:39, 25.38it/s][A
  5%|▌         | 53/1053 [00:02<00:39, 25.45it/s][A

{'eval_loss': 0.19474487006664276, 'eval_runtime': 41.0893, 'eval_samples_per_second': 204.92, 'eval_steps_per_second': 25.627, 'epoch': 20.0}
{'eval_loss': 0.19474487006664276, 'eval_runtime': 41.0893, 'eval_samples_per_second': 204.92, 'eval_steps_per_second': 25.627, 'epoch': 20.0}
{'train_runtime': 19660.9621, 'train_samples_per_second': 57.326, 'train_steps_per_second': 7.166, 'train_loss': 0.10940194611180497, 'epoch': 20.0}
{'train_runtime': 19660.9621, 'train_samples_per_second': 57.326, 'train_steps_per_second': 7.166, 'train_loss': 0.10940194611180497, 'epoch': 20.0}





In [22]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

100%|██████████| 1053/1053 [00:41<00:00, 25.66it/s]

{'eval_loss': 0.19474487006664276, 'eval_runtime': 41.0559, 'eval_samples_per_second': 205.086, 'eval_steps_per_second': 25.648, 'epoch': 20.0}
Perplexity: 1.22





In [23]:
import random

In [24]:
len(dev_dataloader)

56354

In [25]:
trainer.model.to('cpu');
for _ in range(5):
    index = random.randint(0,len(test_dataloader)-1)
    print(f"---------true-----------")
    print(tokenizer.decode(test_dataloader[index]['input_ids']), tokenizer.decode(test_dataloader[index]['labels']))
    print(f"---------predict-----------")
    predict = trainer.model.generate(test_dataloader[index]['input_ids'].reshape((1,-1)), max_new_tokens=40, do_sample=True, top_k=50, top_p=0.95)[0]
    print(tokenizer.decode(predict))

---------true-----------


IndexError: list index out of range

In [None]:
trainer.model.to('cpu');
for _ in range(5):
    index = random.randint(0,len(test_dataloader)-1)
    print(f"---------true-----------")
    print(tokenizer.decode(test_dataloader[index]['input_ids']), tokenizer.decode(test_dataloader[index]['labels']))
    print(f"---------predict-----------")
    predict = trainer.model.generate(test_dataloader[index]['input_ids'].reshape((1,-1)), max_new_tokens=40, do_sample=True, top_k=50, top_p=0.95)[0]
    print(tokenizer.decode(predict))

In [None]:
trainer.model.to('cpu');
for _ in range(4):
    index = random.randint(0,len(dev_dataloader)-1)
    print(f"---------true-----------")
    print(tokenizer.decode(dev_dataloader[index]['input_ids']), tokenizer.decode(dev_dataloader[index]['labels']))
    print(f"---------predict-----------")
    predict = trainer.model.generate(dev_dataloader[index]['input_ids'].reshape((1,-1)), max_new_tokens=40, do_sample=True, top_k=50, top_p=0.95)[0]
    print(tokenizer.decode(predict))

In [None]:
metrics = end_train.metrics

# save train results
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

In [24]:
test_model = XLMRobertaForCausalLM.from_pretrained("/home/jupyter/datasphere/project/roberta-base-exp/checkpoint-42270")#.to('cuda')

You are using a model of type roberta to instantiate a model of type xlm-roberta. This is not supported for all configurations of models and can yield errors.


In [25]:
config = data_config
device = "cpu"

tokenizer = UNIFTokenizer(path_tok=config["path_repository"] + "data/query_vocab.json",
                               pre_train_name=config["pre_train_tokenizer"],
                               pad_flag=False,
                               max_length=config["max_sent_len"])

def prepare_data(path_data, drop_last=False):

    dev_data = json.load(open(os.path.join(path_data), 'r', encoding="utf-8"))
    target_sentences = []
    source_sentences = []
    for sample in tqdm(dev_data[:config["separate_batch"]], desc="Pars data"):
        target_sentences.append(sample['masked_query'])
        source_sentences.append(sample['question'])

    # DataLoader

    tokenized_source_sentences = [tokenizer.tkr(i) for i in source_sentences][0:40]
    tokenized_target_sentences = [tokenizer.tkr(i) for i in target_sentences][0:40]

    dataset = MTDataset_HF(tokenized_source_list=tokenized_source_sentences,
                        tokenized_target_list=tokenized_target_sentences, device=device)
    return dataset

In [26]:
test_model.to('cpu');
for _ in range(4):
    index = random.randint(0,len(dev_dataloader)-1)
    print(f"---------true-----------")
    print(tokenizer.decode(dev_dataloader[index]['input_ids']), tokenizer.decode(dev_dataloader[index]['labels']))
    print(f"---------predict-----------")
    predict = test_model.generate(dev_dataloader[index]['input_ids'].reshape((1,-1)), max_new_tokens=40, do_sample=True, top_k=50, top_p=0.95)[0]
    print(tokenizer.decode(predict))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


---------true-----------
<s>Which Attendance has an Opponent of carolina panthers ?</s> <s>SELECT Attendance FROM table WHERE Opponent = STR_VALUE_1</s>
---------predict-----------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s>Which Attendance has an Opponent of carolina panthers ?</s> insertiononentonentonentinonentonent_inininonentonentonent_in
---------true-----------
<s>What is the number of played when points against 645 ?</s> <s>SELECT COUNT Played FROM table WHERE Pts_Agst = NUM_VALUE_1</s>
---------predict-----------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s>What is the number of played when points against 645 ?</s></s>
---------true-----------
<s>What is tuesday day three when thursday day five is kamis ?</s> <s>SELECT Tuesday_Day_Three FROM table WHERE Thursday_Day_Five = STR_VALUE_1</s>
---------predict-----------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s>What is tuesday day three when thursday day five is kamis ?</s>1VALUE1</s>
---------true-----------
<s>In what parish is the sub-parish fortun ? </s> <s>SELECT Parish_(Prestegjeld) FROM table WHERE Sub-Parish_(Sogn) = STR_VALUE_1</s>
---------predict-----------
<s>In what parish is the sub-parish fortun ? </s>1 WHERE = WHERE)Par_ParParParParParParParParParParParParParParParParParParParParPar_PPParParPar
