In [1]:
import os
import torch
import gc
import json
from huggingface_hub import login
import datasets
import transformers
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline
                          )
from peft import (LoraConfig,
                  PeftConfig,
                  PeftModel,
                  get_peft_model,
                  prepare_model_for_kbit_training,
                  AutoPeftModelForCausalLM
                  )
import trl
from trl import (SFTTrainer,
                 setup_chat_format,
                 )
from multiprocessing import cpu_count
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
Could not find the bitsandbytes CUDA binary at PosixPath('/home/talia/anaconda3/envs/finetune/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so')
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [2]:
print(f"The PyTorch version is {torch.__version__}.")
print(f"Datasets version is {datasets.__version__}.")
print(f"Transformers version is {transformers.__version__}.")
print(f"TRL version is {trl.__version__}.")

The PyTorch version is 1.12.1.
Datasets version is 3.0.0.
Transformers version is 4.44.1.
TRL version is 0.11.0.


In [3]:

major_version, minor_version = torch.cuda.get_device_capability()
print(f"Cuda major version: {major_version}.\nCuda minor version: {minor_version}")
assert major_version >= 8, "Hardware not supported by Flash Attention."
print(f'CUDA capable device available - {torch.cuda.is_available()}')

Cuda major version: 8.
Cuda minor version: 6
CUDA capable device available - True


In [36]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3080 Ti. Max memory = 11.749 GB.
10.406 GB of memory reserved.


# Data Explorer

In [36]:
main_df = pd.read_csv('data/text2cypher_claudeopus.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'text2cypher_claudeopus.csv'

In [20]:
main_df.head()

Unnamed: 0,question,cypher,type,database,syntax_error,timeout,returns_results,false_schema
0,What are the top 5 movies with a runtime great...,MATCH (m:Movie)\nWHERE m.runtime > 120\nRETURN...,Simple Retrieval Queries,recommendations,False,False,True,
1,List the first 3 directors born before 1950.,MATCH (d:Director)\nWHERE d.born < date('1950-...,Simple Retrieval Queries,recommendations,False,False,True,
2,Which 5 users have rated more than 20 movies?,"MATCH (u:User)-[r:RATED]->(m:Movie)\nWITH u, c...",Simple Retrieval Queries,recommendations,False,False,True,
3,Identify the top 5 actors who have acted in mo...,MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\nWITH a...,Simple Retrieval Queries,recommendations,False,False,True,
4,What are the top 3 genres associated with movi...,MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)\nWHERE ...,Simple Retrieval Queries,recommendations,False,False,True,


In [21]:
schema_df = pd.read_csv('data/text2cypher_schemas.csv')

In [22]:
schema_df.head()

Unnamed: 0,database,schema,structured_schema
0,recommendations,Node properties:\n- **Movie**\n - `url`: STRI...,"{'node_props': {'Movie': [{'property': 'url', ..."
1,buzzoverflow,Node properties:\n- **Question**\n - `favorit...,{'node_props': {'Question': [{'property': 'fav...
2,bluesky,Node properties:\n- **User**\n - `label`: STR...,"{'node_props': {'User': [{'property': 'label',..."
3,companies,Node properties:\n- **Person**\n - `name`: ST...,{'node_props': {'Person': [{'property': 'name'...
4,fincen,Node properties:\n- **Country**\n - `location...,{'node_props': {'Country': [{'property': 'loca...


In [23]:
questions_df = pd.read_csv('data/text2cypher_questions.csv')

In [24]:
questions_df.head()

Unnamed: 0,question,type,database
0,What are the top 5 movies with a runtime great...,Simple Retrieval Queries,recommendations
1,List the first 3 directors born before 1950.,Simple Retrieval Queries,recommendations
2,Which 5 users have rated more than 20 movies?,Simple Retrieval Queries,recommendations
3,Identify the top 5 actors who have acted in mo...,Simple Retrieval Queries,recommendations
4,What are the top 3 genres associated with movi...,Simple Retrieval Queries,recommendations


# Model Definitions

In [2]:
data_path = 'data/'

In [3]:
trainer_with_repeats_file = 'parametric_trainer_with_repeats.json'

In [4]:
# Model from Hugging Face hub
model_id = "stabilityai/stable-code-instruct-3b"

# Fine-tuned model
model_path = "i_speak_cypher_3b"

In [5]:
with open(data_path+trainer_with_repeats_file, 'rb') as f:
	sampler = json.load(f)


In [6]:
sampler[123]

{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the Author nodes and extract their affiliation property!',
 'Schema': 'Graph schema: Relevant node labels and their properties (with datatypes) are:\nAuthor {affiliation: STRING}',
 'Cypher': 'MATCH (n:Author) RETURN n.affiliation'}

In [7]:
system_message = """
You are a text to Cypher query translator. {prompt}\n{schema}
"""

# Function to transform the data to conversational format {role:, content: }
def create_conversation(sample):
    return {
        "messages": [
            {"role": "system","content": system_message.format(prompt=sample["Prompt"], schema=sample["Schema"])},
            {"role": "user", "content": sample["Question"]},
            {"role": "assistant", "content": sample["Cypher"]}
        ]
    }

    


In [11]:
dataset = Dataset.from_list(sampler)
dataset = dataset.shuffle()

# Transform to required format
dataset = dataset.map(create_conversation,
                      remove_columns=dataset.features,
                      batched=False)


Map: 100%|██████████| 30116/30116 [00:02<00:00, 12065.56 examples/s]


In [12]:
#@title Split Data into Train and Test Sets
dataset = dataset.train_test_split(test_size=0.1, seed=23)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 27104
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 3012
    })
})


In [13]:
dataset["train"][32]["messages"]

[{'content': "\nYou are a text to Cypher query translator. Convert the following question into a Cypher query using the provided graph schema!\nGraph schema: Relevant node labels and their properties (with datatypes) are:\nArticle {comments: STRING}\nAuthor {}\n\nRelevant relationships are:\n{'start': Article, 'type': WRITTEN_BY, 'end': Author }\n",
  'role': 'system'},
 {'content': 'Fetch comments of the Article that are connected to Author via WRITTEN_BY!',
  'role': 'user'},
 {'content': 'MATCH (n:Article) WHERE EXISTS { MATCH (n)-[:WRITTEN_BY]->(:Author) } RETURN n.comments AS comments',
  'role': 'assistant'}]

In [14]:
#@title Save the Dataset Splits
dataset["train"].to_json(data_path+"train_dataset.json", orient="records")
dataset["test"].to_json(data_path+"test_dataset.json", orient="records")

Creating json from Arrow format: 100%|██████████| 28/28 [00:00<00:00, 90.84ba/s]
Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 116.00ba/s]


2199514

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right' # added to prevent warnings

# Set a maximum length
tokenizer.model_max_length = 2048

In [16]:

CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>'+eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>'+eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n'  + message['content'] + '<|im_end|>'+eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}"

tokenizer.chat_template = CHAT_TEMPLATE
     

In [17]:

train_dataset = load_dataset("json",
                       data_files=data_path+"train_dataset.json",
                       split="train")

Generating train split: 27104 examples [00:00, 832600.32 examples/s]


In [18]:
# Process the dataset function
def apply_chat_template(sample, tokenizer):
    messages = sample["messages"]

    # We add an empty system message if there is none
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})

    sample["text"] = tokenizer.apply_chat_template(messages,
                                                   tokenize=False)

    return sample

# Apply the chat template to the entire dataset
train_dataset = train_dataset.map(apply_chat_template,
                      fn_kwargs={"tokenizer": tokenizer},
                      remove_columns=train_dataset.features,
                      )
print(train_dataset)

Map: 100%|██████████| 27104/27104 [00:02<00:00, 11730.83 examples/s]

Dataset({
    features: ['text'],
    num_rows: 27104
})





In [19]:
print(train_dataset[123]["text"])

<|im_start|>system

You are a text to Cypher query translator. Convert the following question into a Cypher query using the provided graph schema!
Graph schema: Relevant node labels and their properties (with datatypes) are:
Author {affiliation: STRING}
Author {last_name: STRING}
<|im_end|><|endoftext|>
<|im_start|>user
Fetch the distinct values of the last_name from Author where either affiliation is unspecified or last_name is not null!<|im_end|><|endoftext|>
<|im_start|>assistant
MATCH (n:Author) WHERE n.affiliation = 'unspecified' OR n.last_name IS NOT NULL RETURN DISTINCT n.last_name AS last_name<|im_end|><|endoftext|>



In [20]:
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

In [72]:
device_map

{'': 0}

In [21]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map, #device_map,
    torch_dtype=torch.float16, #bfloat16, # change to float16 if using non-Ampere GPU
)


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


In [22]:
print(model)

StableLmForCausalLM(
  (model): StableLmModel(
    (embed_tokens): Embedding(50304, 2560)
    (layers): ModuleList(
      (0): StableLmDecoderLayer(
        (self_attn): StableLmAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
          (rotary_emb): StableLmRotaryEmbedding()
        )
        (mlp): StableLmMLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
    

In [23]:
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "c_fc", "c_proj"], # use all linear layers
        task_type="CAUSAL_LM",
)
     

In [24]:

# Adapted from  Phil Schmid blogpost
args = TrainingArguments(
    output_dir=model_path,                  # directory to save the model and repository id
    num_train_epochs=1,
    per_device_train_batch_size=1,               # number of training epochs, use 3 at most
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,            # use gradient checkpointing to save memory, use in distributed training
    gradient_checkpointing_kwargs={"use_reentrant": False}, # needed if gradient checkpoint is used
    optim="adamw_torch",              # use fused adamw optimizer
    logging_steps=10,                      # number of steps between two logs
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    tf32=True,                              # use tf32 precision for better performance
    max_grad_norm=1.0,                      # max gradient norm based on QLoRA paper
    warmup_steps=20,
    save_steps=100,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",           # use constant learning rate scheduler
)

In [25]:
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=tokenizer.model_max_length,
    tokenizer=tokenizer,
    dataset_kwargs={
        "add_special_tokens": False,  # the template adds the special tokens
        "append_concat_token": False, # no need to add additional separator token
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 27104/27104 [00:01<00:00, 15126.23 examples/s]


In [26]:
trainer.train()

  0%|          | 10/13552 [00:03<1:14:49,  3.02it/s]

{'loss': 3.4745, 'grad_norm': 2.2865517139434814, 'learning_rate': 0.0001, 'epoch': 0.0}


  0%|          | 20/13552 [00:07<1:15:28,  2.99it/s]

{'loss': 1.4653, 'grad_norm': 1.4454466104507446, 'learning_rate': 0.0002, 'epoch': 0.0}


  0%|          | 30/13552 [00:10<1:12:26,  3.11it/s]

{'loss': 0.9225, 'grad_norm': 1.3016773462295532, 'learning_rate': 0.0001999997305081868, 'epoch': 0.0}


  0%|          | 40/13552 [00:14<1:25:21,  2.64it/s]

{'loss': 0.7775, 'grad_norm': 1.7720146179199219, 'learning_rate': 0.00019999892203419976, 'epoch': 0.0}


  0%|          | 50/13552 [00:18<1:19:41,  2.82it/s]

{'loss': 0.7245, 'grad_norm': 0.8291468620300293, 'learning_rate': 0.00019999757458239635, 'epoch': 0.0}


  0%|          | 60/13552 [00:21<1:12:12,  3.11it/s]

{'loss': 0.6702, 'grad_norm': 0.9567443132400513, 'learning_rate': 0.0001999956881600392, 'epoch': 0.0}


  1%|          | 70/13552 [00:24<1:11:55,  3.12it/s]

{'loss': 0.6717, 'grad_norm': 0.8080818057060242, 'learning_rate': 0.00019999326277729575, 'epoch': 0.01}


  1%|          | 80/13552 [00:28<1:22:05,  2.73it/s]

{'loss': 0.5875, 'grad_norm': 0.5922210216522217, 'learning_rate': 0.00019999029844723845, 'epoch': 0.01}


  1%|          | 90/13552 [00:31<1:14:04,  3.03it/s]

{'loss': 0.5462, 'grad_norm': 1.0693235397338867, 'learning_rate': 0.00019998679518584453, 'epoch': 0.01}


  1%|          | 100/13552 [00:34<1:13:50,  3.04it/s]

{'loss': 0.5897, 'grad_norm': 0.5976505875587463, 'learning_rate': 0.00019998275301199602, 'epoch': 0.01}


  1%|          | 110/13552 [00:38<1:16:38,  2.92it/s]

{'loss': 0.5149, 'grad_norm': 0.821171760559082, 'learning_rate': 0.0001999781719474796, 'epoch': 0.01}


  1%|          | 120/13552 [00:41<1:12:49,  3.07it/s]

{'loss': 0.4111, 'grad_norm': 0.726222813129425, 'learning_rate': 0.00019997305201698639, 'epoch': 0.01}


  1%|          | 130/13552 [00:45<1:14:00,  3.02it/s]

{'loss': 0.4282, 'grad_norm': 0.6866040825843811, 'learning_rate': 0.000199967393248112, 'epoch': 0.01}


  1%|          | 140/13552 [00:48<1:13:18,  3.05it/s]

{'loss': 0.4266, 'grad_norm': 0.7465971112251282, 'learning_rate': 0.00019996119567135628, 'epoch': 0.01}


  1%|          | 150/13552 [00:52<1:17:00,  2.90it/s]

{'loss': 0.4899, 'grad_norm': 0.8401684761047363, 'learning_rate': 0.00019995445932012316, 'epoch': 0.01}


  1%|          | 160/13552 [00:55<1:32:04,  2.42it/s]

{'loss': 0.5448, 'grad_norm': 0.6885645389556885, 'learning_rate': 0.00019994718423072046, 'epoch': 0.01}


  1%|▏         | 170/13552 [00:59<1:14:57,  2.98it/s]

{'loss': 0.3558, 'grad_norm': 0.43466025590896606, 'learning_rate': 0.00019993937044235973, 'epoch': 0.01}


  1%|▏         | 180/13552 [01:02<1:25:57,  2.59it/s]

{'loss': 0.462, 'grad_norm': 0.36673271656036377, 'learning_rate': 0.000199931017997156, 'epoch': 0.01}


  1%|▏         | 190/13552 [01:06<1:22:00,  2.72it/s]

{'loss': 0.3863, 'grad_norm': 0.4980913996696472, 'learning_rate': 0.00019992212694012757, 'epoch': 0.01}


  1%|▏         | 200/13552 [01:10<1:26:51,  2.56it/s]

{'loss': 0.4068, 'grad_norm': 1.0073680877685547, 'learning_rate': 0.00019991269731919583, 'epoch': 0.01}


  2%|▏         | 210/13552 [01:13<1:14:41,  2.98it/s]

{'loss': 0.536, 'grad_norm': 0.28018099069595337, 'learning_rate': 0.0001999027291851848, 'epoch': 0.02}


  2%|▏         | 220/13552 [01:16<1:13:42,  3.01it/s]

{'loss': 0.4061, 'grad_norm': 0.5992564558982849, 'learning_rate': 0.0001998922225918212, 'epoch': 0.02}


  2%|▏         | 230/13552 [01:20<1:14:25,  2.98it/s]

{'loss': 0.431, 'grad_norm': 0.561007559299469, 'learning_rate': 0.00019988117759573378, 'epoch': 0.02}


  2%|▏         | 240/13552 [01:23<1:14:05,  2.99it/s]

{'loss': 0.3968, 'grad_norm': 1.961240291595459, 'learning_rate': 0.0001998695942564533, 'epoch': 0.02}


  2%|▏         | 250/13552 [01:26<1:13:33,  3.01it/s]

{'loss': 0.3714, 'grad_norm': 0.4959861636161804, 'learning_rate': 0.000199857472636412, 'epoch': 0.02}


  2%|▏         | 260/13552 [01:30<1:19:43,  2.78it/s]

{'loss': 0.3734, 'grad_norm': 0.799066424369812, 'learning_rate': 0.0001998448128009435, 'epoch': 0.02}


  2%|▏         | 270/13552 [01:34<1:28:37,  2.50it/s]

{'loss': 0.3808, 'grad_norm': 0.5764375329017639, 'learning_rate': 0.00019983161481828217, 'epoch': 0.02}


  2%|▏         | 280/13552 [01:38<1:32:28,  2.39it/s]

{'loss': 0.3799, 'grad_norm': 0.5523318648338318, 'learning_rate': 0.00019981787875956306, 'epoch': 0.02}


  2%|▏         | 290/13552 [01:41<1:17:19,  2.86it/s]

{'loss': 0.3665, 'grad_norm': 0.5030439496040344, 'learning_rate': 0.0001998036046988212, 'epoch': 0.02}


  2%|▏         | 300/13552 [01:45<1:25:07,  2.59it/s]

{'loss': 0.3687, 'grad_norm': 0.5524946451187134, 'learning_rate': 0.0001997887927129915, 'epoch': 0.02}


  2%|▏         | 310/13552 [01:48<1:15:18,  2.93it/s]

{'loss': 0.4635, 'grad_norm': 0.5270333886146545, 'learning_rate': 0.00019977344288190808, 'epoch': 0.02}


  2%|▏         | 320/13552 [01:52<1:29:00,  2.48it/s]

{'loss': 0.3203, 'grad_norm': 0.45424655079841614, 'learning_rate': 0.00019975755528830403, 'epoch': 0.02}


  2%|▏         | 330/13552 [01:56<1:18:49,  2.80it/s]

{'loss': 0.3589, 'grad_norm': 0.43465298414230347, 'learning_rate': 0.00019974113001781091, 'epoch': 0.02}


  3%|▎         | 340/13552 [01:59<1:21:28,  2.70it/s]

{'loss': 0.2617, 'grad_norm': 0.7622542977333069, 'learning_rate': 0.00019972416715895825, 'epoch': 0.03}


  3%|▎         | 350/13552 [02:03<1:20:56,  2.72it/s]

{'loss': 0.3734, 'grad_norm': 0.4177861213684082, 'learning_rate': 0.00019970666680317302, 'epoch': 0.03}


  3%|▎         | 360/13552 [02:06<1:12:37,  3.03it/s]

{'loss': 0.3287, 'grad_norm': 0.6141859292984009, 'learning_rate': 0.00019968862904477935, 'epoch': 0.03}


  3%|▎         | 370/13552 [02:10<1:14:30,  2.95it/s]

{'loss': 0.3683, 'grad_norm': 0.38956910371780396, 'learning_rate': 0.0001996700539809977, 'epoch': 0.03}


  3%|▎         | 380/13552 [02:13<1:12:26,  3.03it/s]

{'loss': 0.4697, 'grad_norm': 0.9539759755134583, 'learning_rate': 0.00019965094171194473, 'epoch': 0.03}


  3%|▎         | 390/13552 [02:17<1:14:33,  2.94it/s]

{'loss': 0.352, 'grad_norm': 0.25091490149497986, 'learning_rate': 0.00019963129234063236, 'epoch': 0.03}


  3%|▎         | 400/13552 [02:20<1:21:12,  2.70it/s]

{'loss': 0.3273, 'grad_norm': 0.6103862524032593, 'learning_rate': 0.00019961110597296753, 'epoch': 0.03}


  3%|▎         | 410/13552 [02:24<1:15:00,  2.92it/s]

{'loss': 0.2878, 'grad_norm': 0.2744615077972412, 'learning_rate': 0.00019959038271775143, 'epoch': 0.03}


  3%|▎         | 420/13552 [02:27<1:11:58,  3.04it/s]

{'loss': 0.3262, 'grad_norm': 0.7997731566429138, 'learning_rate': 0.00019956912268667907, 'epoch': 0.03}


  3%|▎         | 430/13552 [02:30<1:12:48,  3.00it/s]

{'loss': 0.2684, 'grad_norm': 0.17807601392269135, 'learning_rate': 0.00019954732599433844, 'epoch': 0.03}


  3%|▎         | 440/13552 [02:34<1:25:49,  2.55it/s]

{'loss': 0.4012, 'grad_norm': 0.6042690873146057, 'learning_rate': 0.00019952499275821022, 'epoch': 0.03}


  3%|▎         | 450/13552 [02:37<1:12:40,  3.00it/s]

{'loss': 0.2545, 'grad_norm': 0.3452781140804291, 'learning_rate': 0.00019950212309866687, 'epoch': 0.03}


  3%|▎         | 460/13552 [02:42<1:29:59,  2.42it/s]

{'loss': 0.4236, 'grad_norm': 0.3151572346687317, 'learning_rate': 0.00019947871713897212, 'epoch': 0.03}


  3%|▎         | 470/13552 [02:45<1:12:53,  2.99it/s]

{'loss': 0.3099, 'grad_norm': 0.3482968211174011, 'learning_rate': 0.00019945477500528025, 'epoch': 0.03}


  4%|▎         | 480/13552 [02:48<1:11:26,  3.05it/s]

{'loss': 0.3062, 'grad_norm': 1.2388391494750977, 'learning_rate': 0.0001994302968266354, 'epoch': 0.04}


  4%|▎         | 490/13552 [02:52<1:11:51,  3.03it/s]

{'loss': 0.3239, 'grad_norm': 0.45113039016723633, 'learning_rate': 0.00019940528273497106, 'epoch': 0.04}


  4%|▎         | 500/13552 [02:55<1:14:52,  2.91it/s]

{'loss': 0.2932, 'grad_norm': 0.2888636589050293, 'learning_rate': 0.00019937973286510897, 'epoch': 0.04}


  4%|▍         | 510/13552 [02:58<1:11:36,  3.04it/s]

{'loss': 0.3006, 'grad_norm': 0.5500810146331787, 'learning_rate': 0.00019935364735475881, 'epoch': 0.04}


  4%|▍         | 520/13552 [03:02<1:11:31,  3.04it/s]

{'loss': 0.3187, 'grad_norm': 0.31920573115348816, 'learning_rate': 0.00019932702634451718, 'epoch': 0.04}


  4%|▍         | 530/13552 [03:05<1:12:42,  2.99it/s]

{'loss': 0.2786, 'grad_norm': 0.24507692456245422, 'learning_rate': 0.00019929986997786699, 'epoch': 0.04}


  4%|▍         | 540/13552 [03:09<1:15:41,  2.87it/s]

{'loss': 0.3168, 'grad_norm': 0.6610637903213501, 'learning_rate': 0.00019927217840117658, 'epoch': 0.04}


  4%|▍         | 550/13552 [03:12<1:11:05,  3.05it/s]

{'loss': 0.2053, 'grad_norm': 1.0562325716018677, 'learning_rate': 0.00019924395176369904, 'epoch': 0.04}


  4%|▍         | 560/13552 [03:16<1:16:24,  2.83it/s]

{'loss': 0.3114, 'grad_norm': 0.44975337386131287, 'learning_rate': 0.0001992151902175713, 'epoch': 0.04}


  4%|▍         | 570/13552 [03:19<1:11:45,  3.02it/s]

{'loss': 0.32, 'grad_norm': 0.4929930567741394, 'learning_rate': 0.0001991858939178134, 'epoch': 0.04}


  4%|▍         | 580/13552 [03:23<1:11:42,  3.01it/s]

{'loss': 0.3251, 'grad_norm': 0.4238712191581726, 'learning_rate': 0.00019915606302232762, 'epoch': 0.04}


  4%|▍         | 590/13552 [03:26<1:11:48,  3.01it/s]

{'loss': 0.249, 'grad_norm': 0.47259825468063354, 'learning_rate': 0.00019912569769189758, 'epoch': 0.04}


  4%|▍         | 600/13552 [03:30<1:13:37,  2.93it/s]

{'loss': 0.2837, 'grad_norm': 0.4893158972263336, 'learning_rate': 0.00019909479809018742, 'epoch': 0.04}


  5%|▍         | 610/13552 [03:33<1:15:38,  2.85it/s]

{'loss': 0.2051, 'grad_norm': 0.1640656739473343, 'learning_rate': 0.00019906336438374096, 'epoch': 0.05}


  5%|▍         | 620/13552 [03:37<1:11:12,  3.03it/s]

{'loss': 0.2957, 'grad_norm': 0.24046577513217926, 'learning_rate': 0.00019903139674198075, 'epoch': 0.05}


  5%|▍         | 630/13552 [03:40<1:10:42,  3.05it/s]

{'loss': 0.3295, 'grad_norm': 0.569049596786499, 'learning_rate': 0.0001989988953372071, 'epoch': 0.05}


  5%|▍         | 640/13552 [03:43<1:15:29,  2.85it/s]

{'loss': 0.3358, 'grad_norm': 0.5969500541687012, 'learning_rate': 0.00019896586034459727, 'epoch': 0.05}


  5%|▍         | 650/13552 [03:47<1:10:55,  3.03it/s]

{'loss': 0.3694, 'grad_norm': 0.5855609774589539, 'learning_rate': 0.0001989322919422045, 'epoch': 0.05}


  5%|▍         | 660/13552 [03:50<1:10:53,  3.03it/s]

{'loss': 0.2327, 'grad_norm': 0.17945528030395508, 'learning_rate': 0.0001988981903109569, 'epoch': 0.05}


  5%|▍         | 670/13552 [03:54<1:12:25,  2.96it/s]

{'loss': 0.2819, 'grad_norm': 0.4372011125087738, 'learning_rate': 0.00019886355563465677, 'epoch': 0.05}


  5%|▌         | 680/13552 [03:57<1:20:17,  2.67it/s]

{'loss': 0.3204, 'grad_norm': 0.24954578280448914, 'learning_rate': 0.00019882838809997933, 'epoch': 0.05}


  5%|▌         | 690/13552 [04:01<1:20:50,  2.65it/s]

{'loss': 0.337, 'grad_norm': 0.5795125961303711, 'learning_rate': 0.00019879268789647176, 'epoch': 0.05}


  5%|▌         | 700/13552 [04:04<1:12:09,  2.97it/s]

{'loss': 0.2514, 'grad_norm': 0.5774496793746948, 'learning_rate': 0.0001987564552165524, 'epoch': 0.05}


  5%|▌         | 710/13552 [04:08<1:16:14,  2.81it/s]

{'loss': 0.216, 'grad_norm': 0.1716972291469574, 'learning_rate': 0.00019871969025550939, 'epoch': 0.05}


  5%|▌         | 720/13552 [04:11<1:11:52,  2.98it/s]

{'loss': 0.2344, 'grad_norm': 0.12967191636562347, 'learning_rate': 0.0001986823932114999, 'epoch': 0.05}


  5%|▌         | 730/13552 [04:15<1:31:57,  2.32it/s]

{'loss': 0.2271, 'grad_norm': 0.3564128577709198, 'learning_rate': 0.00019864456428554886, 'epoch': 0.05}


  5%|▌         | 740/13552 [04:19<1:14:09,  2.88it/s]

{'loss': 0.2364, 'grad_norm': 0.3232311010360718, 'learning_rate': 0.00019860620368154804, 'epoch': 0.05}


  6%|▌         | 750/13552 [04:22<1:11:31,  2.98it/s]

{'loss': 0.2763, 'grad_norm': 0.19073207676410675, 'learning_rate': 0.00019856731160625474, 'epoch': 0.06}


  6%|▌         | 760/13552 [04:26<1:24:15,  2.53it/s]

{'loss': 0.2746, 'grad_norm': 0.751444935798645, 'learning_rate': 0.00019852788826929093, 'epoch': 0.06}


  6%|▌         | 770/13552 [04:29<1:12:54,  2.92it/s]

{'loss': 0.2381, 'grad_norm': 0.23593144118785858, 'learning_rate': 0.00019848793388314192, 'epoch': 0.06}


  6%|▌         | 780/13552 [04:33<1:12:19,  2.94it/s]

{'loss': 0.2387, 'grad_norm': 0.13197632133960724, 'learning_rate': 0.0001984474486631553, 'epoch': 0.06}


  6%|▌         | 790/13552 [04:36<1:10:28,  3.02it/s]

{'loss': 0.218, 'grad_norm': 0.4523744583129883, 'learning_rate': 0.00019840643282753982, 'epoch': 0.06}


  6%|▌         | 800/13552 [04:40<1:16:16,  2.79it/s]

{'loss': 0.2447, 'grad_norm': 0.3373796045780182, 'learning_rate': 0.00019836488659736407, 'epoch': 0.06}


  6%|▌         | 810/13552 [04:43<1:10:41,  3.00it/s]

{'loss': 0.1597, 'grad_norm': 0.13070981204509735, 'learning_rate': 0.00019832281019655545, 'epoch': 0.06}


  6%|▌         | 820/13552 [04:46<1:10:23,  3.01it/s]

{'loss': 0.2688, 'grad_norm': 0.6074798703193665, 'learning_rate': 0.00019828020385189888, 'epoch': 0.06}


  6%|▌         | 830/13552 [04:50<1:12:39,  2.92it/s]

{'loss': 0.2181, 'grad_norm': 1.4307876825332642, 'learning_rate': 0.00019823706779303554, 'epoch': 0.06}


  6%|▌         | 840/13552 [04:54<1:18:43,  2.69it/s]

{'loss': 0.2833, 'grad_norm': 0.152470663189888, 'learning_rate': 0.00019819340225246177, 'epoch': 0.06}


  6%|▋         | 850/13552 [04:57<1:09:31,  3.05it/s]

{'loss': 0.2925, 'grad_norm': 0.7799125909805298, 'learning_rate': 0.0001981492074655277, 'epoch': 0.06}


  6%|▋         | 860/13552 [05:00<1:12:42,  2.91it/s]

{'loss': 0.2914, 'grad_norm': 0.722346842288971, 'learning_rate': 0.0001981044836704359, 'epoch': 0.06}


  6%|▋         | 870/13552 [05:04<1:14:11,  2.85it/s]

{'loss': 0.3019, 'grad_norm': 0.725884735584259, 'learning_rate': 0.0001980592311082404, 'epoch': 0.06}


  6%|▋         | 880/13552 [05:07<1:11:43,  2.94it/s]

{'loss': 0.2521, 'grad_norm': 0.467917263507843, 'learning_rate': 0.00019801345002284508, 'epoch': 0.06}


  7%|▋         | 890/13552 [05:11<1:15:10,  2.81it/s]

{'loss': 0.2699, 'grad_norm': 0.3540831506252289, 'learning_rate': 0.00019796714066100247, 'epoch': 0.07}


  7%|▋         | 900/13552 [05:14<1:17:42,  2.71it/s]

{'loss': 0.2384, 'grad_norm': 0.5936291217803955, 'learning_rate': 0.00019792030327231246, 'epoch': 0.07}


  7%|▋         | 910/13552 [05:18<1:12:32,  2.90it/s]

{'loss': 0.2387, 'grad_norm': 0.1781316101551056, 'learning_rate': 0.0001978729381092209, 'epoch': 0.07}


  7%|▋         | 920/13552 [05:21<1:10:50,  2.97it/s]

{'loss': 0.2389, 'grad_norm': 0.1885126531124115, 'learning_rate': 0.00019782504542701826, 'epoch': 0.07}


  7%|▋         | 930/13552 [05:25<1:17:16,  2.72it/s]

{'loss': 0.2618, 'grad_norm': 0.35358530282974243, 'learning_rate': 0.0001977766254838383, 'epoch': 0.07}


  7%|▋         | 940/13552 [05:28<1:12:03,  2.92it/s]

{'loss': 0.2156, 'grad_norm': 0.22988909482955933, 'learning_rate': 0.0001977276785406565, 'epoch': 0.07}


  7%|▋         | 950/13552 [05:32<1:12:23,  2.90it/s]

{'loss': 0.2595, 'grad_norm': 0.5442370772361755, 'learning_rate': 0.00019767820486128894, 'epoch': 0.07}


  7%|▋         | 960/13552 [05:36<1:18:22,  2.68it/s]

{'loss': 0.2569, 'grad_norm': 0.18314583599567413, 'learning_rate': 0.00019762820471239066, 'epoch': 0.07}


  7%|▋         | 970/13552 [05:39<1:25:05,  2.46it/s]

{'loss': 0.2572, 'grad_norm': 0.8357760310173035, 'learning_rate': 0.0001975776783634542, 'epoch': 0.07}


  7%|▋         | 980/13552 [05:43<1:17:54,  2.69it/s]

{'loss': 0.2232, 'grad_norm': 0.21006084978580475, 'learning_rate': 0.00019752662608680835, 'epoch': 0.07}


  7%|▋         | 990/13552 [05:47<1:11:18,  2.94it/s]

{'loss': 0.2345, 'grad_norm': 0.5350012183189392, 'learning_rate': 0.00019747504815761658, 'epoch': 0.07}


  7%|▋         | 1000/13552 [05:50<1:07:22,  3.11it/s]

{'loss': 0.2166, 'grad_norm': 0.20142392814159393, 'learning_rate': 0.0001974229448538754, 'epoch': 0.07}


  7%|▋         | 1010/13552 [05:53<1:07:11,  3.11it/s]

{'loss': 0.1795, 'grad_norm': 0.2325126826763153, 'learning_rate': 0.0001973703164564131, 'epoch': 0.07}


  8%|▊         | 1020/13552 [05:57<1:09:32,  3.00it/s]

{'loss': 0.2935, 'grad_norm': 0.8023693561553955, 'learning_rate': 0.00019731716324888817, 'epoch': 0.08}


  8%|▊         | 1030/13552 [06:01<1:12:25,  2.88it/s]

{'loss': 0.2181, 'grad_norm': 0.09383732825517654, 'learning_rate': 0.00019726348551778763, 'epoch': 0.08}


  8%|▊         | 1040/13552 [06:04<1:06:47,  3.12it/s]

{'loss': 0.1958, 'grad_norm': 0.23851823806762695, 'learning_rate': 0.00019720928355242568, 'epoch': 0.08}


  8%|▊         | 1050/13552 [06:07<1:07:16,  3.10it/s]

{'loss': 0.2348, 'grad_norm': 0.5699799656867981, 'learning_rate': 0.0001971545576449421, 'epoch': 0.08}


  8%|▊         | 1060/13552 [06:11<1:29:17,  2.33it/s]

{'loss': 0.2327, 'grad_norm': 0.4616710841655731, 'learning_rate': 0.00019709930809030054, 'epoch': 0.08}


  8%|▊         | 1070/13552 [06:14<1:07:35,  3.08it/s]

{'loss': 0.1917, 'grad_norm': 0.49995189905166626, 'learning_rate': 0.000197043535186287, 'epoch': 0.08}


  8%|▊         | 1080/13552 [06:17<1:06:57,  3.10it/s]

{'loss': 0.2071, 'grad_norm': 0.19128470122814178, 'learning_rate': 0.00019698723923350836, 'epoch': 0.08}


  8%|▊         | 1090/13552 [06:21<1:12:43,  2.86it/s]

{'loss': 0.2079, 'grad_norm': 0.43932992219924927, 'learning_rate': 0.00019693042053539058, 'epoch': 0.08}


  8%|▊         | 1100/13552 [06:24<1:26:49,  2.39it/s]

{'loss': 0.235, 'grad_norm': 2.2044296264648438, 'learning_rate': 0.0001968730793981771, 'epoch': 0.08}


  8%|▊         | 1110/13552 [06:28<1:20:26,  2.58it/s]

{'loss': 0.1765, 'grad_norm': 0.6667661666870117, 'learning_rate': 0.0001968152161309273, 'epoch': 0.08}


  8%|▊         | 1120/13552 [06:32<1:16:15,  2.72it/s]

{'loss': 0.2189, 'grad_norm': 0.33294355869293213, 'learning_rate': 0.0001967568310455147, 'epoch': 0.08}


  8%|▊         | 1130/13552 [06:35<1:07:20,  3.07it/s]

{'loss': 0.2691, 'grad_norm': 0.6613542437553406, 'learning_rate': 0.00019669792445662534, 'epoch': 0.08}


  8%|▊         | 1140/13552 [06:39<1:16:43,  2.70it/s]

{'loss': 0.2158, 'grad_norm': 0.7958184480667114, 'learning_rate': 0.00019663849668175612, 'epoch': 0.08}


  8%|▊         | 1150/13552 [06:43<1:16:59,  2.68it/s]

{'loss': 0.2413, 'grad_norm': 0.28786274790763855, 'learning_rate': 0.000196578548041213, 'epoch': 0.08}


  9%|▊         | 1160/13552 [06:46<1:14:49,  2.76it/s]

{'loss': 0.298, 'grad_norm': 0.5102984309196472, 'learning_rate': 0.00019651807885810932, 'epoch': 0.09}


  9%|▊         | 1170/13552 [06:50<1:15:40,  2.73it/s]

{'loss': 0.2359, 'grad_norm': 0.12360230088233948, 'learning_rate': 0.00019645708945836412, 'epoch': 0.09}


  9%|▊         | 1180/13552 [06:54<1:12:25,  2.85it/s]

{'loss': 0.1753, 'grad_norm': 0.18804627656936646, 'learning_rate': 0.00019639558017070025, 'epoch': 0.09}


  9%|▉         | 1190/13552 [06:57<1:16:48,  2.68it/s]

{'loss': 0.1932, 'grad_norm': 0.8156444430351257, 'learning_rate': 0.00019633355132664268, 'epoch': 0.09}


  9%|▉         | 1200/13552 [07:01<1:14:43,  2.76it/s]

{'loss': 0.2003, 'grad_norm': 0.14798560738563538, 'learning_rate': 0.00019627100326051675, 'epoch': 0.09}


  9%|▉         | 1210/13552 [07:04<1:07:41,  3.04it/s]

{'loss': 0.1841, 'grad_norm': 0.5467005372047424, 'learning_rate': 0.0001962079363094463, 'epoch': 0.09}


  9%|▉         | 1220/13552 [07:07<1:10:31,  2.91it/s]

{'loss': 0.2162, 'grad_norm': 0.11808415502309799, 'learning_rate': 0.00019614435081335185, 'epoch': 0.09}


  9%|▉         | 1230/13552 [07:11<1:07:54,  3.02it/s]

{'loss': 0.1811, 'grad_norm': 0.12182354182004929, 'learning_rate': 0.00019608024711494882, 'epoch': 0.09}


  9%|▉         | 1240/13552 [07:14<1:07:50,  3.02it/s]

{'loss': 0.1977, 'grad_norm': 0.35284531116485596, 'learning_rate': 0.00019601562555974565, 'epoch': 0.09}


  9%|▉         | 1250/13552 [07:17<1:07:15,  3.05it/s]

{'loss': 0.2118, 'grad_norm': 1.8707176446914673, 'learning_rate': 0.00019595048649604193, 'epoch': 0.09}


  9%|▉         | 1260/13552 [07:21<1:13:41,  2.78it/s]

{'loss': 0.1654, 'grad_norm': 0.48044711351394653, 'learning_rate': 0.0001958848302749266, 'epoch': 0.09}


  9%|▉         | 1270/13552 [07:24<1:07:35,  3.03it/s]

{'loss': 0.246, 'grad_norm': 0.34216558933258057, 'learning_rate': 0.00019581865725027585, 'epoch': 0.09}


  9%|▉         | 1280/13552 [07:28<1:07:33,  3.03it/s]

{'loss': 0.1758, 'grad_norm': 0.22731474041938782, 'learning_rate': 0.0001957519677787515, 'epoch': 0.09}


 10%|▉         | 1290/13552 [07:31<1:08:57,  2.96it/s]

{'loss': 0.2031, 'grad_norm': 1.246828317642212, 'learning_rate': 0.0001956847622197989, 'epoch': 0.1}


 10%|▉         | 1300/13552 [07:34<1:08:39,  2.97it/s]

{'loss': 0.2042, 'grad_norm': 0.9397076964378357, 'learning_rate': 0.000195617040935645, 'epoch': 0.1}


 10%|▉         | 1310/13552 [07:38<1:07:20,  3.03it/s]

{'loss': 0.2155, 'grad_norm': 0.3294781446456909, 'learning_rate': 0.00019554880429129644, 'epoch': 0.1}


 10%|▉         | 1320/13552 [07:41<1:14:00,  2.75it/s]

{'loss': 0.2002, 'grad_norm': 0.3130757212638855, 'learning_rate': 0.0001954800526545375, 'epoch': 0.1}


 10%|▉         | 1330/13552 [07:45<1:07:06,  3.04it/s]

{'loss': 0.1706, 'grad_norm': 0.12251204252243042, 'learning_rate': 0.0001954107863959283, 'epoch': 0.1}


 10%|▉         | 1340/13552 [07:48<1:07:10,  3.03it/s]

{'loss': 0.1893, 'grad_norm': 0.15945473313331604, 'learning_rate': 0.0001953410058888026, 'epoch': 0.1}


 10%|▉         | 1350/13552 [07:51<1:07:14,  3.02it/s]

{'loss': 0.2104, 'grad_norm': 0.48522278666496277, 'learning_rate': 0.00019527071150926597, 'epoch': 0.1}


 10%|█         | 1360/13552 [07:55<1:06:41,  3.05it/s]

{'loss': 0.2109, 'grad_norm': 0.6499018669128418, 'learning_rate': 0.00019519990363619353, 'epoch': 0.1}


 10%|█         | 1370/13552 [07:58<1:10:25,  2.88it/s]

{'loss': 0.2329, 'grad_norm': 0.5570711493492126, 'learning_rate': 0.0001951285826512282, 'epoch': 0.1}


 10%|█         | 1380/13552 [08:02<1:18:01,  2.60it/s]

{'loss': 0.2461, 'grad_norm': 1.0008074045181274, 'learning_rate': 0.0001950567489387783, 'epoch': 0.1}


 10%|█         | 1390/13552 [08:05<1:07:06,  3.02it/s]

{'loss': 0.1974, 'grad_norm': 0.536543607711792, 'learning_rate': 0.00019498440288601588, 'epoch': 0.1}


 10%|█         | 1400/13552 [08:09<1:27:43,  2.31it/s]

{'loss': 0.2085, 'grad_norm': 0.32895490527153015, 'learning_rate': 0.00019491154488287426, 'epoch': 0.1}


 10%|█         | 1410/13552 [08:13<1:10:43,  2.86it/s]

{'loss': 0.2127, 'grad_norm': 1.044548511505127, 'learning_rate': 0.0001948381753220462, 'epoch': 0.1}


 10%|█         | 1420/13552 [08:16<1:06:54,  3.02it/s]

{'loss': 0.2119, 'grad_norm': 0.9899427890777588, 'learning_rate': 0.0001947642945989816, 'epoch': 0.1}


 11%|█         | 1430/13552 [08:20<1:10:43,  2.86it/s]

{'loss': 0.1977, 'grad_norm': 0.3404589891433716, 'learning_rate': 0.00019468990311188544, 'epoch': 0.11}


 11%|█         | 1440/13552 [08:23<1:07:32,  2.99it/s]

{'loss': 0.2004, 'grad_norm': 0.5025002956390381, 'learning_rate': 0.00019461500126171565, 'epoch': 0.11}


 11%|█         | 1450/13552 [08:27<1:14:55,  2.69it/s]

{'loss': 0.2018, 'grad_norm': 0.22585459053516388, 'learning_rate': 0.00019453958945218097, 'epoch': 0.11}


 11%|█         | 1460/13552 [08:30<1:10:23,  2.86it/s]

{'loss': 0.2408, 'grad_norm': 0.37697744369506836, 'learning_rate': 0.0001944636680897387, 'epoch': 0.11}


 11%|█         | 1470/13552 [08:34<1:33:14,  2.16it/s]

{'loss': 0.1991, 'grad_norm': 0.5037255883216858, 'learning_rate': 0.00019438723758359255, 'epoch': 0.11}


 11%|█         | 1480/13552 [08:38<1:10:48,  2.84it/s]

{'loss': 0.191, 'grad_norm': 2.1074106693267822, 'learning_rate': 0.00019431029834569038, 'epoch': 0.11}


 11%|█         | 1490/13552 [08:41<1:06:58,  3.00it/s]

{'loss': 0.1985, 'grad_norm': 0.29356899857521057, 'learning_rate': 0.00019423285079072215, 'epoch': 0.11}


 11%|█         | 1500/13552 [08:45<1:11:23,  2.81it/s]

{'loss': 0.1835, 'grad_norm': 0.32698559761047363, 'learning_rate': 0.0001941548953361175, 'epoch': 0.11}


 11%|█         | 1510/13552 [08:49<1:09:24,  2.89it/s]

{'loss': 0.2109, 'grad_norm': 0.6396865844726562, 'learning_rate': 0.00019407643240204354, 'epoch': 0.11}


 11%|█         | 1520/13552 [08:52<1:05:26,  3.06it/s]

{'loss': 0.1936, 'grad_norm': 0.40572279691696167, 'learning_rate': 0.00019399746241140263, 'epoch': 0.11}


 11%|█▏        | 1530/13552 [08:55<1:05:00,  3.08it/s]

{'loss': 0.1858, 'grad_norm': 0.9945968389511108, 'learning_rate': 0.0001939179857898301, 'epoch': 0.11}


 11%|█▏        | 1540/13552 [08:59<1:14:12,  2.70it/s]

{'loss': 0.1869, 'grad_norm': 1.9705533981323242, 'learning_rate': 0.00019383800296569198, 'epoch': 0.11}


 11%|█▏        | 1550/13552 [09:02<1:05:29,  3.05it/s]

{'loss': 0.1951, 'grad_norm': 0.6378498077392578, 'learning_rate': 0.00019375751437008252, 'epoch': 0.11}


 12%|█▏        | 1560/13552 [09:05<1:07:19,  2.97it/s]

{'loss': 0.2183, 'grad_norm': 0.8612276911735535, 'learning_rate': 0.0001936765204368221, 'epoch': 0.12}


 12%|█▏        | 1570/13552 [09:09<1:08:10,  2.93it/s]

{'loss': 0.1987, 'grad_norm': 0.49797236919403076, 'learning_rate': 0.00019359502160245473, 'epoch': 0.12}


 12%|█▏        | 1580/13552 [09:12<1:05:14,  3.06it/s]

{'loss': 0.2091, 'grad_norm': 0.36922332644462585, 'learning_rate': 0.00019351301830624584, 'epoch': 0.12}


 12%|█▏        | 1590/13552 [09:15<1:06:05,  3.02it/s]

{'loss': 0.2174, 'grad_norm': 0.8818494081497192, 'learning_rate': 0.00019343051099017972, 'epoch': 0.12}


 12%|█▏        | 1600/13552 [09:19<1:06:30,  2.99it/s]

{'loss': 0.188, 'grad_norm': 1.2471297979354858, 'learning_rate': 0.00019334750009895735, 'epoch': 0.12}


 12%|█▏        | 1610/13552 [09:22<1:25:08,  2.34it/s]

{'loss': 0.2496, 'grad_norm': 0.30456778407096863, 'learning_rate': 0.00019326398607999375, 'epoch': 0.12}


 12%|█▏        | 1620/13552 [09:26<1:05:43,  3.03it/s]

{'loss': 0.1806, 'grad_norm': 0.13795936107635498, 'learning_rate': 0.00019317996938341593, 'epoch': 0.12}


 12%|█▏        | 1630/13552 [09:29<1:04:50,  3.06it/s]

{'loss': 0.199, 'grad_norm': 0.09855493903160095, 'learning_rate': 0.00019309545046206002, 'epoch': 0.12}


 12%|█▏        | 1640/13552 [09:32<1:04:45,  3.07it/s]

{'loss': 0.2149, 'grad_norm': 0.5013089179992676, 'learning_rate': 0.00019301042977146922, 'epoch': 0.12}


 12%|█▏        | 1650/13552 [09:36<1:06:47,  2.97it/s]

{'loss': 0.173, 'grad_norm': 0.11816827952861786, 'learning_rate': 0.00019292490776989113, 'epoch': 0.12}


 12%|█▏        | 1660/13552 [09:39<1:05:21,  3.03it/s]

{'loss': 0.2037, 'grad_norm': 0.17650608718395233, 'learning_rate': 0.00019283888491827532, 'epoch': 0.12}


 12%|█▏        | 1670/13552 [09:43<1:09:54,  2.83it/s]

{'loss': 0.2124, 'grad_norm': 0.19858039915561676, 'learning_rate': 0.00019275236168027086, 'epoch': 0.12}


 12%|█▏        | 1680/13552 [09:46<1:07:06,  2.95it/s]

{'loss': 0.1979, 'grad_norm': 0.13351289927959442, 'learning_rate': 0.00019266533852222388, 'epoch': 0.12}


 12%|█▏        | 1690/13552 [09:50<1:11:39,  2.76it/s]

{'loss': 0.1558, 'grad_norm': 0.2806607484817505, 'learning_rate': 0.00019257781591317492, 'epoch': 0.12}


 13%|█▎        | 1700/13552 [09:54<1:06:10,  2.98it/s]

{'loss': 0.1632, 'grad_norm': 0.43387261033058167, 'learning_rate': 0.00019248979432485652, 'epoch': 0.13}


 13%|█▎        | 1710/13552 [09:57<1:06:33,  2.97it/s]

{'loss': 0.2034, 'grad_norm': 0.21444973349571228, 'learning_rate': 0.00019240127423169066, 'epoch': 0.13}


 13%|█▎        | 1720/13552 [10:01<1:21:43,  2.41it/s]

{'loss': 0.1573, 'grad_norm': 0.23495125770568848, 'learning_rate': 0.0001923122561107861, 'epoch': 0.13}


 13%|█▎        | 1730/13552 [10:04<1:06:33,  2.96it/s]

{'loss': 0.1908, 'grad_norm': 0.30507516860961914, 'learning_rate': 0.00019222274044193593, 'epoch': 0.13}


 13%|█▎        | 1740/13552 [10:08<1:05:04,  3.03it/s]

{'loss': 0.2269, 'grad_norm': 0.6956890225410461, 'learning_rate': 0.000192132727707615, 'epoch': 0.13}


 13%|█▎        | 1750/13552 [10:11<1:04:40,  3.04it/s]

{'loss': 0.1637, 'grad_norm': 0.28329548239707947, 'learning_rate': 0.00019204221839297717, 'epoch': 0.13}


 13%|█▎        | 1760/13552 [10:14<1:03:18,  3.10it/s]

{'loss': 0.1895, 'grad_norm': 0.43234047293663025, 'learning_rate': 0.00019195121298585285, 'epoch': 0.13}


 13%|█▎        | 1770/13552 [10:18<1:10:30,  2.79it/s]

{'loss': 0.2058, 'grad_norm': 0.10028868168592453, 'learning_rate': 0.00019185971197674628, 'epoch': 0.13}


 13%|█▎        | 1780/13552 [10:22<1:04:01,  3.06it/s]

{'loss': 0.1562, 'grad_norm': 0.30816569924354553, 'learning_rate': 0.0001917677158588329, 'epoch': 0.13}


 13%|█▎        | 1790/13552 [10:25<1:18:59,  2.48it/s]

{'loss': 0.1638, 'grad_norm': 0.5849215388298035, 'learning_rate': 0.00019167522512795673, 'epoch': 0.13}


 13%|█▎        | 1800/13552 [10:29<1:19:27,  2.47it/s]

{'loss': 0.1794, 'grad_norm': 0.17534761130809784, 'learning_rate': 0.00019158224028262768, 'epoch': 0.13}


 13%|█▎        | 1810/13552 [10:33<1:14:42,  2.62it/s]

{'loss': 0.1845, 'grad_norm': 0.193706214427948, 'learning_rate': 0.00019148876182401882, 'epoch': 0.13}


 13%|█▎        | 1820/13552 [10:36<1:02:47,  3.11it/s]

{'loss': 0.1914, 'grad_norm': 0.39369603991508484, 'learning_rate': 0.00019139479025596375, 'epoch': 0.13}


 14%|█▎        | 1830/13552 [10:39<1:02:55,  3.11it/s]

{'loss': 0.1974, 'grad_norm': 0.5142452716827393, 'learning_rate': 0.00019130032608495384, 'epoch': 0.14}


 14%|█▎        | 1840/13552 [10:42<1:02:33,  3.12it/s]

{'loss': 0.1711, 'grad_norm': 0.17904935777187347, 'learning_rate': 0.0001912053698201355, 'epoch': 0.14}


 14%|█▎        | 1850/13552 [10:46<1:03:30,  3.07it/s]

{'loss': 0.1581, 'grad_norm': 0.13467244803905487, 'learning_rate': 0.0001911099219733074, 'epoch': 0.14}


 14%|█▎        | 1860/13552 [10:49<1:06:56,  2.91it/s]

{'loss': 0.1626, 'grad_norm': 0.23020121455192566, 'learning_rate': 0.0001910139830589179, 'epoch': 0.14}


 14%|█▍        | 1870/13552 [10:53<1:06:15,  2.94it/s]

{'loss': 0.19, 'grad_norm': 0.2227352112531662, 'learning_rate': 0.00019091755359406195, 'epoch': 0.14}


 14%|█▍        | 1880/13552 [10:56<1:05:15,  2.98it/s]

{'loss': 0.2329, 'grad_norm': 0.24029245972633362, 'learning_rate': 0.00019082063409847863, 'epoch': 0.14}


 14%|█▍        | 1890/13552 [11:00<1:08:25,  2.84it/s]

{'loss': 0.2116, 'grad_norm': 0.1659344881772995, 'learning_rate': 0.00019072322509454815, 'epoch': 0.14}


 14%|█▍        | 1900/13552 [11:03<1:06:08,  2.94it/s]

{'loss': 0.151, 'grad_norm': 0.2569924592971802, 'learning_rate': 0.00019062532710728908, 'epoch': 0.14}


 14%|█▍        | 1910/13552 [11:06<1:03:03,  3.08it/s]

{'loss': 0.1974, 'grad_norm': 0.3943850100040436, 'learning_rate': 0.00019052694066435554, 'epoch': 0.14}


 14%|█▍        | 1920/13552 [11:10<1:03:43,  3.04it/s]

{'loss': 0.169, 'grad_norm': 0.10967572033405304, 'learning_rate': 0.00019042806629603436, 'epoch': 0.14}


 14%|█▍        | 1930/13552 [11:13<1:03:48,  3.04it/s]

{'loss': 0.162, 'grad_norm': 0.31179195642471313, 'learning_rate': 0.00019032870453524216, 'epoch': 0.14}


 14%|█▍        | 1940/13552 [11:17<1:09:42,  2.78it/s]

{'loss': 0.1815, 'grad_norm': 1.265762448310852, 'learning_rate': 0.00019022885591752262, 'epoch': 0.14}


 14%|█▍        | 1950/13552 [11:21<1:05:42,  2.94it/s]

{'loss': 0.1526, 'grad_norm': 0.20694604516029358, 'learning_rate': 0.0001901285209810434, 'epoch': 0.14}


 14%|█▍        | 1960/13552 [11:24<1:06:38,  2.90it/s]

{'loss': 0.1883, 'grad_norm': 0.5698956251144409, 'learning_rate': 0.00019002770026659338, 'epoch': 0.14}


 15%|█▍        | 1970/13552 [11:27<1:04:54,  2.97it/s]

{'loss': 0.1768, 'grad_norm': 0.16272947192192078, 'learning_rate': 0.0001899263943175797, 'epoch': 0.15}


 15%|█▍        | 1980/13552 [11:31<1:03:51,  3.02it/s]

{'loss': 0.1759, 'grad_norm': 0.6778003573417664, 'learning_rate': 0.00018982460368002486, 'epoch': 0.15}


 15%|█▍        | 1990/13552 [11:34<1:03:20,  3.04it/s]

{'loss': 0.2097, 'grad_norm': 0.6078962087631226, 'learning_rate': 0.00018972232890256374, 'epoch': 0.15}


 15%|█▍        | 2000/13552 [11:37<1:02:40,  3.07it/s]

{'loss': 0.1817, 'grad_norm': 0.14366066455841064, 'learning_rate': 0.0001896195705364406, 'epoch': 0.15}


 15%|█▍        | 2010/13552 [11:41<1:03:07,  3.05it/s]

{'loss': 0.1621, 'grad_norm': 0.29280099272727966, 'learning_rate': 0.00018951632913550626, 'epoch': 0.15}


 15%|█▍        | 2020/13552 [11:44<1:03:44,  3.02it/s]

{'loss': 0.1908, 'grad_norm': 0.3063008189201355, 'learning_rate': 0.00018941260525621488, 'epoch': 0.15}


 15%|█▍        | 2030/13552 [11:48<1:16:48,  2.50it/s]

{'loss': 0.1588, 'grad_norm': 0.4476550817489624, 'learning_rate': 0.0001893083994576213, 'epoch': 0.15}


 15%|█▌        | 2040/13552 [11:51<1:03:07,  3.04it/s]

{'loss': 0.1848, 'grad_norm': 0.2979224920272827, 'learning_rate': 0.00018920371230137764, 'epoch': 0.15}


 15%|█▌        | 2050/13552 [11:54<1:10:37,  2.71it/s]

{'loss': 0.2062, 'grad_norm': 0.15530557930469513, 'learning_rate': 0.00018909854435173053, 'epoch': 0.15}


 15%|█▌        | 2060/13552 [11:58<1:02:43,  3.05it/s]

{'loss': 0.1698, 'grad_norm': 0.16397951543331146, 'learning_rate': 0.00018899289617551804, 'epoch': 0.15}


 15%|█▌        | 2070/13552 [12:01<1:03:00,  3.04it/s]

{'loss': 0.1777, 'grad_norm': 0.33861681818962097, 'learning_rate': 0.0001888867683421665, 'epoch': 0.15}


 15%|█▌        | 2080/13552 [12:04<1:02:38,  3.05it/s]

{'loss': 0.1724, 'grad_norm': 0.5229748487472534, 'learning_rate': 0.0001887801614236876, 'epoch': 0.15}


 15%|█▌        | 2090/13552 [12:08<1:04:37,  2.96it/s]

{'loss': 0.1743, 'grad_norm': 0.20957985520362854, 'learning_rate': 0.00018867307599467509, 'epoch': 0.15}


 15%|█▌        | 2100/13552 [12:11<1:03:06,  3.02it/s]

{'loss': 0.1666, 'grad_norm': 0.4571530520915985, 'learning_rate': 0.000188565512632302, 'epoch': 0.15}


 16%|█▌        | 2110/13552 [12:15<1:03:17,  3.01it/s]

{'loss': 0.2109, 'grad_norm': 0.3117474615573883, 'learning_rate': 0.0001884574719163172, 'epoch': 0.16}


 16%|█▌        | 2120/13552 [12:18<1:03:29,  3.00it/s]

{'loss': 0.1776, 'grad_norm': 0.47060346603393555, 'learning_rate': 0.00018834895442904244, 'epoch': 0.16}


 16%|█▌        | 2130/13552 [12:22<1:08:20,  2.79it/s]

{'loss': 0.1645, 'grad_norm': 0.5914669036865234, 'learning_rate': 0.00018823996075536923, 'epoch': 0.16}


 16%|█▌        | 2140/13552 [12:26<1:13:21,  2.59it/s]

{'loss': 0.2204, 'grad_norm': 0.17550791800022125, 'learning_rate': 0.00018813049148275562, 'epoch': 0.16}


 16%|█▌        | 2150/13552 [12:30<1:10:18,  2.70it/s]

{'loss': 0.1821, 'grad_norm': 0.16260863840579987, 'learning_rate': 0.0001880205472012231, 'epoch': 0.16}


 16%|█▌        | 2160/13552 [12:34<1:08:59,  2.75it/s]

{'loss': 0.2298, 'grad_norm': 0.7899950742721558, 'learning_rate': 0.00018791012850335323, 'epoch': 0.16}


 16%|█▌        | 2170/13552 [12:37<1:02:36,  3.03it/s]

{'loss': 0.1841, 'grad_norm': 0.5073907375335693, 'learning_rate': 0.00018779923598428484, 'epoch': 0.16}


 16%|█▌        | 2180/13552 [12:40<1:02:16,  3.04it/s]

{'loss': 0.1735, 'grad_norm': 0.7755054831504822, 'learning_rate': 0.00018768787024171042, 'epoch': 0.16}


 16%|█▌        | 2190/13552 [12:44<1:05:39,  2.88it/s]

{'loss': 0.1861, 'grad_norm': 0.17620410025119781, 'learning_rate': 0.00018757603187587302, 'epoch': 0.16}


 16%|█▌        | 2200/13552 [12:47<1:02:37,  3.02it/s]

{'loss': 0.1782, 'grad_norm': 0.43693411350250244, 'learning_rate': 0.0001874637214895632, 'epoch': 0.16}


 16%|█▋        | 2210/13552 [12:50<1:03:12,  2.99it/s]

{'loss': 0.1615, 'grad_norm': 0.14806675910949707, 'learning_rate': 0.00018735093968811554, 'epoch': 0.16}


 16%|█▋        | 2220/13552 [12:54<1:05:49,  2.87it/s]

{'loss': 0.2009, 'grad_norm': 0.154261976480484, 'learning_rate': 0.00018723768707940545, 'epoch': 0.16}


 16%|█▋        | 2230/13552 [12:58<1:04:47,  2.91it/s]

{'loss': 0.1598, 'grad_norm': 0.5199783444404602, 'learning_rate': 0.00018712396427384594, 'epoch': 0.16}


 17%|█▋        | 2240/13552 [13:01<1:01:28,  3.07it/s]

{'loss': 0.1782, 'grad_norm': 0.26469483971595764, 'learning_rate': 0.0001870097718843844, 'epoch': 0.17}


 17%|█▋        | 2250/13552 [13:05<1:02:50,  3.00it/s]

{'loss': 0.1975, 'grad_norm': 0.26378265023231506, 'learning_rate': 0.000186895110526499, 'epoch': 0.17}


 17%|█▋        | 2260/13552 [13:08<1:02:03,  3.03it/s]

{'loss': 0.1658, 'grad_norm': 0.2740681767463684, 'learning_rate': 0.0001867799808181957, 'epoch': 0.17}


 17%|█▋        | 2270/13552 [13:12<1:02:13,  3.02it/s]

{'loss': 0.1787, 'grad_norm': 0.2397814393043518, 'learning_rate': 0.0001866643833800049, 'epoch': 0.17}


 17%|█▋        | 2280/13552 [13:15<1:05:56,  2.85it/s]

{'loss': 0.1738, 'grad_norm': 0.2404189109802246, 'learning_rate': 0.0001865483188349777, 'epoch': 0.17}


 17%|█▋        | 2290/13552 [13:18<1:01:44,  3.04it/s]

{'loss': 0.1704, 'grad_norm': 0.4023342728614807, 'learning_rate': 0.00018643178780868312, 'epoch': 0.17}


 17%|█▋        | 2300/13552 [13:22<1:10:58,  2.64it/s]

{'loss': 0.1689, 'grad_norm': 0.14395304024219513, 'learning_rate': 0.00018631479092920422, 'epoch': 0.17}


 17%|█▋        | 2310/13552 [13:25<1:03:04,  2.97it/s]

{'loss': 0.1673, 'grad_norm': 0.1679874211549759, 'learning_rate': 0.00018619732882713507, 'epoch': 0.17}


 17%|█▋        | 2320/13552 [13:29<1:06:44,  2.80it/s]

{'loss': 0.1585, 'grad_norm': 0.13163995742797852, 'learning_rate': 0.00018607940213557716, 'epoch': 0.17}


 17%|█▋        | 2330/13552 [13:33<1:01:27,  3.04it/s]

{'loss': 0.1853, 'grad_norm': 0.29666754603385925, 'learning_rate': 0.00018596101149013605, 'epoch': 0.17}


 17%|█▋        | 2340/13552 [13:36<1:01:45,  3.03it/s]

{'loss': 0.1678, 'grad_norm': 0.2743864357471466, 'learning_rate': 0.00018584215752891794, 'epoch': 0.17}


 17%|█▋        | 2350/13552 [13:39<1:05:10,  2.86it/s]

{'loss': 0.1658, 'grad_norm': 0.456988662481308, 'learning_rate': 0.0001857228408925262, 'epoch': 0.17}


 17%|█▋        | 2360/13552 [13:43<1:02:56,  2.96it/s]

{'loss': 0.1895, 'grad_norm': 0.13015413284301758, 'learning_rate': 0.00018560306222405797, 'epoch': 0.17}


 17%|█▋        | 2370/13552 [13:46<1:01:01,  3.05it/s]

{'loss': 0.1723, 'grad_norm': 0.29414448142051697, 'learning_rate': 0.0001854828221691007, 'epoch': 0.17}


 18%|█▊        | 2380/13552 [13:50<1:01:45,  3.01it/s]

{'loss': 0.1599, 'grad_norm': 0.11982859671115875, 'learning_rate': 0.00018536212137572854, 'epoch': 0.18}


 18%|█▊        | 2390/13552 [13:53<1:04:40,  2.88it/s]

{'loss': 0.1623, 'grad_norm': 0.1445804238319397, 'learning_rate': 0.00018524096049449902, 'epoch': 0.18}


 18%|█▊        | 2400/13552 [13:56<1:01:18,  3.03it/s]

{'loss': 0.1466, 'grad_norm': 0.1607741415500641, 'learning_rate': 0.00018511934017844948, 'epoch': 0.18}


 18%|█▊        | 2410/13552 [14:00<1:04:48,  2.87it/s]

{'loss': 0.1582, 'grad_norm': 0.3828270435333252, 'learning_rate': 0.00018499726108309346, 'epoch': 0.18}


 18%|█▊        | 2420/13552 [14:03<1:01:08,  3.03it/s]

{'loss': 0.1797, 'grad_norm': 0.20741917192935944, 'learning_rate': 0.00018487472386641737, 'epoch': 0.18}


 18%|█▊        | 2430/13552 [14:07<1:00:46,  3.05it/s]

{'loss': 0.1727, 'grad_norm': 0.34801968932151794, 'learning_rate': 0.00018475172918887669, 'epoch': 0.18}


 18%|█▊        | 2440/13552 [14:10<1:09:01,  2.68it/s]

{'loss': 0.152, 'grad_norm': 0.45476263761520386, 'learning_rate': 0.00018462827771339256, 'epoch': 0.18}


 18%|█▊        | 2450/13552 [14:13<1:06:26,  2.78it/s]

{'loss': 0.1587, 'grad_norm': 0.2697069048881531, 'learning_rate': 0.00018450437010534833, 'epoch': 0.18}


 18%|█▊        | 2460/13552 [14:17<1:08:47,  2.69it/s]

{'loss': 0.1601, 'grad_norm': 0.7821062803268433, 'learning_rate': 0.0001843800070325856, 'epoch': 0.18}


 18%|█▊        | 2470/13552 [14:20<1:04:24,  2.87it/s]

{'loss': 0.1878, 'grad_norm': 0.32329505681991577, 'learning_rate': 0.00018425518916540107, 'epoch': 0.18}


 18%|█▊        | 2480/13552 [14:24<1:00:38,  3.04it/s]

{'loss': 0.1747, 'grad_norm': 0.28849342465400696, 'learning_rate': 0.00018412991717654248, 'epoch': 0.18}


 18%|█▊        | 2490/13552 [14:27<59:48,  3.08it/s]  

{'loss': 0.1724, 'grad_norm': 0.19578048586845398, 'learning_rate': 0.00018400419174120547, 'epoch': 0.18}


 18%|█▊        | 2500/13552 [14:31<1:16:32,  2.41it/s]

{'loss': 0.1637, 'grad_norm': 0.2824179530143738, 'learning_rate': 0.0001838780135370295, 'epoch': 0.18}


 19%|█▊        | 2510/13552 [14:34<1:02:25,  2.95it/s]

{'loss': 0.1543, 'grad_norm': 0.08986099064350128, 'learning_rate': 0.00018375138324409443, 'epoch': 0.19}


 19%|█▊        | 2520/13552 [14:37<1:00:20,  3.05it/s]

{'loss': 0.1744, 'grad_norm': 0.10084376484155655, 'learning_rate': 0.0001836243015449168, 'epoch': 0.19}


 19%|█▊        | 2530/13552 [14:40<59:40,  3.08it/s]  

{'loss': 0.1878, 'grad_norm': 0.3620433807373047, 'learning_rate': 0.00018349676912444613, 'epoch': 0.19}


 19%|█▊        | 2540/13552 [14:44<1:04:59,  2.82it/s]

{'loss': 0.1764, 'grad_norm': 0.9595803022384644, 'learning_rate': 0.00018336878667006134, 'epoch': 0.19}


 19%|█▉        | 2550/13552 [14:48<1:02:54,  2.91it/s]

{'loss': 0.1654, 'grad_norm': 0.36091163754463196, 'learning_rate': 0.00018324035487156688, 'epoch': 0.19}


 19%|█▉        | 2560/13552 [14:51<1:01:13,  2.99it/s]

{'loss': 0.1844, 'grad_norm': 0.11560769379138947, 'learning_rate': 0.00018311147442118912, 'epoch': 0.19}


 19%|█▉        | 2570/13552 [14:55<1:03:04,  2.90it/s]

{'loss': 0.1776, 'grad_norm': 0.4581170976161957, 'learning_rate': 0.0001829821460135726, 'epoch': 0.19}


 19%|█▉        | 2580/13552 [14:58<1:01:06,  2.99it/s]

{'loss': 0.1748, 'grad_norm': 0.1656210869550705, 'learning_rate': 0.0001828523703457762, 'epoch': 0.19}


 19%|█▉        | 2590/13552 [15:02<1:05:08,  2.80it/s]

{'loss': 0.1967, 'grad_norm': 0.24562370777130127, 'learning_rate': 0.00018272214811726963, 'epoch': 0.19}


 19%|█▉        | 2600/13552 [15:06<59:38,  3.06it/s]  

{'loss': 0.175, 'grad_norm': 0.06915533542633057, 'learning_rate': 0.00018259148002992926, 'epoch': 0.19}


 19%|█▉        | 2610/13552 [15:09<1:00:08,  3.03it/s]

{'loss': 0.1674, 'grad_norm': 0.29438918828964233, 'learning_rate': 0.00018246036678803474, 'epoch': 0.19}


 19%|█▉        | 2620/13552 [15:12<59:51,  3.04it/s]  

{'loss': 0.1444, 'grad_norm': 0.19569921493530273, 'learning_rate': 0.00018232880909826497, 'epoch': 0.19}


 19%|█▉        | 2630/13552 [15:16<1:00:18,  3.02it/s]

{'loss': 0.1626, 'grad_norm': 0.0924191102385521, 'learning_rate': 0.0001821968076696944, 'epoch': 0.19}


 19%|█▉        | 2640/13552 [15:19<58:57,  3.08it/s]  

{'loss': 0.1656, 'grad_norm': 0.25045329332351685, 'learning_rate': 0.00018206436321378905, 'epoch': 0.19}


 20%|█▉        | 2650/13552 [15:23<1:02:29,  2.91it/s]

{'loss': 0.2197, 'grad_norm': 0.7187458872795105, 'learning_rate': 0.0001819314764444029, 'epoch': 0.2}


 20%|█▉        | 2660/13552 [15:26<59:36,  3.05it/s]  

{'loss': 0.1604, 'grad_norm': 0.17916756868362427, 'learning_rate': 0.00018179814807777383, 'epoch': 0.2}


 20%|█▉        | 2670/13552 [15:29<59:47,  3.03it/s]  

{'loss': 0.1762, 'grad_norm': 0.0766671672463417, 'learning_rate': 0.00018166437883251993, 'epoch': 0.2}


 20%|█▉        | 2680/13552 [15:33<1:05:25,  2.77it/s]

{'loss': 0.1646, 'grad_norm': 0.08102694898843765, 'learning_rate': 0.00018153016942963558, 'epoch': 0.2}


 20%|█▉        | 2690/13552 [15:37<1:05:11,  2.78it/s]

{'loss': 0.1689, 'grad_norm': 0.10918690264225006, 'learning_rate': 0.0001813955205924874, 'epoch': 0.2}


 20%|█▉        | 2700/13552 [15:40<1:03:14,  2.86it/s]

{'loss': 0.1677, 'grad_norm': 0.3645767867565155, 'learning_rate': 0.0001812604330468106, 'epoch': 0.2}


 20%|█▉        | 2710/13552 [15:44<1:04:53,  2.78it/s]

{'loss': 0.1644, 'grad_norm': 0.22571870684623718, 'learning_rate': 0.00018112490752070502, 'epoch': 0.2}


 20%|██        | 2720/13552 [15:48<1:04:29,  2.80it/s]

{'loss': 0.1848, 'grad_norm': 0.3569672703742981, 'learning_rate': 0.00018098894474463092, 'epoch': 0.2}


 20%|██        | 2730/13552 [15:51<1:02:28,  2.89it/s]

{'loss': 0.1852, 'grad_norm': 0.38682618737220764, 'learning_rate': 0.0001808525454514055, 'epoch': 0.2}


 20%|██        | 2740/13552 [15:55<1:03:25,  2.84it/s]

{'loss': 0.1412, 'grad_norm': 0.07369700074195862, 'learning_rate': 0.00018071571037619853, 'epoch': 0.2}


 20%|██        | 2750/13552 [15:59<1:01:11,  2.94it/s]

{'loss': 0.1583, 'grad_norm': 0.23203520476818085, 'learning_rate': 0.00018057844025652875, 'epoch': 0.2}


 20%|██        | 2760/13552 [16:02<1:06:41,  2.70it/s]

{'loss': 0.1831, 'grad_norm': 0.10561654716730118, 'learning_rate': 0.0001804407358322596, 'epoch': 0.2}


 20%|██        | 2770/13552 [16:06<1:00:45,  2.96it/s]

{'loss': 0.1776, 'grad_norm': 0.5836870074272156, 'learning_rate': 0.00018030259784559535, 'epoch': 0.2}


 21%|██        | 2780/13552 [16:09<1:00:33,  2.96it/s]

{'loss': 0.1468, 'grad_norm': 0.15534065663814545, 'learning_rate': 0.00018016402704107716, 'epoch': 0.21}


 21%|██        | 2790/13552 [16:13<1:01:05,  2.94it/s]

{'loss': 0.1428, 'grad_norm': 0.083094023168087, 'learning_rate': 0.00018002502416557893, 'epoch': 0.21}


 21%|██        | 2800/13552 [16:16<1:00:35,  2.96it/s]

{'loss': 0.1625, 'grad_norm': 0.08425427228212357, 'learning_rate': 0.0001798855899683035, 'epoch': 0.21}


 21%|██        | 2810/13552 [16:20<1:03:28,  2.82it/s]

{'loss': 0.168, 'grad_norm': 0.16753219068050385, 'learning_rate': 0.00017974572520077823, 'epoch': 0.21}


 21%|██        | 2820/13552 [16:23<1:00:24,  2.96it/s]

{'loss': 0.1579, 'grad_norm': 0.09330761432647705, 'learning_rate': 0.00017960543061685145, 'epoch': 0.21}


 21%|██        | 2830/13552 [16:27<1:05:17,  2.74it/s]

{'loss': 0.1732, 'grad_norm': 0.5324885845184326, 'learning_rate': 0.00017946470697268789, 'epoch': 0.21}


 21%|██        | 2840/13552 [16:30<1:01:40,  2.90it/s]

{'loss': 0.1489, 'grad_norm': 0.07385297119617462, 'learning_rate': 0.00017932355502676498, 'epoch': 0.21}


 21%|██        | 2850/13552 [16:34<1:05:41,  2.71it/s]

{'loss': 0.1538, 'grad_norm': 0.3117879331111908, 'learning_rate': 0.00017918197553986866, 'epoch': 0.21}


 21%|██        | 2860/13552 [16:38<1:02:59,  2.83it/s]

{'loss': 0.144, 'grad_norm': 0.18361559510231018, 'learning_rate': 0.00017903996927508907, 'epoch': 0.21}


 21%|██        | 2870/13552 [16:41<1:00:05,  2.96it/s]

{'loss': 0.19, 'grad_norm': 0.15105587244033813, 'learning_rate': 0.00017889753699781684, 'epoch': 0.21}


 21%|██▏       | 2880/13552 [16:45<1:00:30,  2.94it/s]

{'loss': 0.1663, 'grad_norm': 0.4632263481616974, 'learning_rate': 0.00017875467947573855, 'epoch': 0.21}


 21%|██▏       | 2890/13552 [16:49<1:04:21,  2.76it/s]

{'loss': 0.1567, 'grad_norm': 0.2857431471347809, 'learning_rate': 0.00017861139747883288, 'epoch': 0.21}


 21%|██▏       | 2900/13552 [16:52<1:11:49,  2.47it/s]

{'loss': 0.1407, 'grad_norm': 0.46724098920822144, 'learning_rate': 0.00017846769177936632, 'epoch': 0.21}


 21%|██▏       | 2910/13552 [16:56<1:09:59,  2.53it/s]

{'loss': 0.222, 'grad_norm': 0.8553862571716309, 'learning_rate': 0.00017832356315188906, 'epoch': 0.21}


 22%|██▏       | 2920/13552 [16:59<1:01:14,  2.89it/s]

{'loss': 0.1698, 'grad_norm': 0.49570536613464355, 'learning_rate': 0.0001781790123732308, 'epoch': 0.22}


 22%|██▏       | 2930/13552 [17:03<59:48,  2.96it/s]  

{'loss': 0.1681, 'grad_norm': 0.4940013885498047, 'learning_rate': 0.0001780340402224966, 'epoch': 0.22}


 22%|██▏       | 2940/13552 [17:06<59:36,  2.97it/s]  

{'loss': 0.161, 'grad_norm': 0.09852021187543869, 'learning_rate': 0.0001778886474810626, 'epoch': 0.22}


 22%|██▏       | 2950/13552 [17:09<59:02,  2.99it/s]

{'loss': 0.1708, 'grad_norm': 0.11090899258852005, 'learning_rate': 0.00017774283493257183, 'epoch': 0.22}


 22%|██▏       | 2960/13552 [17:13<1:01:14,  2.88it/s]

{'loss': 0.1634, 'grad_norm': 1.7087390422821045, 'learning_rate': 0.00017759660336293012, 'epoch': 0.22}


 22%|██▏       | 2970/13552 [17:16<59:54,  2.94it/s]  

{'loss': 0.1573, 'grad_norm': 0.19857414066791534, 'learning_rate': 0.00017744995356030162, 'epoch': 0.22}


 22%|██▏       | 2980/13552 [17:20<1:01:17,  2.87it/s]

{'loss': 0.166, 'grad_norm': 0.5702523589134216, 'learning_rate': 0.0001773028863151048, 'epoch': 0.22}


 22%|██▏       | 2990/13552 [17:24<1:02:07,  2.83it/s]

{'loss': 0.1512, 'grad_norm': 0.18844681978225708, 'learning_rate': 0.00017715540242000804, 'epoch': 0.22}


 22%|██▏       | 3000/13552 [17:28<1:00:26,  2.91it/s]

{'loss': 0.1712, 'grad_norm': 0.10130276530981064, 'learning_rate': 0.00017700750266992535, 'epoch': 0.22}


 22%|██▏       | 3010/13552 [17:32<1:17:31,  2.27it/s]

{'loss': 0.1494, 'grad_norm': 0.20054009556770325, 'learning_rate': 0.00017685918786201216, 'epoch': 0.22}


 22%|██▏       | 3020/13552 [17:35<59:29,  2.95it/s]  

{'loss': 0.1641, 'grad_norm': 0.08046387881040573, 'learning_rate': 0.00017671045879566103, 'epoch': 0.22}


 22%|██▏       | 3030/13552 [17:39<59:29,  2.95it/s]

{'loss': 0.1584, 'grad_norm': 0.05862180516123772, 'learning_rate': 0.00017656131627249728, 'epoch': 0.22}


 22%|██▏       | 3040/13552 [17:42<1:04:49,  2.70it/s]

{'loss': 0.152, 'grad_norm': 0.087996706366539, 'learning_rate': 0.00017641176109637466, 'epoch': 0.22}


 23%|██▎       | 3050/13552 [17:46<1:04:27,  2.72it/s]

{'loss': 0.1883, 'grad_norm': 0.5876498818397522, 'learning_rate': 0.00017626179407337112, 'epoch': 0.23}


 23%|██▎       | 3060/13552 [17:50<1:21:43,  2.14it/s]

{'loss': 0.146, 'grad_norm': 0.38919347524642944, 'learning_rate': 0.0001761114160117843, 'epoch': 0.23}


 23%|██▎       | 3070/13552 [17:53<58:54,  2.97it/s]  

{'loss': 0.1833, 'grad_norm': 0.8159825205802917, 'learning_rate': 0.00017596062772212739, 'epoch': 0.23}


 23%|██▎       | 3080/13552 [17:57<58:58,  2.96it/s]

{'loss': 0.1703, 'grad_norm': 0.6302368640899658, 'learning_rate': 0.00017580943001712455, 'epoch': 0.23}


 23%|██▎       | 3090/13552 [18:00<1:00:46,  2.87it/s]

{'loss': 0.1543, 'grad_norm': 0.48249372839927673, 'learning_rate': 0.00017565782371170666, 'epoch': 0.23}


 23%|██▎       | 3100/13552 [18:04<57:52,  3.01it/s]  

{'loss': 0.1551, 'grad_norm': 0.8585938215255737, 'learning_rate': 0.00017550580962300688, 'epoch': 0.23}


 23%|██▎       | 3110/13552 [18:07<1:03:05,  2.76it/s]

{'loss': 0.164, 'grad_norm': 0.12240735441446304, 'learning_rate': 0.0001753533885703563, 'epoch': 0.23}


 23%|██▎       | 3120/13552 [18:10<58:01,  3.00it/s]  

{'loss': 0.1412, 'grad_norm': 0.485252320766449, 'learning_rate': 0.00017520056137527935, 'epoch': 0.23}


 23%|██▎       | 3130/13552 [18:14<1:00:42,  2.86it/s]

{'loss': 0.1664, 'grad_norm': 0.15751712024211884, 'learning_rate': 0.0001750473288614897, 'epoch': 0.23}


 23%|██▎       | 3140/13552 [18:18<57:57,  2.99it/s]  

{'loss': 0.1539, 'grad_norm': 0.07425800710916519, 'learning_rate': 0.0001748936918548854, 'epoch': 0.23}


 23%|██▎       | 3150/13552 [18:21<1:03:21,  2.74it/s]

{'loss': 0.1681, 'grad_norm': 0.26854407787323, 'learning_rate': 0.0001747396511835448, 'epoch': 0.23}


 23%|██▎       | 3160/13552 [18:24<58:32,  2.96it/s]  

{'loss': 0.1592, 'grad_norm': 0.24718062579631805, 'learning_rate': 0.0001745852076777219, 'epoch': 0.23}


 23%|██▎       | 3170/13552 [18:28<58:12,  2.97it/s]  

{'loss': 0.1343, 'grad_norm': 0.15481732785701752, 'learning_rate': 0.00017443036216984194, 'epoch': 0.23}


 23%|██▎       | 3180/13552 [18:31<58:04,  2.98it/s]  

{'loss': 0.1631, 'grad_norm': 0.5089274644851685, 'learning_rate': 0.00017427511549449683, 'epoch': 0.23}


 24%|██▎       | 3190/13552 [18:35<1:00:07,  2.87it/s]

{'loss': 0.1562, 'grad_norm': 0.09628845006227493, 'learning_rate': 0.00017411946848844067, 'epoch': 0.24}


 24%|██▎       | 3200/13552 [18:38<56:50,  3.04it/s]  

{'loss': 0.169, 'grad_norm': 0.20254558324813843, 'learning_rate': 0.00017396342199058545, 'epoch': 0.24}


 24%|██▎       | 3210/13552 [18:42<57:43,  2.99it/s]  

{'loss': 0.1639, 'grad_norm': 0.2281256914138794, 'learning_rate': 0.00017380697684199614, 'epoch': 0.24}


 24%|██▍       | 3220/13552 [18:45<57:47,  2.98it/s]  

{'loss': 0.1529, 'grad_norm': 0.1487552672624588, 'learning_rate': 0.00017365013388588656, 'epoch': 0.24}


 24%|██▍       | 3230/13552 [18:48<58:38,  2.93it/s]

{'loss': 0.1552, 'grad_norm': 0.11373630911111832, 'learning_rate': 0.0001734928939676145, 'epoch': 0.24}


 24%|██▍       | 3240/13552 [18:52<56:48,  3.02it/s]

{'loss': 0.1731, 'grad_norm': 0.2284633368253708, 'learning_rate': 0.0001733352579346774, 'epoch': 0.24}


 24%|██▍       | 3250/13552 [18:55<56:28,  3.04it/s]

{'loss': 0.1623, 'grad_norm': 0.08140649646520615, 'learning_rate': 0.0001731772266367077, 'epoch': 0.24}


 24%|██▍       | 3260/13552 [18:58<56:20,  3.04it/s]

{'loss': 0.1711, 'grad_norm': 0.18652337789535522, 'learning_rate': 0.00017301880092546816, 'epoch': 0.24}


 24%|██▍       | 3270/13552 [19:02<58:01,  2.95it/s]  

{'loss': 0.1494, 'grad_norm': 0.11980395764112473, 'learning_rate': 0.00017285998165484742, 'epoch': 0.24}


 24%|██▍       | 3280/13552 [19:05<56:40,  3.02it/s]  

{'loss': 0.1634, 'grad_norm': 0.19953574240207672, 'learning_rate': 0.0001727007696808554, 'epoch': 0.24}


 24%|██▍       | 3290/13552 [19:08<55:35,  3.08it/s]  

{'loss': 0.1644, 'grad_norm': 0.656154990196228, 'learning_rate': 0.00017254116586161853, 'epoch': 0.24}


 24%|██▍       | 3300/13552 [19:12<55:29,  3.08it/s]

{'loss': 0.169, 'grad_norm': 0.24582381546497345, 'learning_rate': 0.0001723811710573753, 'epoch': 0.24}


 24%|██▍       | 3310/13552 [19:15<58:10,  2.93it/s]  

{'loss': 0.1647, 'grad_norm': 0.07430532574653625, 'learning_rate': 0.00017222078613047148, 'epoch': 0.24}


 24%|██▍       | 3320/13552 [19:19<58:30,  2.91it/s]

{'loss': 0.1626, 'grad_norm': 0.15798117220401764, 'learning_rate': 0.00017206001194535557, 'epoch': 0.24}


 25%|██▍       | 3330/13552 [19:22<58:50,  2.90it/s]  

{'loss': 0.1784, 'grad_norm': 0.22719508409500122, 'learning_rate': 0.0001718988493685741, 'epoch': 0.25}


 25%|██▍       | 3340/13552 [19:26<1:00:27,  2.81it/s]

{'loss': 0.1521, 'grad_norm': 0.09073441475629807, 'learning_rate': 0.00017173729926876695, 'epoch': 0.25}


 25%|██▍       | 3350/13552 [19:29<55:21,  3.07it/s]  

{'loss': 0.164, 'grad_norm': 0.3145393133163452, 'learning_rate': 0.00017157536251666275, 'epoch': 0.25}


 25%|██▍       | 3360/13552 [19:32<59:02,  2.88it/s]  

{'loss': 0.1489, 'grad_norm': 0.1593959778547287, 'learning_rate': 0.00017141303998507404, 'epoch': 0.25}


 25%|██▍       | 3370/13552 [19:36<55:43,  3.05it/s]  

{'loss': 0.1782, 'grad_norm': 0.7133653163909912, 'learning_rate': 0.00017125033254889274, 'epoch': 0.25}


 25%|██▍       | 3380/13552 [19:39<1:01:11,  2.77it/s]

{'loss': 0.1468, 'grad_norm': 0.1826471984386444, 'learning_rate': 0.00017108724108508521, 'epoch': 0.25}


 25%|██▌       | 3390/13552 [19:43<57:39,  2.94it/s]  

{'loss': 0.1634, 'grad_norm': 0.48999547958374023, 'learning_rate': 0.00017092376647268785, 'epoch': 0.25}


 25%|██▌       | 3400/13552 [19:46<56:41,  2.98it/s]

{'loss': 0.1626, 'grad_norm': 0.331869900226593, 'learning_rate': 0.0001707599095928019, 'epoch': 0.25}


 25%|██▌       | 3410/13552 [19:49<56:48,  2.98it/s]  

{'loss': 0.1841, 'grad_norm': 0.6701318025588989, 'learning_rate': 0.00017059567132858927, 'epoch': 0.25}


 25%|██▌       | 3420/13552 [19:53<59:50,  2.82it/s]  

{'loss': 0.1651, 'grad_norm': 0.18539108335971832, 'learning_rate': 0.00017043105256526724, 'epoch': 0.25}


 25%|██▌       | 3430/13552 [19:56<55:27,  3.04it/s]

{'loss': 0.1618, 'grad_norm': 0.09272097796201706, 'learning_rate': 0.00017026605419010395, 'epoch': 0.25}


 25%|██▌       | 3440/13552 [20:00<57:13,  2.95it/s]  

{'loss': 0.1771, 'grad_norm': 0.09041201323270798, 'learning_rate': 0.0001701006770924137, 'epoch': 0.25}


 25%|██▌       | 3450/13552 [20:03<54:44,  3.08it/s]

{'loss': 0.1731, 'grad_norm': 0.13571390509605408, 'learning_rate': 0.0001699349221635519, 'epoch': 0.25}


 26%|██▌       | 3460/13552 [20:06<57:55,  2.90it/s]

{'loss': 0.1466, 'grad_norm': 0.07577165216207504, 'learning_rate': 0.00016976879029691056, 'epoch': 0.26}


 26%|██▌       | 3470/13552 [20:10<1:08:04,  2.47it/s]

{'loss': 0.1737, 'grad_norm': 0.6626776456832886, 'learning_rate': 0.00016960228238791316, 'epoch': 0.26}


 26%|██▌       | 3480/13552 [20:13<57:01,  2.94it/s]  

{'loss': 0.1513, 'grad_norm': 1.1955668926239014, 'learning_rate': 0.0001694353993340101, 'epoch': 0.26}


 26%|██▌       | 3490/13552 [20:17<57:54,  2.90it/s]  

{'loss': 0.195, 'grad_norm': 0.1530960202217102, 'learning_rate': 0.00016926814203467374, 'epoch': 0.26}


 26%|██▌       | 3500/13552 [20:20<56:51,  2.95it/s]  

{'loss': 0.1783, 'grad_norm': 0.3468184173107147, 'learning_rate': 0.0001691005113913935, 'epoch': 0.26}


 26%|██▌       | 3510/13552 [20:24<58:26,  2.86it/s]  

{'loss': 0.1777, 'grad_norm': 0.35161998867988586, 'learning_rate': 0.0001689325083076711, 'epoch': 0.26}


 26%|██▌       | 3520/13552 [20:27<56:42,  2.95it/s]

{'loss': 0.1779, 'grad_norm': 0.3916364908218384, 'learning_rate': 0.00016876413368901565, 'epoch': 0.26}


 26%|██▌       | 3530/13552 [20:31<56:35,  2.95it/s]

{'loss': 0.1795, 'grad_norm': 0.7403610348701477, 'learning_rate': 0.00016859538844293882, 'epoch': 0.26}


 26%|██▌       | 3540/13552 [20:34<1:01:15,  2.72it/s]

{'loss': 0.1649, 'grad_norm': 0.46748608350753784, 'learning_rate': 0.0001684262734789498, 'epoch': 0.26}


 26%|██▌       | 3550/13552 [20:38<59:11,  2.82it/s]  

{'loss': 0.1799, 'grad_norm': 0.3512478470802307, 'learning_rate': 0.00016825678970855062, 'epoch': 0.26}


 26%|██▋       | 3560/13552 [20:42<1:07:08,  2.48it/s]

{'loss': 0.1718, 'grad_norm': 0.16517436504364014, 'learning_rate': 0.00016808693804523097, 'epoch': 0.26}


 26%|██▋       | 3570/13552 [20:45<58:33,  2.84it/s]  

{'loss': 0.1652, 'grad_norm': 0.10587987303733826, 'learning_rate': 0.00016791671940446357, 'epoch': 0.26}


 26%|██▋       | 3580/13552 [20:49<57:05,  2.91it/s]

{'loss': 0.186, 'grad_norm': 0.3006913363933563, 'learning_rate': 0.000167746134703699, 'epoch': 0.26}


 26%|██▋       | 3590/13552 [20:52<55:56,  2.97it/s]

{'loss': 0.1886, 'grad_norm': 0.4127424657344818, 'learning_rate': 0.00016757518486236087, 'epoch': 0.26}


 27%|██▋       | 3600/13552 [20:56<56:26,  2.94it/s]

{'loss': 0.1707, 'grad_norm': 0.19074276089668274, 'learning_rate': 0.00016740387080184084, 'epoch': 0.27}


 27%|██▋       | 3610/13552 [20:59<57:07,  2.90it/s]

{'loss': 0.182, 'grad_norm': 0.8389883041381836, 'learning_rate': 0.00016723219344549363, 'epoch': 0.27}


 27%|██▋       | 3620/13552 [21:02<55:38,  2.98it/s]

{'loss': 0.1777, 'grad_norm': 0.4576769471168518, 'learning_rate': 0.00016706015371863208, 'epoch': 0.27}


 27%|██▋       | 3630/13552 [21:06<1:14:08,  2.23it/s]

{'loss': 0.1388, 'grad_norm': 0.8785942196846008, 'learning_rate': 0.00016688775254852217, 'epoch': 0.27}


 27%|██▋       | 3640/13552 [21:10<56:11,  2.94it/s]  

{'loss': 0.1796, 'grad_norm': 0.5384902358055115, 'learning_rate': 0.00016671499086437795, 'epoch': 0.27}


 27%|██▋       | 3650/13552 [21:13<54:43,  3.02it/s]

{'loss': 0.1861, 'grad_norm': 0.13091303408145905, 'learning_rate': 0.00016654186959735666, 'epoch': 0.27}


 27%|██▋       | 3660/13552 [21:16<57:32,  2.87it/s]  

{'loss': 0.1933, 'grad_norm': 1.2583802938461304, 'learning_rate': 0.00016636838968055352, 'epoch': 0.27}


 27%|██▋       | 3670/13552 [21:20<55:27,  2.97it/s]  

{'loss': 0.173, 'grad_norm': 0.7730205655097961, 'learning_rate': 0.00016619455204899692, 'epoch': 0.27}


 27%|██▋       | 3680/13552 [21:23<55:19,  2.97it/s]

{'loss': 0.1682, 'grad_norm': 0.17664682865142822, 'learning_rate': 0.0001660203576396432, 'epoch': 0.27}


 27%|██▋       | 3690/13552 [21:27<55:02,  2.99it/s]

{'loss': 0.1579, 'grad_norm': 0.2977665662765503, 'learning_rate': 0.00016584580739137175, 'epoch': 0.27}


 27%|██▋       | 3700/13552 [21:30<1:03:59,  2.57it/s]

{'loss': 0.1543, 'grad_norm': 0.5852251648902893, 'learning_rate': 0.0001656709022449798, 'epoch': 0.27}


 27%|██▋       | 3710/13552 [21:34<55:46,  2.94it/s]  

{'loss': 0.1761, 'grad_norm': 0.20812121033668518, 'learning_rate': 0.00016549564314317737, 'epoch': 0.27}


 27%|██▋       | 3720/13552 [21:37<56:27,  2.90it/s]  

{'loss': 0.1339, 'grad_norm': 0.5860506296157837, 'learning_rate': 0.0001653200310305825, 'epoch': 0.27}


 28%|██▊       | 3730/13552 [21:41<54:55,  2.98it/s]

{'loss': 0.1765, 'grad_norm': 0.2460876852273941, 'learning_rate': 0.00016514406685371556, 'epoch': 0.28}


 28%|██▊       | 3740/13552 [21:44<1:07:34,  2.42it/s]

{'loss': 0.1558, 'grad_norm': 0.19521227478981018, 'learning_rate': 0.00016496775156099475, 'epoch': 0.28}


 28%|██▊       | 3750/13552 [21:48<56:31,  2.89it/s]  

{'loss': 0.1847, 'grad_norm': 0.6922126412391663, 'learning_rate': 0.0001647910861027306, 'epoch': 0.28}


 28%|██▊       | 3760/13552 [21:51<54:39,  2.99it/s]

{'loss': 0.1769, 'grad_norm': 0.24649161100387573, 'learning_rate': 0.00016461407143112097, 'epoch': 0.28}


 28%|██▊       | 3770/13552 [21:55<56:31,  2.88it/s]  

{'loss': 0.1723, 'grad_norm': 0.3909758925437927, 'learning_rate': 0.00016443670850024603, 'epoch': 0.28}


 28%|██▊       | 3780/13552 [21:59<56:20,  2.89it/s]  

{'loss': 0.1551, 'grad_norm': 0.4389440715312958, 'learning_rate': 0.00016425899826606286, 'epoch': 0.28}


 28%|██▊       | 3790/13552 [22:02<54:36,  2.98it/s]

{'loss': 0.1683, 'grad_norm': 0.4585651159286499, 'learning_rate': 0.00016408094168640056, 'epoch': 0.28}


 28%|██▊       | 3800/13552 [22:06<57:50,  2.81it/s]  

{'loss': 0.1667, 'grad_norm': 0.28801438212394714, 'learning_rate': 0.00016390253972095492, 'epoch': 0.28}


 28%|██▊       | 3810/13552 [22:09<54:22,  2.99it/s]

{'loss': 0.1941, 'grad_norm': 0.25760385394096375, 'learning_rate': 0.00016372379333128335, 'epoch': 0.28}


 28%|██▊       | 3820/13552 [22:13<59:01,  2.75it/s]  

{'loss': 0.1662, 'grad_norm': 0.37196284532546997, 'learning_rate': 0.00016354470348079963, 'epoch': 0.28}


 28%|██▊       | 3830/13552 [22:17<1:10:10,  2.31it/s]

{'loss': 0.1509, 'grad_norm': 0.23527348041534424, 'learning_rate': 0.0001633652711347687, 'epoch': 0.28}


 28%|██▊       | 3840/13552 [22:21<55:56,  2.89it/s]  

{'loss': 0.1629, 'grad_norm': 0.09422435611486435, 'learning_rate': 0.00016318549726030153, 'epoch': 0.28}


 28%|██▊       | 3850/13552 [22:24<55:07,  2.93it/s]  

{'loss': 0.1502, 'grad_norm': 0.3852861225605011, 'learning_rate': 0.0001630053828263499, 'epoch': 0.28}


 28%|██▊       | 3860/13552 [22:28<52:26,  3.08it/s]  

{'loss': 0.1906, 'grad_norm': 0.18261563777923584, 'learning_rate': 0.00016282492880370108, 'epoch': 0.28}


 29%|██▊       | 3870/13552 [22:31<54:24,  2.97it/s]

{'loss': 0.1652, 'grad_norm': 0.7652456164360046, 'learning_rate': 0.00016264413616497273, 'epoch': 0.29}


 29%|██▊       | 3880/13552 [22:35<58:35,  2.75it/s]  

{'loss': 0.1977, 'grad_norm': 0.1157119870185852, 'learning_rate': 0.00016246300588460753, 'epoch': 0.29}


 29%|██▊       | 3890/13552 [22:38<52:22,  3.07it/s]

{'loss': 0.1411, 'grad_norm': 0.4766114354133606, 'learning_rate': 0.0001622815389388681, 'epoch': 0.29}


 29%|██▉       | 3900/13552 [22:42<55:03,  2.92it/s]  

{'loss': 0.1715, 'grad_norm': 0.07968863844871521, 'learning_rate': 0.0001620997363058315, 'epoch': 0.29}


 29%|██▉       | 3910/13552 [22:45<52:01,  3.09it/s]

{'loss': 0.1486, 'grad_norm': 0.05042731389403343, 'learning_rate': 0.00016191759896538418, 'epoch': 0.29}


 29%|██▉       | 3920/13552 [22:48<57:23,  2.80it/s]

{'loss': 0.166, 'grad_norm': 0.12562479078769684, 'learning_rate': 0.00016173512789921662, 'epoch': 0.29}


 29%|██▉       | 3930/13552 [22:52<54:43,  2.93it/s]

{'loss': 0.1623, 'grad_norm': 0.14232110977172852, 'learning_rate': 0.00016155232409081793, 'epoch': 0.29}


 29%|██▉       | 3940/13552 [22:55<54:13,  2.95it/s]

{'loss': 0.1397, 'grad_norm': 0.09436975419521332, 'learning_rate': 0.00016136918852547073, 'epoch': 0.29}


 29%|██▉       | 3950/13552 [22:59<54:18,  2.95it/s]  

{'loss': 0.1645, 'grad_norm': 0.22291935980319977, 'learning_rate': 0.00016118572219024575, 'epoch': 0.29}


 29%|██▉       | 3960/13552 [23:02<54:08,  2.95it/s]  

{'loss': 0.1582, 'grad_norm': 0.092702217400074, 'learning_rate': 0.00016100192607399646, 'epoch': 0.29}


 29%|██▉       | 3970/13552 [23:06<53:37,  2.98it/s]  

{'loss': 0.1563, 'grad_norm': 0.30241626501083374, 'learning_rate': 0.00016081780116735384, 'epoch': 0.29}


 29%|██▉       | 3980/13552 [23:09<56:39,  2.82it/s]

{'loss': 0.1701, 'grad_norm': 0.08011844009160995, 'learning_rate': 0.00016063334846272103, 'epoch': 0.29}


 29%|██▉       | 3990/13552 [23:13<53:35,  2.97it/s]

{'loss': 0.1442, 'grad_norm': 0.10581906139850616, 'learning_rate': 0.00016044856895426786, 'epoch': 0.29}


 30%|██▉       | 4000/13552 [23:16<53:27,  2.98it/s]

{'loss': 0.1707, 'grad_norm': 0.548794686794281, 'learning_rate': 0.00016026346363792567, 'epoch': 0.3}


 30%|██▉       | 4010/13552 [23:20<53:36,  2.97it/s]

{'loss': 0.1734, 'grad_norm': 0.4421837329864502, 'learning_rate': 0.00016007803351138173, 'epoch': 0.3}


 30%|██▉       | 4020/13552 [23:23<52:55,  3.00it/s]

{'loss': 0.1821, 'grad_norm': 0.6985331177711487, 'learning_rate': 0.00015989227957407414, 'epoch': 0.3}


 30%|██▉       | 4030/13552 [23:26<53:14,  2.98it/s]

{'loss': 0.1545, 'grad_norm': 0.17568562924861908, 'learning_rate': 0.00015970620282718616, 'epoch': 0.3}


 30%|██▉       | 4040/13552 [23:30<52:59,  2.99it/s]

{'loss': 0.175, 'grad_norm': 0.539038360118866, 'learning_rate': 0.00015951980427364102, 'epoch': 0.3}


 30%|██▉       | 4050/13552 [23:33<56:14,  2.82it/s]

{'loss': 0.1616, 'grad_norm': 0.15377235412597656, 'learning_rate': 0.00015933308491809637, 'epoch': 0.3}


 30%|██▉       | 4060/13552 [23:36<55:03,  2.87it/s]

{'loss': 0.172, 'grad_norm': 0.7809950113296509, 'learning_rate': 0.00015914604576693902, 'epoch': 0.3}


 30%|███       | 4070/13552 [23:40<53:09,  2.97it/s]

{'loss': 0.149, 'grad_norm': 0.22668993473052979, 'learning_rate': 0.00015895868782827928, 'epoch': 0.3}


 30%|███       | 4080/13552 [23:43<51:35,  3.06it/s]

{'loss': 0.208, 'grad_norm': 0.15548856556415558, 'learning_rate': 0.00015877101211194585, 'epoch': 0.3}


 30%|███       | 4090/13552 [23:47<56:08,  2.81it/s]

{'loss': 0.1649, 'grad_norm': 0.45294496417045593, 'learning_rate': 0.00015858301962948004, 'epoch': 0.3}


 30%|███       | 4100/13552 [23:50<51:48,  3.04it/s]

{'loss': 0.1737, 'grad_norm': 0.33585575222969055, 'learning_rate': 0.00015839471139413066, 'epoch': 0.3}


 30%|███       | 4110/13552 [23:53<58:49,  2.67it/s]  

{'loss': 0.1513, 'grad_norm': 0.3052186071872711, 'learning_rate': 0.00015820608842084813, 'epoch': 0.3}


 30%|███       | 4120/13552 [23:57<1:08:10,  2.31it/s]

{'loss': 0.1454, 'grad_norm': 0.356549471616745, 'learning_rate': 0.00015801715172627945, 'epoch': 0.3}


 30%|███       | 4130/13552 [24:00<52:19,  3.00it/s]  

{'loss': 0.168, 'grad_norm': 0.34686988592147827, 'learning_rate': 0.00015782790232876247, 'epoch': 0.3}


 31%|███       | 4140/13552 [24:04<55:29,  2.83it/s]  

{'loss': 0.1746, 'grad_norm': 0.2599671185016632, 'learning_rate': 0.00015763834124832047, 'epoch': 0.31}


 31%|███       | 4150/13552 [24:07<51:27,  3.05it/s]

{'loss': 0.1783, 'grad_norm': 0.39702197909355164, 'learning_rate': 0.00015744846950665662, 'epoch': 0.31}


 31%|███       | 4160/13552 [24:11<53:03,  2.95it/s]  

{'loss': 0.1727, 'grad_norm': 0.1549137979745865, 'learning_rate': 0.00015725828812714854, 'epoch': 0.31}


 31%|███       | 4170/13552 [24:14<51:26,  3.04it/s]

{'loss': 0.1735, 'grad_norm': 0.08284517377614975, 'learning_rate': 0.00015706779813484266, 'epoch': 0.31}


 31%|███       | 4180/13552 [24:18<1:04:17,  2.43it/s]

{'loss': 0.1505, 'grad_norm': 0.578606128692627, 'learning_rate': 0.0001568770005564489, 'epoch': 0.31}


 31%|███       | 4190/13552 [24:22<53:56,  2.89it/s]  

{'loss': 0.1866, 'grad_norm': 0.6804196834564209, 'learning_rate': 0.000156685896420335, 'epoch': 0.31}


 31%|███       | 4200/13552 [24:25<50:41,  3.08it/s]

{'loss': 0.1801, 'grad_norm': 0.12317653000354767, 'learning_rate': 0.0001564944867565209, 'epoch': 0.31}


 31%|███       | 4210/13552 [24:28<51:08,  3.04it/s]

{'loss': 0.1666, 'grad_norm': 0.23525044322013855, 'learning_rate': 0.0001563027725966734, 'epoch': 0.31}


 31%|███       | 4220/13552 [24:32<1:01:47,  2.52it/s]

{'loss': 0.164, 'grad_norm': 0.8189141750335693, 'learning_rate': 0.00015611075497410038, 'epoch': 0.31}


 31%|███       | 4230/13552 [24:35<51:46,  3.00it/s]  

{'loss': 0.1878, 'grad_norm': 0.1576591432094574, 'learning_rate': 0.00015591843492374538, 'epoch': 0.31}


 31%|███▏      | 4240/13552 [24:39<53:29,  2.90it/s]

{'loss': 0.1673, 'grad_norm': 0.3279925286769867, 'learning_rate': 0.00015572581348218204, 'epoch': 0.31}


 31%|███▏      | 4250/13552 [24:42<50:54,  3.05it/s]

{'loss': 0.1571, 'grad_norm': 0.1441231518983841, 'learning_rate': 0.00015553289168760833, 'epoch': 0.31}


 31%|███▏      | 4260/13552 [24:45<50:52,  3.04it/s]

{'loss': 0.1638, 'grad_norm': 0.08586020767688751, 'learning_rate': 0.0001553396705798412, 'epoch': 0.31}


 32%|███▏      | 4270/13552 [24:49<50:34,  3.06it/s]

{'loss': 0.1409, 'grad_norm': 0.30980756878852844, 'learning_rate': 0.00015514615120031076, 'epoch': 0.32}


 32%|███▏      | 4280/13552 [24:52<50:37,  3.05it/s]

{'loss': 0.1637, 'grad_norm': 0.5600868463516235, 'learning_rate': 0.00015495233459205475, 'epoch': 0.32}


 32%|███▏      | 4290/13552 [24:55<58:21,  2.65it/s]  

{'loss': 0.1707, 'grad_norm': 0.6506803631782532, 'learning_rate': 0.00015475822179971297, 'epoch': 0.32}


 32%|███▏      | 4300/13552 [24:59<52:23,  2.94it/s]  

{'loss': 0.1767, 'grad_norm': 0.2704200744628906, 'learning_rate': 0.00015456381386952159, 'epoch': 0.32}


 32%|███▏      | 4310/13552 [25:02<51:03,  3.02it/s]

{'loss': 0.1431, 'grad_norm': 0.3005940020084381, 'learning_rate': 0.00015436911184930752, 'epoch': 0.32}


 32%|███▏      | 4320/13552 [25:06<50:40,  3.04it/s]

{'loss': 0.1519, 'grad_norm': 0.19491896033287048, 'learning_rate': 0.00015417411678848278, 'epoch': 0.32}


 32%|███▏      | 4330/13552 [25:09<50:58,  3.02it/s]  

{'loss': 0.1528, 'grad_norm': 0.17821791768074036, 'learning_rate': 0.00015397882973803878, 'epoch': 0.32}


 32%|███▏      | 4340/13552 [25:13<52:39,  2.92it/s]

{'loss': 0.1601, 'grad_norm': 0.13120044767856598, 'learning_rate': 0.00015378325175054083, 'epoch': 0.32}


 32%|███▏      | 4350/13552 [25:16<55:26,  2.77it/s]

{'loss': 0.1564, 'grad_norm': 0.1802576780319214, 'learning_rate': 0.00015358738388012214, 'epoch': 0.32}


 32%|███▏      | 4360/13552 [25:20<53:05,  2.89it/s]

{'loss': 0.1443, 'grad_norm': 0.4101721942424774, 'learning_rate': 0.0001533912271824786, 'epoch': 0.32}


 32%|███▏      | 4370/13552 [25:24<55:53,  2.74it/s]  

{'loss': 0.158, 'grad_norm': 0.3699342906475067, 'learning_rate': 0.0001531947827148626, 'epoch': 0.32}


 32%|███▏      | 4380/13552 [25:27<51:34,  2.96it/s]

{'loss': 0.147, 'grad_norm': 0.046531375497579575, 'learning_rate': 0.00015299805153607763, 'epoch': 0.32}


 32%|███▏      | 4390/13552 [25:30<51:36,  2.96it/s]

{'loss': 0.1425, 'grad_norm': 0.35257846117019653, 'learning_rate': 0.0001528010347064726, 'epoch': 0.32}


 32%|███▏      | 4400/13552 [25:34<51:06,  2.98it/s]

{'loss': 0.1534, 'grad_norm': 0.1113688200712204, 'learning_rate': 0.00015260373328793598, 'epoch': 0.32}


 33%|███▎      | 4410/13552 [25:37<52:59,  2.88it/s]

{'loss': 0.1672, 'grad_norm': 0.6693687438964844, 'learning_rate': 0.00015240614834389002, 'epoch': 0.33}


 33%|███▎      | 4420/13552 [25:41<58:19,  2.61it/s]  

{'loss': 0.1814, 'grad_norm': 0.6761395335197449, 'learning_rate': 0.0001522082809392853, 'epoch': 0.33}


 33%|███▎      | 4430/13552 [25:44<50:44,  3.00it/s]

{'loss': 0.1646, 'grad_norm': 0.07147442549467087, 'learning_rate': 0.00015201013214059467, 'epoch': 0.33}


 33%|███▎      | 4440/13552 [25:47<51:18,  2.96it/s]

{'loss': 0.1578, 'grad_norm': 0.4066547453403473, 'learning_rate': 0.00015181170301580777, 'epoch': 0.33}


 33%|███▎      | 4450/13552 [25:51<50:57,  2.98it/s]

{'loss': 0.1685, 'grad_norm': 0.7234295606613159, 'learning_rate': 0.00015161299463442503, 'epoch': 0.33}


 33%|███▎      | 4460/13552 [25:55<1:04:39,  2.34it/s]

{'loss': 0.1641, 'grad_norm': 0.7077316641807556, 'learning_rate': 0.00015141400806745214, 'epoch': 0.33}


 33%|███▎      | 4470/13552 [25:59<52:55,  2.86it/s]  

{'loss': 0.1468, 'grad_norm': 0.1241789236664772, 'learning_rate': 0.00015121474438739408, 'epoch': 0.33}


 33%|███▎      | 4480/13552 [26:02<51:33,  2.93it/s]

{'loss': 0.1756, 'grad_norm': 0.45076268911361694, 'learning_rate': 0.0001510152046682495, 'epoch': 0.33}


 33%|███▎      | 4490/13552 [26:06<51:10,  2.95it/s]

{'loss': 0.1662, 'grad_norm': 0.20331045985221863, 'learning_rate': 0.0001508153899855048, 'epoch': 0.33}


 33%|███▎      | 4500/13552 [26:09<51:16,  2.94it/s]

{'loss': 0.164, 'grad_norm': 0.40664029121398926, 'learning_rate': 0.00015061530141612835, 'epoch': 0.33}


 33%|███▎      | 4510/13552 [26:12<49:41,  3.03it/s]

{'loss': 0.1674, 'grad_norm': 0.16816234588623047, 'learning_rate': 0.00015041494003856487, 'epoch': 0.33}


 33%|███▎      | 4520/13552 [26:16<59:03,  2.55it/s]  

{'loss': 0.1649, 'grad_norm': 0.17805935442447662, 'learning_rate': 0.0001502143069327293, 'epoch': 0.33}


 33%|███▎      | 4530/13552 [26:20<51:57,  2.89it/s]

{'loss': 0.156, 'grad_norm': 0.12676283717155457, 'learning_rate': 0.0001500134031800013, 'epoch': 0.33}


 34%|███▎      | 4540/13552 [26:23<53:13,  2.82it/s]

{'loss': 0.1641, 'grad_norm': 0.3528221547603607, 'learning_rate': 0.00014981222986321915, 'epoch': 0.34}


 34%|███▎      | 4550/13552 [26:27<51:11,  2.93it/s]

{'loss': 0.1797, 'grad_norm': 0.45678502321243286, 'learning_rate': 0.0001496107880666741, 'epoch': 0.34}


 34%|███▎      | 4560/13552 [26:30<51:21,  2.92it/s]  

{'loss': 0.1718, 'grad_norm': 0.09482142329216003, 'learning_rate': 0.00014940907887610438, 'epoch': 0.34}


 34%|███▎      | 4570/13552 [26:34<54:17,  2.76it/s]  

{'loss': 0.1532, 'grad_norm': 0.07811805605888367, 'learning_rate': 0.00014920710337868964, 'epoch': 0.34}


 34%|███▍      | 4580/13552 [26:37<51:45,  2.89it/s]

{'loss': 0.1615, 'grad_norm': 0.10944973677396774, 'learning_rate': 0.00014900486266304465, 'epoch': 0.34}


 34%|███▍      | 4590/13552 [26:41<53:04,  2.81it/s]  

{'loss': 0.1409, 'grad_norm': 0.20429939031600952, 'learning_rate': 0.00014880235781921376, 'epoch': 0.34}


 34%|███▍      | 4600/13552 [26:45<52:19,  2.85it/s]

{'loss': 0.1488, 'grad_norm': 0.14955483376979828, 'learning_rate': 0.00014859958993866497, 'epoch': 0.34}


 34%|███▍      | 4610/13552 [26:48<51:28,  2.90it/s]

{'loss': 0.1639, 'grad_norm': 0.20806913077831268, 'learning_rate': 0.00014839656011428389, 'epoch': 0.34}


 34%|███▍      | 4620/13552 [26:52<51:42,  2.88it/s]  

{'loss': 0.1348, 'grad_norm': 0.06218046322464943, 'learning_rate': 0.00014819326944036807, 'epoch': 0.34}


 34%|███▍      | 4630/13552 [26:56<54:49,  2.71it/s]

{'loss': 0.1438, 'grad_norm': 0.22595436871051788, 'learning_rate': 0.00014798971901262094, 'epoch': 0.34}


 34%|███▍      | 4640/13552 [27:00<54:13,  2.74it/s]  

{'loss': 0.1458, 'grad_norm': 0.07679808139801025, 'learning_rate': 0.00014778590992814598, 'epoch': 0.34}


 34%|███▍      | 4650/13552 [27:04<1:02:26,  2.38it/s]

{'loss': 0.1446, 'grad_norm': 0.12594403326511383, 'learning_rate': 0.0001475818432854408, 'epoch': 0.34}


 34%|███▍      | 4660/13552 [27:07<53:41,  2.76it/s]  

{'loss': 0.15, 'grad_norm': 0.11874233931303024, 'learning_rate': 0.0001473775201843912, 'epoch': 0.34}


 34%|███▍      | 4670/13552 [27:11<49:56,  2.96it/s]

{'loss': 0.1561, 'grad_norm': 0.09358468651771545, 'learning_rate': 0.00014717294172626516, 'epoch': 0.34}


 35%|███▍      | 4680/13552 [27:14<51:00,  2.90it/s]  

{'loss': 0.1434, 'grad_norm': 0.9094814658164978, 'learning_rate': 0.00014696810901370717, 'epoch': 0.35}


 35%|███▍      | 4690/13552 [27:18<51:05,  2.89it/s]  

{'loss': 0.1443, 'grad_norm': 0.11283108592033386, 'learning_rate': 0.00014676302315073196, 'epoch': 0.35}


 35%|███▍      | 4700/13552 [27:22<54:17,  2.72it/s]  

{'loss': 0.1544, 'grad_norm': 0.08747916668653488, 'learning_rate': 0.0001465576852427188, 'epoch': 0.35}


 35%|███▍      | 4710/13552 [27:25<48:30,  3.04it/s]

{'loss': 0.1482, 'grad_norm': 0.20610706508159637, 'learning_rate': 0.00014635209639640533, 'epoch': 0.35}


 35%|███▍      | 4720/13552 [27:29<50:17,  2.93it/s]

{'loss': 0.1388, 'grad_norm': 0.044330429285764694, 'learning_rate': 0.0001461462577198818, 'epoch': 0.35}


 35%|███▍      | 4730/13552 [27:32<47:34,  3.09it/s]

{'loss': 0.166, 'grad_norm': 0.059494271874427795, 'learning_rate': 0.00014594017032258492, 'epoch': 0.35}


 35%|███▍      | 4740/13552 [27:35<51:12,  2.87it/s]

{'loss': 0.1495, 'grad_norm': 0.6920559406280518, 'learning_rate': 0.00014573383531529214, 'epoch': 0.35}


 35%|███▌      | 4750/13552 [27:39<1:00:11,  2.44it/s]

{'loss': 0.1488, 'grad_norm': 0.07011174410581589, 'learning_rate': 0.00014552725381011526, 'epoch': 0.35}


 35%|███▌      | 4760/13552 [27:43<1:09:50,  2.10it/s]

{'loss': 0.1323, 'grad_norm': 0.057635121047496796, 'learning_rate': 0.0001453204269204948, 'epoch': 0.35}


 35%|███▌      | 4770/13552 [27:47<51:34,  2.84it/s]  

{'loss': 0.154, 'grad_norm': 0.24352312088012695, 'learning_rate': 0.00014511335576119385, 'epoch': 0.35}


 35%|███▌      | 4780/13552 [27:50<52:49,  2.77it/s]

{'loss': 0.1813, 'grad_norm': 0.6744728088378906, 'learning_rate': 0.00014490604144829202, 'epoch': 0.35}


 35%|███▌      | 4790/13552 [27:54<51:05,  2.86it/s]

{'loss': 0.1296, 'grad_norm': 0.07735253125429153, 'learning_rate': 0.00014469848509917954, 'epoch': 0.35}


 35%|███▌      | 4800/13552 [27:57<49:05,  2.97it/s]

{'loss': 0.1566, 'grad_norm': 0.14215700328350067, 'learning_rate': 0.0001444906878325511, 'epoch': 0.35}


 35%|███▌      | 4810/13552 [28:01<48:40,  2.99it/s]

{'loss': 0.1532, 'grad_norm': 0.09599599987268448, 'learning_rate': 0.0001442826507684, 'epoch': 0.35}


 36%|███▌      | 4820/13552 [28:04<48:04,  3.03it/s]

{'loss': 0.1454, 'grad_norm': 0.09048296511173248, 'learning_rate': 0.00014407437502801193, 'epoch': 0.36}


 36%|███▌      | 4830/13552 [28:08<55:36,  2.61it/s]  

{'loss': 0.1507, 'grad_norm': 0.23505578935146332, 'learning_rate': 0.00014386586173395903, 'epoch': 0.36}


 36%|███▌      | 4840/13552 [28:11<51:37,  2.81it/s]

{'loss': 0.1551, 'grad_norm': 0.5466187000274658, 'learning_rate': 0.0001436571120100938, 'epoch': 0.36}


 36%|███▌      | 4850/13552 [28:15<53:11,  2.73it/s]

{'loss': 0.1542, 'grad_norm': 0.3985898196697235, 'learning_rate': 0.00014344812698154305, 'epoch': 0.36}


 36%|███▌      | 4860/13552 [28:18<51:57,  2.79it/s]

{'loss': 0.1711, 'grad_norm': 0.8476597666740417, 'learning_rate': 0.00014323890777470194, 'epoch': 0.36}


 36%|███▌      | 4870/13552 [28:22<49:12,  2.94it/s]

{'loss': 0.161, 'grad_norm': 1.0155912637710571, 'learning_rate': 0.0001430294555172277, 'epoch': 0.36}


 36%|███▌      | 4880/13552 [28:25<48:21,  2.99it/s]

{'loss': 0.1411, 'grad_norm': 0.07585343718528748, 'learning_rate': 0.0001428197713380337, 'epoch': 0.36}


 36%|███▌      | 4890/13552 [28:29<48:31,  2.98it/s]

{'loss': 0.151, 'grad_norm': 0.09926056116819382, 'learning_rate': 0.00014260985636728332, 'epoch': 0.36}


 36%|███▌      | 4900/13552 [28:32<52:35,  2.74it/s]  

{'loss': 0.1492, 'grad_norm': 0.06625999510288239, 'learning_rate': 0.00014239971173638392, 'epoch': 0.36}


 36%|███▌      | 4910/13552 [28:36<52:16,  2.76it/s]

{'loss': 0.1484, 'grad_norm': 0.08975640684366226, 'learning_rate': 0.00014218933857798057, 'epoch': 0.36}


 36%|███▋      | 4920/13552 [28:39<48:01,  3.00it/s]

{'loss': 0.169, 'grad_norm': 0.09822705388069153, 'learning_rate': 0.00014197873802595026, 'epoch': 0.36}


 36%|███▋      | 4930/13552 [28:43<52:14,  2.75it/s]

{'loss': 0.1284, 'grad_norm': 0.11215148866176605, 'learning_rate': 0.0001417679112153954, 'epoch': 0.36}


 36%|███▋      | 4940/13552 [28:46<49:08,  2.92it/s]

{'loss': 0.1521, 'grad_norm': 0.2445678561925888, 'learning_rate': 0.000141556859282638, 'epoch': 0.36}


 37%|███▋      | 4950/13552 [28:50<50:06,  2.86it/s]

{'loss': 0.1561, 'grad_norm': 0.4847537577152252, 'learning_rate': 0.00014134558336521342, 'epoch': 0.37}


 37%|███▋      | 4960/13552 [28:53<48:30,  2.95it/s]

{'loss': 0.1584, 'grad_norm': 0.07223285734653473, 'learning_rate': 0.00014113408460186429, 'epoch': 0.37}


 37%|███▋      | 4970/13552 [28:57<48:27,  2.95it/s]

{'loss': 0.174, 'grad_norm': 0.12805208563804626, 'learning_rate': 0.00014092236413253427, 'epoch': 0.37}


 37%|███▋      | 4980/13552 [29:01<51:16,  2.79it/s]

{'loss': 0.1668, 'grad_norm': 0.13308364152908325, 'learning_rate': 0.00014071042309836206, 'epoch': 0.37}


 37%|███▋      | 4990/13552 [29:04<48:24,  2.95it/s]  

{'loss': 0.1593, 'grad_norm': 0.09977582097053528, 'learning_rate': 0.00014049826264167511, 'epoch': 0.37}


 37%|███▋      | 5000/13552 [29:08<57:46,  2.47it/s]  

{'loss': 0.149, 'grad_norm': 0.6413902044296265, 'learning_rate': 0.0001402858839059836, 'epoch': 0.37}


 37%|███▋      | 5010/13552 [29:11<49:01,  2.90it/s]

{'loss': 0.1709, 'grad_norm': 0.13800421357154846, 'learning_rate': 0.00014007328803597403, 'epoch': 0.37}


 37%|███▋      | 5020/13552 [29:15<50:59,  2.79it/s]

{'loss': 0.1861, 'grad_norm': 0.3513883054256439, 'learning_rate': 0.00013986047617750339, 'epoch': 0.37}


 37%|███▋      | 5030/13552 [29:18<48:32,  2.93it/s]

{'loss': 0.1481, 'grad_norm': 0.05866071209311485, 'learning_rate': 0.00013964744947759277, 'epoch': 0.37}


 37%|███▋      | 5040/13552 [29:22<47:49,  2.97it/s]

{'loss': 0.1646, 'grad_norm': 0.05874480679631233, 'learning_rate': 0.0001394342090844212, 'epoch': 0.37}


 37%|███▋      | 5050/13552 [29:26<58:27,  2.42it/s]  

{'loss': 0.1446, 'grad_norm': 0.22795982658863068, 'learning_rate': 0.00013922075614731947, 'epoch': 0.37}


 37%|███▋      | 5060/13552 [29:29<48:40,  2.91it/s]

{'loss': 0.1452, 'grad_norm': 0.05724562704563141, 'learning_rate': 0.00013900709181676396, 'epoch': 0.37}


 37%|███▋      | 5070/13552 [29:33<48:07,  2.94it/s]

{'loss': 0.1386, 'grad_norm': 0.49263229966163635, 'learning_rate': 0.0001387932172443704, 'epoch': 0.37}


 37%|███▋      | 5080/13552 [29:36<48:42,  2.90it/s]

{'loss': 0.1525, 'grad_norm': 0.07743944972753525, 'learning_rate': 0.00013857913358288776, 'epoch': 0.37}


 38%|███▊      | 5090/13552 [29:40<46:01,  3.06it/s]

{'loss': 0.1698, 'grad_norm': 0.21246936917304993, 'learning_rate': 0.00013836484198619193, 'epoch': 0.38}


 38%|███▊      | 5100/13552 [29:43<47:44,  2.95it/s]

{'loss': 0.2131, 'grad_norm': 0.09462888538837433, 'learning_rate': 0.00013815034360927948, 'epoch': 0.38}


 38%|███▊      | 5110/13552 [29:46<44:41,  3.15it/s]

{'loss': 0.1614, 'grad_norm': 0.655572235584259, 'learning_rate': 0.00013793563960826156, 'epoch': 0.38}


 38%|███▊      | 5120/13552 [29:50<45:24,  3.09it/s]

{'loss': 0.1591, 'grad_norm': 0.08936659246683121, 'learning_rate': 0.00013772073114035762, 'epoch': 0.38}


 38%|███▊      | 5130/13552 [29:53<1:02:38,  2.24it/s]

{'loss': 0.1422, 'grad_norm': 0.0740797147154808, 'learning_rate': 0.00013750561936388907, 'epoch': 0.38}


 38%|███▊      | 5140/13552 [29:57<45:56,  3.05it/s]  

{'loss': 0.164, 'grad_norm': 0.13534638285636902, 'learning_rate': 0.00013729030543827315, 'epoch': 0.38}


 38%|███▊      | 5150/13552 [30:00<44:51,  3.12it/s]

{'loss': 0.1551, 'grad_norm': 0.09650338441133499, 'learning_rate': 0.0001370747905240167, 'epoch': 0.38}


 38%|███▊      | 5160/13552 [30:04<45:05,  3.10it/s]

{'loss': 0.1688, 'grad_norm': 0.39426445960998535, 'learning_rate': 0.0001368590757827098, 'epoch': 0.38}


 38%|███▊      | 5170/13552 [30:07<46:14,  3.02it/s]

{'loss': 0.1775, 'grad_norm': 7.796167373657227, 'learning_rate': 0.00013664316237701964, 'epoch': 0.38}


 38%|███▊      | 5180/13552 [30:10<45:18,  3.08it/s]

{'loss': 0.1372, 'grad_norm': 0.0846213847398758, 'learning_rate': 0.00013642705147068402, 'epoch': 0.38}


 38%|███▊      | 5190/13552 [30:14<45:15,  3.08it/s]

{'loss': 0.1537, 'grad_norm': 0.1030745580792427, 'learning_rate': 0.00013621074422850543, 'epoch': 0.38}


 38%|███▊      | 5200/13552 [30:17<44:33,  3.12it/s]

{'loss': 0.1588, 'grad_norm': 0.22873395681381226, 'learning_rate': 0.00013599424181634441, 'epoch': 0.38}


 38%|███▊      | 5210/13552 [30:21<56:28,  2.46it/s]  

{'loss': 0.1409, 'grad_norm': 0.3535807132720947, 'learning_rate': 0.00013577754540111363, 'epoch': 0.38}


 39%|███▊      | 5220/13552 [30:24<55:26,  2.50it/s]

{'loss': 0.1657, 'grad_norm': 0.07015583664178848, 'learning_rate': 0.00013556065615077118, 'epoch': 0.39}


 39%|███▊      | 5230/13552 [30:28<46:47,  2.96it/s]

{'loss': 0.1492, 'grad_norm': 0.07346407324075699, 'learning_rate': 0.00013534357523431464, 'epoch': 0.39}


 39%|███▊      | 5240/13552 [30:31<46:50,  2.96it/s]

{'loss': 0.1301, 'grad_norm': 0.06059110164642334, 'learning_rate': 0.0001351263038217746, 'epoch': 0.39}


 39%|███▊      | 5250/13552 [30:35<46:48,  2.96it/s]

{'loss': 0.1526, 'grad_norm': 0.090203195810318, 'learning_rate': 0.00013490884308420845, 'epoch': 0.39}


 39%|███▉      | 5260/13552 [30:38<47:55,  2.88it/s]

{'loss': 0.1543, 'grad_norm': 0.33524635434150696, 'learning_rate': 0.00013469119419369386, 'epoch': 0.39}


 39%|███▉      | 5270/13552 [30:42<47:27,  2.91it/s]

{'loss': 0.1648, 'grad_norm': 0.5337138772010803, 'learning_rate': 0.00013447335832332278, 'epoch': 0.39}


 39%|███▉      | 5280/13552 [30:45<46:33,  2.96it/s]

{'loss': 0.1371, 'grad_norm': 0.04899156838655472, 'learning_rate': 0.00013425533664719488, 'epoch': 0.39}


 39%|███▉      | 5290/13552 [30:49<47:23,  2.91it/s]  

{'loss': 0.1465, 'grad_norm': 0.3505100905895233, 'learning_rate': 0.0001340371303404113, 'epoch': 0.39}


 39%|███▉      | 5300/13552 [30:53<46:44,  2.94it/s]  

{'loss': 0.1401, 'grad_norm': 0.3628348112106323, 'learning_rate': 0.00013381874057906827, 'epoch': 0.39}


 39%|███▉      | 5310/13552 [30:56<47:45,  2.88it/s]

{'loss': 0.1636, 'grad_norm': 0.06135358661413193, 'learning_rate': 0.00013360016854025087, 'epoch': 0.39}


 39%|███▉      | 5320/13552 [31:00<48:23,  2.84it/s]

{'loss': 0.1517, 'grad_norm': 0.12014520913362503, 'learning_rate': 0.00013338141540202662, 'epoch': 0.39}


 39%|███▉      | 5330/13552 [31:04<59:12,  2.31it/s]

{'loss': 0.1462, 'grad_norm': 0.0855148434638977, 'learning_rate': 0.0001331624823434391, 'epoch': 0.39}


 39%|███▉      | 5340/13552 [31:08<48:06,  2.85it/s]  

{'loss': 0.1471, 'grad_norm': 0.07948046922683716, 'learning_rate': 0.00013294337054450166, 'epoch': 0.39}


 39%|███▉      | 5350/13552 [31:11<50:42,  2.70it/s]

{'loss': 0.1805, 'grad_norm': 0.5899810194969177, 'learning_rate': 0.00013272408118619096, 'epoch': 0.39}


 40%|███▉      | 5360/13552 [31:15<45:59,  2.97it/s]

{'loss': 0.1549, 'grad_norm': 0.07688143104314804, 'learning_rate': 0.0001325046154504408, 'epoch': 0.4}


 40%|███▉      | 5370/13552 [31:18<46:27,  2.94it/s]

{'loss': 0.1584, 'grad_norm': 0.08374939858913422, 'learning_rate': 0.00013228497452013552, 'epoch': 0.4}


 40%|███▉      | 5380/13552 [31:21<45:50,  2.97it/s]

{'loss': 0.1664, 'grad_norm': 0.22056323289871216, 'learning_rate': 0.00013206515957910384, 'epoch': 0.4}


 40%|███▉      | 5390/13552 [31:25<49:37,  2.74it/s]

{'loss': 0.1475, 'grad_norm': 0.06080124154686928, 'learning_rate': 0.00013184517181211224, 'epoch': 0.4}


 40%|███▉      | 5400/13552 [31:29<50:56,  2.67it/s]

{'loss': 0.1341, 'grad_norm': 0.06289230287075043, 'learning_rate': 0.00013162501240485878, 'epoch': 0.4}


 40%|███▉      | 5410/13552 [31:32<47:17,  2.87it/s]

{'loss': 0.1599, 'grad_norm': 0.0960937887430191, 'learning_rate': 0.0001314046825439666, 'epoch': 0.4}


 40%|███▉      | 5420/13552 [31:36<45:28,  2.98it/s]

{'loss': 0.147, 'grad_norm': 0.1025860458612442, 'learning_rate': 0.0001311841834169776, 'epoch': 0.4}


 40%|████      | 5430/13552 [31:39<48:25,  2.80it/s]

{'loss': 0.1513, 'grad_norm': 0.11990869045257568, 'learning_rate': 0.00013096351621234595, 'epoch': 0.4}


 40%|████      | 5440/13552 [31:43<45:43,  2.96it/s]

{'loss': 0.148, 'grad_norm': 0.06390497833490372, 'learning_rate': 0.00013074268211943178, 'epoch': 0.4}


 40%|████      | 5450/13552 [31:46<47:58,  2.81it/s]

{'loss': 0.1438, 'grad_norm': 0.057260237634181976, 'learning_rate': 0.00013052168232849466, 'epoch': 0.4}


 40%|████      | 5460/13552 [31:50<45:14,  2.98it/s]

{'loss': 0.1605, 'grad_norm': 0.09198635816574097, 'learning_rate': 0.00013030051803068727, 'epoch': 0.4}


 40%|████      | 5470/13552 [31:53<47:35,  2.83it/s]

{'loss': 0.131, 'grad_norm': 0.07950647920370102, 'learning_rate': 0.00013007919041804904, 'epoch': 0.4}


 40%|████      | 5480/13552 [31:57<46:57,  2.86it/s]

{'loss': 0.1501, 'grad_norm': 0.3781164884567261, 'learning_rate': 0.00012985770068349946, 'epoch': 0.4}


 41%|████      | 5490/13552 [32:00<48:11,  2.79it/s]

{'loss': 0.1842, 'grad_norm': 0.12754598259925842, 'learning_rate': 0.000129636050020832, 'epoch': 0.41}


 41%|████      | 5500/13552 [32:04<55:45,  2.41it/s]  

{'loss': 0.1499, 'grad_norm': 0.24025844037532806, 'learning_rate': 0.0001294142396247074, 'epoch': 0.41}


 41%|████      | 5510/13552 [32:08<47:41,  2.81it/s]

{'loss': 0.1415, 'grad_norm': 0.08007631450891495, 'learning_rate': 0.00012919227069064742, 'epoch': 0.41}


 41%|████      | 5520/13552 [32:11<44:57,  2.98it/s]

{'loss': 0.1539, 'grad_norm': 0.08529098331928253, 'learning_rate': 0.00012897014441502826, 'epoch': 0.41}


 41%|████      | 5530/13552 [32:15<49:46,  2.69it/s]

{'loss': 0.1291, 'grad_norm': 0.07652618736028671, 'learning_rate': 0.00012874786199507413, 'epoch': 0.41}


 41%|████      | 5540/13552 [32:18<46:14,  2.89it/s]

{'loss': 0.1434, 'grad_norm': 0.24106355011463165, 'learning_rate': 0.0001285254246288509, 'epoch': 0.41}


 41%|████      | 5550/13552 [32:22<51:18,  2.60it/s]

{'loss': 0.1512, 'grad_norm': 0.07046986371278763, 'learning_rate': 0.0001283028335152596, 'epoch': 0.41}


 41%|████      | 5560/13552 [32:26<46:03,  2.89it/s]

{'loss': 0.1503, 'grad_norm': 0.06337303668260574, 'learning_rate': 0.00012808008985402985, 'epoch': 0.41}


 41%|████      | 5570/13552 [32:29<46:05,  2.89it/s]

{'loss': 0.1429, 'grad_norm': 0.06427056342363358, 'learning_rate': 0.0001278571948457135, 'epoch': 0.41}


 41%|████      | 5580/13552 [32:33<46:27,  2.86it/s]

{'loss': 0.1564, 'grad_norm': 0.08630508929491043, 'learning_rate': 0.00012763414969167817, 'epoch': 0.41}


 41%|████      | 5590/13552 [32:36<1:00:54,  2.18it/s]

{'loss': 0.1849, 'grad_norm': 0.19362697005271912, 'learning_rate': 0.0001274109555941007, 'epoch': 0.41}


 41%|████▏     | 5600/13552 [32:40<46:18,  2.86it/s]  

{'loss': 0.1622, 'grad_norm': 0.18418660759925842, 'learning_rate': 0.00012718761375596073, 'epoch': 0.41}


 41%|████▏     | 5610/13552 [32:43<48:42,  2.72it/s]

{'loss': 0.1325, 'grad_norm': 0.08418998122215271, 'learning_rate': 0.00012696412538103425, 'epoch': 0.41}


 41%|████▏     | 5620/13552 [32:47<44:48,  2.95it/s]

{'loss': 0.154, 'grad_norm': 0.06631176918745041, 'learning_rate': 0.00012674049167388693, 'epoch': 0.41}


 42%|████▏     | 5630/13552 [32:50<50:00,  2.64it/s]

{'loss': 0.1802, 'grad_norm': 0.11687679588794708, 'learning_rate': 0.00012651671383986788, 'epoch': 0.42}


 42%|████▏     | 5640/13552 [32:54<46:54,  2.81it/s]

{'loss': 0.1492, 'grad_norm': 0.09069252014160156, 'learning_rate': 0.000126292793085103, 'epoch': 0.42}


 42%|████▏     | 5650/13552 [32:57<43:50,  3.00it/s]

{'loss': 0.1584, 'grad_norm': 0.09306317567825317, 'learning_rate': 0.00012606873061648844, 'epoch': 0.42}


 42%|████▏     | 5660/13552 [33:01<47:36,  2.76it/s]

{'loss': 0.1497, 'grad_norm': 0.26893579959869385, 'learning_rate': 0.00012584452764168423, 'epoch': 0.42}


 42%|████▏     | 5670/13552 [33:05<49:18,  2.66it/s]

{'loss': 0.1644, 'grad_norm': 0.14509046077728271, 'learning_rate': 0.00012562018536910777, 'epoch': 0.42}


 42%|████▏     | 5680/13552 [33:08<42:39,  3.08it/s]

{'loss': 0.1778, 'grad_norm': 1.602662444114685, 'learning_rate': 0.00012539570500792713, 'epoch': 0.42}


 42%|████▏     | 5690/13552 [33:12<49:04,  2.67it/s]  

{'loss': 0.1802, 'grad_norm': 0.08098282665014267, 'learning_rate': 0.00012517108776805466, 'epoch': 0.42}


 42%|████▏     | 5700/13552 [33:16<44:18,  2.95it/s]

{'loss': 0.1505, 'grad_norm': 0.17221513390541077, 'learning_rate': 0.00012494633486014053, 'epoch': 0.42}


 42%|████▏     | 5710/13552 [33:19<49:51,  2.62it/s]

{'loss': 0.15, 'grad_norm': 0.040725283324718475, 'learning_rate': 0.0001247214474955661, 'epoch': 0.42}


 42%|████▏     | 5720/13552 [33:23<46:16,  2.82it/s]

{'loss': 0.1762, 'grad_norm': 0.3420715630054474, 'learning_rate': 0.0001244964268864375, 'epoch': 0.42}


 42%|████▏     | 5730/13552 [33:26<52:16,  2.49it/s]

{'loss': 0.1399, 'grad_norm': 0.18613941967487335, 'learning_rate': 0.00012427127424557893, 'epoch': 0.42}


 42%|████▏     | 5740/13552 [33:30<43:47,  2.97it/s]

{'loss': 0.1592, 'grad_norm': 0.12772756814956665, 'learning_rate': 0.00012404599078652626, 'epoch': 0.42}


 42%|████▏     | 5750/13552 [33:33<44:41,  2.91it/s]

{'loss': 0.1507, 'grad_norm': 0.12475504726171494, 'learning_rate': 0.00012382057772352043, 'epoch': 0.42}


 43%|████▎     | 5760/13552 [33:37<43:06,  3.01it/s]

{'loss': 0.1568, 'grad_norm': 0.12397751957178116, 'learning_rate': 0.000123595036271501, 'epoch': 0.43}


 43%|████▎     | 5770/13552 [33:41<49:21,  2.63it/s]  

{'loss': 0.1449, 'grad_norm': 0.2786450684070587, 'learning_rate': 0.0001233693676460994, 'epoch': 0.43}


 43%|████▎     | 5780/13552 [33:44<43:15,  2.99it/s]

{'loss': 0.1448, 'grad_norm': 0.06208149343729019, 'learning_rate': 0.0001231435730636326, 'epoch': 0.43}


 43%|████▎     | 5790/13552 [33:48<43:13,  2.99it/s]

{'loss': 0.1363, 'grad_norm': 0.37206581234931946, 'learning_rate': 0.00012291765374109642, 'epoch': 0.43}


 43%|████▎     | 5800/13552 [33:52<55:23,  2.33it/s]

{'loss': 0.1407, 'grad_norm': 0.6275725364685059, 'learning_rate': 0.000122691610896159, 'epoch': 0.43}


 43%|████▎     | 5810/13552 [33:55<46:48,  2.76it/s]

{'loss': 0.142, 'grad_norm': 0.08569289743900299, 'learning_rate': 0.00012246544574715434, 'epoch': 0.43}


 43%|████▎     | 5820/13552 [33:59<44:22,  2.90it/s]

{'loss': 0.1606, 'grad_norm': 1.254773497581482, 'learning_rate': 0.00012223915951307547, 'epoch': 0.43}


 43%|████▎     | 5830/13552 [34:02<42:51,  3.00it/s]

{'loss': 0.1689, 'grad_norm': 0.09302239865064621, 'learning_rate': 0.0001220127534135682, 'epoch': 0.43}


 43%|████▎     | 5840/13552 [34:06<44:11,  2.91it/s]  

{'loss': 0.1451, 'grad_norm': 0.09309684485197067, 'learning_rate': 0.0001217862286689243, 'epoch': 0.43}


 43%|████▎     | 5850/13552 [34:09<41:46,  3.07it/s]

{'loss': 0.1454, 'grad_norm': 0.3188503086566925, 'learning_rate': 0.00012155958650007509, 'epoch': 0.43}


 43%|████▎     | 5860/13552 [34:13<44:14,  2.90it/s]

{'loss': 0.1571, 'grad_norm': 0.09628219902515411, 'learning_rate': 0.00012133282812858473, 'epoch': 0.43}


 43%|████▎     | 5870/13552 [34:16<41:29,  3.09it/s]

{'loss': 0.1509, 'grad_norm': 0.17655834555625916, 'learning_rate': 0.00012110595477664368, 'epoch': 0.43}


 43%|████▎     | 5880/13552 [34:20<51:44,  2.47it/s]

{'loss': 0.1669, 'grad_norm': 0.03935711830854416, 'learning_rate': 0.00012087896766706222, 'epoch': 0.43}


 43%|████▎     | 5890/13552 [34:23<41:23,  3.09it/s]

{'loss': 0.1541, 'grad_norm': 0.09490133821964264, 'learning_rate': 0.00012065186802326363, 'epoch': 0.43}


 44%|████▎     | 5900/13552 [34:27<43:52,  2.91it/s]

{'loss': 0.1582, 'grad_norm': 0.34538835287094116, 'learning_rate': 0.0001204246570692779, 'epoch': 0.44}


 44%|████▎     | 5910/13552 [34:30<41:02,  3.10it/s]

{'loss': 0.1552, 'grad_norm': 0.05941597744822502, 'learning_rate': 0.0001201973360297348, 'epoch': 0.44}


 44%|████▎     | 5920/13552 [34:33<42:45,  2.98it/s]

{'loss': 0.1507, 'grad_norm': 0.07144922018051147, 'learning_rate': 0.00011996990612985755, 'epoch': 0.44}


 44%|████▍     | 5930/13552 [34:36<41:37,  3.05it/s]

{'loss': 0.1429, 'grad_norm': 0.04580823704600334, 'learning_rate': 0.00011974236859545602, 'epoch': 0.44}


 44%|████▍     | 5940/13552 [34:40<40:34,  3.13it/s]

{'loss': 0.1691, 'grad_norm': 0.3204388916492462, 'learning_rate': 0.00011951472465292033, 'epoch': 0.44}


 44%|████▍     | 5950/13552 [34:43<40:54,  3.10it/s]

{'loss': 0.1513, 'grad_norm': 0.2026282399892807, 'learning_rate': 0.00011928697552921402, 'epoch': 0.44}


 44%|████▍     | 5960/13552 [34:47<47:13,  2.68it/s]

{'loss': 0.1396, 'grad_norm': 0.0912962257862091, 'learning_rate': 0.00011905912245186759, 'epoch': 0.44}


 44%|████▍     | 5970/13552 [34:50<44:27,  2.84it/s]

{'loss': 0.1604, 'grad_norm': 0.06576354801654816, 'learning_rate': 0.00011883116664897178, 'epoch': 0.44}


 44%|████▍     | 5980/13552 [34:54<41:21,  3.05it/s]

{'loss': 0.1444, 'grad_norm': 0.15755772590637207, 'learning_rate': 0.00011860310934917113, 'epoch': 0.44}


 44%|████▍     | 5990/13552 [34:57<40:31,  3.11it/s]

{'loss': 0.1614, 'grad_norm': 0.10284831374883652, 'learning_rate': 0.00011837495178165706, 'epoch': 0.44}


 44%|████▍     | 6000/13552 [35:00<42:55,  2.93it/s]

{'loss': 0.1505, 'grad_norm': 0.10576114803552628, 'learning_rate': 0.0001181466951761615, 'epoch': 0.44}


 44%|████▍     | 6010/13552 [35:04<40:51,  3.08it/s]

{'loss': 0.1469, 'grad_norm': 0.3288692831993103, 'learning_rate': 0.00011791834076295022, 'epoch': 0.44}


 44%|████▍     | 6020/13552 [35:07<40:33,  3.09it/s]

{'loss': 0.1628, 'grad_norm': 0.08178874105215073, 'learning_rate': 0.00011768988977281614, 'epoch': 0.44}


 44%|████▍     | 6030/13552 [35:10<40:16,  3.11it/s]

{'loss': 0.1493, 'grad_norm': 0.17445902526378632, 'learning_rate': 0.00011746134343707266, 'epoch': 0.44}


 45%|████▍     | 6040/13552 [35:13<40:13,  3.11it/s]

{'loss': 0.1664, 'grad_norm': 0.11945789307355881, 'learning_rate': 0.0001172327029875471, 'epoch': 0.45}


 45%|████▍     | 6050/13552 [35:17<41:24,  3.02it/s]

{'loss': 0.1314, 'grad_norm': 0.05859432369470596, 'learning_rate': 0.00011700396965657405, 'epoch': 0.45}


 45%|████▍     | 6060/13552 [35:20<39:52,  3.13it/s]

{'loss': 0.1652, 'grad_norm': 0.10339373350143433, 'learning_rate': 0.00011677514467698868, 'epoch': 0.45}


 45%|████▍     | 6070/13552 [35:23<43:26,  2.87it/s]

{'loss': 0.1266, 'grad_norm': 0.14969736337661743, 'learning_rate': 0.00011654622928212027, 'epoch': 0.45}


 45%|████▍     | 6080/13552 [35:27<40:00,  3.11it/s]

{'loss': 0.155, 'grad_norm': 0.6639183163642883, 'learning_rate': 0.00011631722470578519, 'epoch': 0.45}


 45%|████▍     | 6090/13552 [35:30<40:52,  3.04it/s]

{'loss': 0.146, 'grad_norm': 0.0931282714009285, 'learning_rate': 0.00011608813218228067, 'epoch': 0.45}


 45%|████▌     | 6100/13552 [35:33<39:55,  3.11it/s]

{'loss': 0.1605, 'grad_norm': 0.13551808893680573, 'learning_rate': 0.00011585895294637792, 'epoch': 0.45}


 45%|████▌     | 6110/13552 [35:36<41:07,  3.02it/s]

{'loss': 0.1617, 'grad_norm': 0.12895545363426208, 'learning_rate': 0.00011562968823331546, 'epoch': 0.45}


 45%|████▌     | 6120/13552 [35:40<42:06,  2.94it/s]

{'loss': 0.1523, 'grad_norm': 0.09229263663291931, 'learning_rate': 0.00011540033927879255, 'epoch': 0.45}


 45%|████▌     | 6130/13552 [35:43<42:08,  2.94it/s]

{'loss': 0.147, 'grad_norm': 0.10302837193012238, 'learning_rate': 0.00011517090731896254, 'epoch': 0.45}


 45%|████▌     | 6140/13552 [35:47<41:05,  3.01it/s]

{'loss': 0.1635, 'grad_norm': 0.0734330490231514, 'learning_rate': 0.0001149413935904261, 'epoch': 0.45}


 45%|████▌     | 6150/13552 [35:50<40:44,  3.03it/s]

{'loss': 0.1463, 'grad_norm': 0.11380065977573395, 'learning_rate': 0.00011471179933022467, 'epoch': 0.45}


 45%|████▌     | 6160/13552 [35:54<40:50,  3.02it/s]

{'loss': 0.1404, 'grad_norm': 0.11615323275327682, 'learning_rate': 0.00011448212577583368, 'epoch': 0.45}


 46%|████▌     | 6170/13552 [35:57<41:16,  2.98it/s]

{'loss': 0.1528, 'grad_norm': 0.10190681368112564, 'learning_rate': 0.000114252374165156, 'epoch': 0.46}


 46%|████▌     | 6180/13552 [36:01<45:04,  2.73it/s]

{'loss': 0.1702, 'grad_norm': 0.21613085269927979, 'learning_rate': 0.00011402254573651521, 'epoch': 0.46}


 46%|████▌     | 6190/13552 [36:04<41:09,  2.98it/s]

{'loss': 0.1456, 'grad_norm': 0.05951038375496864, 'learning_rate': 0.00011379264172864892, 'epoch': 0.46}


 46%|████▌     | 6200/13552 [36:08<42:14,  2.90it/s]

{'loss': 0.1609, 'grad_norm': 0.1975880116224289, 'learning_rate': 0.00011356266338070205, 'epoch': 0.46}


 46%|████▌     | 6210/13552 [36:12<46:00,  2.66it/s]

{'loss': 0.1469, 'grad_norm': 0.1219887062907219, 'learning_rate': 0.00011333261193222027, 'epoch': 0.46}


 46%|████▌     | 6220/13552 [36:16<55:21,  2.21it/s]

{'loss': 0.1258, 'grad_norm': 0.04683425650000572, 'learning_rate': 0.00011310248862314318, 'epoch': 0.46}


 46%|████▌     | 6230/13552 [36:20<55:32,  2.20it/s]

{'loss': 0.1461, 'grad_norm': 0.09856817126274109, 'learning_rate': 0.00011287229469379777, 'epoch': 0.46}


 46%|████▌     | 6240/13552 [36:23<40:33,  3.00it/s]

{'loss': 0.1483, 'grad_norm': 0.297575980424881, 'learning_rate': 0.00011264203138489162, 'epoch': 0.46}


 46%|████▌     | 6250/13552 [36:27<41:42,  2.92it/s]

{'loss': 0.1435, 'grad_norm': 0.10080096125602722, 'learning_rate': 0.00011241169993750626, 'epoch': 0.46}


 46%|████▌     | 6260/13552 [36:30<40:04,  3.03it/s]

{'loss': 0.1399, 'grad_norm': 0.10324260592460632, 'learning_rate': 0.00011218130159309048, 'epoch': 0.46}


 46%|████▋     | 6270/13552 [36:34<49:53,  2.43it/s]

{'loss': 0.136, 'grad_norm': 0.05493934825062752, 'learning_rate': 0.00011195083759345364, 'epoch': 0.46}


 46%|████▋     | 6280/13552 [36:38<56:17,  2.15it/s]

{'loss': 0.1412, 'grad_norm': 0.6518208384513855, 'learning_rate': 0.00011172030918075895, 'epoch': 0.46}


 46%|████▋     | 6290/13552 [36:41<40:51,  2.96it/s]

{'loss': 0.1582, 'grad_norm': 0.08982034772634506, 'learning_rate': 0.00011148971759751682, 'epoch': 0.46}


 46%|████▋     | 6300/13552 [36:45<42:05,  2.87it/s]

{'loss': 0.1485, 'grad_norm': 0.07348059862852097, 'learning_rate': 0.00011125906408657811, 'epoch': 0.46}


 47%|████▋     | 6310/13552 [36:48<40:50,  2.96it/s]

{'loss': 0.1543, 'grad_norm': 0.08152903616428375, 'learning_rate': 0.00011102834989112751, 'epoch': 0.47}


 47%|████▋     | 6320/13552 [36:52<39:47,  3.03it/s]

{'loss': 0.157, 'grad_norm': 0.10042140632867813, 'learning_rate': 0.00011079757625467672, 'epoch': 0.47}


 47%|████▋     | 6330/13552 [36:55<40:07,  3.00it/s]

{'loss': 0.1446, 'grad_norm': 0.10134238749742508, 'learning_rate': 0.0001105667444210579, 'epoch': 0.47}


 47%|████▋     | 6340/13552 [36:58<42:27,  2.83it/s]

{'loss': 0.1481, 'grad_norm': 0.07332075387239456, 'learning_rate': 0.00011033585563441677, 'epoch': 0.47}


 47%|████▋     | 6350/13552 [37:02<42:02,  2.86it/s]

{'loss': 0.1571, 'grad_norm': 0.10503190755844116, 'learning_rate': 0.00011010491113920612, 'epoch': 0.47}


 47%|████▋     | 6360/13552 [37:05<39:52,  3.01it/s]

{'loss': 0.1376, 'grad_norm': 0.37468138337135315, 'learning_rate': 0.00010987391218017899, 'epoch': 0.47}


 47%|████▋     | 6370/13552 [37:09<39:44,  3.01it/s]

{'loss': 0.1613, 'grad_norm': 0.32235243916511536, 'learning_rate': 0.00010964286000238194, 'epoch': 0.47}


 47%|████▋     | 6380/13552 [37:12<39:57,  2.99it/s]

{'loss': 0.1469, 'grad_norm': 0.15976741909980774, 'learning_rate': 0.00010941175585114834, 'epoch': 0.47}


 47%|████▋     | 6390/13552 [37:15<39:37,  3.01it/s]

{'loss': 0.1398, 'grad_norm': 0.09111153334379196, 'learning_rate': 0.00010918060097209175, 'epoch': 0.47}


 47%|████▋     | 6400/13552 [37:19<44:32,  2.68it/s]

{'loss': 0.1398, 'grad_norm': 0.06508852541446686, 'learning_rate': 0.00010894939661109911, 'epoch': 0.47}


 47%|████▋     | 6410/13552 [37:22<39:10,  3.04it/s]

{'loss': 0.1592, 'grad_norm': 0.255973219871521, 'learning_rate': 0.00010871814401432408, 'epoch': 0.47}


 47%|████▋     | 6420/13552 [37:25<39:12,  3.03it/s]

{'loss': 0.1507, 'grad_norm': 0.1839796006679535, 'learning_rate': 0.00010848684442818028, 'epoch': 0.47}


 47%|████▋     | 6430/13552 [37:29<38:59,  3.04it/s]

{'loss': 0.1575, 'grad_norm': 0.08381551504135132, 'learning_rate': 0.00010825549909933463, 'epoch': 0.47}


 48%|████▊     | 6440/13552 [37:32<40:09,  2.95it/s]

{'loss': 0.1496, 'grad_norm': 0.27972835302352905, 'learning_rate': 0.00010802410927470057, 'epoch': 0.48}


 48%|████▊     | 6450/13552 [37:36<39:08,  3.02it/s]

{'loss': 0.1441, 'grad_norm': 0.0969802513718605, 'learning_rate': 0.00010779267620143134, 'epoch': 0.48}


 48%|████▊     | 6460/13552 [37:39<39:51,  2.97it/s]

{'loss': 0.1609, 'grad_norm': 0.11448521912097931, 'learning_rate': 0.00010756120112691332, 'epoch': 0.48}


 48%|████▊     | 6470/13552 [37:43<39:40,  2.98it/s]

{'loss': 0.1512, 'grad_norm': 0.31169620156288147, 'learning_rate': 0.00010732968529875927, 'epoch': 0.48}


 48%|████▊     | 6480/13552 [37:46<40:07,  2.94it/s]

{'loss': 0.1468, 'grad_norm': 0.1837426722049713, 'learning_rate': 0.0001070981299648016, 'epoch': 0.48}


 48%|████▊     | 6490/13552 [37:49<39:25,  2.99it/s]

{'loss': 0.1444, 'grad_norm': 0.06638551503419876, 'learning_rate': 0.00010686653637308564, 'epoch': 0.48}


 48%|████▊     | 6500/13552 [37:53<39:15,  2.99it/s]

{'loss': 0.1572, 'grad_norm': 0.10141381621360779, 'learning_rate': 0.00010663490577186295, 'epoch': 0.48}


 48%|████▊     | 6510/13552 [37:56<39:16,  2.99it/s]

{'loss': 0.1351, 'grad_norm': 0.13656651973724365, 'learning_rate': 0.0001064032394095845, 'epoch': 0.48}


 48%|████▊     | 6520/13552 [38:00<43:13,  2.71it/s]

{'loss': 0.1444, 'grad_norm': 0.06580260396003723, 'learning_rate': 0.00010617153853489403, 'epoch': 0.48}


 48%|████▊     | 6530/13552 [38:03<39:41,  2.95it/s]

{'loss': 0.1405, 'grad_norm': 0.10625696927309036, 'learning_rate': 0.0001059398043966214, 'epoch': 0.48}


 48%|████▊     | 6540/13552 [38:07<39:05,  2.99it/s]

{'loss': 0.1487, 'grad_norm': 0.08083610981702805, 'learning_rate': 0.00010570803824377568, 'epoch': 0.48}


 48%|████▊     | 6550/13552 [38:10<39:07,  2.98it/s]

{'loss': 0.155, 'grad_norm': 0.18632404506206512, 'learning_rate': 0.00010547624132553839, 'epoch': 0.48}


 48%|████▊     | 6560/13552 [38:14<40:03,  2.91it/s]

{'loss': 0.1541, 'grad_norm': 0.23836925625801086, 'learning_rate': 0.00010524441489125702, 'epoch': 0.48}


 48%|████▊     | 6570/13552 [38:17<42:29,  2.74it/s]

{'loss': 0.1458, 'grad_norm': 0.20357723534107208, 'learning_rate': 0.00010501256019043811, 'epoch': 0.48}


 49%|████▊     | 6580/13552 [38:20<39:06,  2.97it/s]

{'loss': 0.1541, 'grad_norm': 0.06914855539798737, 'learning_rate': 0.00010478067847274047, 'epoch': 0.49}


 49%|████▊     | 6590/13552 [38:24<38:45,  2.99it/s]

{'loss': 0.1539, 'grad_norm': 0.10131066292524338, 'learning_rate': 0.00010454877098796865, 'epoch': 0.49}


 49%|████▊     | 6600/13552 [38:27<39:48,  2.91it/s]

{'loss': 0.1461, 'grad_norm': 0.27039963006973267, 'learning_rate': 0.00010431683898606599, 'epoch': 0.49}


 49%|████▉     | 6610/13552 [38:31<38:29,  3.01it/s]

{'loss': 0.149, 'grad_norm': 0.059761740267276764, 'learning_rate': 0.00010408488371710805, 'epoch': 0.49}


 49%|████▉     | 6620/13552 [38:35<53:43,  2.15it/s]

{'loss': 0.1441, 'grad_norm': 0.0521487221121788, 'learning_rate': 0.0001038529064312957, 'epoch': 0.49}


 49%|████▉     | 6630/13552 [38:38<39:45,  2.90it/s]

{'loss': 0.1352, 'grad_norm': 0.05682746693491936, 'learning_rate': 0.00010362090837894853, 'epoch': 0.49}


 49%|████▉     | 6640/13552 [38:41<40:23,  2.85it/s]

{'loss': 0.1402, 'grad_norm': 0.040494125336408615, 'learning_rate': 0.0001033888908104981, 'epoch': 0.49}


 49%|████▉     | 6650/13552 [38:45<40:35,  2.83it/s]

{'loss': 0.1478, 'grad_norm': 0.10838676244020462, 'learning_rate': 0.00010315685497648106, 'epoch': 0.49}


 49%|████▉     | 6660/13552 [38:49<39:35,  2.90it/s]

{'loss': 0.1338, 'grad_norm': 0.05464246869087219, 'learning_rate': 0.00010292480212753259, 'epoch': 0.49}


 49%|████▉     | 6670/13552 [38:52<38:12,  3.00it/s]

{'loss': 0.1521, 'grad_norm': 0.061192262917757034, 'learning_rate': 0.00010269273351437958, 'epoch': 0.49}


 49%|████▉     | 6680/13552 [38:56<38:19,  2.99it/s]

{'loss': 0.1542, 'grad_norm': 0.0740707740187645, 'learning_rate': 0.00010246065038783382, 'epoch': 0.49}


 49%|████▉     | 6690/13552 [38:59<39:39,  2.88it/s]

{'loss': 0.1505, 'grad_norm': 0.07244253158569336, 'learning_rate': 0.00010222855399878531, 'epoch': 0.49}


 49%|████▉     | 6700/13552 [39:03<38:36,  2.96it/s]

{'loss': 0.1477, 'grad_norm': 0.10776180773973465, 'learning_rate': 0.00010199644559819567, 'epoch': 0.49}


 50%|████▉     | 6710/13552 [39:06<38:05,  2.99it/s]

{'loss': 0.1499, 'grad_norm': 0.07412145286798477, 'learning_rate': 0.0001017643264370912, 'epoch': 0.5}


 50%|████▉     | 6720/13552 [39:09<38:07,  2.99it/s]

{'loss': 0.1484, 'grad_norm': 0.1203649640083313, 'learning_rate': 0.00010153219776655606, 'epoch': 0.5}


 50%|████▉     | 6730/13552 [39:13<38:11,  2.98it/s]

{'loss': 0.1476, 'grad_norm': 0.08658339083194733, 'learning_rate': 0.00010130006083772586, 'epoch': 0.5}


 50%|████▉     | 6740/13552 [39:16<39:48,  2.85it/s]

{'loss': 0.1427, 'grad_norm': 0.043439142405986786, 'learning_rate': 0.00010106791690178059, 'epoch': 0.5}


 50%|████▉     | 6750/13552 [39:20<41:39,  2.72it/s]

{'loss': 0.1619, 'grad_norm': 0.07683214545249939, 'learning_rate': 0.00010083576720993808, 'epoch': 0.5}


 50%|████▉     | 6760/13552 [39:24<42:32,  2.66it/s]

{'loss': 0.1444, 'grad_norm': 0.04891765117645264, 'learning_rate': 0.00010060361301344716, 'epoch': 0.5}


 50%|████▉     | 6770/13552 [39:27<37:54,  2.98it/s]

{'loss': 0.1548, 'grad_norm': 0.09604059904813766, 'learning_rate': 0.00010037145556358094, 'epoch': 0.5}


 50%|█████     | 6780/13552 [39:31<39:14,  2.88it/s]

{'loss': 0.1305, 'grad_norm': 0.09041548520326614, 'learning_rate': 0.00010013929611163005, 'epoch': 0.5}


 50%|█████     | 6790/13552 [39:34<37:36,  3.00it/s]

{'loss': 0.1556, 'grad_norm': 0.07007892429828644, 'learning_rate': 9.99071359088959e-05, 'epoch': 0.5}


 50%|█████     | 6800/13552 [39:39<40:00,  2.81it/s]

{'loss': 0.1376, 'grad_norm': 0.7149993181228638, 'learning_rate': 9.967497620668402e-05, 'epoch': 0.5}


 50%|█████     | 6810/13552 [39:42<38:40,  2.91it/s]

{'loss': 0.1673, 'grad_norm': 0.08019503951072693, 'learning_rate': 9.944281825629717e-05, 'epoch': 0.5}


 50%|█████     | 6820/13552 [39:46<37:45,  2.97it/s]

{'loss': 0.1402, 'grad_norm': 0.06876727193593979, 'learning_rate': 9.92106633090287e-05, 'epoch': 0.5}


 50%|█████     | 6830/13552 [39:49<37:05,  3.02it/s]

{'loss': 0.156, 'grad_norm': 0.1450110673904419, 'learning_rate': 9.897851261615573e-05, 'epoch': 0.5}


 50%|█████     | 6840/13552 [39:52<38:08,  2.93it/s]

{'loss': 0.1491, 'grad_norm': 0.09832637012004852, 'learning_rate': 9.87463674289325e-05, 'epoch': 0.5}


 51%|█████     | 6850/13552 [39:56<38:52,  2.87it/s]

{'loss': 0.1503, 'grad_norm': 0.1023600623011589, 'learning_rate': 9.851422899858358e-05, 'epoch': 0.51}


 51%|█████     | 6860/13552 [40:00<39:46,  2.80it/s]

{'loss': 0.1732, 'grad_norm': 0.16337372362613678, 'learning_rate': 9.828209857629706e-05, 'epoch': 0.51}


 51%|█████     | 6870/13552 [40:04<48:52,  2.28it/s]

{'loss': 0.1444, 'grad_norm': 0.27347514033317566, 'learning_rate': 9.804997741321793e-05, 'epoch': 0.51}


 51%|█████     | 6880/13552 [40:07<39:51,  2.79it/s]

{'loss': 0.1451, 'grad_norm': 0.07311601936817169, 'learning_rate': 9.781786676044127e-05, 'epoch': 0.51}


 51%|█████     | 6890/13552 [40:11<46:35,  2.38it/s]

{'loss': 0.1729, 'grad_norm': 0.06412886828184128, 'learning_rate': 9.758576786900548e-05, 'epoch': 0.51}


 51%|█████     | 6900/13552 [40:14<37:55,  2.92it/s]

{'loss': 0.1439, 'grad_norm': 0.12125423550605774, 'learning_rate': 9.735368198988561e-05, 'epoch': 0.51}


 51%|█████     | 6910/13552 [40:18<43:39,  2.54it/s]

{'loss': 0.1556, 'grad_norm': 0.0881161019206047, 'learning_rate': 9.712161037398648e-05, 'epoch': 0.51}


 51%|█████     | 6920/13552 [40:21<37:30,  2.95it/s]

{'loss': 0.1643, 'grad_norm': 0.19197115302085876, 'learning_rate': 9.688955427213612e-05, 'epoch': 0.51}


 51%|█████     | 6930/13552 [40:25<37:47,  2.92it/s]

{'loss': 0.1483, 'grad_norm': 0.09048684686422348, 'learning_rate': 9.665751493507896e-05, 'epoch': 0.51}


 51%|█████     | 6940/13552 [40:28<37:25,  2.94it/s]

{'loss': 0.1415, 'grad_norm': 0.08217904716730118, 'learning_rate': 9.642549361346901e-05, 'epoch': 0.51}


 51%|█████▏    | 6950/13552 [40:32<41:03,  2.68it/s]

{'loss': 0.1371, 'grad_norm': 0.22369429469108582, 'learning_rate': 9.619349155786321e-05, 'epoch': 0.51}


 51%|█████▏    | 6960/13552 [40:36<39:06,  2.81it/s]

{'loss': 0.1542, 'grad_norm': 0.18973812460899353, 'learning_rate': 9.596151001871465e-05, 'epoch': 0.51}


 51%|█████▏    | 6970/13552 [40:40<39:07,  2.80it/s]

{'loss': 0.1311, 'grad_norm': 0.16531983017921448, 'learning_rate': 9.572955024636585e-05, 'epoch': 0.51}


 52%|█████▏    | 6980/13552 [40:43<39:20,  2.78it/s]

{'loss': 0.1545, 'grad_norm': 0.06519840657711029, 'learning_rate': 9.549761349104198e-05, 'epoch': 0.52}


 52%|█████▏    | 6990/13552 [40:46<38:11,  2.86it/s]

{'loss': 0.1515, 'grad_norm': 0.22138039767742157, 'learning_rate': 9.526570100284422e-05, 'epoch': 0.52}


 52%|█████▏    | 7000/13552 [40:50<42:12,  2.59it/s]

{'loss': 0.1403, 'grad_norm': 0.06500478833913803, 'learning_rate': 9.503381403174286e-05, 'epoch': 0.52}


 52%|█████▏    | 7010/13552 [40:54<40:25,  2.70it/s]

{'loss': 0.1426, 'grad_norm': 0.07668328285217285, 'learning_rate': 9.480195382757072e-05, 'epoch': 0.52}


 52%|█████▏    | 7020/13552 [40:57<36:52,  2.95it/s]

{'loss': 0.1647, 'grad_norm': 0.34221556782722473, 'learning_rate': 9.457012164001635e-05, 'epoch': 0.52}


 52%|█████▏    | 7030/13552 [41:01<36:29,  2.98it/s]

{'loss': 0.1556, 'grad_norm': 0.13836057484149933, 'learning_rate': 9.433831871861727e-05, 'epoch': 0.52}


 52%|█████▏    | 7040/13552 [41:04<38:12,  2.84it/s]

{'loss': 0.1319, 'grad_norm': 0.0788404792547226, 'learning_rate': 9.410654631275324e-05, 'epoch': 0.52}


 52%|█████▏    | 7050/13552 [41:08<37:12,  2.91it/s]

{'loss': 0.1688, 'grad_norm': 0.21851417422294617, 'learning_rate': 9.387480567163965e-05, 'epoch': 0.52}


 52%|█████▏    | 7060/13552 [41:11<36:15,  2.98it/s]

{'loss': 0.1505, 'grad_norm': 0.10769980400800705, 'learning_rate': 9.364309804432057e-05, 'epoch': 0.52}


 52%|█████▏    | 7070/13552 [41:15<43:09,  2.50it/s]

{'loss': 0.1462, 'grad_norm': 0.4718938171863556, 'learning_rate': 9.341142467966222e-05, 'epoch': 0.52}


 52%|█████▏    | 7080/13552 [41:18<39:00,  2.76it/s]

{'loss': 0.1506, 'grad_norm': 0.08890176564455032, 'learning_rate': 9.3179786826346e-05, 'epoch': 0.52}


 52%|█████▏    | 7090/13552 [41:22<40:19,  2.67it/s]

{'loss': 0.148, 'grad_norm': 0.05844862386584282, 'learning_rate': 9.294818573286207e-05, 'epoch': 0.52}


 52%|█████▏    | 7100/13552 [41:25<36:13,  2.97it/s]

{'loss': 0.1444, 'grad_norm': 0.35983723402023315, 'learning_rate': 9.271662264750242e-05, 'epoch': 0.52}


 52%|█████▏    | 7110/13552 [41:29<36:46,  2.92it/s]

{'loss': 0.178, 'grad_norm': 0.06154102459549904, 'learning_rate': 9.248509881835414e-05, 'epoch': 0.52}


 53%|█████▎    | 7120/13552 [41:33<46:08,  2.32it/s]

{'loss': 0.1529, 'grad_norm': 0.36959290504455566, 'learning_rate': 9.225361549329278e-05, 'epoch': 0.53}


 53%|█████▎    | 7130/13552 [41:36<37:24,  2.86it/s]

{'loss': 0.1438, 'grad_norm': 0.08037687838077545, 'learning_rate': 9.202217391997554e-05, 'epoch': 0.53}


 53%|█████▎    | 7140/13552 [41:40<39:33,  2.70it/s]

{'loss': 0.165, 'grad_norm': 0.37183287739753723, 'learning_rate': 9.179077534583461e-05, 'epoch': 0.53}


 53%|█████▎    | 7150/13552 [41:44<40:24,  2.64it/s]

{'loss': 0.1389, 'grad_norm': 0.060809820890426636, 'learning_rate': 9.155942101807042e-05, 'epoch': 0.53}


 53%|█████▎    | 7160/13552 [41:48<36:48,  2.89it/s]

{'loss': 0.155, 'grad_norm': 0.080405093729496, 'learning_rate': 9.132811218364495e-05, 'epoch': 0.53}


 53%|█████▎    | 7170/13552 [41:51<36:07,  2.94it/s]

{'loss': 0.1478, 'grad_norm': 0.10956061631441116, 'learning_rate': 9.109685008927486e-05, 'epoch': 0.53}


 53%|█████▎    | 7180/13552 [41:55<38:22,  2.77it/s]

{'loss': 0.1381, 'grad_norm': 0.20328542590141296, 'learning_rate': 9.086563598142504e-05, 'epoch': 0.53}


 53%|█████▎    | 7190/13552 [41:58<36:01,  2.94it/s]

{'loss': 0.1537, 'grad_norm': 0.05713384598493576, 'learning_rate': 9.063447110630166e-05, 'epoch': 0.53}


 53%|█████▎    | 7200/13552 [42:02<35:09,  3.01it/s]

{'loss': 0.1552, 'grad_norm': 0.07894617319107056, 'learning_rate': 9.040335670984553e-05, 'epoch': 0.53}


 53%|█████▎    | 7210/13552 [42:05<34:57,  3.02it/s]

{'loss': 0.1518, 'grad_norm': 0.06684707850217819, 'learning_rate': 9.017229403772542e-05, 'epoch': 0.53}


 53%|█████▎    | 7220/13552 [42:09<35:53,  2.94it/s]

{'loss': 0.1413, 'grad_norm': 0.05334843695163727, 'learning_rate': 8.994128433533128e-05, 'epoch': 0.53}


 53%|█████▎    | 7230/13552 [42:12<39:31,  2.67it/s]

{'loss': 0.1494, 'grad_norm': 0.06435734033584595, 'learning_rate': 8.971032884776763e-05, 'epoch': 0.53}


 53%|█████▎    | 7240/13552 [42:16<35:20,  2.98it/s]

{'loss': 0.1401, 'grad_norm': 0.06253170222043991, 'learning_rate': 8.947942881984671e-05, 'epoch': 0.53}


 53%|█████▎    | 7250/13552 [42:19<34:13,  3.07it/s]

{'loss': 0.1625, 'grad_norm': 0.11944716423749924, 'learning_rate': 8.924858549608183e-05, 'epoch': 0.53}


 54%|█████▎    | 7260/13552 [42:22<35:17,  2.97it/s]

{'loss': 0.1445, 'grad_norm': 0.06843270361423492, 'learning_rate': 8.901780012068071e-05, 'epoch': 0.54}


 54%|█████▎    | 7270/13552 [42:26<34:19,  3.05it/s]

{'loss': 0.1468, 'grad_norm': 0.09128887951374054, 'learning_rate': 8.878707393753877e-05, 'epoch': 0.54}


 54%|█████▎    | 7280/13552 [42:30<41:54,  2.49it/s]

{'loss': 0.1575, 'grad_norm': 0.05925470590591431, 'learning_rate': 8.855640819023236e-05, 'epoch': 0.54}


 54%|█████▍    | 7290/13552 [42:33<37:07,  2.81it/s]

{'loss': 0.1438, 'grad_norm': 0.07270143926143646, 'learning_rate': 8.832580412201205e-05, 'epoch': 0.54}


 54%|█████▍    | 7300/13552 [42:37<38:47,  2.69it/s]

{'loss': 0.1244, 'grad_norm': 0.0669667199254036, 'learning_rate': 8.809526297579605e-05, 'epoch': 0.54}


 54%|█████▍    | 7310/13552 [42:41<34:36,  3.01it/s]

{'loss': 0.162, 'grad_norm': 0.06578944623470306, 'learning_rate': 8.786478599416337e-05, 'epoch': 0.54}


 54%|█████▍    | 7320/13552 [42:44<34:24,  3.02it/s]

{'loss': 0.1486, 'grad_norm': 0.061964791268110275, 'learning_rate': 8.763437441934722e-05, 'epoch': 0.54}


 54%|█████▍    | 7330/13552 [42:47<34:22,  3.02it/s]

{'loss': 0.1437, 'grad_norm': 0.5355584621429443, 'learning_rate': 8.740402949322827e-05, 'epoch': 0.54}


 54%|█████▍    | 7340/13552 [42:51<34:25,  3.01it/s]

{'loss': 0.1441, 'grad_norm': 0.06016683951020241, 'learning_rate': 8.71737524573279e-05, 'epoch': 0.54}


 54%|█████▍    | 7350/13552 [42:54<35:16,  2.93it/s]

{'loss': 0.1363, 'grad_norm': 0.05778462439775467, 'learning_rate': 8.694354455280168e-05, 'epoch': 0.54}


 54%|█████▍    | 7360/13552 [42:58<35:11,  2.93it/s]

{'loss': 0.1368, 'grad_norm': 0.29560571908950806, 'learning_rate': 8.671340702043249e-05, 'epoch': 0.54}


 54%|█████▍    | 7370/13552 [43:01<35:09,  2.93it/s]

{'loss': 0.1582, 'grad_norm': 0.06765174120664597, 'learning_rate': 8.648334110062399e-05, 'epoch': 0.54}


 54%|█████▍    | 7380/13552 [43:05<41:50,  2.46it/s]

{'loss': 0.1386, 'grad_norm': 0.14459487795829773, 'learning_rate': 8.625334803339376e-05, 'epoch': 0.54}


 55%|█████▍    | 7390/13552 [43:08<35:07,  2.92it/s]

{'loss': 0.1388, 'grad_norm': 0.05471471697092056, 'learning_rate': 8.602342905836681e-05, 'epoch': 0.55}


 55%|█████▍    | 7400/13552 [43:12<34:36,  2.96it/s]

{'loss': 0.1415, 'grad_norm': 0.0816035196185112, 'learning_rate': 8.579358541476877e-05, 'epoch': 0.55}


 55%|█████▍    | 7410/13552 [43:15<36:57,  2.77it/s]

{'loss': 0.1602, 'grad_norm': 0.16093851625919342, 'learning_rate': 8.55638183414193e-05, 'epoch': 0.55}


 55%|█████▍    | 7420/13552 [43:19<33:39,  3.04it/s]

{'loss': 0.1595, 'grad_norm': 0.07483427971601486, 'learning_rate': 8.53341290767252e-05, 'epoch': 0.55}


 55%|█████▍    | 7430/13552 [43:22<37:09,  2.75it/s]

{'loss': 0.1471, 'grad_norm': 0.28606677055358887, 'learning_rate': 8.510451885867403e-05, 'epoch': 0.55}


 55%|█████▍    | 7440/13552 [43:26<35:25,  2.87it/s]

{'loss': 0.1457, 'grad_norm': 0.07530022412538528, 'learning_rate': 8.487498892482727e-05, 'epoch': 0.55}


 55%|█████▍    | 7450/13552 [43:29<34:03,  2.99it/s]

{'loss': 0.1493, 'grad_norm': 0.4919412136077881, 'learning_rate': 8.464554051231371e-05, 'epoch': 0.55}


 55%|█████▌    | 7460/13552 [43:33<38:24,  2.64it/s]

{'loss': 0.14, 'grad_norm': 0.05746276676654816, 'learning_rate': 8.441617485782272e-05, 'epoch': 0.55}


 55%|█████▌    | 7470/13552 [43:36<37:10,  2.73it/s]

{'loss': 0.1416, 'grad_norm': 0.1898277997970581, 'learning_rate': 8.41868931975976e-05, 'epoch': 0.55}


 55%|█████▌    | 7480/13552 [43:40<34:24,  2.94it/s]

{'loss': 0.1322, 'grad_norm': 0.1128113865852356, 'learning_rate': 8.395769676742897e-05, 'epoch': 0.55}


 55%|█████▌    | 7490/13552 [43:43<34:34,  2.92it/s]

{'loss': 0.1364, 'grad_norm': 0.055314354598522186, 'learning_rate': 8.372858680264807e-05, 'epoch': 0.55}


 55%|█████▌    | 7500/13552 [43:46<36:48,  2.74it/s]

{'loss': 0.1556, 'grad_norm': 0.08028973639011383, 'learning_rate': 8.349956453812009e-05, 'epoch': 0.55}


 55%|█████▌    | 7510/13552 [43:50<39:17,  2.56it/s]

{'loss': 0.1396, 'grad_norm': 0.0791705921292305, 'learning_rate': 8.327063120823753e-05, 'epoch': 0.55}


 55%|█████▌    | 7520/13552 [43:53<38:11,  2.63it/s]

{'loss': 0.1641, 'grad_norm': 0.0885690227150917, 'learning_rate': 8.304178804691354e-05, 'epoch': 0.55}


 56%|█████▌    | 7530/13552 [43:57<34:53,  2.88it/s]

{'loss': 0.1581, 'grad_norm': 0.06777957826852798, 'learning_rate': 8.28130362875753e-05, 'epoch': 0.56}


 56%|█████▌    | 7540/13552 [44:00<33:28,  2.99it/s]

{'loss': 0.1472, 'grad_norm': 0.06269073486328125, 'learning_rate': 8.258437716315736e-05, 'epoch': 0.56}


 56%|█████▌    | 7550/13552 [44:04<32:32,  3.07it/s]

{'loss': 0.1609, 'grad_norm': 0.2878674864768982, 'learning_rate': 8.235581190609493e-05, 'epoch': 0.56}


 56%|█████▌    | 7560/13552 [44:07<33:12,  3.01it/s]

{'loss': 0.1372, 'grad_norm': 0.18827193975448608, 'learning_rate': 8.212734174831734e-05, 'epoch': 0.56}


 56%|█████▌    | 7570/13552 [44:10<32:34,  3.06it/s]

{'loss': 0.1366, 'grad_norm': 0.22079947590827942, 'learning_rate': 8.189896792124129e-05, 'epoch': 0.56}


 56%|█████▌    | 7580/13552 [44:14<33:21,  2.98it/s]

{'loss': 0.1517, 'grad_norm': 0.10312443971633911, 'learning_rate': 8.16706916557644e-05, 'epoch': 0.56}


 56%|█████▌    | 7590/13552 [44:17<33:43,  2.95it/s]

{'loss': 0.1485, 'grad_norm': 0.08263777196407318, 'learning_rate': 8.144251418225835e-05, 'epoch': 0.56}


 56%|█████▌    | 7600/13552 [44:21<33:41,  2.94it/s]

{'loss': 0.1417, 'grad_norm': 0.7872421145439148, 'learning_rate': 8.121443673056228e-05, 'epoch': 0.56}


 56%|█████▌    | 7610/13552 [44:24<32:31,  3.04it/s]

{'loss': 0.1319, 'grad_norm': 0.04501973092556, 'learning_rate': 8.098646052997634e-05, 'epoch': 0.56}


 56%|█████▌    | 7620/13552 [44:28<34:12,  2.89it/s]

{'loss': 0.1461, 'grad_norm': 0.0617382638156414, 'learning_rate': 8.075858680925494e-05, 'epoch': 0.56}


 56%|█████▋    | 7630/13552 [44:31<32:29,  3.04it/s]

{'loss': 0.1363, 'grad_norm': 0.07922123372554779, 'learning_rate': 8.053081679660015e-05, 'epoch': 0.56}


 56%|█████▋    | 7640/13552 [44:35<33:30,  2.94it/s]

{'loss': 0.1334, 'grad_norm': 0.05376612767577171, 'learning_rate': 8.030315171965501e-05, 'epoch': 0.56}


 56%|█████▋    | 7650/13552 [44:38<35:34,  2.77it/s]

{'loss': 0.1409, 'grad_norm': 0.07281982153654099, 'learning_rate': 8.007559280549701e-05, 'epoch': 0.56}


 57%|█████▋    | 7660/13552 [44:42<33:03,  2.97it/s]

{'loss': 0.1316, 'grad_norm': 0.0508173406124115, 'learning_rate': 7.984814128063144e-05, 'epoch': 0.57}


 57%|█████▋    | 7670/13552 [44:45<37:01,  2.65it/s]

{'loss': 0.1341, 'grad_norm': 0.11898645758628845, 'learning_rate': 7.96207983709848e-05, 'epoch': 0.57}


 57%|█████▋    | 7680/13552 [44:49<35:57,  2.72it/s]

{'loss': 0.1426, 'grad_norm': 0.049893591552972794, 'learning_rate': 7.939356530189812e-05, 'epoch': 0.57}


 57%|█████▋    | 7690/13552 [44:52<35:38,  2.74it/s]

{'loss': 0.1288, 'grad_norm': 0.3109118640422821, 'learning_rate': 7.916644329812044e-05, 'epoch': 0.57}


 57%|█████▋    | 7700/13552 [44:56<35:41,  2.73it/s]

{'loss': 0.136, 'grad_norm': 0.057389624416828156, 'learning_rate': 7.893943358380217e-05, 'epoch': 0.57}


 57%|█████▋    | 7710/13552 [45:00<44:48,  2.17it/s]

{'loss': 0.1552, 'grad_norm': 0.05354079231619835, 'learning_rate': 7.871253738248851e-05, 'epoch': 0.57}


 57%|█████▋    | 7720/13552 [45:03<34:14,  2.84it/s]

{'loss': 0.1388, 'grad_norm': 0.04183205962181091, 'learning_rate': 7.848575591711283e-05, 'epoch': 0.57}


 57%|█████▋    | 7730/13552 [45:07<33:02,  2.94it/s]

{'loss': 0.1499, 'grad_norm': 0.0726030245423317, 'learning_rate': 7.82590904099901e-05, 'epoch': 0.57}


 57%|█████▋    | 7740/13552 [45:10<32:38,  2.97it/s]

{'loss': 0.1556, 'grad_norm': 0.18566416203975677, 'learning_rate': 7.803254208281028e-05, 'epoch': 0.57}


 57%|█████▋    | 7750/13552 [45:14<34:17,  2.82it/s]

{'loss': 0.1376, 'grad_norm': 0.16020169854164124, 'learning_rate': 7.780611215663177e-05, 'epoch': 0.57}


 57%|█████▋    | 7760/13552 [45:18<45:50,  2.11it/s]

{'loss': 0.1968, 'grad_norm': 0.12007785588502884, 'learning_rate': 7.757980185187484e-05, 'epoch': 0.57}


 57%|█████▋    | 7770/13552 [45:21<33:03,  2.92it/s]

{'loss': 0.148, 'grad_norm': 0.06509857624769211, 'learning_rate': 7.73536123883149e-05, 'epoch': 0.57}


 57%|█████▋    | 7780/13552 [45:25<39:16,  2.45it/s]

{'loss': 0.1275, 'grad_norm': 0.10847991704940796, 'learning_rate': 7.712754498507613e-05, 'epoch': 0.57}


 57%|█████▋    | 7790/13552 [45:28<32:44,  2.93it/s]

{'loss': 0.1514, 'grad_norm': 0.06363740563392639, 'learning_rate': 7.690160086062486e-05, 'epoch': 0.57}


 58%|█████▊    | 7800/13552 [45:32<35:31,  2.70it/s]

{'loss': 0.1253, 'grad_norm': 0.07980813831090927, 'learning_rate': 7.66757812327629e-05, 'epoch': 0.58}


 58%|█████▊    | 7810/13552 [45:36<33:52,  2.82it/s]

{'loss': 0.1543, 'grad_norm': 0.06724397838115692, 'learning_rate': 7.645008731862109e-05, 'epoch': 0.58}


 58%|█████▊    | 7820/13552 [45:40<32:33,  2.93it/s]

{'loss': 0.1466, 'grad_norm': 0.08674372732639313, 'learning_rate': 7.622452033465265e-05, 'epoch': 0.58}


 58%|█████▊    | 7830/13552 [45:43<33:12,  2.87it/s]

{'loss': 0.1634, 'grad_norm': 0.07084512710571289, 'learning_rate': 7.599908149662671e-05, 'epoch': 0.58}


 58%|█████▊    | 7840/13552 [45:47<33:27,  2.84it/s]

{'loss': 0.1377, 'grad_norm': 0.0768732950091362, 'learning_rate': 7.57737720196217e-05, 'epoch': 0.58}


 58%|█████▊    | 7850/13552 [45:50<31:43,  3.00it/s]

{'loss': 0.1485, 'grad_norm': 0.06354880332946777, 'learning_rate': 7.554859311801877e-05, 'epoch': 0.58}


 58%|█████▊    | 7860/13552 [45:54<31:03,  3.05it/s]

{'loss': 0.1526, 'grad_norm': 0.07488598674535751, 'learning_rate': 7.532354600549537e-05, 'epoch': 0.58}


 58%|█████▊    | 7870/13552 [45:57<33:40,  2.81it/s]

{'loss': 0.1413, 'grad_norm': 0.06880978494882584, 'learning_rate': 7.509863189501856e-05, 'epoch': 0.58}


 58%|█████▊    | 7880/13552 [46:01<32:23,  2.92it/s]

{'loss': 0.1535, 'grad_norm': 0.060968074947595596, 'learning_rate': 7.487385199883859e-05, 'epoch': 0.58}


 58%|█████▊    | 7890/13552 [46:04<34:10,  2.76it/s]

{'loss': 0.1325, 'grad_norm': 0.05761712044477463, 'learning_rate': 7.464920752848228e-05, 'epoch': 0.58}


 58%|█████▊    | 7900/13552 [46:07<31:14,  3.01it/s]

{'loss': 0.1404, 'grad_norm': 0.07611200213432312, 'learning_rate': 7.442469969474656e-05, 'epoch': 0.58}


 58%|█████▊    | 7910/13552 [46:11<35:08,  2.68it/s]

{'loss': 0.1488, 'grad_norm': 0.07627605646848679, 'learning_rate': 7.420032970769187e-05, 'epoch': 0.58}


 58%|█████▊    | 7920/13552 [46:14<31:13,  3.01it/s]

{'loss': 0.1527, 'grad_norm': 0.06076887249946594, 'learning_rate': 7.397609877663571e-05, 'epoch': 0.58}


 59%|█████▊    | 7930/13552 [46:18<31:27,  2.98it/s]

{'loss': 0.1324, 'grad_norm': 0.05943606421351433, 'learning_rate': 7.375200811014615e-05, 'epoch': 0.59}


 59%|█████▊    | 7940/13552 [46:22<31:59,  2.92it/s]

{'loss': 0.1419, 'grad_norm': 0.0881345346570015, 'learning_rate': 7.352805891603509e-05, 'epoch': 0.59}


 59%|█████▊    | 7950/13552 [46:25<31:04,  3.00it/s]

{'loss': 0.1319, 'grad_norm': 0.05747109651565552, 'learning_rate': 7.330425240135206e-05, 'epoch': 0.59}


 59%|█████▊    | 7960/13552 [46:29<35:03,  2.66it/s]

{'loss': 0.1401, 'grad_norm': 0.0829470083117485, 'learning_rate': 7.30805897723775e-05, 'epoch': 0.59}


 59%|█████▉    | 7970/13552 [46:32<30:40,  3.03it/s]

{'loss': 0.161, 'grad_norm': 0.07931918650865555, 'learning_rate': 7.285707223461642e-05, 'epoch': 0.59}


 59%|█████▉    | 7980/13552 [46:35<31:04,  2.99it/s]

{'loss': 0.131, 'grad_norm': 0.07726681977510452, 'learning_rate': 7.263370099279172e-05, 'epoch': 0.59}


 59%|█████▉    | 7990/13552 [46:39<30:45,  3.01it/s]

{'loss': 0.1424, 'grad_norm': 0.2710762917995453, 'learning_rate': 7.24104772508378e-05, 'epoch': 0.59}


 59%|█████▉    | 8000/13552 [46:42<31:00,  2.98it/s]

{'loss': 0.1397, 'grad_norm': 0.4788326323032379, 'learning_rate': 7.218740221189411e-05, 'epoch': 0.59}


 59%|█████▉    | 8010/13552 [46:46<31:27,  2.94it/s]

{'loss': 0.147, 'grad_norm': 0.07313542813062668, 'learning_rate': 7.196447707829857e-05, 'epoch': 0.59}


 59%|█████▉    | 8020/13552 [46:49<30:50,  2.99it/s]

{'loss': 0.1735, 'grad_norm': 0.06820527464151382, 'learning_rate': 7.174170305158115e-05, 'epoch': 0.59}


 59%|█████▉    | 8030/13552 [46:52<30:35,  3.01it/s]

{'loss': 0.1468, 'grad_norm': 0.06499116122722626, 'learning_rate': 7.151908133245737e-05, 'epoch': 0.59}


 59%|█████▉    | 8040/13552 [46:56<30:09,  3.05it/s]

{'loss': 0.1537, 'grad_norm': 0.0765836164355278, 'learning_rate': 7.129661312082186e-05, 'epoch': 0.59}


 59%|█████▉    | 8050/13552 [46:59<30:11,  3.04it/s]

{'loss': 0.147, 'grad_norm': 0.08345901221036911, 'learning_rate': 7.107429961574183e-05, 'epoch': 0.59}


 59%|█████▉    | 8060/13552 [47:03<35:39,  2.57it/s]

{'loss': 0.1523, 'grad_norm': 0.18431274592876434, 'learning_rate': 7.08521420154507e-05, 'epoch': 0.59}


 60%|█████▉    | 8070/13552 [47:06<32:23,  2.82it/s]

{'loss': 0.1436, 'grad_norm': 0.46948155760765076, 'learning_rate': 7.063014151734155e-05, 'epoch': 0.6}


 60%|█████▉    | 8080/13552 [47:10<31:05,  2.93it/s]

{'loss': 0.1415, 'grad_norm': 0.06814739853143692, 'learning_rate': 7.040829931796071e-05, 'epoch': 0.6}


 60%|█████▉    | 8090/13552 [47:13<30:07,  3.02it/s]

{'loss': 0.1434, 'grad_norm': 0.07786799222230911, 'learning_rate': 7.01866166130013e-05, 'epoch': 0.6}


 60%|█████▉    | 8100/13552 [47:16<30:37,  2.97it/s]

{'loss': 0.1535, 'grad_norm': 0.1063133180141449, 'learning_rate': 6.996509459729687e-05, 'epoch': 0.6}


 60%|█████▉    | 8110/13552 [47:20<32:52,  2.76it/s]

{'loss': 0.1422, 'grad_norm': 0.0844172015786171, 'learning_rate': 6.974373446481473e-05, 'epoch': 0.6}


 60%|█████▉    | 8120/13552 [47:23<29:51,  3.03it/s]

{'loss': 0.1544, 'grad_norm': 0.23880048096179962, 'learning_rate': 6.952253740864978e-05, 'epoch': 0.6}


 60%|█████▉    | 8130/13552 [47:27<29:34,  3.06it/s]

{'loss': 0.1522, 'grad_norm': 0.19041407108306885, 'learning_rate': 6.93015046210179e-05, 'epoch': 0.6}


 60%|██████    | 8140/13552 [47:30<29:29,  3.06it/s]

{'loss': 0.1499, 'grad_norm': 0.05639129877090454, 'learning_rate': 6.908063729324969e-05, 'epoch': 0.6}


 60%|██████    | 8150/13552 [47:34<32:09,  2.80it/s]

{'loss': 0.1469, 'grad_norm': 0.060058023780584335, 'learning_rate': 6.885993661578384e-05, 'epoch': 0.6}


 60%|██████    | 8160/13552 [47:37<29:33,  3.04it/s]

{'loss': 0.1647, 'grad_norm': 0.3462097942829132, 'learning_rate': 6.863940377816091e-05, 'epoch': 0.6}


 60%|██████    | 8170/13552 [47:41<36:43,  2.44it/s]

{'loss': 0.1594, 'grad_norm': 0.48861733078956604, 'learning_rate': 6.841903996901673e-05, 'epoch': 0.6}


 60%|██████    | 8180/13552 [47:44<30:01,  2.98it/s]

{'loss': 0.1412, 'grad_norm': 0.2768930494785309, 'learning_rate': 6.819884637607619e-05, 'epoch': 0.6}


 60%|██████    | 8190/13552 [47:47<29:32,  3.03it/s]

{'loss': 0.1335, 'grad_norm': 0.10799027234315872, 'learning_rate': 6.797882418614668e-05, 'epoch': 0.6}


 61%|██████    | 8200/13552 [47:51<29:23,  3.04it/s]

{'loss': 0.1487, 'grad_norm': 0.10435967892408371, 'learning_rate': 6.775897458511176e-05, 'epoch': 0.61}


 61%|██████    | 8210/13552 [47:54<29:06,  3.06it/s]

{'loss': 0.1586, 'grad_norm': 0.07331813126802444, 'learning_rate': 6.753929875792482e-05, 'epoch': 0.61}


 61%|██████    | 8220/13552 [47:57<30:17,  2.93it/s]

{'loss': 0.1549, 'grad_norm': 0.06491106748580933, 'learning_rate': 6.731979788860258e-05, 'epoch': 0.61}


 61%|██████    | 8230/13552 [48:01<33:22,  2.66it/s]

{'loss': 0.1382, 'grad_norm': 0.08676944673061371, 'learning_rate': 6.710047316021879e-05, 'epoch': 0.61}


 61%|██████    | 8240/13552 [48:05<31:02,  2.85it/s]

{'loss': 0.1413, 'grad_norm': 0.23015530407428741, 'learning_rate': 6.688132575489783e-05, 'epoch': 0.61}


 61%|██████    | 8250/13552 [48:08<29:19,  3.01it/s]

{'loss': 0.1557, 'grad_norm': 0.07499395310878754, 'learning_rate': 6.666235685380832e-05, 'epoch': 0.61}


 61%|██████    | 8260/13552 [48:11<30:38,  2.88it/s]

{'loss': 0.1312, 'grad_norm': 0.1454714983701706, 'learning_rate': 6.644356763715678e-05, 'epoch': 0.61}


 61%|██████    | 8270/13552 [48:15<29:31,  2.98it/s]

{'loss': 0.1493, 'grad_norm': 0.06728053092956543, 'learning_rate': 6.622495928418133e-05, 'epoch': 0.61}


 61%|██████    | 8280/13552 [48:18<30:25,  2.89it/s]

{'loss': 0.1579, 'grad_norm': 0.05031755566596985, 'learning_rate': 6.600653297314511e-05, 'epoch': 0.61}


 61%|██████    | 8290/13552 [48:22<33:34,  2.61it/s]

{'loss': 0.1358, 'grad_norm': 0.2585679888725281, 'learning_rate': 6.57882898813302e-05, 'epoch': 0.61}


 61%|██████    | 8300/13552 [48:25<31:17,  2.80it/s]

{'loss': 0.1304, 'grad_norm': 0.08084958046674728, 'learning_rate': 6.557023118503114e-05, 'epoch': 0.61}


 61%|██████▏   | 8310/13552 [48:29<28:58,  3.01it/s]

{'loss': 0.1395, 'grad_norm': 0.10020791739225388, 'learning_rate': 6.535235805954857e-05, 'epoch': 0.61}


 61%|██████▏   | 8320/13552 [48:32<28:38,  3.04it/s]

{'loss': 0.1567, 'grad_norm': 0.07053154706954956, 'learning_rate': 6.513467167918301e-05, 'epoch': 0.61}


 61%|██████▏   | 8330/13552 [48:36<33:42,  2.58it/s]

{'loss': 0.1426, 'grad_norm': 0.04718540236353874, 'learning_rate': 6.491717321722839e-05, 'epoch': 0.61}


 62%|██████▏   | 8340/13552 [48:40<29:43,  2.92it/s]

{'loss': 0.1357, 'grad_norm': 0.09512709826231003, 'learning_rate': 6.469986384596579e-05, 'epoch': 0.62}


 62%|██████▏   | 8350/13552 [48:43<29:20,  2.95it/s]

{'loss': 0.1384, 'grad_norm': 0.05684436112642288, 'learning_rate': 6.448274473665717e-05, 'epoch': 0.62}


 62%|██████▏   | 8360/13552 [48:47<31:13,  2.77it/s]

{'loss': 0.1347, 'grad_norm': 0.07750155031681061, 'learning_rate': 6.426581705953896e-05, 'epoch': 0.62}


 62%|██████▏   | 8370/13552 [48:50<30:36,  2.82it/s]

{'loss': 0.1441, 'grad_norm': 0.0653018057346344, 'learning_rate': 6.404908198381581e-05, 'epoch': 0.62}


 62%|██████▏   | 8380/13552 [48:54<30:39,  2.81it/s]

{'loss': 0.1289, 'grad_norm': 0.08004147559404373, 'learning_rate': 6.38325406776543e-05, 'epoch': 0.62}


 62%|██████▏   | 8390/13552 [48:57<29:01,  2.96it/s]

{'loss': 0.1572, 'grad_norm': 0.07123296707868576, 'learning_rate': 6.361619430817663e-05, 'epoch': 0.62}


 62%|██████▏   | 8400/13552 [49:02<33:10,  2.59it/s]

{'loss': 0.1352, 'grad_norm': 0.1075766459107399, 'learning_rate': 6.340004404145428e-05, 'epoch': 0.62}


 62%|██████▏   | 8410/13552 [49:05<30:37,  2.80it/s]

{'loss': 0.143, 'grad_norm': 0.07238862663507462, 'learning_rate': 6.318409104250182e-05, 'epoch': 0.62}


 62%|██████▏   | 8420/13552 [49:09<32:02,  2.67it/s]

{'loss': 0.1374, 'grad_norm': 0.09817233681678772, 'learning_rate': 6.296833647527054e-05, 'epoch': 0.62}


 62%|██████▏   | 8430/13552 [49:13<29:42,  2.87it/s]

{'loss': 0.147, 'grad_norm': 0.056329745799303055, 'learning_rate': 6.275278150264223e-05, 'epoch': 0.62}


 62%|██████▏   | 8440/13552 [49:16<30:28,  2.80it/s]

{'loss': 0.1255, 'grad_norm': 1.1782571077346802, 'learning_rate': 6.253742728642295e-05, 'epoch': 0.62}


 62%|██████▏   | 8450/13552 [49:19<27:48,  3.06it/s]

{'loss': 0.1603, 'grad_norm': 0.06695786863565445, 'learning_rate': 6.232227498733658e-05, 'epoch': 0.62}


 62%|██████▏   | 8460/13552 [49:23<29:42,  2.86it/s]

{'loss': 0.1269, 'grad_norm': 0.06038951128721237, 'learning_rate': 6.210732576501884e-05, 'epoch': 0.62}


 62%|██████▎   | 8470/13552 [49:26<28:52,  2.93it/s]

{'loss': 0.1495, 'grad_norm': 0.22105495631694794, 'learning_rate': 6.18925807780108e-05, 'epoch': 0.62}


 63%|██████▎   | 8480/13552 [49:30<27:45,  3.04it/s]

{'loss': 0.1301, 'grad_norm': 0.04552709683775902, 'learning_rate': 6.16780411837528e-05, 'epoch': 0.63}


 63%|██████▎   | 8490/13552 [49:33<27:25,  3.08it/s]

{'loss': 0.154, 'grad_norm': 0.05925481766462326, 'learning_rate': 6.146370813857815e-05, 'epoch': 0.63}


 63%|██████▎   | 8500/13552 [49:37<29:17,  2.87it/s]

{'loss': 0.1271, 'grad_norm': 0.13518497347831726, 'learning_rate': 6.124958279770685e-05, 'epoch': 0.63}


 63%|██████▎   | 8510/13552 [49:40<28:06,  2.99it/s]

{'loss': 0.1577, 'grad_norm': 0.9552913904190063, 'learning_rate': 6.103566631523942e-05, 'epoch': 0.63}


 63%|██████▎   | 8520/13552 [49:43<27:14,  3.08it/s]

{'loss': 0.1543, 'grad_norm': 0.45006152987480164, 'learning_rate': 6.0821959844150687e-05, 'epoch': 0.63}


 63%|██████▎   | 8530/13552 [49:47<27:14,  3.07it/s]

{'loss': 0.1319, 'grad_norm': 0.10823783278465271, 'learning_rate': 6.060846453628355e-05, 'epoch': 0.63}


 63%|██████▎   | 8540/13552 [49:50<27:56,  2.99it/s]

{'loss': 0.14, 'grad_norm': 0.11054440587759018, 'learning_rate': 6.0395181542342716e-05, 'epoch': 0.63}


 63%|██████▎   | 8550/13552 [49:54<33:09,  2.51it/s]

{'loss': 0.1233, 'grad_norm': 0.0674520805478096, 'learning_rate': 6.0182112011888636e-05, 'epoch': 0.63}


 63%|██████▎   | 8560/13552 [49:57<26:55,  3.09it/s]

{'loss': 0.1582, 'grad_norm': 0.06583648175001144, 'learning_rate': 5.996925709333119e-05, 'epoch': 0.63}


 63%|██████▎   | 8570/13552 [50:00<27:04,  3.07it/s]

{'loss': 0.1441, 'grad_norm': 0.08410894870758057, 'learning_rate': 5.975661793392353e-05, 'epoch': 0.63}


 63%|██████▎   | 8580/13552 [50:04<34:54,  2.37it/s]

{'loss': 0.133, 'grad_norm': 0.1385922133922577, 'learning_rate': 5.954419567975591e-05, 'epoch': 0.63}


 63%|██████▎   | 8590/13552 [50:07<26:50,  3.08it/s]

{'loss': 0.1446, 'grad_norm': 0.0621805377304554, 'learning_rate': 5.93319914757495e-05, 'epoch': 0.63}


 63%|██████▎   | 8600/13552 [50:10<26:38,  3.10it/s]

{'loss': 0.1494, 'grad_norm': 0.06708677858114243, 'learning_rate': 5.9120006465650216e-05, 'epoch': 0.63}


 64%|██████▎   | 8610/13552 [50:14<28:09,  2.93it/s]

{'loss': 0.1478, 'grad_norm': 0.08046074211597443, 'learning_rate': 5.8908241792022553e-05, 'epoch': 0.64}


 64%|██████▎   | 8620/13552 [50:18<32:50,  2.50it/s]

{'loss': 0.1459, 'grad_norm': 0.1767013520002365, 'learning_rate': 5.8696698596243416e-05, 'epoch': 0.64}


 64%|██████▎   | 8630/13552 [50:21<26:45,  3.07it/s]

{'loss': 0.164, 'grad_norm': 0.07533425837755203, 'learning_rate': 5.848537801849601e-05, 'epoch': 0.64}


 64%|██████▍   | 8640/13552 [50:24<26:57,  3.04it/s]

{'loss': 0.1444, 'grad_norm': 0.05584567040205002, 'learning_rate': 5.827428119776361e-05, 'epoch': 0.64}


 64%|██████▍   | 8650/13552 [50:27<26:27,  3.09it/s]

{'loss': 0.1461, 'grad_norm': 0.084760382771492, 'learning_rate': 5.806340927182357e-05, 'epoch': 0.64}


 64%|██████▍   | 8660/13552 [50:31<29:53,  2.73it/s]

{'loss': 0.133, 'grad_norm': 0.14276555180549622, 'learning_rate': 5.785276337724101e-05, 'epoch': 0.64}


 64%|██████▍   | 8670/13552 [50:35<31:15,  2.60it/s]

{'loss': 0.1361, 'grad_norm': 0.08079865574836731, 'learning_rate': 5.764234464936282e-05, 'epoch': 0.64}


 64%|██████▍   | 8680/13552 [50:38<27:02,  3.00it/s]

{'loss': 0.126, 'grad_norm': 0.07437784969806671, 'learning_rate': 5.7432154222311505e-05, 'epoch': 0.64}


 64%|██████▍   | 8690/13552 [50:41<26:05,  3.11it/s]

{'loss': 0.1455, 'grad_norm': 0.06217905506491661, 'learning_rate': 5.7222193228979037e-05, 'epoch': 0.64}


 64%|██████▍   | 8700/13552 [50:45<28:21,  2.85it/s]

{'loss': 0.1352, 'grad_norm': 0.17033834755420685, 'learning_rate': 5.7012462801020816e-05, 'epoch': 0.64}


 64%|██████▍   | 8710/13552 [50:48<27:01,  2.99it/s]

{'loss': 0.1459, 'grad_norm': 0.39800235629081726, 'learning_rate': 5.680296406884946e-05, 'epoch': 0.64}


 64%|██████▍   | 8720/13552 [50:52<38:24,  2.10it/s]

{'loss': 0.1355, 'grad_norm': 0.0933605208992958, 'learning_rate': 5.659369816162885e-05, 'epoch': 0.64}


 64%|██████▍   | 8730/13552 [50:56<26:50,  2.99it/s]

{'loss': 0.1526, 'grad_norm': 0.07534266263246536, 'learning_rate': 5.638466620726797e-05, 'epoch': 0.64}


 64%|██████▍   | 8740/13552 [50:59<25:53,  3.10it/s]

{'loss': 0.148, 'grad_norm': 0.07138089090585709, 'learning_rate': 5.617586933241479e-05, 'epoch': 0.64}


 65%|██████▍   | 8750/13552 [51:03<30:03,  2.66it/s]

{'loss': 0.1447, 'grad_norm': 0.3616451919078827, 'learning_rate': 5.5967308662450343e-05, 'epoch': 0.65}


 65%|██████▍   | 8760/13552 [51:06<26:10,  3.05it/s]

{'loss': 0.1376, 'grad_norm': 0.07975893467664719, 'learning_rate': 5.575898532148247e-05, 'epoch': 0.65}


 65%|██████▍   | 8770/13552 [51:10<27:25,  2.91it/s]

{'loss': 0.1449, 'grad_norm': 0.12792828679084778, 'learning_rate': 5.555090043233986e-05, 'epoch': 0.65}


 65%|██████▍   | 8780/13552 [51:13<31:09,  2.55it/s]

{'loss': 0.1435, 'grad_norm': 0.060378547757864, 'learning_rate': 5.534305511656604e-05, 'epoch': 0.65}


 65%|██████▍   | 8790/13552 [51:16<25:49,  3.07it/s]

{'loss': 0.1495, 'grad_norm': 0.0701041892170906, 'learning_rate': 5.5135450494413136e-05, 'epoch': 0.65}


 65%|██████▍   | 8800/13552 [51:20<26:52,  2.95it/s]

{'loss': 0.1448, 'grad_norm': 0.41781696677207947, 'learning_rate': 5.4928087684836124e-05, 'epoch': 0.65}


 65%|██████▌   | 8810/13552 [51:23<34:52,  2.27it/s]

{'loss': 0.1337, 'grad_norm': 0.05265902355313301, 'learning_rate': 5.472096780548659e-05, 'epoch': 0.65}


 65%|██████▌   | 8820/13552 [51:27<26:11,  3.01it/s]

{'loss': 0.1387, 'grad_norm': 0.05343596637248993, 'learning_rate': 5.451409197270678e-05, 'epoch': 0.65}


 65%|██████▌   | 8830/13552 [51:30<28:31,  2.76it/s]

{'loss': 0.1417, 'grad_norm': 0.07103642076253891, 'learning_rate': 5.430746130152353e-05, 'epoch': 0.65}


 65%|██████▌   | 8840/13552 [51:34<26:53,  2.92it/s]

{'loss': 0.1439, 'grad_norm': 0.07769989967346191, 'learning_rate': 5.4101076905642366e-05, 'epoch': 0.65}


 65%|██████▌   | 8850/13552 [51:37<25:31,  3.07it/s]

{'loss': 0.146, 'grad_norm': 0.05858103558421135, 'learning_rate': 5.389493989744135e-05, 'epoch': 0.65}


 65%|██████▌   | 8860/13552 [51:41<28:10,  2.78it/s]

{'loss': 0.1364, 'grad_norm': 0.08168378472328186, 'learning_rate': 5.368905138796523e-05, 'epoch': 0.65}


 65%|██████▌   | 8870/13552 [51:44<26:42,  2.92it/s]

{'loss': 0.1451, 'grad_norm': 0.19727006554603577, 'learning_rate': 5.3483412486919346e-05, 'epoch': 0.65}


 66%|██████▌   | 8880/13552 [51:47<24:43,  3.15it/s]

{'loss': 0.1624, 'grad_norm': 0.05981258675456047, 'learning_rate': 5.327802430266373e-05, 'epoch': 0.66}


 66%|██████▌   | 8890/13552 [51:51<24:50,  3.13it/s]

{'loss': 0.1423, 'grad_norm': 0.06740612536668777, 'learning_rate': 5.307288794220704e-05, 'epoch': 0.66}


 66%|██████▌   | 8900/13552 [51:54<24:35,  3.15it/s]

{'loss': 0.1638, 'grad_norm': 0.0672392025589943, 'learning_rate': 5.286800451120066e-05, 'epoch': 0.66}


 66%|██████▌   | 8910/13552 [51:57<24:13,  3.19it/s]

{'loss': 0.141, 'grad_norm': 0.06469538062810898, 'learning_rate': 5.266337511393277e-05, 'epoch': 0.66}


 66%|██████▌   | 8920/13552 [52:00<24:26,  3.16it/s]

{'loss': 0.135, 'grad_norm': 0.0576692670583725, 'learning_rate': 5.2459000853322296e-05, 'epoch': 0.66}


 66%|██████▌   | 8930/13552 [52:03<24:07,  3.19it/s]

{'loss': 0.145, 'grad_norm': 0.05710514262318611, 'learning_rate': 5.225488283091304e-05, 'epoch': 0.66}


 66%|██████▌   | 8940/13552 [52:06<24:11,  3.18it/s]

{'loss': 0.1405, 'grad_norm': 0.05134456977248192, 'learning_rate': 5.205102214686772e-05, 'epoch': 0.66}


 66%|██████▌   | 8950/13552 [52:10<26:41,  2.87it/s]

{'loss': 0.127, 'grad_norm': 0.053389061242341995, 'learning_rate': 5.184741989996209e-05, 'epoch': 0.66}


 66%|██████▌   | 8960/13552 [52:13<24:05,  3.18it/s]

{'loss': 0.1543, 'grad_norm': 0.05539688467979431, 'learning_rate': 5.164407718757882e-05, 'epoch': 0.66}


 66%|██████▌   | 8970/13552 [52:16<24:14,  3.15it/s]

{'loss': 0.134, 'grad_norm': 0.06380535662174225, 'learning_rate': 5.1440995105701916e-05, 'epoch': 0.66}


 66%|██████▋   | 8980/13552 [52:20<29:27,  2.59it/s]

{'loss': 0.1198, 'grad_norm': 0.11309375613927841, 'learning_rate': 5.1238174748910525e-05, 'epoch': 0.66}


 66%|██████▋   | 8990/13552 [52:24<27:31,  2.76it/s]

{'loss': 0.1242, 'grad_norm': 0.25607094168663025, 'learning_rate': 5.103561721037318e-05, 'epoch': 0.66}


 66%|██████▋   | 9000/13552 [52:27<24:54,  3.05it/s]

{'loss': 0.1471, 'grad_norm': 0.05323216691613197, 'learning_rate': 5.083332358184183e-05, 'epoch': 0.66}


 66%|██████▋   | 9010/13552 [52:30<24:02,  3.15it/s]

{'loss': 0.1403, 'grad_norm': 0.049173761159181595, 'learning_rate': 5.0631294953646004e-05, 'epoch': 0.66}


 67%|██████▋   | 9020/13552 [52:34<26:32,  2.85it/s]

{'loss': 0.1286, 'grad_norm': 0.06978446990251541, 'learning_rate': 5.042953241468693e-05, 'epoch': 0.67}


 67%|██████▋   | 9030/13552 [52:37<26:01,  2.90it/s]

{'loss': 0.1325, 'grad_norm': 0.06730365008115768, 'learning_rate': 5.022803705243169e-05, 'epoch': 0.67}


 67%|██████▋   | 9040/13552 [52:40<23:46,  3.16it/s]

{'loss': 0.1459, 'grad_norm': 0.04664018750190735, 'learning_rate': 5.0026809952907286e-05, 'epoch': 0.67}


 67%|██████▋   | 9050/13552 [52:44<28:12,  2.66it/s]

{'loss': 0.1392, 'grad_norm': 0.12762512266635895, 'learning_rate': 4.9825852200694776e-05, 'epoch': 0.67}


 67%|██████▋   | 9060/13552 [52:47<23:52,  3.13it/s]

{'loss': 0.1573, 'grad_norm': 0.06490091234445572, 'learning_rate': 4.962516487892359e-05, 'epoch': 0.67}


 67%|██████▋   | 9070/13552 [52:50<25:07,  2.97it/s]

{'loss': 0.1532, 'grad_norm': 0.07635170966386795, 'learning_rate': 4.942474906926553e-05, 'epoch': 0.67}


 67%|██████▋   | 9080/13552 [52:54<26:01,  2.86it/s]

{'loss': 0.1329, 'grad_norm': 0.06299179792404175, 'learning_rate': 4.9224605851928984e-05, 'epoch': 0.67}


 67%|██████▋   | 9090/13552 [52:57<24:10,  3.08it/s]

{'loss': 0.1468, 'grad_norm': 0.07987216114997864, 'learning_rate': 4.9024736305653096e-05, 'epoch': 0.67}


 67%|██████▋   | 9100/13552 [53:01<25:52,  2.87it/s]

{'loss': 0.1354, 'grad_norm': 0.054896071553230286, 'learning_rate': 4.882514150770207e-05, 'epoch': 0.67}


 67%|██████▋   | 9110/13552 [53:04<24:02,  3.08it/s]

{'loss': 0.1328, 'grad_norm': 0.08929772675037384, 'learning_rate': 4.8625822533859135e-05, 'epoch': 0.67}


 67%|██████▋   | 9120/13552 [53:08<24:19,  3.04it/s]

{'loss': 0.1486, 'grad_norm': 0.4171760082244873, 'learning_rate': 4.8426780458420974e-05, 'epoch': 0.67}


 67%|██████▋   | 9130/13552 [53:11<23:44,  3.10it/s]

{'loss': 0.1532, 'grad_norm': 0.07204306125640869, 'learning_rate': 4.8228016354191696e-05, 'epoch': 0.67}


 67%|██████▋   | 9140/13552 [53:14<23:12,  3.17it/s]

{'loss': 0.1348, 'grad_norm': 0.18634194135665894, 'learning_rate': 4.8029531292477336e-05, 'epoch': 0.67}


 68%|██████▊   | 9150/13552 [53:18<25:47,  2.84it/s]

{'loss': 0.1371, 'grad_norm': 0.12445703148841858, 'learning_rate': 4.783132634307986e-05, 'epoch': 0.68}


 68%|██████▊   | 9160/13552 [53:21<23:29,  3.12it/s]

{'loss': 0.1281, 'grad_norm': 0.05913203954696655, 'learning_rate': 4.76334025742915e-05, 'epoch': 0.68}


 68%|██████▊   | 9170/13552 [53:24<23:41,  3.08it/s]

{'loss': 0.138, 'grad_norm': 0.053116653114557266, 'learning_rate': 4.743576105288896e-05, 'epoch': 0.68}


 68%|██████▊   | 9180/13552 [53:28<24:30,  2.97it/s]

{'loss': 0.14, 'grad_norm': 0.0778503492474556, 'learning_rate': 4.723840284412767e-05, 'epoch': 0.68}


 68%|██████▊   | 9190/13552 [53:31<22:49,  3.18it/s]

{'loss': 0.1572, 'grad_norm': 0.03848912566900253, 'learning_rate': 4.704132901173608e-05, 'epoch': 0.68}


 68%|██████▊   | 9200/13552 [53:34<23:48,  3.05it/s]

{'loss': 0.155, 'grad_norm': 0.09776204079389572, 'learning_rate': 4.684454061790987e-05, 'epoch': 0.68}


 68%|██████▊   | 9210/13552 [53:37<23:25,  3.09it/s]

{'loss': 0.1411, 'grad_norm': 0.07300003618001938, 'learning_rate': 4.664803872330625e-05, 'epoch': 0.68}


 68%|██████▊   | 9220/13552 [53:41<23:22,  3.09it/s]

{'loss': 0.1524, 'grad_norm': 0.09147308021783829, 'learning_rate': 4.645182438703828e-05, 'epoch': 0.68}


 68%|██████▊   | 9230/13552 [53:44<22:55,  3.14it/s]

{'loss': 0.145, 'grad_norm': 0.04771972447633743, 'learning_rate': 4.6255898666669085e-05, 'epoch': 0.68}


 68%|██████▊   | 9240/13552 [53:48<25:47,  2.79it/s]

{'loss': 0.1493, 'grad_norm': 0.12783148884773254, 'learning_rate': 4.606026261820622e-05, 'epoch': 0.68}


 68%|██████▊   | 9250/13552 [53:51<22:39,  3.16it/s]

{'loss': 0.1447, 'grad_norm': 0.12188452482223511, 'learning_rate': 4.586491729609597e-05, 'epoch': 0.68}


 68%|██████▊   | 9260/13552 [53:54<22:40,  3.16it/s]

{'loss': 0.1415, 'grad_norm': 0.11913514882326126, 'learning_rate': 4.566986375321761e-05, 'epoch': 0.68}


 68%|██████▊   | 9270/13552 [53:57<24:04,  2.96it/s]

{'loss': 0.1341, 'grad_norm': 0.05557790771126747, 'learning_rate': 4.547510304087782e-05, 'epoch': 0.68}


 68%|██████▊   | 9280/13552 [54:01<22:35,  3.15it/s]

{'loss': 0.1299, 'grad_norm': 0.04989767447113991, 'learning_rate': 4.528063620880494e-05, 'epoch': 0.68}


 69%|██████▊   | 9290/13552 [54:04<22:16,  3.19it/s]

{'loss': 0.1301, 'grad_norm': 0.05277353897690773, 'learning_rate': 4.508646430514341e-05, 'epoch': 0.69}


 69%|██████▊   | 9300/13552 [54:07<22:15,  3.18it/s]

{'loss': 0.1374, 'grad_norm': 0.0863603726029396, 'learning_rate': 4.489258837644789e-05, 'epoch': 0.69}


 69%|██████▊   | 9310/13552 [54:10<23:49,  2.97it/s]

{'loss': 0.1454, 'grad_norm': 0.0909893587231636, 'learning_rate': 4.469900946767791e-05, 'epoch': 0.69}


 69%|██████▉   | 9320/13552 [54:13<22:19,  3.16it/s]

{'loss': 0.1584, 'grad_norm': 0.06602214276790619, 'learning_rate': 4.450572862219218e-05, 'epoch': 0.69}


 69%|██████▉   | 9330/13552 [54:16<22:25,  3.14it/s]

{'loss': 0.1472, 'grad_norm': 0.0845298320055008, 'learning_rate': 4.431274688174274e-05, 'epoch': 0.69}


 69%|██████▉   | 9340/13552 [54:20<22:20,  3.14it/s]

{'loss': 0.1239, 'grad_norm': 0.04490567371249199, 'learning_rate': 4.41200652864696e-05, 'epoch': 0.69}


 69%|██████▉   | 9350/13552 [54:23<22:12,  3.15it/s]

{'loss': 0.1478, 'grad_norm': 0.07908102869987488, 'learning_rate': 4.3927684874895004e-05, 'epoch': 0.69}


 69%|██████▉   | 9360/13552 [54:26<22:37,  3.09it/s]

{'loss': 0.1366, 'grad_norm': 0.0647459328174591, 'learning_rate': 4.373560668391786e-05, 'epoch': 0.69}


 69%|██████▉   | 9370/13552 [54:30<24:04,  2.90it/s]

{'loss': 0.1334, 'grad_norm': 0.0705065056681633, 'learning_rate': 4.354383174880818e-05, 'epoch': 0.69}


 69%|██████▉   | 9380/13552 [54:33<21:56,  3.17it/s]

{'loss': 0.1559, 'grad_norm': 0.07132549583911896, 'learning_rate': 4.33523611032015e-05, 'epoch': 0.69}


 69%|██████▉   | 9390/13552 [54:36<24:02,  2.89it/s]

{'loss': 0.1305, 'grad_norm': 0.1053161472082138, 'learning_rate': 4.316119577909317e-05, 'epoch': 0.69}


 69%|██████▉   | 9400/13552 [54:40<22:04,  3.13it/s]

{'loss': 0.1379, 'grad_norm': 0.07546710222959518, 'learning_rate': 4.2970336806833024e-05, 'epoch': 0.69}


 69%|██████▉   | 9410/13552 [54:43<21:49,  3.16it/s]

{'loss': 0.1477, 'grad_norm': 0.08934705704450607, 'learning_rate': 4.277978521511966e-05, 'epoch': 0.69}


 70%|██████▉   | 9420/13552 [54:46<21:55,  3.14it/s]

{'loss': 0.1399, 'grad_norm': 0.07820931077003479, 'learning_rate': 4.2589542030994986e-05, 'epoch': 0.7}


 70%|██████▉   | 9430/13552 [54:49<28:46,  2.39it/s]

{'loss': 0.1402, 'grad_norm': 0.06019250676035881, 'learning_rate': 4.2399608279838586e-05, 'epoch': 0.7}


 70%|██████▉   | 9440/13552 [54:53<23:49,  2.88it/s]

{'loss': 0.1431, 'grad_norm': 0.09069336205720901, 'learning_rate': 4.2209984985362264e-05, 'epoch': 0.7}


 70%|██████▉   | 9450/13552 [54:56<23:01,  2.97it/s]

{'loss': 0.1413, 'grad_norm': 0.0804080069065094, 'learning_rate': 4.202067316960459e-05, 'epoch': 0.7}


 70%|██████▉   | 9460/13552 [55:00<21:32,  3.17it/s]

{'loss': 0.1508, 'grad_norm': 0.04923206940293312, 'learning_rate': 4.1831673852925255e-05, 'epoch': 0.7}


 70%|██████▉   | 9470/13552 [55:03<22:03,  3.08it/s]

{'loss': 0.1331, 'grad_norm': 0.34080272912979126, 'learning_rate': 4.1642988053999565e-05, 'epoch': 0.7}


 70%|██████▉   | 9480/13552 [55:06<24:57,  2.72it/s]

{'loss': 0.13, 'grad_norm': 0.3172113597393036, 'learning_rate': 4.1454616789813105e-05, 'epoch': 0.7}


 70%|███████   | 9490/13552 [55:10<21:33,  3.14it/s]

{'loss': 0.1491, 'grad_norm': 0.07107258588075638, 'learning_rate': 4.126656107565615e-05, 'epoch': 0.7}


 70%|███████   | 9500/13552 [55:13<28:07,  2.40it/s]

{'loss': 0.1296, 'grad_norm': 0.03722873330116272, 'learning_rate': 4.107882192511822e-05, 'epoch': 0.7}


 70%|███████   | 9510/13552 [55:17<21:44,  3.10it/s]

{'loss': 0.1438, 'grad_norm': 0.061101093888282776, 'learning_rate': 4.0891400350082586e-05, 'epoch': 0.7}


 70%|███████   | 9520/13552 [55:20<21:15,  3.16it/s]

{'loss': 0.1313, 'grad_norm': 0.11098337173461914, 'learning_rate': 4.070429736072085e-05, 'epoch': 0.7}


 70%|███████   | 9530/13552 [55:23<22:01,  3.04it/s]

{'loss': 0.1403, 'grad_norm': 0.08793562650680542, 'learning_rate': 4.0517513965487485e-05, 'epoch': 0.7}


 70%|███████   | 9540/13552 [55:26<20:58,  3.19it/s]

{'loss': 0.1545, 'grad_norm': 0.07962410151958466, 'learning_rate': 4.033105117111441e-05, 'epoch': 0.7}


 70%|███████   | 9550/13552 [55:30<25:02,  2.66it/s]

{'loss': 0.1413, 'grad_norm': 0.06092238053679466, 'learning_rate': 4.014490998260557e-05, 'epoch': 0.7}


 71%|███████   | 9560/13552 [55:34<23:36,  2.82it/s]

{'loss': 0.1481, 'grad_norm': 0.15110960602760315, 'learning_rate': 3.9959091403231475e-05, 'epoch': 0.71}


 71%|███████   | 9570/13552 [55:37<22:22,  2.97it/s]

{'loss': 0.1305, 'grad_norm': 0.08724122494459152, 'learning_rate': 3.9773596434523854e-05, 'epoch': 0.71}


 71%|███████   | 9580/13552 [55:40<21:00,  3.15it/s]

{'loss': 0.1397, 'grad_norm': 0.06406377255916595, 'learning_rate': 3.9588426076270214e-05, 'epoch': 0.71}


 71%|███████   | 9590/13552 [55:43<21:35,  3.06it/s]

{'loss': 0.1387, 'grad_norm': 0.09825443476438522, 'learning_rate': 3.9403581326508465e-05, 'epoch': 0.71}


 71%|███████   | 9600/13552 [55:47<21:59,  3.00it/s]

{'loss': 0.1262, 'grad_norm': 0.0557374432682991, 'learning_rate': 3.921906318152153e-05, 'epoch': 0.71}


 71%|███████   | 9610/13552 [55:50<20:36,  3.19it/s]

{'loss': 0.1416, 'grad_norm': 0.05653311312198639, 'learning_rate': 3.903487263583202e-05, 'epoch': 0.71}


 71%|███████   | 9620/13552 [55:53<21:39,  3.03it/s]

{'loss': 0.1348, 'grad_norm': 0.05391249060630798, 'learning_rate': 3.885101068219681e-05, 'epoch': 0.71}


 71%|███████   | 9630/13552 [55:57<22:33,  2.90it/s]

{'loss': 0.1377, 'grad_norm': 0.07713229209184647, 'learning_rate': 3.8667478311601726e-05, 'epoch': 0.71}


 71%|███████   | 9640/13552 [56:00<25:09,  2.59it/s]

{'loss': 0.1347, 'grad_norm': 0.04773189127445221, 'learning_rate': 3.848427651325622e-05, 'epoch': 0.71}


 71%|███████   | 9650/13552 [56:04<22:20,  2.91it/s]

{'loss': 0.1283, 'grad_norm': 0.05091167986392975, 'learning_rate': 3.8301406274587924e-05, 'epoch': 0.71}


 71%|███████▏  | 9660/13552 [56:07<21:12,  3.06it/s]

{'loss': 0.1364, 'grad_norm': 0.06859109550714493, 'learning_rate': 3.811886858123749e-05, 'epoch': 0.71}


 71%|███████▏  | 9670/13552 [56:10<20:33,  3.15it/s]

{'loss': 0.1519, 'grad_norm': 0.06234729290008545, 'learning_rate': 3.793666441705326e-05, 'epoch': 0.71}


 71%|███████▏  | 9680/13552 [56:13<20:49,  3.10it/s]

{'loss': 0.1381, 'grad_norm': 0.06332679837942123, 'learning_rate': 3.775479476408581e-05, 'epoch': 0.71}


 72%|███████▏  | 9690/13552 [56:17<20:05,  3.20it/s]

{'loss': 0.1472, 'grad_norm': 0.07841852307319641, 'learning_rate': 3.757326060258278e-05, 'epoch': 0.72}


 72%|███████▏  | 9700/13552 [56:20<20:55,  3.07it/s]

{'loss': 0.1323, 'grad_norm': 0.04842384159564972, 'learning_rate': 3.73920629109836e-05, 'epoch': 0.72}


 72%|███████▏  | 9710/13552 [56:23<19:47,  3.23it/s]

{'loss': 0.1444, 'grad_norm': 0.0829302966594696, 'learning_rate': 3.7211202665914155e-05, 'epoch': 0.72}


 72%|███████▏  | 9720/13552 [56:26<21:34,  2.96it/s]

{'loss': 0.1478, 'grad_norm': 0.1983530968427658, 'learning_rate': 3.7030680842181566e-05, 'epoch': 0.72}


 72%|███████▏  | 9730/13552 [56:30<21:01,  3.03it/s]

{'loss': 0.135, 'grad_norm': 0.05585908517241478, 'learning_rate': 3.685049841276886e-05, 'epoch': 0.72}


 72%|███████▏  | 9740/13552 [56:33<19:40,  3.23it/s]

{'loss': 0.1731, 'grad_norm': 0.0859040841460228, 'learning_rate': 3.6670656348829846e-05, 'epoch': 0.72}


 72%|███████▏  | 9750/13552 [56:36<21:56,  2.89it/s]

{'loss': 0.1356, 'grad_norm': 0.11060899496078491, 'learning_rate': 3.649115561968381e-05, 'epoch': 0.72}


 72%|███████▏  | 9760/13552 [56:40<20:37,  3.06it/s]

{'loss': 0.1434, 'grad_norm': 0.051331497728824615, 'learning_rate': 3.63119971928103e-05, 'epoch': 0.72}


 72%|███████▏  | 9770/13552 [56:43<20:29,  3.08it/s]

{'loss': 0.1285, 'grad_norm': 0.06899034976959229, 'learning_rate': 3.6133182033843885e-05, 'epoch': 0.72}


 72%|███████▏  | 9780/13552 [56:46<21:30,  2.92it/s]

{'loss': 0.157, 'grad_norm': 0.09072012454271317, 'learning_rate': 3.5954711106568996e-05, 'epoch': 0.72}


 72%|███████▏  | 9790/13552 [56:49<19:31,  3.21it/s]

{'loss': 0.1334, 'grad_norm': 0.0579218752682209, 'learning_rate': 3.5776585372914696e-05, 'epoch': 0.72}


 72%|███████▏  | 9800/13552 [56:53<22:41,  2.76it/s]

{'loss': 0.1365, 'grad_norm': 0.09646713733673096, 'learning_rate': 3.5598805792949564e-05, 'epoch': 0.72}


 72%|███████▏  | 9810/13552 [56:56<20:02,  3.11it/s]

{'loss': 0.1371, 'grad_norm': 0.06412503123283386, 'learning_rate': 3.5421373324876436e-05, 'epoch': 0.72}


 72%|███████▏  | 9820/13552 [57:00<22:00,  2.83it/s]

{'loss': 0.1363, 'grad_norm': 0.1172783151268959, 'learning_rate': 3.5244288925027204e-05, 'epoch': 0.72}


 73%|███████▎  | 9830/13552 [57:03<19:16,  3.22it/s]

{'loss': 0.1402, 'grad_norm': 0.08947031199932098, 'learning_rate': 3.5067553547857814e-05, 'epoch': 0.73}


 73%|███████▎  | 9840/13552 [57:06<23:21,  2.65it/s]

{'loss': 0.1439, 'grad_norm': 0.07627585530281067, 'learning_rate': 3.489116814594302e-05, 'epoch': 0.73}


 73%|███████▎  | 9850/13552 [57:09<18:58,  3.25it/s]

{'loss': 0.1385, 'grad_norm': 0.09978247433900833, 'learning_rate': 3.4715133669971246e-05, 'epoch': 0.73}


 73%|███████▎  | 9860/13552 [57:12<19:01,  3.23it/s]

{'loss': 0.156, 'grad_norm': 0.06786175072193146, 'learning_rate': 3.453945106873949e-05, 'epoch': 0.73}


 73%|███████▎  | 9870/13552 [57:16<20:32,  2.99it/s]

{'loss': 0.1277, 'grad_norm': 0.062049511820077896, 'learning_rate': 3.436412128914822e-05, 'epoch': 0.73}


 73%|███████▎  | 9880/13552 [57:20<22:20,  2.74it/s]

{'loss': 0.1189, 'grad_norm': 0.06157045066356659, 'learning_rate': 3.4189145276196245e-05, 'epoch': 0.73}


 73%|███████▎  | 9890/13552 [57:23<19:04,  3.20it/s]

{'loss': 0.1362, 'grad_norm': 0.06063307076692581, 'learning_rate': 3.401452397297561e-05, 'epoch': 0.73}


 73%|███████▎  | 9900/13552 [57:26<23:58,  2.54it/s]

{'loss': 0.1538, 'grad_norm': 0.06530322134494781, 'learning_rate': 3.384025832066655e-05, 'epoch': 0.73}


 73%|███████▎  | 9910/13552 [57:30<19:06,  3.18it/s]

{'loss': 0.1401, 'grad_norm': 0.05046127364039421, 'learning_rate': 3.3666349258532406e-05, 'epoch': 0.73}


 73%|███████▎  | 9920/13552 [57:33<18:37,  3.25it/s]

{'loss': 0.1511, 'grad_norm': 0.07457349449396133, 'learning_rate': 3.349279772391454e-05, 'epoch': 0.73}


 73%|███████▎  | 9930/13552 [57:36<22:40,  2.66it/s]

{'loss': 0.155, 'grad_norm': 0.06132678687572479, 'learning_rate': 3.331960465222731e-05, 'epoch': 0.73}


 73%|███████▎  | 9940/13552 [57:39<18:41,  3.22it/s]

{'loss': 0.1491, 'grad_norm': 0.2064400315284729, 'learning_rate': 3.314677097695301e-05, 'epoch': 0.73}


 73%|███████▎  | 9950/13552 [57:43<25:09,  2.39it/s]

{'loss': 0.1347, 'grad_norm': 0.09759804606437683, 'learning_rate': 3.2974297629636865e-05, 'epoch': 0.73}


 73%|███████▎  | 9960/13552 [57:46<18:36,  3.22it/s]

{'loss': 0.1548, 'grad_norm': 0.07921770960092545, 'learning_rate': 3.2802185539881956e-05, 'epoch': 0.73}


 74%|███████▎  | 9970/13552 [57:49<19:38,  3.04it/s]

{'loss': 0.1409, 'grad_norm': 0.06677687168121338, 'learning_rate': 3.263043563534428e-05, 'epoch': 0.74}


 74%|███████▎  | 9980/13552 [57:52<18:30,  3.22it/s]

{'loss': 0.1544, 'grad_norm': 0.12970088422298431, 'learning_rate': 3.245904884172772e-05, 'epoch': 0.74}


 74%|███████▎  | 9990/13552 [57:56<19:08,  3.10it/s]

{'loss': 0.1372, 'grad_norm': 0.1327550858259201, 'learning_rate': 3.228802608277899e-05, 'epoch': 0.74}


 74%|███████▍  | 10000/13552 [57:59<18:25,  3.21it/s]

{'loss': 0.1518, 'grad_norm': 0.07647386193275452, 'learning_rate': 3.211736828028278e-05, 'epoch': 0.74}


 74%|███████▍  | 10010/13552 [58:02<20:35,  2.87it/s]

{'loss': 0.1294, 'grad_norm': 0.15426303446292877, 'learning_rate': 3.194707635405665e-05, 'epoch': 0.74}


 74%|███████▍  | 10020/13552 [58:05<19:35,  3.00it/s]

{'loss': 0.1324, 'grad_norm': 0.05329939350485802, 'learning_rate': 3.1777151221946285e-05, 'epoch': 0.74}


 74%|███████▍  | 10030/13552 [58:09<19:49,  2.96it/s]

{'loss': 0.138, 'grad_norm': 0.0723162367939949, 'learning_rate': 3.16075937998203e-05, 'epoch': 0.74}


 74%|███████▍  | 10040/13552 [58:12<19:24,  3.02it/s]

{'loss': 0.1268, 'grad_norm': 0.09682448208332062, 'learning_rate': 3.143840500156542e-05, 'epoch': 0.74}


 74%|███████▍  | 10050/13552 [58:15<18:27,  3.16it/s]

{'loss': 0.1352, 'grad_norm': 0.06353987753391266, 'learning_rate': 3.126958573908156e-05, 'epoch': 0.74}


 74%|███████▍  | 10060/13552 [58:19<20:01,  2.91it/s]

{'loss': 0.1408, 'grad_norm': 0.0625171810388565, 'learning_rate': 3.1101136922276955e-05, 'epoch': 0.74}


 74%|███████▍  | 10070/13552 [58:22<18:09,  3.20it/s]

{'loss': 0.1453, 'grad_norm': 0.06544759124517441, 'learning_rate': 3.093305945906308e-05, 'epoch': 0.74}


 74%|███████▍  | 10080/13552 [58:25<23:01,  2.51it/s]

{'loss': 0.1437, 'grad_norm': 0.035638391971588135, 'learning_rate': 3.076535425534996e-05, 'epoch': 0.74}


 74%|███████▍  | 10090/13552 [58:28<21:40,  2.66it/s]

{'loss': 0.1361, 'grad_norm': 0.04479450732469559, 'learning_rate': 3.0598022215041175e-05, 'epoch': 0.74}


 75%|███████▍  | 10100/13552 [58:32<19:23,  2.97it/s]

{'loss': 0.1355, 'grad_norm': 0.058087870478630066, 'learning_rate': 3.043106424002905e-05, 'epoch': 0.75}


 75%|███████▍  | 10110/13552 [58:35<18:31,  3.10it/s]

{'loss': 0.1315, 'grad_norm': 0.08396309614181519, 'learning_rate': 3.0264481230189724e-05, 'epoch': 0.75}


 75%|███████▍  | 10120/13552 [58:39<26:48,  2.13it/s]

{'loss': 0.145, 'grad_norm': 0.43318140506744385, 'learning_rate': 3.009827408337834e-05, 'epoch': 0.75}


 75%|███████▍  | 10130/13552 [58:42<18:52,  3.02it/s]

{'loss': 0.1507, 'grad_norm': 0.06559160351753235, 'learning_rate': 2.9932443695424217e-05, 'epoch': 0.75}


 75%|███████▍  | 10140/13552 [58:46<18:45,  3.03it/s]

{'loss': 0.1387, 'grad_norm': 0.05766221508383751, 'learning_rate': 2.976699096012594e-05, 'epoch': 0.75}


 75%|███████▍  | 10150/13552 [58:49<18:32,  3.06it/s]

{'loss': 0.1482, 'grad_norm': 0.07893472164869308, 'learning_rate': 2.9601916769246773e-05, 'epoch': 0.75}


 75%|███████▍  | 10160/13552 [58:52<18:48,  3.00it/s]

{'loss': 0.1429, 'grad_norm': 0.06644153594970703, 'learning_rate': 2.943722201250948e-05, 'epoch': 0.75}


 75%|███████▌  | 10170/13552 [58:56<18:10,  3.10it/s]

{'loss': 0.1443, 'grad_norm': 0.07703819870948792, 'learning_rate': 2.927290757759187e-05, 'epoch': 0.75}


 75%|███████▌  | 10180/13552 [58:59<24:23,  2.30it/s]

{'loss': 0.1325, 'grad_norm': 0.08139296621084213, 'learning_rate': 2.9108974350121822e-05, 'epoch': 0.75}


 75%|███████▌  | 10190/13552 [59:02<17:44,  3.16it/s]

{'loss': 0.1417, 'grad_norm': 0.0640001893043518, 'learning_rate': 2.8945423213672608e-05, 'epoch': 0.75}


 75%|███████▌  | 10200/13552 [59:06<20:21,  2.75it/s]

{'loss': 0.1345, 'grad_norm': 0.05434199422597885, 'learning_rate': 2.8782255049758077e-05, 'epoch': 0.75}


 75%|███████▌  | 10210/13552 [59:09<17:43,  3.14it/s]

{'loss': 0.1394, 'grad_norm': 0.07201676070690155, 'learning_rate': 2.8619470737827904e-05, 'epoch': 0.75}


 75%|███████▌  | 10220/13552 [59:13<20:10,  2.75it/s]

{'loss': 0.1412, 'grad_norm': 0.0656510442495346, 'learning_rate': 2.8457071155262884e-05, 'epoch': 0.75}


 75%|███████▌  | 10230/13552 [59:16<19:08,  2.89it/s]

{'loss': 0.1399, 'grad_norm': 0.07156604528427124, 'learning_rate': 2.8295057177370165e-05, 'epoch': 0.75}


 76%|███████▌  | 10240/13552 [59:19<19:15,  2.87it/s]

{'loss': 0.1403, 'grad_norm': 0.04844985902309418, 'learning_rate': 2.8133429677378577e-05, 'epoch': 0.76}


 76%|███████▌  | 10250/13552 [59:23<20:42,  2.66it/s]

{'loss': 0.1202, 'grad_norm': 0.7938644289970398, 'learning_rate': 2.7972189526433866e-05, 'epoch': 0.76}


 76%|███████▌  | 10260/13552 [59:27<19:50,  2.76it/s]

{'loss': 0.1223, 'grad_norm': 0.0722837820649147, 'learning_rate': 2.7811337593594056e-05, 'epoch': 0.76}


 76%|███████▌  | 10270/13552 [59:30<17:59,  3.04it/s]

{'loss': 0.1497, 'grad_norm': 0.06762567162513733, 'learning_rate': 2.765087474582473e-05, 'epoch': 0.76}


 76%|███████▌  | 10280/13552 [59:34<17:31,  3.11it/s]

{'loss': 0.1511, 'grad_norm': 0.07953675836324692, 'learning_rate': 2.749080184799435e-05, 'epoch': 0.76}


 76%|███████▌  | 10290/13552 [59:37<17:08,  3.17it/s]

{'loss': 0.1396, 'grad_norm': 0.07090583443641663, 'learning_rate': 2.7331119762869638e-05, 'epoch': 0.76}


 76%|███████▌  | 10300/13552 [59:40<17:09,  3.16it/s]

{'loss': 0.1508, 'grad_norm': 0.04509052634239197, 'learning_rate': 2.7171829351110877e-05, 'epoch': 0.76}


 76%|███████▌  | 10310/13552 [59:43<17:25,  3.10it/s]

{'loss': 0.1265, 'grad_norm': 0.054802872240543365, 'learning_rate': 2.701293147126731e-05, 'epoch': 0.76}


 76%|███████▌  | 10320/13552 [59:47<20:07,  2.68it/s]

{'loss': 0.1421, 'grad_norm': 0.10277024656534195, 'learning_rate': 2.685442697977253e-05, 'epoch': 0.76}


 76%|███████▌  | 10330/13552 [59:50<17:15,  3.11it/s]

{'loss': 0.1372, 'grad_norm': 0.08502859622240067, 'learning_rate': 2.669631673093972e-05, 'epoch': 0.76}


 76%|███████▋  | 10340/13552 [59:53<20:07,  2.66it/s]

{'loss': 0.1237, 'grad_norm': 0.08274891972541809, 'learning_rate': 2.6538601576957268e-05, 'epoch': 0.76}


 76%|███████▋  | 10350/13552 [59:57<17:12,  3.10it/s]

{'loss': 0.1468, 'grad_norm': 0.08417898416519165, 'learning_rate': 2.638128236788403e-05, 'epoch': 0.76}


 76%|███████▋  | 10360/13552 [1:00:00<16:55,  3.14it/s]

{'loss': 0.1478, 'grad_norm': 0.07811901718378067, 'learning_rate': 2.622435995164475e-05, 'epoch': 0.76}


 77%|███████▋  | 10370/13552 [1:00:03<16:51,  3.14it/s]

{'loss': 0.1421, 'grad_norm': 0.06354970484972, 'learning_rate': 2.6067835174025623e-05, 'epoch': 0.77}


 77%|███████▋  | 10380/13552 [1:00:07<19:27,  2.72it/s]

{'loss': 0.1235, 'grad_norm': 0.06867978721857071, 'learning_rate': 2.591170887866955e-05, 'epoch': 0.77}


 77%|███████▋  | 10390/13552 [1:00:10<18:12,  2.89it/s]

{'loss': 0.1445, 'grad_norm': 0.051766544580459595, 'learning_rate': 2.575598190707168e-05, 'epoch': 0.77}


 77%|███████▋  | 10400/13552 [1:00:13<16:58,  3.09it/s]

{'loss': 0.1412, 'grad_norm': 0.05874105170369148, 'learning_rate': 2.5600655098574934e-05, 'epoch': 0.77}


 77%|███████▋  | 10410/13552 [1:00:17<17:08,  3.06it/s]

{'loss': 0.1376, 'grad_norm': 0.07862585783004761, 'learning_rate': 2.5445729290365315e-05, 'epoch': 0.77}


 77%|███████▋  | 10420/13552 [1:00:20<18:03,  2.89it/s]

{'loss': 0.1452, 'grad_norm': 0.35232701897621155, 'learning_rate': 2.5291205317467593e-05, 'epoch': 0.77}


 77%|███████▋  | 10430/13552 [1:00:23<16:59,  3.06it/s]

{'loss': 0.1552, 'grad_norm': 0.08346445113420486, 'learning_rate': 2.5137084012740687e-05, 'epoch': 0.77}


 77%|███████▋  | 10440/13552 [1:00:27<17:21,  2.99it/s]

{'loss': 0.1298, 'grad_norm': 0.09142136573791504, 'learning_rate': 2.4983366206873183e-05, 'epoch': 0.77}


 77%|███████▋  | 10450/13552 [1:00:31<22:36,  2.29it/s]

{'loss': 0.1227, 'grad_norm': 0.059860486537218094, 'learning_rate': 2.4830052728378885e-05, 'epoch': 0.77}


 77%|███████▋  | 10460/13552 [1:00:34<18:29,  2.79it/s]

{'loss': 0.143, 'grad_norm': 0.07670790702104568, 'learning_rate': 2.4677144403592346e-05, 'epoch': 0.77}


 77%|███████▋  | 10470/13552 [1:00:37<16:42,  3.07it/s]

{'loss': 0.1429, 'grad_norm': 0.07169478386640549, 'learning_rate': 2.4524642056664394e-05, 'epoch': 0.77}


 77%|███████▋  | 10480/13552 [1:00:40<16:34,  3.09it/s]

{'loss': 0.1414, 'grad_norm': 0.05017385631799698, 'learning_rate': 2.4372546509557725e-05, 'epoch': 0.77}


 77%|███████▋  | 10490/13552 [1:00:44<16:33,  3.08it/s]

{'loss': 0.1554, 'grad_norm': 0.08587495982646942, 'learning_rate': 2.4220858582042405e-05, 'epoch': 0.77}


 77%|███████▋  | 10500/13552 [1:00:47<16:30,  3.08it/s]

{'loss': 0.1374, 'grad_norm': 0.09534575790166855, 'learning_rate': 2.4069579091691564e-05, 'epoch': 0.77}


 78%|███████▊  | 10510/13552 [1:00:51<16:38,  3.05it/s]

{'loss': 0.1372, 'grad_norm': 0.04690456762909889, 'learning_rate': 2.391870885387685e-05, 'epoch': 0.78}


 78%|███████▊  | 10520/13552 [1:00:54<16:11,  3.12it/s]

{'loss': 0.1458, 'grad_norm': 0.0749107226729393, 'learning_rate': 2.376824868176416e-05, 'epoch': 0.78}


 78%|███████▊  | 10530/13552 [1:00:57<16:00,  3.15it/s]

{'loss': 0.1402, 'grad_norm': 0.07726910710334778, 'learning_rate': 2.361819938630918e-05, 'epoch': 0.78}


 78%|███████▊  | 10540/13552 [1:01:01<19:07,  2.63it/s]

{'loss': 0.1302, 'grad_norm': 0.05578554421663284, 'learning_rate': 2.3468561776253052e-05, 'epoch': 0.78}


 78%|███████▊  | 10550/13552 [1:01:04<15:58,  3.13it/s]

{'loss': 0.1489, 'grad_norm': 0.07226667553186417, 'learning_rate': 2.3319336658117984e-05, 'epoch': 0.78}


 78%|███████▊  | 10560/13552 [1:01:07<16:17,  3.06it/s]

{'loss': 0.1441, 'grad_norm': 0.10486423969268799, 'learning_rate': 2.3170524836202933e-05, 'epoch': 0.78}


 78%|███████▊  | 10570/13552 [1:01:11<17:30,  2.84it/s]

{'loss': 0.1148, 'grad_norm': 0.06312423944473267, 'learning_rate': 2.302212711257925e-05, 'epoch': 0.78}


 78%|███████▊  | 10580/13552 [1:01:14<15:48,  3.13it/s]

{'loss': 0.1309, 'grad_norm': 0.054951392114162445, 'learning_rate': 2.2874144287086373e-05, 'epoch': 0.78}


 78%|███████▊  | 10590/13552 [1:01:18<16:02,  3.08it/s]

{'loss': 0.1263, 'grad_norm': 0.07908377796411514, 'learning_rate': 2.272657715732751e-05, 'epoch': 0.78}


 78%|███████▊  | 10600/13552 [1:01:22<16:59,  2.90it/s]

{'loss': 0.1323, 'grad_norm': 0.09106499701738358, 'learning_rate': 2.2579426518665305e-05, 'epoch': 0.78}


 78%|███████▊  | 10610/13552 [1:01:25<17:30,  2.80it/s]

{'loss': 0.1326, 'grad_norm': 0.06680164486169815, 'learning_rate': 2.2432693164217622e-05, 'epoch': 0.78}


 78%|███████▊  | 10620/13552 [1:01:29<17:31,  2.79it/s]

{'loss': 0.1269, 'grad_norm': 0.08527223765850067, 'learning_rate': 2.2286377884853203e-05, 'epoch': 0.78}


 78%|███████▊  | 10630/13552 [1:01:33<18:04,  2.69it/s]

{'loss': 0.1348, 'grad_norm': 0.07212690263986588, 'learning_rate': 2.2140481469187467e-05, 'epoch': 0.78}


 79%|███████▊  | 10640/13552 [1:01:36<19:06,  2.54it/s]

{'loss': 0.133, 'grad_norm': 0.049000103026628494, 'learning_rate': 2.1995004703578193e-05, 'epoch': 0.79}


 79%|███████▊  | 10650/13552 [1:01:40<20:22,  2.37it/s]

{'loss': 0.1394, 'grad_norm': 0.06448903679847717, 'learning_rate': 2.184994837212133e-05, 'epoch': 0.79}


 79%|███████▊  | 10660/13552 [1:01:43<20:07,  2.40it/s]

{'loss': 0.1518, 'grad_norm': 0.06618174910545349, 'learning_rate': 2.1705313256646774e-05, 'epoch': 0.79}


 79%|███████▊  | 10670/13552 [1:01:47<15:23,  3.12it/s]

{'loss': 0.1501, 'grad_norm': 0.17535172402858734, 'learning_rate': 2.1561100136714075e-05, 'epoch': 0.79}


 79%|███████▉  | 10680/13552 [1:01:50<15:44,  3.04it/s]

{'loss': 0.1528, 'grad_norm': 0.05662962421774864, 'learning_rate': 2.1417309789608352e-05, 'epoch': 0.79}


 79%|███████▉  | 10690/13552 [1:01:54<18:42,  2.55it/s]

{'loss': 0.1222, 'grad_norm': 0.05857061222195625, 'learning_rate': 2.127394299033604e-05, 'epoch': 0.79}


 79%|███████▉  | 10700/13552 [1:01:57<15:46,  3.01it/s]

{'loss': 0.1464, 'grad_norm': 0.039186831563711166, 'learning_rate': 2.1131000511620702e-05, 'epoch': 0.79}


 79%|███████▉  | 10710/13552 [1:02:00<15:18,  3.09it/s]

{'loss': 0.1461, 'grad_norm': 0.08282879739999771, 'learning_rate': 2.0988483123898885e-05, 'epoch': 0.79}


 79%|███████▉  | 10720/13552 [1:02:04<15:16,  3.09it/s]

{'loss': 0.1317, 'grad_norm': 0.07938345521688461, 'learning_rate': 2.0846391595316005e-05, 'epoch': 0.79}


 79%|███████▉  | 10730/13552 [1:02:07<15:24,  3.05it/s]

{'loss': 0.1409, 'grad_norm': 0.07274535298347473, 'learning_rate': 2.070472669172213e-05, 'epoch': 0.79}


 79%|███████▉  | 10740/13552 [1:02:10<14:50,  3.16it/s]

{'loss': 0.1483, 'grad_norm': 0.07226946204900742, 'learning_rate': 2.056348917666788e-05, 'epoch': 0.79}


 79%|███████▉  | 10750/13552 [1:02:14<15:01,  3.11it/s]

{'loss': 0.1469, 'grad_norm': 0.1446947455406189, 'learning_rate': 2.0422679811400368e-05, 'epoch': 0.79}


 79%|███████▉  | 10760/13552 [1:02:17<15:30,  3.00it/s]

{'loss': 0.1414, 'grad_norm': 0.05418514087796211, 'learning_rate': 2.028229935485896e-05, 'epoch': 0.79}


 79%|███████▉  | 10770/13552 [1:02:21<19:07,  2.42it/s]

{'loss': 0.1228, 'grad_norm': 0.08298011869192123, 'learning_rate': 2.0142348563671366e-05, 'epoch': 0.79}


 80%|███████▉  | 10780/13552 [1:02:25<14:41,  3.14it/s]

{'loss': 0.1602, 'grad_norm': 0.27674785256385803, 'learning_rate': 2.0002828192149425e-05, 'epoch': 0.8}


 80%|███████▉  | 10790/13552 [1:02:28<14:38,  3.15it/s]

{'loss': 0.1419, 'grad_norm': 0.05509417504072189, 'learning_rate': 1.9863738992285096e-05, 'epoch': 0.8}


 80%|███████▉  | 10800/13552 [1:02:31<14:45,  3.11it/s]

{'loss': 0.1355, 'grad_norm': 0.06462132930755615, 'learning_rate': 1.9725081713746407e-05, 'epoch': 0.8}


 80%|███████▉  | 10810/13552 [1:02:34<14:35,  3.13it/s]

{'loss': 0.1495, 'grad_norm': 0.06997428834438324, 'learning_rate': 1.9586857103873368e-05, 'epoch': 0.8}


 80%|███████▉  | 10820/13552 [1:02:37<14:34,  3.12it/s]

{'loss': 0.1444, 'grad_norm': 0.11065515875816345, 'learning_rate': 1.9449065907674002e-05, 'epoch': 0.8}


 80%|███████▉  | 10830/13552 [1:02:41<18:16,  2.48it/s]

{'loss': 0.136, 'grad_norm': 0.08729399740695953, 'learning_rate': 1.93117088678203e-05, 'epoch': 0.8}


 80%|███████▉  | 10840/13552 [1:02:45<19:00,  2.38it/s]

{'loss': 0.1332, 'grad_norm': 0.043180204927921295, 'learning_rate': 1.91747867246442e-05, 'epoch': 0.8}


 80%|████████  | 10850/13552 [1:02:48<14:27,  3.12it/s]

{'loss': 0.1487, 'grad_norm': 0.07844395190477371, 'learning_rate': 1.9038300216133654e-05, 'epoch': 0.8}


 80%|████████  | 10860/13552 [1:02:51<14:18,  3.14it/s]

{'loss': 0.1333, 'grad_norm': 0.05097299441695213, 'learning_rate': 1.8902250077928586e-05, 'epoch': 0.8}


 80%|████████  | 10870/13552 [1:02:54<14:14,  3.14it/s]

{'loss': 0.1487, 'grad_norm': 0.06793306022882462, 'learning_rate': 1.876663704331697e-05, 'epoch': 0.8}


 80%|████████  | 10880/13552 [1:02:58<14:55,  2.98it/s]

{'loss': 0.1311, 'grad_norm': 0.06642124801874161, 'learning_rate': 1.8631461843230846e-05, 'epoch': 0.8}


 80%|████████  | 10890/13552 [1:03:01<14:04,  3.15it/s]

{'loss': 0.1502, 'grad_norm': 0.07613896578550339, 'learning_rate': 1.8496725206242416e-05, 'epoch': 0.8}


 80%|████████  | 10900/13552 [1:03:04<14:40,  3.01it/s]

{'loss': 0.1392, 'grad_norm': 0.07455355674028397, 'learning_rate': 1.8362427858560093e-05, 'epoch': 0.8}


 81%|████████  | 10910/13552 [1:03:07<14:05,  3.13it/s]

{'loss': 0.145, 'grad_norm': 0.05047590658068657, 'learning_rate': 1.822857052402459e-05, 'epoch': 0.81}


 81%|████████  | 10920/13552 [1:03:11<13:55,  3.15it/s]

{'loss': 0.1588, 'grad_norm': 0.06301295757293701, 'learning_rate': 1.809515392410506e-05, 'epoch': 0.81}


 81%|████████  | 10930/13552 [1:03:14<14:12,  3.07it/s]

{'loss': 0.1385, 'grad_norm': 0.08975902199745178, 'learning_rate': 1.7962178777895034e-05, 'epoch': 0.81}


 81%|████████  | 10940/13552 [1:03:17<17:44,  2.45it/s]

{'loss': 0.1385, 'grad_norm': 0.2195959836244583, 'learning_rate': 1.7829645802108863e-05, 'epoch': 0.81}


 81%|████████  | 10950/13552 [1:03:21<13:58,  3.10it/s]

{'loss': 0.1314, 'grad_norm': 0.07655134797096252, 'learning_rate': 1.7697555711077574e-05, 'epoch': 0.81}


 81%|████████  | 10960/13552 [1:03:24<13:41,  3.16it/s]

{'loss': 0.1542, 'grad_norm': 0.05433037504553795, 'learning_rate': 1.7565909216745115e-05, 'epoch': 0.81}


 81%|████████  | 10970/13552 [1:03:27<14:51,  2.90it/s]

{'loss': 0.1348, 'grad_norm': 0.047690752893686295, 'learning_rate': 1.7434707028664534e-05, 'epoch': 0.81}


 81%|████████  | 10980/13552 [1:03:30<14:53,  2.88it/s]

{'loss': 0.1405, 'grad_norm': 0.1294102966785431, 'learning_rate': 1.7303949853994138e-05, 'epoch': 0.81}


 81%|████████  | 10990/13552 [1:03:34<15:20,  2.78it/s]

{'loss': 0.1327, 'grad_norm': 0.07350840419530869, 'learning_rate': 1.717363839749371e-05, 'epoch': 0.81}


 81%|████████  | 11000/13552 [1:03:37<14:30,  2.93it/s]

{'loss': 0.1437, 'grad_norm': 0.08703648298978806, 'learning_rate': 1.7043773361520666e-05, 'epoch': 0.81}


 81%|████████  | 11010/13552 [1:03:40<13:28,  3.14it/s]

{'loss': 0.1464, 'grad_norm': 0.05800513178110123, 'learning_rate': 1.691435544602624e-05, 'epoch': 0.81}


 81%|████████▏ | 11020/13552 [1:03:44<15:04,  2.80it/s]

{'loss': 0.1324, 'grad_norm': 0.05571383237838745, 'learning_rate': 1.678538534855185e-05, 'epoch': 0.81}


 81%|████████▏ | 11030/13552 [1:03:47<13:19,  3.16it/s]

{'loss': 0.1469, 'grad_norm': 0.22300967574119568, 'learning_rate': 1.6656863764225196e-05, 'epoch': 0.81}


 81%|████████▏ | 11040/13552 [1:03:51<14:10,  2.95it/s]

{'loss': 0.1378, 'grad_norm': 0.06716526299715042, 'learning_rate': 1.652879138575656e-05, 'epoch': 0.81}


 82%|████████▏ | 11050/13552 [1:03:54<13:28,  3.09it/s]

{'loss': 0.1352, 'grad_norm': 0.179058238863945, 'learning_rate': 1.6401168903435073e-05, 'epoch': 0.82}


 82%|████████▏ | 11060/13552 [1:03:58<13:27,  3.09it/s]

{'loss': 0.1401, 'grad_norm': 0.06803583353757858, 'learning_rate': 1.6273997005125073e-05, 'epoch': 0.82}


 82%|████████▏ | 11070/13552 [1:04:01<14:08,  2.93it/s]

{'loss': 0.1345, 'grad_norm': 0.06195668876171112, 'learning_rate': 1.6147276376262255e-05, 'epoch': 0.82}


 82%|████████▏ | 11080/13552 [1:04:04<13:20,  3.09it/s]

{'loss': 0.1448, 'grad_norm': 0.05841364711523056, 'learning_rate': 1.6021007699850033e-05, 'epoch': 0.82}


 82%|████████▏ | 11090/13552 [1:04:08<17:15,  2.38it/s]

{'loss': 0.1368, 'grad_norm': 0.05651392415165901, 'learning_rate': 1.5895191656455944e-05, 'epoch': 0.82}


 82%|████████▏ | 11100/13552 [1:04:11<13:47,  2.96it/s]

{'loss': 0.1324, 'grad_norm': 0.05794134363532066, 'learning_rate': 1.576982892420781e-05, 'epoch': 0.82}


 82%|████████▏ | 11110/13552 [1:04:15<13:21,  3.05it/s]

{'loss': 0.1397, 'grad_norm': 0.04627124220132828, 'learning_rate': 1.5644920178790235e-05, 'epoch': 0.82}


 82%|████████▏ | 11120/13552 [1:04:18<13:11,  3.07it/s]

{'loss': 0.1526, 'grad_norm': 0.057678330689668655, 'learning_rate': 1.5520466093440933e-05, 'epoch': 0.82}


 82%|████████▏ | 11130/13552 [1:04:22<14:22,  2.81it/s]

{'loss': 0.1308, 'grad_norm': 0.060904596000909805, 'learning_rate': 1.539646733894704e-05, 'epoch': 0.82}


 82%|████████▏ | 11140/13552 [1:04:25<13:09,  3.05it/s]

{'loss': 0.1292, 'grad_norm': 0.06203493848443031, 'learning_rate': 1.5272924583641523e-05, 'epoch': 0.82}


 82%|████████▏ | 11150/13552 [1:04:29<13:04,  3.06it/s]

{'loss': 0.1357, 'grad_norm': 0.0738619938492775, 'learning_rate': 1.5149838493399615e-05, 'epoch': 0.82}


 82%|████████▏ | 11160/13552 [1:04:32<13:15,  3.01it/s]

{'loss': 0.1288, 'grad_norm': 0.07168109714984894, 'learning_rate': 1.5027209731635195e-05, 'epoch': 0.82}


 82%|████████▏ | 11170/13552 [1:04:36<18:39,  2.13it/s]

{'loss': 0.1277, 'grad_norm': 0.04721134528517723, 'learning_rate': 1.4905038959297213e-05, 'epoch': 0.82}


 82%|████████▏ | 11180/13552 [1:04:40<14:52,  2.66it/s]

{'loss': 0.1344, 'grad_norm': 0.04821108281612396, 'learning_rate': 1.4783326834866107e-05, 'epoch': 0.82}


 83%|████████▎ | 11190/13552 [1:04:44<13:58,  2.82it/s]

{'loss': 0.1462, 'grad_norm': 0.0791422575712204, 'learning_rate': 1.4662074014350314e-05, 'epoch': 0.83}


 83%|████████▎ | 11200/13552 [1:04:47<12:47,  3.07it/s]

{'loss': 0.1446, 'grad_norm': 0.07727249711751938, 'learning_rate': 1.4541281151282682e-05, 'epoch': 0.83}


 83%|████████▎ | 11210/13552 [1:04:50<12:43,  3.07it/s]

{'loss': 0.1475, 'grad_norm': 0.08312854170799255, 'learning_rate': 1.4420948896716968e-05, 'epoch': 0.83}


 83%|████████▎ | 11220/13552 [1:04:54<14:01,  2.77it/s]

{'loss': 0.1352, 'grad_norm': 0.07190648466348648, 'learning_rate': 1.4301077899224314e-05, 'epoch': 0.83}


 83%|████████▎ | 11230/13552 [1:04:57<12:41,  3.05it/s]

{'loss': 0.1553, 'grad_norm': 0.17500722408294678, 'learning_rate': 1.418166880488977e-05, 'epoch': 0.83}


 83%|████████▎ | 11240/13552 [1:05:01<12:32,  3.07it/s]

{'loss': 0.1451, 'grad_norm': 0.05596638098359108, 'learning_rate': 1.4062722257308803e-05, 'epoch': 0.83}


 83%|████████▎ | 11250/13552 [1:05:04<12:28,  3.07it/s]

{'loss': 0.1456, 'grad_norm': 0.04960852861404419, 'learning_rate': 1.394423889758384e-05, 'epoch': 0.83}


 83%|████████▎ | 11260/13552 [1:05:07<12:23,  3.08it/s]

{'loss': 0.1431, 'grad_norm': 0.071451835334301, 'learning_rate': 1.3826219364320792e-05, 'epoch': 0.83}


 83%|████████▎ | 11270/13552 [1:05:11<12:46,  2.98it/s]

{'loss': 0.141, 'grad_norm': 0.07799211144447327, 'learning_rate': 1.3708664293625573e-05, 'epoch': 0.83}


 83%|████████▎ | 11280/13552 [1:05:14<12:22,  3.06it/s]

{'loss': 0.1456, 'grad_norm': 0.07765600085258484, 'learning_rate': 1.3591574319100819e-05, 'epoch': 0.83}


 83%|████████▎ | 11290/13552 [1:05:17<13:45,  2.74it/s]

{'loss': 0.1409, 'grad_norm': 0.05102000758051872, 'learning_rate': 1.3474950071842308e-05, 'epoch': 0.83}


 83%|████████▎ | 11300/13552 [1:05:21<13:01,  2.88it/s]

{'loss': 0.1321, 'grad_norm': 0.0519290529191494, 'learning_rate': 1.335879218043562e-05, 'epoch': 0.83}


 83%|████████▎ | 11310/13552 [1:05:24<12:07,  3.08it/s]

{'loss': 0.1356, 'grad_norm': 0.07184458523988724, 'learning_rate': 1.3243101270952795e-05, 'epoch': 0.83}


 84%|████████▎ | 11320/13552 [1:05:28<12:04,  3.08it/s]

{'loss': 0.1415, 'grad_norm': 0.08021481335163116, 'learning_rate': 1.3127877966948876e-05, 'epoch': 0.84}


 84%|████████▎ | 11330/13552 [1:05:31<12:43,  2.91it/s]

{'loss': 0.1407, 'grad_norm': 0.09004919975996017, 'learning_rate': 1.3013122889458606e-05, 'epoch': 0.84}


 84%|████████▎ | 11340/13552 [1:05:34<12:05,  3.05it/s]

{'loss': 0.1454, 'grad_norm': 0.07321729511022568, 'learning_rate': 1.2898836656993085e-05, 'epoch': 0.84}


 84%|████████▍ | 11350/13552 [1:05:38<11:53,  3.09it/s]

{'loss': 0.1483, 'grad_norm': 0.08599753677845001, 'learning_rate': 1.2785019885536354e-05, 'epoch': 0.84}


 84%|████████▍ | 11360/13552 [1:05:41<12:28,  2.93it/s]

{'loss': 0.1423, 'grad_norm': 0.06383849680423737, 'learning_rate': 1.26716731885422e-05, 'epoch': 0.84}


 84%|████████▍ | 11370/13552 [1:05:44<12:43,  2.86it/s]

{'loss': 0.1299, 'grad_norm': 0.06593560427427292, 'learning_rate': 1.2558797176930759e-05, 'epoch': 0.84}


 84%|████████▍ | 11380/13552 [1:05:48<12:17,  2.95it/s]

{'loss': 0.1482, 'grad_norm': 0.06584341078996658, 'learning_rate': 1.2446392459085244e-05, 'epoch': 0.84}


 84%|████████▍ | 11390/13552 [1:05:51<11:48,  3.05it/s]

{'loss': 0.1374, 'grad_norm': 0.08216458559036255, 'learning_rate': 1.2334459640848683e-05, 'epoch': 0.84}


 84%|████████▍ | 11400/13552 [1:05:54<11:51,  3.02it/s]

{'loss': 0.1393, 'grad_norm': 0.07123164087533951, 'learning_rate': 1.2222999325520623e-05, 'epoch': 0.84}


 84%|████████▍ | 11410/13552 [1:05:58<11:42,  3.05it/s]

{'loss': 0.127, 'grad_norm': 0.0629710853099823, 'learning_rate': 1.2112012113853954e-05, 'epoch': 0.84}


 84%|████████▍ | 11420/13552 [1:06:01<11:45,  3.02it/s]

{'loss': 0.1407, 'grad_norm': 0.06877202540636063, 'learning_rate': 1.2001498604051553e-05, 'epoch': 0.84}


 84%|████████▍ | 11430/13552 [1:06:04<11:34,  3.05it/s]

{'loss': 0.1446, 'grad_norm': 0.07499897480010986, 'learning_rate': 1.189145939176316e-05, 'epoch': 0.84}


 84%|████████▍ | 11440/13552 [1:06:08<13:12,  2.67it/s]

{'loss': 0.1329, 'grad_norm': 0.07355145364999771, 'learning_rate': 1.1781895070082072e-05, 'epoch': 0.84}


 84%|████████▍ | 11450/13552 [1:06:12<11:30,  3.04it/s]

{'loss': 0.145, 'grad_norm': 0.34084025025367737, 'learning_rate': 1.1672806229542055e-05, 'epoch': 0.84}


 85%|████████▍ | 11460/13552 [1:06:15<11:28,  3.04it/s]

{'loss': 0.1328, 'grad_norm': 0.07836157828569412, 'learning_rate': 1.1564193458114115e-05, 'epoch': 0.85}


 85%|████████▍ | 11470/13552 [1:06:18<11:23,  3.05it/s]

{'loss': 0.1407, 'grad_norm': 0.058219365775585175, 'learning_rate': 1.1456057341203297e-05, 'epoch': 0.85}


 85%|████████▍ | 11480/13552 [1:06:22<13:16,  2.60it/s]

{'loss': 0.128, 'grad_norm': 0.07676871865987778, 'learning_rate': 1.1348398461645571e-05, 'epoch': 0.85}


 85%|████████▍ | 11490/13552 [1:06:26<14:30,  2.37it/s]

{'loss': 0.1349, 'grad_norm': 0.04467107728123665, 'learning_rate': 1.1241217399704663e-05, 'epoch': 0.85}


 85%|████████▍ | 11500/13552 [1:06:30<11:36,  2.95it/s]

{'loss': 0.142, 'grad_norm': 0.2646111845970154, 'learning_rate': 1.1134514733068947e-05, 'epoch': 0.85}


 85%|████████▍ | 11510/13552 [1:06:33<11:42,  2.91it/s]

{'loss': 0.1236, 'grad_norm': 0.11257990449666977, 'learning_rate': 1.1028291036848326e-05, 'epoch': 0.85}


 85%|████████▌ | 11520/13552 [1:06:37<11:01,  3.07it/s]

{'loss': 0.1498, 'grad_norm': 0.07385735958814621, 'learning_rate': 1.0922546883571138e-05, 'epoch': 0.85}


 85%|████████▌ | 11530/13552 [1:06:40<11:07,  3.03it/s]

{'loss': 0.1532, 'grad_norm': 0.08378514647483826, 'learning_rate': 1.0817282843181043e-05, 'epoch': 0.85}


 85%|████████▌ | 11540/13552 [1:06:44<15:01,  2.23it/s]

{'loss': 0.1208, 'grad_norm': 0.06354502588510513, 'learning_rate': 1.0712499483034e-05, 'epoch': 0.85}


 85%|████████▌ | 11550/13552 [1:06:47<10:57,  3.04it/s]

{'loss': 0.1499, 'grad_norm': 0.053628675639629364, 'learning_rate': 1.0608197367895145e-05, 'epoch': 0.85}


 85%|████████▌ | 11560/13552 [1:06:51<11:23,  2.91it/s]

{'loss': 0.1421, 'grad_norm': 0.10898340493440628, 'learning_rate': 1.05043770599358e-05, 'epoch': 0.85}


 85%|████████▌ | 11570/13552 [1:06:55<13:25,  2.46it/s]

{'loss': 0.1353, 'grad_norm': 0.1836157888174057, 'learning_rate': 1.0401039118730449e-05, 'epoch': 0.85}


 85%|████████▌ | 11580/13552 [1:06:58<10:52,  3.02it/s]

{'loss': 0.1417, 'grad_norm': 0.247743159532547, 'learning_rate': 1.029818410125365e-05, 'epoch': 0.85}


 86%|████████▌ | 11590/13552 [1:07:01<11:12,  2.92it/s]

{'loss': 0.1264, 'grad_norm': 0.050958264619112015, 'learning_rate': 1.0195812561877116e-05, 'epoch': 0.86}


 86%|████████▌ | 11600/13552 [1:07:05<10:42,  3.04it/s]

{'loss': 0.1444, 'grad_norm': 0.06810607761144638, 'learning_rate': 1.0093925052366714e-05, 'epoch': 0.86}


 86%|████████▌ | 11610/13552 [1:07:08<10:38,  3.04it/s]

{'loss': 0.1292, 'grad_norm': 0.08451175689697266, 'learning_rate': 9.992522121879377e-06, 'epoch': 0.86}


 86%|████████▌ | 11620/13552 [1:07:11<10:36,  3.04it/s]

{'loss': 0.1473, 'grad_norm': 0.0798131674528122, 'learning_rate': 9.891604316960301e-06, 'epoch': 0.86}


 86%|████████▌ | 11630/13552 [1:07:15<13:40,  2.34it/s]

{'loss': 0.1423, 'grad_norm': 0.052379585802555084, 'learning_rate': 9.791172181539988e-06, 'epoch': 0.86}


 86%|████████▌ | 11640/13552 [1:07:19<11:16,  2.83it/s]

{'loss': 0.1379, 'grad_norm': 0.09231636673212051, 'learning_rate': 9.691226256931164e-06, 'epoch': 0.86}


 86%|████████▌ | 11650/13552 [1:07:22<10:20,  3.07it/s]

{'loss': 0.1303, 'grad_norm': 0.07045534998178482, 'learning_rate': 9.591767081826008e-06, 'epoch': 0.86}


 86%|████████▌ | 11660/13552 [1:07:25<10:16,  3.07it/s]

{'loss': 0.1481, 'grad_norm': 0.057116761803627014, 'learning_rate': 9.49279519229318e-06, 'epoch': 0.86}


 86%|████████▌ | 11670/13552 [1:07:29<11:23,  2.75it/s]

{'loss': 0.1407, 'grad_norm': 0.06682302057743073, 'learning_rate': 9.394311121774968e-06, 'epoch': 0.86}


 86%|████████▌ | 11680/13552 [1:07:32<10:23,  3.00it/s]

{'loss': 0.1247, 'grad_norm': 0.07762399315834045, 'learning_rate': 9.296315401084387e-06, 'epoch': 0.86}


 86%|████████▋ | 11690/13552 [1:07:36<12:00,  2.58it/s]

{'loss': 0.1539, 'grad_norm': 0.09926073253154755, 'learning_rate': 9.198808558402338e-06, 'epoch': 0.86}


 86%|████████▋ | 11700/13552 [1:07:40<11:17,  2.73it/s]

{'loss': 0.1294, 'grad_norm': 0.06696660071611404, 'learning_rate': 9.101791119274706e-06, 'epoch': 0.86}


 86%|████████▋ | 11710/13552 [1:07:43<11:35,  2.65it/s]

{'loss': 0.13, 'grad_norm': 0.04870827868580818, 'learning_rate': 9.005263606609615e-06, 'epoch': 0.86}


 86%|████████▋ | 11720/13552 [1:07:47<11:13,  2.72it/s]

{'loss': 0.1287, 'grad_norm': 0.05663891136646271, 'learning_rate': 8.909226540674565e-06, 'epoch': 0.86}


 87%|████████▋ | 11730/13552 [1:07:50<10:50,  2.80it/s]

{'loss': 0.1438, 'grad_norm': 0.08496791869401932, 'learning_rate': 8.813680439093596e-06, 'epoch': 0.87}


 87%|████████▋ | 11740/13552 [1:07:54<10:06,  2.99it/s]

{'loss': 0.1525, 'grad_norm': 0.11810541152954102, 'learning_rate': 8.718625816844573e-06, 'epoch': 0.87}


 87%|████████▋ | 11750/13552 [1:07:57<10:45,  2.79it/s]

{'loss': 0.1208, 'grad_norm': 0.05779301002621651, 'learning_rate': 8.624063186256326e-06, 'epoch': 0.87}


 87%|████████▋ | 11760/13552 [1:08:01<10:08,  2.94it/s]

{'loss': 0.1451, 'grad_norm': 0.09159209579229355, 'learning_rate': 8.52999305700597e-06, 'epoch': 0.87}


 87%|████████▋ | 11770/13552 [1:08:04<09:52,  3.01it/s]

{'loss': 0.1491, 'grad_norm': 0.05047442391514778, 'learning_rate': 8.436415936116104e-06, 'epoch': 0.87}


 87%|████████▋ | 11780/13552 [1:08:08<11:19,  2.61it/s]

{'loss': 0.1355, 'grad_norm': 0.060920313000679016, 'learning_rate': 8.34333232795207e-06, 'epoch': 0.87}


 87%|████████▋ | 11790/13552 [1:08:11<10:35,  2.77it/s]

{'loss': 0.1266, 'grad_norm': 0.09961855411529541, 'learning_rate': 8.250742734219274e-06, 'epoch': 0.87}


 87%|████████▋ | 11800/13552 [1:08:15<09:36,  3.04it/s]

{'loss': 0.1396, 'grad_norm': 0.06699538975954056, 'learning_rate': 8.158647653960461e-06, 'epoch': 0.87}


 87%|████████▋ | 11810/13552 [1:08:18<11:02,  2.63it/s]

{'loss': 0.1347, 'grad_norm': 0.06469594687223434, 'learning_rate': 8.067047583553055e-06, 'epoch': 0.87}


 87%|████████▋ | 11820/13552 [1:08:22<10:39,  2.71it/s]

{'loss': 0.1408, 'grad_norm': 0.08266059309244156, 'learning_rate': 7.975943016706433e-06, 'epoch': 0.87}


 87%|████████▋ | 11830/13552 [1:08:26<11:48,  2.43it/s]

{'loss': 0.1308, 'grad_norm': 0.05887000635266304, 'learning_rate': 7.885334444459269e-06, 'epoch': 0.87}


 87%|████████▋ | 11840/13552 [1:08:29<10:14,  2.79it/s]

{'loss': 0.1316, 'grad_norm': 0.06486888974905014, 'learning_rate': 7.795222355176967e-06, 'epoch': 0.87}


 87%|████████▋ | 11850/13552 [1:08:33<09:32,  2.97it/s]

{'loss': 0.1361, 'grad_norm': 0.09788288921117783, 'learning_rate': 7.705607234548905e-06, 'epoch': 0.87}


 88%|████████▊ | 11860/13552 [1:08:36<10:26,  2.70it/s]

{'loss': 0.1331, 'grad_norm': 0.06892049312591553, 'learning_rate': 7.6164895655859266e-06, 'epoch': 0.88}


 88%|████████▊ | 11870/13552 [1:08:40<09:37,  2.91it/s]

{'loss': 0.1547, 'grad_norm': 0.07424318045377731, 'learning_rate': 7.527869828617673e-06, 'epoch': 0.88}


 88%|████████▊ | 11880/13552 [1:08:43<09:15,  3.01it/s]

{'loss': 0.1332, 'grad_norm': 0.17981036007404327, 'learning_rate': 7.43974850129e-06, 'epoch': 0.88}


 88%|████████▊ | 11890/13552 [1:08:47<10:16,  2.70it/s]

{'loss': 0.1446, 'grad_norm': 0.15924228727817535, 'learning_rate': 7.352126058562458e-06, 'epoch': 0.88}


 88%|████████▊ | 11900/13552 [1:08:50<09:13,  2.98it/s]

{'loss': 0.1457, 'grad_norm': 0.100339375436306, 'learning_rate': 7.265002972705659e-06, 'epoch': 0.88}


 88%|████████▊ | 11910/13552 [1:08:54<09:17,  2.94it/s]

{'loss': 0.1314, 'grad_norm': 0.07401952892541885, 'learning_rate': 7.1783797132987555e-06, 'epoch': 0.88}


 88%|████████▊ | 11920/13552 [1:08:57<09:16,  2.93it/s]

{'loss': 0.1317, 'grad_norm': 0.27023330330848694, 'learning_rate': 7.0922567472269444e-06, 'epoch': 0.88}


 88%|████████▊ | 11930/13552 [1:09:00<08:42,  3.10it/s]

{'loss': 0.1427, 'grad_norm': 0.04958425089716911, 'learning_rate': 7.0066345386789135e-06, 'epoch': 0.88}


 88%|████████▊ | 11940/13552 [1:09:04<09:08,  2.94it/s]

{'loss': 0.1415, 'grad_norm': 0.0725439041852951, 'learning_rate': 6.921513549144365e-06, 'epoch': 0.88}


 88%|████████▊ | 11950/13552 [1:09:07<08:36,  3.10it/s]

{'loss': 0.1413, 'grad_norm': 0.09055238217115402, 'learning_rate': 6.836894237411451e-06, 'epoch': 0.88}


 88%|████████▊ | 11960/13552 [1:09:10<10:05,  2.63it/s]

{'loss': 0.1376, 'grad_norm': 0.12715573608875275, 'learning_rate': 6.75277705956443e-06, 'epoch': 0.88}


 88%|████████▊ | 11970/13552 [1:09:14<08:44,  3.02it/s]

{'loss': 0.1301, 'grad_norm': 0.06609433889389038, 'learning_rate': 6.669162468981105e-06, 'epoch': 0.88}


 88%|████████▊ | 11980/13552 [1:09:18<10:17,  2.55it/s]

{'loss': 0.1304, 'grad_norm': 0.08478561043739319, 'learning_rate': 6.586050916330455e-06, 'epoch': 0.88}


 88%|████████▊ | 11990/13552 [1:09:21<08:24,  3.10it/s]

{'loss': 0.1448, 'grad_norm': 0.11576110124588013, 'learning_rate': 6.50344284957013e-06, 'epoch': 0.88}


 89%|████████▊ | 12000/13552 [1:09:24<08:48,  2.94it/s]

{'loss': 0.1401, 'grad_norm': 0.09215259552001953, 'learning_rate': 6.421338713944081e-06, 'epoch': 0.89}


 89%|████████▊ | 12010/13552 [1:09:28<09:23,  2.74it/s]

{'loss': 0.1341, 'grad_norm': 0.06162475422024727, 'learning_rate': 6.339738951980156e-06, 'epoch': 0.89}


 89%|████████▊ | 12020/13552 [1:09:31<08:26,  3.02it/s]

{'loss': 0.1456, 'grad_norm': 0.07833490520715714, 'learning_rate': 6.258644003487701e-06, 'epoch': 0.89}


 89%|████████▉ | 12030/13552 [1:09:35<09:39,  2.62it/s]

{'loss': 0.1262, 'grad_norm': 0.1086261048913002, 'learning_rate': 6.178054305555237e-06, 'epoch': 0.89}


 89%|████████▉ | 12040/13552 [1:09:39<10:18,  2.44it/s]

{'loss': 0.1275, 'grad_norm': 0.0373837910592556, 'learning_rate': 6.097970292548016e-06, 'epoch': 0.89}


 89%|████████▉ | 12050/13552 [1:09:42<08:15,  3.03it/s]

{'loss': 0.1401, 'grad_norm': 0.0783766359090805, 'learning_rate': 6.018392396105754e-06, 'epoch': 0.89}


 89%|████████▉ | 12060/13552 [1:09:46<08:35,  2.90it/s]

{'loss': 0.1283, 'grad_norm': 0.06805671006441116, 'learning_rate': 5.939321045140289e-06, 'epoch': 0.89}


 89%|████████▉ | 12070/13552 [1:09:49<07:54,  3.12it/s]

{'loss': 0.13, 'grad_norm': 0.06236353889107704, 'learning_rate': 5.860756665833267e-06, 'epoch': 0.89}


 89%|████████▉ | 12080/13552 [1:09:53<09:56,  2.47it/s]

{'loss': 0.1246, 'grad_norm': 0.06539326906204224, 'learning_rate': 5.782699681633818e-06, 'epoch': 0.89}


 89%|████████▉ | 12090/13552 [1:09:56<09:01,  2.70it/s]

{'loss': 0.1365, 'grad_norm': 0.09008858352899551, 'learning_rate': 5.7051505132562965e-06, 'epoch': 0.89}


 89%|████████▉ | 12100/13552 [1:09:59<08:32,  2.83it/s]

{'loss': 0.1361, 'grad_norm': 0.19231846928596497, 'learning_rate': 5.628109578678031e-06, 'epoch': 0.89}


 89%|████████▉ | 12110/13552 [1:10:03<09:20,  2.57it/s]

{'loss': 0.127, 'grad_norm': 0.08598224818706512, 'learning_rate': 5.551577293137078e-06, 'epoch': 0.89}


 89%|████████▉ | 12120/13552 [1:10:07<09:26,  2.53it/s]

{'loss': 0.1468, 'grad_norm': 0.07674623280763626, 'learning_rate': 5.475554069129874e-06, 'epoch': 0.89}


 90%|████████▉ | 12130/13552 [1:10:10<07:59,  2.96it/s]

{'loss': 0.1232, 'grad_norm': 0.0810655802488327, 'learning_rate': 5.400040316409172e-06, 'epoch': 0.9}


 90%|████████▉ | 12140/13552 [1:10:13<07:53,  2.98it/s]

{'loss': 0.1285, 'grad_norm': 0.0918925404548645, 'learning_rate': 5.325036441981723e-06, 'epoch': 0.9}


 90%|████████▉ | 12150/13552 [1:10:17<07:34,  3.09it/s]

{'loss': 0.1408, 'grad_norm': 0.09343647211790085, 'learning_rate': 5.250542850106155e-06, 'epoch': 0.9}


 90%|████████▉ | 12160/13552 [1:10:20<08:00,  2.90it/s]

{'loss': 0.1406, 'grad_norm': 0.09121951460838318, 'learning_rate': 5.176559942290704e-06, 'epoch': 0.9}


 90%|████████▉ | 12170/13552 [1:10:24<07:29,  3.07it/s]

{'loss': 0.1387, 'grad_norm': 0.07491245120763779, 'learning_rate': 5.103088117291144e-06, 'epoch': 0.9}


 90%|████████▉ | 12180/13552 [1:10:27<07:30,  3.04it/s]

{'loss': 0.1284, 'grad_norm': 0.07309029996395111, 'learning_rate': 5.030127771108572e-06, 'epoch': 0.9}


 90%|████████▉ | 12190/13552 [1:10:30<07:36,  2.98it/s]

{'loss': 0.1425, 'grad_norm': 0.12990282475948334, 'learning_rate': 4.957679296987316e-06, 'epoch': 0.9}


 90%|█████████ | 12200/13552 [1:10:34<07:22,  3.05it/s]

{'loss': 0.1426, 'grad_norm': 0.0814104750752449, 'learning_rate': 4.885743085412786e-06, 'epoch': 0.9}


 90%|█████████ | 12210/13552 [1:10:37<07:48,  2.86it/s]

{'loss': 0.1183, 'grad_norm': 0.04834199324250221, 'learning_rate': 4.814319524109379e-06, 'epoch': 0.9}


 90%|█████████ | 12220/13552 [1:10:41<08:00,  2.77it/s]

{'loss': 0.1215, 'grad_norm': 0.07666587829589844, 'learning_rate': 4.743408998038412e-06, 'epoch': 0.9}


 90%|█████████ | 12230/13552 [1:10:44<07:13,  3.05it/s]

{'loss': 0.1614, 'grad_norm': 0.06205367296934128, 'learning_rate': 4.673011889395995e-06, 'epoch': 0.9}


 90%|█████████ | 12240/13552 [1:10:48<07:24,  2.95it/s]

{'loss': 0.1322, 'grad_norm': 0.06325896084308624, 'learning_rate': 4.6031285776110155e-06, 'epoch': 0.9}


 90%|█████████ | 12250/13552 [1:10:51<07:14,  3.00it/s]

{'loss': 0.1303, 'grad_norm': 0.05938836559653282, 'learning_rate': 4.5337594393431e-06, 'epoch': 0.9}


 90%|█████████ | 12260/13552 [1:10:55<07:23,  2.91it/s]

{'loss': 0.1555, 'grad_norm': 0.055539458990097046, 'learning_rate': 4.464904848480523e-06, 'epoch': 0.9}


 91%|█████████ | 12270/13552 [1:10:58<06:57,  3.07it/s]

{'loss': 0.1535, 'grad_norm': 0.06105445325374603, 'learning_rate': 4.396565176138267e-06, 'epoch': 0.91}


 91%|█████████ | 12280/13552 [1:11:01<07:05,  2.99it/s]

{'loss': 0.1348, 'grad_norm': 0.12674923241138458, 'learning_rate': 4.328740790655983e-06, 'epoch': 0.91}


 91%|█████████ | 12290/13552 [1:11:05<06:53,  3.05it/s]

{'loss': 0.1366, 'grad_norm': 0.06466344743967056, 'learning_rate': 4.261432057595982e-06, 'epoch': 0.91}


 91%|█████████ | 12300/13552 [1:11:08<06:58,  2.99it/s]

{'loss': 0.1348, 'grad_norm': 0.06669931858778, 'learning_rate': 4.194639339741335e-06, 'epoch': 0.91}


 91%|█████████ | 12310/13552 [1:11:12<07:21,  2.81it/s]

{'loss': 0.1518, 'grad_norm': 0.07592431455850601, 'learning_rate': 4.128362997093837e-06, 'epoch': 0.91}


 91%|█████████ | 12320/13552 [1:11:15<07:01,  2.93it/s]

{'loss': 0.1365, 'grad_norm': 0.0616828091442585, 'learning_rate': 4.062603386872121e-06, 'epoch': 0.91}


 91%|█████████ | 12330/13552 [1:11:19<07:02,  2.89it/s]

{'loss': 0.124, 'grad_norm': 0.08584518730640411, 'learning_rate': 3.997360863509747e-06, 'epoch': 0.91}


 91%|█████████ | 12340/13552 [1:11:22<06:50,  2.96it/s]

{'loss': 0.144, 'grad_norm': 0.10084713250398636, 'learning_rate': 3.932635778653215e-06, 'epoch': 0.91}


 91%|█████████ | 12350/13552 [1:11:25<07:13,  2.77it/s]

{'loss': 0.1328, 'grad_norm': 0.17751401662826538, 'learning_rate': 3.868428481160146e-06, 'epoch': 0.91}


 91%|█████████ | 12360/13552 [1:11:29<08:12,  2.42it/s]

{'loss': 0.124, 'grad_norm': 0.05796825885772705, 'learning_rate': 3.804739317097339e-06, 'epoch': 0.91}


 91%|█████████▏| 12370/13552 [1:11:33<06:31,  3.02it/s]

{'loss': 0.1296, 'grad_norm': 0.0751650407910347, 'learning_rate': 3.7415686297389896e-06, 'epoch': 0.91}


 91%|█████████▏| 12380/13552 [1:11:36<06:19,  3.09it/s]

{'loss': 0.1337, 'grad_norm': 0.04604875296354294, 'learning_rate': 3.678916759564732e-06, 'epoch': 0.91}


 91%|█████████▏| 12390/13552 [1:11:39<06:50,  2.83it/s]

{'loss': 0.1275, 'grad_norm': 0.07138258218765259, 'learning_rate': 3.6167840442578927e-06, 'epoch': 0.91}


 91%|█████████▏| 12400/13552 [1:11:43<06:59,  2.75it/s]

{'loss': 0.1423, 'grad_norm': 0.07892174273729324, 'learning_rate': 3.5551708187036346e-06, 'epoch': 0.91}


 92%|█████████▏| 12410/13552 [1:11:46<06:22,  2.99it/s]

{'loss': 0.1382, 'grad_norm': 0.08927877247333527, 'learning_rate': 3.4940774149871734e-06, 'epoch': 0.92}


 92%|█████████▏| 12420/13552 [1:11:50<06:20,  2.98it/s]

{'loss': 0.1547, 'grad_norm': 0.05982896685600281, 'learning_rate': 3.4335041623919208e-06, 'epoch': 0.92}


 92%|█████████▏| 12430/13552 [1:11:53<06:07,  3.05it/s]

{'loss': 0.1469, 'grad_norm': 0.08703389018774033, 'learning_rate': 3.373451387397819e-06, 'epoch': 0.92}


 92%|█████████▏| 12440/13552 [1:11:56<06:23,  2.90it/s]

{'loss': 0.1363, 'grad_norm': 0.0844854786992073, 'learning_rate': 3.313919413679478e-06, 'epoch': 0.92}


 92%|█████████▏| 12450/13552 [1:12:00<06:24,  2.87it/s]

{'loss': 0.1413, 'grad_norm': 0.06952786445617676, 'learning_rate': 3.254908562104497e-06, 'epoch': 0.92}


 92%|█████████▏| 12460/13552 [1:12:03<05:55,  3.08it/s]

{'loss': 0.1545, 'grad_norm': 0.27663594484329224, 'learning_rate': 3.196419150731689e-06, 'epoch': 0.92}


 92%|█████████▏| 12470/13552 [1:12:07<05:55,  3.04it/s]

{'loss': 0.1493, 'grad_norm': 0.06989826261997223, 'learning_rate': 3.1384514948094244e-06, 'epoch': 0.92}


 92%|█████████▏| 12480/13552 [1:12:10<06:05,  2.93it/s]

{'loss': 0.1386, 'grad_norm': 0.08335646241903305, 'learning_rate': 3.0810059067738593e-06, 'epoch': 0.92}


 92%|█████████▏| 12490/13552 [1:12:14<06:02,  2.93it/s]

{'loss': 0.1501, 'grad_norm': 0.09817007929086685, 'learning_rate': 3.0240826962473102e-06, 'epoch': 0.92}


 92%|█████████▏| 12500/13552 [1:12:17<06:11,  2.83it/s]

{'loss': 0.1399, 'grad_norm': 0.08769387751817703, 'learning_rate': 2.9676821700365786e-06, 'epoch': 0.92}


 92%|█████████▏| 12510/13552 [1:12:20<05:47,  2.99it/s]

{'loss': 0.1288, 'grad_norm': 0.05450161173939705, 'learning_rate': 2.911804632131243e-06, 'epoch': 0.92}


 92%|█████████▏| 12520/13552 [1:12:24<07:17,  2.36it/s]

{'loss': 0.1424, 'grad_norm': 0.09484051167964935, 'learning_rate': 2.856450383702092e-06, 'epoch': 0.92}


 92%|█████████▏| 12530/13552 [1:12:27<05:34,  3.05it/s]

{'loss': 0.1465, 'grad_norm': 0.08869083225727081, 'learning_rate': 2.801619723099469e-06, 'epoch': 0.92}


 93%|█████████▎| 12540/13552 [1:12:31<06:29,  2.60it/s]

{'loss': 0.1199, 'grad_norm': 0.05823611095547676, 'learning_rate': 2.7473129458516546e-06, 'epoch': 0.93}


 93%|█████████▎| 12550/13552 [1:12:35<05:38,  2.96it/s]

{'loss': 0.1213, 'grad_norm': 0.07432173937559128, 'learning_rate': 2.693530344663275e-06, 'epoch': 0.93}


 93%|█████████▎| 12560/13552 [1:12:38<06:18,  2.62it/s]

{'loss': 0.1341, 'grad_norm': 0.05061359703540802, 'learning_rate': 2.640272209413752e-06, 'epoch': 0.93}


 93%|█████████▎| 12570/13552 [1:12:42<05:25,  3.02it/s]

{'loss': 0.1437, 'grad_norm': 0.12921030819416046, 'learning_rate': 2.587538827155722e-06, 'epoch': 0.93}


 93%|█████████▎| 12580/13552 [1:12:45<05:29,  2.95it/s]

{'loss': 0.1293, 'grad_norm': 0.08710204809904099, 'learning_rate': 2.5353304821134625e-06, 'epoch': 0.93}


 93%|█████████▎| 12590/13552 [1:12:49<05:44,  2.79it/s]

{'loss': 0.158, 'grad_norm': 0.09671390801668167, 'learning_rate': 2.483647455681415e-06, 'epoch': 0.93}


 93%|█████████▎| 12600/13552 [1:12:52<05:08,  3.08it/s]

{'loss': 0.1379, 'grad_norm': 0.09149099886417389, 'learning_rate': 2.4324900264226403e-06, 'epoch': 0.93}


 93%|█████████▎| 12610/13552 [1:12:56<06:01,  2.60it/s]

{'loss': 0.1336, 'grad_norm': 0.08096057176589966, 'learning_rate': 2.3818584700672886e-06, 'epoch': 0.93}


 93%|█████████▎| 12620/13552 [1:12:59<05:03,  3.07it/s]

{'loss': 0.1562, 'grad_norm': 0.08917746692895889, 'learning_rate': 2.3317530595111657e-06, 'epoch': 0.93}


 93%|█████████▎| 12630/13552 [1:13:02<05:02,  3.05it/s]

{'loss': 0.1473, 'grad_norm': 0.07779355347156525, 'learning_rate': 2.2821740648142463e-06, 'epoch': 0.93}


 93%|█████████▎| 12640/13552 [1:13:06<04:55,  3.09it/s]

{'loss': 0.1494, 'grad_norm': 0.07780183106660843, 'learning_rate': 2.233121753199163e-06, 'epoch': 0.93}


 93%|█████████▎| 12650/13552 [1:13:09<05:19,  2.82it/s]

{'loss': 0.1396, 'grad_norm': 0.07799555361270905, 'learning_rate': 2.184596389049853e-06, 'epoch': 0.93}


 93%|█████████▎| 12660/13552 [1:13:12<04:50,  3.07it/s]

{'loss': 0.1533, 'grad_norm': 0.1064109355211258, 'learning_rate': 2.1365982339101033e-06, 'epoch': 0.93}


 93%|█████████▎| 12670/13552 [1:13:16<04:48,  3.05it/s]

{'loss': 0.1347, 'grad_norm': 0.08573230355978012, 'learning_rate': 2.0891275464820725e-06, 'epoch': 0.93}


 94%|█████████▎| 12680/13552 [1:13:19<04:45,  3.06it/s]

{'loss': 0.1324, 'grad_norm': 0.057425934821367264, 'learning_rate': 2.0421845826250395e-06, 'epoch': 0.94}


 94%|█████████▎| 12690/13552 [1:13:22<04:42,  3.05it/s]

{'loss': 0.1405, 'grad_norm': 0.06814515590667725, 'learning_rate': 1.995769595353869e-06, 'epoch': 0.94}


 94%|█████████▎| 12700/13552 [1:13:26<04:40,  3.04it/s]

{'loss': 0.1338, 'grad_norm': 0.08549433201551437, 'learning_rate': 1.949882834837735e-06, 'epoch': 0.94}


 94%|█████████▍| 12710/13552 [1:13:29<04:43,  2.97it/s]

{'loss': 0.1288, 'grad_norm': 0.16005603969097137, 'learning_rate': 1.9045245483987895e-06, 'epoch': 0.94}


 94%|█████████▍| 12720/13552 [1:13:32<04:33,  3.04it/s]

{'loss': 0.1464, 'grad_norm': 0.08047695457935333, 'learning_rate': 1.859694980510751e-06, 'epoch': 0.94}


 94%|█████████▍| 12730/13552 [1:13:36<04:25,  3.09it/s]

{'loss': 0.1432, 'grad_norm': 0.22523608803749084, 'learning_rate': 1.8153943727976407e-06, 'epoch': 0.94}


 94%|█████████▍| 12740/13552 [1:13:39<04:28,  3.03it/s]

{'loss': 0.1427, 'grad_norm': 0.09316756576299667, 'learning_rate': 1.7716229640324932e-06, 'epoch': 0.94}


 94%|█████████▍| 12750/13552 [1:13:42<04:19,  3.09it/s]

{'loss': 0.1505, 'grad_norm': 0.056424580514431, 'learning_rate': 1.728380990136047e-06, 'epoch': 0.94}


 94%|█████████▍| 12760/13552 [1:13:46<04:30,  2.93it/s]

{'loss': 0.1357, 'grad_norm': 0.06289247423410416, 'learning_rate': 1.6856686841754344e-06, 'epoch': 0.94}


 94%|█████████▍| 12770/13552 [1:13:49<04:29,  2.90it/s]

{'loss': 0.1322, 'grad_norm': 0.05889750272035599, 'learning_rate': 1.6434862763630155e-06, 'epoch': 0.94}


 94%|█████████▍| 12780/13552 [1:13:53<04:42,  2.74it/s]

{'loss': 0.1423, 'grad_norm': 0.06688394397497177, 'learning_rate': 1.6018339940550465e-06, 'epoch': 0.94}


 94%|█████████▍| 12790/13552 [1:13:57<05:13,  2.43it/s]

{'loss': 0.144, 'grad_norm': 0.06359022855758667, 'learning_rate': 1.5607120617505133e-06, 'epoch': 0.94}


 94%|█████████▍| 12800/13552 [1:14:00<04:04,  3.08it/s]

{'loss': 0.1458, 'grad_norm': 0.09322386234998703, 'learning_rate': 1.5201207010899e-06, 'epoch': 0.94}


 95%|█████████▍| 12810/13552 [1:14:04<04:27,  2.78it/s]

{'loss': 0.1298, 'grad_norm': 0.10375168174505234, 'learning_rate': 1.4800601308539886e-06, 'epoch': 0.95}


 95%|█████████▍| 12820/13552 [1:14:07<03:58,  3.07it/s]

{'loss': 0.1441, 'grad_norm': 0.06156586483120918, 'learning_rate': 1.4405305669626946e-06, 'epoch': 0.95}


 95%|█████████▍| 12830/13552 [1:14:10<03:55,  3.07it/s]

{'loss': 0.141, 'grad_norm': 0.0682382583618164, 'learning_rate': 1.4015322224739002e-06, 'epoch': 0.95}


 95%|█████████▍| 12840/13552 [1:14:14<03:55,  3.03it/s]

{'loss': 0.142, 'grad_norm': 0.11126656085252762, 'learning_rate': 1.3630653075823008e-06, 'epoch': 0.95}


 95%|█████████▍| 12850/13552 [1:14:17<04:10,  2.81it/s]

{'loss': 0.1477, 'grad_norm': 0.07328414171934128, 'learning_rate': 1.32513002961826e-06, 'epoch': 0.95}


 95%|█████████▍| 12860/13552 [1:14:21<04:09,  2.77it/s]

{'loss': 0.1331, 'grad_norm': 0.048647671937942505, 'learning_rate': 1.287726593046712e-06, 'epoch': 0.95}


 95%|█████████▍| 12870/13552 [1:14:24<03:43,  3.05it/s]

{'loss': 0.1298, 'grad_norm': 0.06143331527709961, 'learning_rate': 1.2508551994660612e-06, 'epoch': 0.95}


 95%|█████████▌| 12880/13552 [1:14:28<03:55,  2.85it/s]

{'loss': 0.1445, 'grad_norm': 0.05502637103199959, 'learning_rate': 1.2145160476070838e-06, 'epoch': 0.95}


 95%|█████████▌| 12890/13552 [1:14:31<03:34,  3.08it/s]

{'loss': 0.1363, 'grad_norm': 0.0898745134472847, 'learning_rate': 1.1787093333318511e-06, 'epoch': 0.95}


 95%|█████████▌| 12900/13552 [1:14:34<03:31,  3.08it/s]

{'loss': 0.1537, 'grad_norm': 0.07460483908653259, 'learning_rate': 1.1434352496327071e-06, 'epoch': 0.95}


 95%|█████████▌| 12910/13552 [1:14:37<03:26,  3.10it/s]

{'loss': 0.1377, 'grad_norm': 0.07436329126358032, 'learning_rate': 1.1086939866311597e-06, 'epoch': 0.95}


 95%|█████████▌| 12920/13552 [1:14:41<04:16,  2.47it/s]

{'loss': 0.1437, 'grad_norm': 0.10121641308069229, 'learning_rate': 1.0744857315769574e-06, 'epoch': 0.95}


 95%|█████████▌| 12930/13552 [1:14:45<04:07,  2.51it/s]

{'loss': 0.1297, 'grad_norm': 0.1240437924861908, 'learning_rate': 1.0408106688469698e-06, 'epoch': 0.95}


 95%|█████████▌| 12940/13552 [1:14:49<03:19,  3.06it/s]

{'loss': 0.1546, 'grad_norm': 0.06902045011520386, 'learning_rate': 1.0076689799442873e-06, 'epoch': 0.95}


 96%|█████████▌| 12950/13552 [1:14:52<03:20,  3.01it/s]

{'loss': 0.1357, 'grad_norm': 0.06720224022865295, 'learning_rate': 9.750608434971776e-07, 'epoch': 0.96}


 96%|█████████▌| 12960/13552 [1:14:56<03:14,  3.04it/s]

{'loss': 0.1315, 'grad_norm': 0.06385710090398788, 'learning_rate': 9.42986435258153e-07, 'epoch': 0.96}


 96%|█████████▌| 12970/13552 [1:14:59<03:08,  3.08it/s]

{'loss': 0.143, 'grad_norm': 0.09869956225156784, 'learning_rate': 9.114459281030496e-07, 'epoch': 0.96}


 96%|█████████▌| 12980/13552 [1:15:02<03:06,  3.07it/s]

{'loss': 0.1592, 'grad_norm': 0.4147297739982605, 'learning_rate': 8.804394920300052e-07, 'epoch': 0.96}


 96%|█████████▌| 12990/13552 [1:15:05<03:01,  3.09it/s]

{'loss': 0.1483, 'grad_norm': 0.09320630133152008, 'learning_rate': 8.499672941586379e-07, 'epoch': 0.96}


 96%|█████████▌| 13000/13552 [1:15:09<03:02,  3.03it/s]

{'loss': 0.1447, 'grad_norm': 0.06681425869464874, 'learning_rate': 8.200294987291135e-07, 'epoch': 0.96}


 96%|█████████▌| 13010/13552 [1:15:12<03:07,  2.89it/s]

{'loss': 0.1375, 'grad_norm': 0.10875263065099716, 'learning_rate': 7.906262671012466e-07, 'epoch': 0.96}


 96%|█████████▌| 13020/13552 [1:15:16<02:59,  2.96it/s]

{'loss': 0.1396, 'grad_norm': 0.08883153647184372, 'learning_rate': 7.61757757753645e-07, 'epoch': 0.96}


 96%|█████████▌| 13030/13552 [1:15:19<02:50,  3.06it/s]

{'loss': 0.1312, 'grad_norm': 0.07101918756961823, 'learning_rate': 7.334241262828334e-07, 'epoch': 0.96}


 96%|█████████▌| 13040/13552 [1:15:22<02:45,  3.10it/s]

{'loss': 0.1446, 'grad_norm': 0.09494579583406448, 'learning_rate': 7.056255254024647e-07, 'epoch': 0.96}


 96%|█████████▋| 13050/13552 [1:15:26<02:56,  2.84it/s]

{'loss': 0.1315, 'grad_norm': 0.07920383661985397, 'learning_rate': 6.783621049424316e-07, 'epoch': 0.96}


 96%|█████████▋| 13060/13552 [1:15:29<02:54,  2.81it/s]

{'loss': 0.1195, 'grad_norm': 0.07446640729904175, 'learning_rate': 6.516340118481123e-07, 'epoch': 0.96}


 96%|█████████▋| 13070/13552 [1:15:33<02:38,  3.04it/s]

{'loss': 0.1379, 'grad_norm': 0.09033800661563873, 'learning_rate': 6.254413901795486e-07, 'epoch': 0.96}


 97%|█████████▋| 13080/13552 [1:15:36<02:37,  3.00it/s]

{'loss': 0.1515, 'grad_norm': 0.07271627336740494, 'learning_rate': 5.997843811106906e-07, 'epoch': 0.97}


 97%|█████████▋| 13090/13552 [1:15:39<02:30,  3.07it/s]

{'loss': 0.1441, 'grad_norm': 0.07288219779729843, 'learning_rate': 5.746631229286092e-07, 'epoch': 0.97}


 97%|█████████▋| 13100/13552 [1:15:43<02:30,  3.00it/s]

{'loss': 0.1396, 'grad_norm': 0.0812278613448143, 'learning_rate': 5.500777510327626e-07, 'epoch': 0.97}


 97%|█████████▋| 13110/13552 [1:15:46<02:37,  2.81it/s]

{'loss': 0.1416, 'grad_norm': 0.12905164062976837, 'learning_rate': 5.260283979343084e-07, 'epoch': 0.97}


 97%|█████████▋| 13120/13552 [1:15:50<02:22,  3.02it/s]

{'loss': 0.1286, 'grad_norm': 0.07649575173854828, 'learning_rate': 5.02515193255293e-07, 'epoch': 0.97}


 97%|█████████▋| 13130/13552 [1:15:53<02:18,  3.04it/s]

{'loss': 0.1464, 'grad_norm': 0.06544031947851181, 'learning_rate': 4.795382637280522e-07, 'epoch': 0.97}


 97%|█████████▋| 13140/13552 [1:15:56<02:14,  3.06it/s]

{'loss': 0.1296, 'grad_norm': 0.0542980320751667, 'learning_rate': 4.5709773319447815e-07, 'epoch': 0.97}


 97%|█████████▋| 13150/13552 [1:16:00<02:13,  3.02it/s]

{'loss': 0.1293, 'grad_norm': 0.10648532211780548, 'learning_rate': 4.351937226053537e-07, 'epoch': 0.97}


 97%|█████████▋| 13160/13552 [1:16:03<02:24,  2.72it/s]

{'loss': 0.1288, 'grad_norm': 0.07149568200111389, 'learning_rate': 4.1382635001970817e-07, 'epoch': 0.97}


 97%|█████████▋| 13170/13552 [1:16:07<02:19,  2.73it/s]

{'loss': 0.1337, 'grad_norm': 0.06457556784152985, 'learning_rate': 3.9299573060417315e-07, 'epoch': 0.97}


 97%|█████████▋| 13180/13552 [1:16:10<02:21,  2.62it/s]

{'loss': 0.1438, 'grad_norm': 0.06495822221040726, 'learning_rate': 3.727019766323836e-07, 'epoch': 0.97}


 97%|█████████▋| 13190/13552 [1:16:13<01:58,  3.07it/s]

{'loss': 0.1401, 'grad_norm': 0.18425507843494415, 'learning_rate': 3.529451974843556e-07, 'epoch': 0.97}


 97%|█████████▋| 13200/13552 [1:16:17<01:54,  3.07it/s]

{'loss': 0.1247, 'grad_norm': 0.055504318326711655, 'learning_rate': 3.337254996458872e-07, 'epoch': 0.97}


 97%|█████████▋| 13210/13552 [1:16:20<01:57,  2.90it/s]

{'loss': 0.1348, 'grad_norm': 0.24244339764118195, 'learning_rate': 3.1504298670800294e-07, 'epoch': 0.97}


 98%|█████████▊| 13220/13552 [1:16:23<01:49,  3.03it/s]

{'loss': 0.1406, 'grad_norm': 0.06898362934589386, 'learning_rate': 2.9689775936639907e-07, 'epoch': 0.98}


 98%|█████████▊| 13230/13552 [1:16:27<01:45,  3.04it/s]

{'loss': 0.1484, 'grad_norm': 0.0653277039527893, 'learning_rate': 2.79289915420855e-07, 'epoch': 0.98}


 98%|█████████▊| 13240/13552 [1:16:30<02:04,  2.51it/s]

{'loss': 0.146, 'grad_norm': 0.03338482975959778, 'learning_rate': 2.622195497747892e-07, 'epoch': 0.98}


 98%|█████████▊| 13250/13552 [1:16:34<01:44,  2.90it/s]

{'loss': 0.1372, 'grad_norm': 0.10777589678764343, 'learning_rate': 2.4568675443467084e-07, 'epoch': 0.98}


 98%|█████████▊| 13260/13552 [1:16:37<01:35,  3.06it/s]

{'loss': 0.1443, 'grad_norm': 0.1053692102432251, 'learning_rate': 2.2969161850956433e-07, 'epoch': 0.98}


 98%|█████████▊| 13270/13552 [1:16:41<01:51,  2.54it/s]

{'loss': 0.127, 'grad_norm': 0.09048253297805786, 'learning_rate': 2.1423422821063022e-07, 'epoch': 0.98}


 98%|█████████▊| 13280/13552 [1:16:45<01:49,  2.47it/s]

{'loss': 0.1366, 'grad_norm': 0.05835472792387009, 'learning_rate': 1.9931466685065847e-07, 'epoch': 0.98}


 98%|█████████▊| 13290/13552 [1:16:49<01:24,  3.11it/s]

{'loss': 0.142, 'grad_norm': 0.0878349244594574, 'learning_rate': 1.8493301484366899e-07, 'epoch': 0.98}


 98%|█████████▊| 13300/13552 [1:16:52<01:20,  3.14it/s]

{'loss': 0.1385, 'grad_norm': 0.057317104190588, 'learning_rate': 1.7108934970437862e-07, 'epoch': 0.98}


 98%|█████████▊| 13310/13552 [1:16:55<01:16,  3.16it/s]

{'loss': 0.1423, 'grad_norm': 0.08721143752336502, 'learning_rate': 1.5778374604791256e-07, 'epoch': 0.98}


 98%|█████████▊| 13320/13552 [1:16:58<01:13,  3.14it/s]

{'loss': 0.1269, 'grad_norm': 0.07113158702850342, 'learning_rate': 1.4501627558926033e-07, 'epoch': 0.98}


 98%|█████████▊| 13330/13552 [1:17:02<01:10,  3.13it/s]

{'loss': 0.1453, 'grad_norm': 0.09305892139673233, 'learning_rate': 1.3278700714302038e-07, 'epoch': 0.98}


 98%|█████████▊| 13340/13552 [1:17:05<01:21,  2.60it/s]

{'loss': 0.136, 'grad_norm': 0.06920744478702545, 'learning_rate': 1.2109600662293385e-07, 'epoch': 0.98}


 99%|█████████▊| 13350/13552 [1:17:09<01:06,  3.06it/s]

{'loss': 0.1307, 'grad_norm': 0.08614525943994522, 'learning_rate': 1.0994333704158478e-07, 'epoch': 0.99}


 99%|█████████▊| 13360/13552 [1:17:12<01:01,  3.13it/s]

{'loss': 0.1368, 'grad_norm': 0.06430500000715256, 'learning_rate': 9.932905851004482e-08, 'epoch': 0.99}


 99%|█████████▊| 13370/13552 [1:17:15<01:08,  2.66it/s]

{'loss': 0.1398, 'grad_norm': 0.08497905731201172, 'learning_rate': 8.925322823751802e-08, 'epoch': 0.99}


 99%|█████████▊| 13380/13552 [1:17:20<00:57,  3.00it/s]

{'loss': 0.1228, 'grad_norm': 0.07169566303491592, 'learning_rate': 7.971590053109657e-08, 'epoch': 0.99}


 99%|█████████▉| 13390/13552 [1:17:23<00:51,  3.17it/s]

{'loss': 0.1432, 'grad_norm': 0.07675659656524658, 'learning_rate': 7.071712679541654e-08, 'epoch': 0.99}


 99%|█████████▉| 13400/13552 [1:17:26<00:55,  2.74it/s]

{'loss': 0.1366, 'grad_norm': 0.0593290776014328, 'learning_rate': 6.225695553238043e-08, 'epoch': 0.99}


 99%|█████████▉| 13410/13552 [1:17:30<00:53,  2.66it/s]

{'loss': 0.1396, 'grad_norm': 0.07569713145494461, 'learning_rate': 5.433543234093508e-08, 'epoch': 0.99}


 99%|█████████▉| 13420/13552 [1:17:33<00:42,  3.11it/s]

{'loss': 0.143, 'grad_norm': 0.11148075014352798, 'learning_rate': 4.6952599916783025e-08, 'epoch': 0.99}


 99%|█████████▉| 13430/13552 [1:17:36<00:38,  3.16it/s]

{'loss': 0.1389, 'grad_norm': 0.08717450499534607, 'learning_rate': 4.010849805220485e-08, 'epoch': 0.99}


 99%|█████████▉| 13440/13552 [1:17:39<00:35,  3.20it/s]

{'loss': 0.1363, 'grad_norm': 0.06693167239427567, 'learning_rate': 3.380316363577052e-08, 'epoch': 0.99}


 99%|█████████▉| 13450/13552 [1:17:42<00:32,  3.14it/s]

{'loss': 0.1426, 'grad_norm': 0.1223253533244133, 'learning_rate': 2.8036630652206187e-08, 'epoch': 0.99}


 99%|█████████▉| 13460/13552 [1:17:46<00:32,  2.84it/s]

{'loss': 0.1301, 'grad_norm': 0.0639023631811142, 'learning_rate': 2.2808930182172117e-08, 'epoch': 0.99}


 99%|█████████▉| 13470/13552 [1:17:49<00:26,  3.15it/s]

{'loss': 0.1407, 'grad_norm': 0.07165771722793579, 'learning_rate': 1.8120090402129476e-08, 'epoch': 0.99}


 99%|█████████▉| 13480/13552 [1:17:53<00:24,  2.96it/s]

{'loss': 0.1232, 'grad_norm': 0.06655575335025787, 'learning_rate': 1.3970136584151583e-08, 'epoch': 0.99}


100%|█████████▉| 13490/13552 [1:17:56<00:20,  3.03it/s]

{'loss': 0.1407, 'grad_norm': 0.09361343830823898, 'learning_rate': 1.03590910958129e-08, 'epoch': 1.0}


100%|█████████▉| 13500/13552 [1:18:00<00:17,  2.95it/s]

{'loss': 0.1315, 'grad_norm': 0.08271950483322144, 'learning_rate': 7.286973400055797e-09, 'epoch': 1.0}


100%|█████████▉| 13510/13552 [1:18:03<00:13,  3.12it/s]

{'loss': 0.1385, 'grad_norm': 0.08128482103347778, 'learning_rate': 4.753800055090629e-09, 'epoch': 1.0}


100%|█████████▉| 13520/13552 [1:18:06<00:10,  3.07it/s]

{'loss': 0.1386, 'grad_norm': 0.06370788812637329, 'learning_rate': 2.7595847143069286e-09, 'epoch': 1.0}


100%|█████████▉| 13530/13552 [1:18:09<00:07,  3.06it/s]

{'loss': 0.1325, 'grad_norm': 0.0795731320977211, 'learning_rate': 1.304338126195681e-09, 'epoch': 1.0}


100%|█████████▉| 13540/13552 [1:18:13<00:03,  3.10it/s]

{'loss': 0.1491, 'grad_norm': 0.08235369622707367, 'learning_rate': 3.880681343049197e-10, 'epoch': 1.0}


100%|█████████▉| 13550/13552 [1:18:16<00:00,  3.07it/s]

{'loss': 0.1328, 'grad_norm': 0.07617275416851044, 'learning_rate': 1.077967717311168e-11, 'epoch': 1.0}


100%|██████████| 13552/13552 [1:18:20<00:00,  2.88it/s]

{'train_runtime': 4700.5594, 'train_samples_per_second': 5.766, 'train_steps_per_second': 2.883, 'train_loss': 0.17071948282271096, 'epoch': 1.0}





TrainOutput(global_step=13552, training_loss=0.17071948282271096, metrics={'train_runtime': 4700.5594, 'train_samples_per_second': 5.766, 'train_steps_per_second': 2.883, 'total_flos': 8.403717176027136e+16, 'train_loss': 0.17071948282271096, 'epoch': 1.0})

In [27]:
trainer.save_model()

In [28]:

del trainer
del model
del tokenizer

gc.collect()
torch.cuda.empty_cache()

In [5]:
model = AutoPeftModelForCausalLM.from_pretrained(
  'i_speak_cypher_3b',
  device_map="auto",  # use auto for inference
  torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained('i_speak_cypher_3b')
#model.resize_token_embeddings(len(tokenizer))

# Text generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
     

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.26it/s]
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [6]:
test_dataset = load_dataset("json", data_files=data_path+"test_dataset.json", split="train")

In [12]:
prompt = pipe.tokenizer.apply_chat_template(test_dataset[102]["messages"][:2],
                                            tokenize=False,
                                            add_generation_prompt=True)
print(prompt)

<|im_start|>system

You are a text to Cypher query translator. Convert the following question into a Cypher query using the provided graph schema!
Graph schema: Relevant node labels and their properties (with datatypes) are:
Article {article_id: INTEGER}
DOI {}

Relevant relationships are:
{'start': Article, 'type': HAS_DOI, 'end': DOI }
<|im_end|><|endoftext|>
<|im_start|>user
Search for the article_id values from 20 Article that are linked to DOI via HAS_DOI and return article_id along with the respective DOI counts!<|im_end|><|endoftext|>
<|im_start|>assistant



In [13]:
outputs = pipe(prompt,
              max_new_tokens=256,
              do_sample=False,
              temperature=0.1,
              top_k=50,
              top_p=0.1
              )

print(f"Question: {test_dataset[102]['messages'][1]['content']}")
print(f"Correct Cypher: {test_dataset[102]['messages'][2]['content']}")
print(f"Generated Cypher: {outputs[0]['generated_text'][len(prompt):-10].strip()}")



Question: Search for the article_id values from 20 Article that are linked to DOI via HAS_DOI and return article_id along with the respective DOI counts!
Correct Cypher: MATCH (n:Article) -[:HAS_DOI]->(m:DOI) WITH DISTINCT n, m RETURN n.article_id AS article_id, count(m) AS count LIMIT 20
Generated Cypher: MATCH (n:Article) -[:HAS_DOI]->(m:DOI) WITH DISTINCT n, m RETURN n.article_id AS article_id, count(m) AS count LIMIT 20<


In [10]:
#@title Test on a Subset of Samples
from tqdm import tqdm

# Compare the generated text with provided Cypher statement

def evaluate(sample):
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2],
                                                tokenize=False,
                                                add_generation_prompt=True)
    outputs = pipe(prompt,
                   max_new_tokens=256,
                   do_sample=True,
                   temperature=0.7,
                   top_k=50,
                   top_p=0.95
                   )

    predicted_answer = outputs[0]['generated_text'][len(prompt):-10].strip() # remove end generation token

    if predicted_answer[:-1] == sample["messages"][2]["content"]:
        return 1
    else:
        return 0

success_rate = []
number_of_eval_samples = 100

# Iterate over sample dataset and predict
for s in tqdm(test_dataset.shuffle().select(range(number_of_eval_samples))):
    success_rate.append(evaluate(s))

# compute accuracy
accuracy = sum(success_rate)/len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")

100%|██████████| 100/100 [03:21<00:00,  2.01s/it]

Accuracy: 71.00%





In [13]:
!python -m pip install numpy==1.24.1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting numpy==1.24.1
  Downloading numpy-1.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.1
    Uninstalling numpy-2.1.1:
      Successfully uninstalled numpy-2.1.1
Successfully installed numpy-1.24.1
