In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import torch
from pathlib import Path

# --- Configuration ---------------------------------------------------------
# Hub id of the base causal language model to fine-tune.
MODEL_NAME = "Salesforce/codegen-350M-mono"

# Directory handed to the Trainer as `output_dir` and scanned for checkpoints
# when resuming.
# NOTE(review): the folder name says "codegemma" while MODEL_NAME is a CodeGen
# checkpoint; kept unchanged so checkpoints already written there are still found.
OUTPUT_DIR = "./finetuned-codegemma"

# JSON-lines files holding the training and validation examples.
DATA_PATH = "./1-encoder-data/train/train.jsonl"
EVAL_PATH = "./1-encoder-data/eval/val.jsonl"


def _load_jsonl(path):
    """Load one JSON file with `datasets` and return its "train" split.

    `load_dataset` is called with a single `data_files` path, and the split
    is read back under the key "train".
    """
    return load_dataset("json", data_files=path)["train"]


# --- Datasets --------------------------------------------------------------
train_data = _load_jsonl(DATA_PATH)
val_data = _load_jsonl(EVAL_PATH)

# Preprocess: build the training text as input, newline, output (zero-instruction)
def format_example(example):
    """Return ``{"text": ...}`` with the example's input and output joined by a newline.

    Args:
        example: mapping with string values under the keys ``"input"`` and
            ``"output"``.

    Returns:
        A one-key dict ``{"text": input + "\\n" + output}``.
    """
    text = example["input"] + "\n"
    text += example["output"]
    return {"text": text}

# Add the combined "text" column to both splits (train first, then validation).
train_data, val_data = [split.map(format_example) for split in (train_data, val_data)]

# Tokenizer matching the base model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding so that padding calls do not
# fail for lack of a pad token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    """Tokenize the ``"text"`` field, truncating to at most 512 tokens.

    No padding is applied here. Padding every example to 512 tokens at
    preprocessing time makes every batch full-length regardless of content;
    instead the sequences keep their natural length and the data collator
    passed to the Trainer pads each batch to its own longest sequence.

    Args:
        example: a row (or, with ``batched=True``, a batch of rows) exposing
            a ``"text"`` entry.

    Returns:
        The tokenizer output for ``example["text"]``.
    """
    return tokenizer(example["text"], truncation=True, max_length=512)

# Tokenize both splits; batched=True hands `tokenize` many rows per call.
train_data = train_data.map(tokenize, batched=True)
val_data = val_data.map(tokenize, batched=True)

# Base model: half-precision weights when CUDA is available, float32 otherwise.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=model_dtype)
# Debug aid: to list candidate LoRA target modules, loop over
# model.named_modules() and print the names whose type contains "Linear".


# LoRA adapter settings.
# Names of the sub-modules that receive LoRA adapters.
# NOTE(review): these look like the attention and MLP projection layers of the
# CodeGen blocks; confirm against model.named_modules() if the base model changes.
lora_target_modules = ["qkv_proj", "out_proj", "fc_in", "fc_out"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapter configuration above.
model = get_peft_model(model, lora_config)

# Training args
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step per device
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # needs the two strategies to match.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=12,
    logging_steps=20,
    learning_rate=2e-4,
    # Mixed precision only when a CUDA device is present, mirroring the dtype
    # choice made when the model is loaded. A hard-coded True makes
    # TrainingArguments fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=2,
    # After training, reload the checkpoint with the lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Collator for causal language modelling (mlm=False disables masked-LM masking).
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Trainer wiring: model, arguments, both data splits, tokenizer and collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=lm_collator,
)


def _checkpoint_step(path):
    """Return the step number of a ``checkpoint-<step>`` directory, else None.

    Entries that are not directories, or whose suffix after the last ``-`` is
    not a decimal number (e.g. ``checkpoint-best``, ``checkpoint-100.tmp``),
    yield None so they are ignored instead of crashing ``int()``.
    """
    suffix = path.name.rsplit("-", 1)[-1]
    if path.is_dir() and suffix.isdecimal():
        return int(suffix)
    return None


# Find the most recent checkpoint (highest step number) in OUTPUT_DIR, if any.
checkpoint_dir = Path(OUTPUT_DIR)
checkpoints = [
    p for p in checkpoint_dir.glob("checkpoint-*") if _checkpoint_step(p) is not None
]
last_checkpoint = str(max(checkpoints, key=_checkpoint_step)) if checkpoints else None

# Start training; resumes from `last_checkpoint` when one exists, otherwise
# (None) starts from scratch.
trainer.train(resume_from_checkpoint=last_checkpoint)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 67%|██████▋   | 13760/20628 [00:09<00:00, 8390.39it/s]

{'loss': 0.2527, 'learning_rate': 6.659879775063022e-05, 'epoch': 8.0}


 67%|██████▋   | 13780/20628 [00:28<00:13, 513.47it/s] 

{'loss': 0.2615, 'learning_rate': 6.640488656195462e-05, 'epoch': 8.02}


 67%|██████▋   | 13800/20628 [00:47<01:19, 85.74it/s] 

{'loss': 0.263, 'learning_rate': 6.621097537327904e-05, 'epoch': 8.03}


 67%|██████▋   | 13820/20628 [01:06<02:37, 43.28it/s]

{'loss': 0.2724, 'learning_rate': 6.601706418460345e-05, 'epoch': 8.04}


 67%|██████▋   | 13840/20628 [01:26<05:55, 19.08it/s]

{'loss': 0.2692, 'learning_rate': 6.582315299592787e-05, 'epoch': 8.05}


 67%|██████▋   | 13860/20628 [01:45<10:48, 10.44it/s]

{'loss': 0.2649, 'learning_rate': 6.562924180725229e-05, 'epoch': 8.06}


 67%|██████▋   | 13880/20628 [02:04<33:34,  3.35it/s]

{'loss': 0.2646, 'learning_rate': 6.543533061857669e-05, 'epoch': 8.07}


 67%|██████▋   | 13900/20628 [02:23<53:41,  2.09it/s]

{'loss': 0.2723, 'learning_rate': 6.524141942990111e-05, 'epoch': 8.09}


 67%|██████▋   | 13920/20628 [02:42<1:25:33,  1.31it/s]

{'loss': 0.2755, 'learning_rate': 6.504750824122552e-05, 'epoch': 8.1}


 68%|██████▊   | 13940/20628 [03:01<1:45:54,  1.05it/s]

{'loss': 0.2618, 'learning_rate': 6.485359705254993e-05, 'epoch': 8.11}


 68%|██████▊   | 13960/20628 [03:20<1:45:24,  1.05it/s]

{'loss': 0.2655, 'learning_rate': 6.465968586387435e-05, 'epoch': 8.12}


 68%|██████▊   | 13980/20628 [03:40<1:46:11,  1.04it/s]

{'loss': 0.2634, 'learning_rate': 6.446577467519876e-05, 'epoch': 8.13}


 68%|██████▊   | 14000/20628 [03:59<1:46:02,  1.04it/s]

{'loss': 0.2687, 'learning_rate': 6.427186348652318e-05, 'epoch': 8.14}


 68%|██████▊   | 14020/20628 [04:18<1:45:43,  1.04it/s]

{'loss': 0.2783, 'learning_rate': 6.407795229784758e-05, 'epoch': 8.16}


 68%|██████▊   | 14040/20628 [04:37<1:45:47,  1.04it/s]

{'loss': 0.2695, 'learning_rate': 6.3884041109172e-05, 'epoch': 8.17}


 68%|██████▊   | 14060/20628 [04:57<1:45:14,  1.04it/s]

{'loss': 0.268, 'learning_rate': 6.369012992049641e-05, 'epoch': 8.18}


 68%|██████▊   | 14080/20628 [05:16<1:45:03,  1.04it/s]

{'loss': 0.2723, 'learning_rate': 6.349621873182084e-05, 'epoch': 8.19}


 68%|██████▊   | 14100/20628 [05:35<1:44:44,  1.04it/s]

{'loss': 0.2758, 'learning_rate': 6.330230754314525e-05, 'epoch': 8.2}


 68%|██████▊   | 14120/20628 [05:54<1:43:39,  1.05it/s]

{'loss': 0.2746, 'learning_rate': 6.310839635446965e-05, 'epoch': 8.21}


 69%|██████▊   | 14140/20628 [06:14<1:44:02,  1.04it/s]

{'loss': 0.271, 'learning_rate': 6.291448516579407e-05, 'epoch': 8.23}


 69%|██████▊   | 14160/20628 [06:33<1:43:35,  1.04it/s]

{'loss': 0.2782, 'learning_rate': 6.272057397711847e-05, 'epoch': 8.24}


 69%|██████▊   | 14180/20628 [06:52<1:43:07,  1.04it/s]

{'loss': 0.2741, 'learning_rate': 6.25266627884429e-05, 'epoch': 8.25}


 69%|██████▉   | 14200/20628 [07:11<1:43:27,  1.04it/s]

{'loss': 0.2866, 'learning_rate': 6.233275159976731e-05, 'epoch': 8.26}


 69%|██████▉   | 14220/20628 [07:30<1:42:35,  1.04it/s]

{'loss': 0.2751, 'learning_rate': 6.213884041109173e-05, 'epoch': 8.27}


 69%|██████▉   | 14240/20628 [07:49<1:41:07,  1.05it/s]

{'loss': 0.268, 'learning_rate': 6.194492922241614e-05, 'epoch': 8.28}


 69%|██████▉   | 14260/20628 [08:09<1:42:04,  1.04it/s]

{'loss': 0.2753, 'learning_rate': 6.175101803374054e-05, 'epoch': 8.3}


 69%|██████▉   | 14280/20628 [08:28<1:41:50,  1.04it/s]

{'loss': 0.261, 'learning_rate': 6.155710684506496e-05, 'epoch': 8.31}


 69%|██████▉   | 14300/20628 [08:47<1:40:19,  1.05it/s]

{'loss': 0.2741, 'learning_rate': 6.136319565638937e-05, 'epoch': 8.32}


 69%|██████▉   | 14320/20628 [09:06<1:40:49,  1.04it/s]

{'loss': 0.272, 'learning_rate': 6.11692844677138e-05, 'epoch': 8.33}


 70%|██████▉   | 14340/20628 [09:25<1:40:54,  1.04it/s]

{'loss': 0.2741, 'learning_rate': 6.0975373279038205e-05, 'epoch': 8.34}


 70%|██████▉   | 14360/20628 [09:45<1:39:52,  1.05it/s]

{'loss': 0.2762, 'learning_rate': 6.078146209036262e-05, 'epoch': 8.35}


 70%|██████▉   | 14380/20628 [10:04<1:40:01,  1.04it/s]

{'loss': 0.269, 'learning_rate': 6.058755090168703e-05, 'epoch': 8.37}


 70%|██████▉   | 14400/20628 [10:23<1:38:32,  1.05it/s]

{'loss': 0.2763, 'learning_rate': 6.039363971301144e-05, 'epoch': 8.38}


 70%|██████▉   | 14420/20628 [10:42<1:40:05,  1.03it/s]

{'loss': 0.2797, 'learning_rate': 6.019972852433585e-05, 'epoch': 8.39}


 70%|███████   | 14440/20628 [11:01<1:39:05,  1.04it/s]

{'loss': 0.2716, 'learning_rate': 6.000581733566027e-05, 'epoch': 8.4}


 70%|███████   | 14460/20628 [11:20<1:38:13,  1.05it/s]

{'loss': 0.2709, 'learning_rate': 5.9811906146984684e-05, 'epoch': 8.41}


 70%|███████   | 14480/20628 [11:40<1:38:58,  1.04it/s]

{'loss': 0.2779, 'learning_rate': 5.9617994958309096e-05, 'epoch': 8.42}


 70%|███████   | 14500/20628 [11:59<1:37:44,  1.04it/s]

{'loss': 0.2835, 'learning_rate': 5.942408376963351e-05, 'epoch': 8.44}


 70%|███████   | 14520/20628 [12:18<1:38:15,  1.04it/s]

{'loss': 0.2779, 'learning_rate': 5.923017258095792e-05, 'epoch': 8.45}


 70%|███████   | 14540/20628 [12:37<1:37:16,  1.04it/s]

{'loss': 0.2799, 'learning_rate': 5.903626139228233e-05, 'epoch': 8.46}


 71%|███████   | 14560/20628 [12:56<1:36:09,  1.05it/s]

{'loss': 0.2764, 'learning_rate': 5.884235020360676e-05, 'epoch': 8.47}


 71%|███████   | 14580/20628 [13:15<1:36:37,  1.04it/s]

{'loss': 0.2635, 'learning_rate': 5.8648439014931164e-05, 'epoch': 8.48}


 71%|███████   | 14600/20628 [13:35<1:36:48,  1.04it/s]

{'loss': 0.2697, 'learning_rate': 5.8454527826255576e-05, 'epoch': 8.49}


 71%|███████   | 14620/20628 [13:54<1:35:02,  1.05it/s]

{'loss': 0.2686, 'learning_rate': 5.826061663757999e-05, 'epoch': 8.5}


 71%|███████   | 14640/20628 [14:13<1:34:50,  1.05it/s]

{'loss': 0.2767, 'learning_rate': 5.80667054489044e-05, 'epoch': 8.52}


 71%|███████   | 14660/20628 [14:32<1:35:53,  1.04it/s]

{'loss': 0.2711, 'learning_rate': 5.787279426022881e-05, 'epoch': 8.53}


 71%|███████   | 14680/20628 [14:51<1:35:10,  1.04it/s]

{'loss': 0.269, 'learning_rate': 5.767888307155324e-05, 'epoch': 8.54}


 71%|███████▏  | 14700/20628 [15:11<1:34:43,  1.04it/s]

{'loss': 0.2724, 'learning_rate': 5.748497188287765e-05, 'epoch': 8.55}


 71%|███████▏  | 14720/20628 [15:30<1:34:27,  1.04it/s]

{'loss': 0.2657, 'learning_rate': 5.7291060694202056e-05, 'epoch': 8.56}


 71%|███████▏  | 14740/20628 [15:49<1:34:27,  1.04it/s]

{'loss': 0.2707, 'learning_rate': 5.709714950552647e-05, 'epoch': 8.57}


 72%|███████▏  | 14760/20628 [16:08<1:32:42,  1.05it/s]

{'loss': 0.2754, 'learning_rate': 5.690323831685088e-05, 'epoch': 8.59}


 72%|███████▏  | 14780/20628 [16:27<1:33:21,  1.04it/s]

{'loss': 0.2775, 'learning_rate': 5.670932712817529e-05, 'epoch': 8.6}


 72%|███████▏  | 14800/20628 [16:46<1:34:00,  1.03it/s]

{'loss': 0.2764, 'learning_rate': 5.651541593949972e-05, 'epoch': 8.61}


 72%|███████▏  | 14820/20628 [17:06<1:32:40,  1.04it/s]

{'loss': 0.2812, 'learning_rate': 5.632150475082413e-05, 'epoch': 8.62}


 72%|███████▏  | 14840/20628 [17:25<1:32:20,  1.04it/s]

{'loss': 0.2774, 'learning_rate': 5.612759356214854e-05, 'epoch': 8.63}


 72%|███████▏  | 14860/20628 [17:44<1:32:55,  1.03it/s]

{'loss': 0.2752, 'learning_rate': 5.5933682373472954e-05, 'epoch': 8.64}


 72%|███████▏  | 14880/20628 [18:03<1:31:13,  1.05it/s]

{'loss': 0.2764, 'learning_rate': 5.573977118479736e-05, 'epoch': 8.66}


 72%|███████▏  | 14900/20628 [18:22<1:31:40,  1.04it/s]

{'loss': 0.2892, 'learning_rate': 5.554585999612177e-05, 'epoch': 8.67}


 72%|███████▏  | 14920/20628 [18:42<1:31:39,  1.04it/s]

{'loss': 0.2829, 'learning_rate': 5.53519488074462e-05, 'epoch': 8.68}


 72%|███████▏  | 14940/20628 [19:01<1:31:18,  1.04it/s]

{'loss': 0.2675, 'learning_rate': 5.515803761877061e-05, 'epoch': 8.69}


 73%|███████▎  | 14960/20628 [19:20<1:30:44,  1.04it/s]

{'loss': 0.2706, 'learning_rate': 5.496412643009502e-05, 'epoch': 8.7}


 73%|███████▎  | 14980/20628 [19:39<1:30:46,  1.04it/s]

{'loss': 0.2784, 'learning_rate': 5.4770215241419434e-05, 'epoch': 8.71}


 73%|███████▎  | 15000/20628 [19:59<1:30:35,  1.04it/s]

{'loss': 0.2889, 'learning_rate': 5.4576304052743846e-05, 'epoch': 8.73}


 73%|███████▎  | 15020/20628 [20:18<1:28:40,  1.05it/s]

{'loss': 0.2905, 'learning_rate': 5.438239286406825e-05, 'epoch': 8.74}


 73%|███████▎  | 15040/20628 [20:37<1:28:33,  1.05it/s]

{'loss': 0.2767, 'learning_rate': 5.418848167539268e-05, 'epoch': 8.75}


 73%|███████▎  | 15060/20628 [20:56<1:28:42,  1.05it/s]

{'loss': 0.2678, 'learning_rate': 5.399457048671709e-05, 'epoch': 8.76}


 73%|███████▎  | 15080/20628 [21:15<1:28:34,  1.04it/s]

{'loss': 0.2747, 'learning_rate': 5.38006592980415e-05, 'epoch': 8.77}


 73%|███████▎  | 15100/20628 [21:34<1:28:26,  1.04it/s]

{'loss': 0.271, 'learning_rate': 5.3606748109365913e-05, 'epoch': 8.78}


 73%|███████▎  | 15120/20628 [21:54<1:28:27,  1.04it/s]

{'loss': 0.2716, 'learning_rate': 5.3412836920690326e-05, 'epoch': 8.8}


 73%|███████▎  | 15140/20628 [22:13<1:28:21,  1.04it/s]

{'loss': 0.2822, 'learning_rate': 5.321892573201474e-05, 'epoch': 8.81}


 73%|███████▎  | 15160/20628 [22:32<1:27:46,  1.04it/s]

{'loss': 0.292, 'learning_rate': 5.302501454333916e-05, 'epoch': 8.82}


 74%|███████▎  | 15180/20628 [22:52<1:27:47,  1.03it/s]

{'loss': 0.2723, 'learning_rate': 5.283110335466357e-05, 'epoch': 8.83}


 74%|███████▎  | 15200/20628 [23:11<1:27:01,  1.04it/s]

{'loss': 0.2782, 'learning_rate': 5.263719216598798e-05, 'epoch': 8.84}


 74%|███████▍  | 15220/20628 [23:30<1:26:30,  1.04it/s]

{'loss': 0.2735, 'learning_rate': 5.244328097731239e-05, 'epoch': 8.85}


 74%|███████▍  | 15240/20628 [23:49<1:26:34,  1.04it/s]

{'loss': 0.2743, 'learning_rate': 5.2249369788636805e-05, 'epoch': 8.87}


 74%|███████▍  | 15260/20628 [24:09<1:26:07,  1.04it/s]

{'loss': 0.276, 'learning_rate': 5.205545859996122e-05, 'epoch': 8.88}


 74%|███████▍  | 15280/20628 [24:28<1:25:51,  1.04it/s]

{'loss': 0.2741, 'learning_rate': 5.1861547411285636e-05, 'epoch': 8.89}


 74%|███████▍  | 15300/20628 [24:47<1:25:56,  1.03it/s]

{'loss': 0.2765, 'learning_rate': 5.166763622261005e-05, 'epoch': 8.9}


 74%|███████▍  | 15320/20628 [25:06<1:24:47,  1.04it/s]

{'loss': 0.2795, 'learning_rate': 5.147372503393446e-05, 'epoch': 8.91}


 74%|███████▍  | 15340/20628 [25:25<1:23:51,  1.05it/s]

{'loss': 0.2794, 'learning_rate': 5.127981384525887e-05, 'epoch': 8.92}


 74%|███████▍  | 15360/20628 [25:45<1:24:27,  1.04it/s]

{'loss': 0.2789, 'learning_rate': 5.1085902656583285e-05, 'epoch': 8.94}


 75%|███████▍  | 15380/20628 [26:04<1:24:10,  1.04it/s]

{'loss': 0.2756, 'learning_rate': 5.08919914679077e-05, 'epoch': 8.95}


 75%|███████▍  | 15400/20628 [26:23<1:22:52,  1.05it/s]

{'loss': 0.2818, 'learning_rate': 5.0698080279232116e-05, 'epoch': 8.96}


 75%|███████▍  | 15420/20628 [26:42<1:23:21,  1.04it/s]

{'loss': 0.289, 'learning_rate': 5.050416909055653e-05, 'epoch': 8.97}


 75%|███████▍  | 15440/20628 [27:01<1:23:14,  1.04it/s]

{'loss': 0.2845, 'learning_rate': 5.031025790188094e-05, 'epoch': 8.98}


 75%|███████▍  | 15460/20628 [27:20<1:21:37,  1.06it/s]

{'loss': 0.2745, 'learning_rate': 5.011634671320535e-05, 'epoch': 8.99}


                                                       
 75%|███████▌  | 15471/20628 [28:49<1:22:37,  1.04it/s]

{'eval_loss': 0.36817458271980286, 'eval_runtime': 77.525, 'eval_samples_per_second': 22.174, 'eval_steps_per_second': 11.093, 'epoch': 9.0}


 75%|███████▌  | 15480/20628 [28:57<3:17:37,  2.30s/it] 

{'loss': 0.2667, 'learning_rate': 4.992243552452977e-05, 'epoch': 9.0}


 75%|███████▌  | 15500/20628 [29:17<1:22:39,  1.03it/s]

{'loss': 0.2579, 'learning_rate': 4.9728524335854183e-05, 'epoch': 9.02}


 75%|███████▌  | 15520/20628 [29:36<1:21:39,  1.04it/s]

{'loss': 0.259, 'learning_rate': 4.9534613147178596e-05, 'epoch': 9.03}


 75%|███████▌  | 15540/20628 [29:55<1:22:01,  1.03it/s]

{'loss': 0.2555, 'learning_rate': 4.934070195850301e-05, 'epoch': 9.04}


 75%|███████▌  | 15560/20628 [30:15<1:21:26,  1.04it/s]

{'loss': 0.2525, 'learning_rate': 4.914679076982742e-05, 'epoch': 9.05}


 76%|███████▌  | 15580/20628 [30:34<1:21:03,  1.04it/s]

{'loss': 0.2644, 'learning_rate': 4.895287958115183e-05, 'epoch': 9.06}


 76%|███████▌  | 15600/20628 [30:53<1:20:43,  1.04it/s]

{'loss': 0.2645, 'learning_rate': 4.875896839247625e-05, 'epoch': 9.07}


 76%|███████▌  | 15620/20628 [31:13<1:20:18,  1.04it/s]

{'loss': 0.2578, 'learning_rate': 4.856505720380066e-05, 'epoch': 9.09}


 76%|███████▌  | 15640/20628 [31:32<1:19:13,  1.05it/s]

{'loss': 0.2585, 'learning_rate': 4.8371146015125075e-05, 'epoch': 9.1}


 76%|███████▌  | 15660/20628 [31:51<1:20:01,  1.03it/s]

{'loss': 0.2623, 'learning_rate': 4.8177234826449494e-05, 'epoch': 9.11}


 76%|███████▌  | 15680/20628 [32:10<1:18:30,  1.05it/s]

{'loss': 0.2546, 'learning_rate': 4.79833236377739e-05, 'epoch': 9.12}


 76%|███████▌  | 15700/20628 [32:29<1:19:10,  1.04it/s]

{'loss': 0.2653, 'learning_rate': 4.778941244909831e-05, 'epoch': 9.13}


 76%|███████▌  | 15720/20628 [32:49<1:18:54,  1.04it/s]

{'loss': 0.2651, 'learning_rate': 4.759550126042273e-05, 'epoch': 9.14}


 76%|███████▋  | 15740/20628 [33:08<1:18:21,  1.04it/s]

{'loss': 0.2594, 'learning_rate': 4.740159007174714e-05, 'epoch': 9.16}


 76%|███████▋  | 15760/20628 [33:27<1:17:56,  1.04it/s]

{'loss': 0.2553, 'learning_rate': 4.7207678883071555e-05, 'epoch': 9.17}


 76%|███████▋  | 15780/20628 [33:46<1:17:48,  1.04it/s]

{'loss': 0.2568, 'learning_rate': 4.7013767694395974e-05, 'epoch': 9.18}


 77%|███████▋  | 15800/20628 [34:06<1:17:49,  1.03it/s]

{'loss': 0.2667, 'learning_rate': 4.6819856505720386e-05, 'epoch': 9.19}


 77%|███████▋  | 15820/20628 [34:25<1:17:12,  1.04it/s]

{'loss': 0.2594, 'learning_rate': 4.662594531704479e-05, 'epoch': 9.2}


 77%|███████▋  | 15840/20628 [34:44<1:16:59,  1.04it/s]

{'loss': 0.2666, 'learning_rate': 4.643203412836921e-05, 'epoch': 9.21}


 77%|███████▋  | 15860/20628 [35:03<1:16:15,  1.04it/s]

{'loss': 0.2586, 'learning_rate': 4.623812293969362e-05, 'epoch': 9.23}


 77%|███████▋  | 15880/20628 [35:23<1:16:14,  1.04it/s]

{'loss': 0.2578, 'learning_rate': 4.6044211751018035e-05, 'epoch': 9.24}


 77%|███████▋  | 15900/20628 [35:42<1:14:48,  1.05it/s]

{'loss': 0.2539, 'learning_rate': 4.5850300562342453e-05, 'epoch': 9.25}


 77%|███████▋  | 15920/20628 [36:01<1:15:37,  1.04it/s]

{'loss': 0.2528, 'learning_rate': 4.5656389373666866e-05, 'epoch': 9.26}


 77%|███████▋  | 15940/20628 [36:20<1:15:17,  1.04it/s]

{'loss': 0.2674, 'learning_rate': 4.546247818499128e-05, 'epoch': 9.27}


 77%|███████▋  | 15960/20628 [36:40<1:14:50,  1.04it/s]

{'loss': 0.2635, 'learning_rate': 4.526856699631569e-05, 'epoch': 9.28}


 77%|███████▋  | 15980/20628 [36:59<1:14:47,  1.04it/s]

{'loss': 0.2627, 'learning_rate': 4.50746558076401e-05, 'epoch': 9.3}


 78%|███████▊  | 16000/20628 [37:18<1:14:33,  1.03it/s]

{'loss': 0.2604, 'learning_rate': 4.4880744618964514e-05, 'epoch': 9.31}


 78%|███████▊  | 16020/20628 [37:37<1:12:59,  1.05it/s]

{'loss': 0.2682, 'learning_rate': 4.468683343028893e-05, 'epoch': 9.32}


 78%|███████▊  | 16040/20628 [37:56<1:14:40,  1.02it/s]

{'loss': 0.2646, 'learning_rate': 4.4492922241613345e-05, 'epoch': 9.33}


 78%|███████▊  | 16060/20628 [38:17<1:15:28,  1.01it/s]

{'loss': 0.269, 'learning_rate': 4.429901105293776e-05, 'epoch': 9.34}


 78%|███████▊  | 16080/20628 [38:38<1:18:05,  1.03s/it]

{'loss': 0.2665, 'learning_rate': 4.410509986426217e-05, 'epoch': 9.35}


 78%|███████▊  | 16100/20628 [38:57<1:12:42,  1.04it/s]

{'loss': 0.2641, 'learning_rate': 4.391118867558658e-05, 'epoch': 9.37}


 78%|███████▊  | 16120/20628 [39:16<1:12:13,  1.04it/s]

{'loss': 0.2567, 'learning_rate': 4.3717277486910994e-05, 'epoch': 9.38}


 78%|███████▊  | 16140/20628 [39:36<1:12:34,  1.03it/s]

{'loss': 0.2602, 'learning_rate': 4.352336629823541e-05, 'epoch': 9.39}


 78%|███████▊  | 16160/20628 [39:55<1:11:54,  1.04it/s]

{'loss': 0.2549, 'learning_rate': 4.3329455109559825e-05, 'epoch': 9.4}


 78%|███████▊  | 16180/20628 [40:14<1:11:24,  1.04it/s]

{'loss': 0.2639, 'learning_rate': 4.313554392088424e-05, 'epoch': 9.41}


 79%|███████▊  | 16200/20628 [40:34<1:11:14,  1.04it/s]

{'loss': 0.2537, 'learning_rate': 4.2941632732208656e-05, 'epoch': 9.42}


 79%|███████▊  | 16220/20628 [40:53<1:10:34,  1.04it/s]

{'loss': 0.2509, 'learning_rate': 4.274772154353306e-05, 'epoch': 9.44}


 79%|███████▊  | 16240/20628 [41:12<1:08:22,  1.07it/s]

{'loss': 0.2572, 'learning_rate': 4.2553810354857473e-05, 'epoch': 9.45}


 79%|███████▉  | 16260/20628 [41:30<1:06:39,  1.09it/s]

{'loss': 0.258, 'learning_rate': 4.235989916618189e-05, 'epoch': 9.46}


 79%|███████▉  | 16280/20628 [41:49<1:07:13,  1.08it/s]

{'loss': 0.2516, 'learning_rate': 4.2165987977506305e-05, 'epoch': 9.47}


 79%|███████▉  | 16300/20628 [42:07<1:06:11,  1.09it/s]

{'loss': 0.2521, 'learning_rate': 4.197207678883072e-05, 'epoch': 9.48}


 79%|███████▉  | 16320/20628 [42:26<1:05:43,  1.09it/s]

{'loss': 0.2516, 'learning_rate': 4.1778165600155136e-05, 'epoch': 9.49}


 79%|███████▉  | 16340/20628 [42:44<1:05:25,  1.09it/s]

{'loss': 0.2599, 'learning_rate': 4.158425441147955e-05, 'epoch': 9.51}


 79%|███████▉  | 16360/20628 [43:02<1:04:44,  1.10it/s]

{'loss': 0.2629, 'learning_rate': 4.139034322280396e-05, 'epoch': 9.52}


 79%|███████▉  | 16380/20628 [43:21<1:05:11,  1.09it/s]

{'loss': 0.2598, 'learning_rate': 4.119643203412837e-05, 'epoch': 9.53}


 80%|███████▉  | 16400/20628 [43:39<1:04:50,  1.09it/s]

{'loss': 0.2722, 'learning_rate': 4.1002520845452784e-05, 'epoch': 9.54}


 80%|███████▉  | 16420/20628 [43:57<1:04:24,  1.09it/s]

{'loss': 0.2732, 'learning_rate': 4.0808609656777196e-05, 'epoch': 9.55}


 80%|███████▉  | 16440/20628 [44:16<1:03:58,  1.09it/s]

{'loss': 0.2523, 'learning_rate': 4.0614698468101615e-05, 'epoch': 9.56}


 80%|███████▉  | 16460/20628 [44:34<1:03:28,  1.09it/s]

{'loss': 0.2618, 'learning_rate': 4.042078727942603e-05, 'epoch': 9.57}


 80%|███████▉  | 16480/20628 [44:52<1:03:14,  1.09it/s]

{'loss': 0.2634, 'learning_rate': 4.022687609075044e-05, 'epoch': 9.59}


 80%|███████▉  | 16500/20628 [45:11<1:02:51,  1.09it/s]

{'loss': 0.2618, 'learning_rate': 4.003296490207485e-05, 'epoch': 9.6}


 80%|████████  | 16520/20628 [45:29<1:02:52,  1.09it/s]

{'loss': 0.2697, 'learning_rate': 3.9839053713399264e-05, 'epoch': 9.61}


 80%|████████  | 16540/20628 [45:47<1:02:42,  1.09it/s]

{'loss': 0.2563, 'learning_rate': 3.9645142524723676e-05, 'epoch': 9.62}


 80%|████████  | 16560/20628 [46:06<1:02:14,  1.09it/s]

{'loss': 0.2604, 'learning_rate': 3.9451231336048095e-05, 'epoch': 9.63}


 80%|████████  | 16580/20628 [46:24<1:01:46,  1.09it/s]

{'loss': 0.2618, 'learning_rate': 3.925732014737251e-05, 'epoch': 9.64}


 80%|████████  | 16600/20628 [46:42<1:01:15,  1.10it/s]

{'loss': 0.262, 'learning_rate': 3.906340895869692e-05, 'epoch': 9.66}


 81%|████████  | 16620/20628 [47:01<1:01:00,  1.10it/s]

{'loss': 0.2638, 'learning_rate': 3.886949777002134e-05, 'epoch': 9.67}


 81%|████████  | 16640/20628 [47:19<1:01:00,  1.09it/s]

{'loss': 0.2649, 'learning_rate': 3.8675586581345743e-05, 'epoch': 9.68}


 81%|████████  | 16660/20628 [47:37<1:00:45,  1.09it/s]

{'loss': 0.2572, 'learning_rate': 3.8481675392670156e-05, 'epoch': 9.69}


 81%|████████  | 16680/20628 [47:56<1:00:13,  1.09it/s]

{'loss': 0.2576, 'learning_rate': 3.8287764203994574e-05, 'epoch': 9.7}


 81%|████████  | 16700/20628 [48:14<1:00:03,  1.09it/s]

{'loss': 0.2641, 'learning_rate': 3.809385301531899e-05, 'epoch': 9.71}


 81%|████████  | 16720/20628 [48:32<59:31,  1.09it/s]  

{'loss': 0.2675, 'learning_rate': 3.78999418266434e-05, 'epoch': 9.73}


 81%|████████  | 16740/20628 [48:51<59:09,  1.10it/s]

{'loss': 0.2591, 'learning_rate': 3.770603063796782e-05, 'epoch': 9.74}


 81%|████████  | 16760/20628 [49:09<58:52,  1.09it/s]  

{'loss': 0.2573, 'learning_rate': 3.751211944929223e-05, 'epoch': 9.75}


 81%|████████▏ | 16780/20628 [49:27<58:44,  1.09it/s]

{'loss': 0.2579, 'learning_rate': 3.7318208260616635e-05, 'epoch': 9.76}


 81%|████████▏ | 16800/20628 [49:46<58:20,  1.09it/s]

{'loss': 0.2545, 'learning_rate': 3.7124297071941054e-05, 'epoch': 9.77}


 82%|████████▏ | 16820/20628 [50:04<57:53,  1.10it/s]

{'loss': 0.2641, 'learning_rate': 3.6930385883265466e-05, 'epoch': 9.78}


 82%|████████▏ | 16840/20628 [50:23<57:57,  1.09it/s]

{'loss': 0.2625, 'learning_rate': 3.673647469458988e-05, 'epoch': 9.8}


 82%|████████▏ | 16860/20628 [50:41<57:19,  1.10it/s]

{'loss': 0.2634, 'learning_rate': 3.65425635059143e-05, 'epoch': 9.81}


 82%|████████▏ | 16880/20628 [50:59<56:57,  1.10it/s]

{'loss': 0.254, 'learning_rate': 3.634865231723871e-05, 'epoch': 9.82}


 82%|████████▏ | 16900/20628 [51:17<57:05,  1.09it/s]

{'loss': 0.2619, 'learning_rate': 3.615474112856312e-05, 'epoch': 9.83}


 82%|████████▏ | 16920/20628 [51:36<56:36,  1.09it/s]

{'loss': 0.2648, 'learning_rate': 3.5960829939887534e-05, 'epoch': 9.84}


 82%|████████▏ | 16940/20628 [51:54<56:25,  1.09it/s]

{'loss': 0.2623, 'learning_rate': 3.5766918751211946e-05, 'epoch': 9.85}


 82%|████████▏ | 16960/20628 [52:12<55:42,  1.10it/s]

{'loss': 0.2624, 'learning_rate': 3.557300756253636e-05, 'epoch': 9.87}


 82%|████████▏ | 16980/20628 [52:31<55:23,  1.10it/s]

{'loss': 0.2642, 'learning_rate': 3.537909637386078e-05, 'epoch': 9.88}


 82%|████████▏ | 17000/20628 [52:49<55:18,  1.09it/s]

{'loss': 0.2648, 'learning_rate': 3.518518518518519e-05, 'epoch': 9.89}


 83%|████████▎ | 17020/20628 [53:07<55:18,  1.09it/s]

{'loss': 0.2521, 'learning_rate': 3.49912739965096e-05, 'epoch': 9.9}


 83%|████████▎ | 17040/20628 [53:26<54:40,  1.09it/s]

{'loss': 0.2746, 'learning_rate': 3.4797362807834013e-05, 'epoch': 9.91}


 83%|████████▎ | 17060/20628 [53:44<54:30,  1.09it/s]

{'loss': 0.2614, 'learning_rate': 3.4603451619158426e-05, 'epoch': 9.92}


 83%|████████▎ | 17080/20628 [54:02<54:06,  1.09it/s]

{'loss': 0.267, 'learning_rate': 3.440954043048284e-05, 'epoch': 9.94}


 83%|████████▎ | 17100/20628 [54:21<53:51,  1.09it/s]

{'loss': 0.2625, 'learning_rate': 3.421562924180726e-05, 'epoch': 9.95}


 83%|████████▎ | 17120/20628 [54:39<53:48,  1.09it/s]

{'loss': 0.2485, 'learning_rate': 3.402171805313167e-05, 'epoch': 9.96}


 83%|████████▎ | 17140/20628 [54:57<53:10,  1.09it/s]

{'loss': 0.2695, 'learning_rate': 3.382780686445608e-05, 'epoch': 9.97}


 83%|████████▎ | 17160/20628 [55:16<52:48,  1.09it/s]

{'loss': 0.265, 'learning_rate': 3.36338956757805e-05, 'epoch': 9.98}


 83%|████████▎ | 17180/20628 [55:34<52:29,  1.09it/s]

{'loss': 0.267, 'learning_rate': 3.3439984487104905e-05, 'epoch': 9.99}


                                                     
 83%|████████▎ | 17191/20628 [56:58<52:17,  1.10it/s]

{'eval_loss': 0.37718600034713745, 'eval_runtime': 73.8142, 'eval_samples_per_second': 23.288, 'eval_steps_per_second': 11.651, 'epoch': 10.0}


 83%|████████▎ | 17200/20628 [57:06<2:05:21,  2.19s/it] 

{'loss': 0.2568, 'learning_rate': 3.324607329842932e-05, 'epoch': 10.01}


 83%|████████▎ | 17220/20628 [57:25<52:10,  1.09it/s]  

{'loss': 0.2536, 'learning_rate': 3.3052162109753736e-05, 'epoch': 10.02}


 84%|████████▎ | 17240/20628 [57:43<51:39,  1.09it/s]

{'loss': 0.2479, 'learning_rate': 3.285825092107815e-05, 'epoch': 10.03}


 84%|████████▎ | 17260/20628 [58:01<51:11,  1.10it/s]

{'loss': 0.2373, 'learning_rate': 3.266433973240256e-05, 'epoch': 10.04}


 84%|████████▍ | 17280/20628 [58:20<50:59,  1.09it/s]

{'loss': 0.2413, 'learning_rate': 3.247042854372698e-05, 'epoch': 10.05}


 84%|████████▍ | 17300/20628 [58:38<50:35,  1.10it/s]

{'loss': 0.244, 'learning_rate': 3.227651735505139e-05, 'epoch': 10.06}


 84%|████████▍ | 17320/20628 [58:56<50:34,  1.09it/s]

{'loss': 0.2425, 'learning_rate': 3.2082606166375804e-05, 'epoch': 10.08}


 84%|████████▍ | 17340/20628 [59:15<50:02,  1.10it/s]

{'loss': 0.2484, 'learning_rate': 3.1888694977700216e-05, 'epoch': 10.09}


 84%|████████▍ | 17360/20628 [59:33<49:51,  1.09it/s]

{'loss': 0.249, 'learning_rate': 3.169478378902463e-05, 'epoch': 10.1}


 84%|████████▍ | 17380/20628 [59:51<49:37,  1.09it/s]

{'loss': 0.2402, 'learning_rate': 3.150087260034904e-05, 'epoch': 10.11}


 84%|████████▍ | 17400/20628 [1:00:10<49:20,  1.09it/s]

{'loss': 0.244, 'learning_rate': 3.130696141167346e-05, 'epoch': 10.12}


 84%|████████▍ | 17420/20628 [1:00:28<48:53,  1.09it/s]

{'loss': 0.2482, 'learning_rate': 3.111305022299787e-05, 'epoch': 10.13}


 85%|████████▍ | 17440/20628 [1:00:46<48:34,  1.09it/s]

{'loss': 0.2345, 'learning_rate': 3.0919139034322283e-05, 'epoch': 10.14}


 85%|████████▍ | 17460/20628 [1:01:05<48:17,  1.09it/s]

{'loss': 0.2458, 'learning_rate': 3.0725227845646696e-05, 'epoch': 10.16}


 85%|████████▍ | 17480/20628 [1:01:23<48:03,  1.09it/s]

{'loss': 0.2549, 'learning_rate': 3.053131665697111e-05, 'epoch': 10.17}


 85%|████████▍ | 17500/20628 [1:01:41<47:40,  1.09it/s]

{'loss': 0.2473, 'learning_rate': 3.033740546829552e-05, 'epoch': 10.18}


 85%|████████▍ | 17520/20628 [1:02:00<47:22,  1.09it/s]

{'loss': 0.2561, 'learning_rate': 3.014349427961994e-05, 'epoch': 10.19}


 85%|████████▌ | 17540/20628 [1:02:18<47:03,  1.09it/s]

{'loss': 0.24, 'learning_rate': 2.994958309094435e-05, 'epoch': 10.2}


 85%|████████▌ | 17560/20628 [1:02:36<46:39,  1.10it/s]

{'loss': 0.2491, 'learning_rate': 2.975567190226876e-05, 'epoch': 10.21}


 85%|████████▌ | 17580/20628 [1:02:55<46:20,  1.10it/s]

{'loss': 0.2399, 'learning_rate': 2.956176071359318e-05, 'epoch': 10.23}


 85%|████████▌ | 17600/20628 [1:03:13<46:26,  1.09it/s]

{'loss': 0.2436, 'learning_rate': 2.936784952491759e-05, 'epoch': 10.24}


 85%|████████▌ | 17620/20628 [1:03:31<45:45,  1.10it/s]

{'loss': 0.2519, 'learning_rate': 2.9173938336242003e-05, 'epoch': 10.25}


 86%|████████▌ | 17640/20628 [1:03:51<48:46,  1.02it/s]

{'loss': 0.2482, 'learning_rate': 2.898002714756642e-05, 'epoch': 10.26}


 86%|████████▌ | 17660/20628 [1:04:10<48:40,  1.02it/s]

{'loss': 0.2561, 'learning_rate': 2.878611595889083e-05, 'epoch': 10.27}


 86%|████████▌ | 17680/20628 [1:04:29<47:17,  1.04it/s]

{'loss': 0.2365, 'learning_rate': 2.8592204770215243e-05, 'epoch': 10.28}


 86%|████████▌ | 17700/20628 [1:04:49<47:10,  1.03it/s]

{'loss': 0.2646, 'learning_rate': 2.8398293581539658e-05, 'epoch': 10.3}


 86%|████████▌ | 17720/20628 [1:05:08<46:39,  1.04it/s]

{'loss': 0.2443, 'learning_rate': 2.820438239286407e-05, 'epoch': 10.31}


 86%|████████▌ | 17740/20628 [1:05:27<46:10,  1.04it/s]

{'loss': 0.2468, 'learning_rate': 2.8010471204188483e-05, 'epoch': 10.32}


 86%|████████▌ | 17760/20628 [1:05:47<45:53,  1.04it/s]

{'loss': 0.2497, 'learning_rate': 2.7816560015512898e-05, 'epoch': 10.33}


 86%|████████▌ | 17780/20628 [1:06:06<45:35,  1.04it/s]

{'loss': 0.2459, 'learning_rate': 2.762264882683731e-05, 'epoch': 10.34}


 86%|████████▋ | 17800/20628 [1:06:25<45:19,  1.04it/s]

{'loss': 0.2441, 'learning_rate': 2.7428737638161722e-05, 'epoch': 10.35}


 86%|████████▋ | 17820/20628 [1:06:45<45:12,  1.04it/s]

{'loss': 0.2508, 'learning_rate': 2.7234826449486138e-05, 'epoch': 10.37}


 86%|████████▋ | 17840/20628 [1:07:04<44:37,  1.04it/s]

{'loss': 0.2392, 'learning_rate': 2.704091526081055e-05, 'epoch': 10.38}


 87%|████████▋ | 17860/20628 [1:07:23<44:32,  1.04it/s]

{'loss': 0.2455, 'learning_rate': 2.6847004072134962e-05, 'epoch': 10.39}


 87%|████████▋ | 17880/20628 [1:07:43<44:12,  1.04it/s]

{'loss': 0.2453, 'learning_rate': 2.665309288345938e-05, 'epoch': 10.4}


 87%|████████▋ | 17900/20628 [1:08:02<43:41,  1.04it/s]

{'loss': 0.2457, 'learning_rate': 2.645918169478379e-05, 'epoch': 10.41}


 87%|████████▋ | 17920/20628 [1:08:22<44:01,  1.03it/s]

{'loss': 0.2461, 'learning_rate': 2.6265270506108202e-05, 'epoch': 10.42}


 87%|████████▋ | 17940/20628 [1:08:41<43:08,  1.04it/s]

{'loss': 0.2488, 'learning_rate': 2.607135931743262e-05, 'epoch': 10.44}


 87%|████████▋ | 17960/20628 [1:09:00<42:56,  1.04it/s]

{'loss': 0.2454, 'learning_rate': 2.5877448128757033e-05, 'epoch': 10.45}


 87%|████████▋ | 17980/20628 [1:09:20<47:49,  1.08s/it]

{'loss': 0.2474, 'learning_rate': 2.5683536940081442e-05, 'epoch': 10.46}


 87%|████████▋ | 18000/20628 [1:09:41<41:05,  1.07it/s]

{'loss': 0.2451, 'learning_rate': 2.548962575140586e-05, 'epoch': 10.47}


 87%|████████▋ | 18020/20628 [1:09:59<41:17,  1.05it/s]

{'loss': 0.2448, 'learning_rate': 2.5295714562730273e-05, 'epoch': 10.48}


 87%|████████▋ | 18040/20628 [1:10:18<40:46,  1.06it/s]

{'loss': 0.2533, 'learning_rate': 2.510180337405468e-05, 'epoch': 10.49}


 88%|████████▊ | 18060/20628 [1:10:37<40:22,  1.06it/s]

{'loss': 0.2481, 'learning_rate': 2.4907892185379097e-05, 'epoch': 10.51}


 88%|████████▊ | 18080/20628 [1:10:57<47:26,  1.12s/it]

{'loss': 0.2485, 'learning_rate': 2.4713980996703513e-05, 'epoch': 10.52}


 88%|████████▊ | 18100/20628 [1:11:17<40:23,  1.04it/s]

{'loss': 0.2596, 'learning_rate': 2.4520069808027925e-05, 'epoch': 10.53}


 88%|████████▊ | 18120/20628 [1:11:36<39:44,  1.05it/s]

{'loss': 0.2452, 'learning_rate': 2.4326158619352337e-05, 'epoch': 10.54}


 88%|████████▊ | 18140/20628 [1:11:56<39:30,  1.05it/s]

{'loss': 0.237, 'learning_rate': 2.4132247430676752e-05, 'epoch': 10.55}


 88%|████████▊ | 18160/20628 [1:12:15<39:03,  1.05it/s]

{'loss': 0.2488, 'learning_rate': 2.3938336242001165e-05, 'epoch': 10.56}


 88%|████████▊ | 18180/20628 [1:12:34<38:47,  1.05it/s]

{'loss': 0.2496, 'learning_rate': 2.3744425053325577e-05, 'epoch': 10.58}


 88%|████████▊ | 18200/20628 [1:12:53<38:31,  1.05it/s]

{'loss': 0.252, 'learning_rate': 2.3550513864649992e-05, 'epoch': 10.59}


 88%|████████▊ | 18220/20628 [1:13:12<38:19,  1.05it/s]

{'loss': 0.2382, 'learning_rate': 2.3356602675974404e-05, 'epoch': 10.6}


 88%|████████▊ | 18240/20628 [1:13:31<37:44,  1.05it/s]

{'loss': 0.2511, 'learning_rate': 2.3162691487298817e-05, 'epoch': 10.61}


 89%|████████▊ | 18260/20628 [1:13:50<37:29,  1.05it/s]

{'loss': 0.2517, 'learning_rate': 2.2968780298623232e-05, 'epoch': 10.62}


 89%|████████▊ | 18280/20628 [1:14:09<37:01,  1.06it/s]

{'loss': 0.2482, 'learning_rate': 2.2774869109947644e-05, 'epoch': 10.63}


 89%|████████▊ | 18300/20628 [1:14:27<36:24,  1.07it/s]

{'loss': 0.2485, 'learning_rate': 2.258095792127206e-05, 'epoch': 10.64}


 89%|████████▉ | 18320/20628 [1:14:46<36:17,  1.06it/s]

{'loss': 0.245, 'learning_rate': 2.2387046732596472e-05, 'epoch': 10.66}


 89%|████████▉ | 18340/20628 [1:15:05<36:16,  1.05it/s]

{'loss': 0.2404, 'learning_rate': 2.2193135543920884e-05, 'epoch': 10.67}


 89%|████████▉ | 18360/20628 [1:15:24<35:45,  1.06it/s]

{'loss': 0.2461, 'learning_rate': 2.19992243552453e-05, 'epoch': 10.68}


 89%|████████▉ | 18380/20628 [1:15:43<35:35,  1.05it/s]

{'loss': 0.2522, 'learning_rate': 2.1805313166569712e-05, 'epoch': 10.69}


 89%|████████▉ | 18400/20628 [1:16:02<35:07,  1.06it/s]

{'loss': 0.2436, 'learning_rate': 2.1611401977894124e-05, 'epoch': 10.7}


 89%|████████▉ | 18420/20628 [1:16:21<34:52,  1.05it/s]

{'loss': 0.2511, 'learning_rate': 2.141749078921854e-05, 'epoch': 10.71}


 89%|████████▉ | 18440/20628 [1:16:40<34:46,  1.05it/s]

{'loss': 0.2436, 'learning_rate': 2.122357960054295e-05, 'epoch': 10.73}


 89%|████████▉ | 18460/20628 [1:16:59<34:14,  1.06it/s]

{'loss': 0.2367, 'learning_rate': 2.1029668411867364e-05, 'epoch': 10.74}


 90%|████████▉ | 18480/20628 [1:17:18<34:00,  1.05it/s]

{'loss': 0.2591, 'learning_rate': 2.083575722319178e-05, 'epoch': 10.75}


 90%|████████▉ | 18500/20628 [1:17:37<33:41,  1.05it/s]

{'loss': 0.2482, 'learning_rate': 2.0641846034516195e-05, 'epoch': 10.76}


 90%|████████▉ | 18520/20628 [1:17:56<32:54,  1.07it/s]

{'loss': 0.2533, 'learning_rate': 2.0447934845840604e-05, 'epoch': 10.77}


 90%|████████▉ | 18540/20628 [1:18:15<33:08,  1.05it/s]

{'loss': 0.2454, 'learning_rate': 2.025402365716502e-05, 'epoch': 10.78}


 90%|████████▉ | 18560/20628 [1:18:34<32:33,  1.06it/s]

{'loss': 0.2493, 'learning_rate': 2.0060112468489435e-05, 'epoch': 10.8}


 90%|█████████ | 18580/20628 [1:18:53<32:29,  1.05it/s]

{'loss': 0.2379, 'learning_rate': 1.9866201279813847e-05, 'epoch': 10.81}


 90%|█████████ | 18600/20628 [1:19:12<32:09,  1.05it/s]

{'loss': 0.2398, 'learning_rate': 1.967229009113826e-05, 'epoch': 10.82}


 90%|█████████ | 18620/20628 [1:19:31<31:26,  1.06it/s]

{'loss': 0.2451, 'learning_rate': 1.9478378902462674e-05, 'epoch': 10.83}


 90%|█████████ | 18640/20628 [1:19:50<31:27,  1.05it/s]

{'loss': 0.2497, 'learning_rate': 1.9284467713787087e-05, 'epoch': 10.84}


 90%|█████████ | 18660/20628 [1:20:09<31:02,  1.06it/s]

{'loss': 0.2493, 'learning_rate': 1.90905565251115e-05, 'epoch': 10.85}


 91%|█████████ | 18680/20628 [1:20:28<30:48,  1.05it/s]

{'loss': 0.254, 'learning_rate': 1.8896645336435914e-05, 'epoch': 10.87}


 91%|█████████ | 18700/20628 [1:20:47<30:22,  1.06it/s]

{'loss': 0.2519, 'learning_rate': 1.8702734147760326e-05, 'epoch': 10.88}


 91%|█████████ | 18720/20628 [1:21:06<30:09,  1.05it/s]

{'loss': 0.2552, 'learning_rate': 1.850882295908474e-05, 'epoch': 10.89}


 91%|█████████ | 18740/20628 [1:21:25<29:46,  1.06it/s]

{'loss': 0.2598, 'learning_rate': 1.8314911770409154e-05, 'epoch': 10.9}


 91%|█████████ | 18760/20628 [1:21:44<30:06,  1.03it/s]

{'loss': 0.254, 'learning_rate': 1.8121000581733566e-05, 'epoch': 10.91}


 91%|█████████ | 18780/20628 [1:22:03<29:04,  1.06it/s]

{'loss': 0.2434, 'learning_rate': 1.7927089393057982e-05, 'epoch': 10.92}


 91%|█████████ | 18800/20628 [1:22:22<28:46,  1.06it/s]

{'loss': 0.2548, 'learning_rate': 1.7733178204382394e-05, 'epoch': 10.94}


 91%|█████████ | 18820/20628 [1:22:41<28:24,  1.06it/s]

{'loss': 0.2548, 'learning_rate': 1.7539267015706806e-05, 'epoch': 10.95}


 91%|█████████▏| 18840/20628 [1:22:59<28:11,  1.06it/s]

{'loss': 0.2436, 'learning_rate': 1.734535582703122e-05, 'epoch': 10.96}


 91%|█████████▏| 18860/20628 [1:23:18<27:51,  1.06it/s]

{'loss': 0.2441, 'learning_rate': 1.7151444638355634e-05, 'epoch': 10.97}


 92%|█████████▏| 18880/20628 [1:23:37<27:35,  1.06it/s]

{'loss': 0.2484, 'learning_rate': 1.6957533449680046e-05, 'epoch': 10.98}


 92%|█████████▏| 18900/20628 [1:23:56<27:10,  1.06it/s]

{'loss': 0.2465, 'learning_rate': 1.676362226100446e-05, 'epoch': 10.99}


                                                       
 92%|█████████▏| 18910/20628 [1:25:23<27:03,  1.06it/s]

{'eval_loss': 0.3878016173839569, 'eval_runtime': 76.3655, 'eval_samples_per_second': 22.51, 'eval_steps_per_second': 11.262, 'epoch': 11.0}


 92%|█████████▏| 18920/20628 [1:25:32<53:18,  1.87s/it]   

{'loss': 0.2472, 'learning_rate': 1.6569711072328874e-05, 'epoch': 11.01}


 92%|█████████▏| 18940/20628 [1:25:51<26:39,  1.06it/s]

{'loss': 0.238, 'learning_rate': 1.6375799883653286e-05, 'epoch': 11.02}


 92%|█████████▏| 18960/20628 [1:26:09<26:17,  1.06it/s]

{'loss': 0.2311, 'learning_rate': 1.61818886949777e-05, 'epoch': 11.03}


 92%|█████████▏| 18980/20628 [1:26:28<26:03,  1.05it/s]

{'loss': 0.2369, 'learning_rate': 1.5987977506302117e-05, 'epoch': 11.04}


 92%|█████████▏| 19000/20628 [1:26:47<25:42,  1.06it/s]

{'loss': 0.2413, 'learning_rate': 1.5794066317626526e-05, 'epoch': 11.05}


 92%|█████████▏| 19020/20628 [1:27:06<25:23,  1.06it/s]

{'loss': 0.2401, 'learning_rate': 1.560015512895094e-05, 'epoch': 11.06}


 92%|█████████▏| 19040/20628 [1:27:25<24:59,  1.06it/s]

{'loss': 0.2389, 'learning_rate': 1.5406243940275357e-05, 'epoch': 11.08}


 92%|█████████▏| 19060/20628 [1:27:44<24:41,  1.06it/s]

{'loss': 0.2359, 'learning_rate': 1.5212332751599767e-05, 'epoch': 11.09}


 92%|█████████▏| 19080/20628 [1:28:03<24:21,  1.06it/s]

{'loss': 0.2366, 'learning_rate': 1.5018421562924181e-05, 'epoch': 11.1}


 93%|█████████▎| 19100/20628 [1:28:22<24:05,  1.06it/s]

{'loss': 0.2441, 'learning_rate': 1.4824510374248596e-05, 'epoch': 11.11}


 93%|█████████▎| 19120/20628 [1:28:41<23:45,  1.06it/s]

{'loss': 0.2414, 'learning_rate': 1.4630599185573007e-05, 'epoch': 11.12}


 93%|█████████▎| 19140/20628 [1:29:00<23:32,  1.05it/s]

{'loss': 0.2361, 'learning_rate': 1.4436687996897422e-05, 'epoch': 11.13}


 93%|█████████▎| 19160/20628 [1:29:19<23:04,  1.06it/s]

{'loss': 0.2262, 'learning_rate': 1.4242776808221836e-05, 'epoch': 11.15}


 93%|█████████▎| 19180/20628 [1:29:38<22:51,  1.06it/s]

{'loss': 0.2314, 'learning_rate': 1.4048865619546248e-05, 'epoch': 11.16}


 93%|█████████▎| 19200/20628 [1:29:57<22:27,  1.06it/s]

{'loss': 0.2457, 'learning_rate': 1.3854954430870662e-05, 'epoch': 11.17}


 93%|█████████▎| 19220/20628 [1:30:16<22:15,  1.05it/s]

{'loss': 0.2312, 'learning_rate': 1.3661043242195076e-05, 'epoch': 11.18}


 93%|█████████▎| 19240/20628 [1:30:35<22:18,  1.04it/s]

{'loss': 0.2459, 'learning_rate': 1.3467132053519488e-05, 'epoch': 11.19}


 93%|█████████▎| 19260/20628 [1:30:54<22:01,  1.03it/s]

{'loss': 0.2318, 'learning_rate': 1.3273220864843902e-05, 'epoch': 11.2}


 93%|█████████▎| 19280/20628 [1:31:13<21:45,  1.03it/s]

{'loss': 0.2386, 'learning_rate': 1.3079309676168316e-05, 'epoch': 11.21}


 94%|█████████▎| 19300/20628 [1:31:32<20:54,  1.06it/s]

{'loss': 0.2495, 'learning_rate': 1.2885398487492728e-05, 'epoch': 11.23}


 94%|█████████▎| 19320/20628 [1:31:51<20:44,  1.05it/s]

{'loss': 0.242, 'learning_rate': 1.2691487298817142e-05, 'epoch': 11.24}


 94%|█████████▍| 19340/20628 [1:32:10<20:19,  1.06it/s]

{'loss': 0.24, 'learning_rate': 1.2497576110141556e-05, 'epoch': 11.25}


 94%|█████████▍| 19360/20628 [1:32:29<20:02,  1.05it/s]

{'loss': 0.2303, 'learning_rate': 1.230366492146597e-05, 'epoch': 11.26}


 94%|█████████▍| 19380/20628 [1:32:48<19:41,  1.06it/s]

{'loss': 0.2464, 'learning_rate': 1.2109753732790383e-05, 'epoch': 11.27}


 94%|█████████▍| 19400/20628 [1:33:07<19:23,  1.06it/s]

{'loss': 0.2383, 'learning_rate': 1.1915842544114795e-05, 'epoch': 11.28}


 94%|█████████▍| 19420/20628 [1:33:26<19:04,  1.06it/s]

{'loss': 0.2263, 'learning_rate': 1.172193135543921e-05, 'epoch': 11.3}


 94%|█████████▍| 19440/20628 [1:33:45<18:45,  1.06it/s]

{'loss': 0.2332, 'learning_rate': 1.1528020166763623e-05, 'epoch': 11.31}


 94%|█████████▍| 19460/20628 [1:34:04<18:23,  1.06it/s]

{'loss': 0.2396, 'learning_rate': 1.1334108978088035e-05, 'epoch': 11.32}


 94%|█████████▍| 19480/20628 [1:34:23<18:07,  1.06it/s]

{'loss': 0.2317, 'learning_rate': 1.114019778941245e-05, 'epoch': 11.33}


 95%|█████████▍| 19500/20628 [1:34:42<17:43,  1.06it/s]

{'loss': 0.2313, 'learning_rate': 1.0946286600736863e-05, 'epoch': 11.34}


 95%|█████████▍| 19520/20628 [1:35:01<17:29,  1.06it/s]

{'loss': 0.2372, 'learning_rate': 1.0752375412061277e-05, 'epoch': 11.35}


 95%|█████████▍| 19540/20628 [1:35:20<17:07,  1.06it/s]

{'loss': 0.2413, 'learning_rate': 1.055846422338569e-05, 'epoch': 11.37}


 95%|█████████▍| 19560/20628 [1:35:38<16:48,  1.06it/s]

{'loss': 0.233, 'learning_rate': 1.0364553034710103e-05, 'epoch': 11.38}


 95%|█████████▍| 19580/20628 [1:35:57<16:31,  1.06it/s]

{'loss': 0.2388, 'learning_rate': 1.0170641846034517e-05, 'epoch': 11.39}


 95%|█████████▌| 19600/20628 [1:36:16<16:13,  1.06it/s]

{'loss': 0.2294, 'learning_rate': 9.97673065735893e-06, 'epoch': 11.4}


 95%|█████████▌| 19620/20628 [1:36:35<15:52,  1.06it/s]

{'loss': 0.232, 'learning_rate': 9.782819468683344e-06, 'epoch': 11.41}


 95%|█████████▌| 19640/20628 [1:36:54<15:35,  1.06it/s]

{'loss': 0.2316, 'learning_rate': 9.588908280007756e-06, 'epoch': 11.42}


 95%|█████████▌| 19660/20628 [1:37:13<15:14,  1.06it/s]

{'loss': 0.2461, 'learning_rate': 9.39499709133217e-06, 'epoch': 11.44}


 95%|█████████▌| 19680/20628 [1:37:32<14:59,  1.05it/s]

{'loss': 0.2384, 'learning_rate': 9.201085902656584e-06, 'epoch': 11.45}


 96%|█████████▌| 19700/20628 [1:37:51<14:36,  1.06it/s]

{'loss': 0.2432, 'learning_rate': 9.007174713980996e-06, 'epoch': 11.46}


 96%|█████████▌| 19720/20628 [1:38:10<14:20,  1.05it/s]

{'loss': 0.2324, 'learning_rate': 8.813263525305412e-06, 'epoch': 11.47}


 96%|█████████▌| 19740/20628 [1:38:29<13:59,  1.06it/s]

{'loss': 0.2429, 'learning_rate': 8.619352336629824e-06, 'epoch': 11.48}


 96%|█████████▌| 19760/20628 [1:38:48<13:43,  1.05it/s]

{'loss': 0.2329, 'learning_rate': 8.425441147954236e-06, 'epoch': 11.49}


 96%|█████████▌| 19780/20628 [1:39:07<13:20,  1.06it/s]

{'loss': 0.231, 'learning_rate': 8.231529959278652e-06, 'epoch': 11.51}


 96%|█████████▌| 19800/20628 [1:39:26<13:05,  1.05it/s]

{'loss': 0.2299, 'learning_rate': 8.037618770603064e-06, 'epoch': 11.52}


 96%|█████████▌| 19820/20628 [1:39:45<12:43,  1.06it/s]

{'loss': 0.2296, 'learning_rate': 7.843707581927478e-06, 'epoch': 11.53}


 96%|█████████▌| 19840/20628 [1:40:03<12:26,  1.06it/s]

{'loss': 0.232, 'learning_rate': 7.649796393251891e-06, 'epoch': 11.54}


 96%|█████████▋| 19860/20628 [1:40:22<12:04,  1.06it/s]

{'loss': 0.2327, 'learning_rate': 7.4558852045763044e-06, 'epoch': 11.55}


 96%|█████████▋| 19880/20628 [1:40:41<11:49,  1.05it/s]

{'loss': 0.2303, 'learning_rate': 7.2619740159007174e-06, 'epoch': 11.56}


 96%|█████████▋| 19900/20628 [1:41:00<11:26,  1.06it/s]

{'loss': 0.2319, 'learning_rate': 7.07775838665891e-06, 'epoch': 11.58}


 97%|█████████▋| 19920/20628 [1:41:19<11:13,  1.05it/s]

{'loss': 0.2315, 'learning_rate': 6.883847197983324e-06, 'epoch': 11.59}


 97%|█████████▋| 19940/20628 [1:41:38<10:50,  1.06it/s]

{'loss': 0.2414, 'learning_rate': 6.689936009307737e-06, 'epoch': 11.6}


 97%|█████████▋| 19960/20628 [1:41:57<10:33,  1.05it/s]

{'loss': 0.2397, 'learning_rate': 6.49602482063215e-06, 'epoch': 11.61}


 97%|█████████▋| 19980/20628 [1:42:16<10:11,  1.06it/s]

{'loss': 0.2298, 'learning_rate': 6.302113631956565e-06, 'epoch': 11.62}


 97%|█████████▋| 20000/20628 [1:42:35<09:54,  1.06it/s]

{'loss': 0.2403, 'learning_rate': 6.108202443280978e-06, 'epoch': 11.63}


 97%|█████████▋| 20020/20628 [1:42:54<09:35,  1.06it/s]

{'loss': 0.2267, 'learning_rate': 5.9142912546053915e-06, 'epoch': 11.65}


 97%|█████████▋| 20040/20628 [1:43:13<09:16,  1.06it/s]

{'loss': 0.2236, 'learning_rate': 5.7203800659298045e-06, 'epoch': 11.66}


 97%|█████████▋| 20060/20628 [1:43:32<08:58,  1.05it/s]

{'loss': 0.2283, 'learning_rate': 5.5264688772542175e-06, 'epoch': 11.67}


 97%|█████████▋| 20080/20628 [1:43:51<08:38,  1.06it/s]

{'loss': 0.2333, 'learning_rate': 5.332557688578631e-06, 'epoch': 11.68}


 97%|█████████▋| 20100/20628 [1:44:10<08:19,  1.06it/s]

{'loss': 0.2513, 'learning_rate': 5.138646499903044e-06, 'epoch': 11.69}


 98%|█████████▊| 20120/20628 [1:44:28<08:01,  1.06it/s]

{'loss': 0.2342, 'learning_rate': 4.944735311227458e-06, 'epoch': 11.7}


 98%|█████████▊| 20140/20628 [1:44:47<07:42,  1.06it/s]

{'loss': 0.2303, 'learning_rate': 4.750824122551872e-06, 'epoch': 11.72}


 98%|█████████▊| 20160/20628 [1:45:06<07:22,  1.06it/s]

{'loss': 0.2274, 'learning_rate': 4.556912933876285e-06, 'epoch': 11.73}


 98%|█████████▊| 20180/20628 [1:45:25<07:03,  1.06it/s]

{'loss': 0.2475, 'learning_rate': 4.363001745200698e-06, 'epoch': 11.74}


 98%|█████████▊| 20200/20628 [1:45:44<06:44,  1.06it/s]

{'loss': 0.2329, 'learning_rate': 4.169090556525112e-06, 'epoch': 11.75}


 98%|█████████▊| 20220/20628 [1:46:03<06:26,  1.06it/s]

{'loss': 0.2422, 'learning_rate': 3.975179367849525e-06, 'epoch': 11.76}


 98%|█████████▊| 20240/20628 [1:46:22<06:06,  1.06it/s]

{'loss': 0.2358, 'learning_rate': 3.7812681791739386e-06, 'epoch': 11.77}


 98%|█████████▊| 20260/20628 [1:46:41<05:48,  1.06it/s]

{'loss': 0.2422, 'learning_rate': 3.587356990498352e-06, 'epoch': 11.78}


 98%|█████████▊| 20280/20628 [1:47:00<05:28,  1.06it/s]

{'loss': 0.2376, 'learning_rate': 3.393445801822765e-06, 'epoch': 11.8}


 98%|█████████▊| 20300/20628 [1:47:19<05:10,  1.06it/s]

{'loss': 0.2279, 'learning_rate': 3.199534613147179e-06, 'epoch': 11.81}


 99%|█████████▊| 20320/20628 [1:47:38<04:50,  1.06it/s]

{'loss': 0.2295, 'learning_rate': 3.005623424471592e-06, 'epoch': 11.82}


 99%|█████████▊| 20340/20628 [1:47:57<04:33,  1.05it/s]

{'loss': 0.2325, 'learning_rate': 2.8117122357960057e-06, 'epoch': 11.83}


 99%|█████████▊| 20360/20628 [1:48:15<04:14,  1.05it/s]

{'loss': 0.2297, 'learning_rate': 2.617801047120419e-06, 'epoch': 11.84}


 99%|█████████▉| 20380/20628 [1:48:34<03:55,  1.06it/s]

{'loss': 0.2303, 'learning_rate': 2.423889858444832e-06, 'epoch': 11.85}


 99%|█████████▉| 20400/20628 [1:48:53<03:35,  1.06it/s]

{'loss': 0.2413, 'learning_rate': 2.229978669769246e-06, 'epoch': 11.87}


 99%|█████████▉| 20420/20628 [1:49:12<03:17,  1.05it/s]

{'loss': 0.2347, 'learning_rate': 2.0360674810936594e-06, 'epoch': 11.88}


 99%|█████████▉| 20440/20628 [1:49:31<02:57,  1.06it/s]

{'loss': 0.2443, 'learning_rate': 1.8421562924180726e-06, 'epoch': 11.89}


 99%|█████████▉| 20460/20628 [1:49:50<02:38,  1.06it/s]

{'loss': 0.2447, 'learning_rate': 1.6482451037424862e-06, 'epoch': 11.9}


 99%|█████████▉| 20480/20628 [1:50:09<02:19,  1.06it/s]

{'loss': 0.2411, 'learning_rate': 1.4543339150668994e-06, 'epoch': 11.91}


 99%|█████████▉| 20500/20628 [1:50:28<02:01,  1.06it/s]

{'loss': 0.2407, 'learning_rate': 1.2604227263913128e-06, 'epoch': 11.92}


 99%|█████████▉| 20520/20628 [1:50:47<01:42,  1.06it/s]

{'loss': 0.2361, 'learning_rate': 1.0665115377157262e-06, 'epoch': 11.94}


100%|█████████▉| 20540/20628 [1:51:06<01:23,  1.06it/s]

{'loss': 0.2354, 'learning_rate': 8.726003490401396e-07, 'epoch': 11.95}


100%|█████████▉| 20560/20628 [1:51:25<01:04,  1.06it/s]

{'loss': 0.2391, 'learning_rate': 6.78689160364553e-07, 'epoch': 11.96}


100%|█████████▉| 20580/20628 [1:51:44<00:45,  1.06it/s]

{'loss': 0.2373, 'learning_rate': 4.847779716889665e-07, 'epoch': 11.97}


100%|█████████▉| 20600/20628 [1:52:03<00:26,  1.06it/s]

{'loss': 0.2368, 'learning_rate': 2.908667830133799e-07, 'epoch': 11.98}


100%|█████████▉| 20620/20628 [1:52:21<00:07,  1.06it/s]

{'loss': 0.2325, 'learning_rate': 9.695559433779329e-08, 'epoch': 11.99}


                                                       
100%|██████████| 20628/20628 [1:53:45<00:00,  1.06it/s]

{'eval_loss': 0.3978927731513977, 'eval_runtime': 76.2653, 'eval_samples_per_second': 22.54, 'eval_steps_per_second': 11.276, 'epoch': 12.0}


100%|██████████| 20628/20628 [1:53:46<00:00,  3.02it/s]

{'train_runtime': 6826.0362, 'train_samples_per_second': 24.183, 'train_steps_per_second': 3.022, 'train_loss': 0.08490708646604159, 'epoch': 12.0}





TrainOutput(global_step=20628, training_loss=0.08490708646604159, metrics={'train_runtime': 6826.0362, 'train_samples_per_second': 24.183, 'train_steps_per_second': 3.022, 'train_loss': 0.08490708646604159, 'epoch': 12.0})

In [2]:
# Export the LoRA-wrapped model and its tokenizer into one shared directory.
# NOTE(review): in the training log above, eval_loss went from 0.3878 (epoch 11)
# to 0.3979 (epoch 12) while the logged training loss kept drifting down; this
# cell saves whatever weights `model` currently holds -- confirm that this is
# the checkpoint you actually want to ship.
adapter_dir = "./lora-adapter"
model.save_pretrained(adapter_dir)
# Kept as the cell's final expression so the tuple of written tokenizer file
# paths is still displayed as the cell output.
tokenizer.save_pretrained(adapter_dir)


('./lora-adapter\\tokenizer_config.json',
 './lora-adapter\\special_tokens_map.json',
 './lora-adapter\\vocab.json',
 './lora-adapter\\merges.txt',
 './lora-adapter\\added_tokens.json',
 './lora-adapter\\tokenizer.json')