In [1]:
import os

In [2]:
# Display the kernel's working directory before it is changed in the next cell.
%pwd

'/Users/deep2.usdadiya/Desktop/DS Project/Text-Summariser-Project/research'

In [3]:
# Step up from the notebook folder to the project root so that the `src.`
# package imported further down can be found relative to the working directory.
# The guard keeps the cell idempotent: once the working directory already
# contains `src`, re-running the cell no longer climbs another level.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
# Display the working directory again to confirm the change of directory.
%pwd

'/Users/deep2.usdadiya/Desktop/DS Project/Text-Summariser-Project'

In [5]:
# Request CPU-only execution through environment variables. They are set here,
# ahead of the cell that imports torch later in the notebook.
os.environ.update(
    CUDA_VISIBLE_DEVICES="-1",  # no valid device index, so no CUDA device is exposed
    # Intended to switch off Apple's MPS backend.
    # NOTE(review): confirm that PyTorch actually reads a variable with this name.
    PYTORCH_NO_MPS="1",
)

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``:
    the first three fields come from the ``model_trainer`` section of the
    config file, the remaining ones from the ``TrainingArguments`` section of
    the params file. ``frozen=True`` makes attribute assignment after
    construction raise.
    """
    # Output directory of the stage; the fine-tuned model and tokenizer are saved beneath it.
    # NOTE(review): annotated as Path, but the value is passed through from the
    # config file without conversion, so it is presumably a plain string.
    root_dir: Path
    # Location of the dataset handed to `load_from_disk`.
    data_path: Path
    # Checkpoint identifier handed to `from_pretrained` for both tokenizer and model.
    model_ckpt: Path
    # The fields below share their names with `TrainingArguments` keyword arguments.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [7]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Load the project's config and params YAML files and build stage configs.

    Both files are read once in the constructor through ``read_yaml`` and kept
    on the instance as ``self.config`` and ``self.params``; their sections are
    accessed as attributes.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path of the config YAML file.
            params_filepath: Path of the params YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer settings and create the stage's root directory.

        Paths and the checkpoint name come from the ``model_trainer`` section of
        the config file; hyperparameters come from the ``TrainingArguments``
        section of the params file.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            # Fixed: this used to be filled from `params.evaluation_strategy`,
            # so `eval_steps` carried the strategy value instead of the step count.
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

        return model_trainer_config

In [9]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2024-07-10 18:02:48,903: INFO: config: PyTorch version 2.3.1 available.]
[2024-07-10 18:02:48,905: INFO: config: TensorFlow version 2.16.1 available.]


In [11]:
import gc

class ModelTrainer:
    """Fine-tune a seq2seq checkpoint on a dataset stored on disk, on CPU only.

    The checkpoint name, dataset location, output directory and training
    hyperparameters are all taken from the ``ModelTrainerConfig`` passed to the
    constructor.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training configuration for later use by ``train``."""
        self.config = config

    # The three statements below execute once, when this class body is run
    # (i.e. when the cell is executed), not per instance.
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable CUDA
    torch.backends.mps.is_available = lambda: False  # Disable MPS
    device = torch.device("cpu") # type: ignore

    @staticmethod
    def _numeric_or(value, fallback):
        """Return ``value`` as a number, or ``fallback`` when it is not numeric.

        Values read from YAML may arrive as strings (for instance a number
        written in exponent notation), so numeric strings are converted with
        ``float``. Anything that cannot be converted is replaced by ``fallback``
        and a warning is emitted, so a bad setting is reported without aborting
        the run.
        """
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return value
        try:
            return float(value)
        except (TypeError, ValueError):
            import warnings

            warnings.warn(
                f"Ignoring non-numeric training setting {value!r}; using {fallback!r} instead."
            )
            return fallback

    def _build_training_args(self):
        """Translate the stored config into a ``TrainingArguments`` object."""
        cfg = self.config
        return TrainingArguments(
            output_dir=cfg.root_dir,  # type: ignore
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            # There is no separate eval batch size setting; reuse the train one.
            per_device_eval_batch_size=cfg.per_device_train_batch_size,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            # Fallbacks are the values this notebook previously hard-coded.
            eval_steps=self._numeric_or(cfg.eval_steps, 500),
            save_steps=self._numeric_or(cfg.save_steps, 1e6),
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            # Ask the Trainer for CPU explicitly instead of relying only on the
            # availability patches applied in the class body.
            use_cpu=True,
        )

    def train(self):
        """Fine-tune the model and save the resulting model and tokenizer.

        Loads tokenizer and model from ``config.model_ckpt``, loads the dataset
        from ``config.data_path``, trains on its ``"train"`` split while
        evaluating on ``"validation"``, then writes the model to
        ``<root_dir>/pegasus-samsum-model`` and the tokenizer to
        ``<root_dir>/tokenizer``.
        """
        device = torch.device("cpu")
        print(device)

        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # Dataset previously written to disk by an earlier pipeline stage.
        dataset_samsum_pt = load_from_disk(self.config.data_path)  # type: ignore

        trainer = Trainer(
            model=model_pegasus,
            args=self._build_training_args(),
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["train"],  # type: ignore
            eval_dataset=dataset_samsum_pt["validation"],  # type: ignore
        )

        trainer.train()

        # Persist the fine-tuned model and its tokenizer side by side.
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))

        # Release what can be released after the run.
        gc.collect()
        torch.cuda.empty_cache()

In [12]:
# Run the training stage end to end: load the configuration, build the
# trainer from it and start fine-tuning. Exceptions propagate unchanged; the
# former `try/except ... raise e` wrapper only re-raised them.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# Use a separate name for the trainer so `model_trainer_config` keeps
# referring to the configuration object.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-07-10 18:03:12,197: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-10 18:03:12,199: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-10 18:03:12,200: INFO: common: created directory at: artifacts]
[2024-07-10 18:03:12,200: INFO: common: created directory at: artifacts/model_trainer]
cpu


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 10/920 [09:36<13:33:58, 53.67s/it]

{'loss': 3.0679, 'grad_norm': 13.077805519104004, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  2%|▏         | 20/920 [3:50:24<518:45:13, 2075.01s/it]

{'loss': 2.9845, 'grad_norm': 10.075687408447266, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  3%|▎         | 30/920 [15:25:38<1333:00:36, 5391.95s/it]

{'loss': 3.0906, 'grad_norm': 16.274648666381836, 'learning_rate': 3e-06, 'epoch': 0.03}


  4%|▍         | 40/920 [16:27:02<58:03:49, 237.53s/it]   

{'loss': 3.0837, 'grad_norm': 26.513925552368164, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}


  5%|▌         | 50/920 [16:36:21<13:40:03, 56.56s/it] 

{'loss': 2.7693, 'grad_norm': 16.296735763549805, 'learning_rate': 5e-06, 'epoch': 0.05}


  7%|▋         | 60/920 [16:45:49<13:45:03, 57.56s/it]

{'loss': 2.8915, 'grad_norm': 163.62078857421875, 'learning_rate': 6e-06, 'epoch': 0.07}


  8%|▊         | 70/920 [16:55:05<12:54:17, 54.66s/it]

{'loss': 2.6269, 'grad_norm': 75.61419677734375, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}


  9%|▊         | 80/920 [17:03:26<11:40:13, 50.02s/it]

{'loss': 2.6226, 'grad_norm': 8.398561477661133, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.09}


 10%|▉         | 90/920 [17:12:14<11:37:53, 50.45s/it]

{'loss': 2.4781, 'grad_norm': 39.50974655151367, 'learning_rate': 9e-06, 'epoch': 0.1}


 11%|█         | 100/920 [17:20:35<11:24:06, 50.06s/it]

{'loss': 2.341, 'grad_norm': 7.485022068023682, 'learning_rate': 1e-05, 'epoch': 0.11}


 12%|█▏        | 110/920 [17:28:08<10:14:54, 45.55s/it]

{'loss': 2.336, 'grad_norm': 6.621525764465332, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.12}


 13%|█▎        | 120/920 [17:36:19<11:55:24, 53.66s/it]

{'loss': 2.2391, 'grad_norm': 13.85648250579834, 'learning_rate': 1.2e-05, 'epoch': 0.13}


 14%|█▍        | 130/920 [17:44:57<11:27:26, 52.21s/it]

{'loss': 2.2009, 'grad_norm': 9.296263694763184, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.14}


 15%|█▌        | 140/920 [17:54:52<13:33:12, 62.55s/it]

{'loss': 2.1205, 'grad_norm': 6.836944580078125, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.15}


 16%|█▋        | 150/920 [18:08:25<16:14:34, 75.94s/it]

{'loss': 2.1697, 'grad_norm': 14.46204948425293, 'learning_rate': 1.5e-05, 'epoch': 0.16}


 17%|█▋        | 160/920 [18:23:56<19:19:36, 91.55s/it] 

{'loss': 1.9912, 'grad_norm': 7.406050682067871, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.17}


 18%|█▊        | 170/920 [18:34:25<12:58:32, 62.28s/it]

{'loss': 1.9929, 'grad_norm': 14.570110321044922, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.18}


 20%|█▉        | 180/920 [18:45:20<13:18:36, 64.75s/it]

{'loss': 1.9025, 'grad_norm': 6.3453898429870605, 'learning_rate': 1.8e-05, 'epoch': 0.2}


 21%|██        | 190/920 [18:57:33<16:22:11, 80.73s/it]

{'loss': 1.8904, 'grad_norm': 11.752842903137207, 'learning_rate': 1.9e-05, 'epoch': 0.21}


 22%|██▏       | 200/920 [19:08:06<13:58:17, 69.86s/it]

{'loss': 1.8947, 'grad_norm': 23.61323356628418, 'learning_rate': 2e-05, 'epoch': 0.22}


 23%|██▎       | 210/920 [19:17:17<10:10:34, 51.60s/it]

{'loss': 1.8985, 'grad_norm': 4.822911262512207, 'learning_rate': 2.1e-05, 'epoch': 0.23}


 24%|██▍       | 220/920 [19:27:04<9:53:00, 50.83s/it] 

{'loss': 1.851, 'grad_norm': 4.942234039306641, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.24}


 25%|██▌       | 230/920 [19:35:00<9:00:56, 47.04s/it]

{'loss': 1.9562, 'grad_norm': 3.5223662853240967, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.25}


 26%|██▌       | 240/920 [19:43:11<8:21:25, 44.24s/it] 

{'loss': 1.7904, 'grad_norm': 4.321300983428955, 'learning_rate': 2.4e-05, 'epoch': 0.26}


 27%|██▋       | 250/920 [19:52:31<10:41:16, 57.43s/it]

{'loss': 1.8715, 'grad_norm': 6.981652736663818, 'learning_rate': 2.5e-05, 'epoch': 0.27}


 28%|██▊       | 260/920 [20:00:59<9:16:21, 50.58s/it] 

{'loss': 1.7719, 'grad_norm': 4.066737174987793, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.28}


 29%|██▉       | 270/920 [20:08:25<7:44:20, 42.86s/it]

{'loss': 1.7037, 'grad_norm': 5.956827163696289, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.29}


 30%|███       | 280/920 [20:15:36<7:33:13, 42.49s/it]

{'loss': 1.7878, 'grad_norm': 6.954625606536865, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.3}


 32%|███▏      | 290/920 [21:15:14<65:32:28, 374.52s/it] 

{'loss': 1.8629, 'grad_norm': 7.839517593383789, 'learning_rate': 2.9e-05, 'epoch': 0.31}


 33%|███▎      | 300/920 [21:23:39<9:51:34, 57.25s/it]  

{'loss': 1.6989, 'grad_norm': 4.996867656707764, 'learning_rate': 3e-05, 'epoch': 0.33}


 34%|███▎      | 310/920 [21:31:52<8:57:33, 52.87s/it]

{'loss': 1.8478, 'grad_norm': 5.22020149230957, 'learning_rate': 3.1e-05, 'epoch': 0.34}


 35%|███▍      | 320/920 [21:39:03<7:20:18, 44.03s/it]

{'loss': 1.897, 'grad_norm': 5.065990447998047, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.35}


 36%|███▌      | 330/920 [21:46:54<7:47:14, 47.52s/it]

{'loss': 1.8221, 'grad_norm': 5.771514415740967, 'learning_rate': 3.3e-05, 'epoch': 0.36}


 37%|███▋      | 340/920 [21:55:35<8:09:14, 50.61s/it]

{'loss': 1.7484, 'grad_norm': 3.9532201290130615, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.37}


 38%|███▊      | 350/920 [22:04:50<8:00:55, 50.62s/it]

{'loss': 1.7381, 'grad_norm': 3.4980626106262207, 'learning_rate': 3.5e-05, 'epoch': 0.38}


 39%|███▉      | 360/920 [22:13:03<7:47:45, 50.12s/it]

{'loss': 1.6284, 'grad_norm': 4.061013698577881, 'learning_rate': 3.6e-05, 'epoch': 0.39}


 40%|████      | 370/920 [22:20:00<6:04:09, 39.73s/it]

{'loss': 1.7108, 'grad_norm': 5.225276470184326, 'learning_rate': 3.7e-05, 'epoch': 0.4}


 41%|████▏     | 380/920 [22:27:11<6:48:03, 45.34s/it]

{'loss': 1.6921, 'grad_norm': 6.869014263153076, 'learning_rate': 3.8e-05, 'epoch': 0.41}


 42%|████▏     | 390/920 [22:35:43<7:04:37, 48.07s/it]

{'loss': 1.7122, 'grad_norm': 3.5152902603149414, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.42}


 43%|████▎     | 400/920 [22:45:24<8:13:00, 56.89s/it]

{'loss': 1.6989, 'grad_norm': 3.9363701343536377, 'learning_rate': 4e-05, 'epoch': 0.43}


 45%|████▍     | 410/920 [22:54:56<7:38:01, 53.88s/it]

{'loss': 1.712, 'grad_norm': 4.172395706176758, 'learning_rate': 4.1e-05, 'epoch': 0.45}


 46%|████▌     | 420/920 [23:05:13<8:28:47, 61.05s/it]

{'loss': 1.6282, 'grad_norm': 3.935276985168457, 'learning_rate': 4.2e-05, 'epoch': 0.46}


 47%|████▋     | 430/920 [23:15:38<7:19:17, 53.79s/it] 

{'loss': 1.7418, 'grad_norm': 4.53522253036499, 'learning_rate': 4.3e-05, 'epoch': 0.47}


 48%|████▊     | 440/920 [23:24:44<7:04:40, 53.08s/it]

{'loss': 1.7032, 'grad_norm': 9.44779109954834, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.48}


 49%|████▉     | 450/920 [23:33:41<6:35:24, 50.48s/it]

{'loss': 1.6577, 'grad_norm': 39.39320373535156, 'learning_rate': 4.5e-05, 'epoch': 0.49}


 50%|█████     | 460/920 [23:42:30<6:57:01, 54.40s/it]

{'loss': 1.7032, 'grad_norm': 5.545787811279297, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.5}


 51%|█████     | 470/920 [23:51:54<8:01:23, 64.18s/it]

{'loss': 1.7041, 'grad_norm': 4.64144229888916, 'learning_rate': 4.7e-05, 'epoch': 0.51}


 52%|█████▏    | 480/920 [24:01:52<7:21:50, 60.25s/it]

{'loss': 1.6186, 'grad_norm': 3.9699723720550537, 'learning_rate': 4.8e-05, 'epoch': 0.52}


 53%|█████▎    | 490/920 [26:30:43<245:56:43, 2059.08s/it]

{'loss': 1.6386, 'grad_norm': 4.266426086425781, 'learning_rate': 4.9e-05, 'epoch': 0.53}


 54%|█████▍    | 500/920 [29:32:33<34:13:29, 293.36s/it]  

{'loss': 1.678, 'grad_norm': 4.008818626403809, 'learning_rate': 5e-05, 'epoch': 0.54}


                                                        
 54%|█████▍    | 500/920 [29:38:19<34:13:29, 293.36s/it]

{'eval_loss': 1.4866605997085571, 'eval_runtime': 345.8869, 'eval_samples_per_second': 2.365, 'eval_steps_per_second': 2.365, 'epoch': 0.54}


 55%|█████▌    | 510/920 [29:49:53<9:12:50, 80.90s/it]  

{'loss': 1.672, 'grad_norm': 3.6708574295043945, 'learning_rate': 4.880952380952381e-05, 'epoch': 0.55}


 57%|█████▋    | 520/920 [39:56:28<107:31:32, 967.73s/it] 

{'loss': 1.6503, 'grad_norm': 7.060591220855713, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.56}


 58%|█████▊    | 530/920 [40:06:40<9:06:06, 84.02s/it]   

{'loss': 1.6918, 'grad_norm': 3.609449863433838, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.58}


 59%|█████▊    | 540/920 [40:44:41<13:44:49, 130.24s/it]

{'loss': 1.5727, 'grad_norm': 7.153774261474609, 'learning_rate': 4.523809523809524e-05, 'epoch': 0.59}


 60%|█████▉    | 550/920 [40:55:00<6:54:38, 67.24s/it]  

{'loss': 1.6768, 'grad_norm': 4.797276020050049, 'learning_rate': 4.404761904761905e-05, 'epoch': 0.6}


 61%|██████    | 560/920 [41:04:14<5:51:24, 58.57s/it]

{'loss': 1.6942, 'grad_norm': 10.093389511108398, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.61}


 62%|██████▏   | 570/920 [41:12:52<5:07:29, 52.71s/it]

{'loss': 1.7191, 'grad_norm': 3.8840103149414062, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.62}


 63%|██████▎   | 580/920 [41:23:07<6:44:10, 71.33s/it]

{'loss': 1.6286, 'grad_norm': 3.793567419052124, 'learning_rate': 4.047619047619048e-05, 'epoch': 0.63}


 64%|██████▍   | 590/920 [41:33:01<5:29:35, 59.93s/it]

{'loss': 1.5394, 'grad_norm': 3.8086516857147217, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.64}


 65%|██████▌   | 600/920 [41:41:35<4:56:06, 55.52s/it]

{'loss': 1.6616, 'grad_norm': 3.9121358394622803, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.65}


 66%|██████▋   | 610/920 [41:50:25<5:12:13, 60.43s/it]

{'loss': 1.5692, 'grad_norm': 3.995569944381714, 'learning_rate': 3.690476190476191e-05, 'epoch': 0.66}


 67%|██████▋   | 620/920 [42:00:04<4:42:38, 56.53s/it]

{'loss': 1.6383, 'grad_norm': 3.4003915786743164, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.67}


 68%|██████▊   | 630/920 [42:11:11<5:11:15, 64.40s/it]

{'loss': 1.6556, 'grad_norm': 4.898467063903809, 'learning_rate': 3.4523809523809526e-05, 'epoch': 0.68}


 70%|██████▉   | 640/920 [42:21:12<4:33:41, 58.65s/it]

{'loss': 1.6339, 'grad_norm': 4.076464653015137, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.7}


 71%|███████   | 650/920 [42:31:37<4:56:21, 65.86s/it]

{'loss': 1.5189, 'grad_norm': 3.7741665840148926, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.71}


 72%|███████▏  | 660/920 [42:41:07<3:53:30, 53.89s/it]

{'loss': 1.5714, 'grad_norm': 3.9458072185516357, 'learning_rate': 3.095238095238095e-05, 'epoch': 0.72}


 73%|███████▎  | 670/920 [42:49:49<3:48:44, 54.90s/it]

{'loss': 1.6042, 'grad_norm': 4.067460060119629, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.73}


 74%|███████▍  | 680/920 [42:59:50<3:43:41, 55.92s/it]

{'loss': 1.5909, 'grad_norm': 4.0966877937316895, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.74}


 75%|███████▌  | 690/920 [43:09:12<3:39:38, 57.30s/it]

{'loss': 1.5634, 'grad_norm': 3.813253164291382, 'learning_rate': 2.7380952380952383e-05, 'epoch': 0.75}


 76%|███████▌  | 700/920 [43:19:27<3:46:40, 61.82s/it]

{'loss': 1.6206, 'grad_norm': 8.298880577087402, 'learning_rate': 2.6190476190476192e-05, 'epoch': 0.76}


 77%|███████▋  | 710/920 [43:28:13<3:33:02, 60.87s/it]

{'loss': 1.5993, 'grad_norm': 3.7240750789642334, 'learning_rate': 2.5e-05, 'epoch': 0.77}


 78%|███████▊  | 720/920 [43:37:38<3:09:25, 56.83s/it]

{'loss': 1.5872, 'grad_norm': 2.9229023456573486, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.78}


 79%|███████▉  | 730/920 [43:48:11<3:24:26, 64.56s/it]

{'loss': 1.5444, 'grad_norm': 4.475892066955566, 'learning_rate': 2.261904761904762e-05, 'epoch': 0.79}


 80%|████████  | 740/920 [43:57:05<2:32:07, 50.71s/it]

{'loss': 1.6461, 'grad_norm': 3.105043649673462, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.8}


 82%|████████▏ | 750/920 [44:06:49<3:06:56, 65.98s/it]

{'loss': 1.5682, 'grad_norm': 3.5546247959136963, 'learning_rate': 2.023809523809524e-05, 'epoch': 0.81}


 83%|████████▎ | 760/920 [45:04:41<7:00:44, 157.78s/it] 

{'loss': 1.6099, 'grad_norm': 5.273390293121338, 'learning_rate': 1.9047619047619046e-05, 'epoch': 0.83}


 84%|████████▎ | 770/920 [45:14:24<2:28:13, 59.29s/it] 

{'loss': 1.5698, 'grad_norm': 4.740909099578857, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.84}


 85%|████████▍ | 780/920 [45:24:44<2:16:17, 58.41s/it]

{'loss': 1.5545, 'grad_norm': 6.458977222442627, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.85}


 86%|████████▌ | 790/920 [45:33:42<1:56:12, 53.64s/it]

{'loss': 1.5505, 'grad_norm': 3.833984136581421, 'learning_rate': 1.5476190476190476e-05, 'epoch': 0.86}


 87%|████████▋ | 800/920 [45:42:50<2:05:05, 62.55s/it]

{'loss': 1.6646, 'grad_norm': 4.7887349128723145, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.87}


 88%|████████▊ | 810/920 [45:52:09<1:40:04, 54.59s/it]

{'loss': 1.5519, 'grad_norm': 2.9417026042938232, 'learning_rate': 1.3095238095238096e-05, 'epoch': 0.88}


 89%|████████▉ | 820/920 [46:01:25<1:25:46, 51.46s/it]

{'loss': 1.6204, 'grad_norm': 4.036319732666016, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.89}


 90%|█████████ | 830/920 [46:09:31<1:09:04, 46.05s/it]

{'loss': 1.6358, 'grad_norm': 3.226853132247925, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.9}


 91%|█████████▏| 840/920 [46:18:20<1:05:52, 49.41s/it]

{'loss': 1.5193, 'grad_norm': 4.546329021453857, 'learning_rate': 9.523809523809523e-06, 'epoch': 0.91}


 92%|█████████▏| 850/920 [46:27:12<1:05:42, 56.32s/it]

{'loss': 1.5589, 'grad_norm': 6.568892478942871, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.92}


 93%|█████████▎| 860/920 [46:37:45<59:31, 59.53s/it]  

{'loss': 1.5612, 'grad_norm': 4.360992431640625, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.93}


 95%|█████████▍| 870/920 [46:48:31<55:27, 66.55s/it]  

{'loss': 1.5569, 'grad_norm': 3.095595598220825, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.94}


 96%|█████████▌| 880/920 [46:58:20<39:35, 59.39s/it]

{'loss': 1.5348, 'grad_norm': 5.766001224517822, 'learning_rate': 4.7619047619047615e-06, 'epoch': 0.96}


 97%|█████████▋| 890/920 [47:07:10<23:05, 46.20s/it]

{'loss': 1.522, 'grad_norm': 3.1132798194885254, 'learning_rate': 3.5714285714285714e-06, 'epoch': 0.97}


 98%|█████████▊| 900/920 [47:52:40<35:51, 107.59s/it]  

{'loss': 1.5709, 'grad_norm': 3.373673677444458, 'learning_rate': 2.3809523809523808e-06, 'epoch': 0.98}


 99%|█████████▉| 910/920 [48:03:41<10:13, 61.36s/it] 

{'loss': 1.5324, 'grad_norm': 4.052250862121582, 'learning_rate': 1.1904761904761904e-06, 'epoch': 0.99}


100%|██████████| 920/920 [48:14:27<00:00, 188.77s/it]
Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


{'loss': 1.5819, 'grad_norm': 3.3042690753936768, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 173667.0787, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.005, 'train_loss': 1.8291466567827308, 'epoch': 1.0}
