In [1]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

In [2]:
os.chdir("../")

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    model_name: str
    dataset_dir: Path
    model_save_to_dir: Path
    tokenizer_save_to_dir: Path

In [4]:
from pathlib import Path
from textSummarizer.constants import CONFIG_FILE, PARAM_FILE
from textSummarizer.utils import read_yaml


class ConfigManager:
    def __init__(self, config_file: Path = CONFIG_FILE, param_file: Path = PARAM_FILE):
        self.config = read_yaml(config_file)
        self.param = read_yaml(param_file)

        Path(self.config.artifact_root).mkdir(parents=True, exist_ok=True)
    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer

        return ModelTrainerConfig(
            model_name=config.model_name,
            dataset_dir=config.dataset_dir,
            model_save_to_dir=config.model_save_to_dir,
            tokenizer_save_to_dir=config.tokenizer_save_to_dir
            )
    
    def get_param(self):
        return self.param

In [5]:
from textSummarizer.logging import logger
from datasets import load_from_disk
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AutoTokenizer
from transformers import TrainingArguments, Trainer
import torch

class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"device: {device}")


        logger.info(f"Loading tokenizer {self.config.model_name}")
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        logger.info(f"Loading model {self.config.model_name}")
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_name).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
        
        #loading data 
        logger.info(f"Loading data")
        dataset_samsum_pt = load_from_disk(self.config.dataset_dir)

        # trainer_args = TrainingArguments(
        #     output_dir=self.config.root_dir, num_train_epochs=self.config.num_train_epochs, warmup_steps=self.config.warmup_steps,
        #     per_device_train_batch_size=self.config.per_device_train_batch_size, per_device_eval_batch_size=self.config.per_device_train_batch_size,
        #     weight_decay=self.config.weight_decay, logging_steps=self.config.logging_steps,
        #     evaluation_strategy=self.config.evaluation_strategy, eval_steps=self.config.eval_steps, save_steps=1e6,
        #     gradient_accumulation_steps=self.config.gradient_accumulation_steps
        # ) 

        self.config.model_save_to_dir.mkdir(parents=True, exist_ok=True)
        self.config.tokenizer_save_to_dir.mkdir(parents=True, exist_ok=True)

        trainer_args = TrainingArguments(
            output_dir=self.config.model_save_to_dir, num_train_epochs=1, warmup_steps=500,
            per_device_train_batch_size=1, per_device_eval_batch_size=1,
            weight_decay=0.01, logging_steps=10,
            evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
            gradient_accumulation_steps=16
        ) 

        trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["test"], 
                  eval_dataset=dataset_samsum_pt["validation"])
        
        logger.info(f"Training...")
        trainer.train()

        ## Save model
        model_pegasus.save_pretrained(self.config.model_save_to_dir)
        ## Save tokenizer
        tokenizer.save_pretrained(self.config.tokenizer_save_to_dir)

  from .autonotebook import tqdm as notebook_tqdm


2024-03-01 14:25:51,212 - INFO - config - PyTorch version 2.2.1 available.


In [6]:

config_manager = ConfigManager()
model_trainer_config = config_manager.get_model_trainer_config()
model_trainer = ModelTrainer(model_trainer_config)
model_trainer.train()

2024-03-01 14:26:19,329 - INFO - __init__ - Reading configs\config.yaml ......
2024-03-01 14:26:19,333 - INFO - __init__ - Reading params\param.yaml ......
2024-03-01 14:26:19,336 - INFO - 3924925554 - device: cpu
2024-03-01 14:26:19,338 - INFO - 3924925554 - Loading tokenizer google/pegasus-cnn_dailymail


2024-03-01 14:26:23,626 - INFO - 3924925554 - Loading model google/pegasus-cnn_dailymail


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2024-03-01 14:26:50,719 - INFO - 3924925554 - Loading data
2024-03-01 14:26:53,028 - INFO - 3924925554 - Training...


  0%|          | 0/51 [00:00<?, ?it/s]