In [2]:
# `os` is needed by later cells (changing directory, joining output paths).
import os 
# Show the kernel's current working directory.
%pwd

'/home/prathameshdevadiga/personal/projs/TextSummarization/research'

In [3]:
os.chdir("../")
%pwd

'/home/prathameshdevadiga/personal/projs/TextSummarization'

In [4]:
from dataclasses import dataclass 
from pathlib import Path 

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Read-only settings for the model-training stage.

    ``ConfigurationManager.get_model_trainer_config`` fills the three
    location fields from the ``model_trainer`` section of the config file and
    the remaining fields from the ``TrainingArguments`` section of the params
    file. Instances are frozen, so fields cannot be reassigned after creation.
    """

    # --- locations -------------------------------------------------------
    # Stage directory; ModelTrainer uses it as the training output directory
    # and saves the model and tokenizer beneath it.
    root_dir: Path
    # Location handed to `load_from_disk` to read the dataset.
    data_path: Path
    # Value handed to `from_pretrained`.
    # NOTE(review): annotated as Path, but the recorded run logged the
    # checkpoint as "google/pegasus-cnn_dailymail", which looks like a hub
    # identifier rather than a filesystem path - confirm the intended type.
    model_ckpt: Path

    # --- hyperparameters (each named after a TrainingArguments keyword) --
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [5]:
from textSummarizer.constants import * 
from textSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Loads the YAML config and params files and builds per-stage configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file read into ``self.config``.
            params_filepath: YAML file read into ``self.params``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Create the top-level artifacts directory named in the config file.
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the trainer's directory and return its ``ModelTrainerConfig``.

        Location fields are taken from ``self.config.model_trainer``; the
        hyperparameters are taken from ``self.params.TrainingArguments``.
        """
        trainer_section = self.config.model_trainer
        training_args = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        # Hyperparameters are copied across one-to-one under the same names.
        hyperparameter_names = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )
        hyperparameters = {
            name: getattr(training_args, name) for name in hyperparameter_names
        }

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            model_ckpt=trainer_section.model_ckpt,
            **hyperparameters,
        )


In [8]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 
from datasets import load_dataset, load_from_disk 
import torch 

In [9]:
class ModelTrainer:
    """Fine-tunes the configured seq2seq checkpoint and saves the result."""

    def __init__(self, config: "ModelTrainerConfig"):
        """Store the training-stage configuration used by ``train``."""
        self.config = config

    def _training_arguments_kwargs(self) -> dict:
        """Build the ``TrainingArguments`` keyword arguments from ``self.config``.

        Every hyperparameter comes from the configuration rather than from
        literals in this class, so editing the params file actually changes
        the training run.
        """
        cfg = self.config
        return dict(
            output_dir=cfg.root_dir,
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            # The config carries no separate eval batch size; reuse the train one.
            per_device_eval_batch_size=cfg.per_device_train_batch_size,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            eval_steps=cfg.eval_steps,
            # Declared as float in the config class. Coerce explicitly because
            # a YAML loader may deliver an exponent literal such as 1e6 as a
            # string (NOTE(review): confirm against the params file).
            save_steps=float(cfg.save_steps),
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        )

    def train(self):
        """Fine-tune the model and write model and tokenizer under ``root_dir``.

        Loads tokenizer and model from ``config.model_ckpt``, moves the model
        to CUDA when available (CPU otherwise), reads the dataset from
        ``config.data_path``, trains with ``Trainer``, then saves the model to
        ``<root_dir>/pegasus-samsum-model`` and the tokenizer to
        ``<root_dir>/tokenizer``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        dataset_samsum_pt = load_from_disk(self.config.data_path)

        trainer_args = TrainingArguments(**self._training_arguments_kwargs())

        # Deliberately trains on the "test" split instead of "train" because of
        # limited compute (presumably it is the smaller split - confirm).
        # Switch train_dataset to the "train" split for a real run.
        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["test"],
            eval_dataset=dataset_samsum_pt["validation"],
        )

        trainer.train()

        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))
        

In [10]:
# Empty PyTorch's CUDA memory cache before the training run further down.
# NOTE(review): the recorded run at the end of this notebook still stopped
# with a CUDA out-of-memory error, so this call alone did not prevent it.
torch.cuda.empty_cache()

In [12]:
pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 265.7/265.7 kB 1.1 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
import os

# PyTorch expects "<option>:<value>" pairs (comma-separated) in this variable;
# the option and its value are joined by a colon, not an equals sign.
# NOTE(review): the setting is read when PyTorch sets up its CUDA allocator,
# so run this cell before any cell that touches the GPU (restart the kernel
# if torch has already used CUDA) - confirm against the PyTorch docs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

In [17]:
pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [18]:
# Run the training stage end to end: load configuration, build the trainer,
# train. Each object gets its own name so that `model_trainer_config` keeps
# referring to the config rather than being rebound to the trainer.
# There is no try/except wrapper: a handler that only re-raises adds nothing,
# so any error surfaces directly with its original traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-01-04 22:46:41,477 : INFO : common: yaml file : config/config.yaml is loaded successfully. ]
[2024-01-04 22:46:41,490 : INFO : common: yaml file : params.yaml is loaded successfully. ]
[2024-01-04 22:46:41,493 : INFO : common: creating directory at artifacts]
[2024-01-04 22:46:41,496 : INFO : common: creating directory at artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacty of 3.81 GiB of which 13.12 MiB is free. Including non-PyTorch memory, this process has 3.79 GiB memory in use. Of the allocated memory 3.74 GiB is allocated by PyTorch, and 220.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF