In [1]:
import os
# Step up one directory from wherever the kernel started.
# NOTE(review): presumably this makes the project root the working directory so
# that the `src` package and the relative config/artifact paths used by later
# cells resolve — confirm. The target is relative to the current directory, so
# running this cell a second time moves up one more level.
os.chdir('..')
# Show the resulting working directory.
%pwd

'd:\\coding\\ml\\text summer\\Text-summerizer'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Read-only settings bundle for the model-training stage.

    In this notebook instances are built by
    ``ConfigrationManager.get_model_trainer_config``: the first three fields
    are read from the ``model_trainer`` section of the config YAML and the
    remaining ones from the ``TrainingArguments`` section of the params YAML.
    """
    # --- from the ``model_trainer`` config section ---
    # Directory created by the config manager; ModelTrainer.train uses it as
    # the training output dir and as the parent of the saved model/tokenizer.
    root_dir: Path
    # Directory ModelTrainer.train reads ``train.pkl`` and ``test.pkl`` from.
    data_path: Path
    # Handed to ``from_pretrained`` for both the tokenizer and the model.
    model_ckpt: Path
    # --- from the ``TrainingArguments`` params section ---
    # NOTE(review): ModelTrainer.train in this notebook passes literal values
    # to TrainingArguments and does not read any of the fields below.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [3]:
from src.TextsummerizeProject.constants import *
from src.TextsummerizeProject.utils.common import read_yaml, Create_directory

class ConfigrationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Constructing an instance reads both YAML files and ensures the top-level
    artifacts directory exists.
    """

    def __init__(
            self, 
            config_path = CONFIGPATH, 
            params_path = PARAMSPATH
            ):
        """Read the config and params files and create the artifacts root.

        Args:
            config_path: Path of the main config YAML (defaults to ``CONFIGPATH``).
            params_path: Path of the params YAML (defaults to ``PARAMSPATH``).
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        print(self.config.artifacts_root)
        Create_directory([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the training-stage configuration.

        Creates the stage's ``root_dir`` as a side effect, then copies the
        ``model_trainer`` config values and the ``TrainingArguments`` params
        into a ``ModelTrainerConfig``.

        Returns:
            ModelTrainerConfig: the populated, frozen settings object.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        Create_directory([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt = config.model_ckpt,
            num_train_epochs = params.num_train_epochs,
            warmup_steps = params.warmup_steps,
            per_device_train_batch_size = params.per_device_train_batch_size,
            weight_decay = params.weight_decay,
            logging_steps = params.logging_steps,
            evaluation_strategy = params.evaluation_strategy,
            # Each field maps to the params key of the same name; eval_steps
            # must not be fed the evaluation_strategy string.
            eval_steps = params.eval_steps,
            save_steps = params.save_steps,
            gradient_accumulation_steps = params.gradient_accumulation_steps
        )

        return model_trainer_config

In [4]:
import pickle
import os

In [5]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import datasets
import torch
import pandas as pd
# NOTE(review): these package changes run after the imports earlier in this
# cell, and the recorded output says the kernel may need a restart before the
# updated packages are used. The recorded output also shows accelerate 0.21.0
# being uninstalled and then installed again at the same version; no versions
# are pinned here.
%pip install --upgrade accelerate
%pip uninstall -y tensorflow accelerate
%pip install transformers accelerate



  from .autonotebook import tqdm as notebook_tqdm


Note: you may need to restart the kernel to use updated packages.
Found existing installation: accelerate 0.21.0
Uninstalling accelerate-0.21.0:
  Successfully uninstalled accelerate-0.21.0
Note: you may need to restart the kernel to use updated packages.




Collecting accelerate
  Using cached accelerate-0.21.0-py3-none-any.whl (244 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
class ModelTrainer:
    """Fine-tune a seq2seq checkpoint on the pickled train/test splits."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training configuration for later use by ``train``.

        Args:
            config: Settings object; ``train`` reads its ``model_ckpt``,
                ``data_path`` and ``root_dir`` attributes.
        """
        self.config = config

    def train(self):
        """Load data and checkpoint, run one training pass, save the results.

        Reads ``train.pkl`` and ``test.pkl`` from ``config.data_path``, keeps
        the first 900 training rows and the first 200 test rows, trains with a
        Hugging Face ``Trainer``, then writes the model and tokenizer under
        ``config.root_dir``.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_scibert = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_scibert)

        # Loading data. The context managers close each file handle as soon as
        # it has been read instead of leaving it to the garbage collector.
        # SECURITY: pickle.load can execute arbitrary code; only point
        # data_path at files produced by this project's own pipeline.
        path_train = os.path.join(self.config.data_path,'train.pkl')
        path_test = os.path.join(self.config.data_path,'test.pkl')
        with open(path_train,'rb') as train_fh:
            trainfile = pickle.load(train_fh)
        with open(path_test,'rb') as test_fh:
            testfile = pickle.load(test_fh)

        # Only a row-slice of each split is used for training/evaluation.
        trainset = datasets.Dataset.from_pandas(pd.DataFrame(data=trainfile)[:900])
        testset = datasets.Dataset.from_pandas(pd.DataFrame(data=testfile)[:200])

        # NOTE(review): these hyperparameters are literals; the matching
        # fields on self.config (num_train_epochs, warmup_steps, ...) are not
        # consulted here — confirm whether that is intentional.
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir, 
            num_train_epochs=1, warmup_steps=500,
            per_device_train_batch_size=1, per_device_eval_batch_size=1,
            weight_decay=0.01, logging_steps=5,
            evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
            gradient_accumulation_steps=16,
        ) 

        trainer = Trainer(model=model_scibert, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  eval_dataset=testset,
                  train_dataset=trainset)
        
        trainer.train()

        ## Save model
        model_scibert.save_pretrained(os.path.join(self.config.root_dir,"scibert-ML-DL-model"))
        ## Save tokenizer
        tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer"))

In [7]:
# Run the training stage end to end: load settings, build the trainer, train.
# Any exception propagates unchanged to the notebook.
config = ConfigrationManager()
model_trainer_config = config.get_model_trainer_config()
# Keep the trainer under its own name so the config object is not overwritten.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[INFO: common: Yaml file <_io.TextIOWrapper name='config\\config.yaml' mode='r' encoding='UTF-8'> read Successfully]
[INFO: common: Yaml file <_io.TextIOWrapper name='params.yaml' mode='r' encoding='UTF-8'> read Successfully]
artifacts
[INFO: common: Directory artifacts created Successfully]
[INFO: common: Directory artifacts/model_trainer created Successfully]


  0%|          | 0/56 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  9%|▉         | 5/56 [09:22<1:35:44, 112.64s/it]

{'loss': 33.0127, 'learning_rate': 4e-05, 'epoch': 0.09}


 18%|█▊        | 10/56 [19:14<1:33:58, 122.57s/it]

{'loss': 29.7236, 'learning_rate': 8e-05, 'epoch': 0.18}


 27%|██▋       | 15/56 [27:59<1:10:56, 103.81s/it]

{'loss': 25.3527, 'learning_rate': 0.00012, 'epoch': 0.27}


 36%|███▌      | 20/56 [35:58<58:36, 97.69s/it]   

{'loss': 19.1857, 'learning_rate': 0.00016, 'epoch': 0.36}


 45%|████▍     | 25/56 [43:38<47:21, 91.67s/it]

{'loss': 9.7689, 'learning_rate': 0.0002, 'epoch': 0.44}


 54%|█████▎    | 30/56 [52:52<47:25, 109.43s/it]

{'loss': 4.8885, 'learning_rate': 0.00024, 'epoch': 0.53}


 62%|██████▎   | 35/56 [1:01:21<35:09, 100.46s/it]

{'loss': 3.7525, 'learning_rate': 0.00028000000000000003, 'epoch': 0.62}


 71%|███████▏  | 40/56 [1:09:20<25:32, 95.77s/it] 

{'loss': 1.6734, 'learning_rate': 0.00032, 'epoch': 0.71}


 80%|████████  | 45/56 [1:17:32<18:17, 99.78s/it]

{'loss': 0.7306, 'learning_rate': 0.00035999999999999997, 'epoch': 0.8}


 89%|████████▉ | 50/56 [1:25:39<09:57, 99.58s/it]

{'loss': 0.5738, 'learning_rate': 0.0004, 'epoch': 0.89}


 98%|█████████▊| 55/56 [1:33:35<01:35, 95.48s/it]

{'loss': 0.5098, 'learning_rate': 0.00044, 'epoch': 0.98}


100%|██████████| 56/56 [1:35:11<00:00, 102.00s/it]


{'train_runtime': 5711.6934, 'train_samples_per_second': 0.158, 'train_steps_per_second': 0.01, 'train_loss': 11.543784795062882, 'epoch': 1.0}


In [None]:
# Scratch cell: load the pickled test split and wrap it in a tf.data.Dataset.
# NOTE(review): `tf` is not imported in any visible cell, and the %pip cell
# earlier in this notebook uninstalls tensorflow — confirm before running.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
test_pkl_path = os.path.join('artifacts', 'data_transformation', 'test.pkl')
# Close the file handle as soon as the pickle has been read.
with open(test_pkl_path, 'rb') as test_fh:
    testfile = pickle.load(test_fh)
x_test ={'input_ids':testfile['input_ids'], 'attention_mask':testfile['attention_mask']}
y_test ={'input_ids':testfile['labels']}
testset = tf.data.Dataset.from_tensor_slices((dict(x_test),y_test))

In [None]:
# Print the `testset` object built in the previous cell.
print(testset)

In [None]:
# NOTE(review): in the visible cells `trainfile` is only assigned as a local
# inside ModelTrainer.train and in commented-out lines — confirm it exists at
# notebook scope before running this cell.
y_train = {"label":trainfile['labels']}


In [None]:
# Display the dict assigned to `y_train` in the previous cell.
y_train