## Model Training

In [1]:
import os
%pwd
os.chdir("../")
%pwd


'd:\\Data Science\\END to END Proj\\QA-BOT'

In [8]:
from dataclasses import dataclass
from pathlib import Path
import os
import torch
from torch.utils.data import DataLoader, Dataset  # Added Dataset import
from torch.optim import AdamW
from tqdm import tqdm
from src.QABOT import logger
import shutil
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast  # Added tokenizer import


In [9]:

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    ConfigurationManager.get_model_trainer_config builds one from the
    `model_trainer` section of the config file and the `Training` section of
    the params file.
    """
    # Stage working directory; created by ConfigurationManager before use.
    root_dir: Path
    # Directory the fine-tuned model is written to; ModelTrainer.train deletes
    # and recreates it before a fresh training run.
    model_save_dir: Path
    # Path whose existence (together with tokenizer_save_dir) makes
    # ModelTrainer.train skip training.
    trained_model_path: Path
    # Directory the tokenizer is saved to.
    tokenizer_save_dir: Path
    # Number of full passes over the training DataLoader.
    num_train_epochs: int
    # Batch size passed to the training DataLoader.
    batch_size: int
    # Learning rate passed to the AdamW optimizer.
    learning_rate: float

In [10]:
from src.QABOT.constant import *
from src.QABOT.utils.common import read_yaml,create_directories 
import json
from src.QABOT import logger


In [19]:
from src.QABOT.constant import *
from src.QABOT.utils.common import read_yaml,create_directories 
from src.QABOT.entity.config_entity import DataIngestionConfig,DataValidationConfig,DataTransformationConfig
class ConfigurationManager:
    """Turn the project's YAML settings into per-stage config objects.

    Construction loads the main config and the params file via ``read_yaml``
    and creates the artifacts root directory. Each ``get_*_config`` method
    creates its stage's ``root_dir`` and returns the matching config entity.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the hyper-parameter YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the data-ingestion settings, creating the stage directory first."""
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        # Values are forwarded exactly as loaded (no Path conversion here).
        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Return the data-validation settings, creating the stage directory first."""
        section = self.config.data_validation
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=Path(section.root_dir),
            STATUS_FILE=Path(section.STATUS_FILE),
            TRAIN_FILE=Path(section.TRAIN_FILE),
            DEV_FILE=Path(section.DEV_FILE),
            REQUIRED_KEYS=section.REQUIRED_KEYS,
            MIN_EXAMPLES=section.MIN_EXAMPLES,
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Return the data-transformation settings, creating the stage directory first.

        File locations come from the config file; ``MAX_LEN`` and ``DOC_STRIDE``
        come from the ``Transformation`` section of the params file.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            TRAIN_FILE=Path(section.TRAIN_FILE),
            DEV_FILE=Path(section.DEV_FILE),
            MAX_LEN=self.params.Transformation.MAX_LEN,
            DOC_STRIDE=self.params.Transformation.DOC_STRIDE,
            PARA_VECTORIZER_FILE=Path(section.PARA_VECTORIZER_FILE),
            SENT_VECTORIZER_FILE=Path(section.SENT_VECTORIZER_FILE),
            PARAGRAPHS_FILE=Path(section.PARAGRAPHS_FILE),
            SENTENCES_FILE=Path(section.SENTENCES_FILE),
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Return the model-training settings, creating the stage directory first.

        Paths come from the ``model_trainer`` section of the config file and the
        hyper-parameters from the ``Training`` section of the params file.
        """
        section = self.config.model_trainer
        hyperparams = self.params.Training

        create_directories([section.root_dir])

        return ModelTrainerConfig(
            root_dir=Path(section.root_dir),
            model_save_dir=Path(section.model_save_dir),
            trained_model_path=Path(section.trained_model_path),
            tokenizer_save_dir=Path(section.tokenizer_save_dir),
            num_train_epochs=hyperparams.num_train_epochs,
            batch_size=hyperparams.batch_size,
            learning_rate=hyperparams.learning_rate,
        )


In [20]:



class ModelTrainer:
    """Fine-tune DistilBERT for question answering and save model and tokenizer.

    Training is skipped when both ``config.trained_model_path`` and
    ``config.tokenizer_save_dir`` already exist on disk.
    """

    def __init__(
        self,
        config: "ModelTrainerConfig",
        train_dataset: Dataset,
        eval_dataset: Dataset = None
    ):
        """Store the settings and datasets and pick the compute device.

        Args:
            config: Training settings (paths, epochs, batch size, learning rate).
            train_dataset: Dataset whose collated batches are dicts of tensors
                that can be passed to the model as keyword arguments.
            eval_dataset: Optional evaluation dataset; stored but not used by
                ``train``.
        """
        self.config = config
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _check_model_exists(self) -> bool:
        """Return True when both the trained-model path and the tokenizer dir exist."""
        return (
            os.path.exists(self.config.trained_model_path) and
            os.path.exists(self.config.tokenizer_save_dir)
        )

    def train(self):
        """Fine-tune the model on ``train_dataset`` and persist the artifacts.

        Does nothing when the artifacts already exist. Otherwise wipes and
        recreates ``config.model_save_dir``, trains for
        ``config.num_train_epochs`` epochs with AdamW, then saves the model to
        ``config.model_save_dir`` and the tokenizer to
        ``config.tokenizer_save_dir``.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            # Skip training if model exists
            if self._check_model_exists():
                logger.info("Model already trained and saved. Skipping training.")
                return

            logger.info("Starting model training...")

            # Create fresh output directory
            if os.path.exists(self.config.model_save_dir):
                shutil.rmtree(self.config.model_save_dir)
            os.makedirs(self.config.model_save_dir, exist_ok=True)

            base_checkpoint = "distilbert-base-uncased"

            # Load the tokenizer up front: the original code saved an undefined
            # `tokenizer` name and crashed with NameError only after all epochs
            # had finished.
            # NOTE(review): assumes the datasets were tokenized with this same
            # checkpoint's tokenizer -- confirm against the transformation step.
            tokenizer = DistilBertTokenizerFast.from_pretrained(base_checkpoint)

            # Initialize model
            model = DistilBertForQuestionAnswering.from_pretrained(
                base_checkpoint
            ).to(self.device)

            # Setup training
            train_loader = DataLoader(
                self.train_dataset,
                batch_size=self.config.batch_size,
                shuffle=True
            )

            optimizer = AdamW(
                model.parameters(),
                lr=self.config.learning_rate
            )

            # Training loop
            model.train()
            for epoch in range(self.config.num_train_epochs):
                loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{self.config.num_train_epochs}")
                for batch in loop:
                    optimizer.zero_grad()
                    # Move every tensor of the batch to the model's device.
                    batch = {k: v.to(self.device) for k, v in batch.items()}
                    outputs = model(**batch)
                    loss = outputs.loss
                    loss.backward()
                    optimizer.step()
                    loop.set_postfix(loss=loss.item())

            # Save model and tokenizer
            # NOTE(review): the skip check looks at trained_model_path while the
            # model is written to model_save_dir -- confirm the former lies
            # inside the latter, otherwise training is never skipped.
            model.save_pretrained(self.config.model_save_dir)
            tokenizer.save_pretrained(self.config.tokenizer_save_dir)
            logger.info(f" Model trained and saved to {self.config.model_save_dir}")

        except Exception as e:
            logger.error(f"Model training failed: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise


In [21]:
from src.QABOT.components.data_transformation import DataTransformation

# Driver for the model-training stage: build the configs, prepare the training
# dataset, then train. Any failure is logged and re-raised.
if __name__ == "__main__":
    try:
        # Initialize configuration (constructing the manager reads the YAML
        # files and creates the artifacts root directory)
        config_manager = ConfigurationManager()
        trainer_config = config_manager.get_model_trainer_config()
        
        # Load datasets by re-running the transformation step; transform()
        # returns a pair and only its first element (the training set) is used.
        # NOTE(review): this runs before ModelTrainer.train() decides whether to
        # skip, so the transformation cost is paid even when training is skipped.
        transformer = DataTransformation(config_manager.get_data_transformation_config())
        train_dataset, _ = transformer.transform()
        
        # Train model (a no-op when the trained artifacts already exist)
        trainer = ModelTrainer(
            config=trainer_config,
            train_dataset=train_dataset
        )
        trainer.train()

    except Exception as e:
        # Log at pipeline level, then propagate so the cell visibly fails.
        logger.error(f"Training pipeline failed: {str(e)}")
        raise e

[2025-08-13 16:58:05,024: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-08-13 16:58:05,028: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-13 16:58:05,031: INFO: common: created directory at: artifacts]
[2025-08-13 16:58:05,036: INFO: common: created directory at: artifacts/model_trainer]
[2025-08-13 16:58:05,041: INFO: common: created directory at: artifacts/data_transformation]




[2025-08-13 16:58:08,476: INFO: data_transformation: Flattened 87599 train and 10570 dev examples]
[2025-08-13 16:58:17,387: INFO: data_transformation: Built TF-IDF retriever with 20958 paragraphs and 104034 sentences]


Tokenizing examples: 100%|██████████| 87599/87599 [02:00<00:00, 726.83it/s] 
Tokenizing examples: 100%|██████████| 10570/10570 [00:16<00:00, 657.46it/s]

[2025-08-13 17:00:34,015: INFO: data_transformation: Prepared 87790 train and 10616 dev features]





[2025-08-13 17:00:35,654: INFO: 4066968275: Model already trained and saved. Skipping training.]
