## Model Training

In [1]:
import os
# Show the starting working directory, move one level up, then show it again.
# The relative paths used later ("config/config.yaml", "params.yaml") are
# resolved against the directory this cell switches to.
%pwd
os.chdir("../")
%pwd


'd:\\Data Science\\END to END Proj\\NVDNLP'

In [2]:
# ============================================
#     ENTITY: MODEL TRAINER CONFIG
# ============================================

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    ``ConfigurationManager.get_model_trainer_config`` fills the path fields
    from the ``model_trainer`` section of the config file and the remaining
    fields from the ``XGBoost`` section of the params file. ``ModelTrainer``
    forwards the hyperparameter fields unchanged to ``xgb.XGBClassifier``.
    """
    # Stage directory; ModelTrainer writes training_info.txt here.
    root_dir: Path
    # Directory ModelTrainer.save_model creates before dumping the model.
    model_dir: Path
    # File the fitted model is dumped to; its existence makes training skip.
    trained_model_path: Path
    # --- Hyperparameters passed as keyword arguments to xgb.XGBClassifier ---
    n_estimators: int
    learning_rate: float
    max_depth: int
    subsample: float
    colsample_bytree: float
    random_state: int
    tree_method: str
    eval_metric: str
    early_stopping_rounds: int

In [3]:
# ============================================
# ⚙️ CONFIGURATION MANAGER
# ============================================

from src.NVDNLP.utils.common import read_yaml, create_directories 
# from src.NVDNLP.entity.config_entity import ModelTrainerConfig

class ConfigurationManager:
    """Read the project YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = "config/config.yaml",
        params_filepath = "params.yaml",
    ):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the YAML file with paths per stage.
            params_filepath: Location of the YAML file with hyperparameters.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Paths come from the ``model_trainer`` section of the config file and
        hyperparameters from the ``XGBoost`` section of the params file. The
        stage's ``root_dir`` is created as a side effect.

        Returns:
            A populated, frozen ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        xgb_section = self.params.XGBoost

        create_directories([trainer_section.root_dir])

        # Path-like entries are wrapped in Path; lookup order matches the
        # field order of ModelTrainerConfig.
        path_fields = {
            field_name: Path(getattr(trainer_section, field_name))
            for field_name in ("root_dir", "model_dir", "trained_model_path")
        }

        # Hyperparameters are copied through untouched.
        hyperparameter_fields = {
            field_name: getattr(xgb_section, field_name)
            for field_name in (
                "n_estimators",
                "learning_rate",
                "max_depth",
                "subsample",
                "colsample_bytree",
                "random_state",
                "tree_method",
                "eval_metric",
                "early_stopping_rounds",
            )
        }

        return ModelTrainerConfig(**path_fields, **hyperparameter_fields)

In [4]:
# ============================================
# 🤖 MODEL TRAINER COMPONENT
# ============================================

import os
import joblib
import pandas as pd
import xgboost as xgb
# from src.NVDNLP.entity.config_entity import ModelTrainerConfig
from src.NVDNLP import logger

class ModelTrainer:
    """Train an XGBoost classifier on TF-IDF text features and persist it.

    The trainer vectorizes the 'Description' column of the supplied frames
    with an already-fitted vectorizer, fits ``xgb.XGBClassifier`` against the
    'encoded_severity' column, and dumps the fitted model with joblib. If a
    file already exists at ``config.trained_model_path`` the whole run is
    skipped.
    """

    def __init__(self, config: ModelTrainerConfig, train_data, test_data, label_encoder, tfidf_vectorizer):
        """Store the stage configuration and the transformation artifacts.

        Args:
            config: Output paths and XGBoost hyperparameters.
            train_data: Frame indexed by 'Description' and 'encoded_severity'
                for fitting.
            test_data: Frame with the same two columns, used as the
                evaluation set during fitting.
            label_encoder: Kept on the instance; not used by this class.
            tfidf_vectorizer: Object whose ``transform`` turns a sequence of
                strings into a feature matrix.
        """
        self.config = config
        self.train_data = train_data
        self.test_data = test_data
        self.label_encoder = label_encoder
        self.tfidf_vectorizer = tfidf_vectorizer
        self.model = None

    def _check_model_exists(self) -> bool:
        """Check if model already exists in artifacts/model_training"""
        return os.path.exists(self.config.trained_model_path)

    def prepare_data(self):
        """Vectorize the text column and pull out the label column.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` where the X values
            are the vectorizer outputs and the y values are the
            'encoded_severity' columns.

        Raises:
            Exception: Any error from column lookup or vectorization is
                logged and re-raised unchanged.
        """
        try:
            logger.info(" Preparing data for model training...")

            # Extract features and labels from train data
            X_train_tfidf = self.tfidf_vectorizer.transform(self.train_data['Description'].astype(str))
            y_train = self.train_data['encoded_severity']

            # Extract features and labels from test data
            X_test_tfidf = self.tfidf_vectorizer.transform(self.test_data['Description'].astype(str))
            y_test = self.test_data['encoded_severity']

            logger.info(f" Training data: {X_train_tfidf.shape}, {len(y_train)} samples")
            logger.info(f" Testing data: {X_test_tfidf.shape}, {len(y_test)} samples")

            return X_train_tfidf, X_test_tfidf, y_train, y_test

        except Exception as e:
            logger.error(f" Data preparation failed: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def initialize_model(self):
        """Build the ``xgb.XGBClassifier`` from the configured hyperparameters.

        Returns:
            The new, unfitted classifier (also stored on ``self.model``).

        Raises:
            Exception: Any construction error is logged and re-raised.
        """
        try:
            logger.info(" Initializing XGBoost model...")

            self.model = xgb.XGBClassifier(
                n_estimators=self.config.n_estimators,
                learning_rate=self.config.learning_rate,
                max_depth=self.config.max_depth,
                subsample=self.config.subsample,
                colsample_bytree=self.config.colsample_bytree,
                random_state=self.config.random_state,
                tree_method=self.config.tree_method,
                eval_metric=self.config.eval_metric,
                early_stopping_rounds=self.config.early_stopping_rounds
            )

            logger.info(" XGBoost model initialized successfully!")
            return self.model

        except Exception as e:
            logger.error(f" Model initialization failed: {e}")
            raise

    def train_model(self, X_train, X_test, y_train, y_test):
        """Fit the classifier, using the test split as the evaluation set.

        If no model has been initialized yet, one is created first so this
        method can be called on its own.

        Args:
            X_train: Training feature matrix.
            X_test: Evaluation feature matrix.
            y_train: Training labels.
            y_test: Evaluation labels.

        Returns:
            The fitted classifier (same object as ``self.model``).

        Raises:
            Exception: Any fitting error is logged and re-raised.
        """
        try:
            logger.info(" Training tuned XGBoost model...")

            if self.model is None:
                # Previously this path died with an AttributeError on None.
                self.initialize_model()

            # NOTE(review): the test split doubles as the early-stopping
            # evaluation set, so scores later computed on that same split
            # will be optimistic. Consider a separate validation split.
            self.model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=50
            )

            logger.info(" Model training completed successfully!")
            return self.model

        except Exception as e:
            logger.error(f" Model training failed: {e}")
            raise

    def save_model(self) -> None:
        """Dump the model with joblib and write a short training summary.

        The model goes to ``config.trained_model_path``; the summary goes to
        ``training_info.txt`` inside ``config.root_dir``. All target
        directories are created if missing.

        Raises:
            RuntimeError: If there is no model to save. Dumping ``None``
                would leave a file that makes every later run skip training.
            Exception: Any I/O or serialization error is logged and re-raised.
        """
        try:
            logger.info(" Saving model and artifacts...")

            if self.model is None:
                raise RuntimeError(
                    "No model to save; run initialize_model() and train_model() first."
                )

            # Every directory that is written to below must exist, not just
            # model_dir: the model file's parent and root_dir as well.
            target_dirs = (
                self.config.model_dir,
                os.path.dirname(self.config.trained_model_path),
                self.config.root_dir,
            )
            for directory in target_dirs:
                if directory:  # dirname() is '' for a bare file name
                    os.makedirs(directory, exist_ok=True)

            # Save the trained model
            joblib.dump(self.model, self.config.trained_model_path)
            logger.info(f" Saved trained model: {self.config.trained_model_path}")

            # Save training configuration
            training_info = {
                'model_type': 'XGBoost',
                'n_estimators': self.config.n_estimators,
                'learning_rate': self.config.learning_rate,
                'max_depth': self.config.max_depth,
                'training_samples': len(self.train_data),
                # Transforming one empty document yields a (1, n_features) matrix.
                'feature_dimensions': self.tfidf_vectorizer.transform(['']).shape[1]
            }

            info_file = os.path.join(self.config.root_dir, "training_info.txt")
            # Explicit encoding so the file does not depend on the OS locale.
            with open(info_file, 'w', encoding='utf-8') as f:
                f.write("=== MODEL TRAINING INFORMATION ===\n")
                for key, value in training_info.items():
                    f.write(f"{key}: {value}\n")

            logger.info(f" Saved training information: {info_file}")

        except Exception as e:
            logger.error(f" Failed to save model: {e}")
            raise

    def train(self) -> dict:
        """Complete model training pipeline (only training, no evaluation).

        Returns:
            A dict with 'status', 'message' and 'model_path'. When training
            actually ran ('status' == 'completed') it also carries
            'training_samples' and 'test_samples'; when a model file already
            existed, 'status' is 'skipped' and those two keys are absent.

        Raises:
            Exception: Any failure in a pipeline step is logged and re-raised.
        """
        try:
            # Check if model already exists
            if self._check_model_exists():
                logger.info(" Model already exists in artifacts/model_training. Skipping training.")
                return {
                    'status': 'skipped',
                    'message': 'Model already exists',
                    'model_path': self.config.trained_model_path
                }

            logger.info(" Starting Model Training Pipeline...")

            # Step 1: Prepare data
            X_train, X_test, y_train, y_test = self.prepare_data()

            # Step 2: Initialize model
            self.initialize_model()

            # Step 3: Train model
            self.train_model(X_train, X_test, y_train, y_test)

            # Step 4: Save model
            self.save_model()

            logger.info(" Model Training completed successfully!")

            return {
                'status': 'completed',
                'message': 'Model trained and saved successfully',
                'model_path': self.config.trained_model_path,
                'training_samples': len(self.train_data),
                'test_samples': len(self.test_data)
            }

        except Exception as e:
            logger.error(f" Model training pipeline failed: {e}")
            raise

In [5]:
# ============================================
# 🤖 MODEL TRAINING PIPELINE
# ============================================

from src.NVDNLP.config.configuration import ConfigurationManager
from src.NVDNLP.components.DataTransformation import DataTransformation
# from src.NVDNLP.components.ModelTrainer import ModelTrainer
from src.NVDNLP import logger

# Label interpolated into the stage started/completed/skipped log lines below.
STAGE_NAME = "Model Trainer stage"

class ModelTrainerTrainingPipeline:
    """Stage runner: data transformation first, then model training."""

    def __init__(self):
        """Nothing to set up; all collaborators are created inside ``main``."""

    def main(self):
        """Run the stage end to end.

        Builds the configuration manager, runs ``DataTransformation.transform``
        to obtain the train/test frames plus the label encoder and TF-IDF
        vectorizer, then hands those to ``ModelTrainer`` and trains.

        Returns:
            Whatever ``ModelTrainer.train`` returns (a status dict).
        """
        manager = ConfigurationManager()

        # Data-transformation artifacts feed the trainer.
        artifacts = DataTransformation(
            config=manager.get_data_transformation_config()
        ).transform()

        trainer = ModelTrainer(
            config=manager.get_model_trainer_config(),
            train_data=artifacts['train_df'],
            test_data=artifacts['test_df'],
            label_encoder=artifacts['label_encoder'],
            tfidf_vectorizer=artifacts['tfidf_vectorizer'],
        )

        # train() itself decides whether to skip because a model already exists.
        return trainer.train()

if __name__ == "__main__":
    # Entry point: run the training stage and log a summary of the outcome.
    try:
        logger.info(f">>>>>> Stage {STAGE_NAME} started <<<<<<")
        obj = ModelTrainerTrainingPipeline()
        result = obj.main()

        if result['status'] != 'completed':
            # Any non-completed status is reported as a skip.
            logger.info(f">>>>>> Stage {STAGE_NAME} skipped (model already exists) <<<<<<\n\nx==========x")
        else:
            logger.info(" Model trained successfully!")
            logger.info(f" Training samples: {result['training_samples']}")
            logger.info(f" Test samples: {result['test_samples']}")
            logger.info(f" Model saved at: {result['model_path']}")
            logger.info(f">>>>>> Stage {STAGE_NAME} completed <<<<<<\n\nx==========x")

    except Exception as e:
        # Record the full traceback, then let the failure propagate.
        logger.exception(e)
        raise e

[2025-10-22 22:29:26,840: INFO: 1034920712: >>>>>> Stage Model Trainer stage started <<<<<<]
[2025-10-22 22:29:26,962: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-22 22:29:26,962: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-22 22:29:26,962: INFO: common: created directory at: artifacts]
[2025-10-22 22:29:26,962: INFO: common: created directory at: artifacts/data_transformation]
[2025-10-22 22:29:26,978: INFO: DataTransformation:  Data transformation already completed successfully. Skipping...]
[2025-10-22 22:29:26,985: INFO: DataTransformation:  Transformation artifacts verified and valid]
[2025-10-22 22:29:26,986: INFO: DataTransformation:  Loading existing transformation artifacts...]


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[2025-10-22 22:29:36,887: INFO: DataTransformation:  Successfully loaded existing transformation artifacts]
[2025-10-22 22:29:36,887: INFO: common: created directory at: artifacts/model_training]
[2025-10-22 22:29:36,887: INFO: 1165736424:  Model already exists in artifacts/model_training. Skipping training.]
[2025-10-22 22:29:37,430: INFO: 1034920712: >>>>>> Stage Model Trainer stage skipped (model already exists) <<<<<<

