In [1]:
import os

In [3]:
# Move the kernel's working directory one level up. The path is relative, so this cell is
# not idempotent: every re-run without a kernel restart climbs one more directory.
# NOTE(review): presumably this is done so that `src.*` imports and the relative
# `artifacts/...` paths resolve from the project root — confirm.
os.chdir("../")

In [4]:
%pwd

'/Users/macbookpro/Documents/semantic_preprocessor_model/semantic_preprocessor_model'

# Config.yaml

In [None]:
# Configuration related to model training
model_training:
  # Directory where model training results and artifacts are stored
  root_dir: artifacts/model_trainer
  
  # Path to the train features
  train_features_path: artifacts/data_transformation/train_features.npz
  
  # Path to the validation features
  val_features_path: artifacts/data_transformation/val_features.npz
  
  # Path to train labels
  train_labels_path: artifacts/data_transformation/train_labels.csv
  
  # Path to validation labels
  val_labels_path: artifacts/data_transformation/val_labels.csv
  
  # File name of the trained model; it is joined with root_dir when the model is saved
  model_name: model.joblib


# Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """
    Immutable settings bundle for the MLPClassifier model training stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.

    Attributes:
    - root_dir: Directory for storing model training results and related artifacts.
    - train_features_path: Path to the features of the training dataset.
    - train_labels_path: Path to the labels of the training dataset.
    - val_features_path: Path to the features of the validation dataset.
    - val_labels_path: Path to the labels of the validation dataset.
    - model_name: File name of the saved model; the training component joins it with root_dir.
    - hidden_layer_sizes: Number of neurons in each hidden layer. Annotated as a tuple, but the
      params YAML in this notebook writes it as `(100,)`, which YAML loads as a string, and the
      training component parses string values with ast.literal_eval — so a string may be
      stored here at runtime.
    - max_iter: Maximum number of iterations for the solver to converge.
    - random_state: Seed for reproducibility.
    """
    
    root_dir: Path  # Output directory for training results and artifacts
    train_features_path: Path  # Location of the training feature matrix
    train_labels_path: Path  # Location of the training labels
    val_features_path: Path  # Location of the validation feature matrix
    val_labels_path: Path  # Location of the validation labels
    model_name: str  # File name of the saved model (joined with root_dir by the trainer)
    
    hidden_layer_sizes: tuple  # Neurons per hidden layer; may arrive as a string such as "(100,)"
    max_iter: int  # Upper bound on solver iterations
    random_state: int  # Seed for reproducibility


# Params

In [None]:
MLPClassifier:
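  # Number of neurons in each hidden layer. For instance, (100,) denotes one hidden layer with 100 neurons.
  # Note: YAML has no tuple type, so this value is loaded as the string "(100,)"; the training
  # component converts it to a real tuple with ast.literal_eval.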
  hidden_layer_sizes: (100,)
  
  # Maximum number of iterations for the solver to converge.
  max_iter: 500
  
  # A seed for reproducibility.
  random_state: 42


# Configuration Manager

In [6]:
from src.semantic_preprocessor_model.constants import *
from src.semantic_preprocessor_model.utils.common import read_yaml, create_directories
from src.semantic_preprocessor_model import logger
from src.semantic_preprocessor_model.entity.config_entity import  ModelTrainingConfig
import os

class ConfigurationManager:
    """
    Load the project's YAML settings and build configuration objects from them.

    On construction the manager reads the config, params and schema files and makes sure the
    top-level artifacts directory exists. `get_model_training_config` then assembles the
    settings needed by the model training stage into a ModelTrainingConfig.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH) -> None:
        """
        Read the three settings files and create the artifacts root directory.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the schema file.

        Creates:
        - The directory named by `artifacts_root` in the configuration file.

        Raises:
        - Exception: Whatever `read_yaml` raises when one of the files cannot be loaded.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "schema")

        # Ensure the directory for storing artifacts exists
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read and return the content of a settings file, logging any failure.

        Args:
        - filepath (str): Path to the settings file.
        - config_name (str): Human-readable name of the file, used only in the error log.

        Returns:
        - dict: Whatever `read_yaml` returns for the file.
          NOTE(review): callers use attribute access (e.g. `self.config.artifacts_root`), so
          `read_yaml` presumably returns an attribute-accessible mapping rather than a plain
          dict — confirm and tighten the annotation.

        Raises:
        - Exception: Re-raises the original error from `read_yaml` after logging it.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    def get_model_training_config(self) -> ModelTrainingConfig:
        """
        Construct and return a configuration object for model training using the MLPClassifier.

        Reads the `model_training` section of the config file and the `MLPClassifier` section
        of the params file, and creates the training root directory as a side effect.

        Returns:
        - ModelTrainingConfig: Configuration object for model training.

        Raises:
        - AttributeError: If an expected attribute does not exist in the config or params files.
        """
        try:
            config = self.config.model_training
            params = self.params.MLPClassifier

            # Ensure the root directory for model training exists
            create_directories([config.root_dir])

            # hidden_layer_sizes is forwarded exactly as loaded from the params file;
            # the training component is responsible for parsing it.
            return ModelTrainingConfig(
                root_dir=Path(config.root_dir),
                train_features_path=Path(config.train_features_path),
                train_labels_path=Path(config.train_labels_path),
                val_features_path=Path(config.val_features_path),
                val_labels_path=Path(config.val_labels_path),
                model_name=config.model_name,
                random_state=params.random_state,
                hidden_layer_sizes=params.hidden_layer_sizes,
                max_iter=params.max_iter,
            )

        except AttributeError as e:
            # Include the exception text so the log names the attribute that is missing,
            # then re-raise unchanged for the caller to handle.
            logger.error(
                f"An expected attribute does not exist in the config or params files. Error: {e}"
            )
            raise


# Component

In [7]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.neural_network import MLPClassifier
from src.semantic_preprocessor_model.config.configuration import ConfigurationManager
from scipy.sparse import load_npz
import joblib
import os
import pandas as pd
import ast


class ModelTraining:
    """
    Train an MLPClassifier on the transformed training data and persist the fitted model.

    All inputs (data locations, hyperparameters, output location) come from the
    configuration object passed to the constructor.
    """

    def __init__(self, config: "ModelTrainingConfig"):
        """
        Initializes the ModelTraining component.

        Args:
        - config (ModelTrainingConfig): Settings for model training. `train` reads
          train_features_path, train_labels_path, hidden_layer_sizes, max_iter,
          random_state, root_dir and model_name from it.
        """
        self.config = config

    @staticmethod
    def _parse_hidden_layer_sizes(value):
        """
        Normalize a hidden-layer specification to a tuple of layer sizes.

        Accepts the string form produced by the params YAML (e.g. "(100,)"), as well as
        values that are already a tuple, a list, or a single integer.
        """
        if isinstance(value, str):
            # literal_eval only evaluates Python literals, so arbitrary code in the
            # params file cannot be executed here.
            value = ast.literal_eval(value)
        if isinstance(value, int):
            return (value,)
        return tuple(value)

    def train(self):
        """
        Fit the classifier and save it under root_dir/model_name.

        Loads the sparse training features and the first column of the labels CSV, builds an
        MLPClassifier from the configured hyperparameters, fits it, and dumps it with joblib.

        Raises:
        - ValueError: If the number of feature rows differs from the number of labels, or if
          hidden_layer_sizes is a string that is not a valid Python literal.
        """
        # Load training data; only the first CSV column is used as the label vector.
        features = load_npz(self.config.train_features_path)
        labels = pd.read_csv(self.config.train_labels_path).iloc[:, 0]

        # Fail early with both counts in the message instead of printing them for inspection.
        if features.shape[0] != len(labels):
            raise ValueError(
                f"Number of training samples ({features.shape[0]}) does not match "
                f"number of training labels ({len(labels)})."
            )

        # The params file may deliver the layer sizes as a string; normalize to a tuple.
        hidden_layer_sizes = self._parse_hidden_layer_sizes(self.config.hidden_layer_sizes)

        logger.info(
            f"Training MLPClassifier on {features.shape[0]} samples "
            f"with hidden_layer_sizes={hidden_layer_sizes}"
        )

        # Initialize a Neural Network classifier with the specified parameters
        classifier = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            max_iter=self.config.max_iter,
            random_state=self.config.random_state,
            verbose=True,
        )

        # Train the Neural Network classifier
        classifier.fit(features, labels)

        # Save the trained model
        model_save_path = os.path.join(self.config.root_dir, self.config.model_name)
        joblib.dump(classifier, model_save_path)
        logger.info(f"Model saved successfully to {model_save_path}")



In [None]:
# Scratch check: print the pandas version installed in the kernel's environment.
import pandas as pd
print(pd.__version__)

# Pipeline

In [8]:
from src.semantic_preprocessor_model import logger

class ModelTrainerPipeline:
    """
    This pipeline handles the model training process.

    After the data transformation stage, this class orchestrates the training of the model
    through the ModelTraining component (an MLPClassifier) and saves the trained model for
    future use.

    Attributes:
        STAGE_NAME (str): The name of this pipeline stage.
    """

    STAGE_NAME = "Model Training Pipeline"

    def __init__(self):
        """
        Initializes the pipeline with a configuration manager.
        """
        self.config_manager = ConfigurationManager()

    def run_model_training(self):
        """
        Orchestrates the model training process.

        Fetches configurations, initializes the model training process, trains the model,
        and logs the successful completion of the training.

        Raises:
            Exception: Any error raised while fetching the configuration or training is
                logged and then re-raised unchanged.
        """
        try:
            logger.info("Fetching model training configuration...")
            model_training_configuration = self.config_manager.get_model_training_config()

            logger.info("Initializing model training process...")
            model_training = ModelTraining(config=model_training_configuration)

            logger.info("Executing model training...")
            model_training.train()

            logger.info("Model Training Pipeline completed successfully.")

        except Exception as e:
            logger.error(f"Error encountered during the model training: {e}")
            # Propagate the failure; swallowing it here would let run_pipeline log the
            # stage as completed even though training did not finish.
            raise

    def run_pipeline(self):
        """
        Run the entire Model Training Pipeline.

        This method orchestrates the process of model training and provides logs for each stage
        of the pipeline.

        Raises:
            Exception: Any error from run_model_training is logged and re-raised.
        """
        try:
            logger.info("Starting the Model Training Pipeline.")
            logger.info(f">>>>>> Stage: {ModelTrainerPipeline.STAGE_NAME} started <<<<<<")
            self.run_model_training()
            logger.info(f">>>>>> Stage {ModelTrainerPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
        except Exception as e:
            logger.error(f"Error encountered during the {ModelTrainerPipeline.STAGE_NAME}: {e}")
            raise


# Entry point: build the pipeline and run the training stage. The log output below this
# cell shows that the guard passes when the cell is executed in the notebook kernel.
# The `pipeline` name stays bound in the kernel namespace afterwards.
if __name__ == '__main__':
    pipeline = ModelTrainerPipeline()
    pipeline.run_pipeline()

[2023-10-23 01:44:48,090: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-10-23 01:44:48,091: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-10-23 01:44:48,093: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-10-23 01:44:48,094: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts]
[2023-10-23 01:44:48,094: 53: semantic_preprocessor_model_logger: INFO: 2216604145:  Starting the Model Training Pipeline.]
[2023-10-23 01:44:48,094: 54: semantic_preprocessor_model_logger: INFO: 2216604145:  >>>>>> Stage: Model Training Pipeline started <<<<<<]
[2023-10-23 01:44:48,095: 30: semantic_preprocessor_model_logger: INFO: 2216604145:  Fetching model training configuration...]
[2023-10-23 01:44:48,095: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts/m

