In [1]:
import os

In [2]:
# Move the working directory up one level so that the relative paths used later
# (e.g. config/config.yaml, artifacts/...) resolve — presumably from the project root.
# NOTE: this is not idempotent; re-running the cell moves up one more level each time.
os.chdir("../")
%pwd

'/Users/macbookpro/Documents/predict_publications/publications_prediction'

# 1. Config.yaml

In [None]:
# Configuration related to model training
model_training:
  # Directory where model training results and artifacts are stored
  root_dir: artifacts/model_trainer
  
  # Path to the train data
  train_data_path: artifacts/data_transformation/train_data.csv

  # Path to the test data
  test_data_path: artifacts/data_transformation/test_data.csv

  # File name of the saved model (joined with root_dir when saving)
  model_name: model.joblib


# 2. Setup Entity

In [18]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """
    Immutable bundle of settings consumed by the model training stage.

    Instances are frozen: fields cannot be reassigned after construction.
    The fields are documented below in declaration order, which is also the
    positional order of the generated constructor.

    Locations:
    - root_dir: Directory that receives the trained model and related artifacts.
    - train_data_path: Location of the training data.
    - test_data_path: Location of the testing/validation data.
    - model_name: Name (or path) under which the trained model is saved.

    Data:
    - target_column: Name of the column holding the target variable.

    Hyperparameters:
    - n_estimators: Number of boosting stages.
    - max_depth: Maximum depth of each individual regression estimator.
    - learning_rate: Learning rate of the boosting procedure.
    - random_state: Seed used for reproducibility.
    - subsample: Fraction of samples used to fit each individual base learner.
    - max_features: Number of features considered when searching for the best split.
    - min_samples_split: Minimum number of samples needed to split an internal node.
    - min_samples_leaf: Minimum number of samples needed at a leaf node.
    """

    # --- locations ---
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str

    # --- data ---
    target_column: str

    # --- hyperparameters ---
    n_estimators: int
    max_depth: int
    learning_rate: float
    random_state: int
    subsample: float
    max_features: str
    min_samples_split: int
    min_samples_leaf: int


# 3. Setup Params.yaml

In [None]:
GradientBoostingRegressor:
  # Number of boosting stages to run. More might improve accuracy, but will also increase training time.
  n_estimators: 150

  # Maximum depth of the individual regression estimators. Helps in making the model more complex. 
  # Avoid setting it too high, as it might overfit.
  max_depth: 4

  # Shrinks the contribution of each tree, i.e. the step size of each boosting correction.
  # Smaller values might improve accuracy but will require more boosting stages.
  learning_rate: 0.05

  # The fraction of samples used for fitting the individual base learners. 
  # Values less than 1.0 can reduce variance and overfitting.
  subsample: 0.8

  # A seed for reproducibility.
  random_state: 42

  # The number of features to consider when looking for the best split. 
  # Using a smaller value can create more diverse trees, but might reduce accuracy.
  max_features: "sqrt"

  # Minimum number of samples required to split an internal node. Can be used to control over-fitting.
  min_samples_split: 2

  # Minimum number of samples required to be at a leaf node. Can be used to control over-fitting.
  min_samples_leaf: 1


# 4. Setup Transformed Schema.yaml

In [None]:
schema_type: "transformed"
description: "Schema of the transformed data after feature engineering and aggregation."

columns:
  timestamp: 
    type: datetime64[ns]
    description: "Timestamp of the data entry."
  lon: 
    type: float64
    description: "Longitude value."
  lat: 
    type: float64
    description: "Latitude value."
  hour:
    type: int64
    description: "Hour extracted from the timestamp."
  day:
    type: int64
    description: "Day extracted from the timestamp."
  dayofweek:
    type: int64
    description: "Day of the week extracted from the timestamp."
  month:
    type: int64
    description: "Month extracted from the timestamp."
  likescount: 
    type: float64
    description: "Mean count of likes for the aggregated period."
  commentscount: 
    type: float64
    description: "Mean count of comments for the aggregated period."
  symbols_cnt: 
    type: float64
    description: "Mean count of symbols for the aggregated period."
  words_cnt: 
    type: float64
    description: "Mean count of words for the aggregated period."
  hashtags_cnt: 
    type: float64
    description: "Mean count of hashtags for the aggregated period."
  mentions_cnt: 
    type: float64
    description: "Mean count of mentions for the aggregated period."
  links_cnt: 
    type: float64
    description: "Mean count of links for the aggregated period."
  emoji_cnt: 
    type: float64
    description: "Mean count of emojis for the aggregated period."
  publication_count:
    type: int64
    description: "Count of publications for the aggregated period."

target_column: 'publication_count'

# 5. Configuration Manager

In [19]:
from predicting_publications.constants import *
from predicting_publications.utils.common import read_yaml, create_directories
from predicting_publications import logger
from predicting_publications.entity.config_entity import (DataIngestionConfig, 
                                                          DataValidationConfig,
                                                          DataTransformationConfig)

class ConfigurationManager:
    """
    Central access point for the pipeline's YAML-driven settings.

    On construction the manager loads four YAML files (main config, parameters,
    initial schema and feature-engineered schema) through ``read_yaml`` and creates
    the artifacts root directory. Each ``get_*_config`` method then builds the typed
    configuration object for one pipeline stage, creating that stage's directory
    along the way.

    Attributes:
    - config: Parsed main configuration file.
    - params: Parsed parameters file.
    - schema: Parsed initial schema file.
    - feature_schema_filepath: Parsed feature-engineered schema. Despite the name it
      holds the file's *content*, not a path; the name is kept for backward
      compatibility. Prefer the ``feature_schema`` property.

    Note: the parsed objects are accessed attribute-style (e.g. ``config.root_dir``),
    so ``read_yaml`` is assumed to return an attribute-accessible mapping.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH,
                 feature_schema_filepath = FEATURE_SCHEMA_FILE_PATH) -> None:
        """
        Load all configuration files and create the artifacts root directory.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the initial schema file.
        - feature_schema_filepath (Path): Path to the feature-engineered schema file.

        Creates:
        - The ``artifacts_root`` directory named in the configuration file.

        Raises:
        - Exception: Whatever ``read_yaml`` raises if a file cannot be read.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "initial_schema")
        # Holds the parsed schema content (not a path); name kept for compatibility.
        self.feature_schema_filepath = self._read_config_file(feature_schema_filepath, "feature_engineered_schema")

        # Create the directory for storing artifacts if it doesn't exist
        create_directories([self.config.artifacts_root])

    @property
    def feature_schema(self):
        """Parsed feature-engineered schema (clearer alias of ``feature_schema_filepath``)."""
        return self.feature_schema_filepath

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read one YAML file via ``read_yaml`` and return its parsed content.

        Args:
        - filepath (str): Path to the file to read.
        - config_name (str): Human-readable name of the file, used only in the error log.

        Returns:
        - dict: The parsed settings, exactly as returned by ``read_yaml``.

        Raises:
        - Exception: Re-raises, after logging, any error raised by ``read_yaml``.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the DataIngestionConfig from the ``data_ingestion`` config section.

        Also creates the section's ``root_dir`` directory.

        Returns:
        - DataIngestionConfig: Object containing data ingestion configuration settings.

        Raises:
        - AttributeError: If the ``data_ingestion`` section or one of the keys read
          from it is missing from the config file.
        """
        try:
            config = self.config.data_ingestion
            # Create the root directory for data ingestion if it doesn't already exist
            create_directories([config.root_dir])

            return DataIngestionConfig(
                root_dir=Path(config.root_dir),
                local_data_file=Path(config.local_data_file),
            )

        except AttributeError as e:
            # The failure may come from the section itself or from a key inside it,
            # so report the underlying error instead of guessing which one.
            logger.error(f"A required 'data_ingestion' setting is missing from the config file: {e}")
            raise

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the DataValidationConfig from the ``data_validation`` config section.

        The ``columns`` entry of the initial schema is passed along as the schema to
        validate against, and the parent directory of the status file is created.

        Returns:
        - DataValidationConfig: Object containing data validation configuration settings.

        Raises:
        - AttributeError: If the ``data_validation`` section, one of the keys read
          from it, or the schema's ``columns`` entry is missing.
        """
        try:
            # Extract data validation configurations
            config = self.config.data_validation

            # Extract schema for data validation
            schema = self.schema.columns

            # Ensure the parent directory for the status file exists
            create_directories([os.path.dirname(config.status_file)])

            # Construct and return the DataValidationConfig object
            return DataValidationConfig(
                root_dir=Path(config.root_dir),
                data_source_file=Path(config.data_source_file),
                status_file=Path(config.status_file),
                initial_schema=schema
            )

        except AttributeError as e:
            # Log the underlying error and re-raise for handling by the caller
            logger.error(f"A required 'data_validation' setting is missing from the config or schema file: {e}")
            raise

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the DataTransformationConfig from the ``data_transformation`` config section.

        Also creates the section's ``root_dir`` directory.

        Returns:
        - DataTransformationConfig: Object containing data transformation configuration settings.

        Raises:
        - AttributeError: If the ``data_transformation`` section or one of the keys
          read from it is missing from the config file.
        """
        try:
            config = self.config.data_transformation

            # Ensure the root directory for data transformation exists
            create_directories([config.root_dir])

            # Construct and return the DataTransformationConfig object
            return DataTransformationConfig(
                root_dir=Path(config.root_dir),
                data_source_file=Path(config.data_source_file),
                data_validation=Path(config.data_validation),
            )

        except AttributeError as e:
            # Log the underlying error and re-raise for handling by the caller
            logger.error(f"A required 'data_transformation' setting is missing from the config file: {e}")
            raise

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """
        Build the ModelTrainerConfig for the model training stage.

        Paths come from the ``model_training`` config section, hyperparameters from
        the ``GradientBoostingRegressor`` params section, and the target column from
        the feature-engineered schema. Also creates the section's ``root_dir`` directory.

        Returns:
        - ModelTrainerConfig: Object containing model training configuration settings.

        Raises:
        - AttributeError: If the necessary attributes do not exist in the config or params files.
        - ValueError: If the feature-engineered schema has no (or an empty) ``target_column``.
        """
        try:
            config = self.config.model_training
            params = self.params.GradientBoostingRegressor

            # Fail fast on a missing target column: an empty name would only surface
            # later as an obscure column-lookup error during training.
            target_col = self.feature_schema.get("target_column")
            if not target_col:
                message = "'target_column' is missing or empty in the feature-engineered schema file."
                logger.error(message)
                raise ValueError(message)

            # Ensure the root directory for model training exists
            create_directories([config.root_dir])

            # Construct and return the ModelTrainerConfig object
            return ModelTrainerConfig(
                root_dir=Path(config.root_dir),
                train_data_path=Path(config.train_data_path),
                test_data_path=Path(config.test_data_path),
                model_name=config.model_name,
                target_column=target_col,
                n_estimators=params.n_estimators,
                max_depth=params.max_depth,
                learning_rate=params.learning_rate,
                random_state=params.random_state,
                subsample=params.subsample,
                max_features=params.max_features,
                min_samples_split=params.min_samples_split,
                min_samples_leaf=params.min_samples_leaf
            )

        except AttributeError as e:
            # Log the underlying error and re-raise for handling by the caller
            logger.error(f"An expected attribute does not exist in the config or params files: {e}")
            raise


# 6. Component

In [30]:
from predicting_publications import logger
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
import joblib
import os 

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

class ModelTrainer:
    """
    Trains and persists a GradientBoostingRegressor for the pipeline.

    The component reads the transformed training data, fits a Gradient Boosting
    Regressor with the hyperparameters held in its configuration, and writes the
    fitted model to ``root_dir / model_name`` with joblib. An optional randomized
    hyperparameter search is available through ``hyperparameter_tuning``.

    Attributes:
    - config (ModelTrainerConfig): Configuration settings for the model training process.
    """

    def __init__(self, config: ModelTrainerConfig):
        """
        Initialize ModelTrainer with the given configurations.

        Args:
        - config (ModelTrainerConfig): Configuration settings for model training.
        """
        self.config = config

    def hyperparameter_tuning(self, X_train, y_train, n_iter: int = 10, cv: int = 3):
        """
        Perform hyperparameter tuning using RandomizedSearchCV.

        Both the estimator and the search are seeded with the configured
        ``random_state`` so that repeated runs sample the same candidates and
        produce the same result.

        Args:
        - X_train: Training data features.
        - y_train: Training data target.
        - n_iter (int): Number of parameter combinations to sample (default 10).
        - cv (int): Number of cross-validation folds (default 3).

        Returns:
        - dict: Best hyperparameters found during the search.
        """
        # scipy's randint(low, high) excludes `high`; uniform(loc, scale) samples
        # from [loc, loc + scale].
        param_dist = {
            'n_estimators': sp_randint(50, 200),
            'max_depth': sp_randint(1, 10),
            'learning_rate': sp_uniform(0.01, 0.2),   # [0.01, 0.21]
            'subsample': sp_uniform(0.5, 0.5),        # [0.5, 1.0]
            'max_features': ['sqrt', 'log2', None],
            'min_samples_split': sp_randint(2, 20),
            'min_samples_leaf': sp_randint(1, 20)
        }

        seed = self.config.random_state
        random_search = RandomizedSearchCV(GradientBoostingRegressor(random_state=seed),
                                           param_distributions=param_dist,
                                           n_iter=n_iter,
                                           cv=cv,
                                           scoring='neg_mean_squared_error',
                                           verbose=2,
                                           n_jobs=-1,
                                           random_state=seed)

        random_search.fit(X_train, y_train)

        return random_search.best_params_

    def train(self):
        """
        Train a Gradient Boosting Regressor model and save it to disk.

        This method:
        1. Loads the training data from the path specified in the configuration.
        2. Separates the predictors from the target column.
        3. Initializes a Gradient Boosting Regressor with the configured hyperparameters.
        4. Fits the model on the training data.
        5. Saves the fitted model to ``root_dir / model_name`` with joblib.

        The test data is not read here.

        Raises:
        - KeyError: If the configured target column is absent from the training data.
        """
        # Load training dataset
        train_data = pd.read_csv(self.config.train_data_path)

        # Separate predictors and target variable (target as a 1-D array)
        target = self.config.target_column
        X_train = train_data.drop(columns=[target])
        y_train = train_data[target].to_numpy()

        # Hyperparameters come from the configuration. To re-tune instead, call
        # self.hyperparameter_tuning(X_train, y_train) and use its result here.
        max_features = self.config.max_features
        if max_features == 'None':
            # The literal string 'None' in the params file stands for "use all features".
            max_features = None

        best_params = {
            'learning_rate': self.config.learning_rate,
            'max_depth': self.config.max_depth,
            'max_features': max_features,
            'min_samples_leaf': self.config.min_samples_leaf,
            'min_samples_split': self.config.min_samples_split,
            'n_estimators': self.config.n_estimators,
            'subsample': self.config.subsample,
            'random_state': self.config.random_state
        }

        # Train the model with the configured parameters
        gb_model = GradientBoostingRegressor(**best_params)
        gb_model.fit(X_train, y_train)

        # Save the trained model
        model_save_path = os.path.join(self.config.root_dir, self.config.model_name)
        joblib.dump(gb_model, model_save_path)
        logger.info(f"Model saved successfully to {model_save_path}")


# 7. Pipeline

In [31]:
from predicting_publications import logger

class ModelTrainerPipeline:
    """
    This pipeline handles the model training process.

    After the data transformation stage, this class orchestrates the training of the model
    using the GradientBoostingRegressor and saves the trained model for future use.

    Attributes:
        STAGE_NAME (str): The name of this pipeline stage.
    """

    STAGE_NAME = "Model Training Pipeline"

    def __init__(self):
        """
        Initializes the pipeline with a configuration manager.
        """
        self.config_manager = ConfigurationManager()

    def run_model_training(self):
        """
        Orchestrates the model training process.

        Fetches configurations, initializes the model training process, trains the model,
        and logs the successful completion of the training.

        Raises:
            Exception: Any error raised while fetching the configuration or training
                is logged and then re-raised, so callers never mistake a failed run
                for a successful one.
        """
        try:
            logger.info("Fetching model training configuration...")
            model_training_configuration = self.config_manager.get_model_trainer_config()

            logger.info("Initializing model training process...")
            model_training = ModelTrainer(config=model_training_configuration)

            logger.info("Executing model training...")
            model_training.train()

            logger.info("Model Training Pipeline completed successfully.")

        except Exception as e:
            logger.error(f"Error encountered during the model training: {e}")
            # Re-raise: swallowing the error here would let run_pipeline log the
            # stage as completed even though no model was trained.
            raise

    def run_pipeline(self):
        """
        Run the entire Model Training Pipeline.

        This method orchestrates the process of model training and provides logs for each stage
        of the pipeline. The "completed" message is only logged when training succeeded.

        Raises:
            Exception: Any error raised by ``run_model_training`` is logged with the
                stage name and re-raised.
        """
        try:
            logger.info("Starting the Model Training Pipeline.")
            logger.info(f">>>>>> Stage: {ModelTrainerPipeline.STAGE_NAME} started <<<<<<")
            self.run_model_training()
            logger.info(f">>>>>> Stage {ModelTrainerPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
        except Exception as e:
            logger.error(f"Error encountered during the {ModelTrainerPipeline.STAGE_NAME}: {e}")
            raise


# Entry point: build the pipeline (its constructor creates a ConfigurationManager)
# and run the model training stage.
if __name__ == '__main__':
    pipeline = ModelTrainerPipeline()
    pipeline.run_pipeline()

[2023-10-16 22:46:52,433: 42: predict_publications_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-10-16 22:46:52,437: 42: predict_publications_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-10-16 22:46:52,441: 42: predict_publications_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-10-16 22:46:52,445: 42: predict_publications_logger: INFO: common:  yaml file: feature_engineered_schema.yaml loaded successfully]
[2023-10-16 22:46:52,447: 65: predict_publications_logger: INFO: common:  Created directory at: artifacts]
[2023-10-16 22:46:52,448: 53: predict_publications_logger: INFO: 2940486009:  Starting the Model Training Pipeline.]
[2023-10-16 22:46:52,449: 54: predict_publications_logger: INFO: 2940486009:  >>>>>> Stage: Model Training Pipeline started <<<<<<]
[2023-10-16 22:46:52,449: 30: predict_publications_logger: INFO: 2940486009:  Fetching model training configuration...]
[2023-10-16 22:46:52,450: 