In [1]:
import os

In [2]:
# Move the working directory one level up so that the relative paths used
# later in this notebook (config/, artifacts/, ...) resolve from there.
# NOTE(review): the target is relative to the *current* directory, so running
# this cell again moves up one more level each time — run it once per kernel.
os.chdir("../")
# Display the resulting working directory to confirm the change.
%pwd

'/Users/macbookpro/Documents/predict_publications/publications_prediction'

# 1. Export MLflow Env Variables in Terminal

In [None]:
# Run these in the terminal session that launches the pipeline.
# Never commit a real access token: substitute your own DagsHub token locally.
# Any token that was previously committed in this cell must be treated as
# compromised and revoked.
export MLFLOW_TRACKING_URI=https://dagshub.com/etietopabraham/publications_prediction.mlflow
export MLFLOW_TRACKING_USERNAME=etietopabraham
export MLFLOW_TRACKING_PASSWORD="<your-dagshub-access-token>"

# 2. Export MLflow Env Variables for Notebook

In [3]:
# The tracking server location and account name are not secret, so they are
# set inline exactly as before.
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/etietopabraham/publications_prediction.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "etietopabraham"

# The access token is a credential and must never be stored in the notebook.
# Reuse a token that is already exported in the environment; otherwise prompt
# for it so it is neither saved in the file nor echoed in the cell output.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    from getpass import getpass

    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("DagsHub access token: ")

# 3. Config.yaml

In [None]:
# Configuration for the model evaluation stage (the `model_evaluation` section
# of config.yaml). Relative paths resolve against the working directory.

model_evaluation:
  # Root directory for model evaluation artifacts; the configuration manager
  # passes it to create_directories before building the config object
  root_dir: artifacts/model_evaluation
  
  # Path to the test data (CSV) used for evaluation
  test_data_path: artifacts/data_transformation/test_data.csv
  
  # Path to the trained model saved during the training step (loaded with joblib)
  model_path: artifacts/model_trainer/model.joblib
  
  # Full path (despite the key name) of the JSON file the evaluation metrics
  # are written to
  metric_file_name: artifacts/model_evaluation/metrics.json

  # MLflow server URI; the evaluation component passes it to
  # mlflow.set_registry_uri
  mlflow_uri: 'https://dagshub.com/etietopabraham/publications_prediction.mlflow'


# 4. Entity

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """
    Immutable bundle of settings consumed by the model evaluation stage.

    Because the dataclass is declared with `frozen=True`, assigning to any
    field after construction raises `dataclasses.FrozenInstanceError`.

    Attributes:
    - root_dir: Directory in which model evaluation artifacts are stored.
    - test_data_path: Location of the test dataset used for evaluation.
    - model_path: Location of the model saved by the training step.
    - metric_file_name: Name (or path) of the file receiving the evaluation metrics.
    - all_params: Dictionary of other parameters relevant to the evaluation.
    - target_column: Name of the target variable's column in the dataset.
    - mlflow_uri: URI of the MLflow tracking server.
    """

    # Where evaluation artifacts are written
    root_dir: Path
    # Test dataset location
    test_data_path: Path
    # Saved model location
    model_path: Path
    # File that receives the evaluation metrics
    metric_file_name: str
    # Additional parameters relevant to the evaluation
    all_params: dict
    # Target column of the dataset
    target_column: str
    # MLflow tracking URI
    mlflow_uri: str


# 5. Configuration Manager

In [12]:
from predicting_publications.constants import *
from predicting_publications.utils.common import read_yaml, create_directories
from predicting_publications import logger
from predicting_publications.entity.config_entity import (DataIngestionConfig, 
                                                          DataValidationConfig,
                                                          DataTransformationConfig,
                                                          ModelTrainerConfig)

class ConfigurationManager:
    """
    ConfigurationManager manages configurations needed for the data pipeline.

    The class reads configuration, parameter, and schema settings from specified files
    and provides a set of methods to access these settings. It also takes care of
    creating necessary directories defined in the configurations.

    Attributes:
    - config: Settings loaded from the configuration file.
    - params: Parameters loaded from the params file.
    - schema: Initial schema loaded from the schema file.
    - feature_schema_filepath: Parsed *content* of the feature-engineered schema
      file (not a path, despite the name; the name is kept so existing users of
      the attribute keep working).
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH,
                 feature_schema_filepath = FEATURE_SCHEMA_FILE_PATH) -> None:
        """
        Initialize ConfigurationManager with configurations, parameters, and schemas.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the initial schema file.
        - feature_schema_filepath (Path): Path to the feature-engineered schema file.

        Creates:
        - The artifacts root directory named in the configuration.

        Raises:
        - Exception: Whatever `read_yaml` raises if one of the files cannot be read.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "initial_schema")
        # Holds the loaded schema content, not the path it was read from.
        self.feature_schema_filepath = self._read_config_file(feature_schema_filepath, "feature_engineered_schema")

        # Create the directory for storing artifacts if it doesn't exist
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read a configuration file and return its content.

        Args:
        - filepath (str): Path to the configuration file.
        - config_name (str): Name of the configuration (for logging purposes).

        Returns:
        - dict: Configuration settings as returned by `read_yaml`.

        Raises:
        - Exception: If there's an error reading the file; the error is logged
          and the original exception is re-raised unchanged.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Extract and return data ingestion configurations as a DataIngestionConfig object.

        This method fetches settings related to data ingestion, like directories and file paths,
        and returns them as a DataIngestionConfig object.

        Returns:
        - DataIngestionConfig: Object containing data ingestion configuration settings.

        Raises:
        - AttributeError: If the 'data_ingestion' section or one of its expected
          attributes does not exist in the config file.
        """
        try:
            config = self.config.data_ingestion
            # Create the root directory for data ingestion if it doesn't already exist
            create_directories([config.root_dir])

            return DataIngestionConfig(
                root_dir=Path(config.root_dir),
                local_data_file=Path(config.local_data_file),
            )

        except AttributeError:
            # Log the error and re-raise the exception for handling by the caller
            logger.error("The 'data_ingestion' attribute does not exist in the config file.")
            raise

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Extract and return data validation configurations as a DataValidationConfig object.

        This method fetches settings related to data validation, like directories, file paths,
        and schema, and returns them as a DataValidationConfig object.

        Returns:
        - DataValidationConfig: Object containing data validation configuration settings.

        Raises:
        - AttributeError: If the 'data_validation' section or one of its expected
          attributes does not exist in the config file.
        """
        try:
            # Extract data validation configurations
            config = self.config.data_validation

            # Extract schema for data validation
            schema = self.schema.columns

            # Ensure the parent directory for the status file exists
            create_directories([os.path.dirname(config.status_file)])

            # Construct and return the DataValidationConfig object
            return DataValidationConfig(
                root_dir=Path(config.root_dir),
                data_source_file=Path(config.data_source_file),
                status_file=Path(config.status_file),
                initial_schema=schema
            )

        except AttributeError:
            # Log the error and re-raise the exception for handling by the caller
            logger.error("The 'data_validation' attribute does not exist in the config file.")
            raise

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Extract and return data transformation configurations as a DataTransformationConfig object.

        This method fetches settings related to data transformation, like directories and file paths,
        and returns them as a DataTransformationConfig object.

        Returns:
        - DataTransformationConfig: Object containing data transformation configuration settings.

        Raises:
        - AttributeError: If the 'data_transformation' section or one of its expected
          attributes does not exist in the config file.
        """
        try:
            config = self.config.data_transformation

            # Ensure the root directory for data transformation exists
            create_directories([config.root_dir])

            # Construct and return the DataTransformationConfig object
            return DataTransformationConfig(
                root_dir=Path(config.root_dir),
                data_source_file=Path(config.data_source_file),
                data_validation=Path(config.data_validation),
            )

        except AttributeError:
            # Log the error and re-raise the exception for handling by the caller
            logger.error("The 'data_transformation' attribute does not exist in the config file.")
            raise

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """
        Extract and return model training configurations as a ModelTrainerConfig object.

        This method fetches settings related to model training, like directories, file paths,
        and hyperparameters, and returns them as a ModelTrainerConfig object.

        Returns:
        - ModelTrainerConfig: Object containing model training configuration settings.

        Raises:
        - AttributeError: If the necessary attributes do not exist in the config or params files.
        """
        try:
            config = self.config.model_training
            params = self.params.GradientBoostingRegressor

            # The feature schema is a dictionary, extracting the target column
            # (falls back to an empty string when the key is absent)
            target_col = self.feature_schema_filepath.get("target_column", "")

            # Ensure the root directory for model training exists
            create_directories([config.root_dir])

            # Construct and return the ModelTrainerConfig object
            return ModelTrainerConfig(
                root_dir=Path(config.root_dir),
                train_data_path=Path(config.train_data_path),
                test_data_path=Path(config.test_data_path),
                model_name=config.model_name,
                target_column=target_col,
                n_estimators=params.n_estimators,
                max_depth=params.max_depth,
                learning_rate=params.learning_rate,
                random_state=params.random_state,
                subsample=params.subsample,
                max_features=params.max_features,
                min_samples_split=params.min_samples_split,
                min_samples_leaf=params.min_samples_leaf
            )

        except AttributeError:
            # Log the error and re-raise the exception for handling by the caller
            logger.error("An expected attribute does not exist in the config or params files.")
            raise

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """
        Retrieve the configuration related to model evaluation.

        This method:
        1. Extracts model evaluation configuration from the main configuration.
        2. Extracts GradientBoostingRegressor parameters from the params configuration.
        3. Retrieves the target column from the feature schema.
        4. Ensures the root directory for saving model evaluation artifacts exists.
        5. Constructs and returns a ModelEvaluationConfig object.

        Returns:
            ModelEvaluationConfig: Dataclass object containing configurations for model evaluation.
            `root_dir`, `test_data_path` and `model_path` are `Path` objects, matching the
            field types declared on the dataclass.

        Raises:
            AttributeError: If an expected attribute does not exist in the config or params files.
        """

        try:
            config = self.config.model_evaluation
            params = self.params.GradientBoostingRegressor

            # Extract the target column from the feature schema
            # (falls back to an empty string when the key is absent)
            target_col = self.feature_schema_filepath.get("target_column", "")

            # Ensure the root directory for model evaluation exists
            create_directories([config.root_dir])

            # Construct and return the ModelEvaluationConfig object
            return ModelEvaluationConfig(
                root_dir=Path(config.root_dir),
                test_data_path=Path(config.test_data_path),
                # Wrapped in Path like the other path fields; the dataclass
                # declares model_path as a Path.
                model_path=Path(config.model_path),
                metric_file_name=config.metric_file_name,
                all_params=params,
                target_column=target_col,
                mlflow_uri=config.mlflow_uri,
            )
        except AttributeError:
            # Log the error and re-raise the exception for handling by the caller
            logger.error("An expected attribute does not exist in the config or params files.")
            raise


# 6. Components

In [13]:
import os
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
from predicting_publications.utils.common import save_json

class ModelEvaluation:
    """
    Evaluate a trained model on the test dataset and record the outcome.

    The evaluation loads the test data and the saved model, computes regression
    metrics, writes them to the configured metrics file, and logs parameters,
    metrics and the model itself to MLflow.
    """

    def __init__(self, config: ModelEvaluationConfig):
        """
        Initialize the ModelEvaluation class.

        Parameters:
        - config: Configuration parameters for model evaluation.
        """
        self.config = config

    def eval_metrics(self, actual, pred):
        """
        Calculate evaluation metrics for the model predictions.

        Parameters:
        - actual: Ground truth values.
        - pred: Predicted values by the model.

        Returns:
        - rmse: Root Mean Squared Error.
        - mae: Mean Absolute Error.
        - r2: R2 Score.
        - average_relative_error: Mean of |pred - actual| / (|pred| + epsilon).
        """
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)

        # Work on plain arrays so the element-wise arithmetic is positional.
        actual_values = np.asarray(actual, dtype=float)
        predicted_values = np.asarray(pred, dtype=float)

        # The error is normalized by the prediction, as before, but by its
        # magnitude: a signed denominator made the "error" negative for negative
        # predictions and could cancel epsilon, dividing by exactly zero.
        epsilon = 1e-10
        relative_error = np.abs(predicted_values - actual_values) / (np.abs(predicted_values) + epsilon)
        average_relative_error = relative_error.mean()

        return rmse, mae, r2, average_relative_error

    def load_data(self):
        """
        Load test data and the trained model.

        Sets `self.test_data`, `self.model`, `self.X_test` (all columns except
        the target) and `self.y_test` (the target column).
        """
        self.test_data = pd.read_csv(self.config.test_data_path)
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only load model files produced by a trusted training run.
        self.model = joblib.load(self.config.model_path)
        self.X_test = self.test_data.drop([self.config.target_column], axis=1)
        self.y_test = self.test_data[self.config.target_column]

    def log_into_mlflow(self):
        """
        Log model parameters, metrics, and the model itself into MLflow.

        Also writes the metrics to the JSON file named by
        `config.metric_file_name`.
        """
        self.load_data()

        # Set the MLflow registry URI
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_scheme = urlparse(mlflow.get_tracking_uri()).scheme

        # Start an MLflow tracking session
        with mlflow.start_run():
            predictions = self.model.predict(self.X_test)
            rmse, mae, r2, average_relative_error = self.eval_metrics(self.y_test, predictions)
            # Plain Python floats keep the JSON file and the MLflow payload
            # independent of numpy scalar types.
            scores = {
                "rmse": float(rmse),
                "mae": float(mae),
                "r2": float(r2),
                "average_relative_error": float(average_relative_error),
            }

            # Save evaluation metrics to a JSON file
            save_json(path=Path(self.config.metric_file_name), data=scores)

            # Log parameters and metrics into MLflow
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics(scores)

            # Register the model only when tracking is not a local file store
            # (presumably because the registry is unavailable there — confirm).
            if tracking_scheme != "file":
                mlflow.sklearn.log_model(self.model, "model", registered_model_name="GradientBoostingRegressor")
            else:
                mlflow.sklearn.log_model(self.model, "model")


# 7. Pipeline

In [14]:
from predicting_publications import logger

class ModelEvaluationPipeline:
    """
    Pipeline stage that evaluates the trained model and logs the result to MLflow.
    """

    STAGE_NAME = "Model Evaluation Pipeline"

    def __init__(self):
        """Create the configuration manager used to build the stage configuration."""
        self.config_manager = ConfigurationManager()

    def run_pipeline(self):
        """
        Run the model evaluation stage end to end.

        Fetches the model evaluation configuration, builds the evaluation
        component and logs the evaluation into MLflow.

        Raises:
        - Exception: Any error raised while running the stage. It is logged and
          then re-raised so a failed run is not mistaken for a successful one.
        """
        try:
            logger.info("Fetching model evaluation configuration...")
            model_evaluation_configuration = self.config_manager.get_model_evaluation_config()

            logger.info("Initializing model evaluation process...")
            model_evaluation = ModelEvaluation(config=model_evaluation_configuration)

            logger.info("Logging model evaluation into MLFlow...")
            model_evaluation.log_into_mlflow()

            logger.info("Model Evaluation Pipeline completed successfully.")

        except Exception as e:
            logger.error(f"Error encountered during the model evaluation: {e}")
            # Propagate the failure instead of silently ending the stage.
            raise


# Run the evaluation stage when this code executes as the main module
# (a notebook cell runs under `__main__`, so the stage starts immediately).
if __name__ == '__main__':
    # Constructing the pipeline loads the configuration files.
    pipeline = ModelEvaluationPipeline()
    # Evaluate the model and log the results to MLflow.
    pipeline.run_pipeline()

[2023-10-17 01:46:32,727: 42: predict_publications_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-10-17 01:46:32,730: 42: predict_publications_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-10-17 01:46:32,733: 42: predict_publications_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-10-17 01:46:32,738: 42: predict_publications_logger: INFO: common:  yaml file: feature_engineered_schema.yaml loaded successfully]
[2023-10-17 01:46:32,738: 65: predict_publications_logger: INFO: common:  Created directory at: artifacts]
[2023-10-17 01:46:32,739: 13: predict_publications_logger: INFO: 3544983170:  Fetching model evaluation configuration...]
[2023-10-17 01:46:32,740: 65: predict_publications_logger: INFO: common:  Created directory at: artifacts/model_evaluation]
[2023-10-17 01:46:32,741: 16: predict_publications_logger: INFO: 3544983170:  Initializing model evaluation process...]
[2023-10-17 01:46:32,742: 19:

Successfully registered model 'GradientBoostingRegressor'.
2023/10/17 01:46:47 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: GradientBoostingRegressor, version 1
Created version '1' of model 'GradientBoostingRegressor'.


[2023-10-17 01:46:48,086: 22: predict_publications_logger: INFO: 3544983170:  Model Evaluation Pipeline completed successfully.]
