In [1]:
import os

In [2]:
# Change the working directory to the parent of the current one; presumably
# this puts the kernel at the project root so the `src.` imports and the
# relative `config/` / `artifacts/` paths used later resolve — TODO confirm.
# NOTE: the path is relative, so this cell is not idempotent: every re-run
# moves the working directory up one more level.
os.chdir("../")

In [3]:
%pwd

'/Users/macbookpro/Documents/semantic_preprocessor_model/semantic_preprocessor_model'

# MLFlow Env Variables in Terminal

In [None]:
# Run these in the terminal that launches Jupyter / the pipeline.
# The tracking URI and username are not secret; the access token is.
export MLFLOW_TRACKING_URI=https://dagshub.com/etietopabraham/semantic_preprocessor_model.mlflow
export MLFLOW_TRACKING_USERNAME=etietopabraham
# Never store the real token in the notebook or in version control: paste it
# here only in your own shell session. A token that has been committed must be
# treated as compromised and revoked.
export MLFLOW_TRACKING_PASSWORD="<your-dagshub-access-token>"

# MLflow Env Variables in Notebook

In [4]:
import warnings

# The tracking server location and the account name are not secret, so they
# can live in the notebook.
os.environ["MLFLOW_TRACKING_URI"]="https://dagshub.com/etietopabraham/semantic_preprocessor_model.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"]="etietopabraham"

# The access token is a credential and must never be hardcoded in a notebook
# (the source and the saved outputs both end up in version control). It is
# expected to be exported in the environment that started the kernel; warn
# early if it is missing rather than failing later with an opaque auth error.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    warnings.warn(
        "MLFLOW_TRACKING_PASSWORD is not set; export your DagsHub access token "
        "in the environment before starting the kernel, otherwise MLflow "
        "requests to the tracking server will not be authenticated."
    )

# Config.yaml

# Configuration for Model Evaluation

model_evaluation:
  # Root directory for saving model evaluation artifacts
  root_dir: artifacts/model_evaluation
  
  # Paths to the validation features and labels used for evaluation
  val_features_path: artifacts/data_transformation/val_features.npz
  val_labels_path: artifacts/data_transformation/val_labels.csv
  
  # Path to the trained model saved during the training step
  model_path: artifacts/model_trainer/model.joblib
  
  # Path to save the evaluation metrics in JSON format
  metric_file_path: artifacts/model_evaluation/metrics.json

  # URI of the MLflow tracking server
  mlflow_uri: 'https://dagshub.com/etietopabraham/semantic_preprocessor_model.mlflow'

# Entity

In [15]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """
    Immutable container for the settings used by the model evaluation step.

    Attributes:
    - root_dir: Root directory for saving model evaluation artifacts.
    - val_features_path: Path to the validation features used for evaluation.
    - val_labels_path: Path to the validation labels used for evaluation.
    - model_path: Path to the trained model saved during the training step.
    - metric_file_path: Path of the file the evaluation metrics are saved to.
    - mlflow_uri: URI for MLflow tracking server.
    - all_params: Dictionary of model parameters. This field has no default,
      so it is required when constructing the object.

    Note: `frozen=True` blocks re-assignment of attributes after construction.
    It does not make the objects they refer to immutable (for example, the
    contents of `all_params` can still be changed in place). The annotations
    are not enforced at runtime; callers are responsible for passing values
    of the declared types.
    """

    root_dir: Path          # Directory for saving model evaluation artifacts
    val_features_path: Path # Location of the validation features file
    val_labels_path: Path   # Location of the validation labels file
    model_path: Path        # Location of the saved, trained model
    metric_file_path: str   # File path where evaluation metrics are written
    mlflow_uri: str         # URI for MLflow tracking
    all_params: dict        # Model parameters (required; no default value)


# Configuration Manager

In [16]:
from src.semantic_preprocessor_model.constants import *
from src.semantic_preprocessor_model.utils.common import read_yaml, create_directories
from src.semantic_preprocessor_model import logger
from src.semantic_preprocessor_model.entity.config_entity import (DataIngestionConfig,
                                                                  DataValidationConfig,
                                                                  DataTransformationConfig,
                                                                  ModelTrainingConfig,
                                                                  ModelEvaluationConfig)

import os

class ConfigurationManager:
    """
    ConfigurationManager manages configurations needed for the data pipeline.

    The class reads configuration, parameter, and schema settings from specified files
    and provides a set of methods to access these settings. It also takes care of
    creating necessary directories defined in the configurations.

    Attributes:
    - config: Configuration settings, as returned by `read_yaml`.
    - params: Parameters for the pipeline, as returned by `read_yaml`.
    - schema: Schema information, as returned by `read_yaml`.

    Note: the methods below use attribute access on these objects
    (e.g. `self.config.artifacts_root`), so `read_yaml` is assumed to return an
    attribute-accessible mapping rather than a plain dict — TODO confirm.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH) -> None:
        """
        Initialize ConfigurationManager with configurations, parameters, and schema.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the schema file.

        Creates:
        - The artifacts root directory named in the configuration.

        Raises:
        - Exception: Whatever `read_yaml` raises if one of the files cannot be read.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "initial_schema")

        # Create the directory for storing artifacts if it doesn't exist
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read a configuration file and return its content.

        Args:
        - filepath (str): Path to the configuration file.
        - config_name (str): Name of the configuration (for logging purposes).

        Returns:
        - The parsed settings, exactly as returned by `read_yaml`.

        Raises:
        - Exception: If there's an error reading the file. The error is logged
          and then re-raised unchanged.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """
        Retrieve the configuration related to model evaluation.

        This method:
        1. Reads the `model_evaluation` section of the main configuration.
        2. Reads the `MLPClassifier` section of the params configuration.
        3. Ensures the root directory for saving model evaluation artifacts exists.
        4. Constructs and returns a ModelEvaluationConfig object.

        Returns:
            ModelEvaluationConfig: Dataclass object containing configurations for model evaluation.

        Raises:
            AttributeError: If an expected attribute does not exist in the config or params files.
        """
        try:
            config = self.config.model_evaluation
            params = self.params.MLPClassifier

            # Ensure the root directory for model evaluation exists
            create_directories([config.root_dir])

            # Every field the entity declares as `Path` is wrapped here, so the
            # object matches its own annotations (model_path used to be passed
            # through as the raw config value).
            return ModelEvaluationConfig(
                root_dir=Path(config.root_dir),
                val_features_path=Path(config.val_features_path),
                val_labels_path=Path(config.val_labels_path),
                model_path=Path(config.model_path),
                metric_file_path=config.metric_file_path,
                all_params=params,
                mlflow_uri=config.mlflow_uri,
            )
        except AttributeError:
            # Log the error and re-raise it untouched (bare `raise` keeps the
            # original traceback) for handling by the caller.
            logger.error("An expected attribute does not exist in the config or params files.")
            raise


# Component

In [17]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from pathlib import Path
from src.semantic_preprocessor_model import logger

import os
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import joblib
import mlflow
from scipy.sparse import load_npz
import ast

from src.semantic_preprocessor_model.utils.common import save_json

class ModelEvaluation:
    """
    The ModelEvaluation class evaluates the performance of a trained model using
    validation data and logs the results into MLflow.
    """

    def __init__(self, config: "ModelEvaluationConfig"):
        """
        Initialize ModelEvaluation with the evaluation settings.

        Args:
        - config (ModelEvaluationConfig): Evaluation settings. The methods of
          this class read `val_features_path`, `val_labels_path`, `model_path`,
          `metric_file_path`, `mlflow_uri` and `all_params` from it.
        """
        self.config = config
        # Populated by load_data()
        self.X_val = None
        self.y_val = None
        self.model = None

    def _resolved_params(self) -> dict:
        """
        Return a copy of the model parameters that is ready to be logged.

        `hidden_layer_sizes` is parsed with `ast.literal_eval` when (and only
        when) it is stored as a string. Working on a copy leaves
        `self.config.all_params` untouched, so calling this repeatedly is safe;
        parsing in place would fail on the second call because the value would
        no longer be a string.
        """
        params = dict(self.config.all_params)
        hidden_layer_sizes = params.get('hidden_layer_sizes')
        if isinstance(hidden_layer_sizes, str):
            params['hidden_layer_sizes'] = ast.literal_eval(hidden_layer_sizes)
        return params

    def eval_metrics(self, actual, pred):
        """
        Calculate evaluation metrics for classification.

        Args:
        - actual (array-like): True labels.
        - pred (array-like): Predicted labels.

        Returns:
        - dict: Dictionary containing accuracy, precision, recall, and F1 score.
          Precision, recall and F1 are computed with `average='weighted'`.
        """
        accuracy = accuracy_score(actual, pred)

        # With average='weighted' each of these is a single aggregated score;
        # the fourth element (support) is not used.
        precision, recall, f1, _ = precision_recall_fscore_support(actual, pred, average='weighted')

        # Cast to built-in floats so the values are plain numbers when they are
        # written to JSON and sent to MLflow.
        results = {
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1)
        }

        logger.info(f"Metrics {results}")

        return results

    def load_data(self):
        """
        Load validation data and the trained model.

        Sets `self.X_val` (loaded with `scipy.sparse.load_npz`), `self.y_val`
        (the first column of the labels CSV) and `self.model` (loaded with
        `joblib.load`).

        Raises:
        - Exception: Any error raised while loading; it is logged and re-raised.
        """
        try:
            # Load validation data
            logger.info("Loading validation features...")
            self.X_val = load_npz(self.config.val_features_path)

            logger.info("Loading validation labels...")
            self.y_val = pd.read_csv(self.config.val_labels_path).iloc[:, 0]

            # NOTE: joblib.load unpickles the file, so only load model files
            # from a trusted source.
            logger.info("Loading trained model...")
            self.model = joblib.load(self.config.model_path)

            logger.info("Data and model loaded successfully.")

        except Exception as e:
            logger.error(f"Error while loading data or model: {e}")
            raise

    def log_into_mlflow(self):
        """
        Log model parameters, metrics, and the model itself into MLflow. This function first loads
        the validation data and the trained model. It then predicts on the validation data using
        the model and calculates evaluation metrics. These metrics are saved to the configured
        JSON file and, along with the model parameters, logged into MLflow. Finally, the model
        itself is also logged into MLflow.
        """
        # Logging the start of the MLflow logging process
        logger.info("Starting MLflow logging...")

        # Load validation data and the trained model
        self.load_data()

        # Set the MLflow registry URI.
        # NOTE(review): only the *registry* URI is taken from the config; the
        # tracking URI is whatever MLflow resolves on its own (e.g. from the
        # MLFLOW_TRACKING_URI environment variable) — confirm this is intended.
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Parameters to log, with 'hidden_layer_sizes' parsed from its string
        # representation when necessary (the shared config is not modified).
        params = self._resolved_params()

        # Start an MLflow tracking session
        with mlflow.start_run():
            # Predict on validation data
            predictions = self.model.predict(self.X_val)

            # Calculate evaluation metrics
            scores = self.eval_metrics(self.y_val, predictions)

            # Save the calculated metrics to a JSON file
            save_json(path=Path(self.config.metric_file_path), data=scores)

            # Log model parameters into MLflow
            mlflow.log_params(params)

            # Log each metric into MLflow
            for key, value in scores.items():
                mlflow.log_metric(key, value)

            # A plain file store gets the model logged without a registered
            # name; any other tracking scheme also registers it.
            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(self.model, "model", registered_model_name="MLPClassifier")
            else:
                mlflow.sklearn.log_model(self.model, "model")

        # Logging the completion of the MLflow logging process
        logger.info("MLflow logging completed.")


# Pipeline

In [19]:
from src.semantic_preprocessor_model import logger

class ModelEvaluationPipeline:
    """
    Pipeline stage that evaluates the trained model and logs the result to MLflow.

    It builds a ConfigurationManager, asks it for the model evaluation
    configuration, and hands that configuration to ModelEvaluation.
    """

    STAGE_NAME = "Model Evaluation Pipeline"

    def __init__(self):
        """Create the ConfigurationManager used by this stage."""
        self.config_manager = ConfigurationManager()

    def run_pipeline(self):
        """
        Run the model evaluation stage.

        Raises:
        - Exception: Any error raised while fetching the configuration or
          evaluating/logging the model. The error is logged and then re-raised
          so that a failed run is not mistaken for a successful one.
        """
        try:
            logger.info("Fetching model evaluation configuration...")
            model_evaluation_configuration = self.config_manager.get_model_evaluation_config()

            logger.info("Initializing model evaluation process...")
            model_evaluation = ModelEvaluation(config=model_evaluation_configuration)

            logger.info("Logging model evaluation into MLFlow...")
            model_evaluation.log_into_mlflow()

            logger.info("Model Evaluation Pipeline completed successfully.")

        except Exception as e:
            logger.error(f"Error encountered during the model evaluation: {e}")
            # Do not swallow the failure: propagate it with its original traceback.
            raise


# Script-style entry point: build the pipeline stage and run the evaluation.
# `pipeline` is assigned at module level, so it remains available in the
# namespace after this block has run.
if __name__ == '__main__':
    pipeline = ModelEvaluationPipeline()
    pipeline.run_pipeline()

[2023-10-23 06:35:16,469: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-10-23 06:35:16,470: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-10-23 06:35:16,472: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-10-23 06:35:16,473: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts]
[2023-10-23 06:35:16,473: 13: semantic_preprocessor_model_logger: INFO: 1743914438:  Fetching model evaluation configuration...]
[2023-10-23 06:35:16,473: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts/model_evaluation]
[2023-10-23 06:35:16,474: 16: semantic_preprocessor_model_logger: INFO: 1743914438:  Initializing model evaluation process...]
[2023-10-23 06:35:16,474: 19: semantic_preprocessor_model_logger: INFO: 1743914438:  Logging model evaluation into ML

  _warn_prf(average, modifier, msg_start, len(result))
Registered model 'MLPClassifier' already exists. Creating a new version of this model...
2023/10/23 06:36:15 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: MLPClassifier, version 4
Created version '4' of model 'MLPClassifier'.


[2023-10-23 06:36:15,903: 141: semantic_preprocessor_model_logger: INFO: 84091956:  MLflow logging completed.]
[2023-10-23 06:36:15,905: 22: semantic_preprocessor_model_logger: INFO: 1743914438:  Model Evaluation Pipeline completed successfully.]
