In [1]:
import os

In [2]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr/research'

In [3]:
# Move from the research/ folder up to the project root so relative paths
# (config/, artifacts/, src/) resolve. The guard keeps the cell idempotent:
# re-running it must not climb a further directory each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr'

MLflow Setup

Export Configurations in terminal

export MLFLOW_TRACKING_URI=https://dagshub.com/etietopabraham/pixi_hr.mlflow
export MLFLOW_TRACKING_USERNAME=etietopabraham
export MLFLOW_TRACKING_PASSWORD=<your-dagshub-access-token>

Export Configuration in Jupyter

In [5]:
from getpass import getpass

# Tracking server location. The literal must not carry trailing whitespace,
# otherwise the space becomes part of the URI handed to MLflow.
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/etietopabraham/pixi_hr.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "etietopabraham"

# Never commit the access token to the notebook. Reuse it when it is already
# exported in the environment; otherwise ask for it without echoing.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("MLflow tracking password / access token: ")

Configure config.yaml

In [11]:
# Model Evaluation Configuration
# NOTE: this is the YAML section that belongs in config.yaml; it is not Python,
# so running it in a code cell raises a SyntaxError.
model_evaluation:
  # Root directory to store evaluation artifacts
  root_dir: artifacts/model_evaluation
  
  # Path to the test dataset (output from the data transformation stage)
  test_data_path: artifacts/data_transformation/test.csv
  
  # Path to the trained model (output from the model trainer stage)
  model_path: artifacts/model_trainer/model.joblib
  
  # Full path (not only a file name) of the JSON file that receives the
  # computed evaluation metrics
  metric_file_name: artifacts/model_evaluation/metrics.json

  # MLflow URI; the evaluation component passes it to mlflow.set_registry_uri
  mlflow_uri: https://dagshub.com/etietopabraham/pixi_hr.mlflow


SyntaxError: invalid syntax (953188708.py, line 2)

Entity

In [12]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """
    Immutable bundle of settings consumed by the model evaluation stage.

    Attributes:
    - root_dir (Path): Directory under which evaluation artifacts are stored.
    - test_data_path (Path): Test dataset produced by the data transformation stage.
    - model_path (Path): Trained model produced by the model trainer stage.
    - metric_file_name (Path): JSON file that receives the computed evaluation metrics.
    - all_params (dict): All parameters used for model evaluation.
    - target_column (str): Name of the target column in the dataset.
    - mlflow_uri (str): URI of the MLFlow server or database.
    """

    root_dir: Path
    test_data_path: Path
    model_path: Path
    metric_file_name: Path
    all_params: dict
    target_column: str
    mlflow_uri: str

Configuration Manager

In [13]:
from src.pixi_hr.constants import *
from src.pixi_hr.utils.common import read_yaml, create_directories, save_json


In [14]:
class ConfigurationManager:
    """
    Manages the configuration for different stages of the data processing and training pipelines.
    This class reads from YAML files to get the configurations, parameters, and schema details.
    It also provides methods to fetch specific configurations for different stages.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """
        Initializes the ConfigurationManager class.

        Loads the three YAML files and makes sure the artifacts root directory exists.

        Args:
        - config_filepath (str): Path to the main configuration YAML file.
        - params_filepath (str): Path to the parameters YAML file.
        - schema_filepath (str): Path to the schema YAML file.
        """

        # Load configuration, parameters, and schema details from their respective YAML files
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Ensure necessary directories exist (e.g., for storing artifacts)
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """
        Fetches the configuration parameters required for the model evaluation stage.

        Reads the `model_evaluation` section of the config, the `ElasticNet` section of
        the parameters and the `TARGET_COLUMN` entry of the schema, and creates the
        evaluation root directory as a side effect.

        Returns:
        - ModelEvaluationConfig: Dataclass containing the configuration parameters for the model evaluation stage.
        """
        eval_section = self.config.model_evaluation
        model_params = self.params.ElasticNet
        target_entry = self.schema.TARGET_COLUMN

        # Ensure the directory for model evaluation artifacts exists
        create_directories([eval_section.root_dir])

        # The dataclass declares its filesystem fields as Path, so convert the raw
        # config values here instead of handing plain strings to consumers.
        return ModelEvaluationConfig(
            root_dir=Path(eval_section.root_dir),
            test_data_path=Path(eval_section.test_data_path),
            model_path=Path(eval_section.model_path),
            metric_file_name=Path(eval_section.metric_file_name),
            all_params=model_params,
            target_column=target_entry.name,
            mlflow_uri=eval_section.mlflow_uri
        )


Components

In [15]:
import os
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import numpy as np
import joblib
from pathlib import Path

from src.pixi_hr.utils.common import save_json

from pixi_hr.config.configuration import ModelEvaluationConfig

In [16]:
class ModelEvaluation:
    """
    Scores a previously trained model on the test dataset and records the outcome
    both in a local JSON file and in MLflow.
    """

    def __init__(self, config: ModelEvaluationConfig):
        """
        Stores the evaluation settings for later use.

        Args:
        - config (ModelEvaluationConfig): Configuration for the model evaluation.
        """
        self.config = config

    def eval_metrics(self, actual, predicted):
        """
        Calculates regression metrics for a set of predictions.

        Args:
        - actual (pd.Series): Actual target values.
        - predicted (pd.Series): Predicted target values by the model.

        Returns:
        - tuple: RMSE, MAE, R2 (in that order)
        """
        mse = mean_squared_error(actual, predicted)
        return np.sqrt(mse), mean_absolute_error(actual, predicted), r2_score(actual, predicted)

    def load_data(self):
        """
        Reads the test CSV into `self.test_data` and the serialized model into `self.model`.
        """
        cfg = self.config
        self.test_data = pd.read_csv(cfg.test_data_path)
        self.model = joblib.load(cfg.model_path)

    def preprocess_data(self):
        """
        Removes the columns that are not used for prediction from `self.test_data`,
        then splits what remains into features (`self.test_x`) and target (`self.test_y`).
        """
        unused_columns = [
            'date_of_job_post',
            'job_link',
            'job_qualifications',
            'job_description',
            'job_summary',
            'date_of_job_post_temp',
        ]
        # Mutates the loaded frame in place, so this requires a fresh load_data() first.
        self.test_data.drop(columns=unused_columns, inplace=True)

        target = self.config.target_column
        self.test_x = self.test_data.drop(columns=[target])
        self.test_y = self.test_data[target]

    def log_into_mlflow(self):
        """
        Runs the full evaluation: loads and preprocesses the data, predicts, saves the
        metrics to the configured JSON file, and logs parameters, metrics and the model
        into MLflow.
        """
        self.load_data()
        self.preprocess_data()

        # Only the registry URI comes from the config; the tracking URI is whatever
        # MLflow already resolves (e.g. from the environment).
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_scheme = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            predictions = self.model.predict(self.test_x)

            rmse, mae, r2 = self.eval_metrics(self.test_y, predictions)
            scores = {"rmse": rmse, "mae": mae, "r2": r2}

            # Persist the metrics locally through the shared JSON helper
            save_json(path=Path(self.config.metric_file_name), data=scores)

            # Record hyperparameters, then each metric in turn
            mlflow.log_params(self.config.all_params)
            for metric_name, metric_value in scores.items():
                mlflow.log_metric(metric_name, metric_value)

            # Register the model only when the tracking store is not a local file store
            registry_kwargs = {}
            if tracking_scheme != "file":
                registry_kwargs["registered_model_name"] = "ElasticnetModel"
            mlflow.sklearn.log_model(self.model, "model", **registry_kwargs)

Pipeline

In [17]:
from pixi_hr import logger
from pixi_hr.config.configuration import ConfigurationManager
from pixi_hr.components.model_evaluation import ModelEvaluation


class ModelEvaluationPipeline:
    """
    Orchestrates the model evaluation phase.

    Steps performed by the pipeline:
    1. Create the configuration manager.
    2. Obtain the model evaluation configuration from it.
    3. Build the ModelEvaluation component.
    4. Log the evaluation metrics into MLFlow.

    Attributes:
    - STAGE_NAME (str): Name of the stage (used for logging purposes).
    - config_manager (ConfigurationManager): Instance of the configuration manager.

    Methods:
    - main(): Runs the evaluation from configuration lookup to MLFlow logging.
    """

    STAGE_NAME = "Model Evaluation Stage"

    def __init__(self):
        """
        Creates the pipeline together with its configuration manager.
        """
        self.config_manager = ConfigurationManager()

    def main(self):
        """
        Runs the evaluation stage end to end and logs its start and completion.
        """
        logger.info("Starting the Model Evaluation Pipeline")

        # Configuration -> component -> MLFlow logging
        evaluation_config = self.config_manager.get_model_evaluation_config()
        evaluator = ModelEvaluation(config=evaluation_config)
        evaluator.log_into_mlflow()

        logger.info("Model Evaluation Pipeline Completed Successfully.")


# Entry point: run the model evaluation stage and bracket it with start/completion log lines.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> Stage: {ModelEvaluationPipeline.STAGE_NAME} started <<<<<<")
        # Constructing the pipeline builds a ConfigurationManager in its __init__.
        model_evaluation_pipeline = ModelEvaluationPipeline()
        model_evaluation_pipeline.main()
        logger.info(f">>>>>> Stage {ModelEvaluationPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
    except Exception as e:
        # Log the failure with its traceback, then re-raise so the error is not swallowed.
        logger.exception(f"Error encountered during the {ModelEvaluationPipeline.STAGE_NAME}: {e}")
        raise


[2023-08-24 19:02:23,669: 53: pixi_hr_project_logger: INFO: 1258113260:  >>>>>> Stage: Model Evaluation Stage started <<<<<<]
[2023-08-24 19:02:23,682: 41: pixi_hr_project_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-08-24 19:02:23,685: 41: pixi_hr_project_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-08-24 19:02:23,686: 41: pixi_hr_project_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-08-24 19:02:23,688: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts]
[2023-08-24 19:02:23,688: 37: pixi_hr_project_logger: INFO: 1258113260:  Starting the Model Evaluation Pipeline]
[2023-08-24 19:02:23,689: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts/model_evaluation]
[2023-08-24 19:02:24,164: 85: pixi_hr_project_logger: INFO: common:  json file saved at: artifacts/model_evaluation/metrics.json]


Registered model 'ElasticnetModel' already exists. Creating a new version of this model...
2023/08/24 19:02:32 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ElasticnetModel, version 2
Created version '2' of model 'ElasticnetModel'.


[2023-08-24 19:02:32,258: 48: pixi_hr_project_logger: INFO: 1258113260:  Model Evaluation Pipeline Completed Successfully.]
[2023-08-24 19:02:32,259: 56: pixi_hr_project_logger: INFO: 1258113260:  >>>>>> Stage Model Evaluation Stage completed <<<<<< 

