In [1]:
import os

In [2]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr'

update config.yaml first

In [5]:
# Model Trainer Configuration

model_trainer:
  # Path to the root directory where model trainer artifacts are stored.
  root_dir: artifacts/model_trainer
  
  # Location of the training dataset (in this case, a CSV file).
  train_data_path: artifacts/data_transformation/train.csv
  
  # Location of the testing dataset (in this case, a CSV file).
  test_data_path: artifacts/data_transformation/test.csv
  
  # Name of the serialized trained model to be saved.
  model_name: model.joblib

SyntaxError: invalid syntax (1382475702.py, line 3)

Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """
    Configuration entity for the Model Trainer.

    Attributes:
    - root_dir: Directory where model training artifacts will be stored.
    - train_data_path: Path to the training data file.
    - test_data_path: Path to the test data file.
    - model_name: Name of the model file to be saved.
    - alpha: Regularization strength for the ElasticNet model. 
             Combines the L1 and L2 penalties. Higher values specify stronger regularization.
    - l1_ratio: The mix between L1 and L2 regularization. 
                0 <= l1_ratio <= 1. 0 corresponds to L2 (Ridge) and 1 to L1 (Lasso).
    - target_column: Name of the column in the dataset that represents the target variable.
    """

    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    alpha: float
    l1_ratio: float
    target_column: str


Update parameters in params.yaml where we can hypertune

In [None]:
ElasticNet:
    alpha: 0.5
    l1_ratio: 0.7

Configuration Manager

In [6]:
from src.pixi_hr.constants import *
from src.pixi_hr.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """
    The ConfigurationManager class is responsible for managing configurations from various YAML files.

    Attributes:
        config (dict): Loaded configuration details.
        params (dict): Loaded parameter details.
        schema (dict): Loaded schema details.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,    # Path to the main configuration YAML file
                 params_filepath=PARAMS_FILE_PATH,    # Path to the parameters YAML file
                 schema_filepath=SCHEMA_FILE_PATH):   # Path to the schema YAML file
        """
        Initializes the ConfigurationManager and loads configurations, parameters, and schema.

        Args:
            config_filepath (Path): Path to the main configuration YAML file.
            params_filepath (Path): Path to the parameters YAML file.
            schema_filepath (Path): Path to the schema YAML file.
        """
        
        # Load configurations, parameters, and schema details from their respective YAML files
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Create root directory for storing all artifacts, as specified in the configuration
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """
        Fetches the Model Trainer Configuration.

        Returns:
            ModelTrainerConfig: A dataclass instance containing the model trainer configuration.
        """
        
        # Extract model trainer configuration and parameters for ElasticNet
        config = self.config.model_trainer
        params = self.params.ElasticNet
        schema = self.schema.TARGET_COLUMN

        # Create the directory where model training artifacts will be stored
        create_directories([config.root_dir])

        # Create an instance of the ModelTrainerConfig dataclass using the extracted configuration and parameters
        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            model_name=config.model_name,
            alpha=params.alpha,
            l1_ratio=params.l1_ratio,
            target_column=schema.name
        )

        return model_trainer_config


Components

In [8]:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
import joblib
import pandas as pd
import os
from pixi_hr import logger

[2023-08-23 16:05:45,964: 160: numexpr.utils: INFO: utils:  NumExpr defaulting to 8 threads.]


In [41]:
class ModelTrainer:
    """
    ModelTrainer Class responsible for training a predictive model.
    
    Attributes:
        config (ModelTrainerConfig): Configuration for the model training.
    
    Methods:
        load_data: Load the training and testing datasets.
        preprocess_data: Preprocesses the data by dropping specific columns and splitting into features and target.
        scale_features: Scales the features using StandardScaler.
        train_model: Trains the model using the training data.
        save_model: Saves the trained model to a specified directory.
        main: Orchestrates the model training process.
    """

    def __init__(self, config: ModelTrainerConfig):
        """
        Initializes the ModelTrainer class with a configuration.

        Args:
            config (ModelTrainerConfig): Configuration for model training.
        """
        self.config = config

    def load_data(self):
        """Load training and test data."""
        self.train_data = pd.read_csv(self.config.train_data_path)
        self.test_data = pd.read_csv(self.config.test_data_path)

    def preprocess_data(self):
        """Drop unwanted columns and split data into features and target."""
        columns_to_drop = ['date_of_job_post', 'job_link', 'job_qualifications', 
                           'job_description', 'job_summary', 'date_of_job_post_temp']
        
        self.train_data.drop(columns_to_drop, axis=1, inplace=True)
        self.test_data.drop(columns_to_drop, axis=1, inplace=True)

        self.train_x = self.train_data.drop([self.config.target_column], axis=1)
        self.train_y = self.train_data[self.config.target_column]

        self.test_x = self.test_data.drop([self.config.target_column], axis=1)
        self.test_y = self.test_data[self.config.target_column]

    def scale_features(self):
        """Scale the features using StandardScaler."""
        scaler = StandardScaler()
        self.train_x = scaler.fit_transform(self.train_x)
        self.test_x = scaler.transform(self.test_x)

    def train_model(self, model):
        """
        Train the model using the training data.

        Args:
            model (model instance): Machine learning model to be trained.
        
        Returns:
            model (model instance): Trained machine learning model.
        """
        model.fit(self.train_x, self.train_y)
        return model

    def save_model(self, model):
        """
        Save the trained model to a specified directory.

        Args:
            model (model instance): Trained machine learning model to save.
        """
        try:
            joblib.dump(model, os.path.join(self.config.root_dir, self.config.model_name))
            logger.info(f"Model saved successfully at {os.path.join(self.config.root_dir, self.config.model_name)}")
        except Exception as e:
            logger.error(f"Error saving the model: {e}")
            raise e

    def main(self):
        """Main execution method that orchestrates the model training process."""
        self.load_data()
        self.preprocess_data()
        self.scale_features()
        lr = ElasticNet(alpha=self.config.alpha, l1_ratio=self.config.l1_ratio, random_state=44)
        trained_model = self.train_model(lr)
        self.save_model(trained_model)


Pipeline

In [10]:
from pixi_hr import logger
from pixi_hr.config.configuration import ConfigurationManager
from pixi_hr.components.model_trainer import ModelTrainer

In [42]:
class ModelTrainerPipeline:
    """
    Pipeline class for the model training phase.

    This pipeline performs the following steps:
    1. Initializes configuration management.
    2. Fetches the model training configuration.
    3. Initializes the ModelTrainer component.
    4. Trains the model using the training data.

    Attributes:
    - STAGE_NAME (str): Name of the stage (used for logging purposes).
    - config (ConfigurationManager): Instance of the ConfigurationManager.
    
    Methods:
    - main(): Executes the main functionality of the ModelTrainerPipeline.
    """
    
    STAGE_NAME = "Model Training Stage"

    def __init__(self):
        """
        Initializes the ModelTrainerPipeline.
        Sets up the configuration manager.
        """
        # Step 1: Initialize Configuration Manager
        self.config_manager = ConfigurationManager()

    def main(self):
        """
        Executes the main functionality of the ModelTrainerPipeline.
        """
        logger.info("Starting the Model Training Pipeline")

        # Step 2: Fetch Model Training Configuration
        model_trainer_config = self.config_manager.get_model_trainer_config()

        # Step 3: Initialize Model Trainer Component
        model_trainer = ModelTrainer(config=model_trainer_config)

        # Step 4: Train Model
        model_trainer.main()

        logger.info("Model Training Pipeline Completed Successfully.")

if __name__ == '__main__':
    try:
        logger.info(f">>>>>> Stage: {ModelTrainerPipeline.STAGE_NAME} started <<<<<<")
        model_trainer_pipeline = ModelTrainerPipeline()
        model_trainer_pipeline.main()
        logger.info(f">>>>>> Stage {ModelTrainerPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
    except Exception as e:
        logger.exception(f"Error encountered during the {ModelTrainerPipeline.STAGE_NAME}: {e}")
        raise


[2023-08-23 16:56:20,497: 48: pixi_hr_project_logger: INFO: 3263865135:  >>>>>> Stage: Model Training Stage started <<<<<<]
[2023-08-23 16:56:20,505: 41: pixi_hr_project_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-08-23 16:56:20,507: 41: pixi_hr_project_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-08-23 16:56:20,508: 41: pixi_hr_project_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-08-23 16:56:20,509: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts]
[2023-08-23 16:56:20,509: 33: pixi_hr_project_logger: INFO: 3263865135:  Starting the Model Training Pipeline]
[2023-08-23 16:56:20,509: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts/model_trainer]
[2023-08-23 16:56:20,553: 73: pixi_hr_project_logger: INFO: 965417682:  Model saved successfully at artifacts/model_trainer/model.joblib]
[2023-08-23 16:56:20,557: 44: pixi_hr_project_logger: INFO: 326