In [1]:
import os

In [2]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr/research'

In [3]:
# Move from the research/ folder up to the project root so that the relative
# paths used later (config/config.yaml, artifacts/...) resolve.
# Guarded so that re-running this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr'

Update config.yaml first

In [None]:
# Model Trainer Configuration
# Settings for the model-training stage (section of config.yaml).

model_trainer:
  # Directory where the model trainer stores its artifacts (the saved model is written here).
  root_dir: artifacts/model_trainer
  
  # CSV file holding the training dataset.
  train_data_path: artifacts/data_transformation/train_data.csv
  
  # CSV file holding the testing dataset.
  test_data_path: artifacts/data_transformation/test_data.csv
  
  # File name (inside root_dir) under which the trained model is serialized.
  model_name: model.joblib

Entity

In [40]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """
    Immutable bundle of settings consumed by the model-training stage.

    NOTE(review): the path fields are annotated as ``Path``, but dataclasses do
    not enforce annotations; values read from YAML are presumably plain
    strings - confirm before relying on ``Path`` methods.
    """

    # Directory where training artifacts (including the saved model) are written.
    root_dir: Path
    # CSV file with the training data.
    train_data_path: Path
    # CSV file with the test data.
    test_data_path: Path
    # Name of the column to predict.
    target_column: str
    # File name used when serializing the trained model.
    model_name: str
    # Which estimator to build, e.g. "ElasticNet" or "RandomForest".
    model_type: str
    # Hyperparameters for the chosen estimator.
    model_params: dict


Update the hyperparameters in params.yaml, where they can be tuned

In [None]:
# Hyperparameters per model type. Each top-level key is a model type name
# ("ElasticNet" or "RandomForest") and holds that estimator's settings.
ElasticNet:
  alpha: 0.5
  l1_ratio: 0.7

RandomForest:
  n_estimators: 100
  # YAML has no `None` keyword: a bare None is loaded as the string 'None'.
  # `null` is the YAML spelling of "no value" (unlimited depth here).
  max_depth: null
  min_samples_split: 2
  min_samples_leaf: 1
  # 1.0 means "consider every feature at each split", which is what the
  # deprecated 'auto' setting meant for a random-forest regressor.
  max_features: 1.0
  random_state: 44

Configuration Manager

In [35]:
from src.pixi_hr.constants import *
from src.pixi_hr.utils.common import read_yaml, create_directories

In [43]:
class ConfigurationManager:
    """
    Loads the project's YAML files and builds stage-specific configuration objects.

    Attributes:
        config: Contents of the main configuration file.
        params: Contents of the parameters file (one section per model type).
        schema: Contents of the schema file.
        SUPPORTED_MODEL_TYPES (tuple): Model type names that can be requested
            from ``get_model_trainer_config``.
    """

    # Names of the parameter sections that the model trainer knows how to use.
    SUPPORTED_MODEL_TYPES = ("ElasticNet", "RandomForest")

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,    # Path to the main configuration YAML file
                 params_filepath=PARAMS_FILE_PATH,    # Path to the parameters YAML file
                 schema_filepath=SCHEMA_FILE_PATH):   # Path to the schema YAML file
        """
        Reads the three YAML files and creates the artifacts root directory.

        Args:
            config_filepath (Path): Path to the main configuration YAML file.
            params_filepath (Path): Path to the parameters YAML file.
            schema_filepath (Path): Path to the schema YAML file.
        """
        # NOTE(review): read_yaml is assumed to return a mapping that also
        # supports attribute access (e.g. `config.artifacts_root`) - confirm.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # YAML loads a bare `None` as the string 'None'; turn it into a real
        # Python None in every parameter section, not only RandomForest.
        self._normalize_none_strings(self.params)

        # Create root directory for storing all artifacts, as specified in the configuration
        create_directories([self.config.artifacts_root])

    @staticmethod
    def _normalize_none_strings(params) -> None:
        """Replace the literal string 'None' with Python None in each parameter section, in place."""
        for section_name in list(params.keys()):
            section = params[section_name]
            if not hasattr(section, "items"):
                # Top-level scalars are left untouched.
                continue
            # Collect the keys first so the section is not modified while it is iterated.
            none_keys = [key for key, value in section.items() if value == 'None']
            for key in none_keys:
                section[key] = None

    def get_model_trainer_config(self, chosen_model_type="RandomForest") -> ModelTrainerConfig:
        """
        Builds the model trainer configuration for the chosen model type.

        Also creates the model trainer's root directory as a side effect.

        Args:
            chosen_model_type (str): The desired model type (either "ElasticNet" or "RandomForest").

        Returns:
            ModelTrainerConfig: A dataclass instance containing the model trainer configuration.

        Raises:
            ValueError: If ``chosen_model_type`` is not a supported model type.
        """
        trainer_section = self.config.model_trainer

        if chosen_model_type not in self.SUPPORTED_MODEL_TYPES:
            raise ValueError(f"Unsupported model type: {chosen_model_type}")
        # Section names in the parameters file match the model type names.
        model_params = self.params[chosen_model_type]

        target_column = self.schema.TARGET_COLUMN

        # Create the directory where model training artifacts will be stored
        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            target_column=target_column.name,
            model_type=chosen_model_type,
            model_params=model_params
        )



Components

In [37]:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
import joblib
import pandas as pd
import os
from pixi_hr import logger
from sklearn.ensemble import RandomForestRegressor


In [44]:
class ModelTrainer:
    """
    Trains a regression model on the prepared train/test CSV files and saves it.

    Attributes:
        config (ModelTrainerConfig): Configuration for the model training.

    Methods:
        load_data: Load the training and testing datasets.
        preprocess_data: Select the 'qual_' feature columns and the target column.
        scale_features: Scale the features using StandardScaler.
        print_data: Print the first rows of the current train/test features.
        train_model: Fit a model on the training data.
        save_model: Save the trained model to the configured directory.
        main: Load, preprocess, build, train and save.

    Note:
        ``main`` does not call ``scale_features``; the saved model is fitted on
        the unscaled features. Scaling is only applied when called explicitly.
    """

    def __init__(self, config: ModelTrainerConfig):
        """
        Stores the configuration; no data is read until ``load_data`` is called.

        Args:
            config (ModelTrainerConfig): Configuration for model training.
        """
        self.config = config

    def load_data(self):
        """Read the training and test CSV files into ``train_data`` and ``test_data``."""
        self.train_data = pd.read_csv(self.config.train_data_path)
        self.test_data = pd.read_csv(self.config.test_data_path)

    def print_data(self):
        """
        Print the first rows of the current train and test features.

        Requires ``preprocess_data`` to have run. The headings say "Scaled",
        but the values are only scaled if ``scale_features`` was called first.
        """
        print("Scaled Train Features")
        print(pd.DataFrame(self.train_x).head())
        print("Scaled Test Features")
        print(pd.DataFrame(self.test_x).head())

    def preprocess_data(self):
        """Split the loaded data into features ('qual_' columns) and target."""
        # Keep only the qualification features, i.e. columns prefixed with 'qual_'.
        # The list is derived from the training data and reused for the test
        # data so both feature sets have the same columns in the same order.
        qualification_columns = [col for col in self.train_data.columns if col.startswith('qual_')]

        self.train_x = self.train_data[qualification_columns]
        self.train_y = self.train_data[self.config.target_column]

        self.test_x = self.test_data[qualification_columns]
        self.test_y = self.test_data[self.config.target_column]

    def scale_features(self):
        """Standardize the features; the scaler is fitted on the training features only."""
        scaler = StandardScaler()
        self.train_x = scaler.fit_transform(self.train_x)
        self.test_x = scaler.transform(self.test_x)

    def train_model(self, model):
        """
        Fit the model on the current training features and target.

        Args:
            model (model instance): Machine learning model to be trained.

        Returns:
            model (model instance): The same model, fitted.
        """
        model.fit(self.train_x, self.train_y)
        return model

    def save_model(self, model):
        """
        Save the trained model under ``root_dir/model_name``.

        Args:
            model (model instance): Trained machine learning model to save.

        Raises:
            Exception: Any error raised while saving is logged and re-raised.
        """
        try:
            model_path = os.path.join(self.config.root_dir, self.config.model_name)
            joblib.dump(model, model_path)
            logger.info(f"Model saved successfully at {model_path}")
        except Exception as e:
            logger.error(f"Error saving the model: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def _build_model(self):
        """Create the unfitted estimator selected by ``config.model_type``."""
        params = self.config.model_params

        if self.config.model_type == "ElasticNet":
            return ElasticNet(
                alpha=params["alpha"],
                l1_ratio=params["l1_ratio"],
                # Use a configured seed when present; 44 remains the default.
                random_state=params.get("random_state", 44)
            )

        if self.config.model_type == "RandomForest":
            max_features = params["max_features"]
            if max_features == "auto":
                # 'auto' is deprecated (and later removed) in scikit-learn; for a
                # regressor it meant "all features", which 1.0 expresses directly.
                max_features = 1.0
            return RandomForestRegressor(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                min_samples_split=params["min_samples_split"],
                min_samples_leaf=params["min_samples_leaf"],
                max_features=max_features,
                random_state=params["random_state"]
            )

        raise ValueError(f"Unsupported model type: {self.config.model_type}")

    def main(self):
        """
        Run the training process: load, preprocess, build, train, save.

        Raises:
            ValueError: If ``config.model_type`` is not a supported model type.
        """
        logger.info("Loading data...")
        self.load_data()

        logger.info("Preprocessing data...")
        self.preprocess_data()

        # Depending on the model type from the configuration, initialize the appropriate model
        model = self._build_model()

        trained_model = self.train_model(model)

        logger.info("Saving the trained model...")
        self.save_model(trained_model)




Troubleshooting the scaling process and hyperparameter tuning

In [16]:
# Build the trainer with the default model type, then run the data steps by hand
# (instead of model_trainer.main()) so the scaled features can be inspected.
config_manager = ConfigurationManager()
model_trainer_config = config_manager.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.load_data()
model_trainer.preprocess_data()
# Scaling must come after preprocess_data, which creates train_x / test_x.
model_trainer.scale_features()
# Show the first rows of the scaled train and test features.
model_trainer.print_data()

[2023-08-26 13:40:58,310: 41: pixi_hr_project_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-08-26 13:40:58,312: 41: pixi_hr_project_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-08-26 13:40:58,314: 41: pixi_hr_project_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-08-26 13:40:58,314: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts]
[2023-08-26 13:40:58,315: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts/model_trainer]
Scaled Train Features
        0         1         2         3         4         5         6    \
0 -0.132453 -0.172005 -0.107833 -0.465600 -0.318223 -0.107833 -0.204734   
1 -0.132453 -0.172005 -0.107833 -0.465600 -0.318223 -0.107833 -0.204734   
2 -0.132453 -0.172005 -0.107833  2.147767 -0.318223 -0.107833 -0.204734   
3 -0.132453 -0.172005 -0.107833 -0.465600 -0.318223 -0.107833 -0.204734   
4 -0.132453 -0.172005 -0.107833 -0.4656

In [17]:
# Rebuild the trainer and the scaled features so this cell runs on its own.
config_manager = ConfigurationManager()
model_trainer_config = config_manager.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.load_data()
model_trainer.preprocess_data()
model_trainer.scale_features()
model_trainer.print_data()

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# 1. Define the parameter distributions to sample from:
param_dist = {
    'alpha': uniform(loc=0, scale=4), # You might want to refine the scale based on domain knowledge
    'l1_ratio': uniform(loc=0, scale=1)
}

# 2. Setup the Randomized Search:
elastic_net = ElasticNet(random_state=44)
random_search = RandomizedSearchCV(
    elastic_net,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=1,
    n_jobs=-1,
    # Fixed seed so the sampled candidates, and therefore the reported best
    # parameters, are the same on every run.
    random_state=44,
    # scoring is left at its default: the estimator's own score method
    # (R^2 for ElasticNet).
)

# 3. Fit the Randomized Search on the scaled training features:
random_search.fit(model_trainer.train_x, model_trainer.train_y)

# 4. Inspect Results:
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

[2023-08-26 13:41:16,800: 41: pixi_hr_project_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-08-26 13:41:16,804: 41: pixi_hr_project_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-08-26 13:41:16,806: 41: pixi_hr_project_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-08-26 13:41:16,806: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts]
[2023-08-26 13:41:16,807: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts/model_trainer]
Scaled Train Features
        0         1         2         3         4         5         6    \
0 -0.132453 -0.172005 -0.107833 -0.465600 -0.318223 -0.107833 -0.204734   
1 -0.132453 -0.172005 -0.107833 -0.465600 -0.318223 -0.107833 -0.204734   
2 -0.132453 -0.172005 -0.107833  2.147767 -0.318223 -0.107833 -0.204734   
3 -0.132453 -0.172005 -0.107833 -0.465600 -0.318223 -0.107833 -0.204734   
4 -0.132453 -0.172005 -0.107833 -0.4656

Pipeline

In [26]:
from pixi_hr import logger
from pixi_hr.config.configuration import ConfigurationManager
from pixi_hr.components.model_trainer import ModelTrainer

In [45]:
class ModelTrainerPipeline:
    """
    Runs the model training stage from start to finish.

    Steps:
    1. Create the configuration manager (done once, in the constructor).
    2. Ask it for the model training configuration.
    3. Build a ModelTrainer from that configuration.
    4. Run the trainer.

    Attributes:
    - STAGE_NAME (str): Name of the stage (used for logging purposes).
    - config_manager (ConfigurationManager): Source of the stage configuration.

    Methods:
    - main(): Runs steps 2-4 and logs the start and end of the pipeline.
    """

    STAGE_NAME = "Model Training Stage"

    def __init__(self):
        """Create the configuration manager used by ``main``."""
        self.config_manager = ConfigurationManager()

    def main(self):
        """Fetch the training configuration, then build and run the trainer."""
        logger.info("Starting the Model Training Pipeline")

        trainer_config = self.config_manager.get_model_trainer_config()
        ModelTrainer(config=trainer_config).main()

        logger.info("Model Training Pipeline Completed Successfully.")

# Entry point: run the model training stage and log its start and end.
if __name__ == '__main__':
    try:
        logger.info(f">>>>>> Stage: {ModelTrainerPipeline.STAGE_NAME} started <<<<<<")
        model_trainer_pipeline = ModelTrainerPipeline()
        model_trainer_pipeline.main()
        logger.info(f">>>>>> Stage {ModelTrainerPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
    except Exception as e:
        # Log the failure with its traceback, then re-raise so the error is not swallowed.
        logger.exception(f"Error encountered during the {ModelTrainerPipeline.STAGE_NAME}: {e}")
        raise


[2023-08-26 19:17:13,968: 48: pixi_hr_project_logger: INFO: 3263865135:  >>>>>> Stage: Model Training Stage started <<<<<<]
[2023-08-26 19:17:13,976: 41: pixi_hr_project_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-08-26 19:17:13,978: 41: pixi_hr_project_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-08-26 19:17:13,980: 41: pixi_hr_project_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-08-26 19:17:13,981: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts]
[2023-08-26 19:17:13,982: 33: pixi_hr_project_logger: INFO: 3263865135:  Starting the Model Training Pipeline]
[2023-08-26 19:17:13,982: 64: pixi_hr_project_logger: INFO: common:  Created directory at: artifacts/model_trainer]
[2023-08-26 19:17:13,983: 89: pixi_hr_project_logger: INFO: 4289294302:  Loading data...]
[2023-08-26 19:17:14,033: 92: pixi_hr_project_logger: INFO: 4289294302:  Preprocessing data...]
[2023-08-26 19:

  warn(
