In [1]:
import os

In [2]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [5]:
# Import necessary libraries
import os
from pathlib import Path
import logging
import yaml
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from Dental_Implant_Sandblasting import logger
import json

# Define ModelTrainerConfig dataclass
from dataclasses import dataclass

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path_sa: Path
    test_data_path_sa: Path
    train_data_path_cv: Path
    test_data_path_cv: Path
    test_size: float
    random_state: int
    models: dict
    param_grids: dict
    target_column: str
    cv: int
    scoring: str
    model_path: Path
    imputation_strategy: str
    scaling_method: str
    sa_model_name: str
    cv_model_name: str

# Define ConfigurationManager class
class ConfigurationManager:
    def __init__(self, config_path="config/config.yaml", params_path="params.yaml"):
        self.config = self.read_yaml(config_path)
        self.params = self.read_yaml(params_path)
        self.create_directories([self.config['artifacts_root']])
    
    @staticmethod
    def read_yaml(path_to_yaml: str) -> dict:
        with open(path_to_yaml) as yaml_file:
            content = yaml.safe_load(yaml_file)
        logger.info(f"yaml file: {path_to_yaml} loaded successfully")
        return content

    @staticmethod
    def create_directories(paths: list):
        for path in paths:
            os.makedirs(path, exist_ok=True)
            logger.info(f"created directory at: {path}")
    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config['model_trainer']
        params = self.params

        def convert_to_dict(d):
            return {k: list(v) if isinstance(v, (list, tuple)) else v for k, v in d.items()}

        model_trainer_config = ModelTrainerConfig(
            root_dir=Path(config['root_dir']),
            train_data_path_sa=Path(config['train_data_path_sa']),
            test_data_path_sa=Path(config['test_data_path_sa']),
            train_data_path_cv=Path(config['train_data_path_cv']),
            test_data_path_cv=Path(config['test_data_path_cv']),
            test_size=params['data_transformation']['test_size'],
            random_state=params['data_transformation']['random_state'],
            models=params['model_training']['models'],
            param_grids={key: convert_to_dict(value['param_grid']) for key, value in params['hyperparameter_tuning'].items() if isinstance(value, dict) and 'param_grid' in value},
            target_column=params['model_training']['target_column'],
            cv=params['hyperparameter_tuning']['cv'],
            scoring=params['hyperparameter_tuning']['scoring'],
            model_path=Path(config['model_path']),
            imputation_strategy=params['data_preprocessing']['imputation_strategy'],
            scaling_method=params['data_transformation']['scaling_method'],
            sa_model_name=config['sa_model_name'],
            cv_model_name=config['cv_model_name']
        )
        return model_trainer_config

# Define ModelTrainer class
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def load_data(self):
        try:
            # Load the data for Surface Roughness (Sa)
            train_data_sa = pd.read_csv(self.config.train_data_path_sa)
            test_data_sa = pd.read_csv(self.config.test_data_path_sa)

            X_train_sa = train_data_sa.drop(columns=[self.config.target_column])
            y_train_sa = train_data_sa[self.config.target_column]

            X_test_sa = test_data_sa.drop(columns=[self.config.target_column])
            y_test_sa = test_data_sa[self.config.target_column]

            # Load the data for Cell Viability (CV)
            train_data_cv = pd.read_csv(self.config.train_data_path_cv)
            test_data_cv = pd.read_csv(self.config.test_data_path_cv)

            X_train_cv = train_data_cv.drop(columns=[self.config.target_column])
            y_train_cv = train_data_cv[self.config.target_column]

            X_test_cv = test_data_cv.drop(columns=[self.config.target_column])
            y_test_cv = test_data_cv[self.config.target_column]

            return (X_train_sa, y_train_sa, X_test_sa, y_test_sa), (X_train_cv, y_train_cv, X_test_cv, y_test_cv)
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise e

    def preprocess_data(self, X_train, y_train, X_test, y_test):
        try:
            # Imputation strategy
            imputer = SimpleImputer(strategy=self.config.imputation_strategy)
            X_train = imputer.fit_transform(X_train)
            X_test = imputer.transform(X_test)

            # Scaling
            if self.config.scaling_method == "StandardScaler":
                scaler = StandardScaler()
            else:
                raise ValueError(f"Unknown scaling method: {self.config.scaling_method}")

            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            return X_train_scaled, y_train, X_test_scaled, y_test
        except PermissionError as e:
            logger.error(f"Permission denied: {e}")
            raise e
        except Exception as e:
            logger.error(f"Error preprocessing data: {e}")
            raise e

    def evaluate_models(self, X_train, y_train, X_test, y_test):
        try:
            models = self.config.models
            performance_metrics = {}

            for model_name, model_params in models.items():
                logger.info(f"Training {model_name}...")
                if model_name == "ridge":
                    model = Ridge(**model_params)
                elif model_name == "elasticnet":
                    model = ElasticNet(**model_params)
                elif model_name == "bayesian_ridge":
                    model = BayesianRidge(**model_params)
                elif model_name == "huber_regressor":
                    model = HuberRegressor(**model_params)
                elif model_name == "random_forest":
                    model = RandomForestRegressor(**model_params)
                elif model_name == "gradient_boosting":
                    model = GradientBoostingRegressor(**model_params)
                else:
                    raise ValueError(f"Unknown model: {model_name}")

                model.fit(X_train, y_train)
                y_pred_train = model.predict(X_train)

                # Evaluate model performance
                mae = mean_absolute_error(y_train, y_pred_train)
                rmse = mean_squared_error(y_train, y_pred_train, squared=False)
                r2 = r2_score(y_train, y_pred_train)

                performance_metrics[model_name] = {
                    "MAE": mae,
                    "RMSE": rmse,
                    "R2": r2
                }
                logger.info(f"{model_name} - MAE: {mae}")

            # Evaluate on the test set
            for model_name, model in models.items():
                model.fit(X_train, y_train)
                y_pred_test = model.predict(X_test)

                mae_test = mean_absolute_error(y_test, y_pred_test)
                rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
                r2_test = r2_score(y_test, y_pred_test)

                performance_metrics[model_name].update({
                    "Test MAE": mae_test,
                    "Test RMSE": rmse_test,
                    "Test R2": r2_test
                })

            # Visualization: Performance Metrics
            self.visualize_performance(performance_metrics, test=True)
            return performance_metrics
        except Exception as e:
            logger.error(f"Error evaluating models: {e}")
            raise e

    def hyperparameter_tuning(self, X_train, y_train):
        try:
            best_models = {}

            for model_name, param_grid in self.config.param_grids.items():
                if model_name == "ridge":
                    model = Ridge()
                elif model_name == "elasticnet":
                    model = ElasticNet()
                elif model_name == "huber_regressor":
                    model = HuberRegressor()
                elif model_name == "random_forest":
                    model = RandomForestRegressor()
                elif model_name == "gradient_boosting":
                    model = GradientBoostingRegressor()
                else:
                    raise ValueError(f"Unknown model: {model_name}")

                grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=self.config.cv, scoring=self.config.scoring, n_jobs=-1)
                logger.info(f"Tuning {model_name}...")
                grid_search.fit(X_train, y_train)
                best_models[model_name] = grid_search.best_estimator_
                logger.info(f"Best parameters for {model_name}: {grid_search.best_params_}")

            return best_models
        except Exception as e:
            logger.error(f"Error during hyperparameter tuning: {e}")
            raise e

    def evaluate_best_models(self, best_models, X_test, y_test):
        try:
            performance_metrics = {}

            for model_name, model in best_models.items():
                y_pred_test = model.predict(X_test)

                # Calculate performance metrics
                mae = mean_absolute_error(y_test, y_pred_test)
                rmse = mean_squared_error(y_test, y_pred_test, squared=False)
                r2 = r2_score(y_test, y_pred_test)

                # Store performance metrics
                performance_metrics[model_name] = {
                    "Test MAE": mae,
                    "Test RMSE": rmse,
                    "Test R2": r2
                }

                logger.info(f"{model_name} - Test MAE: {mae}, RMSE: {rmse}, R2: {r2}")

            # Visualize performance for the test set
            self.visualize_performance(performance_metrics, test=True)
            return performance_metrics

        except Exception as e:
            logger.error(f"Error evaluating best models: {e}")
            raise e

    def save_models(self, best_models):
        try:
            for model_name, model in best_models.items():
                model_save_path = self.config.model_path / f"{model_name}.joblib"
                model_save_path.parent.mkdir(parents=True, exist_ok=True)
                joblib.dump(model, model_save_path)
                logger.info(f"Saved {model_name} model at: {model_save_path}")

            # Save the best models for 'sa' and 'cv' with the expected filenames
            best_model_sa = best_models.get(self.config.sa_model_name)
            best_model_cv = best_models.get(self.config.cv_model_name)

            if best_model_sa:
                joblib.dump(best_model_sa, self.config.model_path / 'best_model_sa.joblib')
                logger.info("Saved best model for Surface Roughness (Sa) at: artifacts/model_trainer/models/best_model_sa.joblib")

            if best_model_cv:
                joblib.dump(best_model_cv, self.config.model_path / 'best_model_cv.joblib')
                logger.info("Saved best model for Cell Viability (CV) at: artifacts/model_trainer/models/best_model_cv.joblib")

        except Exception as e:
            logger.error(f"Error saving models: {e}")
            raise e

    def visualize_performance(self, performance_metrics, test=False):
        try:
            metric_df = pd.DataFrame(performance_metrics).T
            metric_df = metric_df[['Test MAE', 'Test RMSE', 'Test R2']]

            if test:
                title = 'Test Set Performance'
            else:
                title = 'Training Set Performance'

            plt.figure(figsize=(12, 8))
            sns.barplot(data=metric_df.reset_index().melt(id_vars='index'), x='index', y='value', hue='variable', palette="Set2")
            plt.title(title)
            plt.xlabel('Models')
            plt.ylabel('Score')
            plt.legend(loc='upper right')
            plt.show()
        except Exception as e:
            logger.error(f"Error visualizing performance: {e}")
            raise e

    def execute(self):
        try:
            # Load data
            (X_train_sa, y_train_sa, X_test_sa, y_test_sa), (X_train_cv, y_train_cv, X_test_cv, y_test_cv) = self.load_data()

            # Preprocess data
            X_train_sa_scaled, y_train_sa, X_test_sa_scaled, y_test_sa = self.preprocess_data(X_train_sa, y_train_sa, X_test_sa, y_test_sa)
            X_train_cv_scaled, y_train_cv, X_test_cv_scaled, y_test_cv = self.preprocess_data(X_train_cv, y_train_cv, X_test_cv, y_test_cv)

            # Evaluate initial models for Sa
            model_performance_sa = self.evaluate_models(X_train_sa_scaled, y_train_sa, X_test_sa_scaled, y_test_sa)

            # Evaluate initial models for CV
            model_performance_cv = self.evaluate_models(X_train_cv_scaled, y_train_cv, X_test_cv_scaled, y_test_cv)

            # Hyperparameter tuning for Sa
            best_models_sa = self.hyperparameter_tuning(X_train_sa_scaled, y_train_sa)

            # Hyperparameter tuning for CV
            best_models_cv = self.hyperparameter_tuning(X_train_cv_scaled, y_train_cv)

            # Evaluate the best models for Sa
            performance_metrics_sa = self.evaluate_best_models(best_models_sa, X_test_sa_scaled, y_test_sa)

            # Evaluate the best models for CV
            performance_metrics_cv = self.evaluate_best_models(best_models_cv, X_test_cv_scaled, y_test_cv)

            # Save the best models
            self.save_models(best_models_sa)
            self.save_models(best_models_cv)

            # Log final results
            logger.info("Model training and evaluation completed successfully.")

        except Exception as e:
            logger.exception(f"Error during model training execution: {e}")
            raise e

# Pipeline execution
if __name__ == "__main__":
    try:
        config = ConfigurationManager()
        model_trainer_config = config.get_model_trainer_config()
        model_trainer = ModelTrainer(config=model_trainer_config)
        model_trainer.execute()
    except Exception as e:
        logger.exception(e)
        raise e


[2024-08-31 00:43:39,965: INFO: 244941671: yaml file: config/config.yaml loaded successfully]
[2024-08-31 00:43:40,007: INFO: 244941671: yaml file: params.yaml loaded successfully]
[2024-08-31 00:43:40,009: INFO: 244941671: created directory at: artifacts]
[2024-08-31 00:43:40,064: ERROR: 244941671: Error loading data: "['Result (1=Passed, 0=Failed)'] not found in axis"]
[2024-08-31 00:43:40,067: ERROR: 244941671: Error during model training execution: "['Result (1=Passed, 0=Failed)'] not found in axis"]
Traceback (most recent call last):
  File "C:\Users\Farshid Hesami\AppData\Local\Temp\ipykernel_16184\244941671.py", line 312, in execute
    (X_train_sa, y_train_sa, X_test_sa, y_test_sa), (X_train_cv, y_train_cv, X_test_cv, y_test_cv) = self.load_data()
  File "C:\Users\Farshid Hesami\AppData\Local\Temp\ipykernel_16184\244941671.py", line 121, in load_data
    raise e
  File "C:\Users\Farshid Hesami\AppData\Local\Temp\ipykernel_16184\244941671.py", line 102, in load_data
    X_train_

KeyError: "['Result (1=Passed, 0=Failed)'] not found in axis"

: 