In [1]:
import os

In [2]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from dataclasses import dataclass
from pathlib import Path
from Dental_Implant_Sandblasting import logger
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from sklearn.linear_model import Ridge, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Define ModelTrainerConfig dataclass
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the config, params and schema YAML files and handed to
    ``ModelTrainer``.
    """

    # Directory the trainer writes its ``.joblib`` model files into.
    root_dir: Path
    # CSV file read with ``pd.read_csv`` to obtain the training rows.
    train_data_path: Path
    # CSV file read with ``pd.read_csv`` to obtain the held-out test rows.
    test_data_path: Path
    # Taken from ``model_training.test_size`` in the params file; not read by
    # ModelTrainer in this file (train/test CSVs are already separate).
    test_size: float
    # Seed passed to RandomForest, GradientBoosting and XGBRegressor.
    random_state: int
    # Taken from ``model_training.models`` in the params file; not read by
    # ModelTrainer in this file.
    models: dict
    # ``hyperparameter_tuning`` section of the params file; ModelTrainer looks
    # up each grid by the lower-cased model name.
    param_grids: dict
    # ElasticNet ``alpha`` (from ``model_training.elasticnet.alpha``).
    alpha: float
    # ElasticNet ``l1_ratio`` (from ``model_training.elasticnet.l1_ratio``).
    l1_ratio: float
    # Keys of ``TARGET_COLUMNS`` in the schema file; these columns are split
    # off from the CSVs as the regression targets.
    target_columns: list

# Define ConfigurationManager class
class ConfigurationManager:
    """Load the project YAML files and build per-stage configuration objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` and create its output directory.

        Returns:
            A ``ModelTrainerConfig`` populated from the ``model_trainer``
            config section, the ``model_training`` and
            ``hyperparameter_tuning`` params sections, and the schema's
            ``TARGET_COLUMNS`` keys.
        """
        # Resolve every section up front so a missing one fails before any
        # directory is created.
        trainer_section = self.config.model_trainer
        training_params = self.params.model_training
        tuning_grids = self.params.hyperparameter_tuning
        targets = [*self.schema.TARGET_COLUMNS.keys()]

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=Path(trainer_section.root_dir),
            train_data_path=Path(trainer_section.train_data_path),
            test_data_path=Path(trainer_section.test_data_path),
            test_size=training_params['test_size'],
            random_state=training_params['random_state'],
            models=training_params['models'],
            param_grids=tuning_grids,
            alpha=training_params['elasticnet']['alpha'],
            l1_ratio=training_params['elasticnet']['l1_ratio'],
            target_columns=targets,
        )

# Define ModelTrainer class
class ModelTrainer:
    """Cross-validate, tune, evaluate and persist regressors per target column.

    For every column in ``config.target_columns`` the trainer:

    1. reports a 5-fold cross-validated MAE for each default model,
    2. grid-searches the tunable models,
    3. scores the tuned models on the test CSV (MAE / RMSE / R2),
    4. saves the tuned models under ``config.root_dir``,

    and finally draws one bar-chart figure per target from the test metrics.
    """

    # Models passed through GridSearchCV, in tuning order. BayesianRidge is
    # cross-validated in ``evaluate_models`` but is not tuned.
    _TUNED_MODEL_NAMES = (
        "RandomForest",
        "GradientBoosting",
        "Ridge",
        "ElasticNet",
        "HuberRegressor",
        "SVR",
        "XGBRegressor",
    )

    # (metric key prefix, y-axis label) for each panel of the summary figure.
    _PLOT_PANELS = (
        ("MAE", "Mean Absolute Error"),
        ("RMSE", "Root Mean Squared Error"),
        ("R2", "R-Squared"),
    )

    def __init__(self, config: "ModelTrainerConfig"):
        """Instantiate the candidate models from ``config``.

        Args:
            config: Training settings; ``alpha``/``l1_ratio`` configure
                ElasticNet and ``random_state`` seeds the tree-based models.
        """
        self.config = config
        self.models = {
            "Ridge": Ridge(),
            "ElasticNet": ElasticNet(alpha=self.config.alpha, l1_ratio=self.config.l1_ratio),
            "BayesianRidge": BayesianRidge(),
            "HuberRegressor": HuberRegressor(),
            "RandomForest": RandomForestRegressor(random_state=self.config.random_state),
            "GradientBoosting": GradientBoostingRegressor(random_state=self.config.random_state),
            "SVR": SVR(),
            "XGBRegressor": XGBRegressor(random_state=self.config.random_state)
        }
        self.param_grids = self.config.param_grids

    def load_data(self):
        """Read the train/test CSVs and split features from targets.

        Returns:
            Tuple ``(X_train, y_train, X_test, y_test)`` of DataFrames, where
            the ``y_*`` frames hold exactly ``config.target_columns`` and the
            ``X_*`` frames hold every other column.
        """
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        X_train = train_data.drop(columns=self.config.target_columns)
        y_train = train_data[self.config.target_columns]

        X_test = test_data.drop(columns=self.config.target_columns)
        y_test = test_data[self.config.target_columns]

        return X_train, y_train, X_test, y_test

    def evaluate_models(self, X_train, y_train, target_column):
        """Report the 5-fold cross-validated MAE of every default model.

        Args:
            X_train: Training features.
            y_train: Training targets; only ``target_column`` is used.
            target_column: Name of the target to fit against.

        Returns:
            ``{model_name: {"MAE (<target_column>)": mae}}``.
        """
        model_performance = {}

        for model_name, model in self.models.items():
            print(f"Training {model_name} for {target_column}...")
            cv_scores = cross_val_score(model, X_train, y_train[target_column], cv=5, scoring='neg_mean_absolute_error')
            # The scorer returns negated MAE, so flip the sign back.
            mae = -cv_scores.mean()

            model_performance[model_name] = {f"MAE ({target_column})": mae}

            print(f"{model_name} - MAE ({target_column}): {mae:.4f}")

        return model_performance

    def hyperparameter_tuning(self, X_train, y_train, target_column):
        """Grid-search each tunable model for one target column.

        The parameter grid for a model is looked up in ``self.param_grids``
        under the lower-cased model name.

        Args:
            X_train: Training features.
            y_train: Training targets; only ``target_column`` is used.
            target_column: Name of the target to fit against.

        Returns:
            ``{model_name: best_estimator}`` for every tuned model.
        """
        best_models = {}

        for model_name in self._TUNED_MODEL_NAMES:
            grid_search = GridSearchCV(
                estimator=self.models[model_name],
                param_grid=self.param_grids[model_name.lower()],
                cv=5,
                scoring='neg_mean_absolute_error',
                n_jobs=-1,
                verbose=2
            )
            print(f"Tuning {model_name} for {target_column}...")
            grid_search.fit(X_train, y_train[target_column])
            best_models[model_name] = grid_search.best_estimator_
            print(f"Best parameters for {model_name} ({target_column}): {grid_search.best_params_}")

        return best_models

    def evaluate_best_models(self, best_models, X_test, y_test, target_column):
        """Score tuned models on the test split.

        Args:
            best_models: ``{model_name: fitted_estimator}``.
            X_test: Test features.
            y_test: Test targets; only ``target_column`` is used.
            target_column: Name of the target being predicted.

        Returns:
            ``{model_name: {"MAE (<t>)": ..., "RMSE (<t>)": ..., "R2 (<t>)": ...}}``
            where ``<t>`` is ``target_column``.
        """
        performance_metrics = {}
        y_true = y_test[target_column]

        for model_name, model in best_models.items():
            y_pred = model.predict(X_test)
            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            r2 = r2_score(y_true, y_pred)

            performance_metrics[model_name] = {
                f"MAE ({target_column})": mae,
                f"RMSE ({target_column})": rmse,
                f"R2 ({target_column})": r2,
            }

            print(f"{model_name} - Test MAE ({target_column}): {mae:.4f}")
            print(f"{model_name} - Test RMSE ({target_column}): {rmse:.4f}")
            print(f"{model_name} - Test R2 ({target_column}): {r2:.4f}")

        return performance_metrics

    def _model_path(self, model_name, target_column=None):
        """Return the ``.joblib`` path for a model, optionally per target.

        Without ``target_column`` the path is ``root_dir/<model_name>.joblib``.
        With it, the target name is appended after replacing every character
        that is not alphanumeric, ``-``, ``_`` or ``.`` with ``_`` so that
        names containing spaces, slashes, etc. still form a single file name.
        """
        if target_column is None:
            return self.config.root_dir / f"{model_name}.joblib"

        safe_target = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in str(target_column)
        )
        return self.config.root_dir / f"{model_name}_{safe_target}.joblib"

    def save_best_models(self, best_models, target_column=None):
        """Persist each tuned model with joblib under ``config.root_dir``.

        Args:
            best_models: ``{model_name: fitted_estimator}``.
            target_column: Optional target the models were fitted for. When
                given, it is included in the file name so that models for
                different targets do not overwrite one another; when omitted
                the file is ``<model_name>.joblib``.
        """
        for model_name, model in best_models.items():
            model_path = self._model_path(model_name, target_column)
            joblib.dump(model, model_path)
            logger.info(f"Saved best model {model_name} to {model_path}")

    def _plot_best_model_metrics(self, target_column, performance_metrics):
        """Draw MAE / RMSE / R2 bar charts of the tuned models for one target.

        Args:
            target_column: Target the metrics belong to.
            performance_metrics: Output of ``evaluate_best_models``.
        """
        # Rows become model names, columns become the metric keys.
        metrics_df = pd.DataFrame(performance_metrics).T
        model_names = list(metrics_df.index)

        plt.figure(figsize=(14, 6))
        for position, (metric, axis_label) in enumerate(self._PLOT_PANELS, start=1):
            plt.subplot(1, len(self._PLOT_PANELS), position)
            sns.barplot(x=model_names, y=metrics_df[f'{metric} ({target_column})'].tolist())
            plt.title(f'{metric} for Best Models ({target_column})')
            plt.ylabel(axis_label)
            plt.xlabel('Model')

        plt.tight_layout()
        plt.show()

    def execute(self):
        """Run the full train / tune / evaluate / save / plot workflow."""
        X_train, y_train, X_test, y_test = self.load_data()
        tuned_performance = {}

        for target_column in self.config.target_columns:
            print(f"\nEvaluating models for target column: {target_column}")
            # Baseline cross-validated MAE; results are printed by the method.
            self.evaluate_models(X_train, y_train, target_column)

            best_models = self.hyperparameter_tuning(X_train, y_train, target_column)
            performance_metrics = self.evaluate_best_models(best_models, X_test, y_test, target_column)
            # Kept for the plots below: only these test metrics contain the
            # RMSE and R2 columns the figure needs.
            tuned_performance[target_column] = performance_metrics

            best_hyperparameters = {model_name: model.get_params() for model_name, model in best_models.items()}
            print(f"\nBest Hyperparameters for {target_column}:\n", best_hyperparameters)
            print(f"\nPerformance Metrics for {target_column}:\n", performance_metrics)

            # Include the target in the file name; otherwise each iteration
            # would overwrite the previous target's models.
            self.save_best_models(best_models, target_column)

        # Visualizations (after all training so blocking figures don't stall it)
        for target_column, performance_metrics in tuned_performance.items():
            self._plot_best_model_metrics(target_column, performance_metrics)

# Pipeline execution: build the configuration, then run the whole training
# workflow. Any failure is logged with its traceback and re-raised so the
# notebook cell / caller still sees the error.
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.execute()
except Exception as e:
    logger.exception(e)
    # Bare ``raise`` re-raises the active exception without adding an extra
    # traceback frame for this line.
    raise
