In [1]:
import os

In [2]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
# Step from the notebook folder (research/) up to the project root so the
# relative config and artifact paths resolve. The guard keeps the cell
# idempotent: re-running it must not climb one more directory each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [5]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from dataclasses import dataclass
from pathlib import Path
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories, save_json
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, median_absolute_error
from urllib.parse import urlparse
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import yaml
import logging

# Initialize logger.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so if an imported package configured logging first, that
# package's format/level stays in effect.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every class and by the pipeline cell below.
logger = logging.getLogger(__name__)

# Define ModelEvaluationConfig dataclass
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings consumed by ``ModelEvaluation``.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    from the ``model_evaluation`` section of the config file.
    """

    # Taken from the ``root_dir`` key of the ``model_evaluation`` section.
    root_dir: Path
    # CSV file read with ``pd.read_csv`` to obtain the test set.
    test_data_path: Path
    # Used as a directory: model file names are joined onto it with ``/``.
    model_path: Path
    # Mapping of settings supplied by ConfigurationManager.
    all_params: dict
    # Destination handed to ``save_json`` for the metric scores.
    metric_file_name: Path
    # Column that ``save_results`` drops from the features and uses as label.
    target_column: str

# Define ConfigurationManager class
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    The three YAML files (config, params, schema) are read once at
    construction time and kept on the instance as ``config``, ``params`` and
    ``schema``.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config['artifacts_root']])
        # Lazy %-formatting: the (large) mappings are only rendered when the
        # INFO level is actually enabled.
        logger.info("Config: %s", self.config)
        logger.info("Params: %s", self.params)
        logger.info("Schema: %s", self.schema)

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the settings object for the model-evaluation stage.

        Returns:
            A ``ModelEvaluationConfig`` filled from the ``model_evaluation``
            section of the config file, with ``all_params`` set to the
            contents of the params file.

        Raises:
            KeyError: If the ``model_evaluation`` section or one of its
                expected keys is missing.
        """
        evaluation_config = self.config['model_evaluation']
        # The stage directory is not created anywhere else in this file;
        # create it here so later writes under it do not fail.
        create_directories([evaluation_config['root_dir']])
        return ModelEvaluationConfig(
            root_dir=Path(evaluation_config['root_dir']),
            test_data_path=Path(evaluation_config['test_data_path']),
            model_path=Path(evaluation_config['model_path']),
            # Was ``self.config``: the parameters loaded from the params file
            # were never passed on, and ``all_params`` duplicated the config.
            all_params=self.params,
            metric_file_name=Path(evaluation_config['metric_file_name']),
            target_column=evaluation_config['target_column'],
        )

# Define ModelEvaluation class
class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config
        self.test_data = None
        self.model_sa = None
        self.model_cv = None
        self.poly = None

    def load_data(self):
        """Read the CSV at ``config.test_data_path`` into ``self.test_data``.

        Raises:
            Exception: Any error raised while reading the file; it is logged
                with its traceback and then re-raised.
        """
        try:
            frame = pd.read_csv(self.config.test_data_path)
        except Exception as err:
            logger.exception("Error loading test data")
            raise err
        else:
            self.test_data = frame
            logger.info("Test data loaded successfully.")

    def load_model(self):
        """Load both regressors and the polynomial transformer from disk.

        All three files are looked up inside ``config.model_path``. Every
        file is checked for existence before anything is loaded, so a missing
        file is reported without loading the others.

        Raises:
            FileNotFoundError: If any of the three files is missing.
            Exception: Any other error raised while loading; it is logged with
                its traceback and re-raised.
        """
        # (attribute to fill, file name, log prefix used when the file is missing)
        artefacts = (
            ('model_sa', 'best_model_sa.joblib', "Model file not found"),
            ('model_cv', 'best_model_cv.joblib', "Model file not found"),
            ('poly', 'poly_features.joblib', "Polynomial features file not found"),
        )
        try:
            model_dir = self.config.model_path
            locations = [(attr, model_dir / file_name, missing_msg)
                         for attr, file_name, missing_msg in artefacts]

            # First pass: verify, in order, that every file is present.
            for _, location, missing_msg in locations:
                if not location.exists():
                    logger.error(f"{missing_msg}: {location}")
                    raise FileNotFoundError(f"No such file: '{location}'")

            # Second pass: load in the same order and attach to the instance.
            for attr, location, _ in locations:
                setattr(self, attr, joblib.load(location))
            logger.info("Models and polynomial features loaded successfully.")
        except Exception as err:
            logger.exception("Error loading models or polynomial features")
            raise err

    def data_exploration(self):
        """Print an overview of the test data and show three EDA plots.

        Prints head, shape, info, dtypes, summary statistics and per-column
        missing-value counts, then shows a correlation heatmap and the
        distributions of the two target columns. Relies on ``display``, which
        is only available inside IPython/Jupyter.

        Raises:
            Exception: Any error raised along the way; it is logged with its
                traceback and re-raised.
        """
        def show_distribution(column, title, x_label):
            # Histogram with a KDE overlay for one column of the test data.
            plt.figure(figsize=(8, 5))
            sns.histplot(self.test_data[column], kde=True, bins=20)
            plt.title(title)
            plt.xlabel(x_label)
            plt.ylabel("Frequency")
            plt.show()

        try:
            logger.info("Performing data exploration.")
            frame = self.test_data

            # --- textual overview ---
            print("First few rows of the dataset:")
            display(frame.head())

            print(f"\nData shape: {frame.shape}")
            print("\nData info:")
            frame.info()

            print("\nData types:")
            print(frame.dtypes)

            print("\nSummary statistics:")
            display(frame.describe(include='all'))

            print("\nMissing values by column:")
            print(frame.isnull().sum())

            # --- plots ---
            plt.figure(figsize=(10, 6))
            correlations = frame.corr()
            sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f")
            plt.title("Correlation Heatmap")
            plt.show()

            show_distribution(
                '(Sa) Average of Surface roughness (micrometer)',
                "Distribution of Surface Roughness (Sa)",
                "Surface Roughness (Sa) (µm)",
            )
            show_distribution(
                'Cell Viability (%)',
                "Distribution of Cell Viability",
                "Cell Viability (%)",
            )
        except Exception as err:
            logger.exception("Error during data exploration")
            raise err

    def eval_metrics(self, actual, pred):
        """Score predictions against the true values with five metrics.

        Args:
            actual: True target values.
            pred: Predicted target values.

        Returns:
            The tuple ``(rmse, mae, r2, mape, medae)``. Each value is whatever
            the corresponding scikit-learn metric returns (RMSE being the
            square root of the mean squared error).
        """
        root_mse = np.sqrt(mean_squared_error(actual, pred))
        # The remaining scorers share one call signature; evaluate them in
        # the order they appear in the returned tuple.
        remaining = (
            mean_absolute_error,
            r2_score,
            mean_absolute_percentage_error,
            median_absolute_error,
        )
        abs_err, r_squared, abs_pct_err, median_abs_err = (
            scorer(actual, pred) for scorer in remaining
        )
        return root_mse, abs_err, r_squared, abs_pct_err, median_abs_err

    def save_results(self):
        """Score ``model.joblib`` on the test CSV and save the metrics as JSON.

        The test data and the model are read from disk here, independently of
        ``load_data``/``load_model``. The scores are written to
        ``config.metric_file_name`` under the keys ``rmse``, ``mae``, ``r2``,
        ``mape`` and ``medae``.

        Raises:
            Exception: Any error raised along the way; it is logged with its
                traceback and re-raised.
        """
        try:
            frame = pd.read_csv(self.config.test_data_path)
            # NOTE(review): load_model() reads best_model_sa.joblib,
            # best_model_cv.joblib and poly_features.joblib, while this method
            # reads 'model.joblib' and predicts without the polynomial
            # transform. Confirm that this file exists and is the intended
            # model.
            estimator = joblib.load(self.config.model_path / 'model.joblib')

            target = self.config.target_column
            features = frame.drop([target], axis=1)
            truth = frame[[target]]

            predictions = estimator.predict(features)
            metric_values = self.eval_metrics(truth, predictions)

            # Saving metrics as local
            metric_names = ("rmse", "mae", "r2", "mape", "medae")
            scores = dict(zip(metric_names, metric_values))
            save_json(path=Path(self.config.metric_file_name), data=scores)
            logger.info("Results saved successfully.")
        except Exception as err:
            logger.exception("Error during saving results")
            raise err

    def evaluate_model(self):
        """Score both models on the loaded test data and show diagnostics.

        Requires ``load_data`` and ``load_model`` to have been called. The
        features are the test columns minus the two targets and the pass/fail
        column, expanded with the loaded polynomial transformer. Surface
        roughness (Sa) is predicted for every row; cell viability is predicted
        only for rows whose predicted Sa lies strictly between 1.5 and 2.5,
        and is left at 0 elsewhere.

        Prints MAE, RMSE, R2, MAPE and MedAE for Sa (and for cell viability on
        the valid rows), a 5-fold cross-validated MAE, and shows residual,
        actual-vs-predicted and learning-curve plots.

        NOTE(review): ``cross_val_score`` and ``learning_curve`` fit the
        estimator on folds of the *test* set, so those figures describe a
        refit on test data, not the saved model.

        Raises:
            Exception: Any error raised along the way; it is logged with its
                traceback and re-raised.
        """
        sa_column = '(Sa) Average of Surface roughness (micrometer)'
        cv_column = 'Cell Viability (%)'
        result_column = 'Result (1=Passed, 0=Failed)'
        try:
            logger.info("Evaluating the model.")
            X_test = self.test_data.drop(columns=[sa_column, cv_column, result_column])
            y_sa_test = self.test_data[sa_column]
            y_cv_test = self.test_data[cv_column]

            X_poly_test = self.poly.transform(X_test)
            y_sa_pred = self.model_sa.predict(X_poly_test)

            valid_indices = (y_sa_pred > 1.5) & (y_sa_pred < 2.5)
            has_valid = bool(np.any(valid_indices))
            # Explicit float buffer: np.zeros_like(y_cv_test) would inherit the
            # column's dtype, so an integer-typed target column would silently
            # truncate the predicted viabilities.
            y_cv_pred = np.zeros(len(y_cv_test), dtype=float)
            if has_valid:
                y_cv_pred[valid_indices] = self.model_cv.predict(X_poly_test[valid_indices])

            # Evaluation metrics for Surface Roughness (Sa)
            rmse_sa, mae_sa, r2_sa, mape_sa, medae_sa = self.eval_metrics(y_sa_test, y_sa_pred)

            print(f"Surface Roughness (Sa) - Test MAE: {mae_sa:.4f}")
            print(f"Surface Roughness (Sa) - Test RMSE: {rmse_sa:.4f}")
            print(f"Surface Roughness (Sa) - Test R2: {r2_sa:.4f}")
            print(f"Surface Roughness (Sa) - Test MAPE: {mape_sa:.4f}")
            print(f"Surface Roughness (Sa) - Test MedAE: {medae_sa:.4f}")

            # Cross-validation scores (the scorer is negated MAE, hence the sign flip)
            cv_scores = cross_val_score(self.model_sa, X_poly_test, y_sa_test, cv=5, scoring='neg_mean_absolute_error')
            print(f"Cross-Validation MAE (Sa): {-cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

            # Residuals: actual minus predicted
            residuals = y_sa_test - y_sa_pred

            # Residual plot
            plt.figure(figsize=(10, 6))
            plt.scatter(y_sa_pred, residuals, alpha=0.5)
            plt.hlines(y=0, xmin=y_sa_pred.min(), xmax=y_sa_pred.max(), color='r', linestyles='dashed')
            plt.xlabel('Predicted Surface Roughness (Sa)')
            plt.ylabel('Residuals')
            plt.title('Residual Plot for Surface Roughness (Sa)')
            plt.show()

            # Distribution of Residuals
            plt.figure(figsize=(10, 6))
            sns.histplot(residuals, kde=True)
            plt.title('Distribution of Residuals for Surface Roughness (Sa)')
            plt.xlabel('Residuals')
            plt.ylabel('Frequency')
            plt.show()

            # Evaluate Cell Viability (CV) on the rows with an in-range Sa prediction
            if has_valid:
                y_cv_true_valid = y_cv_test[valid_indices]
                y_cv_pred_valid = y_cv_pred[valid_indices]
                rmse_cv, mae_cv, r2_cv, mape_cv, medae_cv = self.eval_metrics(y_cv_true_valid, y_cv_pred_valid)

                print(f"Cell Viability (CV) - Test MAE: {mae_cv:.4f}")
                print(f"Cell Viability (CV) - Test RMSE: {rmse_cv:.4f}")
                print(f"Cell Viability (CV) - Test R2: {r2_cv:.4f}")
                print(f"Cell Viability (CV) - Test MAPE: {mape_cv:.4f}")
                print(f"Cell Viability (CV) - Test MedAE: {medae_cv:.4f}")

                valid_colors = np.where(y_cv_pred_valid > 90, 'green', 'red')
                cv_low, cv_high = y_cv_true_valid.min(), y_cv_true_valid.max()

                plt.figure(figsize=(10, 6))
                plt.scatter(y_cv_true_valid, y_cv_pred_valid, alpha=0.5, c=valid_colors)
                # Identity line: points on it are perfect predictions.
                plt.plot([cv_low, cv_high], [cv_low, cv_high], color='r')
                plt.xlabel('Actual Cell Viability')
                plt.ylabel('Predicted Cell Viability')
                plt.title('Actual vs Predicted Cell Viability (Valid Predictions Only)')
                plt.show()

            sa_low, sa_high = y_sa_test.min(), y_sa_test.max()
            plt.figure(figsize=(10, 6))
            plt.scatter(y_sa_test, y_sa_pred, alpha=0.5)
            plt.plot([sa_low, sa_high], [sa_low, sa_high], color='r')
            plt.xlabel('Actual Surface Roughness (Sa)')
            plt.ylabel('Predicted Surface Roughness (Sa)')
            plt.title('Actual vs Predicted Surface Roughness (Sa)')
            plt.show()

            # Learning Curves
            train_sizes, train_scores, test_scores = learning_curve(
                self.model_sa, X_poly_test, y_sa_test, cv=5,
                scoring='neg_mean_absolute_error', n_jobs=-1,
                train_sizes=np.linspace(0.1, 1.0, 10),
            )
            train_scores_mean = -train_scores.mean(axis=1)
            test_scores_mean = -test_scores.mean(axis=1)

            plt.figure(figsize=(10, 6))
            plt.plot(train_sizes, train_scores_mean, label='Training Error')
            plt.plot(train_sizes, test_scores_mean, label='Validation Error')
            plt.title('Learning Curve')
            plt.xlabel('Training Set Size')
            plt.ylabel('Mean Absolute Error')
            plt.legend()
            plt.show()
        except Exception:
            logger.exception("Error during model evaluation")
            raise

    def make_predictions(self):
        """Predict Sa and cell viability for the loaded data and plot them.

        Requires ``load_data`` and ``load_model`` to have been called. Surface
        roughness (Sa) is predicted for every row; cell viability is predicted
        only where the predicted Sa lies strictly between 1.5 and 2.5 and is
        left at 0 elsewhere. A row is labelled ``'green'`` when its predicted
        viability exceeds 90 and ``'red'`` otherwise.

        Prints the result table and shows: Sa vs. viability scatter, the two
        prediction histograms, the green/red counts, and the residuals of the
        Sa predictions against the Sa column of the loaded data.

        Raises:
            Exception: Any error raised along the way; it is logged with its
                traceback and re-raised.
        """
        sa_column = '(Sa) Average of Surface roughness (micrometer)'
        cv_column = 'Cell Viability (%)'
        result_column = 'Result (1=Passed, 0=Failed)'
        try:
            logger.info("Making predictions.")
            X_new = self.test_data.drop(columns=[sa_column, cv_column, result_column])
            X_poly_new = self.poly.transform(X_new)
            y_sa_pred_new = self.model_sa.predict(X_poly_new)
            y_cv_pred_new = np.zeros_like(y_sa_pred_new)
            valid_indices_new = (y_sa_pred_new > 1.5) & (y_sa_pred_new < 2.5)
            if np.any(valid_indices_new):
                y_cv_pred_new[valid_indices_new] = self.model_cv.predict(X_poly_new[valid_indices_new])

            results = pd.DataFrame({
                'Predicted Surface Roughness (Sa)': y_sa_pred_new,
                'Predicted Cell Viability (%)': y_cv_pred_new,
                'Validity': np.where(y_cv_pred_new > 90, 'green', 'red')
            })
            print(results)

            # One scatter per validity class plus a legend. The points are
            # coloured by name rather than by a numeric array, so there is no
            # data for a colorbar to map.
            plt.figure(figsize=(10, 6))
            for colour, label in (('green', 'green (viability > 90%)'),
                                  ('red', 'red (viability <= 90%)')):
                in_class = (results['Validity'] == colour).to_numpy()
                plt.scatter(y_sa_pred_new[in_class], y_cv_pred_new[in_class],
                            c=colour, alpha=0.5, label=label)
            plt.axhline(90, color='r', linestyle='dashed', linewidth=1)
            plt.xlabel('Predicted Surface Roughness (Sa)')
            plt.ylabel('Predicted Cell Viability (%)')
            plt.title('Predicted Surface Roughness vs Predicted Cell Viability')
            plt.legend(title='Validity')
            plt.show()

            plt.figure(figsize=(10, 6))
            plt.hist(y_sa_pred_new, bins=20, alpha=0.7, label='Surface Roughness (Sa)')
            # Dashed lines mark the Sa window used to decide validity.
            plt.axvline(1.5, color='r', linestyle='dashed', linewidth=1)
            plt.axvline(2.5, color='r', linestyle='dashed', linewidth=1)
            plt.title('Distribution of Predicted Surface Roughness (Sa)')
            plt.xlabel('Surface Roughness (Sa)')
            plt.ylabel('Frequency')
            plt.legend()
            plt.show()

            plt.figure(figsize=(10, 6))
            plt.hist(y_cv_pred_new, bins=20, alpha=0.7, label='Cell Viability (%)', color='orange')
            plt.axvline(90, color='r', linestyle='dashed', linewidth=1)
            plt.title('Distribution of Predicted Cell Viability (%)')
            plt.xlabel('Cell Viability (%)')
            plt.ylabel('Frequency')
            plt.legend()
            plt.show()

            validity_counts = results['Validity'].value_counts()
            plt.figure(figsize=(8, 5))
            # value_counts() orders by frequency, so a fixed ['red', 'green']
            # list could paint the bars with swapped colours. The labels are
            # colour names themselves, so use them directly.
            plt.bar(validity_counts.index, validity_counts.values, color=list(validity_counts.index))
            plt.xlabel('Prediction Validity')
            plt.ylabel('Count')
            plt.title('Count of Valid vs Invalid Predictions')
            plt.show()

            # Residuals are actual minus predicted. The previous expression
            # subtracted the predictions from themselves and was always zero.
            residuals = self.test_data[sa_column] - y_sa_pred_new

            plt.figure(figsize=(10, 6))
            plt.scatter(y_sa_pred_new, residuals, alpha=0.5)
            plt.hlines(y=0, xmin=y_sa_pred_new.min(), xmax=y_sa_pred_new.max(), color='r', linestyles='dashed')
            plt.xlabel('Predicted Surface Roughness (Sa)')
            plt.ylabel('Residuals')
            plt.title('Residual Plot for Predictions')
            plt.show()

            plt.figure(figsize=(10, 6))
            sns.histplot(residuals, kde=True, color='blue')
            plt.title('Distribution of Residuals for Predictions')
            plt.xlabel('Residuals')
            plt.ylabel('Frequency')
            plt.show()
        except Exception:
            logger.exception("Error during predictions")
            raise

# Pipeline execution: build the configuration, then run every evaluation
# step in order.
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluator = ModelEvaluation(config=model_evaluation_config)
    # load_data() and load_model() must run first: they fill the attributes
    # that the exploration, evaluation and prediction steps read.
    model_evaluator.load_data()
    model_evaluator.load_model()
    model_evaluator.data_exploration()
    model_evaluator.evaluate_model()
    model_evaluator.make_predictions()
    # save_results() reads the test CSV and its model from disk by itself.
    model_evaluator.save_results()
except Exception as e:
    # Each step already logs its own failure, so this produces a second log
    # record for the same exception before it is re-raised.
    logger.exception(e)
    raise e


[2024-07-20 00:44:49,774: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-20 00:44:49,821: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-20 00:44:49,834: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-20 00:44:49,837: INFO: common: created directory at: artifacts]
[2024-07-20 00:44:49,840: INFO: 2977693376: Config: {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/farshidhesami/Branching-tutorial/raw/master/Sandblasting-Condition.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'unzip_data_dir': 'artifacts/data_ingestion/Sandblasting-Condition.csv', 'STATUS_FILE': 'artifacts/data_validation/status.txt'}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/Sandblasting-Condition

FileNotFoundError: No such file: 'artifacts\model_trainer\models\best_model_sa.joblib'

: 