# In [None]:
import logging
from collections import Counter
from typing import Any, Dict, List, Optional

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# ----------------------------
# Step 1: Configure Logging
# ----------------------------
def configure_logging(debug: bool = False) -> logging.Logger:
    """
    Configure logging settings and return the shared pipeline logger.

    The root logger is given a file handler and a stream handler the first
    time this function runs. Later calls leave the root handlers alone and
    only adjust the level of the returned logger, so the ``debug`` flag is
    honoured on every call rather than only on the first one.

    Args:
        debug (bool): Flag to enable detailed debugging.

    Returns:
        logging.Logger: Configured logger instance named 'PreprocessingPipeline'.
    """
    log_level = logging.DEBUG if debug else logging.INFO

    # basicConfig() does nothing once the root logger has handlers, so only
    # build the handlers when they will actually be installed. Otherwise each
    # call would open another handle on the log file and then discard it.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            level=log_level,
            format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
            handlers=[
                # Explicit UTF-8: log messages in this file contain non-ASCII
                # characters that a platform-default encoding may reject.
                logging.FileHandler("preprocessing_pipeline.log", encoding="utf-8"),
                logging.StreamHandler()
            ]
        )

    logger = logging.getLogger('PreprocessingPipeline')
    # Set the level on the named logger so a repeated call can switch
    # between INFO and DEBUG even though the root configuration is fixed.
    logger.setLevel(log_level)
    return logger

# ----------------------------
# Step 2: Define the PreprocessingPipeline Class
# ----------------------------
class PreprocessingPipeline:
    """
    Preprocess a tabular dataset for classification and rebalance its training split.

    The class sorts columns into numerical, ordinal and nominal groups, fits a
    ColumnTransformer on the training split (impute + scale, impute + ordinal
    encode, impute + one-hot encode), applies SMOTENC to the transformed
    training data, and can map transformed rows back to readable values.
    """

    def __init__(
        self,
        config_path: Optional[str] = None,
        target_variable: Optional[str] = None,
        numerical_features: Optional[List[str]] = None,
        ordinal_features: Optional[List[str]] = None,
        nominal_features: Optional[List[str]] = None,
        debug: bool = False
    ):
        """
        Initialize the PreprocessingPipeline with optional debugging and configuration.

        Values found in the configuration file override the corresponding
        arguments; keys that are absent or null in the file leave the
        argument values in place.

        Args:
            config_path (str, optional): Path to the YAML configuration file.
            target_variable (str, optional): Name of the target variable.
            numerical_features (List[str], optional): List of numerical feature names.
            ordinal_features (List[str], optional): List of ordinal feature names.
            nominal_features (List[str], optional): List of nominal feature names.
            debug (bool): Flag to enable detailed debugging.

        Raises:
            ValueError: If no target variable is available from either the
                arguments or the configuration file, or if the configuration
                file does not contain a mapping.
            Exception: Any error raised while opening or parsing the
                configuration file is logged and re-raised.
        """
        self.debug = debug
        self.logger = configure_logging(debug=self.debug)
        self.pipeline = None
        self.categorical_indices = []
        self.target_variable = target_variable
        # Copy the lists so later reassignment never aliases the caller's objects.
        self.numerical_features = list(numerical_features or [])
        self.ordinal_features = list(ordinal_features or [])
        self.nominal_features = list(nominal_features or [])

        if config_path:
            try:
                self._apply_config(config_path)
                self.logger = configure_logging(debug=self.debug)
                self.logger.debug(f"Pipeline initialized with configuration from {config_path}")
            except Exception as e:
                self.logger.error(f"Failed to load configuration file: {e}")
                raise
            if not self.target_variable:
                # The file may legitimately omit the target; fail here with a
                # clear message instead of later inside DataFrame.drop().
                self.logger.error("No target variable provided by arguments or configuration file.")
                raise ValueError("Target variable must be provided either as an argument or in the configuration file.")
        else:
            if not self.target_variable:
                self.logger.error("No target variable provided and no configuration file specified.")
                raise ValueError("Target variable must be provided if no configuration file is used.")
            if not (self.numerical_features or self.ordinal_features or self.nominal_features):
                self.logger.warning("No feature lists provided. The pipeline will attempt to categorize features automatically.")
            self.logger.debug("Pipeline initialized without configuration file.")

    def _apply_config(self, config_path: str) -> None:
        """Load the YAML file at ``config_path`` and copy its non-null settings onto this instance."""
        with open(config_path, 'r') as file:
            # safe_load returns None for an empty document.
            config = yaml.safe_load(file) or {}
        if not isinstance(config, dict):
            raise ValueError("Configuration file must contain a mapping of settings.")
        for key in ('target_variable', 'numerical_features', 'ordinal_features', 'nominal_features', 'debug'):
            value = config.get(key)
            if value is not None:
                setattr(self, key, value)

    def categorize_features(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Categorize features into numerical, ordinal, and nominal based on provided lists or data types.

        Explicitly provided lists are used as given. When no numerical list is
        provided, numeric-dtype columns are selected, excluding the target and
        any column already listed as ordinal or nominal so that no column is
        assigned to two groups. When no nominal list is provided, every
        remaining column becomes nominal. The result is also stored on the
        instance's ``numerical_features``, ``ordinal_features`` and
        ``nominal_features`` attributes.

        Args:
            df (pd.DataFrame): The input DataFrame.

        Returns:
            Dict[str, List[str]]: Dictionary with keys 'numerical', 'ordinal', 'nominal' and lists of feature names.
        """
        self.logger.debug("Starting feature categorization.")

        ordinal = list(self.ordinal_features or [])
        provided_nominal = list(self.nominal_features or [])

        # Use provided numerical_features, or categorize automatically
        if self.numerical_features:
            numerical = list(self.numerical_features)
            self.logger.debug(f"Using provided numerical features: {numerical}")
        else:
            reserved = set(ordinal) | set(provided_nominal) | {self.target_variable}
            numerical = [
                col for col in df.columns
                if col not in reserved and pd.api.types.is_numeric_dtype(df[col])
            ]
            self.logger.debug(f"Automatically categorized numerical features: {numerical}")

        # Use provided ordinal_features
        if ordinal:
            self.logger.debug(f"Using provided ordinal features: {ordinal}")

        # Use provided nominal_features, or categorize automatically
        if provided_nominal:
            nominal = provided_nominal
            self.logger.debug(f"Using provided nominal features: {nominal}")
        else:
            # Nominal features are those not in numerical or ordinal
            assigned = set(numerical) | set(ordinal) | {self.target_variable}
            nominal = [col for col in df.columns if col not in assigned]
            self.logger.debug(f"Automatically categorized nominal features: {nominal}")

        feature_types = {'numerical': numerical, 'ordinal': ordinal, 'nominal': nominal}
        self.numerical_features = numerical
        self.ordinal_features = ordinal
        self.nominal_features = nominal

        # Log the categorized features
        if self.debug:
            self.logger.debug(f"Categorized Numerical Features: {self.numerical_features}")
            self.logger.debug(f"Categorized Ordinal Features: {self.ordinal_features}")
            self.logger.debug(f"Categorized Nominal Features: {self.nominal_features}")
        else:
            self.logger.info(f"Features categorized: Numerical={len(self.numerical_features)}, "
                             f"Ordinal={len(self.ordinal_features)}, Nominal={len(self.nominal_features)}.")

        return feature_types

    def _iter_output_slices(self, preprocessor):
        """
        Yield ``(name, transformer, features, start, end)`` for each fitted transformer.

        ``start``/``end`` delimit the transformer's columns in the stacked
        output. Transformers with an empty column list are skipped: the
        ColumnTransformer keeps them unfitted and they contribute no output
        columns. The remainder entry and string placeholders are skipped too.
        """
        start = 0
        for name, transformer, features in preprocessor.transformers_:
            if name == 'remainder' or isinstance(transformer, str) or len(features) == 0:
                continue
            if name in ('num', 'ord'):
                width = len(features)
            elif name.startswith('onehot_enc_'):
                width = len(transformer.named_steps['onehot_encoder'].categories_[0])
            else:
                self.logger.warning(f"Unknown transformer '{name}'. Skipping index calculation.")
                continue
            yield name, transformer, features, start, start + width
            start += width

    def create_preprocessing_pipeline(self, X_train: pd.DataFrame) -> "ColumnTransformer":
        """
        Create a preprocessing pipeline compatible with SMOTENC.

        Builds and fits a ColumnTransformer on ``X_train`` and records in
        ``self.categorical_indices`` the output column positions produced by
        the one-hot encoders.

        Args:
            X_train (pd.DataFrame): Training feature set for fitting transformers.

        Returns:
            ColumnTransformer: A fitted ColumnTransformer object with numerical, ordinal, and nominal transformers.
        """
        self.logger.debug("Creating preprocessing pipeline with numerical, ordinal, and nominal transformers.")

        # Numerical Transformer
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])
        self.logger.debug("Numerical transformer created.")

        # Ordinal Categorical Transformer
        # NOTE(review): OrdinalEncoder() is used with its default category
        # ordering; confirm that ordering matches the intended rank of each
        # ordinal feature.
        ordinal_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal_encoder', OrdinalEncoder())
        ])
        self.logger.debug("Ordinal transformer created.")

        # Nominal Categorical Transformers: one OneHotEncoder per feature so
        # each feature's block of output columns can be located by name.
        nominal_transformers = []
        for feature in self.nominal_features:
            transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))
            ])
            nominal_transformers.append((f'onehot_enc_{feature}', transformer, [feature]))
            self.logger.debug(f"Nominal transformer with OneHotEncoder for '{feature}' created.")

        # Combine Transformers using ColumnTransformer.
        # sparse_threshold=0 forces a dense array: the inverse transformation
        # slices columns and feeds them to StandardScaler.inverse_transform,
        # which cannot un-centre a sparse matrix.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('ord', ordinal_transformer, self.ordinal_features),
                *nominal_transformers  # Unpack nominal transformers
            ],
            remainder='drop',
            sparse_threshold=0
        )

        self.logger.debug("ColumnTransformer created with numerical, ordinal, and nominal transformers.")

        # Fit the preprocessor to determine feature indices
        preprocessor.fit(X_train)
        self.logger.info("✅ Preprocessor fitted on training data.")

        feature_names = preprocessor.get_feature_names_out()
        self.logger.debug(f"Feature names after preprocessing: {feature_names}")

        # Identify indices of all categorical features (from OneHotEncoder).
        # NOTE(review): every one-hot column is reported to SMOTENC as its own
        # categorical feature, while ordinal codes are left out of the list;
        # confirm this matches the intended resampling semantics.
        categorical_indices = []
        for name, _, features, start, end in self._iter_output_slices(preprocessor):
            if name.startswith('onehot_enc_'):
                block = list(range(start, end))
                categorical_indices.extend(block)
                self.logger.debug(f"Feature '{name}' has {end - start} categories; indices {block}.")
            else:
                self.logger.debug(f"Feature '{name}' has {len(features)} features; advancing start index by {len(features)}.")

        self.logger.debug(f"Categorical feature indices for SMOTENC: {categorical_indices}")
        self.categorical_indices = categorical_indices

        return preprocessor

    def fit_transform(self, df: pd.DataFrame, test_size: float = 0.25, random_state: int = 42) -> Dict[str, Any]:
        """
        Fit the preprocessing pipeline, apply SMOTENC, and transform the data.

        The data is split with stratification on the target, the preprocessor
        is fitted on the training split only, SMOTENC resamples the
        transformed training split, and the test split is transformed and
        also inverse-transformed for inspection.

        Args:
            df (pd.DataFrame): The entire dataset including features and target.
            test_size (float): Fraction of rows held out for testing.
            random_state (int): Seed used for the split and for SMOTENC.

        Returns:
            Dict[str, Any]: Dictionary with keys 'X_train_preprocessed' (resampled),
            'X_test_preprocessed', 'y_train' (resampled), 'y_test' and 'X_test_inverse'.
        """
        self.logger.debug("Starting fit_transform on the entire dataset.")
        # Categorize features (stores the result on the instance)
        self.categorize_features(df)

        # Split into features and target
        X = df.drop(self.target_variable, axis=1)
        y = df[self.target_variable]

        self.logger.debug("Split data into features and target.")

        # Split into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=random_state,
            stratify=y  # Ensure proportional representation of classes
        )
        self.logger.info("✅ Data split into training and testing sets successfully.")

        # Log class distribution before SMOTENC
        class_distribution_before = Counter(y_train)
        if self.debug:
            self.logger.debug(f"Class distribution before SMOTENC: {class_distribution_before}")
        else:
            self.logger.info(f"Class distribution before SMOTENC: {class_distribution_before}")

        # Fit and transform the training data
        self.pipeline = self.create_preprocessing_pipeline(X_train)
        X_train_preprocessed = self.pipeline.transform(X_train)
        self.logger.info("✅ Training data preprocessed.")

        # Initialize SMOTENC
        smotenc = SMOTENC(
            categorical_features=self.categorical_indices,
            sampling_strategy='auto',
            random_state=random_state,
            k_neighbors=5
        )
        self.logger.debug("SMOTENC initialized.")

        # Apply SMOTENC to the preprocessed training data
        X_train_resampled, y_train_resampled = smotenc.fit_resample(X_train_preprocessed, y_train)
        self.logger.info("✅ SMOTENC applied to training data.")

        # Log class distribution after SMOTENC
        class_distribution_after = Counter(y_train_resampled)
        if self.debug:
            self.logger.debug(f"Class distribution after SMOTENC: {class_distribution_after}")
        else:
            self.logger.info(f"Class distribution after SMOTENC: {class_distribution_after}")

        # Transform the test data (never resampled)
        X_test_preprocessed = self.pipeline.transform(X_test)
        self.logger.info("✅ Test data preprocessed.")

        # Inverse transform the test set for interpretability
        X_test_inverse = self.inverse_transform_data(
            X_transformed=X_test_preprocessed
        )
        self.logger.info("✅ Inverse transformation applied to test data.")

        # Preserve the original indexing
        X_test_inverse.index = X_test.index
        self.logger.debug("Index preserved for inverse-transformed test data.")

        return {
            'X_train_preprocessed': X_train_resampled,
            'X_test_preprocessed': X_test_preprocessed,
            'y_train': y_train_resampled,
            'y_test': y_test,
            'X_test_inverse': X_test_inverse
        }

    def transform_new_data(self, df_new: pd.DataFrame) -> np.ndarray:
        """
        Transform new data for prediction.

        Thin wrapper around ``transform`` that adds start/finish log messages.

        Args:
            df_new (pd.DataFrame): New data for prediction.

        Returns:
            np.ndarray: Preprocessed data ready for prediction.
        """
        self.logger.debug("Starting transformation of new data for prediction.")

        # Transform the new data
        X_new_preprocessed = self.transform(df_new)
        self.logger.info("✅ New data transformed successfully.")

        return X_new_preprocessed

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """
        Transform new data using the fitted preprocessing pipeline.

        Args:
            X (pd.DataFrame): New data to transform.

        Returns:
            np.ndarray: Preprocessed data.

        Raises:
            AttributeError: If the pipeline has not been fitted yet.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Call fit_transform first.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Call fit_transform first.")
        self.logger.debug("Transforming new data.")
        X_preprocessed = self.pipeline.transform(X)
        if self.debug:
            self.logger.debug(f"Transformed data shape: {X_preprocessed.shape}")
        else:
            self.logger.info("Data transformed.")
        return X_preprocessed

    def inverse_transform_data(self, X_transformed: np.ndarray) -> pd.DataFrame:
        """
        Perform inverse transformation on the transformed data to reconstruct original feature values.

        Each transformer's block of columns is passed back through its scaler
        or encoder. The imputation step is not undone. Columns in the result
        follow transformer order (numerical, ordinal, then nominal).

        Args:
            X_transformed (np.ndarray): The transformed feature data.

        Returns:
            pd.DataFrame: The inverse-transformed DataFrame.

        Raises:
            AttributeError: If the pipeline has not been fitted yet.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")

        preprocessor = self.pipeline
        logger = logging.getLogger('InverseTransform')
        if self.debug:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

        logger.debug("Starting inverse transformation.")

        # Accept sparse input as well; the column slicing below needs a dense array.
        if hasattr(X_transformed, 'toarray'):
            X_transformed = X_transformed.toarray()

        # Holds one reconstructed column per original feature name
        inverse_data = {}

        for name, transformer, features, start, end in self._iter_output_slices(preprocessor):
            block = X_transformed[:, start:end]
            if name == 'num':
                restored = transformer.named_steps['scaler'].inverse_transform(block)
                inverse_data.update({feature: restored[:, idx] for idx, feature in enumerate(features)})
                logger.debug(f"Numerical features {features} inverse transformed.")
            elif name == 'ord':
                restored = transformer.named_steps['ordinal_encoder'].inverse_transform(block)
                inverse_data.update({feature: restored[:, idx] for idx, feature in enumerate(features)})
                logger.debug(f"Ordinal features {features} inverse transformed.")
            else:
                # One-hot block: several indicator columns collapse back to a single column.
                restored = transformer.named_steps['onehot_encoder'].inverse_transform(block)
                inverse_data.update({feature: restored[:, 0] for feature in features})
                logger.debug(f"Nominal features {features} inverse transformed.")

        # Create the inverse-transformed DataFrame
        inverse_df = pd.DataFrame(inverse_data)

        logger.debug("Inverse-transformed DataFrame constructed.")

        logger.info("✅ Inverse transformation completed successfully.")

        return inverse_df

# ----------------------------
# Step 3: Define the TrainingModule Class
# ----------------------------
class TrainingModule:
    """
    Train, evaluate, persist and apply one of a fixed set of scikit-learn classifiers.

    Supported ``model_type`` values are 'LogisticRegression',
    'RandomForestClassifier' and 'GradientBoostingClassifier'.
    """

    def __init__(self, model_type: str = 'LogisticRegression', model_params: Optional[Dict] = None, debug: bool = False):
        """
        Initialize the TrainingModule with the specified model.

        Args:
            model_type (str): Type of the classifier ('LogisticRegression', 'RandomForestClassifier', etc.).
            model_params (dict, optional): Parameters for the classifier.
            debug (bool): Flag to enable detailed debugging.

        Raises:
            ValueError: If ``model_type`` is not one of the supported names.
        """
        self.debug = debug
        self.logger = configure_logging(debug=self.debug)
        self.model_type = model_type
        self.model_params = model_params or {}
        self.model = self._initialize_model()

    def _initialize_model(self):
        """
        Initialize the classifier based on the specified type and parameters.

        Returns:
            classifier: An instance of the specified classifier.

        Raises:
            ValueError: If ``self.model_type`` is not supported.
            Exception: Any error raised by the classifier constructor is logged and re-raised.
        """
        supported = ('LogisticRegression', 'RandomForestClassifier', 'GradientBoostingClassifier')
        if self.model_type not in supported:
            # Checked before the try block so the failure is logged only once.
            self.logger.error(f"Unsupported model type: {self.model_type}")
            raise ValueError(f"Unsupported model type: {self.model_type}")

        try:
            model_classes = {
                'LogisticRegression': LogisticRegression,
                'RandomForestClassifier': RandomForestClassifier,
                'GradientBoostingClassifier': GradientBoostingClassifier,
            }
            model = model_classes[self.model_type](**self.model_params)
        except Exception as e:
            self.logger.error(f"Error initializing model: {e}")
            raise
        self.logger.debug(f"Initialized {self.model_type} with parameters: {self.model_params}")
        return model

    def train(self, X_train: np.ndarray, y_train: pd.Series, X_val: Optional[np.ndarray] = None, y_val: Optional[pd.Series] = None):
        """
        Train the classifier on the training data.

        When both ``X_val`` and ``y_val`` are supplied, the fitted model's
        ``score`` on them is logged. The validation data is never used for fitting.

        Args:
            X_train (np.ndarray): Preprocessed training features.
            y_train (pd.Series): Training target variable.
            X_val (np.ndarray, optional): Preprocessed validation features.
            y_val (pd.Series, optional): Validation target variable.

        Returns:
            self
        """
        self.logger.debug("Starting model training.")
        self.model.fit(X_train, y_train)
        self.logger.info("✅ Model trained successfully.")
        if X_val is not None and y_val is not None:
            validation_score = self.model.score(X_val, y_val)
            self.logger.info(f"Validation score: {validation_score:.2f}")
        return self

    def _roc_auc(self, X_test: np.ndarray, y_test: pd.Series, is_binary: bool) -> float:
        """Return the ROC-AUC of the trained model on the test data, or NaN when it cannot be computed."""
        try:
            if hasattr(self.model, 'predict_proba'):
                scores = self.model.predict_proba(X_test)
                if is_binary and getattr(scores, 'ndim', 1) == 2:
                    # Keep only the last column of the probability matrix for the binary score.
                    scores = scores[:, -1]
            elif hasattr(self.model, 'decision_function'):
                scores = self.model.decision_function(X_test)
            else:
                self.logger.warning("Model exposes neither predict_proba nor decision_function; ROC-AUC not computed.")
                return float('nan')
            if is_binary:
                return roc_auc_score(y_test, scores)
            return roc_auc_score(y_test, scores, multi_class='ovr', average='weighted')
        except ValueError as e:
            # Best effort: report NaN rather than aborting the whole evaluation.
            self.logger.warning(f"ROC-AUC could not be computed: {e}")
            return float('nan')

    def evaluate(self, X_test: np.ndarray, y_test: pd.Series, metrics_path: Optional[str] = 'performance_metrics.csv') -> Dict[str, float]:
        """
        Evaluate the trained model on the test data.

        All metrics are computed from the trained model's own predictions on
        ``X_test``; the model is not re-fitted. Binary problems use the
        default binary averaging; problems with more than two classes use
        weighted averaging and one-vs-rest ROC-AUC. A confusion matrix is
        plotted and shown.

        Args:
            X_test (np.ndarray): Preprocessed test features.
            y_test (pd.Series): Test target variable.
            metrics_path (str, optional): CSV file the metrics are written to;
                pass None to skip writing.

        Returns:
            dict: Dictionary with keys 'accuracy', 'precision', 'recall', 'f1_score' and 'roc_auc'.
        """
        self.logger.debug("Starting model evaluation.")
        y_pred = self.model.predict(X_test)

        classes = getattr(self.model, 'classes_', None)
        if classes is None:
            classes = np.unique(np.asarray(y_test))
        is_binary = len(classes) <= 2
        average = 'binary' if is_binary else 'weighted'

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=average, zero_division=0)
        recall = recall_score(y_test, y_pred, average=average, zero_division=0)
        f1 = f1_score(y_test, y_pred, average=average, zero_division=0)
        roc_auc = self._roc_auc(X_test, y_test, is_binary)

        # Log performance metrics
        self.logger.info(f"✅ Model Accuracy on Test Set: {accuracy:.2f}")
        self.logger.info(f"✅ Model Precision: {precision:.2f}")
        self.logger.info(f"✅ Model Recall: {recall:.2f}")
        self.logger.info(f"✅ Model F1-Score: {f1:.2f}")
        self.logger.info(f"✅ Model ROC-AUC: {roc_auc:.2f}")

        # Print classification report
        self.logger.debug("Classification Report:")
        self.logger.debug(f"\n{classification_report(y_test, y_pred, zero_division=0)}")

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()
        plt.title("Confusion Matrix")
        plt.show()

        # Store performance metrics
        performance_metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        }

        # Optionally, save performance metrics to a file for monitoring
        if metrics_path:
            performance_df = pd.DataFrame([performance_metrics])
            performance_df.to_csv(metrics_path, index=False)
            self.logger.info(f"✅ Performance metrics saved to '{metrics_path}'.")

        return performance_metrics

    def save_model(self, filepath: str):
        """
        Save the trained model to disk.

        Args:
            filepath (str): Path to save the model.
        """
        joblib.dump(self.model, filepath)
        self.logger.info(f"✅ Model saved to '{filepath}'.")

    def load_model(self, filepath: str):
        """
        Load a trained model from disk and make it the active model.

        Args:
            filepath (str): Path to load the model from.
        """
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from a trusted source.
        self.model = joblib.load(filepath)
        self.logger.info(f"✅ Model loaded from '{filepath}'.")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Make predictions using the trained model.

        Args:
            X (np.ndarray): Preprocessed feature data.

        Returns:
            np.ndarray: Predicted labels.
        """
        self.logger.debug("Making predictions.")
        predictions = self.model.predict(X)
        self.logger.info("✅ Predictions made successfully.")
        return predictions




from sklearn.model_selection import train_test_split

# ----------------------------
# Step 4: Define the Main Execution Logic
# ----------------------------
def main():
    """
    Demonstrate preprocessing, training, evaluation and prediction end to end.

    Generates a synthetic dataset, runs it through PreprocessingPipeline and
    TrainingModule, and writes these files to the working directory:
    'X_test_inverse.csv', 'trained_model.joblib' and 'X_inverse_prediction.csv'
    (plus whatever the pipeline and training module write themselves).
    """
    # Local import: only this demo entry point needs it.
    from pathlib import Path

    # ----------------------------
    # Configuration and Initialization
    # ----------------------------
    # The configuration file is optional: every setting needed for the demo
    # is also passed explicitly below, so fall back to None when it is absent
    # instead of failing on open().
    config_file = Path('config.yaml')
    config_path = str(config_file) if config_file.is_file() else None

    # Initialize the PreprocessingPipeline
    pipeline = PreprocessingPipeline(
        config_path=config_path,
        target_variable='salary',  # Specify the target variable
        ordinal_features=['education_level', 'experience'],  # Specify ordinal features if known
        debug=True  # Set to False for concise logs
    )

    # ----------------------------
    # Data Ingestion
    # ----------------------------
    # Build a synthetic dataset; the seed keeps the demo reproducible.
    n_rows = 300
    np.random.seed(42)
    synthetic_data = {
        'age': np.random.randint(20, 60, n_rows),
        'salary': np.random.choice([0, 1], n_rows),
        'education_level': np.random.choice(['Bachelors', 'Masters', 'PhD'], n_rows),
        'experience': np.random.choice(['Junior', 'Mid', 'Senior'], n_rows),
        'gender': np.random.choice(['Male', 'Female'], n_rows),
        'city': np.random.choice(['New York', 'Chicago', 'Los Angeles', 'Houston'], n_rows),
        'department': np.random.choice(['Sales', 'Engineering', 'HR', 'Marketing', 'Finance'], n_rows),
    }
    df = pd.DataFrame(synthetic_data)

    # ----------------------------
    # Data Preprocessing
    # ----------------------------
    # Fit and transform the entire dataset
    preprocessed_data = pipeline.fit_transform(df)

    X_train_preprocessed = preprocessed_data['X_train_preprocessed']
    X_test_preprocessed = preprocessed_data['X_test_preprocessed']
    y_train = preprocessed_data['y_train']
    y_test = preprocessed_data['y_test']
    X_test_inverse = preprocessed_data['X_test_inverse']

    # Save the inverse-transformed test set for interpretability
    X_test_inverse.to_csv('X_test_inverse.csv')
    pipeline.logger.info("✅ Inverse-transformed test set saved as 'X_test_inverse.csv'.")

    # ----------------------------
    # Initialize and Train the Model
    # ----------------------------
    training_module = TrainingModule(
        model_type='LogisticRegression',
        model_params={'max_iter': 1000},
        debug=False  # Concise logs for the training module
    )

    # Train the model
    training_module.train(X_train_preprocessed, y_train)
    pipeline.logger.info("✅ Model training completed.")

    # Evaluate the model
    performance_metrics = training_module.evaluate(X_test_preprocessed, y_test)
    pipeline.logger.info(f"Performance metrics: {performance_metrics}")

    # Save the trained model
    training_module.save_model('trained_model.joblib')

    # ----------------------------
    # Prediction Phase
    # ----------------------------
    # Reload the model through the module's own API to exercise the
    # save/load round trip (optional when the model is already in memory).
    training_module.load_model('trained_model.joblib')
    pipeline.logger.info("✅ Trained model loaded from 'trained_model.joblib'.")

    # Example Prediction Data
    prediction_data = {
        'age': [28, 40],
        'education_level': ['Masters', 'Bachelors'],
        'experience': ['Mid', 'Senior'],
        'gender': ['Female', 'Male'],
        'city': ['Chicago', 'Houston'],
        'department': ['Engineering', 'Sales']
    }

    X_new = pd.DataFrame(prediction_data)

    # Preprocess the new data
    X_new_preprocessed = pipeline.transform(X_new)
    pipeline.logger.info("✅ New data preprocessed.")

    # Make predictions
    y_new_pred = training_module.predict(X_new_preprocessed)
    pipeline.logger.info("✅ Predictions made on new data.")

    # Inverse transform the new data for interpretability
    X_new_inverse = pipeline.inverse_transform_data(
        X_transformed=X_new_preprocessed
    )
    X_new_inverse['predictions'] = y_new_pred
    pipeline.logger.debug("Predictions attached to inverse-transformed DataFrame.")

    # Save the inverse-transformed prediction data
    X_new_inverse.to_csv('X_inverse_prediction.csv', index=False)
    pipeline.logger.info("✅ Inverse-transformed prediction data saved as 'X_inverse_prediction.csv'.")

    # Display the inverse-transformed prediction data
    print("\nInverse Transformed Prediction DataFrame with Predictions:")
    print(X_new_inverse)


if __name__ == "__main__":
    main()

