In [None]:
# data_preprocessor.py

import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Tuple, Any
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC, SMOTEN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot
import joblib  # For saving/loading transformers
from inspect import signature  # For parameter validation in SMOTE

class DataPreprocessor:
    def __init__(
        self,
        model_type: str,
        column_assets: Dict[str, List[str]],
        mode: str,  # 'train', 'predict', 'clustering'
        options: Optional[Dict] = None,
        debug: bool = False,
        normalize_debug: bool = False,
        normalize_graphs_output: bool = False,
        graphs_output_dir: str = './plots',
        transformers_dir: str = './transformers'
    ):
        """
        Initialize the DataPreprocessor with model type, column assets, and user-defined options.

        Args:
            model_type (str): Type of the machine learning model (e.g., 'Logistic Regression').
            column_assets (Dict[str, List[str]]): Dictionary containing lists of columns for different categories.
            mode (str): Operational mode ('train', 'predict', 'clustering').
            options (Optional[Dict]): User-defined options for preprocessing steps.
            debug (bool): General debug flag to control overall verbosity.
            normalize_debug (bool): Flag to display normalization plots.
            normalize_graphs_output (bool): Flag to save normalization plots.
            graphs_output_dir (str): Directory to save plots.
            transformers_dir (str): Directory to save/load transformers.
        """
        self.model_type = model_type
        self.column_assets = column_assets
        self.mode = mode.lower()
        if self.mode not in ['train', 'predict', 'clustering']:
            raise ValueError("Mode must be one of 'train', 'predict', or 'clustering'.")
        self.options = options or {}
        self.debug = debug
        self.normalize_debug = normalize_debug
        self.normalize_graphs_output = normalize_graphs_output
        self.graphs_output_dir = graphs_output_dir
        self.transformers_dir = transformers_dir

        # Define model categories for accurate processing
        self.model_category = self.map_model_type_to_category()

        if self.model_category == 'unknown':
            self.logger = logging.getLogger(self.__class__.__name__)
            self.logger.error(f"Model category for '{self.model_type}' is unknown. Check your configuration.")
            raise ValueError(f"Model category for '{self.model_type}' is unknown. Check your configuration.")

        # Initialize y_variable based on mode and model category
        if self.mode in ['train', 'predict'] and self.model_category in ['classification', 'regression']:
            self.y_variable = column_assets.get('y_variable', [])
            if not self.y_variable:
                if self.mode == 'train':
                    raise ValueError("Target variable 'y_variable' must be specified for supervised models in train mode.")
                # In predict mode, y_variable might not be present
        else:
            # For 'clustering' mode or unsupervised prediction
            self.y_variable = []

        # Fetch feature lists
        self.ordinal_categoricals = column_assets.get('ordinal_categoricals', [])
        self.nominal_categoricals = column_assets.get('nominal_categoricals', [])
        self.numericals = column_assets.get('numericals', [])

        # Initialize other variables
        self.scaler = None
        self.transformer = None
        self.ordinal_encoder = None
        self.nominal_encoder = None
        self.preprocessor = None
        self.smote = None
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        self.preprocessing_steps = []
        self.normality_results = {}
        self.features_to_transform = []
        self.nominal_encoded_feature_names = []
        self.final_feature_order = []

        # Initialize placeholders for clustering-specific transformers
        self.cluster_transformers = {}
        self.cluster_model = None
        self.cluster_labels = None
        self.silhouette_score = None

        # Define default thresholds for SMOTE recommendations
        self.imbalance_threshold = self.options.get('smote_recommendation', {}).get('imbalance_threshold', 0.1)
        self.noise_threshold = self.options.get('smote_recommendation', {}).get('noise_threshold', 0.1)
        self.overlap_threshold = self.options.get('smote_recommendation', {}).get('overlap_threshold', 0.1)
        self.boundary_threshold = self.options.get('smote_recommendation', {}).get('boundary_threshold', 0.1)

        self.pipeline = None  # Initialize pipeline

        # Configure logging
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        if not self.logger.handlers:
            self.logger.addHandler(handler)

    def get_debug_flag(self, flag_name: str) -> bool:
        """
        Retrieve the value of a specific debug flag from the options.
        Args:
            flag_name (str): The name of the debug flag.
        Returns:
            bool: The value of the debug flag.
        """
        return self.options.get(flag_name, False)

    def _log(self, message: str, step: str, level: str = 'info'):
        """
        Internal method to log messages based on the step-specific debug flags.
        
        Args:
            message (str): The message to log.
            step (str): The preprocessing step name.
            level (str): The logging level ('info', 'debug', etc.).
        """
        debug_flag = self.get_debug_flag(f'debug_{step}')
        if debug_flag:
            if level == 'debug':
                self.logger.debug(message)
            elif level == 'info':
                self.logger.info(message)
            elif level == 'warning':
                self.logger.warning(message)
            elif level == 'error':
                self.logger.error(message)

    def map_model_type_to_category(self) -> str:
        """
        Map the model_type string to a predefined category.

        Returns:
            str: The model category ('classification', 'regression', 'clustering', etc.).
        """
        classification_models = [
            'Logistic Regression',
            'Tree Based Classifier',
            'k-NN Classifier',
            'SVM Classifier',
            'Neural Network Classifier'
        ]

        regression_models = [
            'Linear Regression',
            'Tree Based Regressor',
            'k-NN Regressor',
            'SVM Regressor',
            'Neural Network Regressor'
        ]

        clustering_models = [
            'K-Means', 'Hierarchical Clustering', 'DBSCAN', 'KModes', 'KPrototypes'
        ]


        if self.model_type in classification_models:
            return 'classification'
        elif self.model_type in regression_models:
            return 'regression'
        elif self.model_type in clustering_models:
            return 'clustering'
        else:
            return 'unknown'

    def split_dataset(
        self,
        X: pd.DataFrame,
        y: Optional[pd.Series] = None
    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]:
        """
        Split the dataset into training and testing sets while retaining original indices.

        Args:
            X (pd.DataFrame): Features.
            y (Optional[pd.Series]): Target variable.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]: X_train, X_test, y_train, y_test
        """
        step_name = "split_dataset"
        self.logger.info("Step: Split Dataset into Train and Test")

        # Debugging Statements
        self._log(f"Before Split - X shape: {X.shape}", step_name, 'debug')
        if y is not None:
            self._log(f"Before Split - y shape: {y.shape}", step_name, 'debug')
        else:
            self._log("Before Split - y is None", step_name, 'debug')

        # Determine splitting based on mode
        if self.mode == 'train' and self.model_category in ['classification', 'regression']:
            if self.model_category == 'classification':
                stratify = y if self.options.get('split_dataset', {}).get('stratify_for_classification', False) else None
                test_size = self.options.get('split_dataset', {}).get('test_size', 0.2)
                random_state = self.options.get('split_dataset', {}).get('random_state', 42)
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=test_size,
                    stratify=stratify, 
                    random_state=random_state
                )
                self._log("Performed stratified split for classification.", step_name, 'debug')
            elif self.model_category == 'regression':
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=self.options.get('split_dataset', {}).get('test_size', 0.2),
                    random_state=self.options.get('split_dataset', {}).get('random_state', 42)
                )
                self._log("Performed random split for regression.", step_name, 'debug')
        else:
            # For 'predict' and 'clustering' modes or other categories
            X_train = X.copy()
            X_test = None
            y_train = y.copy() if y is not None else None
            y_test = None
            self.logger.info(f"No splitting performed for mode '{self.mode}' or model category '{self.model_category}'.")

        self.preprocessing_steps.append("Split Dataset into Train and Test")

        # Keep Indices Aligned Through Each Step
        if X_test is not None and y_test is not None:
            # Sort both X_test and y_test by index
            X_test = X_test.sort_index()
            y_test = y_test.sort_index()
            self.logger.debug("Sorted X_test and y_test by index for alignment.")

        # Debugging: Log post-split shapes and index alignment
        self._log(f"After Split - X_train shape: {X_train.shape}, X_test shape: {X_test.shape if X_test is not None else 'N/A'}", step_name, 'debug')
        if self.model_category == 'classification' and y_train is not None and y_test is not None:
            self.logger.debug(f"Class distribution in y_train:\n{y_train.value_counts(normalize=True)}")
            self.logger.debug(f"Class distribution in y_test:\n{y_test.value_counts(normalize=True)}")
        elif self.model_category == 'regression' and y_train is not None and y_test is not None:
            self.logger.debug(f"y_train statistics:\n{y_train.describe()}")
            self.logger.debug(f"y_test statistics:\n{y_test.describe()}")

        # Check index alignment
        if y_train is not None and X_train.index.equals(y_train.index):
            self.logger.debug("X_train and y_train indices are aligned.")
        else:
            self.logger.warning("X_train and y_train indices are misaligned.")

        if X_test is not None and y_test is not None and X_test.index.equals(y_test.index):
            self.logger.debug("X_test and y_test indices are aligned.")
        elif X_test is not None and y_test is not None:
            self.logger.warning("X_test and y_test indices are misaligned.")

        return X_train, X_test, y_train, y_test


    def categorize_features(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Categorize features into numerical, ordinal, and nominal based on provided lists or data types.
        
        Args:
            df (pd.DataFrame): The input DataFrame.
        
        Returns:
            Dict[str, List[str]]: Dictionary with keys 'numerical', 'ordinal', 'nominal' and lists of feature names.
        """
        self.logger.debug("Starting feature categorization.")
        feature_types = {'numerical': [], 'ordinal': [], 'nominal': []}

        # Use provided numerical_features, or categorize automatically
        if self.numericals:
            feature_types['numerical'] = self.numericals
            self.logger.debug(f"Using provided numerical features: {self.numericals}")
        else:
            numerical = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col]) and col != self.target_variable]
            feature_types['numerical'] = numerical
            self.logger.debug(f"Automatically categorized numerical features: {numerical}")

        # Use provided ordinal_features
        if self.ordinal_categoricals:
            feature_types['ordinal'] = self.ordinal_categoricals
            self.logger.debug(f"Using provided ordinal features: {self.ordinal_categoricals}")
        else:
            self.logger.warning("No ordinal categoricals provided. Skipping ordinal feature encoding.")

        # Use provided nominal_features, or categorize automatically
        if self.nominal_categoricals:
            feature_types['nominal'] = self.nominal_categoricals
            self.logger.debug(f"Using provided nominal features: {self.nominal_categoricals}")
        else:
            # Nominal features are those not in numerical or ordinal
            nominal = [col for col in df.columns if col not in self.numericals + self.ordinal_categoricals + [self.target_variable]]
            feature_types['nominal'] = nominal
            self.logger.debug(f"Automatically categorized nominal features: {nominal}")

        self.numericals = feature_types['numerical']
        self.ordinal_features = feature_types['ordinal']
        self.nominal_features = feature_types['nominal']

        # Log the categorized features
        if self.debug:
            self.logger.debug(f"Categorized Numerical Features: {self.numericals}")
            self.logger.debug(f"Categorized Ordinal Features: {self.ordinal_features}")
            self.logger.debug(f"Categorized Nominal Features: {self.nominal_features}")
        else:
            self.logger.info(f"Features categorized: Numerical={len(self.numericals)}, "
                            f"Ordinal={len(self.ordinal_features)}, Nominal={len(self.nominal_features)}.")

        self.logger.debug(f"Categorized Numerical Features: {self.numericals}")
        self.logger.debug(f"Categorized Ordinal Features: {self.ordinal_categoricals}")
        self.logger.debug(f"Categorized Nominal Features: {self.nominal_categoricals}")

        # Warn if any category is empty
        if not self.numericals:
            self.logger.warning("No numerical features detected.")
        if not self.ordinal_categoricals:
            self.logger.warning("No ordinal categorical features detected.")
        if not self.nominal_categoricals:
            self.logger.warning("No nominal categorical features detected.")

        return feature_types



    def handle_missing_values(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Handle missing values for numerical and categorical features based on user options.
        """
        step_name = "handle_missing_values"
        self.logger.info("Step: Handle Missing Values")

        # Fetch user-defined imputation options or set defaults
        impute_options = self.options.get('handle_missing_values', {})
        numerical_strategy = impute_options.get('numerical_strategy', {})
        categorical_strategy = impute_options.get('categorical_strategy', {})

        # Numerical Imputation
        numerical_imputer = None
        new_columns = []
        if self.numericals:
            if self.model_category in ['regression', 'classification', 'clustering']:
                default_num_strategy = 'mean'  # For clustering, mean imputation is acceptable
            else:
                default_num_strategy = 'median'
            num_strategy = numerical_strategy.get('strategy', default_num_strategy)
            num_imputer_type = numerical_strategy.get('imputer', 'SimpleImputer')  # Can be 'SimpleImputer', 'KNNImputer', etc.

            self._log(f"Numerical Imputation Strategy: {num_strategy.capitalize()}, Imputer Type: {num_imputer_type}", step_name, 'debug')

            # Initialize numerical imputer based on user option
            if num_imputer_type == 'SimpleImputer':
                numerical_imputer = SimpleImputer(strategy=num_strategy)
            elif num_imputer_type == 'KNNImputer':
                knn_neighbors = numerical_strategy.get('knn_neighbors', 5)
                numerical_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                self.logger.error(f"Numerical imputer type '{num_imputer_type}' is not supported.")
                raise ValueError(f"Numerical imputer type '{num_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[self.numericals] = numerical_imputer.fit_transform(X_train[self.numericals])
            self.numerical_imputer = numerical_imputer  # Assign to self for saving
            self.feature_reasons.update({col: self.feature_reasons.get(col, '') + f'Numerical: {num_strategy.capitalize()} Imputation | ' for col in self.numericals})
            new_columns.extend(self.numericals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[self.numericals] = numerical_imputer.transform(X_test[self.numericals])

        # Categorical Imputation
        categorical_imputer = None
        all_categoricals = self.ordinal_categoricals + self.nominal_categoricals
        if all_categoricals:
            default_cat_strategy = 'most_frequent'
            cat_strategy = categorical_strategy.get('strategy', default_cat_strategy)
            cat_imputer_type = categorical_strategy.get('imputer', 'SimpleImputer')

            self._log(f"Categorical Imputation Strategy: {cat_strategy.capitalize()}, Imputer Type: {cat_imputer_type}", step_name, 'debug')

            # Initialize categorical imputer based on user option
            if cat_imputer_type == 'SimpleImputer':
                categorical_imputer = SimpleImputer(strategy=cat_strategy)
            elif cat_imputer_type == 'ConstantImputer':
                fill_value = categorical_strategy.get('fill_value', 'Missing')
                categorical_imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
            else:
                self.logger.error(f"Categorical imputer type '{cat_imputer_type}' is not supported.")
                raise ValueError(f"Categorical imputer type '{cat_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[all_categoricals] = categorical_imputer.fit_transform(X_train[all_categoricals])
            self.categorical_imputer = categorical_imputer  # Assign to self for saving
            self.feature_reasons.update({
                col: self.feature_reasons.get(col, '') + (f'Categorical: Constant Imputation (Value={categorical_strategy.get("fill_value", "Missing")}) | ' if cat_imputer_type == 'ConstantImputer' else f'Categorical: {cat_strategy.capitalize()} Imputation | ')
                for col in all_categoricals
            })
            new_columns.extend(all_categoricals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[all_categoricals] = categorical_imputer.transform(X_test[all_categoricals])

        self.preprocessing_steps.append("Handle Missing Values")

        # Debugging: Log post-imputation shapes and missing values
        self._log(f"Completed: Handle Missing Values. Dataset shape after imputation: {X_train.shape}", step_name, 'debug')
        self._log(f"Missing values after imputation in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        self._log(f"New columns handled: {new_columns}", step_name, 'debug')

        return X_train, X_test


    def handle_outliers(self, X_train: pd.DataFrame, y_train: Optional[pd.Series] = None) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """
        Handle outliers based on the model's sensitivity and user options.

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series, optional): Training target.

        Returns:
            tuple: X_train without outliers and corresponding y_train.
        """
        step_name = "handle_outliers"
        self.logger.info("Step: Handle Outliers")
        self._log("Starting outlier handling.", step_name, 'debug')

        debug_flag = self.get_debug_flag('debug_handle_outliers')
        initial_shape = X_train.shape[0]
        new_columns = []

        # Fetch user-defined outlier handling options or set defaults
        outlier_options = self.options.get('handle_outliers', {})
        zscore_threshold = outlier_options.get('zscore_threshold', 3)
        iqr_multiplier = outlier_options.get('iqr_multiplier', 1.5)
        winsor_limits = outlier_options.get('winsor_limits', [0.05, 0.05])
        isolation_contamination = outlier_options.get('isolation_contamination', 0.05)

        # Check for target leakage: Ensure y_train is not used in transformations
        if self.mode == 'train' and y_train is not None:
            self._log("y_train is present. Confirming it's not used in outlier handling.", step_name, 'debug')
            # Add any specific checks if transformations accidentally use y_train
            # For example, ensure that no columns derived from y_train are being modified

        for col in self.numericals:
            if self.model_category in ['regression', 'classification']:
                # Z-Score Filtering
                apply_zscore = outlier_options.get('apply_zscore', True)
                if apply_zscore:
                    z_scores = np.abs((X_train[col] - X_train[col].mean()) / X_train[col].std())
                    mask_z = z_scores < zscore_threshold
                    removed_z = (~mask_z).sum()
                    X_train = X_train[mask_z]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with Z-Score Filtering (threshold={zscore_threshold}) | '
                    self._log(f"Removed {removed_z} outliers from '{col}' using Z-Score Filtering", step_name, 'debug')

                # IQR Filtering
                apply_iqr = outlier_options.get('apply_iqr', True)
                if apply_iqr:
                    Q1 = X_train[col].quantile(0.25)
                    Q3 = X_train[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_multiplier * IQR
                    upper_bound = Q3 + iqr_multiplier * IQR
                    mask_iqr = (X_train[col] >= lower_bound) & (X_train[col] <= upper_bound)
                    removed_iqr = (~mask_iqr).sum()
                    X_train = X_train[mask_iqr]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with IQR Filtering (multiplier={iqr_multiplier}) | '
                    self._log(f"Removed {removed_iqr} outliers from '{col}' using IQR Filtering", step_name, 'debug')

            elif self.model_category == 'clustering':
                # For clustering, apply IsolationForest for outlier detection
                contamination = outlier_options.get('isolation_contamination', 0.05)
                iso_forest = IsolationForest(contamination=contamination, random_state=42)
                preds = iso_forest.fit_predict(X_train[[col]])
                mask_iso = preds != -1
                removed_iso = (preds == -1).sum()
                X_train = X_train[mask_iso]
                self.feature_reasons[col] += f'Outliers handled with IsolationForest (contamination={contamination}) | '
                self._log(f"Removed {removed_iso} outliers from '{col}' using IsolationForest", step_name, 'debug')

            else:
                self.logger.warning(f"Model category '{self.model_category}' not recognized for outlier handling.")

        self.preprocessing_steps.append("Handle Outliers")

        # Completion Logging
        self._log(f"Completed: Handle Outliers. Initial samples: {initial_shape}, Final samples: {X_train.shape[0]}", step_name, 'debug')
        self._log(f"Missing values after outlier handling in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        self._log(f"Outlier handling applied on columns: {new_columns}", step_name, 'debug')

        return X_train, y_train
    
    def test_normality(self, X_train: pd.DataFrame) -> Dict[str, Dict]:
        """
        Test normality for numerical features based on normality tests and user options.

        Args:
            X_train (pd.DataFrame): Training features.

        Returns:
            Dict[str, Dict]: Dictionary with normality test results for each numerical feature.
        """
        step_name = "Test for Normality"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_test_normality')
        normality_results = {}

        # Fetch user-defined normality test options or set defaults
        normality_options = self.options.get('test_normality', {})
        p_value_threshold = normality_options.get('p_value_threshold', 0.05)
        skewness_threshold = normality_options.get('skewness_threshold', 1.0)
        additional_tests = normality_options.get('additional_tests', [])  # e.g., ['anderson-darling']

        for col in self.numericals:
            data = X_train[col].dropna()
            skewness = data.skew()
            kurtosis = data.kurtosis()

            # Determine which normality test to use based on sample size and user options
            test_used = 'Shapiro-Wilk'
            p_value = 0.0

            if len(data) <= 5000:
                from scipy.stats import shapiro
                stat, p_val = shapiro(data)
                test_used = 'Shapiro-Wilk'
                p_value = p_val
            else:
                from scipy.stats import anderson
                result = anderson(data)
                test_used = 'Anderson-Darling'
                # Determine p-value based on critical values
                p_value = 0.0  # Default to 0
                for cv, sig in zip(result.critical_values, result.significance_level):
                    if result.statistic < cv:
                        p_value = sig / 100
                        break

            # Apply user-defined or default criteria
            if self.model_category in ['regression', 'classification', 'clustering']:
                # Linear, Logistic Regression, and Clustering: Use p-value and skewness
                needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
            else:
                # Other models: Use skewness, and optionally p-values based on options
                use_p_value = normality_options.get('use_p_value_other_models', False)
                if use_p_value:
                    needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
                else:
                    needs_transform = abs(skewness) > skewness_threshold

            normality_results[col] = {
                'skewness': skewness,
                'kurtosis': kurtosis,
                'p_value': p_value,
                'test_used': test_used,
                'needs_transform': needs_transform
            }

            # Conditional Detailed Logging
            if debug_flag:
                self._log(f"Feature '{col}': p-value={p_value:.4f}, skewness={skewness:.4f}, needs_transform={needs_transform}", step_name, 'debug')

        self.normality_results = normality_results
        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Normality results computed.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Normality results computed.")

        return normality_results


    def encode_categorical_variables(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Encode categorical variables using user-specified encoding strategies.
        """
        step_name = "encode_categorical_variables"
        self.logger.info("Step: Encode Categorical Variables")
        self._log("Starting categorical variable encoding.", step_name, 'debug')

        # Fetch user-defined encoding options or set defaults
        encoding_options = self.options.get('encode_categoricals', {})
        ordinal_encoding = encoding_options.get('ordinal_encoding', 'OrdinalEncoder')  # Options: 'OrdinalEncoder', 'None'
        nominal_encoding = encoding_options.get('nominal_encoding', 'OrdinalEncoder')  # Changed from 'OneHotEncoder' to 'OrdinalEncoder'
        handle_unknown = encoding_options.get('handle_unknown', 'use_encoded_value')  # Adjusted for OrdinalEncoder

        # Determine if SMOTENC is being used
        smote_variant = self.options.get('implement_smote', {}).get('variant', None)
        if smote_variant == 'SMOTENC':
            nominal_encoding = 'OrdinalEncoder'  # Ensure compatibility

        transformers = []
        new_columns = []
        if self.ordinal_categoricals and ordinal_encoding != 'None':
            if ordinal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('ordinal', OrdinalEncoder(), self.ordinal_categoricals)
                )
                self._log(f"Added OrdinalEncoder for features: {self.ordinal_categoricals}", step_name, 'debug')
            else:
                self.logger.error(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
                raise ValueError(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
        if self.nominal_categoricals and nominal_encoding != 'None':
            if nominal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('nominal', OrdinalEncoder(handle_unknown=handle_unknown), self.nominal_categoricals)
                )
                self._log(f"Added OrdinalEncoder for features: {self.nominal_categoricals}", step_name, 'debug')
            elif nominal_encoding == 'FrequencyEncoder':
                # Custom Frequency Encoding
                for col in self.nominal_categoricals:
                    freq = X_train[col].value_counts(normalize=True)
                    X_train[col] = X_train[col].map(freq)
                    if X_test is not None:
                        X_test[col] = X_test[col].map(freq).fillna(0)
                    self.feature_reasons[col] += 'Encoded with Frequency Encoding | '
                    self._log(f"Applied Frequency Encoding to '{col}'.", step_name, 'debug')
            else:
                self.logger.error(f"Nominal encoding method '{nominal_encoding}' is not supported.")
                raise ValueError(f"Nominal encoding method '{nominal_encoding}' is not supported.")

        if not transformers and 'FrequencyEncoder' not in nominal_encoding:
            self.logger.info("No categorical variables to encode.")
            self.preprocessing_steps.append("Encode Categorical Variables")
            self._log(f"Completed: Encode Categorical Variables. No encoding was applied.", step_name, 'debug')
            return X_train, X_test

        if transformers:
            self.preprocessor = ColumnTransformer(
                transformers=transformers,
                remainder='passthrough'  # Keep other columns unchanged
            )

            # Fit and transform training data
            X_train_encoded = self.preprocessor.fit_transform(X_train)
            self._log("Fitted and transformed X_train with ColumnTransformer.", step_name, 'debug')

            # Transform testing data
            if X_test is not None:
                X_test_encoded = self.preprocessor.transform(X_test)
                self._log("Transformed X_test with fitted ColumnTransformer.", step_name, 'debug')
            else:
                X_test_encoded = None

            # Retrieve feature names after encoding
            encoded_feature_names = []
            if self.ordinal_categoricals and ordinal_encoding == 'OrdinalEncoder':
                encoded_feature_names += self.ordinal_categoricals
            if self.nominal_categoricals and nominal_encoding == 'OrdinalEncoder':
                encoded_feature_names += self.nominal_categoricals
            elif self.nominal_categoricals and nominal_encoding == 'FrequencyEncoder':
                encoded_feature_names += self.nominal_categoricals
            passthrough_features = [col for col in X_train.columns if col not in self.ordinal_categoricals + self.nominal_categoricals]
            encoded_feature_names += passthrough_features
            new_columns.extend(encoded_feature_names)

            # Convert numpy arrays back to DataFrames
            X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names, index=X_train.index)
            if X_test_encoded is not None:
                X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names, index=X_test.index)
            else:
                X_test_encoded_df = None

            # Store encoders for inverse transformation
            self.ordinal_encoder = self.preprocessor.named_transformers_['ordinal'] if 'ordinal' in self.preprocessor.named_transformers_ else None
            self.nominal_encoder = self.preprocessor.named_transformers_['nominal'] if 'nominal' in self.preprocessor.named_transformers_ else None

            self.preprocessing_steps.append("Encode Categorical Variables")
            self._log(f"Completed: Encode Categorical Variables. X_train_encoded shape: {X_train_encoded_df.shape}", step_name, 'debug')
            self._log(f"Columns after encoding: {encoded_feature_names}", step_name, 'debug')
            self._log(f"Sample of encoded X_train:\n{X_train_encoded_df.head()}", step_name, 'debug')
            self._log(f"New columns added: {new_columns}", step_name, 'debug')

            return X_train_encoded_df, X_test_encoded_df

    def generate_recommendations(self) -> pd.DataFrame:
        """
        Generate a table of preprocessing recommendations based on the model type, data, and user options.

        Returns:
            pd.DataFrame: DataFrame containing recommendations for each feature.
        """
        step_name = "Generate Preprocessor Recommendations"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_generate_recommendations')

        # Generate recommendations based on feature reasons
        recommendations = {}
        for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals:
            reasons = self.feature_reasons.get(col, '').strip(' | ')
            recommendations[col] = reasons

        recommendations_table = pd.DataFrame.from_dict(
            recommendations, 
            orient='index', 
            columns=['Preprocessing Reason']
        )
        if debug_flag:
            self.logger.debug(f"Preprocessing Recommendations:\n{recommendations_table}")
        else:
            self.logger.info("Preprocessing Recommendations generated.")

        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Recommendations generated.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Recommendations generated.")

        return recommendations_table


    def save_transformers(self):
        """
        Save fitted transformers to disk for future use during prediction.

        The artefacts are bundled into a single dictionary and written with
        joblib to ``<transformers_dir>/transformers.pkl``. The target directory
        is created when it does not exist yet, so a first run does not fail on
        the dump itself.

        Artefacts that were never produced (for example no resampler was
        fitted) are stored as ``None`` / an empty list instead of raising
        ``AttributeError``, mirroring how the imputers are collected.

        Side effects:
            Writes the bundle to disk and appends the step name to
            ``self.preprocessing_steps``.

        Raises:
            Exception: Any error raised while creating the directory or dumping
                the bundle is logged and re-raised unchanged.
        """
        step_name = "Save Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_save_transformers')
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')

        # Each artefact only exists once the step that produces it has run, so
        # fall back to a neutral value rather than failing on attribute access.
        transformers = {
            'numerical_imputer': getattr(self, 'numerical_imputer', None),
            'categorical_imputer': getattr(self, 'categorical_imputer', None),
            'preprocessor': getattr(self, 'pipeline', None),  # the fitted pipeline
            'smote': getattr(self, 'smote', None),
            'final_feature_order': getattr(self, 'final_feature_order', []),
            'categorical_indices': getattr(self, 'categorical_indices', []),
        }
        try:
            # joblib.dump does not create missing parent directories.
            if self.transformers_dir:
                os.makedirs(self.transformers_dir, exist_ok=True)
            joblib.dump(transformers, transformers_path)
            if debug_flag:
                self._log(f"Transformers saved at '{transformers_path}'.", step_name, 'debug')
            else:
                self.logger.info(f"Transformers saved at '{transformers_path}'.")
        except Exception as e:
            self.logger.error(f"❌ Failed to save transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)

    def load_transformers(self) -> dict:
        """
        Load fitted transformers from disk for use during prediction.

        Returns:
            dict: A dictionary containing all necessary transformers.
        """
        step_name = "Load Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_load_transformers')  # Assuming a step-specific debug flag
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')
        
        if not os.path.exists(transformers_path):
            self.logger.error(f"❌ Transformers file not found at '{transformers_path}'. Cannot proceed with prediction.")
            raise FileNotFoundError(f"Transformers file not found at '{transformers_path}'.")
        
        try:
            transformers = joblib.load(transformers_path)
            
            # Extract transformers
            numerical_imputer = transformers.get('numerical_imputer')
            categorical_imputer = transformers.get('categorical_imputer')
            preprocessor = transformers.get('preprocessor')
            # scaler = transformers.get('scaler')  # **Removed**
            self.pipeline = transformers.get('preprocessor')  # **Load the pipeline**
            smote = transformers.get('smote', None)
            final_feature_order = transformers.get('final_feature_order', [])
            categorical_indices = transformers.get('categorical_indices', [])
            self.categorical_indices = categorical_indices  # **Set the attribute**

            # **Post-Loading Debugging:**
            for name, transformer, features in preprocessor.transformers_:
                if name == 'ord':
                    ordinal_encoder = transformer.named_steps.get('ordinal_encoder', None)
                    if ordinal_encoder:
                        if hasattr(ordinal_encoder, 'categories_'):
                            self.logger.debug(f"✅ OrdinalEncoder for features {features} is fitted.")
                        else:
                            self.logger.error(f"❌ OrdinalEncoder for features {features} is NOT fitted.")
                    else:
                        self.logger.error(f"❌ 'ordinal_encoder' not found in transformer '{name}'.")
        
        except Exception as e:
            self.logger.error(f"❌ Failed to load transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)
        
        # Additional checks
        if preprocessor is None:
            self.logger.error("❌ Preprocessor is not loaded.")

        if debug_flag:
            self._log(f"Transformers loaded successfully from '{transformers_path}'.", step_name, 'debug')
        else:
            self.logger.info(f"Transformers loaded successfully from '{transformers_path}'.")

        # Return the transformers as a dictionary
        return {
            'numerical_imputer': numerical_imputer,
            'categorical_imputer': categorical_imputer,
            'preprocessor': preprocessor,
            # 'scaler': scaler,  # **Removed**
            'smote': smote,
            'final_feature_order': final_feature_order,
            'categorical_indices': categorical_indices
        }



    def apply_scaling(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Apply scaling based on the model type and user options.

        Args:
            X_train (pd.DataFrame): Training features.
            X_test (Optional[pd.DataFrame]): Testing features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Scaled X_train and X_test.
        """
        step_name = "Apply Scaling (If Needed by Model)"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_apply_scaling')

        # Fetch user-defined scaling options or set defaults
        scaling_options = self.options.get('apply_scaling', {})
        scaling_method = scaling_options.get('method', None)  # 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'None'
        features_to_scale = scaling_options.get('features', self.numericals)

        scaler = None
        scaling_type = 'None'

        if scaling_method is None:
            # Default scaling based on model category
            if self.model_category in ['regression', 'classification', 'clustering']:
                # For clustering, MinMaxScaler is generally preferred
                if self.model_category == 'clustering':
                    scaler = MinMaxScaler()
                    scaling_type = 'MinMaxScaler'
                else:
                    scaler = StandardScaler()
                    scaling_type = 'StandardScaler'
            else:
                scaler = None
                scaling_type = 'None'
        else:
            # User-specified scaling method
            if scaling_method == 'StandardScaler':
                scaler = StandardScaler()
                scaling_type = 'StandardScaler'
            elif scaling_method == 'MinMaxScaler':
                scaler = MinMaxScaler()
                scaling_type = 'MinMaxScaler'
            elif scaling_method == 'RobustScaler':
                scaler = RobustScaler()
                scaling_type = 'RobustScaler'
            elif scaling_method == 'None':
                scaler = None
                scaling_type = 'None'
            else:
                self.logger.error(f"Scaling method '{scaling_method}' is not supported.")
                raise ValueError(f"Scaling method '{scaling_method}' is not supported.")

        # Apply scaling if scaler is defined
        if scaler is not None and features_to_scale:
            self.scaler = scaler
            if debug_flag:
                self._log(f"Features to scale: {features_to_scale}", debug_flag, 'debug')

            # Check if features exist in the dataset
            missing_features = [feat for feat in features_to_scale if feat not in X_train.columns]
            if missing_features:
                self.logger.error(f"The following features specified for scaling are missing in the dataset: {missing_features}")
                raise KeyError(f"The following features specified for scaling are missing in the dataset: {missing_features}")

            X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
            if X_test is not None:
                X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

            for col in features_to_scale:
                self.feature_reasons[col] += f'Scaling Applied: {scaling_type} | '

            self.preprocessing_steps.append(step_name)
            if debug_flag:
                self._log(f"Applied {scaling_type} to features: {features_to_scale}", debug_flag, 'debug')
                if hasattr(scaler, 'mean_'):
                    self._log(f"Scaler Parameters: mean={scaler.mean_}", debug_flag, 'debug')
                if hasattr(scaler, 'scale_'):
                    self._log(f"Scaler Parameters: scale={scaler.scale_}", debug_flag, 'debug')
                self._log(f"Sample of scaled X_train:\n{X_train[features_to_scale].head()}", debug_flag, 'debug')
                if X_test is not None:
                    self._log(f"Sample of scaled X_test:\n{X_test[features_to_scale].head()}", debug_flag, 'debug')
            else:
                self.logger.info(f"Step '{step_name}' completed: Applied {scaling_type} to features: {features_to_scale}")
        else:
            self.logger.info("No scaling applied based on user options or no features specified.")
            self.preprocessing_steps.append(step_name)
            if debug_flag:
                self._log(f"Completed: {step_name}. No scaling was applied.", debug_flag, 'debug')
            else:
                self.logger.info(f"Step '{step_name}' completed: No scaling was applied.")

        return X_train, X_test

    def implement_smote(self, X_train: pd.DataFrame, y_train: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Implement SMOTE or its variants based on class imbalance, dataset characteristics, and user options.

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series): Training target.

        Returns:
            Tuple[pd.DataFrame, pd.Series]: Resampled X_train and y_train.
        """
        step_name = "Implement SMOTE (Train Only)"
        self.logger.info(f"Step: {step_name}")

        # Check if classification
        if self.model_category != 'classification':
            self.logger.info("SMOTE not applicable: Not a classification model.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        # Fetch user-defined SMOTE options or set defaults
        smote_options = self.options.get('implement_smote', {})
        user_smote_variant = smote_options.get('variant', None)
        smote_params = smote_options.get('params', {})

        # Calculate class distribution
        class_counts = y_train.value_counts()
        if len(class_counts) < 2:
            self.logger.warning("SMOTE not applicable: Only one class present.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train
        majority_class = class_counts.idxmax()
        minority_class = class_counts.idxmin()
        majority_count = class_counts.max()
        minority_count = class_counts.min()
        imbalance_ratio = minority_count / majority_count
        self.logger.info(f"Class Distribution before SMOTE: {class_counts.to_dict()}")
        self.logger.info(f"Imbalance Ratio (Minority/Majority): {imbalance_ratio:.4f}")

        # Determine dataset composition
        has_numericals = len(self.numericals) > 0
        has_categoricals = len(self.ordinal_categoricals) + len(self.nominal_categoricals) > 0

        # Initialize variant_name as None
        variant_name = None

        # Determine SMOTE variant based on dataset composition and user preference
        if has_numericals and not has_categoricals:
            # Numerical-only dataset
            if user_smote_variant:
                variant_name = user_smote_variant
                self.logger.info(f"User-specified SMOTE variant: {variant_name}")
            else:
                self.logger.info("Dataset contains only numerical features. Analyzing to recommend SMOTE variants...")
                smote_recommendation = self.smote_numerics_criteria(
                    X_train=X_train,
                    y_train=y_train,
                    imbalance_threshold=self.imbalance_threshold,
                    noise_threshold=self.noise_threshold,
                    overlap_threshold=self.overlap_threshold,
                    boundary_threshold=self.boundary_threshold,
                    debug=self.get_debug_flag('debug_implement_smote')
                )
                if smote_recommendation:
                    variant_name = smote_recommendation[0]
                    self.logger.info(f"Recommended SMOTE variant: {variant_name}")
                else:
                    variant_name = 'SMOTE'
                    self.logger.info("No specific recommendation from criteria. Using default SMOTE.")
        elif has_numericals and has_categoricals:
            # Mixed dataset
            variant_name = 'SMOTENC'
            self.logger.info("Dataset contains numerical and categorical features. Using SMOTENC.")
        elif not has_numericals and has_categoricals:
            # Categorical-only dataset
            variant_name = 'SMOTEN'
            self.logger.info("Dataset contains only categorical features. Using SMOTEN.")
        else:
            # Fallback
            variant_name = 'SMOTE'
            self.logger.info("Dataset composition not recognized. Using SMOTE as default.")

        # Initialize SMOTE variant
        try:
            self.logger.debug(f"Initializing SMOTE Variant '{variant_name}' with parameters: {smote_params}")

            if variant_name == 'SMOTENC':
                # Combine ordinal and nominal categorical features
                categorical_cols = self.ordinal_categoricals + self.nominal_categoricals
                categorical_features = [X_train.columns.get_loc(col) for col in categorical_cols]
                smote = SMOTENC(
                    categorical_features=categorical_features,
                    random_state=42,
                    **smote_params
                )
            elif variant_name == 'SMOTEN':
                smote = SMOTEN(
                    random_state=42,
                    **smote_params
                )
            elif variant_name in ['SMOTE', 'ADASYN', 'BorderlineSMOTE', 'SMOTETomek', 'SMOTEENN']:
                smote_class = {
                    'SMOTE': SMOTE,
                    'ADASYN': ADASYN,
                    'BorderlineSMOTE': BorderlineSMOTE,
                    'SMOTETomek': SMOTETomek,
                    'SMOTEENN': SMOTEENN
                }.get(variant_name, SMOTE)  # Default to SMOTE if not found
                smote = smote_class(
                    random_state=42,
                    **smote_params
                )
            else:
                self.logger.warning(f"Unknown SMOTE variant '{variant_name}'. Falling back to SMOTE.")
                smote = SMOTE(random_state=42, **smote_params)

            # Validate parameters using inspect
            smote_signature = signature(smote.__class__)
            valid_params = smote_signature.parameters.keys()
            invalid_params = set(smote_params.keys()) - set(valid_params)
            if invalid_params:
                self.logger.warning(f"Invalid parameters for SMOTE variant '{variant_name}': {invalid_params}. These will be ignored.")

        except TypeError as e:
            self.logger.error(f"Error initializing SMOTE variant '{variant_name}': {e}")
            raise
        except Exception as e:
            self.logger.error(f"Unexpected error during SMOTE initialization: {e}")
            raise

        # Validate that all features are numerical before SMOTENC
        if variant_name == 'SMOTENC':
            # All features should be numerical after encoding
            if not np.all([np.issubdtype(dtype, np.number) for dtype in X_train.dtypes]):
                non_numeric_cols = X_train.columns[~X_train.dtypes.apply(np.issubdtype, args=(np.number,))]
                self.logger.error(f"SMOTENC requires all features to be numerical. Non-numeric columns found: {list(non_numeric_cols)}")
                raise ValueError(f"SMOTENC requires all features to be numerical. Non-numeric columns found: {list(non_numeric_cols)}")

        # Apply SMOTE variant
        try:
            self.logger.debug("Applying SMOTE variant...")
            X_res, y_res = smote.fit_resample(X_train, y_train)
            self.smote = smote
            self.preprocessing_steps.append(step_name)
            self.logger.info(f"Applied SMOTE Variant '{variant_name}'. Resampled X_train shape: {X_res.shape}, y_train shape: {y_res.shape}")
            self.logger.debug(f"Class Distribution after SMOTE: {y_res.value_counts().to_dict()}")
            return X_res, y_res
        except ValueError as ve:
            self.logger.error(f"ValueError during SMOTE application: {ve}")
            raise
        except Exception as e:
            self.logger.error(f"Unexpected error during SMOTE application: {e}")
            raise


    def inverse_transform_data(self, X_transformed: np.ndarray) -> pd.DataFrame:
        """
        Perform inverse transformation on the transformed data to reconstruct original feature values.

        Args:
            X_transformed (np.ndarray): The transformed feature data.

        Returns:
            pd.DataFrame: The inverse-transformed DataFrame.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")

        preprocessor = self.pipeline
        logger = logging.getLogger('InverseTransform')
        if self.debug:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

        logger.debug("Starting inverse transformation.")

        # Initialize dictionaries to hold inverse-transformed data
        inverse_data = {}

        # Initialize index tracker
        start_idx = 0

        # Iterate through each transformer in the ColumnTransformer
        for name, transformer, features in preprocessor.transformers_:
            if name == 'remainder':
                continue  # Skip any remainder features
            # Extract the transformed data for current transformer
            if name == 'num':
                end_idx = start_idx + len(features)
                numerical_data = X_transformed[:, start_idx:end_idx]
                numerical_inverse = transformer.named_steps['scaler'].inverse_transform(numerical_data)
                inverse_data.update({feature: numerical_inverse[:, idx] for idx, feature in enumerate(features)})
                logger.debug(f"Numerical features {features} inverse transformed.")
                start_idx = end_idx
            elif name == 'ord':
                ordinal_encoder = transformer.named_steps.get('ordinal_encoder', None)
                if ordinal_encoder:
                    if hasattr(ordinal_encoder, 'categories_'):
                        end_idx = start_idx + len(features)
                        ordinal_data = X_transformed[:, start_idx:end_idx]
                        ordinal_inverse = ordinal_encoder.inverse_transform(ordinal_data)
                        inverse_data.update({feature: ordinal_inverse[:, idx] for idx, feature in enumerate(features)})
                        logger.debug(f"Ordinal features {features} inverse transformed.")
                        start_idx = end_idx
                    else:
                        logger.error(f"OrdinalEncoder for features {features} is NOT fitted.")
                        raise AttributeError(f"OrdinalEncoder for features {features} is NOT fitted.")
                else:
                    logger.error(f"'ordinal_encoder' not found in transformer '{name}'.")
                    raise AttributeError(f"'ordinal_encoder' not found in transformer '{name}'.")
            elif name.startswith('onehot_enc_'):
                # For OneHotEncoder, need to inverse transform multiple columns
                transformer_steps = transformer.named_steps
                onehot_encoder = transformer_steps['onehot_encoder']
                # Get number of categories for this feature
                n_categories = len(onehot_encoder.categories_[0])
                end_idx = start_idx + n_categories
                nominal_data = X_transformed[:, start_idx:end_idx]
                nominal_inverse = onehot_encoder.inverse_transform(nominal_data)
                inverse_data.update({feature: nominal_inverse[:, 0] for feature in features})
                logger.debug(f"Nominal features {features} inverse transformed.")
                start_idx = end_idx
            else:
                logger.warning(f"Unknown transformer '{name}'. Skipping inversion.")

        # Create the inverse-transformed DataFrame
        inverse_df = pd.DataFrame(inverse_data)

        logger.debug("Inverse-transformed DataFrame constructed.")
        logger.debug(f"Inverse-transformed DataFrame shape: {inverse_df.shape}")

        logger.info("✅ Inverse transformation completed successfully.")

        return inverse_df



    def build_pipeline(self, X_train: pd.DataFrame) -> ColumnTransformer:
        transformers = []

        if self.numericals:
            numerical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())  # Include scaler here
            ])
            transformers.append(('num', numerical_transformer, self.numericals))
            self.logger.debug("Numerical transformer (Imputer + Scaler) added to pipeline.")

        if self.ordinal_categoricals:
            ordinal_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('ordinal_encoder', OrdinalEncoder())
            ])
            transformers.append(('ord', ordinal_transformer, self.ordinal_categoricals))
            self.logger.debug("Ordinal transformer added to pipeline.")

        if self.nominal_categoricals:
            for feature in self.nominal_categoricals:
                transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))
                ])
                transformers.append((f'onehot_enc_{feature}', transformer, [feature]))
                self.logger.debug(f"Nominal transformer for '{feature}' added to pipeline.")

        if not transformers:
            self.logger.error("No transformers to add to the pipeline. Check feature categorization.")
            raise ValueError("No transformers to add to the pipeline. Check feature categorization.")

        preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')
        self.logger.debug("ColumnTransformer constructed with the following transformers:")
        for t in transformers:
            self.logger.debug(t)

        preprocessor.fit(X_train)
        self.logger.info("✅ Preprocessor fitted on training data.")

        # Determine categorical feature indices for SMOTENC
        categorical_indices = []
        current_index = 0
        for name, transformer, features in preprocessor.transformers_:
            if name.startswith('onehot_enc_'):
                ohe = transformer.named_steps['onehot_encoder']
                n_categories = len(ohe.categories_[0])
                categorical_indices.extend(list(range(current_index, current_index + n_categories)))
                self.logger.debug(f"Feature '{name}' has {n_categories} categories; indices {list(range(current_index, current_index + n_categories))}.")
                current_index += n_categories
            elif name in ['num', 'ord']:
                n_features = len(features)
                current_index += n_features
                self.logger.debug(f"Feature '{name}' has {n_features} features; advancing start index by {n_features}.")
            else:
                self.logger.warning(f"Unknown transformer '{name}'. Skipping index calculation.")

        self.categorical_indices = categorical_indices
        self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")

        # Validate that encoders are fitted
        for name, transformer, features in preprocessor.transformers_:
            if name == 'ord':
                ordinal_encoder = transformer.named_steps.get('ordinal_encoder', None)
                if ordinal_encoder and hasattr(ordinal_encoder, 'categories_'):
                    self.logger.debug(f"✅ OrdinalEncoder for features {features} is fitted.")
                else:
                    self.logger.error(f"❌ OrdinalEncoder for features {features} is NOT fitted.")
                    raise AttributeError(f"OrdinalEncoder for features {features} is NOT fitted.")
        return preprocessor



    def preprocess_train(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess data for training mode.

        Runs, in order: train/test split, missing-value handling, the
        normality test, outlier handling (training rows only), recommendation
        generation, pipeline construction, pipeline transformation of both
        splits, SMOTENC resampling (classification only, training rows only),
        transformer persistence and a best-effort inverse transform of the
        test split.

        Args:
            X (pd.DataFrame): Feature columns.
            y (pd.Series): Target values for ``X``.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Optional[pd.DataFrame]]:
                - X_train: Transformed training features.
                - X_test: Transformed test features (None when no test split is available).
                - y_train: Training target.
                - y_test: Test target.
                - recommendations: Preprocessing recommendations.
                - X_test_inverse: Inverse-transformed test features, or None when
                  there is no test split or the inverse transform fails.

        Raises:
            ValueError: In the classification branch, if the transformed test
                set no longer has as many rows as the original test split.
        """
        def _shape_of(frame):
            # Shape for log messages; tolerates a missing (None) test split.
            return None if frame is None else frame.shape

        # Step 1: Split Dataset
        X_train_original, X_test_original, y_train_original, y_test = self.split_dataset(X, y)

        # Debugging: Log shapes after splitting
        self.logger.debug(
            f"After splitting: X_train_original.shape={X_train_original.shape}, "
            f"X_test_original.shape={_shape_of(X_test_original)}"
        )

        # Step 2: Handle Missing Values
        X_train_missing_values, X_test_missing_values = self.handle_missing_values(X_train_original, X_test_original)

        # Debugging: Log shapes after handling missing values
        self.logger.debug(
            f"After handling missing values: X_train_missing_values.shape={X_train_missing_values.shape}, "
            f"X_test_missing_values.shape={_shape_of(X_test_missing_values)}"
        )

        # Step 3: Test for Normality
        if self.model_category in ['regression', 'classification', 'clustering']:
            self.test_normality(X_train_missing_values)

        # Step 4: Handle Outliers (only the training split is passed through)
        X_train_outliers_handled, y_train_outliers_handled = self.handle_outliers(X_train_missing_values, y_train_original)

        # Debugging: Log shapes after handling outliers
        self.logger.debug(f"After handling outliers: X_train_outliers_handled.shape={X_train_outliers_handled.shape}")

        # The test split skips outlier handling; keep an independent copy of it.
        X_test_outliers_handled = X_test_missing_values.copy() if X_test_missing_values is not None else None

        # Step 5: Generate Preprocessing Recommendations
        recommendations = self.generate_recommendations()

        # Step 6: Build the Pipeline
        # Only transform() is called below, so build_pipeline is expected to
        # return an already-fitted pipeline.
        self.pipeline = self.build_pipeline(X_train_outliers_handled)

        # Transform training data
        X_train_preprocessed = self.pipeline.transform(X_train_outliers_handled)

        # Transform test data
        X_test_preprocessed = self.pipeline.transform(X_test_outliers_handled) if X_test_outliers_handled is not None else None
        if X_test_preprocessed is not None:
            self.logger.debug(f"After pipeline transform: X_test_preprocessed.shape={X_test_preprocessed.shape}")

        self.logger.info("✅ Training and test data preprocessed.")

        # Step 7: Implement SMOTENC (Train Only for Classification)
        if self.model_category == 'classification':
            smotenc = SMOTENC(
                # presumably populated while the pipeline is built; verify against build_pipeline
                categorical_features=self.categorical_indices,
                sampling_strategy='auto',
                random_state=42,
                k_neighbors=5
            )
            X_train_smoted, y_train_smoted = smotenc.fit_resample(X_train_preprocessed, y_train_outliers_handled)
            self.smote = smotenc
            self.logger.info("✅ SMOTENC applied to training data.")
            self.logger.debug(f"After SMOTENC: X_train_smoted.shape={X_train_smoted.shape}, y_train_smoted.shape={y_train_smoted.shape}")

            # Validation: the test set must still have the row count of the original split
            if X_test_preprocessed is not None:
                expected_test_shape = (X_test_original.shape[0], X_test_preprocessed.shape[1])
                actual_test_shape = X_test_preprocessed.shape
                if actual_test_shape != expected_test_shape:
                    self.logger.error(f"❌ Test set shape mismatch: Expected {expected_test_shape}, Got {actual_test_shape}")
                    raise ValueError(f"Test set shape mismatch: Expected {expected_test_shape}, Got {actual_test_shape}")
        else:
            X_train_smoted, y_train_smoted = X_train_preprocessed, y_train_outliers_handled
            self.logger.info("⚠️ SMOTENC not applied: Not a classification model.")

        # Step 8: Assemble final frames and save transformers (including the pipeline)
        self.final_feature_order = list(self.pipeline.get_feature_names_out())
        # No index is assigned to the training frame: resampling can change its row count.
        X_train_final = pd.DataFrame(X_train_smoted, columns=self.final_feature_order)
        X_test_final = pd.DataFrame(X_test_preprocessed, columns=self.final_feature_order, index=X_test_original.index) if X_test_preprocessed is not None else None

        self.logger.debug(f"Final Training DataFrame shape: {X_train_final.shape}")
        if X_test_final is not None:
            self.logger.debug(f"Final Test DataFrame shape: {X_test_final.shape}")

        self.save_transformers()

        # Confirm Indices Before Inverse Transform (Debugging Step)
        self._log("Indices in X_test_final after transformations:", "preprocess_train", 'debug')
        if X_test_final is not None:
            self._log(X_test_final.index, "preprocess_train", 'debug')
            self._log("Indices in X_test_original:", "preprocess_train", 'debug')
            self._log(X_test_original.index, "preprocess_train", 'debug')

        # Inverse transformations (optional, for interpretability).
        # Start from None so the return below is always bound, even when there
        # is no test split to invert.
        X_test_inverse = None
        if X_test_final is not None:
            try:
                # Use the final test dataset (fully transformed) for inverse transformations
                X_test_inverse = self.inverse_transform_data(X_test_final.values)
                self.logger.info("✅ Inverse transformations applied successfully.")
                self.logger.debug(f"Inverse-transformed X_test_inverse.shape={X_test_inverse.shape}")
            except Exception as e:
                # Best effort: training output is still returned without the inverse view.
                self.logger.error(f"❌ Inverse transformations failed: {e}")
                X_test_inverse = None

        # Return processed datasets
        return X_train_final, X_test_final, y_train_smoted, y_test, recommendations, X_test_inverse

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """
        Apply the stored preprocessing pipeline to new data.

        Args:
            X (pd.DataFrame): Data to run through ``self.pipeline``.

        Returns:
            np.ndarray: Whatever ``self.pipeline.transform`` produces for ``X``.

        Raises:
            AttributeError: If ``self.pipeline`` is None.
        """
        fitted_pipeline = self.pipeline
        if fitted_pipeline is None:
            not_fitted = "Preprocessing pipeline has not been fitted. Call fit_transform first."
            self.logger.error(not_fitted)
            raise AttributeError(not_fitted)

        self.logger.debug("Transforming new data.")
        transformed = fitted_pipeline.transform(X)

        # Debug runs report the output shape; otherwise a plain confirmation is logged.
        if not self.debug:
            self.logger.info("Data transformed.")
        else:
            self.logger.debug(f"Transformed data shape: {transformed.shape}")
        return transformed

    def preprocess_predict(self, X: pd.DataFrame, transformers: dict) -> Tuple[np.ndarray, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess data for prediction mode.

        Args:
            X (pd.DataFrame): New data for prediction.
            transformers (dict): Accepted for interface compatibility only; it
                is not read here. Saved state is (re)loaded through
                ``self.load_transformers()`` instead.

        Returns:
            Tuple[np.ndarray, pd.DataFrame, Optional[pd.DataFrame]]:
                - X_processed: Output of ``self.pipeline.transform(X)``.
                - recommendations: Result of ``self.generate_recommendations()``.
                - X_inverse: Inverse-transformed features, or None if the
                  inverse transform raised.

        Raises:
            Exception: Any error raised while transforming ``X`` is logged and
                re-raised unchanged.
        """
        self.logger.info(f"Starting preprocessing in '{self.mode}' mode.")

        # Load transformers; this is expected to populate self.pipeline
        # (it is used immediately below) — verify against load_transformers.
        self.load_transformers()

        # Apply the pipeline to the new data
        try:
            X_processed = self.pipeline.transform(X)
            self.logger.info("✅ New data preprocessed successfully.")
        except Exception as e:
            self.logger.error(f"❌ Preprocessing failed: {e}")
            raise

        # Generate recommendations if needed (optional)
        recommendations = self.generate_recommendations()

        # Inverse transform for interpretability (best effort)
        try:
            X_inverse = self.inverse_transform_data(X_processed)
            self.logger.info("✅ Inverse transformation applied successfully.")
        except Exception as e:
            self.logger.error(f"❌ Inverse transformation failed: {e}")
            X_inverse = None  # Proceed without inverse transform if it fails

        return X_processed, recommendations, X_inverse

    def preprocess_clustering(self, X: pd.DataFrame, debug: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Run the clustering-mode preprocessing sequence on ``X``.

        The data goes through missing-value handling, outlier handling,
        recommendation generation, pipeline construction and finally the
        pipeline's ``transform``. The built pipeline is stored on
        ``self.pipeline``.

        Args:
            X (pd.DataFrame): Input features for clustering.
            debug (bool): Forwarded to the helper steps and used to gate the
                extra debug log lines emitted here.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: The transformed data and the
            preprocessing recommendations.
        """
        log = self.logger
        step_name = "Preprocess Clustering"
        log.info(f"Step: {step_name}")
        if debug:
            log.debug("Starting preprocessing for clustering.")

        # No test split and no target exist in this mode, hence the None arguments.
        imputed, _ = self.handle_missing_values(X, None, debug=debug)
        cleaned, _ = self.handle_outliers(imputed, None, debug=debug)

        recommendations = self.generate_recommendations()

        # Build the pipeline on the cleaned data, then push the same data through it.
        self.pipeline = self.build_pipeline(cleaned, debug=debug)
        X_processed = self.pipeline.transform(cleaned)
        if debug:
            log.debug(f"After pipeline transform: X_processed.shape={X_processed.shape}")

        log.info("✅ Clustering data preprocessed successfully.")
        return X_processed, recommendations


    def final_preprocessing(
        self, 
        data: pd.DataFrame
    ) -> Tuple:
        """
        Dispatch ``data`` to the preprocessing routine matching ``self.mode``.

        Args:
            data (pd.DataFrame): Input dataset containing features and, in
                train mode, the target column(s) named in ``self.y_variable``.

        Returns:
            Tuple: Depending on mode:
                - 'train': X_train, X_test, y_train, y_test, recommendations, X_test_inverse
                - 'predict': X_processed, recommendations, X_inverse
                - 'clustering': X_processed, recommendations

        Raises:
            ValueError: In train mode, if a target column is absent from ``data``.
            NotImplementedError: If ``self.mode`` is none of the three modes above.
        """
        mode = self.mode
        self.logger.info(f"Starting: Final Preprocessing Pipeline in '{mode}' mode.")

        if mode == 'clustering':
            # Every column is a feature; work on a copy of the input.
            return self.preprocess_clustering(data.copy())

        if mode == 'predict':
            # Every column is a feature; the loaded transformers are handed along.
            features = data.copy()
            loaded = self.load_transformers()
            return self.preprocess_predict(features, loaded)

        if mode != 'train':
            raise NotImplementedError(f"Mode '{mode}' is not implemented.")

        # Train mode: every configured target column must be present.
        target_columns = self.y_variable
        missing_y = [name for name in target_columns if name not in data.columns]
        if missing_y:
            raise ValueError(f"Target variable(s) {missing_y} not found in the dataset.")

        # Split into features and target; a single target column becomes a Series.
        features = data.drop(target_columns, axis=1)
        target = data[target_columns]
        if len(target_columns) == 1:
            target = target.iloc[:, 0]

        if target is None:
            raise ValueError("Target variable 'y' must be provided in train mode.")
        return self.preprocess_train(features, target)



    # Optionally, implement a method to display column info for debugging
    def _debug_column_info(self, df: pd.DataFrame, step: str = "Debug Column Info"):
        """
        Log each column's dtype and number of unique values at debug level.

        Args:
            df (pd.DataFrame): The DataFrame to inspect.
            step (str, optional): Label placed in the header line. Defaults to "Debug Column Info".
        """
        emit = self.logger.debug
        emit(f"\n📊 {step}: Column Information")
        for name in df.columns:
            column = df[name]
            emit(f"Column '{name}': {column.dtype}, Unique Values: {column.nunique()}")
        # Trailing newline entry separates this report from later log output.
        emit("\n")
        
        

import pandas as pd
import logging
import os
import yaml
import joblib

# Assuming DataPreprocessor is defined/imported correctly
# from data_preprocessor import DataPreprocessor

def load_dataset(path: str) -> pd.DataFrame:
    """
    Read a CSV dataset into a DataFrame.

    Args:
        path (str): Path to the dataset CSV file.

    Returns:
        pd.DataFrame: Loaded dataset.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"Dataset not found at {path}")

def load_config(config_path: str) -> dict:
    """
    Load and parse the YAML configuration file.

    Args:
        config_path (str): Path to the preprocessor_config.yaml file.

    Returns:
        dict: Parsed configuration.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If the file does not parse to a mapping (for example an
            empty file, or a document whose top level is a list).
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Configuration file not found at {config_path}")
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    # safe_load returns None for an empty document; callers rely on dict methods.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file at {config_path} must contain a YAML mapping, "
            f"got {type(config).__name__}"
        )
    return config

def main():
    """
    Run the training-mode preprocessing workflow as a script entry point.

    Loads the YAML configuration, configures logging, builds a
    ``DataPreprocessor`` in 'train' mode, obtains the dataset through
    ``FeatureManager``, runs ``final_preprocessing`` and writes the resulting
    splits and recommendations as CSV files into the configured output
    directory.

    Failures are reported (printed before logging is configured, logged
    afterwards) and end the run by returning ``None``. The one exception is a
    failure while loading features and the dataset, which is logged and
    re-raised.
    """
    # ----------------------------
    # Step 1: Load Configuration
    # ----------------------------
    config_path = '../../ml-preprocessing-utils/data/dataset/test/preprocessor_config/preprocessor_config.yaml'  # Path to your preprocessor_config.yaml
    try:
        config = load_config(config_path)
        logger_config = config.get('logging', {})
        logger_level = logger_config.get('level', 'INFO').upper()
        logger_format = logger_config.get('format', '%(asctime)s [%(levelname)s] %(message)s')
    except Exception as e:
        # Logging is not configured yet, so report on stdout.
        print(f"❌ Failed to load configuration: {e}")
        return  # Exit if config loading fails

    # ----------------------------
    # Step 2: Configure Logging
    # ----------------------------
    # The 'debug' flag overrides the configured level.
    debug_flag = logger_config.get('debug', False)
    logging.basicConfig(
        level=logging.DEBUG if debug_flag else getattr(logging, logger_level, logging.INFO),
        format=logger_format,
        handlers=[
            logging.StreamHandler()
        ]
    )
    logger = logging.getLogger('main_preprocessing')

    # ----------------------------
    # Step 3: Extract Feature Assets
    # ----------------------------
    features_config = config.get('features', {})
    column_assets = {
        'y_variable': features_config.get('y_variable', []),
        'ordinal_categoricals': features_config.get('ordinal_categoricals', []),
        'nominal_categoricals': features_config.get('nominal_categoricals', []),
        'numericals': features_config.get('numericals', [])
    }

    # ----------------------------
    # Step 4: Extract Execution Parameters for Training
    # ----------------------------
    execution_train = config.get('execution', {}).get('train', {})
    train_mode = 'train'

    train_input_path = execution_train.get('input_path', '')
    output_dir = execution_train.get('output_dir', './processed_data')
    transformers_dir = execution_train.get('save_transformers_path', './transformers')
    normalize_debug = execution_train.get('normalize_debug', False)
    normalize_graphs_output = execution_train.get('normalize_graphs_output', False)

    # Validate essential paths
    if not train_input_path:
        logger.error("❌ 'input_path' for training mode is not specified in the configuration.")
        return
    if not os.path.exists(train_input_path):
        logger.error(f"❌ Training input dataset not found at {train_input_path}.")
        return

    # ----------------------------
    # Step 5: Extract Model Configurations
    # ----------------------------
    model_name = config.get('model_type', 'Tree Based Classifier')  # Fetch 'model_type' from config
    model_config = config.get('models', {}).get(model_name, {})
    if not model_config:
        logger.error(f"❌ Model configuration for '{model_name}' not found.")
        return

    # ----------------------------
    # Step 6: Initialize DataPreprocessor
    # ----------------------------
    preprocessor = DataPreprocessor(
        model_type=model_name,  # Use the actual model name
        column_assets=column_assets,
        mode=train_mode,
        options=model_config,
        debug=debug_flag,
        normalize_debug=normalize_debug,
        normalize_graphs_output=normalize_graphs_output,
        graphs_output_dir=config.get('execution', {}).get('shared', {}).get('plot_output_dir', './plots'),
        transformers_dir=transformers_dir
    )

    # ----------------------------
    # Step 7: Initialize FeatureManager
    # ----------------------------
    # NOTE(review): FeatureManager is not defined or imported in the visible
    # part of this file — confirm it is in scope when this script runs.
    save_path = config.get('execution', {}).get('features_metadata_path', '../../ml-preprocessing-utils/data/dataset/test/features_info/features_metadata.pkl')
    feature_manager = FeatureManager(save_path=save_path)

    # ----------------------------
    # Step 8: Load Training Dataset
    # ----------------------------
    # NOTE(review): column_assets is rebound here after the preprocessor was
    # already constructed with the config-derived value — confirm this is intended.
    try:
        filtered_df, column_assets = feature_manager.load_features_and_dataset(
            debug=True  # Set to False to reduce verbosity
        )
        logger.info("✅ Features loaded and dataset filtered successfully.")
    except Exception as e:
        logger.error(f"❌ Failed to load features and dataset: {e}")
        raise

    # ----------------------------
    # Step 9: Execute Preprocessing
    # ----------------------------
    try:
        X_train, X_test, y_train, y_test, recommendations, X_test_inverse = preprocessor.final_preprocessing(filtered_df)
        logger.info("✅ Preprocessing completed successfully in train mode.")
    except Exception as e:
        logger.error(f"❌ Preprocessing failed in train mode: {e}")
        return

    # ----------------------------
    # Step 10: Save Preprocessed Data
    # ----------------------------
    try:
        os.makedirs(output_dir, exist_ok=True)
        X_train.to_csv(os.path.join(output_dir, 'X_train.csv'), index=False)
        y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=False)
        # The test split can come back as None; skip those files instead of
        # aborting the save half-way through.
        if X_test is not None:
            X_test.to_csv(os.path.join(output_dir, 'X_test.csv'), index=False)
        else:
            logger.warning("⚠️ No test features returned; 'X_test.csv' was not written.")
        if y_test is not None:
            y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=False)
        else:
            logger.warning("⚠️ No test target returned; 'y_test.csv' was not written.")
        recommendations.to_csv(os.path.join(output_dir, 'preprocessing_recommendations.csv'), index=False)
        logger.info(f"✅ Preprocessed data saved to '{output_dir}'.")
    except Exception as e:
        logger.error(f"❌ Failed to save preprocessed data: {e}")
        return

    # ----------------------------
    # Optional: Visualize Inverse Transformations
    # ----------------------------
    try:
        if X_test_inverse is not None:
            print("Inverse Transformed Test Data:")
            print(X_test_inverse.head())
    except Exception as e:
        logger.error(f"❌ Error during visualization: {e}")
        return

    logger.info("✅ All preprocessing tasks completed successfully.")

# Run the training preprocessing workflow only when executed as a script.
if __name__ == "__main__":
    main()
    
    
# predict_main.py

import pandas as pd
import logging
import os
import yaml
import joblib
import numpy as np

# Assuming DataPreprocessor is defined/imported correctly
# from data_preprocessor import DataPreprocessor

def load_dataset(path: str) -> pd.DataFrame:
    """
    Read the CSV file at ``path`` and return its contents.

    Args:
        path (str): Path to the dataset CSV file.

    Returns:
        pd.DataFrame: Loaded dataset.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    dataset_present = os.path.exists(path)
    if not dataset_present:
        raise FileNotFoundError(f"Dataset not found at {path}")
    frame = pd.read_csv(path)
    return frame

def load_config(config_path: str) -> dict:
    """
    Load and parse the YAML configuration file.

    Args:
        config_path (str): Path to the preprocessor_config.yaml file.

    Returns:
        dict: Parsed configuration.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ValueError: If the file does not parse to a mapping (for example an
            empty file, or a document whose top level is a list).
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Configuration file not found at {config_path}")
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    # safe_load returns None for an empty document; callers rely on dict methods.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file at {config_path} must contain a YAML mapping, "
            f"got {type(config).__name__}"
        )
    return config

def predict():
    """
    Main function for Predict Mode.

    Loads the YAML configuration, configures logging, validates the
    configured input/transformer/model paths, builds a ``DataPreprocessor``
    in 'predict' mode, loads its saved transformers, preprocesses the
    prediction input and writes a ``predictions.csv`` file.

    Model loading and inference are currently commented out; the
    ``predictions`` column is filled with randomly chosen placeholder labels.

    Every failure is reported (printed before logging is configured, logged
    afterwards) and ends the run by returning ``None``. A failed inverse
    transform is not fatal: the output then contains the predictions only.
    """
    # ----------------------------
    # Step 1: Load Configuration
    # ----------------------------
    config_path = '../../ml-preprocessing-utils/data/dataset/test/preprocessor_config/preprocessor_config.yaml'  # Path to your preprocessor_config.yaml
    try:
        config = load_config(config_path)
        logger_config = config.get('logging', {})
        logger_level = logger_config.get('level', 'INFO').upper()
        logger_format = logger_config.get('format', '%(asctime)s [%(levelname)s] %(message)s')
    except Exception as e:
        # Logging is not configured yet, so report on stdout.
        print(f"❌ Failed to load configuration: {e}")
        return  # Exit if config loading fails

    # ----------------------------
    # Step 2: Configure Logging
    # ----------------------------
    # The 'debug' flag overrides the configured level.
    debug_flag = logger_config.get('debug', False)
    logging.basicConfig(
        level=logging.DEBUG if debug_flag else getattr(logging, logger_level, logging.INFO),
        format=logger_format,
        handlers=[
            logging.StreamHandler()
        ]
    )
    logger = logging.getLogger('predict_preprocessing')

    # ----------------------------
    # Step 3: Extract Feature Assets and Execution Parameters for Predict
    # ----------------------------
    features_config = config.get('features', {})
    column_assets = {
        'y_variable': features_config.get('y_variable', []),  # Not used in predict, but kept for consistency
        'ordinal_categoricals': features_config.get('ordinal_categoricals', []),
        'nominal_categoricals': features_config.get('nominal_categoricals', []),
        'numericals': features_config.get('numericals', [])
    }

    execution_predict = config.get('execution', {}).get('predict', {})
    predict_mode = 'predict'

    prediction_input_path = execution_predict.get('prediction_input_path', '')
    load_transformers_path = execution_predict.get('load_transformers_path', '')
    trained_model_path = execution_predict.get('trained_model_path', '')
    predictions_output_path = execution_predict.get('predictions_output_path', './predictions')
    normalize_debug = execution_predict.get('normalize_debug', False)
    normalize_graphs_output = execution_predict.get('normalize_graphs_output', False)

    # Validate essential paths: each must be configured and must exist on disk.
    required_paths = (
        ('prediction_input_path', prediction_input_path, "Prediction input dataset"),
        ('load_transformers_path', load_transformers_path, "Transformers file"),
        ('trained_model_path', trained_model_path, "Trained model"),
    )
    for config_key, configured_path, label in required_paths:
        if not configured_path:
            logger.error(f"❌ '{config_key}' for predict mode is not specified in the configuration.")
            return
        if not os.path.exists(configured_path):
            logger.error(f"❌ {label} not found at {configured_path}.")
            return

    # ----------------------------
    # Step 4: Initialize DataPreprocessor in Predict Mode
    # ----------------------------
    model_name = config.get('model_type', 'Tree Based Classifier')
    preprocessor = DataPreprocessor(
        model_type=model_name,  # Use the actual model name
        column_assets=column_assets,
        mode=predict_mode,
        options=config.get('models', {}).get(model_name, {}),
        debug=debug_flag,
        normalize_debug=normalize_debug,
        normalize_graphs_output=normalize_graphs_output,
        graphs_output_dir=config.get('execution', {}).get('shared', {}).get('plot_output_dir', './plots'),
        # The preprocessor is given the directory that contains the transformers file.
        transformers_dir=os.path.dirname(load_transformers_path)
    )

    # ----------------------------
    # Step 5: Load Trained Transformers
    # ----------------------------
    try:
        preprocessor.load_transformers()
        logger.info("✅ Transformers loaded successfully.")
    except Exception as e:
        logger.error(f"❌ Failed to load transformers: {e}")
        return

    # ----------------------------
    # Step 6: Load Trained Model
    # ----------------------------
    # try:
    #     model = joblib.load(trained_model_path)
    #     logger.info(f"✅ Trained model loaded from '{trained_model_path}'.")
    # except Exception as e:
    #     logger.error(f"❌ Failed to load trained model: {e}")
    #     return

    # ----------------------------
    # Step 7: Load Prediction Input Data
    # ----------------------------
    try:
        X_new = load_dataset(prediction_input_path)
        logger.info(f"✅ Prediction input data loaded from '{prediction_input_path}'.")
    except Exception as e:
        logger.error(f"❌ Failed to load prediction input data: {e}")
        return

    # ----------------------------
    # Step 8: Preprocess the New Data
    # ----------------------------
    try:
        X_new_preprocessed = preprocessor.transform(X_new)
        logger.info("✅ New data preprocessed successfully.")
    except Exception as e:
        logger.error(f"❌ Preprocessing of new data failed: {e}")
        return

    # ----------------------------
    # Step 9: Inverse Transform the Data for Interpretability
    # ----------------------------
    try:
        X_new_inverse = preprocessor.inverse_transform_data(X_new_preprocessed)
        logger.info("✅ Inverse transformations applied successfully.")
        logger.info(f"Inverse-transformed X_new_inverse.shape={X_new_inverse.shape}")
    except Exception as e:
        logger.error(f"❌ Inverse transformations failed: {e}")
        # Predictions do not exist yet at this point; signal the failure with
        # None and let Step 11 build the predictions-only output.
        X_new_inverse = None

    # ----------------------------
    # Step 10: Make Predictions
    # ----------------------------
    # try:
    #     y_new_pred = model.predict(X_new_preprocessed)
    #     logger.info("✅ Predictions made successfully on new data.")
    # except Exception as e:
    #     logger.error(f"❌ Failed to make predictions: {e}")
    #     return

    # Placeholder while model inference above is disabled: one random label per row.
    y_new_pred = np.random.choice(['1', '0'], size=X_new_preprocessed.shape[0])  # Example for binary predictions

    # ----------------------------
    # Step 11: Attach Predictions
    # ----------------------------
    try:
        if X_new_inverse is not None:
            X_new_inverse['predictions'] = y_new_pred
            logger.debug("Predictions attached to inverse-transformed DataFrame.")
        else:
            # If inverse transformation failed, create a DataFrame with predictions only
            X_new_inverse = pd.DataFrame({'predictions': y_new_pred})
            logger.warning("Inverse transformation was not applied. Proceeding with predictions only.")
    except Exception as e:
        logger.error(f"❌ Inverse transformation failed: {e}")
        X_new_inverse = pd.DataFrame({'predictions': y_new_pred})  # Assign to X_new_inverse

    # ----------------------------
    # Step 12: Save Predictions
    # ----------------------------
    try:
        os.makedirs(predictions_output_path, exist_ok=True)
        X_new_inverse.to_csv(os.path.join(predictions_output_path, 'predictions.csv'), index=False)
        logger.info(f"✅ Predictions saved to '{predictions_output_path}/predictions.csv'.")
    except Exception as e:
        logger.error(f"❌ Failed to save predictions: {e}")
        return

    # ----------------------------
    # Step 13: Display Predictions
    # ----------------------------
    try:
        print("\nInverse Transformed Prediction DataFrame with Predictions:")
        print(X_new_inverse.head())
    except Exception as e:
        logger.error(f"❌ Error during displaying predictions: {e}")
        return

    logger.info("✅ Predict mode executed successfully.")

# Run the predict-mode workflow only when executed as a script.
if __name__ == "__main__":
    predict()


