In [None]:

# data_preprocessor.py

import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Tuple
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC, SMOTEN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot
import joblib  # Added for saving/loading transformers
from inspect import signature  # Needed for parameter validation in SMOTE

class DataPreprocessor:
    def __init__(
        self,
        model_type: str,
        column_assets: Dict[str, List[str]],
        mode: str,  # 'train', 'predict', 'clustering'
        options: Optional[Dict] = None,
        perform_split: bool = True,
        debug: bool = False,
        debug_split_dataset: bool = False,
        debug_handle_missing_values: bool = False,
        debug_test_normality: bool = False,
        debug_handle_outliers: bool = False,
        debug_choose_transformations: bool = False,
        debug_encode_categoricals: bool = False,
        debug_apply_scaling: bool = False,
        debug_implement_smote: bool = False,
        debug_final_inverse_transformations: bool = False,
        debug_validate_inverse_transformations: bool = False,
        debug_generate_recommendations: bool = False,
        normalize_debug: bool = False,
        normalize_graphs_output: bool = False,
        graphs_output_dir: str = './plots'
    ):
        """
        Initialize the DataPreprocessor with model type, column assets, and user-defined options.

        Args:
            model_type (str): Type of the machine learning model (e.g., 'Logistic Regression').
            column_assets (Dict[str, List[str]]): Dictionary containing lists of columns for different categories.
            mode (str): Operational mode ('train', 'predict', 'clustering').
            options (Optional[Dict]): User-defined options for preprocessing steps.
            perform_split (bool): Whether to perform train-test split (True for training).
            debug (bool): General debug flag to control overall verbosity.
            debug_split_dataset (bool): Debug flag for dataset splitting.
            debug_handle_missing_values (bool): Debug flag for missing value handling.
            debug_test_normality (bool): Debug flag for normality testing.
            debug_handle_outliers (bool): Debug flag for outlier handling.
            debug_choose_transformations (bool): Debug flag for choosing transformations.
            debug_encode_categoricals (bool): Debug flag for encoding categorical variables.
            debug_apply_scaling (bool): Debug flag for feature scaling.
            debug_implement_smote (bool): Debug flag for SMOTE implementation.
            debug_final_inverse_transformations (bool): Debug flag for inverse transformations.
            debug_validate_inverse_transformations (bool): Debug flag for validating inverse transformations.
            debug_generate_recommendations (bool): Debug flag for generating preprocessing recommendations.
            normalize_debug (bool): Flag to display normalization plots.
            normalize_graphs_output (bool): Flag to save normalization plots.
            graphs_output_dir (str): Directory to save plots.
        """
        self.model_type = model_type
        self.column_assets = column_assets
        self.mode = mode.lower()
        if self.mode not in ['train', 'predict', 'clustering']:
            raise ValueError("Mode must be one of 'train', 'predict', or 'clustering'.")
        self.options = options or {}
        self.perform_split = perform_split
        self.debug = debug
        self.debug_split_dataset = debug_split_dataset
        self.debug_handle_missing_values = debug_handle_missing_values
        self.debug_test_normality = debug_test_normality
        self.debug_handle_outliers = debug_handle_outliers
        self.debug_choose_transformations = debug_choose_transformations
        self.debug_encode_categoricals = debug_encode_categoricals
        self.debug_apply_scaling = debug_apply_scaling
        self.debug_implement_smote = debug_implement_smote
        self.debug_final_inverse_transformations = debug_final_inverse_transformations
        self.debug_validate_inverse_transformations = debug_validate_inverse_transformations
        self.debug_generate_recommendations = debug_generate_recommendations
        self.normalize_debug = normalize_debug
        self.normalize_graphs_output = normalize_graphs_output
        self.graphs_output_dir = graphs_output_dir

        # Define model categories for accurate processing
        self.model_category = self.map_model_type_to_category()
        
        self.final_feature_order = []  # Initialize an empty list to store feature order

        # Set y_variable based on model category and mode
        if self.mode == 'train':
            if self.model_category in ['classification', 'regression']:
                self.y_variable = column_assets.get('y_variable', [])
                if not self.y_variable:
                    self.logger.error("No target variable specified in 'y_variable'.")
                    raise ValueError("Target variable 'y_variable' must be specified for supervised models.")
            else:
                self.y_variable = []
        elif self.mode == 'predict':
            if self.model_category in ['classification', 'regression']:
                self.y_variable = column_assets.get('y_variable', [])
            else:
                self.y_variable = []
        elif self.mode == 'clustering':
            # For clustering, no target variable
            self.y_variable = []

        # Fetch feature lists
        self.ordinal_categoricals = column_assets.get('ordinal_categoricals', [])
        self.nominal_categoricals = column_assets.get('nominal_categoricals', [])
        self.numericals = column_assets.get('numericals', [])

        # Initialize other variables
        self.scaler = None
        self.transformer = None
        self.ordinal_encoder = None
        self.nominal_encoder = None
        self.preprocessor = None
        self.smote = None
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        self.preprocessing_steps = []
        self.normality_results = {}
        self.features_to_transform = []
        self.nominal_encoded_feature_names = []

        # Initialize placeholders for clustering-specific transformers
        self.cluster_transformers = {}
        self.cluster_model = None
        self.cluster_labels = None
        self.silhouette_score = None


        # Define default thresholds for SMOTE recommendations
        self.imbalance_threshold = self.options.get('smote_recommendation', {}).get('imbalance_threshold', 0.1)
        self.extreme_imbalance_threshold = self.options.get('smote_recommendation', {}).get('extreme_imbalance_threshold', 0.05)
        self.noise_threshold = self.options.get('smote_recommendation', {}).get('noise_threshold', 0.1)
        self.overlap_threshold = self.options.get('smote_recommendation', {}).get('overlap_threshold', 0.1)
        self.boundary_threshold = self.options.get('smote_recommendation', {}).get('boundary_threshold', 0.1)


        # Configure logging
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        if not self.logger.handlers:
            self.logger.addHandler(handler)

    def _log(self, message: str, debug_flag: bool, level: str = 'info'):
        """
        Helper method to handle logging based on debug flags.

        Args:
            message (str): The message to log.
            debug_flag (bool): The debug flag for the specific section.
            level (str): The logging level ('info' or 'debug').
        """
        if debug_flag:
            if level == 'debug':
                self.logger.debug(message)
            elif level == 'info':
                self.logger.info(message)
        else:
            if level != 'debug':
                self.logger.info(message)

    def map_model_type_to_category(self) -> str:
        """
        Map the model_type string to a predefined category.

        Returns:
            str: The model category ('classification', 'regression', 'clustering', etc.).
        """
        classification_models = [
            'Logistic Regression',
            'Tree Based Classifier',
            'k-NN Classifier',
            'SVM Classifier',
            'Neural Network Classifier'
        ]

        regression_models = [
            'Linear Regression',
            'Tree Based Regressor',
            'k-NN Regressor',
            'SVM Regressor',
            'Neural Network Regressor'
        ]

        clustering_models = [
            'K-Means Clustering', 'Hierarchical Clustering', 'DBSCAN', 'KModes', 'KPrototypes'
        ]

        time_series_models = [
            # Add any time series models if applicable
        ]

        if self.model_type in classification_models:
            return 'classification'
        elif self.model_type in regression_models:
            return 'regression'
        elif self.model_type in clustering_models:
            return 'clustering'
        elif self.model_type in time_series_models:
            return 'time_series'
        else:
            return 'unknown'

    def split_dataset(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]:
        step_name = "Split Dataset into Train and Test"
        self.logger.info(f"Step: {step_name}")

        # Debugging Statements
        self._log(f"Before Split - X shape: {X.shape}", self.debug_split_dataset, 'debug')
        if y is not None:
            self._log(f"Before Split - y shape: {y.shape}", self.debug_split_dataset, 'debug')
        else:
            self._log("Before Split - y is None", self.debug_split_dataset, 'debug')

        if self.perform_split and self.mode == 'train':
            if self.model_category == 'classification':
                stratify = y if self.options.get('split_dataset', {}).get('stratify_for_classification', False) else None
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=self.options.get('split_dataset', {}).get('test_size', 0.2),
                    stratify=stratify, 
                    random_state=self.options.get('split_dataset', {}).get('random_state', 42)
                )
                if self.debug_split_dataset:
                    self._log("Performed stratified split for classification.", self.debug_split_dataset, 'debug')
            elif self.model_category == 'regression':
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=self.options.get('split_dataset', {}).get('test_size', 0.2),
                    random_state=self.options.get('split_dataset', {}).get('random_state', 42)
                )
                if self.debug_split_dataset:
                    self._log("Performed random split for regression.", self.debug_split_dataset, 'debug')
            else:
                stratify = self.options.get('split_dataset', {}).get('stratify', None)
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=self.options.get('split_dataset', {}).get('test_size', 0.2),
                    random_state=self.options.get('split_dataset', {}).get('random_state', 42),
                    stratify=stratify
                )
                self.logger.warning("Model category not recognized for specific split strategy. Performed default random split.")
        elif self.mode == 'clustering':
            X_train = X.copy()
            X_test = None
            y_train = None
            y_test = None
            self.logger.info("No splitting performed for clustering models.")
        else:
            X_train = X.copy()
            X_test = None
            y_train = y.copy() if y is not None else None
            y_test = None

        self.preprocessing_steps.append(step_name)

        if self.debug_split_dataset:
            self._log(f"After Split - X_train shape: {X_train.shape}, X_test shape: {X_test.shape if X_test is not None else 'N/A'}", self.debug_split_dataset, 'debug')
            if self.model_category == 'classification' and y_train is not None and y_test is not None:
                self.logger.debug(f"Class distribution in y_train:\n{y_train.value_counts(normalize=True)}")
                self.logger.debug(f"Class distribution in y_test:\n{y_test.value_counts(normalize=True)}")
            elif self.model_category == 'regression' and y_train is not None and y_test is not None:
                self.logger.debug(f"y_train statistics:\n{y_train.describe()}")
                self.logger.debug(f"y_test statistics:\n{y_test.describe()}")
        else:
            self.logger.info(f"Step '{step_name}' completed.")

        return X_train, X_test, y_train, y_test

    def handle_missing_values(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Handle missing values for numerical and categorical features based on user options.
        """
        step_name = "Handle Missing Values"
        self.logger.info(f"Step: {step_name}")

        # Fetch user-defined imputation options or set defaults
        impute_options = self.options.get('handle_missing_values', {})
        numerical_strategy = impute_options.get('numerical_strategy', {})
        categorical_strategy = impute_options.get('categorical_strategy', {})

        # Numerical Imputation
        numerical_imputer = None
        new_columns = []
        if self.numericals:
            if self.model_category in ['regression', 'classification', 'clustering']:
                default_num_strategy = 'mean'  # For clustering, mean imputation is acceptable
            else:
                default_num_strategy = 'median'
            num_strategy = numerical_strategy.get('strategy', default_num_strategy)
            num_imputer_type = numerical_strategy.get('imputer', 'SimpleImputer')  # Can be 'SimpleImputer', 'KNNImputer', etc.

            if self.debug_handle_missing_values:
                self._log(f"Numerical Imputation Strategy: {num_strategy.capitalize()}, Imputer Type: {num_imputer_type}", self.debug_handle_missing_values, 'debug')

            # Initialize numerical imputer based on user option
            if num_imputer_type == 'SimpleImputer':
                numerical_imputer = SimpleImputer(strategy=num_strategy)
            elif num_imputer_type == 'KNNImputer':
                knn_neighbors = numerical_strategy.get('knn_neighbors', 5)
                numerical_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                self.logger.error(f"Numerical imputer type '{num_imputer_type}' is not supported.")
                raise ValueError(f"Numerical imputer type '{num_imputer_type}' is not supported.")

            # Fit and transform
            X_train[self.numericals] = numerical_imputer.fit_transform(X_train[self.numericals])
            self.feature_reasons.update({col: self.feature_reasons.get(col, '') + f'Numerical: {num_strategy.capitalize()} Imputation | ' for col in self.numericals})
            new_columns.extend(self.numericals)

            if X_test is not None:
                X_test[self.numericals] = numerical_imputer.transform(X_test[self.numericals])

        # Categorical Imputation
        categorical_imputer = None
        all_categoricals = self.ordinal_categoricals + self.nominal_categoricals
        if all_categoricals:
            default_cat_strategy = 'most_frequent'
            cat_strategy = categorical_strategy.get('strategy', default_cat_strategy)
            cat_imputer_type = categorical_strategy.get('imputer', 'SimpleImputer')

            if self.debug_handle_missing_values:
                self._log(f"Categorical Imputation Strategy: {cat_strategy.capitalize()}, Imputer Type: {cat_imputer_type}", self.debug_handle_missing_values, 'debug')

            # Initialize categorical imputer based on user option
            if cat_imputer_type == 'SimpleImputer':
                categorical_imputer = SimpleImputer(strategy=cat_strategy)
            elif cat_imputer_type == 'ConstantImputer':
                fill_value = categorical_strategy.get('fill_value', 'Missing')
                categorical_imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
            else:
                self.logger.error(f"Categorical imputer type '{cat_imputer_type}' is not supported.")
                raise ValueError(f"Categorical imputer type '{cat_imputer_type}' is not supported.")

            # Fit and transform
            X_train[all_categoricals] = categorical_imputer.fit_transform(X_train[all_categoricals])
            self.feature_reasons.update({
                col: self.feature_reasons.get(col, '') + (f'Categorical: Constant Imputation (Value={categorical_strategy.get("fill_value", "Missing")}) | ' if cat_imputer_type == 'ConstantImputer' else f'Categorical: {cat_strategy.capitalize()} Imputation | ')
                for col in all_categoricals
            })
            new_columns.extend(all_categoricals)

            if X_test is not None:
                X_test[all_categoricals] = categorical_imputer.transform(X_test[all_categoricals])

        self.preprocessing_steps.append(step_name)

        if self.debug_handle_missing_values:
            self.logger.debug(f"Completed: {step_name}. Dataset shape after imputation: {X_train.shape}")
            self.logger.debug(f"Missing values after imputation in X_train:\n{X_train.isnull().sum()}")
            self.logger.debug(f"New columns handled: {new_columns}")
        else:
            self.logger.info(f"Step '{step_name}' completed.")

        return X_train, X_test

    def test_normality(self, X_train: pd.DataFrame) -> Dict[str, Dict]:
        """
        Test normality for numerical features based on normality tests and user options.

        Args:
            X_train (pd.DataFrame): Training features.

        Returns:
            Dict[str, Dict]: Dictionary with normality test results for each numerical feature.
        """
        step_name = "Test for Normality"
        self.logger.info(f"Step: {step_name}")
        normality_results = {}

        # Fetch user-defined normality test options or set defaults
        normality_options = self.options.get('test_normality', {})
        p_value_threshold = normality_options.get('p_value_threshold', 0.05)
        skewness_threshold = normality_options.get('skewness_threshold', 1.0)
        additional_tests = normality_options.get('additional_tests', [])  # e.g., ['anderson-darling']

        for col in self.numericals:
            data = X_train[col].dropna()
            skewness = data.skew()
            kurtosis = data.kurtosis()

            # Determine which normality test to use based on sample size and user options
            test_used = 'Shapiro-Wilk'
            p_value = 0.0

            if len(data) <= 5000:
                from scipy.stats import shapiro
                stat, p_val = shapiro(data)
                test_used = 'Shapiro-Wilk'
                p_value = p_val
            else:
                from scipy.stats import anderson
                result = anderson(data)
                test_used = 'Anderson-Darling'
                # Determine p-value based on critical values
                p_value = 0.0  # Default to 0
                for cv, sig in zip(result.critical_values, result.significance_level):
                    if result.statistic < cv:
                        p_value = sig / 100
                        break

            # Apply user-defined or default criteria
            if self.model_category in ['regression', 'classification', 'clustering']:
                # Linear, Logistic Regression, and Clustering: Use p-value and skewness
                needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
            else:
                # Other models: Use skewness, and optionally p-values based on options
                use_p_value = normality_options.get('use_p_value_other_models', False)
                if use_p_value:
                    needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
                else:
                    needs_transform = abs(skewness) > skewness_threshold

            normality_results[col] = {
                'skewness': skewness,
                'kurtosis': kurtosis,
                'p_value': p_value,
                'test_used': test_used,
                'needs_transform': needs_transform
            }

            self.logger.debug(f"Feature '{col}': p-value={p_value:.4f}, skewness={skewness:.4f}, needs_transform={needs_transform}")

        self.normality_results = normality_results
        self.preprocessing_steps.append(step_name)

        if self.debug_test_normality:
            self._log(f"Completed: {step_name}. Normality results computed.", self.debug_test_normality, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Normality results computed.")

        return normality_results

    def plot_qq(
        self, 
        X_original: pd.DataFrame, 
        X_transformed: pd.DataFrame, 
        numerical_features: List[str], 
        model_type: str
    ):
        """
        Plot QQ plots before and after normalization for specified numerical features.

        Args:
            X_original (pd.DataFrame): Original numerical features before normalization.
            X_transformed (pd.DataFrame): Transformed numerical features after normalization.
            numerical_features (List[str]): List of numerical feature names.
            model_type (str): Type of the machine learning model.
        """
        for feature in numerical_features:
            plt.figure(figsize=(12, 5))

            # QQ Plot for Original Distribution
            plt.subplot(1, 2, 1)
            probplot(X_original[feature], dist="norm", plot=plt)
            plt.title(f'Original QQ Plot of {feature}')

            # QQ Plot for Transformed Distribution
            plt.subplot(1, 2, 2)
            probplot(X_transformed[feature], dist="norm", plot=plt)
            plt.title(f'Transformed QQ Plot of {feature}')

            plt.suptitle(f'QQ Plot Normalization Check for {feature} ({model_type})', fontsize=16)
            plt.tight_layout(rect=[0, 0.03, 1, 0.95])

            # Display plot in Jupyter if normalize_debug is True
            if self.normalize_debug:
                plt.show()

            # Save the plot if normalize_graphs_output is True
            if self.normalize_graphs_output:
                # Automate naming based on model type and feature
                safe_model_type = model_type.replace(" ", "_")
                plot_filename = f'{safe_model_type}_{feature}_qq_plot.png'
                plot_path = os.path.join(self.graphs_output_dir, plot_filename)
                os.makedirs(os.path.dirname(plot_path), exist_ok=True)
                plt.savefig(plot_path)
                self.logger.info(f"Saved QQ plot for '{feature}' at '{plot_path}'")

            plt.close()

    def plot_normalization(self, X_original: pd.DataFrame, X_transformed: pd.DataFrame, numerical_features: List[str], model_type: str):
        """
        Plot feature distributions before and after normalization.

        Args:
            X_original (pd.DataFrame): Original numerical features before normalization.
            X_transformed (pd.DataFrame): Transformed numerical features after normalization.
            numerical_features (List[str]): List of numerical feature names.
            model_type (str): Type of the machine learning model.
        """
        for feature in numerical_features:
            plt.figure(figsize=(12, 5))

            # Original Distribution
            plt.subplot(1, 2, 1)
            sns.histplot(X_original[feature], kde=True, color='blue')
            plt.title(f'Original Distribution of {feature}')

            # Transformed Distribution
            plt.subplot(1, 2, 2)
            sns.histplot(X_transformed[feature], kde=True, color='green')
            plt.title(f'Transformed Distribution of {feature}')

            plt.suptitle(f'Normalization Check for {feature} ({model_type})', fontsize=16)
            plt.tight_layout(rect=[0, 0.03, 1, 0.95])

            # Display plot in Jupyter if normalize_debug is True
            if self.normalize_debug:
                plt.show()

            # Save the plot if normalize_graphs_output is True
            if self.normalize_graphs_output:
                # Automate naming based on model type and feature
                safe_model_type = model_type.replace(" ", "_")
                plot_filename = f'{safe_model_type}_{feature}_normalization.png'
                plot_path = os.path.join(self.graphs_output_dir, plot_filename)
                os.makedirs(os.path.dirname(plot_path), exist_ok=True)
                plt.savefig(plot_path)
                self.logger.info(f"Saved normalization plot for '{feature}' at '{plot_path}'")

            plt.close()

    def handle_outliers(self, X_train: pd.DataFrame, y_train: Optional[pd.Series] = None) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """
        Handle outliers based on the model's sensitivity and user options.

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series, optional): Training target.

        Returns:
            tuple: X_train without outliers and corresponding y_train.
        """
        step_name = "Handle Outliers"
        self.logger.info(f"Step: {step_name}")
        initial_shape = X_train.shape[0]
        new_columns = []

        # Fetch user-defined outlier handling options or set defaults
        outlier_options = self.options.get('handle_outliers', {})
        zscore_threshold = outlier_options.get('zscore_threshold', 3)
        iqr_multiplier = outlier_options.get('iqr_multiplier', 1.5)
        winsor_limits = outlier_options.get('winsor_limits', [0.05, 0.05])
        isolation_contamination = outlier_options.get('isolation_contamination', 0.05)

        for col in self.numericals:
            if self.model_category in ['regression', 'classification']:
                # Z-Score Filtering
                apply_zscore = outlier_options.get('apply_zscore', True)
                if apply_zscore:
                    z_scores = np.abs((X_train[col] - X_train[col].mean()) / X_train[col].std())
                    mask_z = z_scores < zscore_threshold
                    removed_z = (~mask_z).sum()
                    X_train = X_train[mask_z]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with Z-Score Filtering (threshold={zscore_threshold}) | '
                    if self.debug_handle_outliers:
                        self._log(f"Removed {removed_z} outliers from '{col}' using Z-Score Filtering", self.debug_handle_outliers, 'debug')

                # IQR Filtering
                apply_iqr = outlier_options.get('apply_iqr', True)
                if apply_iqr:
                    Q1 = X_train[col].quantile(0.25)
                    Q3 = X_train[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_multiplier * IQR
                    upper_bound = Q3 + iqr_multiplier * IQR
                    mask_iqr = (X_train[col] >= lower_bound) & (X_train[col] <= upper_bound)
                    removed_iqr = (~mask_iqr).sum()
                    X_train = X_train[mask_iqr]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with IQR Filtering (multiplier={iqr_multiplier}) | '
                    if self.debug_handle_outliers:
                        self._log(f"Removed {removed_iqr} outliers from '{col}' using IQR Filtering", self.debug_handle_outliers, 'debug')

            elif self.model_category in ['svm', 'knn']:
                # IQR Filtering
                apply_iqr = outlier_options.get('apply_iqr', True)
                if apply_iqr:
                    Q1 = X_train[col].quantile(0.25)
                    Q3 = X_train[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_multiplier * IQR
                    upper_bound = Q3 + iqr_multiplier * IQR
                    mask_iqr = (X_train[col] >= lower_bound) & (X_train[col] <= upper_bound)
                    removed_iqr = (~mask_iqr).sum()
                    X_train = X_train[mask_iqr]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with IQR Filtering (multiplier={iqr_multiplier}) | '
                    if self.debug_handle_outliers:
                        self._log(f"Removed {removed_iqr} outliers from '{col}' using IQR Filtering", self.debug_handle_outliers, 'debug')

                # Winsorization
                apply_winsor = outlier_options.get('apply_winsor', True)
                if apply_winsor:
                    from scipy.stats.mstats import winsorize
                    X_train[col] = winsorize(X_train[col], limits=winsor_limits)
                    self.feature_reasons[col] += f'Outliers handled with Winsorization (limits={winsor_limits}) | '
                    if self.debug_handle_outliers:
                        self._log(f"Winsorized '{col}' to limits {winsor_limits}", self.debug_handle_outliers, 'debug')

            elif self.model_category == 'neural_networks':
                # Winsorization
                apply_winsor = outlier_options.get('apply_winsor', True)
                if apply_winsor:
                    from scipy.stats.mstats import winsorize
                    X_train[col] = winsorize(X_train[col], limits=winsor_limits)
                    self.feature_reasons[col] += f'Outliers handled with Winsorization (limits={winsor_limits}) | '
                    if self.debug_handle_outliers:
                        self._log(f"Winsorized '{col}' to limits {winsor_limits}", self.debug_handle_outliers, 'debug')

            elif self.model_category == 'clustering':
                # For clustering, apply IQR Filtering
                apply_iqr = outlier_options.get('apply_iqr', True)
                if apply_iqr:
                    Q1 = X_train[col].quantile(0.25)
                    Q3 = X_train[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_multiplier * IQR
                    upper_bound = Q3 + iqr_multiplier * IQR
                    mask_iqr = (X_train[col] >= lower_bound) & (X_train[col] <= upper_bound)
                    removed_iqr = (~mask_iqr).sum()
                    X_train = X_train[mask_iqr]
                    # y_train is None for clustering
                    self.feature_reasons[col] += f'Outliers handled with IQR Filtering (multiplier={iqr_multiplier}) | '
                    if self.debug_handle_outliers:
                        self._log(f"Removed {removed_iqr} outliers from '{col}' using IQR Filtering", self.debug_handle_outliers, 'debug')

            else:
                self.logger.warning(f"Model category '{self.model_category}' not recognized for outlier handling.")

        self.preprocessing_steps.append(step_name)

        if self.debug_handle_outliers:
            self.logger.debug(f"Completed: {step_name}. Dataset shape after outlier handling: {X_train.shape}")
            self.logger.debug(f"Missing values after outlier handling in X_train:\n{X_train.isnull().sum()}")
            self.logger.debug(f"New columns handled: {new_columns}")
        else:
            self.logger.info(f"Step '{step_name}' completed.")

        return X_train, y_train

    def choose_and_apply_transformations(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Choose and apply transformations based on normality tests and user options.

        The method is read from ``options['choose_transformations']['method']``
        (default ``'power'``):

        * ``'power'``: fit a ``PowerTransformer`` on the selected training
          columns (``power_method`` option, default ``'yeo-johnson'``), store it
          on ``self.transformer`` and apply the fitted transformer to ``X_test``.
        * ``'log'``: apply ``np.log1p`` to both frames, but only when every
          selected training column is strictly positive; otherwise skip.
        * ``None`` or the string ``'None'``: apply nothing.

        Columns are selected from ``self.numericals``: those flagged with
        ``needs_transform`` in ``self.normality_results`` or, when no normality
        results exist, all of them. The selected columns are written back into
        the frames that were passed in.

        Args:
            X_train (pd.DataFrame): Training features.
            X_test (Optional[pd.DataFrame]): Testing features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Transformed X_train and X_test.

        Raises:
            ValueError: If the configured transformation method is not supported.
        """
        step_name = "Choose and Apply Transformations (Based on Normality Tests)"
        self.logger.info(f"Step: {step_name}")

        # Fetch user-defined transformation options or set defaults
        transformation_options = self.options.get('choose_transformations', {})
        transformation_method = transformation_options.get('method', 'power')  # 'power', 'log', 'None'

        # Collect features needing transformation based on normality tests
        if self.normality_results:
            # Columns without a normality entry are treated as "no transform
            # needed" instead of aborting the whole step with a KeyError.
            features_to_transform = [
                col for col in self.numericals
                if self.normality_results.get(col, {}).get('needs_transform', False)
            ]
        else:
            # Default transformation for clustering if normality_results is empty
            features_to_transform = list(self.numericals)  # Apply to all numerical features
            self.logger.info("No normality results available. Applying default transformations to all numerical features.")

        if features_to_transform:
            self.features_to_transform = features_to_transform  # Store the transformed features
            if transformation_method == 'power':
                method = transformation_options.get('power_method', 'yeo-johnson')  # 'yeo-johnson' or 'box-cox'
                self.transformer = PowerTransformer(method=method)
                self.logger.debug(f"Applying PowerTransformer with method '{method}' to features: {features_to_transform}")
                X_train[features_to_transform] = self.transformer.fit_transform(X_train[features_to_transform])
                if X_test is not None:
                    # X_test keeps its own index: reindexing it onto
                    # X_train.index would replace the test rows with NaNs or
                    # with rows that merely share a label with a training row.
                    X_test[features_to_transform] = self.transformer.transform(X_test[features_to_transform])
                for col in features_to_transform:
                    self.feature_reasons[col] += f'Applied PowerTransformer ({method}) | '
            elif transformation_method == 'log':
                # Apply log transformation only if the training data is strictly positive.
                # The first offending column aborts the transform for all columns.
                apply_log = True
                for col in features_to_transform:
                    if (X_train[col] <= 0).any():
                        self.logger.warning(f"Cannot apply log transform to '{col}' as it contains non-positive values.")
                        apply_log = False
                        break
                if apply_log:
                    self.logger.debug(f"Applying Log Transform to features: {features_to_transform}")
                    # log1p (log(1 + x)) is what is applied, to train and test alike.
                    X_train[features_to_transform] = np.log1p(X_train[features_to_transform])
                    if X_test is not None:
                        X_test[features_to_transform] = np.log1p(X_test[features_to_transform])
                    for col in features_to_transform:
                        self.feature_reasons[col] += 'Applied Log Transform | '
                else:
                    self.logger.info("Log Transform skipped due to non-positive values.")
            elif transformation_method is None or transformation_method == 'None':
                # Accept both the None object and the 'None' string.
                self.logger.info("Transformation method set to None. No transformations applied.")
            else:
                self.logger.error(f"Transformation method '{transformation_method}' is not supported.")
                raise ValueError(f"Transformation method '{transformation_method}' is not supported.")

            self.preprocessing_steps.append(step_name)
            if self.debug_choose_transformations:
                self.logger.debug(f"Completed: {step_name}. Transformed features: {features_to_transform}")
                self.logger.debug(f"Sample of transformed X_train:\n{X_train[features_to_transform].head()}")
                if X_test is not None:
                    self.logger.debug(f"Sample of transformed X_test:\n{X_test[features_to_transform].head()}")
            else:
                self.logger.info(f"Step '{step_name}' completed: Applied transformations to {len(features_to_transform)} features.")
        else:
            self.logger.info("No significant skewness or p-value indicators detected. No transformations applied.")
            self.preprocessing_steps.append(step_name)
            if self.debug_choose_transformations:
                self.logger.debug(f"Completed: {step_name}. No transformations were applied.")
            else:
                self.logger.info(f"Step '{step_name}' completed: No transformations were applied.")

        return X_train, X_test

    def encode_categorical_variables(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Encode categorical variables using user-specified encoding strategies.

        Options are read from ``options['encode_categoricals']``:
        ``ordinal_encoding`` (``'OrdinalEncoder'`` or ``'None'``),
        ``nominal_encoding`` (``'OneHotEncoder'``, ``'OrdinalEncoder'``,
        ``'FrequencyEncoder'`` or ``'None'``) and ``handle_unknown`` (passed to
        ``OneHotEncoder``). When ``options['implement_smote']['variant']`` is
        ``'SMOTENC'`` the nominal encoding is forced to ``'OrdinalEncoder'``.

        Encoder-based strategies run through a ``ColumnTransformer`` stored on
        ``self.preprocessor``; the result is rebuilt as DataFrames whose columns
        are the ordinal columns, then the nominal output columns, then every
        column that was passed through untouched. Frequency encoding is applied
        directly to the nominal columns of the frames that were passed in and
        can be combined with an ordinal encoder.

        Args:
            X_train (pd.DataFrame): Training features.
            X_test (Optional[pd.DataFrame]): Testing features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Encoded X_train and X_test.

        Raises:
            ValueError: If an ordinal or nominal encoding method is not supported.
        """
        step_name = "Encode Categorical Variables"
        self.logger.info(f"Step: {step_name}")

        # Fetch user-defined encoding options or set defaults
        encoding_options = self.options.get('encode_categoricals', {})
        ordinal_encoding = encoding_options.get('ordinal_encoding', 'OrdinalEncoder')  # Options: 'OrdinalEncoder', 'None'
        nominal_encoding = encoding_options.get('nominal_encoding', 'OneHotEncoder')  # Options: 'OneHotEncoder', 'OrdinalEncoder', 'FrequencyEncoder', etc.
        handle_unknown = encoding_options.get('handle_unknown', 'ignore')  # For OneHotEncoder

        # Determine if SMOTENC is being used
        smote_variant = self.options.get('implement_smote', {}).get('variant', None)
        if smote_variant == 'SMOTENC':
            nominal_encoding = 'OrdinalEncoder'  # Override to ensure compatibility

        transformers = []
        new_columns = []
        # Columns consumed by a ColumnTransformer step; everything else is
        # emitted by the 'passthrough' remainder and must keep its name.
        handled_columns = []
        frequency_encoded = False

        if self.ordinal_categoricals and ordinal_encoding != 'None':
            if ordinal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('ordinal', OrdinalEncoder(), self.ordinal_categoricals)
                )
                handled_columns.extend(self.ordinal_categoricals)
            else:
                self.logger.error(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
                raise ValueError(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
        if self.nominal_categoricals and nominal_encoding != 'None':
            if nominal_encoding == 'OneHotEncoder':
                transformers.append(
                    ('nominal', OneHotEncoder(handle_unknown=handle_unknown, sparse_output=False), self.nominal_categoricals)
                )
                handled_columns.extend(self.nominal_categoricals)
            elif nominal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('nominal', OrdinalEncoder(), self.nominal_categoricals)
                )
                handled_columns.extend(self.nominal_categoricals)
            elif nominal_encoding == 'FrequencyEncoder':
                # Custom Frequency Encoding: each category becomes its relative
                # frequency in X_train; categories unseen in training map to 0.
                for col in self.nominal_categoricals:
                    freq = X_train[col].value_counts(normalize=True)
                    X_train[col] = X_train[col].map(freq)
                    if X_test is not None:
                        X_test[col] = X_test[col].map(freq).fillna(0)
                    self.feature_reasons[col] += 'Encoded with Frequency Encoding | '
                    self.logger.debug(f"Applied Frequency Encoding to '{col}'.")
                # The transformer list is deliberately left alone so that an
                # ordinal encoder registered above still runs.
                frequency_encoded = True
            else:
                self.logger.error(f"Nominal encoding method '{nominal_encoding}' is not supported.")
                raise ValueError(f"Nominal encoding method '{nominal_encoding}' is not supported.")

        if not transformers and not frequency_encoded:
            self.logger.info("No categorical variables to encode.")
            self.preprocessing_steps.append(step_name)
            if self.debug_encode_categoricals:
                self.logger.debug(f"Completed: {step_name}. No encoding was applied.")
            else:
                self.logger.info(f"Step '{step_name}' completed: No encoding was applied.")
            return X_train, X_test

        if not transformers:
            # Frequency Encoding was applied; no transformers to handle
            self.preprocessing_steps.append(step_name)
            if self.debug_encode_categoricals:
                self.logger.debug(f"Completed: {step_name}. Frequency encoding applied to nominal features.")
            else:
                self.logger.info(f"Step '{step_name}' completed: Frequency encoding applied to nominal features.")
            return X_train, X_test

        self.preprocessor = ColumnTransformer(
            transformers=transformers,
            remainder='passthrough'  # Keep other columns unchanged
        )

        # Fit and transform training data
        X_train_encoded = self.preprocessor.fit_transform(X_train)
        if self.debug_encode_categoricals:
            self._log("Fitted and transformed X_train with ColumnTransformer.", self.debug_encode_categoricals, 'debug')
        else:
            self.logger.info("Fitted and transformed X_train with ColumnTransformer.")

        # Transform testing data
        if X_test is not None:
            X_test_encoded = self.preprocessor.transform(X_test)
            if self.debug_encode_categoricals:
                self._log("Transformed X_test with fitted ColumnTransformer.", self.debug_encode_categoricals, 'debug')
            else:
                self.logger.info("Transformed X_test with fitted ColumnTransformer.")
        else:
            X_test_encoded = None

        # Retrieve feature names after encoding, in the transformer order
        # used above: ordinal block, nominal block, then the remainder.
        encoded_feature_names = []
        if self.ordinal_categoricals and ordinal_encoding == 'OrdinalEncoder':
            encoded_feature_names += self.ordinal_categoricals
        if self.nominal_categoricals and nominal_encoding == 'OneHotEncoder':
            nominal_encoded_names = self.preprocessor.named_transformers_['nominal'].get_feature_names_out(self.nominal_categoricals).tolist()
            encoded_feature_names += nominal_encoded_names
            new_columns.extend(nominal_encoded_names)
            # Kept for inverse transformation and for SMOTENC column lookup
            self.nominal_encoded_feature_names = nominal_encoded_names
            self.logger.debug(f"Nominal encoded feature names (OneHotEncoder): {self.nominal_encoded_feature_names}")
        elif self.nominal_categoricals and nominal_encoding == 'OrdinalEncoder':
            encoded_feature_names += self.nominal_categoricals
            new_columns.extend(self.nominal_categoricals)
            self.nominal_encoded_feature_names = self.nominal_categoricals
            self.logger.debug(f"Nominal encoded feature names (OrdinalEncoder): {self.nominal_encoded_feature_names}")

        # Identify passthrough feature names: every column that no transformer
        # consumed. This includes categorical columns left unencoded ('None')
        # or already frequency-encoded, which the remainder still outputs.
        handled = set(handled_columns)
        passthrough_features = [col for col in X_train.columns if col not in handled]
        encoded_feature_names += passthrough_features
        new_columns.extend(passthrough_features)

        # Convert numpy arrays back to DataFrames
        X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names, index=X_train.index)
        if X_test_encoded is not None:
            X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names, index=X_test.index)
        else:
            X_test_encoded_df = None

        # Store encoders for inverse transformation
        fitted = self.preprocessor.named_transformers_
        self.ordinal_encoder = fitted['ordinal'] if 'ordinal' in fitted else None
        self.nominal_encoder = fitted['nominal'] if 'nominal' in fitted else None

        self.preprocessing_steps.append(step_name)
        if self.debug_encode_categoricals:
            self.logger.debug(f"Completed: {step_name}. X_train_encoded shape: {X_train_encoded_df.shape}")
            self.logger.debug(f"Columns after encoding: {encoded_feature_names}")
            self.logger.debug(f"Sample of encoded X_train:\n{X_train_encoded_df.head()}")
            self.logger.debug(f"New columns added: {new_columns}")
        else:
            self.logger.info(f"Step '{step_name}' completed: Encoded categorical variables.")

        return X_train_encoded_df, X_test_encoded_df

    def apply_scaling(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Scale feature columns with a scaler picked from user options or model category.

        ``options['apply_scaling']['method']`` may name ``'StandardScaler'``,
        ``'MinMaxScaler'``, ``'RobustScaler'`` or ``'None'``. When it is absent,
        clustering models get ``MinMaxScaler``, regression / classification /
        neural-network models get ``StandardScaler`` and any other category is
        left unscaled. The scaler is fitted on ``X_train`` only, stored on
        ``self.scaler`` and then applied to ``X_test``.

        Args:
            X_train (pd.DataFrame): Training features.
            X_test (Optional[pd.DataFrame]): Testing features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Scaled X_train and X_test.

        Raises:
            ValueError: If the requested scaling method is not supported.
            KeyError: If a feature selected for scaling is absent from X_train.
        """
        step_name = "Apply Scaling (If Needed by Model)"
        self.logger.info(f"Step: {step_name}")

        user_cfg = self.options.get('apply_scaling', {})
        requested = user_cfg.get('method', None)
        features_to_scale = user_cfg.get('features', self.numericals)

        # Resolve the scaler name first: model-category default, or the
        # user's explicit choice once it has been validated.
        if requested is None:
            if self.model_category == 'clustering':
                scaling_type = 'MinMaxScaler'
            elif self.model_category in ('regression', 'classification', 'neural_networks'):
                scaling_type = 'StandardScaler'
            else:
                scaling_type = 'None'
        elif requested in ('StandardScaler', 'MinMaxScaler', 'RobustScaler', 'None'):
            scaling_type = requested
        else:
            self.logger.error(f"Scaling method '{requested}' is not supported.")
            raise ValueError(f"Scaling method '{requested}' is not supported.")

        # Instantiate only the scaler that was actually selected.
        if scaling_type == 'StandardScaler':
            scaler = StandardScaler()
        elif scaling_type == 'MinMaxScaler':
            scaler = MinMaxScaler()
        elif scaling_type == 'RobustScaler':
            scaler = RobustScaler()
        else:
            scaler = None

        # Guard clause: nothing to scale with, or nothing to scale.
        if scaler is None or not features_to_scale:
            self.logger.info("No scaling applied based on user options or no features specified.")
            self.preprocessing_steps.append(step_name)
            if self.debug_apply_scaling:
                self.logger.debug(f"Completed: {step_name}. No scaling was applied.")
            else:
                self.logger.info(f"Step '{step_name}' completed: No scaling was applied.")
            return X_train, X_test

        self.scaler = scaler
        self.logger.debug(f"Features to scale: {features_to_scale}")

        # Refuse to continue when a requested column is not in the training frame.
        missing_features = [name for name in features_to_scale if name not in X_train.columns]
        if missing_features:
            self.logger.error(f"The following features specified for scaling are missing in the dataset: {missing_features}")
            raise KeyError(f"The following features specified for scaling are missing in the dataset: {missing_features}")

        # Fit on train only; the test frame reuses the fitted statistics.
        X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
        if X_test is not None:
            X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

        for name in features_to_scale:
            self.feature_reasons[name] += f'Scaling Applied: {scaling_type} | '

        self.preprocessing_steps.append(step_name)
        if not self.debug_apply_scaling:
            self.logger.info(f"Step '{step_name}' completed: Applied {scaling_type} to features: {features_to_scale}")
            return X_train, X_test

        self.logger.debug(f"Applied {scaling_type} to features: {features_to_scale}")
        if hasattr(scaler, 'mean_'):
            self.logger.debug(f"Scaler Parameters: mean={scaler.mean_}")
        if hasattr(scaler, 'scale_'):
            self.logger.debug(f"Scaler Parameters: scale={scaler.scale_}")
        self.logger.debug(f"Sample of scaled X_train:\n{X_train[features_to_scale].head()}")
        if X_test is not None:
            self.logger.debug(f"Sample of scaled X_test:\n{X_test[features_to_scale].head()}")

        return X_train, X_test


    def smote_numerics_criteria(
        self,
        X_train: pd.DataFrame,
        y_train: pd.Series,
        imbalance_threshold: float = 0.1,
        extreme_imbalance_threshold: float = 0.05,
        noise_threshold: float = 0.1,
        overlap_threshold: float = 0.1,
        boundary_threshold: float = 0.1,
        debug: bool = False
    ) -> List[str]:
        """
        Recommend SMOTE variants for numerical-only datasets based on dataset characteristics.

        The least and most frequent classes of ``y_train`` are compared through
        three distance-based indicators (noise, overlap, boundary
        concentration). When the minority share is below either imbalance
        threshold a fixed list is returned immediately; otherwise each
        triggered indicator contributes one variant, and plain ``"SMOTE"`` is
        returned when none triggers.

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series): Training target.
            imbalance_threshold (float): Threshold for considering class imbalance.
            extreme_imbalance_threshold (float): Threshold for extreme class imbalance.
            noise_threshold (float): Threshold for noise level.
            overlap_threshold (float): Threshold for class overlap.
            boundary_threshold (float): Threshold for boundary complexities.
            debug (bool): Flag to enable debug logging.

        Returns:
            List[str]: Recommended SMOTE variants in order of preference,
            without duplicates.
        """
        recommendations = []

        # Step 1: Class Distribution
        class_distribution = y_train.value_counts(normalize=True)
        majority_class = class_distribution.idxmax()
        minority_class = class_distribution.idxmin()

        severe_imbalance = class_distribution[minority_class] < imbalance_threshold
        extreme_imbalance = class_distribution[minority_class] < extreme_imbalance_threshold

        if debug:
            self.logger.debug(f"X_train Shape: {X_train.shape}")
            self.logger.debug(f"Class Distribution: {class_distribution.to_dict()}")
            if extreme_imbalance:
                self.logger.warning(f"Extreme imbalance detected: {class_distribution[minority_class]:.2%}")

        # Step 2: Noise Analysis
        minority_samples = X_train[y_train == minority_class]
        majority_samples = X_train[y_train == majority_class]

        # Stays None when the neighbour search fails, so step 4 can detect
        # that explicitly instead of tripping over an unbound local.
        distances = None
        try:
            knn = NearestNeighbors(n_neighbors=5).fit(majority_samples)
            distances, _ = knn.kneighbors(minority_samples)
            median_distance = np.median(distances)
            # NOTE(review): the distances are compared with their own median,
            # so this ratio tends to sit near 0.5 regardless of the data and
            # will exceed the default noise_threshold -- confirm the intent.
            noise_ratio = np.mean(distances < median_distance)
            noisy_data = noise_ratio > noise_threshold

            if debug:
                self.logger.debug(f"Median Distance to Nearest Neighbors: {median_distance}")
                self.logger.debug(f"Noise Ratio: {noise_ratio:.2%}")
        except ValueError as e:
            self.logger.error(f"Noise analysis error: {e}")
            noisy_data = False

        # Step 3: Overlap Analysis
        try:
            pdistances = pairwise_distances(minority_samples, majority_samples)
            overlap_metric = np.mean(pdistances < 1.0)  # Threshold can be adjusted
            overlapping_classes = overlap_metric > overlap_threshold

            if debug:
                self.logger.debug(f"Overlap Metric: {overlap_metric:.2%}")
        except ValueError as e:
            self.logger.error(f"Overlap analysis error: {e}")
            overlapping_classes = False

        # Step 4: Boundary Concentration (reuses the step 2 distances)
        if distances is None:
            self.logger.error("Boundary concentration error: nearest-neighbour distances are unavailable.")
            boundary_concentration = False
        else:
            try:
                boundary_ratio = np.mean(np.min(distances, axis=1) < np.percentile(distances, 25))
                boundary_concentration = boundary_ratio > boundary_threshold

                if debug:
                    self.logger.debug(f"Boundary Concentration Ratio: {boundary_ratio:.2%}")
            except Exception as e:
                self.logger.error(f"Boundary concentration error: {e}")
                boundary_concentration = False

        # Step 5: Recommendations
        if extreme_imbalance or severe_imbalance:
            # Both imbalance levels share the same preference list. With noisy
            # data the first pick is already SMOTEENN, so the list is
            # de-duplicated (order preserved) before it is returned.
            recommendations = list(dict.fromkeys([
                "ADASYN" if not noisy_data else "SMOTEENN",
                "SMOTEENN",
                "SMOTETomek",
                "BorderlineSMOTE",
            ]))
            if debug:
                level = "Extreme" if extreme_imbalance else "Severe"
                self.logger.debug(f"{level} imbalance detected. Recommended variants: ADASYN/SMOTEENN, SMOTEENN, SMOTETomek, BorderlineSMOTE")
            return recommendations

        if noisy_data:
            recommendations.append("SMOTEENN")
            if debug:
                self.logger.debug("Noisy data detected. Recommended variant: SMOTEENN")

        if overlapping_classes:
            recommendations.append("SMOTETomek")
            if debug:
                self.logger.debug("Overlapping classes detected. Recommended variant: SMOTETomek")

        if boundary_concentration:
            recommendations.append("BorderlineSMOTE")
            if debug:
                self.logger.debug("Boundary concentration detected. Recommended variant: BorderlineSMOTE")

        if not recommendations:
            recommendations.append("SMOTE")
            if debug:
                self.logger.debug("No specific issues detected. Recommended variant: SMOTE")

        # Remove duplicates while preserving order
        return list(dict.fromkeys(recommendations))

    def implement_smote(self, X_train: pd.DataFrame, y_train: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Implement SMOTE or its variants based on class imbalance, dataset characteristics, and user options.

        Nothing is resampled (and ``"SMOTE Skipped"`` is recorded) when the
        model is not a classifier or ``y_train`` holds a single class.
        Otherwise the variant is taken from
        ``options['implement_smote']['variant']`` or, when absent, chosen from
        the dataset composition: numeric-only data is analysed by
        ``smote_numerics_criteria``, mixed data uses SMOTENC, categorical-only
        data uses SMOTEN. An unknown variant name falls back to SMOTE.

        ``options['implement_smote']['params']`` is forwarded to the sampler
        constructor. Parameters the selected sampler does not accept are logged
        and dropped before construction, and a user-supplied ``random_state``
        overrides the default of 42.

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series): Training target.

        Returns:
            Tuple[pd.DataFrame, pd.Series]: Resampled X_train and y_train.

        Raises:
            ValueError: If SMOTENC is selected but no nominal encoded feature
                names are available, or if resampling itself fails.
        """
        step_name = "Implement SMOTE (Train Only)"
        self.logger.info(f"Step: {step_name}")

        # Check if classification
        if self.model_category != 'classification':
            self.logger.info("SMOTE not applicable: Not a classification model.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        # Fetch user-defined SMOTE options or set defaults
        smote_options = self.options.get('implement_smote', {})
        user_smote_variant = smote_options.get('variant', None)
        # Copy so the user's option dict is never mutated; tolerate params=None.
        smote_params = dict(smote_options.get('params') or {})

        # Calculate class distribution
        class_counts = y_train.value_counts()
        if len(class_counts) < 2:
            self.logger.warning("SMOTE not applicable: Only one class present.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train
        majority_count = class_counts.max()
        minority_count = class_counts.min()
        imbalance_ratio = minority_count / majority_count
        self.logger.info(f"Class Distribution before SMOTE: {class_counts.to_dict()}")
        self.logger.info(f"Imbalance Ratio (Minority/Majority): {imbalance_ratio:.4f}")

        # Determine dataset composition
        has_numericals = len(self.numericals) > 0
        has_categoricals = len(self.ordinal_categoricals) + len(self.nominal_categoricals) > 0

        # SMOTE Variant Selection
        variant_name = 'SMOTE'  # Default variant

        if user_smote_variant:
            variant_name = user_smote_variant
            self.logger.info(f"User-specified SMOTE variant: {variant_name}")
        elif has_numericals and not has_categoricals:
            # Numeric only dataset: Recommend based on criteria
            self.logger.info("Dataset contains only numerical features. Analyzing to recommend SMOTE variants...")
            recommended_variants = self.smote_numerics_criteria(
                X_train=X_train,
                y_train=y_train,
                imbalance_threshold=self.imbalance_threshold,
                extreme_imbalance_threshold=self.extreme_imbalance_threshold,
                noise_threshold=self.noise_threshold,
                overlap_threshold=self.overlap_threshold,
                boundary_threshold=self.boundary_threshold,
                debug=self.debug_implement_smote
            )
            # Select the first recommended variant
            if recommended_variants:
                variant_name = recommended_variants[0]
                self.logger.info(f"Recommended SMOTE variant: {variant_name}")
            else:
                variant_name = 'SMOTE'
                self.logger.info("No specific recommendation from criteria. Using default SMOTE.")
        elif has_numericals and has_categoricals:
            # Mixed dataset: Use SMOTENC
            variant_name = 'SMOTENC'
            self.logger.info("Dataset contains numerical and categorical features. Using SMOTENC.")
        elif not has_numericals and has_categoricals:
            # Categorical only dataset: Use SMOTEN
            variant_name = 'SMOTEN'
            self.logger.info("Dataset contains only categorical features. Using SMOTEN.")
        else:
            # Fallback to SMOTE
            variant_name = 'SMOTE'
            self.logger.info("Dataset composition not recognized. Using SMOTE as default.")

        # Initialize SMOTE variant
        try:
            self.logger.debug(f"Initializing SMOTE Variant '{variant_name}' with parameters: {smote_params}")

            if variant_name == 'SMOTENC':
                smote_class = SMOTENC
            elif variant_name == 'SMOTEN':
                smote_class = SMOTEN
            elif variant_name == 'SMOTE':
                smote_class = SMOTE
            elif variant_name == 'ADASYN':
                smote_class = ADASYN
            elif variant_name == 'BorderlineSMOTE':
                smote_class = BorderlineSMOTE
            elif variant_name == 'SMOTETomek':
                smote_class = SMOTETomek
            elif variant_name == 'SMOTEENN':
                smote_class = SMOTEENN
            else:
                self.logger.warning(f"Unknown SMOTE variant '{variant_name}'. Falling back to SMOTE.")
                smote_class = SMOTE

            # User params come last so an explicit random_state overrides the
            # default instead of colliding with it as a duplicate keyword.
            init_kwargs = {'random_state': 42, **smote_params}

            if smote_class is SMOTENC and 'categorical_features' not in init_kwargs:
                if not self.nominal_encoded_feature_names:
                    self.logger.error("No nominal encoded feature names available for SMOTENC.")
                    raise ValueError("No nominal encoded feature names available for SMOTENC.")
                # SMOTENC receives positional column indices, looked up by name.
                init_kwargs['categorical_features'] = [
                    X_train.columns.get_loc(col) for col in self.nominal_encoded_feature_names
                ]

            # Validate parameters BEFORE construction so that unsupported ones
            # really are ignored rather than raising a TypeError.
            accepted = signature(smote_class).parameters
            takes_arbitrary_kwargs = any(p.kind is p.VAR_KEYWORD for p in accepted.values())
            invalid_params = set() if takes_arbitrary_kwargs else set(init_kwargs) - set(accepted)
            if invalid_params:
                self.logger.warning(f"Invalid parameters for SMOTE variant '{variant_name}': {invalid_params}. These will be ignored.")
                init_kwargs = {key: value for key, value in init_kwargs.items() if key not in invalid_params}

            smote = smote_class(**init_kwargs)

        except TypeError as e:
            self.logger.error(f"Error initializing SMOTE variant '{variant_name}': {e}")
            raise
        except Exception as e:
            self.logger.error(f"Unexpected error during SMOTE initialization: {e}")
            raise

        # Apply SMOTE variant
        try:
            X_res, y_res = smote.fit_resample(X_train, y_train)
            self.smote = smote
            self.preprocessing_steps.append(step_name)
            self.logger.info(f"Applied SMOTE Variant '{variant_name}'. Resampled X_train shape: {X_res.shape}, y_train shape: {y_res.shape}")
            return X_res, y_res
        except ValueError as ve:
            self.logger.error(f"ValueError during SMOTE application: {ve}")
            raise
        except Exception as e:
            self.logger.error(f"Unexpected error during SMOTE application: {e}")
            raise



    def final_inverse_transformations(self, X_test_preprocessed: pd.DataFrame, X_test_original: pd.DataFrame) -> pd.DataFrame:
        """
        Perform inverse transformations to revert preprocessed data back to its original form based on user options.

        Args:
            X_test_preprocessed (pd.DataFrame): Preprocessed test features.
            X_test_original (pd.DataFrame): Original test features.

        Returns:
            pd.DataFrame: Inverse-transformed test features.
        """
        step_name = "Final Inverse Transformations for Interpretability"
        self.logger.info(f"Step: {step_name}")

        # Fetch user-defined inverse transformation options or set defaults
        inverse_options = self.options.get('inverse_transformations', {})
        inverse_scaling = inverse_options.get('inverse_scaling', True)
        inverse_transformation = inverse_options.get('inverse_transformation', True)
        inverse_encoding = inverse_options.get('inverse_encoding', True)

        # Initialize DataFrame for inverse transformed data
        X_inverse = pd.DataFrame(index=X_test_preprocessed.index)

        # Inverse Scaling
        if inverse_scaling and hasattr(self, 'scaler') and self.scaler is not None:
            try:
                # Ensure that features_to_scale were scaled
                scaling_features = self.options.get('apply_scaling', {}).get('features', self.numericals)
                X_inverse[scaling_features] = self.scaler.inverse_transform(X_test_preprocessed[scaling_features])
                for col in scaling_features:
                    self.feature_reasons[col] += f'Inverse Scaling Applied | '
                if self.debug_final_inverse_transformations:
                    self.logger.debug("Inverse Scaling Completed")
                    self.logger.debug(f"Sample of inverse-scaled data:\n{X_inverse[scaling_features].head()}")
            except Exception as e:
                self.logger.error(f"Error during inverse Scaling: {e}")
                raise
        else:
            # If scaling was not applied, retain original numerical features
            X_inverse[self.numericals] = X_test_preprocessed[self.numericals]

        # Inverse Transformation (PowerTransformer or Log Transform)
        if inverse_transformation and self.transformer is not None:
            try:
                # Inverse transform only the transformed features
                X_inverse[self.features_to_transform] = self.transformer.inverse_transform(X_inverse[self.features_to_transform])
                for col in self.features_to_transform:
                    self.feature_reasons[col] += f'Inverse Transformation Applied | '
                if self.debug_final_inverse_transformations:
                    self.logger.debug("Inverse Transformation Applied")
            except Exception as e:
                self.logger.error(f"Error during inverse Transformation: {e}")
                raise

        # Inverse Encoding for Ordinal Categorical Features
        if inverse_encoding and self.ordinal_categoricals and self.ordinal_encoder:
            try:
                X_inverse[self.ordinal_categoricals] = self.ordinal_encoder.inverse_transform(X_test_preprocessed[self.ordinal_categoricals])
                for col in self.ordinal_categoricals:
                    self.feature_reasons[col] += 'Inverse Ordinal Encoding Applied | '
                if self.debug_final_inverse_transformations:
                    self.logger.debug("Inverse Ordinal Encoding Completed for Ordinal Categorical Features")
            except Exception as e:
                self.logger.error(f"Error during inverse Ordinal Encoding for Ordinal Categorical Features: {e}")
                raise

        # Inverse Encoding for Nominal Categorical Features
        if inverse_encoding and self.nominal_categoricals and self.preprocessor and 'nominal' in self.preprocessor.named_transformers_:
            try:
                if hasattr(self.preprocessor.named_transformers_['nominal'], 'get_feature_names_out'):
                    # Extract nominal encoded features
                    nominal_encoded = X_test_preprocessed[self.preprocessor.named_transformers_['nominal'].get_feature_names_out(self.nominal_categoricals)]
                    nominal_original = self.preprocessor.named_transformers_['nominal'].inverse_transform(nominal_encoded)
                    nominal_original_df = pd.DataFrame(nominal_original, columns=self.nominal_categoricals, index=X_test_preprocessed.index)
                    X_inverse[self.nominal_categoricals] = nominal_original_df
                    for col in self.nominal_categoricals:
                        self.feature_reasons[col] += 'Inverse One-Hot Encoding Applied | '
                    if self.debug_final_inverse_transformations:
                        self.logger.debug("Inverse One-Hot Encoding Completed for Nominal Categorical Features")
                else:
                    self.logger.warning("Nominal encoder does not support get_feature_names_out. Skipping inverse transformation for nominal features.")
            except Exception as e:
                self.logger.error(f"Error during inverse One-Hot Encoding for Nominal Categorical Features: {e}")
                raise

        # Combine all features
        try:
            # Include passthrough (non-transformed) features
            passthrough_features = [col for col in X_test_original.columns if col not in self.numericals + self.ordinal_categoricals + self.nominal_categoricals]
            if passthrough_features:
                X_inverse = pd.concat([X_inverse, X_test_preprocessed[passthrough_features]], axis=1)

            # Reorder columns to match the original DataFrame
            X_final_inverse = X_inverse[X_test_original.columns]
        except Exception as e:
            self.logger.error(f"Error during combining inverse transformed data: {e}")
            raise

        self.preprocessing_steps.append(step_name)
        if self.debug_final_inverse_transformations:
            self.logger.debug(f"Completed: {step_name}. Inverse-transformed X_test shape: {X_final_inverse.shape}")
            self.logger.debug(f"Sample of inverse-transformed X_test:\n{X_final_inverse.head()}")
        else:
            self.logger.info(f"Step '{step_name}' completed: Inverse transformations applied.")

        return X_final_inverse

    def validate_inverse_transformations(self, X_original: pd.DataFrame, X_inverse: pd.DataFrame, tolerance: float = 1e-4):
        """
        Validate that inverse transformations accurately restore original data within acceptable tolerances.

        Args:
            X_original (pd.DataFrame): Original features.
            X_inverse (pd.DataFrame): Inverse-transformed features.
            tolerance (float, optional): Tolerance level for differences. Defaults to 1e-4.
        """
        step_name = "Final Inverse Transformation Validation"
        self.logger.info(f"Step: {step_name}")

        differences = {}

        for col in self.nominal_categoricals + self.ordinal_categoricals:
            diff = X_original[col].astype(str) != X_inverse[col].astype(str)
            differences[col] = {
                'total_differences': diff.sum(),
                'percentage_differences': (diff.sum() / len(diff)) * 100
            }

        for col in self.numericals:
            diff = np.abs(X_original[col] - X_inverse[col]) > tolerance
            differences[col] = {
                'total_differences': diff.sum(),
                'percentage_differences': (diff.sum() / len(diff)) * 100
            }

        # Display the differences
        for col, stats in differences.items():
            self.logger.info(f"Column: {col}")
            self.logger.info(f" - Total Differences: {stats['total_differences']}")
            self.logger.info(f" - Percentage Differences: {stats['percentage_differences']:.2f}%")

            if stats['total_differences'] > 0:
                self.logger.warning(f"Differences found in column '{col}':")
                if col in self.nominal_categoricals + self.ordinal_categoricals:
                    mask = X_original[col].astype(str) != X_inverse[col].astype(str)
                else:
                    mask = np.abs(X_original[col] - X_inverse[col]) > tolerance
                comparison = pd.concat([
                    X_original.loc[mask, col].reset_index(drop=True).rename('Original'),
                    X_inverse.loc[mask, col].reset_index(drop=True).rename('Inverse Transformed')
                ], axis=1)
                self.logger.debug(f"Differences in '{col}':\n{comparison}")
                self.logger.debug("\n")

        # Check if indices are aligned
        if not X_original.index.equals(X_inverse.index):
            self.logger.warning("Indices of original and inverse transformed data do not match.")
        else:
            self.logger.info("Success: Indices of original and inverse transformed data are aligned.")

        # Check overall success
        total_differences = sum([v['total_differences'] for v in differences.values()])
        if total_differences == 0:
            self.logger.info("Inverse Transformation Validation Passed: No differences found.")
        else:
            self.logger.warning(f"Inverse Transformation Validation Failed: {total_differences} differences found across all features.")

        self.preprocessing_steps.append(step_name)
        if self.debug_validate_inverse_transformations:
            self.logger.debug(f"Completed: {step_name}. Validation results generated.")
        else:
            self.logger.info(f"Step '{step_name}' completed: Validation results generated.")

    def generate_recommendations(self) -> pd.DataFrame:
        """
        Generate a table of preprocessing recommendations based on the model type, data, and user options.

        Returns:
            pd.DataFrame: DataFrame containing recommendations for each feature.
        """
        step_name = "Generate Preprocessor Recommendations"
        self.logger.info(f"Step: {step_name}")

        # Generate recommendations based on feature reasons
        recommendations = {}
        for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals:
            reasons = self.feature_reasons.get(col, '').strip(' | ')
            recommendations[col] = reasons

        recommendations_table = pd.DataFrame.from_dict(
            recommendations, 
            orient='index', 
            columns=['Preprocessing Reason']
        )
        self.logger.debug(f"Preprocessing Recommendations:\n{recommendations_table}")
        self.preprocessing_steps.append(step_name)

        if self.debug_generate_recommendations:
            self.logger.debug(f"Completed: {step_name}. Recommendations generated.")
        else:
            self.logger.info(f"Step '{step_name}' completed: Recommendations generated.")

        return recommendations_table

    def save_transformers(self):
        """
        Save fitted transformers to disk for future use during prediction.

        Everything is written as one dictionary to ``transformers.pkl`` inside
        ``self.graphs_output_dir``; the directory is created when missing. Attributes
        that were never set on this instance are stored with the same fallback that
        ``load_transformers`` uses (None, or an empty container for
        ``cluster_transformers`` / ``final_feature_order``).
        """
        output_dir = self.graphs_output_dir
        if output_dir:
            # joblib.dump does not create parent directories
            os.makedirs(output_dir, exist_ok=True)
        transformers_path = os.path.join(output_dir, 'transformers.pkl')

        # Attribute name -> value stored when the attribute does not exist
        fallbacks = {
            'numerical_imputer': None,
            'categorical_imputer': None,
            'transformer': None,
            'preprocessor': None,
            'scaler': None,
            'ordinal_encoder': None,
            'nominal_encoder': None,
            'cluster_transformers': {},
            'smote': None,
            'final_feature_order': [],
        }
        transformers = {
            name: getattr(self, name, fallback)
            for name, fallback in fallbacks.items()
        }

        joblib.dump(transformers, transformers_path)
        self.logger.info(f"Transformers saved at '{transformers_path}'.")

    def load_transformers(self):
        """
        Load transformers from disk for use during prediction.
        """
        transformers_path = os.path.join(self.graphs_output_dir, 'transformers.pkl')
        if not os.path.exists(transformers_path):
            self.logger.error(f"Transformers file not found at '{transformers_path}'. Cannot proceed with prediction.")
            raise FileNotFoundError(f"Transformers file not found at '{transformers_path}'.")

        transformers = joblib.load(transformers_path)
        self.numerical_imputer = transformers.get('numerical_imputer')
        self.categorical_imputer = transformers.get('categorical_imputer')
        self.transformer = transformers.get('transformer')
        self.preprocessor = transformers.get('preprocessor')
        self.scaler = transformers.get('scaler')
        self.ordinal_encoder = transformers.get('ordinal_encoder')
        self.nominal_encoder = transformers.get('nominal_encoder')
        self.cluster_transformers = transformers.get('cluster_transformers', {})
        self.smote = transformers.get('smote', None)
        self.final_feature_order = transformers.get('final_feature_order', [])
        self.logger.info(f"Transformers loaded from '{transformers_path}'.")

    def handle_missing_values_predict(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Handle missing values for prediction mode using fitted imputers.

        Args:
            X (pd.DataFrame): Input features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Transformed X and None.
        """
        self.logger.info("Handling missing values for prediction.")

        if self.numericals:
            X[self.numericals] = self.numerical_imputer.transform(X[self.numericals])

        all_categoricals = self.ordinal_categoricals + self.nominal_categoricals
        if all_categoricals:
            X[all_categoricals] = self.categorical_imputer.transform(X[all_categoricals])

        return X, None

    def choose_and_apply_transformations_predict(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Apply transformations for prediction mode using fitted transformers.

        Args:
            X (pd.DataFrame): Input features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Transformed X and None.
        """
        self.logger.info("Applying transformations for prediction.")

        if self.transformer is not None:
            X[self.features_to_transform] = self.transformer.transform(X[self.features_to_transform])

        return X, None

    def encode_categorical_variables_predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Encode categorical variables for prediction mode using fitted encoders.

        Args:
            X (pd.DataFrame): Input features.

        Returns:
            pd.DataFrame: Encoded features.
        """
        self.logger.info("Encoding categorical variables for prediction.")

        if self.preprocessor is not None:
            X_encoded = self.preprocessor.transform(X)
            encoded_feature_names = []
            if self.ordinal_categoricals and self.ordinal_encoder is not None:
                encoded_feature_names += self.ordinal_categoricals
            if self.nominal_categoricals and self.nominal_encoder is not None:
                if hasattr(self.preprocessor.named_transformers_['nominal'], 'get_feature_names_out'):
                    nominal_encoded_names = self.preprocessor.named_transformers_['nominal'].get_feature_names_out(self.nominal_categoricals).tolist()
                    encoded_feature_names += nominal_encoded_names
            passthrough_features = [col for col in X.columns if col not in self.ordinal_categoricals + self.nominal_categoricals]
            encoded_feature_names += passthrough_features

            X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names, index=X.index)
            return X_encoded_df

        return X

    def apply_scaling_predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply scaling for prediction mode using fitted scaler.

        Args:
            X (pd.DataFrame): Input features.

        Returns:
            pd.DataFrame: Scaled features.
        """
        self.logger.info("Applying scaling for prediction.")

        if self.scaler is not None and self.options.get('apply_scaling', {}).get('features', []):
            scaling_features = self.options.get('apply_scaling', {}).get('features', self.numericals)
            X[scaling_features] = self.scaler.transform(X[scaling_features])

        return X

    def final_preprocessing(
        self, 
        X: pd.DataFrame, 
        y: Optional[pd.Series] = None  # Make y optional
    ) -> Tuple:
        """
        Execute the full preprocessing pipeline based on the mode.

        Args:
            X (pd.DataFrame): Input features.
            y (Optional[pd.Series]): Target variable (required for training).

        Returns:
            Tuple: Processed data based on mode.
        """
        step_name = "Final Preprocessing Pipeline"
        self.logger.info(f"Starting: {step_name} in '{self.mode}' mode.")

        if self.mode == 'train':
            # Step 1: Split Dataset
            X_train, X_test, y_train, y_test = self.split_dataset(X, y)

            # Step 2: Handle Missing Values
            X_train, X_test = self.handle_missing_values(X_train, X_test)

            # Step 3: Test for Normality (including 'clustering')
            if self.model_category in ['regression', 'classification', 'clustering']:
                self.test_normality(X_train)

            # Step 4: Handle Outliers
            X_train, y_train = self.handle_outliers(X_train, y_train)

            # Step 5: Generate Preprocessing Recommendations
            recommendations = self.generate_recommendations()
            self.logger.info("Preprocessing Recommendations Generated.")

            # Step 6: Choose and Apply Transformations
            X_train_before_transformation = X_train.copy()
            X_test_before_transformation = X_test.copy() if X_test is not None else None
            X_train, X_test = self.choose_and_apply_transformations(X_train, X_test)

            # Step 7: Plot Normalization Before and After Transformations
            if self.transformer is not None:
                self.plot_normalization(
                    X_original=X_train_before_transformation[self.numericals],
                    X_transformed=X_train[self.numericals],
                    numerical_features=self.numericals,
                    model_type=self.model_type
                )
                self.plot_qq(
                    X_original=X_train_before_transformation[self.numericals],
                    X_transformed=X_train[self.numericals],
                    numerical_features=self.numericals,
                    model_type=self.model_type
                )
                if X_test_before_transformation is not None:
                    self.plot_normalization(
                        X_original=X_test_before_transformation[self.numericals],
                        X_transformed=X_test[self.numericals],
                        numerical_features=self.numericals,
                        model_type=self.model_type
                    )
                    self.plot_qq(
                        X_original=X_test_before_transformation[self.numericals],
                        X_transformed=X_test[self.numericals],
                        numerical_features=self.numericals,
                        model_type=self.model_type
                    )

            # Step 8: Encode Categorical Variables
            X_train, X_test = self.encode_categorical_variables(X_train, X_test)

            # Step 9: Apply Scaling
            X_train_before_scaling = X_train.copy()
            X_test_before_scaling = X_test.copy() if X_test is not None else None
            X_train, X_test = self.apply_scaling(X_train, X_test)

            # Step 10: Plot Normalization After Scaling
            if hasattr(self, 'scaler') and self.scaler is not None:
                scaling_features = self.options.get('apply_scaling', {}).get('features', self.numericals)
                self.plot_normalization(
                    X_original=X_train_before_scaling[scaling_features],
                    X_transformed=X_train[scaling_features],
                    numerical_features=scaling_features,
                    model_type=self.model_type
                )
                self.plot_qq(
                    X_original=X_train_before_scaling[scaling_features],
                    X_transformed=X_train[scaling_features],
                    numerical_features=scaling_features,
                    model_type=self.model_type
                )
                if X_test_before_scaling is not None:
                    self.plot_normalization(
                        X_original=X_test_before_scaling[scaling_features],
                        X_transformed=X_test[scaling_features],
                        numerical_features=scaling_features,
                        model_type=self.model_type
                    )
                    self.plot_qq(
                        X_original=X_test_before_scaling[scaling_features],
                        X_transformed=X_test[scaling_features],
                        numerical_features=scaling_features,
                        model_type=self.model_type
                    )

            # Step 11: Implement SMOTE (Train Only for Classification)
            if self.model_category == 'classification':
                X_train, y_train = self.implement_smote(X_train, y_train)
            else:
                self.logger.info("SMOTE not applied: Not a classification model.")

            # Step 12: Save Transformers for Prediction
            if self.mode == 'train':
                # Assign and save final feature order
                self.final_feature_order = list(X_train.columns)
                self.logger.info(f"Final feature order: {self.final_feature_order}")
                
                # Reindex to ensure consistent ordering
                X_train = X_train[self.final_feature_order]
                if X_test is not None:
                    X_test = X_test[self.final_feature_order]
                
                # Save transformers including feature order
                self.save_transformers()

            self.preprocessing_steps.append(step_name)
            self.logger.info(f"Step '{step_name}' completed in '{self.mode}' mode.")

            if self.model_category in ['classification', 'regression']:
                return X_train, X_test, y_train, y_test, recommendations
            elif self.model_category == 'clustering':
                return X_train, recommendations
            else:
                return X_train, recommendations

        elif self.mode == 'clustering':
            # Perform clustering-specific preprocessing steps
            # Step 1: Handle Missing Values
            X_processed, _ = self.handle_missing_values(X, None)

            # Step 2: Handle Outliers
            X_processed, _ = self.handle_outliers(X_processed, None)

            # Step 3: Choose and Apply Transformations
            X_processed, _ = self.choose_and_apply_transformations(X_processed, None)

            # Step 4: Encode Categorical Variables
            X_processed, _ = self.encode_categorical_variables(X_processed, None)

            # Step 5: Apply Scaling
            X_processed, _ = self.apply_scaling(X_processed, None)

            # Step 6: Generate Preprocessing Recommendations
            recommendations = self.generate_recommendations()
            self.logger.info("Preprocessing Recommendations Generated.")

            # Step 7: Plot Normalization Before and After Transformations
            if self.transformer is not None:
                self.plot_normalization(
                    X_original=X.copy()[self.numericals],
                    X_transformed=X_processed[self.numericals],
                    numerical_features=self.numericals,
                    model_type=self.model_type
                )
                self.plot_qq(
                    X_original=X.copy()[self.numericals],
                    X_transformed=X_processed[self.numericals],
                    numerical_features=self.numericals,
                    model_type=self.model_type
                )

            # Step 8: Save Transformers for Prediction
            self.save_transformers()

            self.preprocessing_steps.append(step_name)
            self.logger.info(f"Step '{step_name}' completed in '{self.mode}' mode.")

            return X_processed, recommendations

        elif self.mode == 'predict':
            # Step 1: Load Transformers
            self.load_transformers()

            # Step 2: Handle Missing Values using fitted imputers
            X_processed, _ = self.handle_missing_values_predict(X)

            # Step 3: Choose and Apply Transformations using fitted transformers
            X_processed, _ = self.choose_and_apply_transformations_predict(X_processed)

            # Step 4: Encode Categorical Variables using fitted encoders
            X_processed = self.encode_categorical_variables_predict(X_processed)

            # Step 5: Apply Scaling using fitted scaler
            X_processed = self.apply_scaling_predict(X_processed)

            # Ensure correct final order
            if self.final_feature_order:
                missing_in_processed = set(self.final_feature_order) - set(X_processed.columns)
                if missing_in_processed:
                    raise KeyError(f"Missing columns in X_processed: {missing_in_processed}")

                # Reindex
                X_processed = X_processed[self.final_feature_order]
        
            # Step 6: Generate Preprocessing Recommendations
            recommendations = self.generate_recommendations()
            self.logger.info("Preprocessing Recommendations Generated.")

            # Step 7: Save Preprocessed Data for Prediction
            recommendations.to_csv(os.path.join(self.graphs_output_dir, 'preprocessing_recommendations.csv'))

            self.preprocessing_steps.append(step_name)
            self.logger.info(f"Step '{step_name}' completed in '{self.mode}' mode.")

            return X_processed, recommendations

        else:
            # Handle other modes if necessary
            raise NotImplementedError(f"Mode '{self.mode}' is not yet implemented.")


# Main Execution Script

import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Tuple
import logging
import os

# from feature_manager import FeatureManager  # Ensure correct import
# from data_preprocessor import DataPreprocessor  # Ensure correct import

def initialize_feature_manager(save_path: str):
    """
    Create a FeatureManager for ``save_path``, creating its parent directory first if needed.

    Args:
        save_path (str): Path to save the feature metadata pickle file.

    Returns:
        FeatureManager: An instance of the FeatureManager class.
    """
    parent_dir = os.path.dirname(save_path)
    # A bare file name has no directory component, so there is nothing to create
    directory_missing = bool(parent_dir) and not os.path.exists(parent_dir)
    if directory_missing:
        os.makedirs(parent_dir, exist_ok=True)

    manager = FeatureManager(save_path=save_path)
    return manager

def load_dataset(path: str) -> pd.DataFrame:
    """
    Read a CSV dataset into a DataFrame.

    Args:
        path (str): Path to the dataset CSV file.

    Returns:
        pd.DataFrame: Loaded dataset.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    dataset_available = os.path.exists(path)
    if dataset_available:
        return pd.read_csv(path)
    raise FileNotFoundError(f"Dataset not found at {path}")

def main():
    """
    Run the preprocessing script end to end for every configured model type.

    The function:
      1. Defines paths/flags and configures logging.
      2. Creates a FeatureManager and loads the original dataset CSV.
      3. Selects the configured feature columns and saves them with their metadata.
      4. Reloads the filtered dataset and the column assets.
      5. For each entry in ``model_types``: builds a DataPreprocessor, runs
         ``final_preprocessing``, logs and saves the outputs as CSV files, and
         validates the inverse transformations.

    Failures while loading or saving features are logged and end the run early
    (the function returns ``None``); a failure for one model is logged and the
    loop moves on to the next model.
    """
    # ----------------------------
    # Step 1: Define Paths and Flags
    # ----------------------------

    # Define the path to the single pickle file containing all metadata
    save_path = '../../ml-preprocessing-utils/data/dataset/test/features_info/features_metadata.pkl'  # Adjust as needed

    # Define a debug flag based on user preference
    debug_flag = False  # Set to False for minimal outputs

    # Define normalization plotting flags
    normalize_debug = False  # Set to True to display plots in Jupyter
    normalize_graphs_output = False  # Set to True to save plots to a directory
    graphs_output_dir = '../../ml-preprocessing-utils/data/dataset/test/plots'  # Specify the desired output directory

    # Configure root logger based on debug_flag
    logging.basicConfig(
        level=logging.DEBUG if debug_flag else logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[
            logging.StreamHandler()
        ]
    )
    logger = logging.getLogger('main_preprocessing')

    # ----------------------------
    # Step 2: Initialize FeatureManager
    # ----------------------------

    feature_manager = initialize_feature_manager(save_path=save_path)

    # ----------------------------
    # Step 3: Save Features and Metadata
    # ----------------------------

    # Define paths for saving
    start_dataset_path = '../../data/processed/final_ml_dataset.csv'  # Original dataset CSV path

    # Load the original dataset
    try:
        logger.info(f"📥 Loading original dataset from {start_dataset_path}...")
        original_df = load_dataset(start_dataset_path)
        logger.info("✅ Original dataset loaded successfully.")
    except Exception as e:
        logger.error(f"❌ Failed to load original dataset: {e}")
        return  # Exit if loading fails

    # Define feature categories and column names
    ordinal_categoricals = []
    nominal_categoricals = []
    numericals = [
        'release_ball_direction_x', 'release_ball_direction_z', 'release_ball_direction_y',
        'elbow_release_angle', 'elbow_max_angle',
        'wrist_release_angle', 'wrist_max_angle',
        'knee_release_angle', 'knee_max_angle',
        'release_ball_speed', 'calculated_release_angle',
        'release_ball_velocity_x', 'release_ball_velocity_y', 'release_ball_velocity_z'
    ]
    y_variable = ['result']

    # Final columns we keep
    final_keep_list = ordinal_categoricals + nominal_categoricals + numericals + y_variable

    # ----------------------------
    # Step 4: Save Selected Features and Metadata
    # ----------------------------

    missing_columns = set(final_keep_list) - set(original_df.columns)
    if missing_columns:
        logger.error(f"The following columns are missing in the dataset: {missing_columns}")
        return

    logger.info("🔍 Selecting and filtering dataset based on defined features...")
    selected_features_df = original_df[final_keep_list]
    logger.info("✅ Selected features filtered successfully.")

    assert 'result' in y_variable, "'result' must be in y_variable."
    # y_variable is a list, so test for overlapping column names; testing the
    # list itself for membership in numericals could never fail.
    assert not set(y_variable) & set(numericals), "'result' should not be in numericals."

    try:
        feature_manager.save_features(
            features_df=selected_features_df,
            ordinal_categoricals=ordinal_categoricals,
            nominal_categoricals=nominal_categoricals,
            numericals=numericals,
            y_variable=y_variable,
            dataset_csv_path=start_dataset_path
        )
    except Exception as e:
        logger.error(f"❌ Failed to save features and metadata: {e}")
        return

    # ----------------------------
    # Step 5: Load Features and Filtered Dataset
    # ----------------------------

    try:
        filtered_df, column_assets = feature_manager.load_features_and_dataset(
            debug=debug_flag
        )
        logger.info("✅ Features loaded and dataset filtered successfully.")
    except Exception as e:
        logger.error(f"❌ Failed to load features and dataset: {e}")
        return

    # Access column assets
    ordinals = column_assets.get('ordinal_categoricals', [])
    nominals = column_assets.get('nominal_categoricals', [])
    nums = column_assets.get('numericals', [])
    y_var = column_assets.get('y_variable', [])
    logger.debug(f"y_var = {y_var}")
    if debug_flag:
        logger.debug("\n📥 Loaded Data:")
        logger.debug(f"Column Assets: {column_assets}")
        logger.debug(f"Ordinal Categoricals: {ordinals}")
        logger.debug(f"Nominal Categoricals: {nominals}")
        logger.debug(f"Numericals: {nums}")
        logger.debug(f"Y Variable: {y_var}")
    else:
        logger.info("Features and metadata loaded successfully.")

    # ----------------------------
    # Step 6: Define Models and Options
    # ----------------------------

    # Define your models and their specific preprocessing options
    model_types = [
        'Logistic Regression',
        'Tree Based Classifier',
        'Linear Regression',
        'Tree Based Regressor',
        # 'K-Means Clustering',
    ]

    model_specific_options = {
        # -----------------------------------------------------------
        # 1. LOGISTIC REGRESSION (Classification)
        # -----------------------------------------------------------
        'Logistic Regression': {
            'split_dataset': {
                'test_size': 0.25,
                'random_state': 42,
                'stratify_for_classification': True
            },
            'handle_missing_values': {
                'numerical_strategy': {
                    'strategy': 'mean',
                    'imputer': 'SimpleImputer'
                },
                'categorical_strategy': {
                    'strategy': 'most_frequent',
                    'imputer': 'SimpleImputer'
                }
            },
            'test_normality': {
                'p_value_threshold': 0.05,
                'skewness_threshold': 1.0,
                'use_p_value_other_models': False
            },
            'handle_outliers': {
                'zscore_threshold': 3,
                'iqr_multiplier': 1.5,
                'apply_zscore': True,
                'apply_iqr': True,
                'apply_winsor': False,
                'apply_isolation_forest': False
            },
            'choose_transformations': {
                'method': 'power',  # 'power', 'log', or None
                'power_method': 'yeo-johnson',
                'skewness_threshold': 1.0
            },
            'encode_categoricals': {
                'ordinal_encoding': 'OrdinalEncoder',
                'nominal_encoding': 'OrdinalEncoder',
                'handle_unknown': 'ignore'
            },
            'apply_scaling': {
                'method': 'StandardScaler',
                'features': nums
            },
            'implement_smote': {
                'variant': 'SMOTENC',
                'params': {
                    'k_neighbors': 5
                }
            },
            'inverse_transformations': {
                'inverse_scaling': True,
                'inverse_transformation': True,
                'inverse_encoding': True
            },
            # Debug flags for each step
            'debug_split_dataset': False,
            'debug_handle_missing_values': False,
            'debug_test_normality': False,
            'debug_handle_outliers': False,
            'debug_choose_transformations': False,
            'debug_encode_categoricals': False,
            'debug_apply_scaling': False,
            'debug_implement_smote': False,
            'debug_final_inverse_transformations': False,
            'debug_validate_inverse_transformations': False,
            'debug_generate_recommendations': False
        },

        # -----------------------------------------------------------
        # 2. TREE BASED CLASSIFIER (Classification)
        # -----------------------------------------------------------
        'Tree Based Classifier': {
            'split_dataset': {
                'test_size': 0.2,
                'random_state': 42,
                'stratify_for_classification': True
            },
            'handle_missing_values': {
                'numerical_strategy': {
                    'strategy': 'median',
                    'imputer': 'SimpleImputer'
                },
                'categorical_strategy': {
                    'strategy': 'most_frequent',
                    'imputer': 'SimpleImputer'
                }
            },
            'test_normality': {
                'p_value_threshold': 0.05,
                'skewness_threshold': 1.0,
                'use_p_value_other_models': False
            },
            'handle_outliers': {
                'zscore_threshold': 3,
                'iqr_multiplier': 1.5,
                'apply_zscore': False,
                'apply_iqr': True,
                'apply_winsor': False,
                'apply_isolation_forest': False
            },
            'choose_transformations': {
                'method': None,
                'skewness_threshold': 1.0
            },
            'encode_categoricals': {
                'ordinal_encoding': 'OrdinalEncoder',
                'nominal_encoding': 'OrdinalEncoder',
                'handle_unknown': 'ignore'
            },
            'apply_scaling': {
                'method': None,
                'features': []
            },
            'implement_smote': {
                'variant': 'SMOTENC',
                'params': {
                    'k_neighbors': 5
                }
            },
            'inverse_transformations': {
                'inverse_scaling': True,
                'inverse_transformation': True,
                'inverse_encoding': True
            },
            # Debug flags
            'debug_split_dataset': False,
            'debug_handle_missing_values': False,
            'debug_test_normality': False,
            'debug_handle_outliers': False,
            'debug_choose_transformations': False,
            'debug_encode_categoricals': False,
            'debug_apply_scaling': False,
            'debug_implement_smote': False,
            'debug_final_inverse_transformations': False,
            'debug_validate_inverse_transformations': False,
            'debug_generate_recommendations': False
        },

        # -----------------------------------------------------------
        # 3. K-MEANS CLUSTERING (Clustering)
        # -----------------------------------------------------------
        'K-Means Clustering': {
            'split_dataset': {
                'test_size': None,
                'random_state': 42,
                'stratify_for_classification': False
            },
            'handle_missing_values': {
                'numerical_strategy': {
                    'strategy': 'mean',
                    'imputer': 'SimpleImputer'
                },
                'categorical_strategy': {
                    'strategy': 'most_frequent',
                    'imputer': 'SimpleImputer'
                }
            },
            'test_normality': {
                'p_value_threshold': 0.05,
                'skewness_threshold': 1.0,
                'use_p_value_other_models': False
            },
            'handle_outliers': {
                'zscore_threshold': 3,
                'iqr_multiplier': 1.5,
                'apply_zscore': True,
                'apply_iqr': True,
                'apply_winsor': False,
                'apply_isolation_forest': False
            },
            'choose_transformations': {
                'method': 'power',
                'power_method': 'yeo-johnson',
                'skewness_threshold': 1.0
            },
            'encode_categoricals': {
                'ordinal_encoding': 'OrdinalEncoder',
                'nominal_encoding': 'OrdinalEncoder',
                'handle_unknown': 'ignore'
            },
            'apply_scaling': {
                'method': 'MinMaxScaler',
                'features': nums
            },
            'implement_smote': {
                'variant': None,
                'params': {}
            },
            'inverse_transformations': {
                'inverse_scaling': True,
                'inverse_transformation': True,
                'inverse_encoding': True
            },
            # Debug flags
            'debug_split_dataset': False,
            'debug_handle_missing_values': True,
            'debug_test_normality': True,
            'debug_handle_outliers': True,
            'debug_choose_transformations': True,
            'debug_encode_categoricals': True,
            'debug_apply_scaling': True,
            'debug_implement_smote': False,
            'debug_final_inverse_transformations': True,
            'debug_validate_inverse_transformations': True,
            'debug_generate_recommendations': True
        },

        # -----------------------------------------------------------
        # 4. LINEAR REGRESSION (Regression)
        # -----------------------------------------------------------
        'Linear Regression': {
            'split_dataset': {
                'test_size': 0.2,
                'random_state': 42,
                'stratify_for_classification': False  # Not classification, so no stratify
            },
            'handle_missing_values': {
                'numerical_strategy': {
                    'strategy': 'mean',     # Could choose 'mean' or 'median'
                    'imputer': 'SimpleImputer'
                },
                'categorical_strategy': {
                    'strategy': 'most_frequent',
                    'imputer': 'SimpleImputer'
                }
            },
            'test_normality': {
                'p_value_threshold': 0.05,
                'skewness_threshold': 1.0,
                'use_p_value_other_models': False
            },
            'handle_outliers': {
                'zscore_threshold': 3,
                'iqr_multiplier': 1.5,
                'apply_zscore': True,   # More typical for linear regression
                'apply_iqr': True,
                'apply_winsor': False,
                'apply_isolation_forest': False
            },
            'choose_transformations': {
                'method': 'power',   # 'power', 'log', or None
                'power_method': 'yeo-johnson',
                'skewness_threshold': 1.0
            },
            'encode_categoricals': {
                'ordinal_encoding': 'OrdinalEncoder',
                'nominal_encoding': 'OneHotEncoder',  # For linear regression, one-hot is often used
                'handle_unknown': 'ignore'
            },
            'apply_scaling': {
                'method': 'StandardScaler',  # Common for regression
                'features': nums
            },
            'implement_smote': {
                'variant': None,  # SMOTE is typically for classification, so skip here
                'params': {}
            },
            'inverse_transformations': {
                'inverse_scaling': True,
                'inverse_transformation': True,
                'inverse_encoding': True
            },
            # Debug flags
            'debug_split_dataset': True,
            'debug_handle_missing_values': True,
            'debug_test_normality': True,
            'debug_handle_outliers': True,
            'debug_choose_transformations': True,
            'debug_encode_categoricals': True,
            'debug_apply_scaling': True,
            'debug_implement_smote': False,  # Not classification
            'debug_final_inverse_transformations': True,
            'debug_validate_inverse_transformations': True,
            'debug_generate_recommendations': True
        },

        # -----------------------------------------------------------
        # 5. TREE BASED REGRESSOR (Regression)
        # -----------------------------------------------------------
        'Tree Based Regressor': {
            'split_dataset': {
                'test_size': 0.2,
                'random_state': 42,
                'stratify_for_classification': False
            },
            'handle_missing_values': {
                'numerical_strategy': {
                    'strategy': 'median',
                    'imputer': 'SimpleImputer'
                },
                'categorical_strategy': {
                    'strategy': 'most_frequent',
                    'imputer': 'SimpleImputer'
                }
            },
            'test_normality': {
                'p_value_threshold': 0.05,
                'skewness_threshold': 1.0,
                'use_p_value_other_models': False
            },
            'handle_outliers': {
                'zscore_threshold': 3,
                'iqr_multiplier': 1.5,
                'apply_zscore': False,
                'apply_iqr': True,
                'apply_winsor': False,
                'apply_isolation_forest': False
            },
            'choose_transformations': {
                'method': None,   # Typically not needed for tree-based
                'skewness_threshold': 1.0
            },
            'encode_categoricals': {
                'ordinal_encoding': 'OrdinalEncoder',
                'nominal_encoding': 'OrdinalEncoder',
                'handle_unknown': 'ignore'
            },
            'apply_scaling': {
                'method': None,   # Trees don't need scaling
                'features': []
            },
            'implement_smote': {
                'variant': None,  # Not classification
                'params': {}
            },
            'inverse_transformations': {
                'inverse_scaling': True,
                'inverse_transformation': True,
                'inverse_encoding': True
            },
            # Debug flags
            'debug_split_dataset': False,
            'debug_handle_missing_values': True,
            'debug_test_normality': True,
            'debug_handle_outliers': True,
            'debug_choose_transformations': True,
            'debug_encode_categoricals': True,
            'debug_apply_scaling': True,
            'debug_implement_smote': False,
            'debug_final_inverse_transformations': True,
            'debug_validate_inverse_transformations': True,
            'debug_generate_recommendations': True
        }
    }

    # Names of the per-step debug flags; each one is also a keyword argument
    # of the DataPreprocessor constructor.
    debug_step_flags = (
        'debug_split_dataset',
        'debug_handle_missing_values',
        'debug_test_normality',
        'debug_handle_outliers',
        'debug_choose_transformations',
        'debug_encode_categoricals',
        'debug_apply_scaling',
        'debug_implement_smote',
        'debug_final_inverse_transformations',
        'debug_validate_inverse_transformations',
        'debug_generate_recommendations',
    )

    # ----------------------------
    # Step 7: Loop Over Each Model
    # ----------------------------
    for model_type in model_types:
        # Reset per-model outputs so nothing leaks over from the previous iteration.
        X_train = X_test = y_train = y_test = None
        X_processed = None
        recommendations = None

        # 1) Grab the options for the model (shallow copy: the pops below must
        #    not mutate model_specific_options).
        options = dict(model_specific_options.get(model_type, {}))

        # 2) Extract debug flags from options (to pass into the constructor).
        step_debug_kwargs = {flag: options.pop(flag, False) for flag in debug_step_flags}

        # 3) Determine mode & whether we split
        if 'Clustering' in model_type:
            mode = 'clustering'
            perform_split = False
        else:
            mode = 'train'
            perform_split = True

        # 4) Instantiate DataPreprocessor
        # NOTE(review): `options` is intentionally NOT passed here (it was commented
        # out in the original), so the per-model settings above are only used for
        # the debug flags and the 'inverse_transformations' switches in step 10.
        # Confirm whether the preprocessor is meant to run with its defaults.
        preprocessor = DataPreprocessor(
            model_type=model_type,
            column_assets=column_assets,
            mode=mode,
            #options=options,  # Pass in the entire dictionary (minus the popped debug flags)
            perform_split=perform_split,
            debug=debug_flag,
            normalize_debug=normalize_debug,
            normalize_graphs_output=normalize_graphs_output,
            graphs_output_dir=graphs_output_dir,
            **step_debug_kwargs
        )

        # 5) Execute the preprocessing pipeline
        try:
            if perform_split and preprocessor.y_variable:
                # Supervised learning
                X = filtered_df.drop(preprocessor.y_variable, axis=1)
                y = filtered_df[preprocessor.y_variable].iloc[:, 0]
                logger.debug(f"y shape: {y.shape}, type: {type(y)}")
            elif mode == 'predict':
                # Prediction inputs are not prepared by this script; fail loudly
                # (logged by the handler below) instead of reusing stale X / y.
                raise NotImplementedError("Prediction mode is not handled by this script.")
            else:
                # Unsupervised learning (clustering)
                X = filtered_df.copy()
                y = None

            # Preprocess
            preprocessed = preprocessor.final_preprocessing(X, y)

            # 6) Unpack results based on model category
            if preprocessor.mode == 'train':
                if preprocessor.model_category in ['classification', 'regression']:
                    X_train, X_test, y_train, y_test, recommendations = preprocessed
                else:
                    raise ValueError(f"Unsupported model category during training: {preprocessor.model_category}")
            elif preprocessor.model_category == 'clustering':
                X_processed, recommendations = preprocessed
            elif preprocessor.mode == 'predict':
                # Handle prediction outputs
                # X, y, _, _, recommendations = preprocessed
                pass
            else:
                raise ValueError(f"Unsupported model category: {preprocessor.model_category}")

        except Exception as e:
            logger.error(f"Preprocessing failed for {model_type}: {e}")
            continue

        # 7) Show Preprocessing Recommendations
        if debug_flag:
            logger.debug("\n📋 Preprocessing Recommendations:")
            logger.debug(recommendations)
        else:
            logger.info("Preprocessing Recommendations:")
            logger.info(recommendations)

        # 8) Show dataset shapes
        if debug_flag:
            logger.debug("\n📊 Preprocessed Dataset Shapes:")
        else:
            logger.info("Preprocessed Dataset Shapes:")

        if mode == 'train':
            if debug_flag:
                logger.debug(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape if X_test is not None else 'N/A'}")
                logger.debug(f"y_train shape: {y_train.shape if y_train is not None else 'N/A'}, y_test shape: {y_test.shape if y_test is not None else 'N/A'}")
            else:
                logger.info(f"X_train: {X_train.shape}, X_test: {X_test.shape if X_test is not None else 'N/A'}")
                logger.info(f"y_train: {y_train.shape if y_train is not None else 'N/A'}, y_test: {y_test.shape if y_test is not None else 'N/A'}")
        elif X_processed is not None:
            if debug_flag:
                logger.debug(f"X_processed shape: {X_processed.shape}")
            else:
                logger.info(f"X_processed: {X_processed.shape}")

        # 9) Save the preprocessed data for each model
        try:
            safe_model_type = model_type.replace(" ", "_")
            save_dir = os.path.join(graphs_output_dir, safe_model_type)
            os.makedirs(save_dir, exist_ok=True)

            if mode == 'train':
                X_train.to_csv(os.path.join(save_dir, 'X_train.csv'), index=False)
                if y_train is not None:
                    y_train.to_csv(os.path.join(save_dir, 'y_train.csv'), index=False)

                if X_test is not None:
                    X_test.to_csv(os.path.join(save_dir, 'X_test.csv'), index=False)
                if y_test is not None:
                    y_test.to_csv(os.path.join(save_dir, 'y_test.csv'), index=False)
            elif preprocessor.mode == 'clustering':
                if X_processed is not None:
                    X_processed.to_csv(os.path.join(save_dir, 'X_processed.csv'), index=False)

            recommendations.to_csv(os.path.join(save_dir, 'preprocessing_recommendations.csv'))

            logger.info(f"Preprocessed data saved for model '{model_type}' to '{save_dir}'.")
        except Exception as e:
            logger.error(f"Failed to save preprocessed data for model '{model_type}': {e}")
            continue

        # 10) (Optional) Validate Inverse Transformations
        try:
            inv_opts = options.get('inverse_transformations', {})
            inverse_scaling = inv_opts.get('inverse_scaling', True)
            inverse_transformation = inv_opts.get('inverse_transformation', True)
            inverse_encoding = inv_opts.get('inverse_encoding', True)

            # Only do if any of these are True and we have the fitted objects
            if ((inverse_scaling and preprocessor.scaler) or
                (inverse_transformation and preprocessor.transformer) or
                (inverse_encoding and
                    (preprocessor.ordinal_encoder or preprocessor.nominal_encoder))):

                # Columns of the original dataset that are compared against
                # the inverse-transformed output.
                features_to_inverse = (
                    list(preprocessor.numericals)
                    + list(preprocessor.ordinal_categoricals)
                    + list(preprocessor.nominal_categoricals)
                )

                if mode == 'train' and X_test is not None:
                    # Reconstruct the original subset for inverse transformation
                    X_test_original_subset = filtered_df.loc[X_test.index, features_to_inverse]

                    X_test_inverse = preprocessor.final_inverse_transformations(
                        X_test_preprocessed=X_test,
                        X_test_original=X_test_original_subset
                    )
                    # Validate
                    preprocessor.validate_inverse_transformations(
                        X_original=X_test_original_subset,
                        X_inverse=X_test_inverse,
                        tolerance=1e-4
                    )

                elif mode == 'clustering':
                    if X_processed is not None:
                        X_processed_original_subset = filtered_df.loc[X_processed.index, features_to_inverse]
                        X_processed_inverse = preprocessor.final_inverse_transformations(
                            X_test_preprocessed=X_processed,
                            X_test_original=X_processed_original_subset
                        )
                        # Validate
                        preprocessor.validate_inverse_transformations(
                            X_original=X_processed_original_subset,
                            X_inverse=X_processed_inverse,
                            tolerance=1e-4
                        )

                elif mode == 'predict':
                    # Handle prediction-specific inverse transformations if needed
                    pass

        except Exception as e:
            logger.error(f"Inverse transformations or validation failed for '{model_type}': {e}")

    logger.info("✅ All model preprocessing complete.")

# Run the preprocessing script only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()