In [26]:
# datapreprocessor.py

import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Tuple, Any
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC, SMOTEN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot
import joblib  # For saving/loading transformers
from inspect import signature  # For parameter validation in SMOTE

class DataPreprocessor:
    def __init__(
        self,
        model_type: str,
        y_variable: List[str],
        ordinal_categoricals: List[str],
        nominal_categoricals: List[str],
        numericals: List[str],
        mode: str,  # 'train', 'predict', 'clustering'
        options: Optional[Dict] = None,
        debug: bool = False,
        normalize_debug: bool = False,
        normalize_graphs_output: bool = False,
        graphs_output_dir: str = './plots',
        transformers_dir: str = './transformers',
        # New time series parameters:
        time_column: Optional[str] = None,
        window_size: Optional[int] = None,
        horizon: Optional[int] = None,
        step_size: Optional[int] = None,
        max_sequence_length: Optional[int] = None,
        # Remove use_dtw and dynamic_window_adjustment and replace with:
        time_series_sequence_mode: str = "set_window",  # Accepts "set_window", "dtw", "pad", or "variable_length"
        sequence_categorical: Optional[List[str]] = None
    ):
        self.model_type = model_type
        self.y_variable = y_variable
        self.ordinal_categoricals = ordinal_categoricals
        self.nominal_categoricals = nominal_categoricals
        self.numericals = numericals
        self.mode = mode.lower()
        if self.mode not in ['train', 'predict', 'clustering']:
            raise ValueError("Mode must be one of 'train', 'predict', or 'clustering'.")
        self.options = options or {}
        self.debug = debug
        self.normalize_debug = normalize_debug
        self.normalize_graphs_output = normalize_graphs_output
        self.graphs_output_dir = graphs_output_dir
        self.transformers_dir = transformers_dir

        # New time series parameters
        self.time_column = time_column
        self.window_size = window_size
        self.horizon = horizon
        self.step_size = step_size
        self.max_sequence_length = max_sequence_length
        # New consolidated mode for segmentation:
        self.time_series_sequence_mode = time_series_sequence_mode  # "set_window", "dtw", "pad", or "variable_length"
        self.sequence_categorical = sequence_categorical

        # (… rest of initialization remains the same …)
        self.hierarchical_categories = {}
        model_type_lower = self.model_type.lower()
        if any(kw in model_type_lower for kw in ['lstm', 'rnn', 'time series']):
            self.model_category = 'time_series'
        else:
            self.model_category = self.map_model_type_to_category()
        self.categorical_indices = []
        if self.model_category == 'unknown':
            self.logger = logging.getLogger(self.__class__.__name__)
            self.logger.error(f"Model category for '{self.model_type}' is unknown. Check your configuration.")
            raise ValueError(f"Model category for '{self.model_type}' is unknown. Check your configuration.")
        if self.mode in ['train', 'predict']:
            if not self.y_variable:
                raise ValueError("Target variable 'y_variable' must be specified for supervised models in train/predict mode.")
        elif self.mode == 'clustering':
            self.y_variable = []


        # ----------------------------------------------------

        # Initialize other variables
        self.scaler = None
        self.transformer = None
        self.ordinal_encoder = None
        self.nominal_encoder = None
        self.preprocessor = None
        self.smote = None
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        self.preprocessing_steps = []
        self.normality_results = {}
        self.features_to_transform = []
        self.nominal_encoded_feature_names = []
        self.final_feature_order = []

        # Initialize placeholders for clustering-specific transformers
        self.cluster_transformers = {}
        self.cluster_model = None
        self.cluster_labels = None
        self.silhouette_score = None

        # Define default thresholds for SMOTE recommendations
        self.imbalance_threshold = self.options.get('smote_recommendation', {}).get('imbalance_threshold', 0.1)
        self.noise_threshold = self.options.get('smote_recommendation', {}).get('noise_threshold', 0.1)
        self.overlap_threshold = self.options.get('smote_recommendation', {}).get('overlap_threshold', 0.1)
        self.boundary_threshold = self.options.get('smote_recommendation', {}).get('boundary_threshold', 0.1)

        self.pipeline = None  # Initialize pipeline

        # Initialize logging
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        if not self.logger.handlers:
            self.logger.addHandler(handler)
            
        # Initialize feature_reasons with 'all_numericals' for clustering
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        if self.model_category == 'clustering':
            self.feature_reasons['all_numericals'] = ''

    def get_debug_flag(self, flag_name: str) -> bool:
        """
        Retrieve the value of a specific debug flag from the options.
        Args:
            flag_name (str): The name of the debug flag.
        Returns:
            bool: The value of the debug flag.
        """
        return self.options.get(flag_name, False)

    def _log(self, message: str, step: str, level: str = 'info'):
        """
        Internal method to log messages based on the step-specific debug flags.
        
        Args:
            message (str): The message to log.
            step (str): The preprocessing step name.
            level (str): The logging level ('info', 'debug', etc.).
        """
        debug_flag = self.get_debug_flag(f'debug_{step}')
        if debug_flag:
            if level == 'debug':
                self.logger.debug(message)
            elif level == 'info':
                self.logger.info(message)
            elif level == 'warning':
                self.logger.warning(message)
            elif level == 'error':
                self.logger.error(message)

    def map_model_type_to_category(self) -> str:
        """
        Map the model_type string to a predefined category based on keywords.

        Returns:
            str: The model category ('classification', 'regression', 'clustering', etc.).
        """
        classification_keywords = ['classifier', 'classification', 'logistic', 'svm', 'support vector machine', 'knn', 'neural network']
        regression_keywords = ['regressor', 'regression', 'linear', 'knn', 'neural network']  # Removed 'svm'
        clustering_keywords = ['k-means', 'clustering', 'dbscan', 'kmodes', 'kprototypes']

        model_type_lower = self.model_type.lower()

        for keyword in classification_keywords:
            if keyword in model_type_lower:
                return 'classification'

        for keyword in regression_keywords:
            if keyword in model_type_lower:
                return 'regression'

        for keyword in clustering_keywords:
            if keyword in model_type_lower:
                return 'clustering'

        return 'unknown'

    def filter_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        step_name = "filter_columns"
        self.logger.info(f"Step: {step_name}")

        # Combine all feature lists from configuration
        desired_features = self.numericals + self.ordinal_categoricals + self.nominal_categoricals

        # For time series models, ensure the time column is included
        if self.model_category == 'time_series' and self.time_column:
            if self.time_column not in df.columns:
                self.logger.error(f"Time column '{self.time_column}' not found in input data.")
                raise ValueError(f"Time column '{self.time_column}' not found in the input data.")
            # Add the time column if it is not already part of the feature lists
            if self.time_column not in desired_features:
                desired_features.append(self.time_column)

        # Debug log: report target variable info
        self.logger.debug(f"y_variable provided: {self.y_variable}")
        if self.y_variable and all(col in df.columns for col in self.y_variable):
            self.logger.debug(f"Unique values in target column(s): {df[self.y_variable].drop_duplicates().to_dict()}")

        # For 'train' mode, ensure the target variable is present and excluded from features
        if self.mode == 'train':
            if not all(col in df.columns for col in self.y_variable):
                missing_y = [col for col in self.y_variable if col not in df.columns]
                self.logger.error(f"Target variable(s) {missing_y} not found in the input data.")
                raise ValueError(f"Target variable(s) {missing_y} not found in the input data.")
            # Exclude y_variable from features (if present)
            desired_features = [col for col in desired_features if col not in self.y_variable]
            # Retain y_variable in the final DataFrame
            filtered_df = df[desired_features + self.y_variable].copy()
        else:
            # For 'predict' and 'clustering' modes, exclude y_variable from the features
            filtered_df = df[desired_features].copy()

        # Check that all desired features are present in the input DataFrame
        missing_features = [col for col in desired_features if col not in df.columns]
        if missing_features:
            self.logger.error(f"The following required features are missing in the input data: {missing_features}")
            raise ValueError(f"The following required features are missing in the input data: {missing_features}")

        self.logger.info(f"✅ Filtered DataFrame to include only specified features. Shape: {filtered_df.shape}")
        self.logger.debug(f"Selected Features: {desired_features}")
        if self.mode == 'train':
            self.logger.debug(f"Retained Target Variable(s): {self.y_variable}")

        return filtered_df


    def create_sequences_by_category(self, X: np.ndarray, y: np.ndarray, group_ids: np.ndarray) -> Tuple[Any, Any, np.ndarray]:
        # Convert group_ids to tuple keys if more than one grouping column is provided.
        if group_ids.ndim > 1:
            group_keys_full = np.array([tuple(row) for row in group_ids])
        else:
            group_keys_full = group_ids

        unique_groups = np.unique(group_keys_full, axis=0)
        sequences_X = []
        sequences_y = []
        group_keys_list = []
        
        for idx, group in enumerate(unique_groups):
            if group_keys_full.ndim > 1:
                indices = np.where(np.all(group_keys_full == group, axis=1))[0]
            else:
                indices = np.where(group_keys_full == group)[0]
            seq_X = X[indices, :]
            seq_y = y[indices]
            sequences_X.append(seq_X)
            sequences_y.append(seq_y)
            group_keys_list.append(group)
            self.logger.debug(f"Group {group} - seq_y shape: {seq_y.shape}")

        if self.time_series_sequence_mode in ["dtw", "pad"]:
            max_length = max(seq.shape[0] for seq in sequences_X)
            self.logger.debug(f"Maximum sequence length determined: {max_length}")
        # For "variable_length", we leave sequences as they are.

        aligned_X = []
        aligned_y = []
        
        for idx, (seq_X, seq_y) in enumerate(zip(sequences_X, sequences_y)):
            current_length = seq_X.shape[0]
            if self.time_series_sequence_mode == "dtw" and current_length < max_length:
                self.logger.debug(f"Group {unique_groups[idx]}: applying DTW warping. Original shape: {seq_X.shape}")
                original_seq = seq_X.copy()
                path = dtw_path(seq_X, seq_X)
                seq_X_aligned = warp_sequence(seq_X, path, max_length)
                pad_width = max_length - current_length
                seq_y_aligned = np.pad(seq_y, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
                aligned_X.append(seq_X_aligned)
                aligned_y.append(seq_y_aligned)
            elif self.time_series_sequence_mode == "pad" and current_length < max_length:
                self.logger.debug(f"Group {unique_groups[idx]}: applying zero padding. Original shape: {seq_X.shape}")
                pad_width = max_length - current_length
                seq_X_aligned = np.pad(seq_X, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
                seq_y_aligned = np.pad(seq_y, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
                aligned_X.append(seq_X_aligned)
                aligned_y.append(seq_y_aligned)
            else:
                aligned_X.append(seq_X)
                aligned_y.append(seq_y)
        
        if self.time_series_sequence_mode == "variable_length":
            X_seq = aligned_X
            y_seq = aligned_y
        else:
            X_seq = np.array(aligned_X)
            y_seq = np.array(aligned_y)
        
        return X_seq, y_seq, np.array(group_keys_list)




    def apply_dtw_alignment(self, sequences: np.ndarray) -> np.ndarray:
        """
        Align a set of sequences using DTW so that all sequences match the reference length.
        
        Args:
            sequences: Array of sequences with shape (num_sequences, seq_length, num_features)
        
        Returns:
            aligned_sequences: Array of DTW-aligned sequences.
        """
        ref = sequences[0]
        target_length = ref.shape[0]
        aligned_sequences = []
        
        for seq in sequences:
            path = dtw_path(seq, ref)
            aligned_seq = warp_sequence(seq, path, target_length)
            aligned_sequences.append(aligned_seq)
        
        return np.array(aligned_sequences)

    def create_sequences(self, X: np.ndarray, y: np.ndarray) -> Tuple[Any, Any]:
        X_seq, y_seq = [], []
        for i in range(0, len(X) - self.window_size - self.horizon + 1, self.step_size):
            seq_X = X[i:i+self.window_size]
            seq_y = y[i+self.window_size:i+self.window_size+self.horizon]
            if self.time_series_sequence_mode != "variable_length" and self.max_sequence_length and seq_X.shape[0] < self.max_sequence_length:
                pad_width = self.max_sequence_length - seq_X.shape[0]
                seq_X = np.pad(seq_X, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
            X_seq.append(seq_X)
            y_seq.append(seq_y)
        
        if self.time_series_sequence_mode != "variable_length":
            X_seq = np.array(X_seq)
            y_seq = np.array(y_seq)
        
        if isinstance(y_seq, np.ndarray) and y_seq.ndim == 3 and y_seq.shape[-1] == 1:
            y_seq = np.squeeze(y_seq, axis=-1)
            self.logger.debug("Squeezed extra dimension from y_seq to shape: " + str(y_seq.shape))
        
        # If time_series_sequence_mode is "dtw", perform DTW alignment on the sequences.
        if self.time_series_sequence_mode == "dtw":
            if not np.all([seq.shape[0] == X_seq[0].shape[0] for seq in X_seq]):
                X_seq = self.apply_dtw_alignment(X_seq)
            else:
                self.logger.debug("All sequences are already uniform; skipping DTW alignment.")
        
        return X_seq, y_seq


    def temporal_encode_sequences(self, X_seq: Any, group_keys: np.ndarray) -> Any:
        if group_keys.ndim == 1:
            group_keys = group_keys.reshape(-1, 1)
        num_group = group_keys.shape[1]
        for i in range(num_group):
            col_name = self.sequence_categorical[i] if isinstance(self.sequence_categorical, list) else self.sequence_categorical
            if col_name not in self.hierarchical_categories or not self.hierarchical_categories[col_name]:
                self.hierarchical_categories[col_name] = sorted(np.unique(group_keys[:, i]))
                self.logger.debug(f"Hierarchical categories for '{col_name}': {self.hierarchical_categories[col_name]}")
        
        encoded_sequences = []
        for idx, seq in enumerate(X_seq):
            seq_length = seq.shape[0]
            pos_encoding = np.linspace(0, 1, seq_length).reshape(-1, 1)
            if group_keys.shape[1] == 1:
                group_value = group_keys[idx, 0]
                col_name = self.sequence_categorical[0] if isinstance(self.sequence_categorical, list) else self.sequence_categorical
                categories = self.hierarchical_categories[col_name]
                one_hot = np.zeros((seq_length, len(categories)))
                if group_value in categories:
                    one_hot[:, categories.index(group_value)] = 1
                else:
                    self.logger.warning(f"Group key {group_value} not found in categories for '{col_name}'.")
            else:
                one_hot_list = []
                for i in range(group_keys.shape[1]):
                    col_name = self.sequence_categorical[i] if isinstance(self.sequence_categorical, list) else self.sequence_categorical
                    categories = self.hierarchical_categories[col_name]
                    group_value = group_keys[idx, i]
                    one_hot_col = np.zeros((seq_length, len(categories)))
                    if group_value in categories:
                        one_hot_col[:, categories.index(group_value)] = 1
                    else:
                        self.logger.warning(f"Group value {group_value} not found in categories for '{col_name}'.")
                    one_hot_list.append(one_hot_col)
                one_hot = np.concatenate(one_hot_list, axis=1)
        
            seq_encoded = np.concatenate([seq, one_hot, pos_encoding], axis=1)
            encoded_sequences.append(seq_encoded)
        
        if self.time_series_sequence_mode != "variable_length":
            encoded_sequences = np.array(encoded_sequences)
        return encoded_sequences




    def split_dataset(
        self,
        X: pd.DataFrame,
        y: Optional[pd.Series] = None
    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]:
        """
        Split the dataset into training and testing sets while retaining original indices.

        Args:
            X (pd.DataFrame): Features.
            y (Optional[pd.Series]): Target variable.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]: X_train, X_test, y_train, y_test
        """
        step_name = "split_dataset"
        self.logger.info("Step: Split Dataset into Train and Test")

        # Debugging Statements
        self._log(f"Before Split - X shape: {X.shape}", step_name, 'debug')
        if y is not None:
            self._log(f"Before Split - y shape: {y.shape}", step_name, 'debug')
        else:
            self._log("Before Split - y is None", step_name, 'debug')

        # Determine splitting based on mode
        if self.mode == 'train' and self.model_category in ['classification', 'regression']:
            if self.model_category == 'classification':
                stratify = y if self.options.get('split_dataset', {}).get('stratify_for_classification', False) else None
                test_size = self.options.get('split_dataset', {}).get('test_size', 0.2)
                random_state = self.options.get('split_dataset', {}).get('random_state', 42)
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=test_size,
                    stratify=stratify, 
                    random_state=random_state
                )
                self._log("Performed stratified split for classification.", step_name, 'debug')
            elif self.model_category == 'regression':
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=self.options.get('split_dataset', {}).get('test_size', 0.2),
                    random_state=self.options.get('split_dataset', {}).get('random_state', 42)
                )
                self._log("Performed random split for regression.", step_name, 'debug')
        else:
            # For 'predict' and 'clustering' modes or other categories
            X_train = X.copy()
            X_test = None
            y_train = y.copy() if y is not None else None
            y_test = None
            self.logger.info(f"No splitting performed for mode '{self.mode}' or model category '{self.model_category}'.")

        self.preprocessing_steps.append("Split Dataset into Train and Test")

        # Keep Indices Aligned Through Each Step
        if X_test is not None and y_test is not None:
            # Sort both X_test and y_test by index
            X_test = X_test.sort_index()
            y_test = y_test.sort_index()
            self.logger.debug("Sorted X_test and y_test by index for alignment.")

        # Debugging: Log post-split shapes and index alignment
        self._log(f"After Split - X_train shape: {X_train.shape}, X_test shape: {X_test.shape if X_test is not None else 'N/A'}", step_name, 'debug')
        if self.model_category == 'classification' and y_train is not None and y_test is not None:
            self.logger.debug(f"Class distribution in y_train:\n{y_train.value_counts(normalize=True)}")
            self.logger.debug(f"Class distribution in y_test:\n{y_test.value_counts(normalize=True)}")
        elif self.model_category == 'regression' and y_train is not None and y_test is not None:
            self.logger.debug(f"y_train statistics:\n{y_train.describe()}")
            self.logger.debug(f"y_test statistics:\n{y_test.describe()}")

        # Check index alignment
        if y_train is not None and X_train.index.equals(y_train.index):
            self.logger.debug("X_train and y_train indices are aligned.")
        else:
            self.logger.warning("X_train and y_train indices are misaligned.")

        if X_test is not None and y_test is not None and X_test.index.equals(y_test.index):
            self.logger.debug("X_test and y_test indices are aligned.")
        elif X_test is not None and y_test is not None:
            self.logger.warning("X_test and y_test indices are misaligned.")

        return X_train, X_test, y_train, y_test

    def handle_missing_values(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Handle missing values for numerical and categorical features based on user options.
        """
        step_name = "handle_missing_values"
        self.logger.info("Step: Handle Missing Values")

        # Fetch user-defined imputation options or set defaults
        impute_options = self.options.get('handle_missing_values', {})
        numerical_strategy = impute_options.get('numerical_strategy', {})
        categorical_strategy = impute_options.get('categorical_strategy', {})

        # Numerical Imputation
        numerical_imputer = None
        new_columns = []
        if self.numericals:
            if self.model_category in ['regression', 'classification', 'clustering']:
                default_num_strategy = 'median'  # Changed to median as per preprocessor_config.yaml
            else:
                default_num_strategy = 'median'
            num_strategy = numerical_strategy.get('strategy', default_num_strategy)
            num_imputer_type = numerical_strategy.get('imputer', 'SimpleImputer')  # Can be 'SimpleImputer', 'KNNImputer', etc.

            self._log(f"Numerical Imputation Strategy: {num_strategy.capitalize()}, Imputer Type: {num_imputer_type}", step_name, 'debug')

            # Initialize numerical imputer based on user option
            if num_imputer_type == 'SimpleImputer':
                numerical_imputer = SimpleImputer(strategy=num_strategy)
            elif num_imputer_type == 'KNNImputer':
                knn_neighbors = numerical_strategy.get('knn_neighbors', 5)
                numerical_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                self.logger.error(f"Numerical imputer type '{num_imputer_type}' is not supported.")
                raise ValueError(f"Numerical imputer type '{num_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[self.numericals] = numerical_imputer.fit_transform(X_train[self.numericals])
            self.numerical_imputer = numerical_imputer  # Assign to self for saving
            self.feature_reasons.update({col: self.feature_reasons.get(col, '') + f'Numerical: {num_strategy.capitalize()} Imputation | ' for col in self.numericals})
            new_columns.extend(self.numericals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[self.numericals] = numerical_imputer.transform(X_test[self.numericals])

        # Categorical Imputation
        categorical_imputer = None
        all_categoricals = self.ordinal_categoricals + self.nominal_categoricals
        if all_categoricals:
            default_cat_strategy = 'most_frequent'
            cat_strategy = categorical_strategy.get('strategy', default_cat_strategy)
            cat_imputer_type = categorical_strategy.get('imputer', 'SimpleImputer')

            self._log(f"Categorical Imputation Strategy: {cat_strategy.capitalize()}, Imputer Type: {cat_imputer_type}", step_name, 'debug')

            # Initialize categorical imputer based on user option
            if cat_imputer_type == 'SimpleImputer':
                categorical_imputer = SimpleImputer(strategy=cat_strategy)
            elif cat_imputer_type == 'ConstantImputer':
                fill_value = categorical_strategy.get('fill_value', 'Missing')
                categorical_imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
            else:
                self.logger.error(f"Categorical imputer type '{cat_imputer_type}' is not supported.")
                raise ValueError(f"Categorical imputer type '{cat_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[all_categoricals] = categorical_imputer.fit_transform(X_train[all_categoricals])
            self.categorical_imputer = categorical_imputer  # Assign to self for saving
            self.feature_reasons.update({
                col: self.feature_reasons.get(col, '') + (f'Categorical: Constant Imputation (Value={categorical_strategy.get("fill_value", "Missing")}) | ' if cat_imputer_type == 'ConstantImputer' else f'Categorical: {cat_strategy.capitalize()} Imputation | ')
                for col in all_categoricals
            })
            new_columns.extend(all_categoricals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[all_categoricals] = categorical_imputer.transform(X_test[all_categoricals])

        self.preprocessing_steps.append("Handle Missing Values")

        # Debugging: Log post-imputation shapes and missing values
        self._log(f"Completed: Handle Missing Values. Dataset shape after imputation: {X_train.shape}", step_name, 'debug')
        self._log(f"Missing values after imputation in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        self._log(f"New columns handled: {new_columns}", step_name, 'debug')

        return X_train, X_test

    def handle_outliers(self, X_train: pd.DataFrame, y_train: Optional[pd.Series] = None) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """
        Handle outliers based on the model's sensitivity and user options.
        For time_series models, apply a custom outlier handling using a rolling median filter
        to replace extreme values rather than dropping rows (to preserve temporal alignment).

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series, optional): Training target.

        Returns:
            tuple: X_train with outliers handled and corresponding y_train.
        """
        step_name = "handle_outliers"
        self.logger.info("Step: Handle Outliers")
        self._log("Starting outlier handling.", step_name, 'debug')
        debug_flag = self.get_debug_flag('debug_handle_outliers')
        initial_shape = X_train.shape[0]
        outlier_options = self.options.get('handle_outliers', {})
        zscore_threshold = outlier_options.get('zscore_threshold', 3)
        iqr_multiplier = outlier_options.get('iqr_multiplier', 1.5)
        isolation_contamination = outlier_options.get('isolation_contamination', 0.05)

        # ----- NEW: Custom outlier handling branch for time series -----
        if self.model_category == 'time_series':
            self.logger.info("Applying custom outlier handling for time_series using rolling median filter.")
            # For time series, do not drop rows—instead, replace outliers with the rolling median.
            for col in self.numericals:
                # Compute rolling statistics with a window of 5 (centered)
                rolling_median = X_train[col].rolling(window=5, center=True, min_periods=1).median()
                rolling_q1 = X_train[col].rolling(window=5, center=True, min_periods=1).quantile(0.25)
                rolling_q3 = X_train[col].rolling(window=5, center=True, min_periods=1).quantile(0.75)
                rolling_iqr = rolling_q3 - rolling_q1
                # Identify outliers as those deviating more than the multiplier times the rolling IQR
                outlier_mask = abs(X_train[col] - rolling_median) > (iqr_multiplier * rolling_iqr)
                num_outliers = outlier_mask.sum()
                # Replace outlier values with the corresponding rolling median
                X_train.loc[outlier_mask, col] = rolling_median[outlier_mask]
                self.logger.debug(f"Replaced {num_outliers} outliers in column '{col}' with rolling median.")
            self.preprocessing_steps.append("Handle Outliers (time_series custom)")
            self._log(f"Completed: Handle Outliers for time_series. Initial samples: {initial_shape}, Final samples: {X_train.shape[0]}", step_name, 'debug')
            return X_train, y_train
        # -----------------------------------------------------------------

        # Existing outlier handling for regression and classification
        if self.model_category in ['regression', 'classification']:
            self.logger.info(f"Applying univariate outlier detection for {self.model_category}.")
            for col in self.numericals:
                # Z-Score Filtering
                apply_zscore = outlier_options.get('apply_zscore', True)
                if apply_zscore:
                    z_scores = np.abs((X_train[col] - X_train[col].mean()) / X_train[col].std())
                    mask_z = z_scores < zscore_threshold
                    removed_z = (~mask_z).sum()
                    X_train = X_train[mask_z]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with Z-Score Filtering (threshold={zscore_threshold}) | '
                    self._log(f"Removed {removed_z} outliers from '{col}' using Z-Score Filtering.", step_name, 'debug')

                # IQR Filtering
                apply_iqr = outlier_options.get('apply_iqr', True)
                if apply_iqr:
                    Q1 = X_train[col].quantile(0.25)
                    Q3 = X_train[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_multiplier * IQR
                    upper_bound = Q3 + iqr_multiplier * IQR
                    mask_iqr = (X_train[col] >= lower_bound) & (X_train[col] <= upper_bound)
                    removed_iqr = (~mask_iqr).sum()
                    X_train = X_train[mask_iqr]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with IQR Filtering (multiplier={iqr_multiplier}) | '
                    self._log(f"Removed {removed_iqr} outliers from '{col}' using IQR Filtering.", step_name, 'debug')

        elif self.model_category == 'clustering':
            self.logger.info("Applying multivariate IsolationForest for clustering.")
            contamination = isolation_contamination
            iso_forest = IsolationForest(contamination=contamination, random_state=42)
            preds = iso_forest.fit_predict(X_train[self.numericals])
            mask_iso = preds != -1
            removed_iso = (preds == -1).sum()
            X_train = X_train[mask_iso]
            if y_train is not None:
                y_train = y_train.loc[X_train.index]
            self.feature_reasons['all_numericals'] += f'Outliers handled with Multivariate IsolationForest (contamination={contamination}) | '
            self._log(f"Removed {removed_iso} outliers using Multivariate IsolationForest.", step_name, 'debug')
        else:
            self.logger.warning(f"Model category '{self.model_category}' not recognized for outlier handling.")

        self.preprocessing_steps.append("Handle Outliers")
        self._log(f"Completed: Handle Outliers. Initial samples: {initial_shape}, Final samples: {X_train.shape[0]}", step_name, 'debug')
        self._log(f"Missing values after outlier handling in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        return X_train, y_train


    def test_normality(self, X_train: pd.DataFrame) -> Dict[str, Dict]:
        """
        Test normality for numerical features based on normality tests and user options.

        Args:
            X_train (pd.DataFrame): Training features.

        Returns:
            Dict[str, Dict]: Dictionary with normality test results for each numerical feature.
        """
        step_name = "Test for Normality"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_test_normality')
        normality_results = {}

        # Fetch user-defined normality test options or set defaults
        normality_options = self.options.get('test_normality', {})
        p_value_threshold = normality_options.get('p_value_threshold', 0.05)
        skewness_threshold = normality_options.get('skewness_threshold', 1.0)
        additional_tests = normality_options.get('additional_tests', [])  # e.g., ['anderson-darling']

        for col in self.numericals:
            data = X_train[col].dropna()
            skewness = data.skew()
            kurtosis = data.kurtosis()

            # Determine which normality test to use based on sample size and user options
            test_used = 'Shapiro-Wilk'
            p_value = 0.0

            if len(data) <= 5000:
                from scipy.stats import shapiro
                stat, p_val = shapiro(data)
                test_used = 'Shapiro-Wilk'
                p_value = p_val
            else:
                from scipy.stats import anderson
                result = anderson(data)
                test_used = 'Anderson-Darling'
                # Determine p-value based on critical values
                p_value = 0.0  # Default to 0
                for cv, sig in zip(result.critical_values, result.significance_level):
                    if result.statistic < cv:
                        p_value = sig / 100
                        break

            # Apply user-defined or default criteria
            if self.model_category in ['regression', 'classification', 'clustering']:
                # Linear, Logistic Regression, and Clustering: Use p-value and skewness
                needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
            else:
                # Other models: Use skewness, and optionally p-values based on options
                use_p_value = normality_options.get('use_p_value_other_models', False)
                if use_p_value:
                    needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
                else:
                    needs_transform = abs(skewness) > skewness_threshold

            normality_results[col] = {
                'skewness': skewness,
                'kurtosis': kurtosis,
                'p_value': p_value,
                'test_used': test_used,
                'needs_transform': needs_transform
            }

            # Conditional Detailed Logging
            if debug_flag:
                self._log(f"Feature '{col}': p-value={p_value:.4f}, skewness={skewness:.4f}, needs_transform={needs_transform}", step_name, 'debug')

        self.normality_results = normality_results
        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Normality results computed.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Normality results computed.")

        return normality_results

    def encode_categorical_variables(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Encode categorical variables using user-specified encoding strategies.
        """
        step_name = "encode_categorical_variables"
        self.logger.info("Step: Encode Categorical Variables")
        self._log("Starting categorical variable encoding.", step_name, 'debug')

        # Fetch user-defined encoding options or set defaults
        encoding_options = self.options.get('encode_categoricals', {})
        ordinal_encoding = encoding_options.get('ordinal_encoding', 'OrdinalEncoder')  # Options: 'OrdinalEncoder', 'None'
        nominal_encoding = encoding_options.get('nominal_encoding', 'OneHotEncoder')  # Changed from 'OneHotEncoder' to 'OrdinalEncoder'
        handle_unknown = encoding_options.get('handle_unknown', 'use_encoded_value')  # Adjusted for OrdinalEncoder

        # Determine if SMOTENC is being used
        smote_variant = self.options.get('implement_smote', {}).get('variant', None)
        if smote_variant == 'SMOTENC':
            nominal_encoding = 'OrdinalEncoder'  # Ensure compatibility

        transformers = []
        new_columns = []
        if self.ordinal_categoricals and ordinal_encoding != 'None':
            if ordinal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('ordinal', OrdinalEncoder(), self.ordinal_categoricals)
                )
                self._log(f"Added OrdinalEncoder for features: {self.ordinal_categoricals}", step_name, 'debug')
            else:
                self.logger.error(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
                raise ValueError(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
        if self.nominal_categoricals and nominal_encoding != 'None':
            if nominal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('nominal', OrdinalEncoder(handle_unknown=handle_unknown), self.nominal_categoricals)
                )
                self._log(f"Added OrdinalEncoder for features: {self.nominal_categoricals}", step_name, 'debug')
            elif nominal_encoding == 'FrequencyEncoder':
                # Custom Frequency Encoding
                for col in self.nominal_categoricals:
                    freq = X_train[col].value_counts(normalize=True)
                    X_train[col] = X_train[col].map(freq)
                    if X_test is not None:
                        X_test[col] = X_test[col].map(freq).fillna(0)
                    self.feature_reasons[col] += 'Encoded with Frequency Encoding | '
                    self._log(f"Applied Frequency Encoding to '{col}'.", step_name, 'debug')
            else:
                self.logger.error(f"Nominal encoding method '{nominal_encoding}' is not supported.")
                raise ValueError(f"Nominal encoding method '{nominal_encoding}' is not supported.")

        if not transformers and 'FrequencyEncoder' not in nominal_encoding:
            self.logger.info("No categorical variables to encode.")
            self.preprocessing_steps.append("Encode Categorical Variables")
            self._log(f"Completed: Encode Categorical Variables. No encoding was applied.", step_name, 'debug')
            return X_train, X_test

        if transformers:
            self.preprocessor = ColumnTransformer(
                transformers=transformers,
                remainder='passthrough',
                verbose_feature_names_out=False  # Disable prefixing
            )

            # Fit and transform training data
            X_train_encoded = self.preprocessor.fit_transform(X_train)
            self._log("Fitted and transformed X_train with ColumnTransformer.", step_name, 'debug')

            # Transform testing data
            if X_test is not None:
                X_test_encoded = self.preprocessor.transform(X_test)
                self._log("Transformed X_test with fitted ColumnTransformer.", step_name, 'debug')
            else:
                X_test_encoded = None

            # Retrieve feature names after encoding
            encoded_feature_names = []
            if self.ordinal_categoricals and ordinal_encoding == 'OrdinalEncoder':
                encoded_feature_names += self.ordinal_categoricals
            if self.nominal_categoricals and nominal_encoding == 'OrdinalEncoder':
                encoded_feature_names += self.nominal_categoricals
            elif self.nominal_categoricals and nominal_encoding == 'FrequencyEncoder':
                encoded_feature_names += self.nominal_categoricals
            passthrough_features = [col for col in X_train.columns if col not in self.ordinal_categoricals + self.nominal_categoricals]
            encoded_feature_names += passthrough_features
            new_columns.extend(encoded_feature_names)

            # Convert numpy arrays back to DataFrames
            X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names, index=X_train.index)
            if X_test_encoded is not None:
                X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names, index=X_test.index)
            else:
                X_test_encoded_df = None

            # Store encoders for inverse transformation
            self.ordinal_encoder = self.preprocessor.named_transformers_.get('ordinal', None)
            self.nominal_encoder = self.preprocessor.named_transformers_.get('nominal', None)

            self.preprocessing_steps.append("Encode Categorical Variables")
            self._log(f"Completed: Encode Categorical Variables. X_train_encoded shape: {X_train_encoded_df.shape}", step_name, 'debug')
            self._log(f"Columns after encoding: {encoded_feature_names}", step_name, 'debug')
            self._log(f"Sample of encoded X_train:\n{X_train_encoded_df.head()}", step_name, 'debug')
            self._log(f"New columns added: {new_columns}", step_name, 'debug')

            return X_train_encoded_df, X_test_encoded_df

    def generate_recommendations(self) -> pd.DataFrame:
        """
        Generate a table of preprocessing recommendations based on the model type, data, and user options.

        Returns:
            pd.DataFrame: DataFrame containing recommendations for each feature.
        """
        step_name = "Generate Preprocessor Recommendations"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_generate_recommendations')

        # Generate recommendations based on feature reasons
        recommendations = {}
        for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals:
            reasons = self.feature_reasons.get(col, '').strip(' | ')
            recommendations[col] = reasons

        recommendations_table = pd.DataFrame.from_dict(
            recommendations, 
            orient='index', 
            columns=['Preprocessing Reason']
        )
        if debug_flag:
            self.logger.debug(f"Preprocessing Recommendations:\n{recommendations_table}")
        else:
            self.logger.info("Preprocessing Recommendations generated.")

        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Recommendations generated.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Recommendations generated.")

        return recommendations_table

    def save_transformers(self):
        step_name = "Save Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_save_transformers')
        
        # Ensure the transformers directory exists
        os.makedirs(self.transformers_dir, exist_ok=True)
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')  # Consistent file path
        
        transformers = {
            'numerical_imputer': getattr(self, 'numerical_imputer', None),
            'categorical_imputer': getattr(self, 'categorical_imputer', None),
            'preprocessor': self.pipeline,   # Includes all preprocessing steps
            'smote': self.smote,
            'final_feature_order': self.final_feature_order,
            'categorical_indices': self.categorical_indices
        }
        try:
            joblib.dump(transformers, transformers_path)
            if debug_flag:
                self._log(f"Transformers saved at '{transformers_path}'.", step_name, 'debug')
            else:
                self.logger.info(f"Transformers saved at '{transformers_path}'.")
        except Exception as e:
            self.logger.error(f"❌ Failed to save transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)

    def load_transformers(self) -> dict:
        step_name = "Load Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_load_transformers')  # Assuming a step-specific debug flag
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')  # Correct path

        # Debug log
        self.logger.debug(f"Loading transformers from: {transformers_path}")

        if not os.path.exists(transformers_path):
            self.logger.error(f"❌ Transformers file not found at '{transformers_path}'. Cannot proceed with prediction.")
            raise FileNotFoundError(f"Transformers file not found at '{transformers_path}'.")

        try:
            transformers = joblib.load(transformers_path)

            # Extract transformers
            numerical_imputer = transformers.get('numerical_imputer')
            categorical_imputer = transformers.get('categorical_imputer')
            preprocessor = transformers.get('preprocessor')
            smote = transformers.get('smote', None)
            final_feature_order = transformers.get('final_feature_order', [])
            categorical_indices = transformers.get('categorical_indices', [])
            self.categorical_indices = categorical_indices  # Set the attribute

            # **Post-Loading Debugging:**
            if preprocessor is not None:
                try:
                    # Do not attempt to transform dummy data here
                    self.logger.debug(f"Pipeline loaded. Ready to transform new data.")
                except AttributeError as e:
                    self.logger.error(f"Pipeline's get_feature_names_out is not available: {e}")
                    expected_features = []
            else:
                self.logger.error("❌ Preprocessor is not loaded.")
                raise AttributeError("Preprocessor is not loaded.")

        except Exception as e:
            self.logger.error(f"❌ Failed to load transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)

        # Additional checks
        if preprocessor is None:
            self.logger.error("❌ Preprocessor is not loaded.")

        if debug_flag:
            self._log(f"Transformers loaded successfully from '{transformers_path}'.", step_name, 'debug')
        else:
            self.logger.info(f"Transformers loaded successfully from '{transformers_path}'.")

        # Set the pipeline
        self.pipeline = preprocessor

        # Return the transformers as a dictionary
        return {
            'numerical_imputer': numerical_imputer,
            'categorical_imputer': categorical_imputer,
            'preprocessor': preprocessor,
            'smote': smote,
            'final_feature_order': final_feature_order,
            'categorical_indices': categorical_indices
        }

    def apply_scaling(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Apply scaling based on the model type and user options.

        Args:
            X_train (pd.DataFrame): Training features.
            X_test (Optional[pd.DataFrame]): Testing features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Scaled X_train and X_test.
        """
        step_name = "Apply Scaling (If Needed by Model)"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_apply_scaling')

        # Fetch user-defined scaling options or set defaults
        scaling_options = self.options.get('apply_scaling', {})
        scaling_method = scaling_options.get('method', None)  # 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'None'
        features_to_scale = scaling_options.get('features', self.numericals)

        scaler = None
        scaling_type = 'None'

        if scaling_method is None:
            # Default scaling based on model category
            if self.model_category in ['regression', 'classification', 'clustering']:
                # For clustering, MinMaxScaler is generally preferred
                if self.model_category == 'clustering':
                    scaler = MinMaxScaler()
                    scaling_type = 'MinMaxScaler'
                else:
                    scaler = StandardScaler()
                    scaling_type = 'StandardScaler'
            else:
                scaler = None
                scaling_type = 'None'
        else:
            # Normalize the scaling_method string to handle case-insensitivity
            scaling_method_normalized = scaling_method.lower()
            if scaling_method_normalized == 'standardscaler':
                scaler = StandardScaler()
                scaling_type = 'StandardScaler'
            elif scaling_method_normalized == 'minmaxscaler':
                scaler = MinMaxScaler()
                scaling_type = 'MinMaxScaler'
            elif scaling_method_normalized == 'robustscaler':
                scaler = RobustScaler()
                scaling_type = 'RobustScaler'
            elif scaling_method_normalized == 'none':
                scaler = None
                scaling_type = 'None'
            else:
                self.logger.error(f"Scaling method '{scaling_method}' is not supported.")
                raise ValueError(f"Scaling method '{scaling_method}' is not supported.")

        # Apply scaling if scaler is defined
        if scaler is not None and features_to_scale:
            self.scaler = scaler
            if debug_flag:
                self._log(f"Features to scale: {features_to_scale}", step_name, 'debug')

            # Check if features exist in the dataset
            missing_features = [feat for feat in features_to_scale if feat not in X_train.columns]
            if missing_features:
                self.logger.error(f"The following features specified for scaling are missing in the dataset: {missing_features}")
                raise KeyError(f"The following features specified for scaling are missing in the dataset: {missing_features}")

            X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
            if X_test is not None:
                X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

            for col in features_to_scale:
                self.feature_reasons[col] += f'Scaling Applied: {scaling_type} | '

            self.preprocessing_steps.append(step_name)
            if debug_flag:
                self._log(f"Applied {scaling_type} to features: {features_to_scale}", step_name, 'debug')
                if hasattr(scaler, 'mean_'):
                    self._log(f"Scaler Parameters: mean={scaler.mean_}", step_name, 'debug')
                if hasattr(scaler, 'scale_'):
                    self._log(f"Scaler Parameters: scale={scaler.scale_}", step_name, 'debug')
                self._log(f"Sample of scaled X_train:\n{X_train[features_to_scale].head()}", step_name, 'debug')
                if X_test is not None:
                    self._log(f"Sample of scaled X_test:\n{X_test[features_to_scale].head()}", step_name, 'debug')
            else:
                self.logger.info(f"Step '{step_name}' completed: Applied {scaling_type} to features: {features_to_scale}")
        else:
            self.logger.info("No scaling applied based on user options or no features specified.")
            self.preprocessing_steps.append(step_name)
            if debug_flag:
                self._log(f"Completed: {step_name}. No scaling was applied.", step_name, 'debug')
            else:
                self.logger.info(f"Step '{step_name}' completed: No scaling was applied.")

        return X_train, X_test

    def determine_n_neighbors(self, minority_count: int, default_neighbors: int = 5) -> int:
        """
        Determine the appropriate number of neighbors for SMOTE based on minority class size.

        Args:
            minority_count (int): Number of samples in the minority class.
            default_neighbors (int): Default number of neighbors to use if possible.

        Returns:
            int: Determined number of neighbors for SMOTE.
        """
        if minority_count <= 1:
            raise ValueError("SMOTE cannot be applied when the minority class has less than 2 samples.")
        
        # Ensure n_neighbors does not exceed minority_count - 1
        n_neighbors = min(default_neighbors, minority_count - 1)
        return n_neighbors

    def implement_smote(self, X_train: pd.DataFrame, y_train: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Implement SMOTE or its variants based on class imbalance with automated n_neighbors selection.

        Args:
            X_train (pd.DataFrame): Training features (transformed).
            y_train (pd.Series): Training target.

        Returns:
            Tuple[pd.DataFrame, pd.Series]: Resampled X_train and y_train.
        """
        step_name = "Implement SMOTE (Train Only)"
        self.logger.info(f"Step: {step_name}")

        # Check if classification
        if self.model_category != 'classification':
            self.logger.info("SMOTE not applicable: Not a classification model.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        # Calculate class distribution
        class_counts = y_train.value_counts()
        if len(class_counts) < 2:
            self.logger.warning("SMOTE not applicable: Only one class present.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        majority_class = class_counts.idxmax()
        minority_class = class_counts.idxmin()
        majority_count = class_counts.max()
        minority_count = class_counts.min()
        imbalance_ratio = minority_count / majority_count
        self.logger.info(f"Class Distribution before SMOTE: {class_counts.to_dict()}")
        self.logger.info(f"Imbalance Ratio (Minority/Majority): {imbalance_ratio:.4f}")

        # Determine SMOTE variant based on dataset composition
        has_numericals = len(self.numericals) > 0
        has_categoricals = len(self.ordinal_categoricals) + len(self.nominal_categoricals) > 0

        # Automatically select SMOTE variant
        if has_numericals and has_categoricals:
            smote_variant = 'SMOTENC'
            self.logger.info("Dataset contains both numerical and categorical features. Using SMOTENC.")
        elif has_numericals and not has_categoricals:
            smote_variant = 'SMOTE'
            self.logger.info("Dataset contains only numerical features. Using SMOTE.")
        elif has_categoricals and not has_numericals:
            smote_variant = 'SMOTEN'
            self.logger.info("Dataset contains only categorical features. Using SMOTEN.")
        else:
            smote_variant = 'SMOTE'  # Fallback
            self.logger.info("Feature composition unclear. Using SMOTE as default.")

        # Initialize SMOTE based on the variant
        try:
            if smote_variant == 'SMOTENC':
                if not self.categorical_indices:
                    # Determine categorical indices if not already set
                    categorical_features = []
                    for name, transformer, features in self.pipeline.transformers_:
                        if 'ord' in name or 'nominal' in name:
                            if isinstance(transformer, Pipeline):
                                encoder = transformer.named_steps.get('ordinal_encoder') or transformer.named_steps.get('onehot_encoder')
                                if hasattr(encoder, 'categories_'):
                                    # Calculate indices based on transformers order
                                    # This can be complex; for simplicity, assuming categorical features are the first
                                    categorical_features.extend(range(len(features)))
                    self.categorical_indices = categorical_features
                    self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTENC(categorical_features=self.categorical_indices, random_state=42, k_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTENC with categorical features indices: {self.categorical_indices} and n_neighbors={n_neighbors}")
            elif smote_variant == 'SMOTEN':
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTEN(random_state=42, n_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTEN with n_neighbors={n_neighbors}")
            else:
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTE(random_state=42, k_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTE with n_neighbors={n_neighbors}")
        except ValueError as ve:
            self.logger.error(f"❌ SMOTE initialization failed: {ve}")
            raise
        except Exception as e:
            self.logger.error(f"❌ Unexpected error during SMOTE initialization: {e}")
            raise

        # Apply SMOTE
        try:
            X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
            self.logger.info(f"Applied {smote_variant}. Resampled dataset shape: {X_resampled.shape}")
            self.preprocessing_steps.append("Implement SMOTE")
            self.smote = smote  # Assign to self for saving
            self.logger.debug(f"Selected n_neighbors for SMOTE: {n_neighbors}")
            return X_resampled, y_resampled
        except Exception as e:
            self.logger.error(f"❌ SMOTE application failed: {e}")
            raise

    def inverse_transform_data(self, X_transformed: np.ndarray, original_data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Perform inverse transformation on the transformed data to reconstruct original feature values.

        Args:
            X_transformed (np.ndarray): The transformed feature data.
            original_data (Optional[pd.DataFrame]): The original data before transformation.

        Returns:
            pd.DataFrame: The inverse-transformed DataFrame including passthrough columns.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")

        preprocessor = self.pipeline
        logger = logging.getLogger('InverseTransform')
        if self.debug or self.get_debug_flag('debug_final_inverse_transformations'):
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

        logger.debug(f"[DEBUG Inverse] Starting inverse transformation. Input shape: {X_transformed.shape}")

        # Initialize variables
        inverse_data = {}
        transformations_applied = False  # Flag to check if any transformations are applied
        start_idx = 0  # Starting index for slicing

        # Iterate over each transformer in the ColumnTransformer
        for name, transformer, features in preprocessor.transformers_:
            if name == 'remainder':
                logger.debug(f"[DEBUG Inverse] Skipping 'remainder' transformer (passthrough columns).")
                continue  # Skip passthrough columns

            end_idx = start_idx + len(features)
            logger.debug(f"[DEBUG Inverse] Transformer '{name}' handling features {features} with slice {start_idx}:{end_idx}")

            # Check if the transformer has an inverse_transform method
            if hasattr(transformer, 'named_steps'):
                # Access the last step in the pipeline (e.g., scaler or encoder)
                last_step = list(transformer.named_steps.keys())[-1]
                inverse_transformer = transformer.named_steps[last_step]

                if hasattr(inverse_transformer, 'inverse_transform'):
                    transformed_slice = X_transformed[:, start_idx:end_idx]
                    inverse_slice = inverse_transformer.inverse_transform(transformed_slice)

                    # Assign inverse-transformed data to the corresponding feature names
                    for idx, feature in enumerate(features):
                        inverse_data[feature] = inverse_slice[:, idx]

                    logger.debug(f"[DEBUG Inverse] Applied inverse_transform on transformer '{last_step}' for features {features}.")
                    transformations_applied = True
                else:
                    logger.debug(f"[DEBUG Inverse] Transformer '{last_step}' does not support inverse_transform. Skipping.")
            else:
                logger.debug(f"[DEBUG Inverse] Transformer '{name}' does not have 'named_steps'. Skipping.")

            start_idx = end_idx  # Update starting index for next transformer

        # Convert the inverse_data dictionary to a DataFrame
        if transformations_applied:
            inverse_df = pd.DataFrame(inverse_data, index=original_data.index if original_data is not None else None)
            logger.debug(f"[DEBUG Inverse] Inverse DataFrame shape (transformed columns): {inverse_df.shape}")
            logger.debug(f"[DEBUG Inverse] Sample of inverse-transformed data:\n{inverse_df.head()}")
        else:
            if original_data is not None:
                logger.warning("⚠️ No reversible transformations were applied. Returning original data.")
                inverse_df = original_data.copy()
                logger.debug(f"[DEBUG Inverse] Returning a copy of original_data with shape: {inverse_df.shape}")
            else:
                logger.error("❌ No transformations were applied and original_data was not provided. Cannot perform inverse transformation.")
                raise ValueError("No transformations were applied and original_data was not provided.")

        # Identify passthrough columns by excluding transformed features
        if original_data is not None and transformations_applied:
            transformed_features = set(inverse_data.keys())
            all_original_features = set(original_data.columns)
            passthrough_columns = list(all_original_features - transformed_features)
            logger.debug(f"[DEBUG Inverse] Inverse DataFrame columns before pass-through merge: {inverse_df.columns.tolist()}")
            logger.debug(f"[DEBUG Inverse] all_original_features: {list(all_original_features)}")
            logger.debug(f"[DEBUG Inverse] passthrough_columns: {passthrough_columns}")

            if passthrough_columns:
                logger.debug(f"[DEBUG Inverse] Passthrough columns to merge: {passthrough_columns}")
                passthrough_data = original_data[passthrough_columns].copy()
                inverse_df = pd.concat([inverse_df, passthrough_data], axis=1)

                # Ensure the final DataFrame has the same column order as original_data
                inverse_df = inverse_df[original_data.columns]
                logger.debug(f"[DEBUG Inverse] Final inverse DataFrame shape: {inverse_df.shape}")
                
                # Check for missing columns after inverse transform
                expected_columns = set(original_data.columns)
                final_columns = set(inverse_df.columns)
                missing_after_inverse = expected_columns - final_columns

                if missing_after_inverse:
                    err_msg = (
                    f"Inverse transform error: The following columns are missing "
                    f"after inverse transform: {missing_after_inverse}"
                    )
                    logger.error(err_msg)
                    raise ValueError(err_msg)
            else:
                logger.debug("[DEBUG Inverse] No passthrough columns to merge.")
        else:
            logger.debug("[DEBUG Inverse] Either no original_data provided or no transformations were applied.")

        return inverse_df



    def build_pipeline(self, X_train: pd.DataFrame) -> ColumnTransformer:
        transformers = []

        # Handle Numerical Features
        if self.numericals:
            numerical_strategy = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('strategy', 'median')
            numerical_imputer = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('imputer', 'SimpleImputer')

            if numerical_imputer == 'SimpleImputer':
                num_imputer = SimpleImputer(strategy=numerical_strategy)
            elif numerical_imputer == 'KNNImputer':
                knn_neighbors = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('knn_neighbors', 5)
                num_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                raise ValueError(f"Unsupported numerical imputer type: {numerical_imputer}")

            # Determine scaling method
            scaling_method = self.options.get('apply_scaling', {}).get('method', None)
            if scaling_method is None:
                # Default scaling based on model category
                if self.model_category in ['regression', 'classification', 'clustering']:
                    # For clustering, MinMaxScaler is generally preferred
                    if self.model_category == 'clustering':
                        scaler = MinMaxScaler()
                        scaling_type = 'MinMaxScaler'
                    else:
                        scaler = StandardScaler()
                        scaling_type = 'StandardScaler'
                else:
                    scaler = 'passthrough'
                    scaling_type = 'None'
            else:
                # Normalize the scaling_method string to handle case-insensitivity
                scaling_method_normalized = scaling_method.lower()
                if scaling_method_normalized == 'standardscaler':
                    scaler = StandardScaler()
                    scaling_type = 'StandardScaler'
                elif scaling_method_normalized == 'minmaxscaler':
                    scaler = MinMaxScaler()
                    scaling_type = 'MinMaxScaler'
                elif scaling_method_normalized == 'robustscaler':
                    scaler = RobustScaler()
                    scaling_type = 'RobustScaler'
                elif scaling_method_normalized == 'none':
                    scaler = 'passthrough'
                    scaling_type = 'None'
                else:
                    raise ValueError(f"Unsupported scaling method: {scaling_method}")

            numerical_transformer = Pipeline(steps=[
                ('imputer', num_imputer),
                ('scaler', scaler)
            ])

            transformers.append(('num', numerical_transformer, self.numericals))
            self.logger.debug(f"Numerical transformer added with imputer '{numerical_imputer}' and scaler '{scaling_type}'.")

        # Handle Ordinal Categorical Features
        if self.ordinal_categoricals:
            ordinal_strategy = self.options.get('encode_categoricals', {}).get('ordinal_encoding', 'OrdinalEncoder')
            if ordinal_strategy == 'OrdinalEncoder':
                ordinal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('ordinal_encoder', OrdinalEncoder())
                ])
                transformers.append(('ord', ordinal_transformer, self.ordinal_categoricals))
                self.logger.debug("Ordinal transformer added with OrdinalEncoder.")
            else:
                raise ValueError(f"Unsupported ordinal encoding strategy: {ordinal_strategy}")

        # Handle Nominal Categorical Features
        if self.nominal_categoricals:
            nominal_strategy = self.options.get('encode_categoricals', {}).get('nominal_encoding', 'OneHotEncoder')
            if nominal_strategy == 'OneHotEncoder':
                nominal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
                ])
                transformers.append(('nominal', nominal_transformer, self.nominal_categoricals))
                self.logger.debug("Nominal transformer added with OneHotEncoder.")
            elif nominal_strategy == 'OrdinalEncoder':
                nominal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('ordinal_encoder', OrdinalEncoder())
                ])
                transformers.append(('nominal_ord', nominal_transformer, self.nominal_categoricals))
                self.logger.debug("Nominal transformer added with OrdinalEncoder.")
            elif nominal_strategy == 'FrequencyEncoder':
                # Implement custom Frequency Encoding
                for feature in self.nominal_categoricals:
                    freq = X_train[feature].value_counts(normalize=True)
                    X_train[feature] = X_train[feature].map(freq)
                    self.feature_reasons[feature] += 'Frequency Encoding applied | '
                    self.logger.debug(f"Frequency Encoding applied to '{feature}'.")
            else:
                raise ValueError(f"Unsupported nominal encoding strategy: {nominal_strategy}")

        if not transformers and 'FrequencyEncoder' not in nominal_strategy:
            self.logger.error("No transformers added to the pipeline. Check feature categorization and configuration.")
            raise ValueError("No transformers added to the pipeline. Check feature categorization and configuration.")

        preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
        self.logger.debug("ColumnTransformer constructed with the following transformers:")
        for t in transformers:
            self.logger.debug(t)

        preprocessor.fit(X_train)
        self.logger.info("✅ Preprocessor fitted on training data.")

        # Determine categorical feature indices for SMOTENC if needed
        if self.options.get('implement_smote', {}).get('variant', None) == 'SMOTENC':
            if not self.categorical_indices:
                categorical_features = []
                for name, transformer, features in preprocessor.transformers_:
                    if 'ord' in name or 'nominal' in name:
                        if isinstance(transformer, Pipeline):
                            encoder = transformer.named_steps.get('ordinal_encoder') or transformer.named_steps.get('onehot_encoder')
                            if hasattr(encoder, 'categories_'):
                                # Calculate indices based on transformers order
                                # This can be complex; for simplicity, assuming categorical features are the first
                                categorical_features.extend(range(len(features)))
                self.categorical_indices = categorical_features
                self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")

        return preprocessor

    def phase_scaling(self, df: pd.DataFrame, numeric_cols: List[str], group_column: str) -> Tuple[pd.DataFrame, Dict]:
        """
        Normalize numeric features within each group (e.g. each phase) using RobustScaler.
        Logs summary statistics before and after scaling.

        Args:
            df (pd.DataFrame): Input DataFrame.
            numeric_cols (List[str]): List of numeric columns to scale.
            group_column (str): The column used for grouping (e.g., 'phase').

        Returns:
            Tuple[pd.DataFrame, Dict]: The DataFrame with scaled values and a dictionary of fitted scalers per group.
        """
        from sklearn.preprocessing import RobustScaler

        scalers = {}
        groups = df[group_column].unique()
        self.logger.info(f"Starting phase-aware normalization on column '{group_column}' for groups: {groups}")
        for grp in groups:
            phase_mask = df[group_column] == grp
            df_grp = df.loc[phase_mask, numeric_cols]
            # Log before scaling
            self.logger.debug(f"Before scaling for group '{grp}':\n{df_grp.describe()}")
            scaler = RobustScaler().fit(df_grp)
            df.loc[phase_mask, numeric_cols] = scaler.transform(df_grp)
            scalers[grp] = scaler
            # Log after scaling
            self.logger.debug(f"After scaling for group '{grp}':\n{df.loc[phase_mask, numeric_cols].describe()}")
        return df, scalers


    # NEW: Adaptive Window Calculation based on group duration statistics.
    @staticmethod
    def calculate_phase_window(phase_data: pd.DataFrame, base_size: int = 100, std_dev: int = 2) -> int:
        """
        Estimate an optimal window size for a given phase (or group) based on its duration statistics.
        
        Args:
            phase_data (pd.DataFrame): Data for a specific phase/group.
            base_size (int): Minimum window size.
            std_dev (int): Multiplier for standard deviation.
        
        Returns:
            int: Calculated window size.
        """
        # Assuming a grouping column exists (e.g., 'pitch_trial_id') to measure durations
        durations = phase_data.groupby('pitch_trial_id').size()
        avg = durations.mean()
        std = durations.std()
        return int(np.clip(avg + std_dev * std, base_size, 300))


    # NEW: Validation for target sequence alignment.
    def check_target_alignment(self, X_seq: Any, y_seq: Any, horizon: int) -> bool:
        """
        For sliding window segmentation (set_window), ensure the target has `horizon` rows.
        For grouping-based segmentation (e.g., dtw, pad, variable_length), ensure the target length
        equals the sequence length.
        """
        for idx, (seq, target) in enumerate(zip(X_seq, y_seq)):
            # Use len() if seq is a list; use .shape[0] if it's a NumPy array.
            if hasattr(seq, 'shape'):
                seq_length = seq.shape[0]
            else:
                seq_length = len(seq)
            if self.time_series_sequence_mode == "set_window":
                expected_length = horizon
            else:
                expected_length = seq_length

            self.logger.debug(
                f"Sequence {idx}: full length = {seq_length}, expected target length = {expected_length}, "
                f"actual target length = {len(target) if not hasattr(target, 'shape') else target.shape[0]}"
            )
            if (hasattr(target, 'shape') and target.shape[0] != expected_length) or (not hasattr(target, 'shape') and len(target) != expected_length):
                self.logger.error(
                    f"Alignment error in sequence {idx}: expected target length {expected_length} but got "
                    f"{target.shape[0] if hasattr(target, 'shape') else len(target)}"
                )
                return False
        return True





    # NEW: Validation for phase (or group) transitions.
    @staticmethod
    def validate_phase_transitions(sequences: list, phase_column: str, valid_transitions: Dict[str, List[str]]) -> bool:
        """
        Validate that sequences contain biomechanically valid transitions between groups.
        
        Args:
            sequences (list): List of DataFrames or arrays that include a column for phases.
            phase_column (str): Name of the column that contains the group/phase information.
            valid_transitions (Dict[str, List[str]]): Dictionary mapping a phase to the list of allowed next phases.
        
        Returns:
            bool: True if the error rate is below the threshold, False otherwise.
        """
        errors = 0
        for seq in sequences:
            phases = pd.Series(seq[:, phase_column]) if isinstance(seq, np.ndarray) else seq[phase_column]
            phases = phases.unique()
            for i in range(len(phases) - 1):
                current = phases[i]
                next_phase = phases[i+1]
                if next_phase not in valid_transitions.get(current, []):
                    errors += 1
        # For simplicity, we define a tolerance (here <1% error)
        return errors / len(sequences) < 0.01


    # ---------------------------------------------------------------------
    # Updated preprocess_time_series: now includes an optional phase-aware normalization step.
    def preprocess_time_series(self, data: pd.DataFrame) -> Tuple[Any, None, Any, None, pd.DataFrame, None]:
        """
        Preprocess data specifically for time series models.
        
        Steps:
          1. Handle missing values and outliers.
          2. Sort the data by the time column.
          3. Optionally perform phase-aware normalization if enabled.
          4. Extract features and target.
          5. Build and fit the preprocessing pipeline.
          6. Transform the features.
          7. Create sequences:
             - If time_series_sequence_mode is one of ["dtw", "pad", "variable_length"], use grouping-based segmentation.
             - If time_series_sequence_mode is "set_window", use fixed sliding window segmentation.
          8. If grouping was used, apply hierarchical temporal encoding.
          9. Validate target alignment.
         10. Generate recommendations and save transformers.
        
        Returns:
            Tuple containing:
              - X_seq: Sequence array (or list) for time series inputs.
              - None for X_test.
              - y_seq: Sequence array for targets.
              - None for y_test.
              - recommendations: Preprocessing recommendations DataFrame.
              - None for inverse-transformed test data.
        """
        # 1. Handle missing values
        data_clean, _ = self.handle_missing_values(data)
    
        # 2. Handle outliers
        X_temp = data_clean.drop(columns=self.y_variable)
        y_temp = data_clean[self.y_variable]
        X_temp, y_temp = self.handle_outliers(X_temp, y_temp)
        data_clean = pd.concat([X_temp, y_temp], axis=1)
    
        # 3. Sort by time column
        if self.time_column is None:
            raise ValueError("For time series models, 'time_column' must be specified.")
        data_clean['__time__'] = pd.to_datetime(data_clean[self.time_column])
        data_sorted = data_clean.sort_values(by='__time__').drop(columns=['__time__'])
        assert all(col in data_sorted.columns for col in self.y_variable), "Target variable(s) missing after sorting!"
        self.logger.debug(f"Columns after sorting: {data_sorted.columns.tolist()}")
    
        # 4. Optionally perform phase-aware normalization if enabled in options.
        phase_norm_opts = self.options.get('phase_aware_normalization', {})
        if phase_norm_opts.get('enabled', False):
            group_col = phase_norm_opts.get('group_column', 'phase')
            num_cols = phase_norm_opts.get('numeric_columns', self.numericals)
            self.logger.info(f"Phase-aware normalization enabled on group '{group_col}'.")
            self.logger.debug(f"Before phase scaling (for columns {num_cols}):\n{data_sorted.groupby(group_col)[num_cols].describe()}")
            data_sorted, phase_scalers = self.phase_scaling(data_sorted, num_cols, group_col)
            self.logger.debug(f"After phase scaling (for columns {num_cols}):\n{data_sorted.groupby(group_col)[num_cols].describe()}")
    
        # 5. Extract features and target
        X_clean = data_sorted.drop(columns=self.y_variable)
        y_clean = data_sorted[self.y_variable]
    
        # 6. Build and fit preprocessing pipeline
        self.pipeline = self.build_pipeline(X_clean)
        X_preprocessed = self.pipeline.fit_transform(X_clean)
    
        # 7. Create sequences based on time_series_sequence_mode
        if self.time_series_sequence_mode in ["dtw", "pad", "variable_length"]:
            if self.sequence_categorical is not None:
                if isinstance(self.sequence_categorical, list) and len(self.sequence_categorical) > 1:
                    group_ids = data_sorted[self.sequence_categorical].values  # 2D array
                else:
                    group_ids = data_sorted[self.sequence_categorical[0]].values  # 1D array
                self.logger.info(f"Grouping-based segmentation enabled using keys: {self.sequence_categorical}. Mode: {self.time_series_sequence_mode}")
                X_seq, y_seq, group_keys = self.create_sequences_by_category(X_preprocessed, y_clean.values, group_ids)
                # Apply hierarchical temporal encoding.
                X_seq = self.temporal_encode_sequences(X_seq, group_keys)
            else:
                self.logger.warning("Grouping variable not provided. Treating entire session as one group.")
                group_ids = np.ones(len(data_sorted), dtype=int)
                X_seq, y_seq, _ = self.create_sequences_by_category(X_preprocessed, y_clean.values, group_ids)
        elif self.time_series_sequence_mode == "set_window":
            # Fixed sliding window segmentation.
            X_seq, y_seq = self.create_sequences(X_preprocessed, y_clean.values)
        else:
            raise ValueError(f"Invalid time_series_sequence_mode: {self.time_series_sequence_mode}")
    
        # 8. Validate target alignment.
        if not self.check_target_alignment(X_seq, y_seq, self.horizon):
            self.logger.warning("⚠️ Target alignment check failed: Some sequences may not have matching target lengths.")
        else:
            self.logger.debug("Target alignment check passed for all sequences.")
    
        # 9. Generate recommendations and save transformers.
        recommendations = self.generate_recommendations()
        self.final_feature_order = list(self.pipeline.get_feature_names_out())
        self.save_transformers()
    
        return X_seq, None, y_seq, None, recommendations, None



    def preprocess_train(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess training data for various model types.
        For time series models, delegate to preprocess_time_series.
        
        Returns:
            - For standard models: X_train_final, X_test_final, y_train_smoted, y_test, recommendations, X_test_inverse.
            - For time series models: X_seq, None, y_seq, None, recommendations, None.
        """
        # If the model is time series, use the dedicated time series preprocessing flow.
        if self.model_category == 'time_series':
            return self.preprocess_time_series(X, y)
        
        # Standard preprocessing flow for classification/regression/clustering
        X_train_original, X_test_original, y_train_original, y_test = self.split_dataset(X, y)
        X_train_missing_values, X_test_missing_values = self.handle_missing_values(X_train_original, X_test_original)
        
        # Only perform normality tests if applicable
        if self.model_category in ['regression', 'classification', 'clustering']:
            self.test_normality(X_train_missing_values)
        
        X_train_outliers_handled, y_train_outliers_handled = self.handle_outliers(X_train_missing_values, y_train_original)
        X_test_outliers_handled = X_test_missing_values.copy() if X_test_missing_values is not None else None
        recommendations = self.generate_recommendations()
        self.pipeline = self.build_pipeline(X_train_outliers_handled)
        X_train_preprocessed = self.pipeline.fit_transform(X_train_outliers_handled)
        X_test_preprocessed = self.pipeline.transform(X_test_outliers_handled) if X_test_outliers_handled is not None else None

        if self.model_category == 'classification':
            try:
                X_train_smoted, y_train_smoted = self.implement_smote(X_train_preprocessed, y_train_outliers_handled)
            except Exception as e:
                self.logger.error(f"❌ SMOTE application failed: {e}")
                raise
        else:
            X_train_smoted, y_train_smoted = X_train_preprocessed, y_train_outliers_handled
            self.logger.info("⚠️ SMOTE not applied: Not a classification model.")

        self.final_feature_order = list(self.pipeline.get_feature_names_out())
        X_train_final = pd.DataFrame(X_train_smoted, columns=self.final_feature_order)
        X_test_final = pd.DataFrame(X_test_preprocessed, columns=self.final_feature_order, index=X_test_original.index) if X_test_preprocessed is not None else None

        try:
            self.save_transformers()
        except Exception as e:
            self.logger.error(f"❌ Saving transformers failed: {e}")
            raise

        try:
            if X_test_final is not None:
                X_test_inverse = self.inverse_transform_data(X_test_final.values, original_data=X_test_original)
                self.logger.info("✅ Inverse transformations applied successfully.")
            else:
                X_test_inverse = None
        except Exception as e:
            self.logger.error(f"❌ Inverse transformations failed: {e}")
            X_test_inverse = None

        return X_train_final, X_test_final, y_train_smoted, y_test, recommendations, X_test_inverse


    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """
        Transform new data using the fitted preprocessing pipeline.

        Args:
            X (pd.DataFrame): New data to transform.

        Returns:
            np.ndarray: Preprocessed data.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
        self.logger.debug("Transforming new data.")
        X_preprocessed = self.pipeline.transform(X)
        if self.debug:
            self.logger.debug(f"Transformed data shape: {X_preprocessed.shape}")
        else:
            self.logger.info("Data transformed.")
        return X_preprocessed

    def preprocess_predict(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess new data for prediction.

        Args:
            X (pd.DataFrame): New data for prediction.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]: X_preprocessed, recommendations, X_inversed
        """
        step_name = "Preprocess Predict"
        self.logger.info(f"Step: {step_name}")

        # Log initial columns and feature count
        self.logger.debug(f"Initial columns in prediction data: {X.columns.tolist()}")
        self.logger.debug(f"Initial number of features: {X.shape[1]}")

        # Load transformers
        try:
            transformers = self.load_transformers()
            self.logger.debug("Transformers loaded successfully.")
        except Exception as e:
            self.logger.error(f"❌ Failed to load transformers: {e}")
            raise

        # Filter columns based on raw feature names
        try:
            X_filtered = self.filter_columns(X)
            self.logger.debug(f"Columns after filtering: {X_filtered.columns.tolist()}")
            self.logger.debug(f"Number of features after filtering: {X_filtered.shape[1]}")
        except Exception as e:
            self.logger.error(f"❌ Failed during column filtering: {e}")
            raise

        # Handle missing values
        try:
            X_filtered, _ = self.handle_missing_values(X_filtered)
            self.logger.debug(f"Columns after handling missing values: {X_filtered.columns.tolist()}")
            self.logger.debug(f"Number of features after handling missing values: {X_filtered.shape[1]}")
        except Exception as e:
            self.logger.error(f"❌ Failed during missing value handling: {e}")
            raise

        # Ensure all expected raw features are present
        expected_raw_features = self.numericals + self.ordinal_categoricals + self.nominal_categoricals
        provided_features = X_filtered.columns.tolist()

        self.logger.debug(f"Expected raw features: {expected_raw_features}")
        self.logger.debug(f"Provided features: {provided_features}")

        missing_raw_features = set(expected_raw_features) - set(provided_features)
        if missing_raw_features:
            self.logger.error(f"❌ Missing required raw feature columns in prediction data: {missing_raw_features}")
            raise ValueError(f"Missing required raw feature columns in prediction data: {missing_raw_features}")

        # Handle unexpected columns (optional: ignore or log)
        unexpected_features = set(provided_features) - set(expected_raw_features)
        if unexpected_features:
            self.logger.warning(f"⚠️ Unexpected columns in prediction data that will be ignored: {unexpected_features}")

        # Ensure the order of columns matches the pipeline's expectation (optional)
        X_filtered = X_filtered[expected_raw_features]
        self.logger.debug("Reordered columns to match the pipeline's raw feature expectations.")

        # Transform data using the loaded pipeline
        try:
            X_preprocessed_np = self.pipeline.transform(X_filtered)
            self.logger.debug(f"Transformed data shape: {X_preprocessed_np.shape}")
        except Exception as e:
            self.logger.error(f"❌ Transformation failed: {e}")
            raise

        # Retrieve feature names from the pipeline or use stored final_feature_order
        if hasattr(self.pipeline, 'get_feature_names_out'):
            try:
                columns = self.pipeline.get_feature_names_out()
                self.logger.debug(f"Derived feature names from pipeline: {columns.tolist()}")
            except Exception as e:
                self.logger.warning(f"Could not retrieve feature names from pipeline: {e}")
                columns = self.final_feature_order
                self.logger.debug(f"Using stored final_feature_order for column names: {columns}")
        else:
            columns = self.final_feature_order
            self.logger.debug(f"Using stored final_feature_order for column names: {columns}")

        # Convert NumPy array back to DataFrame with correct column names
        try:
            X_preprocessed_df = pd.DataFrame(X_preprocessed_np, columns=columns, index=X_filtered.index)
            self.logger.debug(f"X_preprocessed_df columns: {X_preprocessed_df.columns.tolist()}")
            self.logger.debug(f"Sample of X_preprocessed_df:\n{X_preprocessed_df.head()}")
        except Exception as e:
            self.logger.error(f"❌ Failed to convert transformed data to DataFrame: {e}")
            raise

        # Inverse transform for interpretability (optional, for interpretability)
        try:
            self.logger.debug(f"[DEBUG] Original data shape before inverse transform: {X.shape}")
            X_inversed = self.inverse_transform_data(X_preprocessed_np, original_data=X)
            self.logger.debug(f"[DEBUG] Inversed data shape: {X_inversed.shape}")
        except Exception as e:
            self.logger.error(f"❌ Inverse transformation failed: {e}")
            X_inversed = None

        # Generate recommendations (if applicable)
        try:
            recommendations = self.generate_recommendations()
            self.logger.debug("Generated preprocessing recommendations.")
        except Exception as e:
            self.logger.error(f"❌ Failed to generate recommendations: {e}")
            recommendations = pd.DataFrame()

        # Prepare outputs
        return X_preprocessed_df, recommendations, X_inversed

    def preprocess_clustering(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Preprocess data for clustering mode.

        Args:
            X (pd.DataFrame): Input features for clustering.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: X_processed, recommendations.
        """
        step_name = "Preprocess Clustering"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_handle_missing_values')  # Use relevant debug flags

        # Handle Missing Values
        X_missing, _ = self.handle_missing_values(X, None)
        self.logger.debug(f"After handling missing values: X_missing.shape={X_missing.shape}")

        # Handle Outliers
        X_outliers_handled, _ = self.handle_outliers(X_missing, None)
        self.logger.debug(f"After handling outliers: X_outliers_handled.shape={X_outliers_handled.shape}")

        # Test Normality (optional for clustering)
        if self.model_category in ['clustering']:
            self.logger.info("Skipping normality tests for clustering.")
        else:
            self.test_normality(X_outliers_handled)

        # Generate Preprocessing Recommendations
        recommendations = self.generate_recommendations()

        # Build and Fit the Pipeline
        self.pipeline = self.build_pipeline(X_outliers_handled)
        self.logger.debug("Pipeline built and fitted.")

        # Transform the data
        X_processed = self.pipeline.transform(X_outliers_handled)
        self.logger.debug(f"After pipeline transform: X_processed.shape={X_processed.shape}")

        # Optionally, inverse transformations can be handled if necessary

        # Save Transformers (if needed)
        # Not strictly necessary for clustering unless you plan to apply the same preprocessing on new data
        self.save_transformers()

        self.logger.info("✅ Clustering data preprocessed successfully.")

        return X_processed, recommendations

    def final_preprocessing(self, data: pd.DataFrame) -> Tuple:
        """
        Execute the full preprocessing pipeline based on the mode.

        For 'train' mode:
        - If time series: pass the full filtered DataFrame (which includes the target) 
            to preprocess_time_series.
        - Else: split the data into X and y, then call preprocess_train.
        For 'predict' and 'clustering' modes, the existing flow remains unchanged.

        Returns:
            Tuple: Depending on mode:
                - 'train': For standard models: X_train, X_test, y_train, y_test, recommendations, X_test_inverse.
                            For time series models: X_seq, None, y_seq, None, recommendations, None.
                - 'predict': X_preprocessed, recommendations, X_inverse.
                - 'clustering': X_processed, recommendations.
        """
        self.logger.info(f"Starting: Final Preprocessing Pipeline in '{self.mode}' mode.")
        
        try:
            data = self.filter_columns(data)
            self.logger.info("✅ Column filtering completed successfully.")
        except Exception as e:
            self.logger.error(f"❌ Column filtering failed: {e}")
            raise

        if self.mode == 'train':
            if self.model_category == 'time_series':
                # For time series mode, do not split the DataFrame.
                # Pass the full filtered data (which still contains the target variable)
                # so that the time series preprocessing flow can extract the target after cleaning and sorting.
                return self.preprocess_time_series(data)
            else:
                if not all(col in data.columns for col in self.y_variable):
                    missing_y = [col for col in self.y_variable if col not in data.columns]
                    raise ValueError(f"Target variable(s) {missing_y} not found in the dataset.")
                X = data.drop(self.y_variable, axis=1)
                y = data[self.y_variable].iloc[:, 0] if len(self.y_variable) == 1 else data[self.y_variable]
                return self.preprocess_train(X, y)
        
        elif self.mode == 'predict':
            X = data.copy()
            transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')
            if not os.path.exists(transformers_path):
                self.logger.error(f"❌ Transformers file not found at '{self.transformers_dir}'. Cannot proceed with prediction.")
                raise FileNotFoundError(f"Transformers file not found at '{self.transformers_dir}'.")
            X_preprocessed, recommendations, X_inversed = self.preprocess_predict(X)
            self.logger.info("✅ Preprocessing completed successfully in predict mode.")
            return X_preprocessed, recommendations, X_inversed
        
        elif self.mode == 'clustering':
            X = data.copy()
            return self.preprocess_clustering(X)
        
        else:
            raise NotImplementedError(f"Mode '{self.mode}' is not implemented.")



    # Optionally, implement a method to display column info for debugging
    def _debug_column_info(self, df: pd.DataFrame, step: str = "Debug Column Info"):
        """
        Display information about DataFrame columns for debugging purposes.

        Args:
            df (pd.DataFrame): The DataFrame to inspect.
            step (str, optional): Description of the current step. Defaults to "Debug Column Info".
        """
        self.logger.debug(f"\n📊 {step}: Column Information")
        for col in df.columns:
            self.logger.debug(f"Column '{col}': {df[col].dtype}, Unique Values: {df[col].nunique()}")
        self.logger.debug("\n")

In [47]:
# datapreprocessor.py

import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Tuple, Any
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC, SMOTEN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot
import joblib  # For saving/loading transformers
from inspect import signature  # For parameter validation in SMOTE
from functools import wraps

class DataPreprocessor:
    def __init__(
        self,
        model_type: str,
        y_variable: List[str],
        ordinal_categoricals: List[str],
        nominal_categoricals: List[str],
        numericals: List[str],
        mode: str,  # 'train', 'predict', 'clustering'
        options: Optional[Dict] = None,
        debug: bool = False,
        normalize_debug: bool = False,
        normalize_graphs_output: bool = False,
        graphs_output_dir: str = './plots',
        transformers_dir: str = './transformers',
        # New time series parameters:
        time_column: Optional[str] = None,
        window_size: Optional[int] = None,
        horizon: Optional[int] = None,
        step_size: Optional[int] = None,
        max_sequence_length: Optional[int] = None,
        time_series_sequence_mode: str = "set_window",  # "set_window", "dtw", "pad", or "variable_length"
        sequence_categorical: Optional[List[str]] = None,
        # NEW: Secondary grouping for sub-phase segmentation (for DTW/pad modes)
        sequence_dtw_or_pad_categorical: Optional[List[str]] = None
    ):
        # Explicitly cast grouping columns to lists (or empty lists if None)
        self.sequence_categorical = list(sequence_categorical) if sequence_categorical else []
        self.sequence_dtw_or_pad_categorical = list(sequence_dtw_or_pad_categorical) if sequence_dtw_or_pad_categorical else []
        
        # Validate hierarchical structure: no overlapping columns allowed.
        if set(self.sequence_categorical) & set(self.sequence_dtw_or_pad_categorical):
            conflicting = set(self.sequence_categorical) & set(self.sequence_dtw_or_pad_categorical)
            raise ValueError(f"Categorical conflict in {conflicting}. Top-level and sub-phase groups must form a strict hierarchy")
        
        # NEW: Initialize follow-through metadata storage for debugging extreme durations.
        self.follow_through_stats = []  # Will store dicts with keys: group_key, phase, length (in seconds), num_rows
        # Optionally, define a default time_step (e.g., 1/60 sec for 60Hz)
        self.time_step = self.options.get('time_step', 1/60) if self.options else 1/60

        # (Rest of the __init__ remains unchanged.)
        self.model_type = model_type
        self.y_variable = y_variable
        self.ordinal_categoricals = ordinal_categoricals
        self.nominal_categoricals = nominal_categoricals
        self.numericals = numericals
        self.mode = mode.lower()
        if self.mode not in ['train', 'predict', 'clustering']:
            raise ValueError("Mode must be one of 'train', 'predict', or 'clustering'.")
        self.options = options or {}
        self.debug = debug
        self.normalize_debug = normalize_debug
        self.normalize_graphs_output = normalize_graphs_output
        self.graphs_output_dir = graphs_output_dir
        self.transformers_dir = transformers_dir

        # New time series parameters
        self.time_column = time_column
        self.window_size = window_size
        self.horizon = horizon
        self.step_size = step_size
        self.max_sequence_length = max_sequence_length
        self.time_series_sequence_mode = time_series_sequence_mode
        # Re-assign grouping lists to ensure proper type (already cast above)
        self.sequence_categorical = sequence_categorical
        self.sequence_dtw_or_pad_categorical = sequence_dtw_or_pad_categorical

        # NEW: Phase alignment safeguards
        self.max_phase_distortion = self.options.get('max_phase_distortion', 0.3)  # 20% distortion allowed
        self.max_length_variance = self.options.get('max_length_variance', 5)  # allowable variation in phase lengths

        # Extra check for overlapping groups if both grouping lists exist
        if self.sequence_categorical and self.sequence_dtw_or_pad_categorical:
            overlap = set(self.sequence_categorical) & set(self.sequence_dtw_or_pad_categorical)
            if overlap:
                raise ValueError(f"Overlapping grouping columns: {overlap}. Top-level and sub-phase groups must be distinct")

        # ... (Initialize remaining attributes, logging, pipelines, etc.) ...
        self.hierarchical_categories = {}
        model_type_lower = self.model_type.lower()
        if any(kw in model_type_lower for kw in ['lstm', 'rnn', 'time series']):
            self.model_category = 'time_series'
        else:
            self.model_category = self.map_model_type_to_category()


        self.categorical_indices = []
        if self.model_category == 'unknown':
            self.logger = logging.getLogger(self.__class__.__name__)
            self.logger.error(f"Model category for '{self.model_type}' is unknown. Check your configuration.")
            raise ValueError(f"Model category for '{self.model_type}' is unknown. Check your configuration.")
        if self.mode in ['train', 'predict']:
            if not self.y_variable:
                raise ValueError("Target variable 'y_variable' must be specified for supervised models in train/predict mode.")
        elif self.mode == 'clustering':
            self.y_variable = []




        # ----------------------------------------------------

        # Initialize other variables
        self.scaler = None
        self.transformer = None
        self.ordinal_encoder = None
        self.nominal_encoder = None
        self.preprocessor = None
        self.smote = None
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        self.preprocessing_steps = []
        self.normality_results = {}
        self.features_to_transform = []
        self.nominal_encoded_feature_names = []
        self.final_feature_order = []

        # Initialize placeholders for clustering-specific transformers
        self.cluster_transformers = {}
        self.cluster_model = None
        self.cluster_labels = None
        self.silhouette_score = None

        # Define default thresholds for SMOTE recommendations
        self.imbalance_threshold = self.options.get('smote_recommendation', {}).get('imbalance_threshold', 0.1)
        self.noise_threshold = self.options.get('smote_recommendation', {}).get('noise_threshold', 0.1)
        self.overlap_threshold = self.options.get('smote_recommendation', {}).get('overlap_threshold', 0.1)
        self.boundary_threshold = self.options.get('smote_recommendation', {}).get('boundary_threshold', 0.1)

        self.pipeline = None  # Initialize pipeline

        # Initialize logging
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        if not self.logger.handlers:
            self.logger.addHandler(handler)
            
        # Initialize feature_reasons with 'all_numericals' for clustering
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        if self.model_category == 'clustering':
            self.feature_reasons['all_numericals'] = ''

    def get_debug_flag(self, flag_name: str) -> bool:
        """
        Retrieve the value of a specific debug flag from the options.
        Args:
            flag_name (str): The name of the debug flag.
        Returns:
            bool: The value of the debug flag.
        """
        return self.options.get(flag_name, False)

    def _log(self, message: str, step: str, level: str = 'info'):
        """
        Internal method to log messages based on the step-specific debug flags.
        
        Args:
            message (str): The message to log.
            step (str): The preprocessing step name.
            level (str): The logging level ('info', 'debug', etc.).
        """
        debug_flag = self.get_debug_flag(f'debug_{step}')
        if debug_flag:
            if level == 'debug':
                self.logger.debug(message)
            elif level == 'info':
                self.logger.info(message)
            elif level == 'warning':
                self.logger.warning(message)
            elif level == 'error':
                self.logger.error(message)

    def map_model_type_to_category(self) -> str:
        """
        Map the model_type string to a predefined category based on keywords.

        Returns:
            str: The model category ('classification', 'regression', 'clustering', etc.).
        """
        classification_keywords = ['classifier', 'classification', 'logistic', 'svm', 'support vector machine', 'knn', 'neural network']
        regression_keywords = ['regressor', 'regression', 'linear', 'knn', 'neural network']  # Removed 'svm'
        clustering_keywords = ['k-means', 'clustering', 'dbscan', 'kmodes', 'kprototypes']

        model_type_lower = self.model_type.lower()

        for keyword in classification_keywords:
            if keyword in model_type_lower:
                return 'classification'

        for keyword in regression_keywords:
            if keyword in model_type_lower:
                return 'regression'

        for keyword in clustering_keywords:
            if keyword in model_type_lower:
                return 'clustering'

        return 'unknown'

    def filter_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        step_name = "filter_columns"
        self.logger.info(f"Step: {step_name}")

        # Combine all feature lists from configuration
        desired_features = self.numericals + self.ordinal_categoricals + self.nominal_categoricals

        # For time series models, ensure the time column is included
        if self.model_category == 'time_series' and self.time_column:
            if self.time_column not in df.columns:
                self.logger.error(f"Time column '{self.time_column}' not found in input data.")
                raise ValueError(f"Time column '{self.time_column}' not found in the input data.")
            if self.time_column not in desired_features:
                desired_features.append(self.time_column)

        # Debug log: report target variable info
        self.logger.debug(f"y_variable provided: {self.y_variable}")
        if self.y_variable and all(col in df.columns for col in self.y_variable):
            self.logger.debug(f"Unique values in target column(s): {df[self.y_variable].drop_duplicates().to_dict()}")

        # For 'train' mode, ensure the target variable is present and excluded from features
        if self.mode == 'train':
            if not all(col in df.columns for col in self.y_variable):
                missing_y = [col for col in self.y_variable if col not in df.columns]
                self.logger.error(f"Target variable(s) {missing_y} not found in the input data.")
                raise ValueError(f"Target variable(s) {missing_y} not found in the input data.")
            desired_features = [col for col in desired_features if col not in self.y_variable]
            filtered_df = df[desired_features + self.y_variable].copy()
        else:
            filtered_df = df[desired_features].copy()

        # Check that all desired features are present in the input DataFrame
        missing_features = [col for col in desired_features if col not in df.columns]
        if missing_features:
            self.logger.error(f"The following required features are missing in the input data: {missing_features}")
            raise ValueError(f"The following required features are missing in the input data: {missing_features}")

        # Additional numeric type check for expected numeric columns
        for col in self.numericals:
            if col in filtered_df.columns and not np.issubdtype(filtered_df[col].dtype, np.number):
                raise TypeError(f"Numerical column '{col}' has non-numeric dtype {filtered_df[col].dtype}")

        self.logger.info(f"✅ Filtered DataFrame to include only specified features. Shape: {filtered_df.shape}")
        self.logger.debug(f"Selected Features: {desired_features}")
        if self.mode == 'train':
            self.logger.debug(f"Retained Target Variable(s): {self.y_variable}")

        return filtered_df



    def _group_top_level(self, data: pd.DataFrame):
        """
        Group the data based on top-level sequence categorical variables.
        Returns the grouped DataFrames (without converting them to NumPy arrays)
        to ensure that subsequent processing (such as sub-phase segmentation) has access
        to DataFrame methods like .groupby and .columns.
        """
        if not self.sequence_categorical:
            return [('default_group', data)]
        
        groups = data.groupby(self.sequence_categorical)
        self.logger.debug(f"Group keys: {list(groups.groups.keys())}")
        
        validated_groups = []
        for name, group in groups:
            try:
                self.logger.debug(f"Group '{name}' type: {type(group)}, Shape: {group.shape if hasattr(group, 'shape') else 'N/A'}")
            except Exception as e:
                self.logger.error(f"Error obtaining shape for group {name}: {e}")
            if isinstance(group, pd.DataFrame):
                # *** FIX: Return the DataFrame (not group.values) so that it retains the .columns attribute ***
                validated_groups.append((name, group))
            else:
                self.logger.warning(f"Unexpected group type {type(group)} for group {name}")
        return validated_groups




    def _segment_subphases(self, group_data: pd.DataFrame):
        """
        Segment a group's data into sub-phases based on the secondary grouping.
        For each phase, convert to a NumPy array (after filtering to numeric columns)
        and compute metadata (duration in seconds, number of frames, group identifier).
        Also validates timestamps if possible.
        """
        # Take a deep breath: Start sub-phase segmentation.
        if not self.sequence_dtw_or_pad_categorical:
            if self.numericals:
                group_data = group_data[[col for col in group_data.columns if col in self.numericals]]
            return {"default_phase": group_data}

        # Convert groupby iterator to list.
        phase_groups = list(group_data.groupby(self.sequence_dtw_or_pad_categorical))
        phase_lengths = {}
        subphases = {}
        # New: initialize metadata list for this group.
        phase_stats = []

        # If a time column is specified, validate timestamps.
        if self.time_column and self.time_column in group_data.columns:
            try:
                self._validate_timestamps(group_data)
            except Exception as e:
                self.logger.warning(f"Timestamp validation error in group {getattr(group_data, 'name', 'unknown')}: {e}")

        for phase, phase_df in phase_groups:
            phase_length = len(phase_df)
            phase_lengths[phase] = phase_length
            self.logger.debug(f"Sub-phase '{phase}' raw length: {phase_length}")
            if isinstance(phase_df, pd.DataFrame):
                if self.numericals:
                    numeric_phase_df = phase_df[[col for col in phase_df.columns if col in self.numericals]]
                else:
                    numeric_phase_df = phase_df
                subphases[phase] = numeric_phase_df.values
            else:
                self.logger.error(f"Unexpected data type {type(phase_df)} for phase {phase}")

            # New: compute metadata for this phase.
            phase_duration = phase_length * self.time_step  # Convert frames to seconds.
            group_key = getattr(group_data, 'name', 'unknown')
            phase_stats.append({
                "group_key": group_key,
                "phase": phase,
                "length": phase_duration,
                "num_rows": phase_length
            })

        self.logger.info(f"Sub-phase length stats - Min: {min(phase_lengths.values())}, Max: {max(phase_lengths.values())}")
        if max(phase_lengths.values()) / min(phase_lengths.values()) > 3:
            self.logger.warning(f"Excessive length disparity in current group: {max(phase_lengths.values())}/{min(phase_lengths.values())}={max(phase_lengths.values())/min(phase_lengths.values()):.1f}x. Risk: unrealistic warping!")

        # Store Follow-Through stats for later outlier reporting.
        for stat in phase_stats:
            if stat["phase"] == "Follow Through":
                self.follow_through_stats.append(stat)

        return subphases


    def _validate_timestamps(self, phase_data: pd.DataFrame):
        """
        Validate that timestamps in phase_data have no large discontinuities (>1 second gap).
        Logs a warning if a gap is detected.
        """
        time_col = self.time_column
        if time_col not in phase_data.columns:
            return
        diffs = phase_data[time_col].diff().dropna()
        if (diffs > 1.0).any():
            gap_loc = diffs.idxmax()
            self.logger.warning(
                f"Timestamp jump in group {getattr(phase_data, 'name', 'unknown')}: {diffs[gap_loc]:.2f}s gap at index {gap_loc}"
            )

    def _flag_extreme_phases(self, phase_stats):
        """
        Identify and log any extreme Follow-Through phases (duration > 30 seconds).
        """
        follow_throughs = [s for s in phase_stats if s["phase"] == "Follow Through"]
        if follow_throughs:
            max_ft = max(follow_throughs, key=lambda x: x["length"])
            if max_ft["length"] > 30:
                self.logger.error(
                    f"Extreme Follow-Through: group {max_ft['group_key']} length={max_ft['length']:.3f}s "
                    f"({max_ft['num_rows']} frames)"
                )

    def _log_top_outliers(self):
        """
        Log the top 5 longest Follow-Through durations from the recorded metadata.
        """
        if not self.follow_through_stats:
            self.logger.debug("No Follow-Through stats recorded.")
            return
        sorted_ft = sorted(self.follow_through_stats, key=lambda x: x["length"], reverse=True)[:5]
        self.logger.debug("Top 5 Follow-Through Durations:")
        for i, stats in enumerate(sorted_ft, 1):
            self.logger.debug(f"{i}. Group {stats['group_key']}: {stats['length']:.3f}s ({stats['num_rows']} frames)")

    def _filter_follow_through(self, phase_stats):
        """
        Dynamically filter groups based on Follow-Through duration.
        Discard a group if its Follow-Through duration exceeds mean + 5σ.
        """
        ft_lengths = [s["length"] for s in phase_stats if s["phase"] == "Follow Through"]
        if not ft_lengths:
            return True
        mean = np.mean(ft_lengths)
        std = np.std(ft_lengths)
        threshold = mean + 5 * std
        for stats in phase_stats:
            if stats["phase"] == "Follow Through" and stats["length"] > threshold:
                self.logger.warning(
                    f"Discarding group {stats['group_key']}: Follow-Through {stats['length']:.3f}s > 5σ ({threshold:.1f}s)"
                )
                return False
        return True


    @staticmethod
    def pad_sequence(seq: np.ndarray, target_length: int) -> np.ndarray:
        """
        Pad or truncate the given sequence to match the target length.
        """
        seq = np.array(seq)
        current_length = seq.shape[0]
        if current_length >= target_length:
            return seq[:target_length]
        else:
            pad_width = target_length - current_length
            padding = np.zeros((pad_width, seq.shape[1]))
            return np.concatenate([seq, padding], axis=0)

    def require_array_type(func):
        """
        Decorator that asserts the second argument (usually the data input)
        has a 'shape' attribute, i.e. is array-like.
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            # args[1] should be the phase_data input for _align_phase
            if not hasattr(args[1], 'shape'):
                raise TypeError(f"Function {func.__name__} requires array-like input")
            return func(*args, **kwargs)
        return wrapper
        
    @require_array_type
    def _align_phase(self, phase_data, target_length: int) -> np.ndarray:
        """
        Align a sub-phase's sequence to a target length using DTW (if enabled) or padding.
        Logs input/output shapes and alignment method details.
        """
        # If phase_data is a DataFrame, filter to only numeric columns.
        if isinstance(phase_data, pd.DataFrame):
            if self.numericals:
                phase_data = phase_data[[col for col in phase_data.columns if col in self.numericals]]
            else:
                phase_data = phase_data.copy()
        
        if isinstance(phase_data, dict):
            self.logger.error(f"Received dict instead of array. Current keys: {list(phase_data.keys())}")
            raise TypeError("Phase data must be array-like")
        
        if not hasattr(phase_data, 'shape'):
            self.logger.error(f"Invalid data type {type(phase_data)}. Expected array/DataFrame")
            raise TypeError("Phase data must be array-like")
        
        self.logger.debug("Phase data (first 5 rows):")
        try:
            self.logger.debug(phase_data.head())
        except Exception:
            self.logger.debug("Unable to display phase_data.head()")
        if isinstance(phase_data, pd.DataFrame):
            self.logger.debug("Phase data dtypes:")
            self.logger.debug(phase_data.dtypes)
        
        phase_array = self.safe_array_conversion(phase_data)
        if phase_array.ndim != 2:
            self.logger.error(f"Invalid input shape {phase_array.shape} - expected a 2D array")
            raise ValueError("DTW alignment requires a 2D array input")
        
        self.logger.debug(f"Column dtypes: {[phase_array[:, i].dtype for i in range(phase_array.shape[1])]}")

        if not np.issubdtype(phase_array.dtype, np.number):
            self.logger.warning(f"Non-numeric dtype detected: {phase_array.dtype}. Attempting conversion to np.float32.")
            try:
                phase_array = phase_array.astype(np.float32)
            except Exception as e:
                self.logger.error(f"Failed to convert phase data to float32: {e}")
                raise

        if phase_array.dtype.names:
            numeric_mask = np.array([np.issubdtype(phase_array.dtype.fields[col][0], np.number)
                                    for col in phase_array.dtype.names])
        else:
            numeric_mask = np.array([np.issubdtype(phase_array[:, i].dtype, np.number)
                                    for i in range(phase_array.shape[1])])
        
        self.logger.debug(f"Numeric mask for columns: {numeric_mask}")
        current_length = phase_array.shape[0]

        # --- Step 3: Apply DTW or Padding ---
        if numeric_mask.all():
            if self.time_series_sequence_mode == "dtw":
                distortion = abs(current_length - target_length) / target_length
                if distortion > self.max_phase_distortion:
                    if self.options.get('fallback_alignment', False):
                        self.logger.warning(
                            f"High distortion ({distortion*100:.1f}%) exceeds DTW threshold ({self.max_phase_distortion*100:.1f}%). Falling back to padding."
                        )
                        result = self.pad_sequence(phase_array, target_length)
                        self.logger.debug(f"Aligned phase from {current_length} to {target_length} (method: padding)")
                        return result
                    else:
                        raise Exception(f"Phase distortion {distortion*100:.1f}% exceeds allowed threshold of {self.max_phase_distortion*100:.1f}%.")
                alignment_path = dtw_path(phase_array, phase_array)
                aligned_numeric = warp_sequence(phase_array, alignment_path, target_length)
                self.logger.debug(f"Aligned phase from {current_length} to {target_length} (method: DTW)")
                return aligned_numeric
            else:
                result = self.pad_sequence(phase_array, target_length)
                self.logger.debug(f"Aligned phase from {current_length} to {target_length} (method: padding)")
                return result
        else:
            # Non-numeric columns are handled separately.
            numeric_part = phase_array[:, numeric_mask] if numeric_mask.any() else np.empty((phase_array.shape[0], 0))
            categorical_part = phase_array[:, ~numeric_mask] if (~numeric_mask).any() else np.empty((phase_array.shape[0], 0))
            
            if numeric_part.shape[1] > 0:
                if self.time_series_sequence_mode == "dtw":
                    distortion = abs(current_length - target_length) / target_length
                    if distortion > self.max_phase_distortion:
                        if self.options.get('fallback_alignment', False):
                            self.logger.warning(
                                f"High distortion ({distortion*100:.1f}%) exceeds DTW threshold. Falling back to padding for numeric part."
                            )
                            aligned_numeric = self.pad_sequence(numeric_part, target_length)
                        else:
                            raise Exception(f"Phase distortion {distortion*100:.1f}% exceeds allowed threshold for numeric data.")
                    else:
                        alignment_path = dtw_path(numeric_part, numeric_part)
                        aligned_numeric = warp_sequence(numeric_part, alignment_path, target_length)
                else:
                    aligned_numeric = self.pad_sequence(numeric_part, target_length)
            else:
                aligned_numeric = np.empty((target_length, 0))
            
            if categorical_part.shape[1] > 0:
                aligned_categorical = self.pad_sequence(categorical_part, target_length)
            else:
                aligned_categorical = np.empty((target_length, 0))
            
            aligned_full = np.empty((target_length, phase_array.shape[1]), dtype=phase_array.dtype)
            aligned_full[:, numeric_mask] = aligned_numeric
            aligned_full[:, ~numeric_mask] = aligned_categorical
            
            self.logger.debug(f"Aligned phase from {current_length} to {target_length} (method: {'DTW' if self.time_series_sequence_mode == 'dtw' else 'padding'})")
            self.logger.debug(f"Aligned full array shape: {aligned_full.shape}")
            return aligned_full




        
    @staticmethod
    def safe_array_conversion(data):
        """
        Convert input data to a NumPy array if it is not already.
        Handles both structured and unstructured arrays.
        """
        if isinstance(data, np.ndarray):
            if data.dtype.names:
                # For structured arrays, view as float32 and reshape to combine fields.
                return data.view(np.float32).reshape(data.shape + (-1,))
            return data
        elif hasattr(data, 'values'):
            arr = data.values
            if arr.dtype.names:
                return arr.view(np.float32).reshape(arr.shape + (-1,))
            return arr
        else:
            return np.array(data)


    @staticmethod
    def calculate_phase_window(phase_data: pd.DataFrame, base_size: int = 100, std_dev: int = 2) -> int:
        """
        Estimate an optimal window size for a given phase based on its duration statistics.
        Clamps the result between base_size and an upper limit (here 300).
        """
        # Assuming 'pitch_trial_id' exists to group duration lengths
        durations = phase_data.groupby('pitch_trial_id').size()
        avg = durations.mean()
        std = durations.std()
        window_size = int(np.clip(avg + std_dev * std, base_size, 300))
        return window_size






    def _validate_sequences(self, aligned_sequences: dict):
        """
        Validate that each group in aligned_sequences has a consistent total length.
        Also, dynamically filter out groups with extreme Follow-Through durations.
        """
        if not isinstance(aligned_sequences, (dict, list)):
            self.logger.error(f"Invalid sequence container type {type(aligned_sequences)}")
        
        if isinstance(aligned_sequences, dict):
            valid_sequences = {k: v for k, v in aligned_sequences.items() if isinstance(v, dict)}
        else:
            valid_sequences = aligned_sequences

        # Check individual phase shapes within each group.
        for group_key, phases in aligned_sequences.items():
            shapes = [phase.shape for phase in phases.values() if phase is not None]
            if len(set(shapes)) > 1:
                self.logger.error(f"Inconsistent phase shapes in group {group_key}: {shapes}")
        
        # --- Dynamic Outlier Filtering for Follow-Through ---
        filtered_groups = {}
        for group_key, phases in aligned_sequences.items():
            # Retrieve metadata for this group from follow_through_stats.
            group_phase_stats = [s for s in self.follow_through_stats if s["group_key"] == group_key]
            if self._filter_follow_through(group_phase_stats):
                filtered_groups[group_key] = phases
            else:
                self.logger.warning(f"Group {group_key} discarded due to extreme Follow-Through duration.")
        
        return filtered_groups


    def post_processing_report(self):
        """
        Generate a post-processing report of Follow-Through statistics after filtering.
        """
        ft_lengths = [s["length"] for s in self.follow_through_stats if s["phase"] == "Follow Through"]
        if ft_lengths:
            report = (
                f"Follow-Through Stats After Filtering:\n"
                f"- Min: {min(ft_lengths):.3f}s\n"
                f"- Max: {max(ft_lengths):.3f}s\n"
                f"- σ: {np.std(ft_lengths):.3f}s"
            )
            self.logger.info(report)
        else:
            self.logger.info("No Follow-Through phases recorded.")



    def validate_pipeline_data_types(self):
        """
        Run sample tests to check that:
        - Grouping returns arrays.
        - Phase segmentation produces a dictionary of arrays.
        - DTW alignment returns a 2D array.
        """
        test_data = self.load_test_dataset()  # Assumes implementation of load_test_dataset exists
        
        # Stage 1: Grouping
        groups = self._group_top_level(test_data)
        assert all(isinstance(g[1], np.ndarray) for g in groups), "Grouping must return arrays"
        
        # Stage 2: Phase Segmentation (using the first group's data)
        subphases = self._segment_subphases(groups[0][1])
        assert isinstance(subphases, dict), "Subphases should be a dict of arrays"
        assert all(isinstance(v, np.ndarray) for v in subphases.values()), "Subphase data must be array-like"
        
        # Stage 3: Alignment (using one of the subphase arrays)
        some_phase = list(subphases.values())[0]
        aligned = self._align_phase(some_phase, 100)
        assert isinstance(aligned, np.ndarray), "Alignment must return an array"
        
        self.logger.info("All pipeline data type checks passed.")


    def _filter_sequences(self, sequences: dict):
        """
        Filter sequences by ensuring that within each group the ratio of valid (non-None) phases is acceptable.
        Raises an error if more than 30% of sequences in any group are invalid.
        """
        valid_sequences = {}
        for seq_id, phases in sequences.items():
            valid_phases = [p for p in phases.values() if p is not None]
            if len(valid_phases) / len(phases) < 0.7:
                raise ValueError(f"Over 30% of phases in sequence {seq_id} are invalid.")
            valid_sequences[seq_id] = phases
        return valid_sequences

    def _apply_smote_ts(self, aligned_data):
        """
        If the option 'apply_smote_ts' is enabled, apply SMOTE-TS to balance the sequences temporally.
        Assumes the existence of a SMOTE_TS class and a _detect_phase_transitions helper.
        """
        if not self.options.get('apply_smote_ts'):
            return aligned_data
        
        phase_boundaries = self._detect_phase_transitions(aligned_data)  # You must implement this helper as needed.
        smote = SMOTE_TS(
            phases=self.sequence_dtw_or_pad_categorical,
            dtw_window=int(self.max_phase_distortion * 100),  # Converts the distortion to sample count (example)
            phase_markers=phase_boundaries
        )
        return smote.fit_resample(aligned_data)

    def _verify_temporal_flow(self, sequence):
        """
        Check that the sequence has valid phase transitions.
        Assumes existence of a list/dict VALID_TRANSITIONS.
        """
        transitions = detect_phase_transitions(sequence)  # Implement or import detect_phase_transitions as needed.
        if not all(t in VALID_TRANSITIONS for t in transitions):
            raise ValueError("Impossible phase sequence detected")
        return True

    def _debug_plot_alignment(self, original, aligned):
        """
        If debugging is enabled, plot a comparison between original and aligned key features.
        Assumes that plot_comparison is a plotting function defined elsewhere.
        """
        if self.debug:
            plot_comparison(original[:, 'key_feature'], aligned[:, 'key_feature'], self.time_column)

    def _validate_distributions(self, pre, post):
        """
        Run KS-tests on numerical features to ensure that distributions remain similar after alignment.
        """
        import scipy.stats
        ks_tests = {}
        for feature in self.numericals:
            ks = scipy.stats.ks_2samp(pre[:, feature], post[:, feature])
            if ks.pvalue < 0.01:
                self.logger.warning(f"Distribution changed significantly for {feature}")
            ks_tests[feature] = ks
        return ks_tests


    def create_sequences_by_category(self, X: np.ndarray, y: np.ndarray, group_ids: np.ndarray) -> Tuple[Any, Any, np.ndarray]:
        # Convert group_ids to tuple keys if more than one grouping column is provided.
        if group_ids.ndim > 1:
            group_keys_full = np.array([tuple(row) for row in group_ids])
        else:
            group_keys_full = group_ids

        unique_groups = np.unique(group_keys_full, axis=0)
        sequences_X = []
        sequences_y = []
        group_keys_list = []
        
        for idx, group in enumerate(unique_groups):
            if group_keys_full.ndim > 1:
                indices = np.where(np.all(group_keys_full == group, axis=1))[0]
            else:
                indices = np.where(group_keys_full == group)[0]
            seq_X = X[indices, :]
            seq_y = y[indices]
            sequences_X.append(seq_X)
            sequences_y.append(seq_y)
            group_keys_list.append(group)
            self.logger.debug(f"Group {group} - seq_y shape: {seq_y.shape}")

        if self.time_series_sequence_mode in ["dtw", "pad"]:
            max_length = max(seq.shape[0] for seq in sequences_X)
            self.logger.debug(f"Maximum sequence length determined: {max_length}")
        # For "variable_length", we leave sequences as they are.

        aligned_X = []
        aligned_y = []
        
        for idx, (seq_X, seq_y) in enumerate(zip(sequences_X, sequences_y)):
            current_length = seq_X.shape[0]
            if self.time_series_sequence_mode == "dtw" and current_length < max_length:
                self.logger.debug(f"Group {unique_groups[idx]}: applying DTW warping. Original shape: {seq_X.shape}")
                original_seq = seq_X.copy()
                path = dtw_path(seq_X, seq_X)
                seq_X_aligned = warp_sequence(seq_X, path, max_length)
                pad_width = max_length - current_length
                seq_y_aligned = np.pad(seq_y, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
                aligned_X.append(seq_X_aligned)
                aligned_y.append(seq_y_aligned)
            elif self.time_series_sequence_mode == "pad" and current_length < max_length:
                self.logger.debug(f"Group {unique_groups[idx]}: applying zero padding. Original shape: {seq_X.shape}")
                pad_width = max_length - current_length
                seq_X_aligned = np.pad(seq_X, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
                seq_y_aligned = np.pad(seq_y, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
                aligned_X.append(seq_X_aligned)
                aligned_y.append(seq_y_aligned)
            else:
                aligned_X.append(seq_X)
                aligned_y.append(seq_y)
        
        if self.time_series_sequence_mode == "variable_length":
            X_seq = aligned_X
            y_seq = aligned_y
        else:
            X_seq = np.array(aligned_X)
            y_seq = np.array(aligned_y)
        
        return X_seq, y_seq, np.array(group_keys_list)




    def apply_dtw_alignment(self, sequences: np.ndarray) -> np.ndarray:
        """
        Align a set of sequences using DTW so that all sequences match the reference length.
        
        Args:
            sequences: Array of sequences with shape (num_sequences, seq_length, num_features)
        
        Returns:
            aligned_sequences: Array of DTW-aligned sequences.
        """
        ref = sequences[0]
        target_length = ref.shape[0]
        aligned_sequences = []
        
        for seq in sequences:
            path = dtw_path(seq, ref)
            aligned_seq = warp_sequence(seq, path, target_length)
            aligned_sequences.append(aligned_seq)
        
        return np.array(aligned_sequences)

    def create_sequences(self, X: np.ndarray, y: np.ndarray) -> Tuple[Any, Any]:
        X_seq, y_seq = [], []
        for i in range(0, len(X) - self.window_size - self.horizon + 1, self.step_size):
            seq_X = X[i:i+self.window_size]
            seq_y = y[i+self.window_size:i+self.window_size+self.horizon]
            if self.time_series_sequence_mode != "variable_length" and self.max_sequence_length and seq_X.shape[0] < self.max_sequence_length:
                pad_width = self.max_sequence_length - seq_X.shape[0]
                seq_X = np.pad(seq_X, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
            X_seq.append(seq_X)
            y_seq.append(seq_y)
        
        if self.time_series_sequence_mode != "variable_length":
            X_seq = np.array(X_seq)
            y_seq = np.array(y_seq)
        
        if isinstance(y_seq, np.ndarray) and y_seq.ndim == 3 and y_seq.shape[-1] == 1:
            y_seq = np.squeeze(y_seq, axis=-1)
            self.logger.debug("Squeezed extra dimension from y_seq to shape: " + str(y_seq.shape))
        
        # If time_series_sequence_mode is "dtw", perform DTW alignment on the sequences.
        if self.time_series_sequence_mode == "dtw":
            if not np.all([seq.shape[0] == X_seq[0].shape[0] for seq in X_seq]):
                X_seq = self.apply_dtw_alignment(X_seq)
            else:
                self.logger.debug("All sequences are already uniform; skipping DTW alignment.")
        
        return X_seq, y_seq


    def temporal_encode_sequences(self, X_seq: Any, group_keys: np.ndarray) -> Any:
        if group_keys.ndim == 1:
            group_keys = group_keys.reshape(-1, 1)
        num_group = group_keys.shape[1]
        for i in range(num_group):
            col_name = self.sequence_categorical[i] if isinstance(self.sequence_categorical, list) else self.sequence_categorical
            if col_name not in self.hierarchical_categories or not self.hierarchical_categories[col_name]:
                self.hierarchical_categories[col_name] = sorted(np.unique(group_keys[:, i]))
                self.logger.debug(f"Hierarchical categories for '{col_name}': {self.hierarchical_categories[col_name]}")
        
        encoded_sequences = []
        for idx, seq in enumerate(X_seq):
            seq_length = seq.shape[0]
            pos_encoding = np.linspace(0, 1, seq_length).reshape(-1, 1)
            if group_keys.shape[1] == 1:
                group_value = group_keys[idx, 0]
                col_name = self.sequence_categorical[0] if isinstance(self.sequence_categorical, list) else self.sequence_categorical
                categories = self.hierarchical_categories[col_name]
                one_hot = np.zeros((seq_length, len(categories)))
                if group_value in categories:
                    one_hot[:, categories.index(group_value)] = 1
                else:
                    self.logger.warning(f"Group key {group_value} not found in categories for '{col_name}'.")
            else:
                one_hot_list = []
                for i in range(group_keys.shape[1]):
                    col_name = self.sequence_categorical[i] if isinstance(self.sequence_categorical, list) else self.sequence_categorical
                    categories = self.hierarchical_categories[col_name]
                    group_value = group_keys[idx, i]
                    one_hot_col = np.zeros((seq_length, len(categories)))
                    if group_value in categories:
                        one_hot_col[:, categories.index(group_value)] = 1
                    else:
                        self.logger.warning(f"Group value {group_value} not found in categories for '{col_name}'.")
                    one_hot_list.append(one_hot_col)
                one_hot = np.concatenate(one_hot_list, axis=1)
        
            seq_encoded = np.concatenate([seq, one_hot, pos_encoding], axis=1)
            encoded_sequences.append(seq_encoded)
        
        if self.time_series_sequence_mode != "variable_length":
            encoded_sequences = np.array(encoded_sequences)
        return encoded_sequences




    def split_dataset(
        self,
        X: pd.DataFrame,
        y: Optional[pd.Series] = None
    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]:
        """
        Split the dataset into training and testing sets while retaining original indices.

        Args:
            X (pd.DataFrame): Features.
            y (Optional[pd.Series]): Target variable.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]: X_train, X_test, y_train, y_test
        """
        step_name = "split_dataset"
        self.logger.info("Step: Split Dataset into Train and Test")

        # Debugging Statements
        self._log(f"Before Split - X shape: {X.shape}", step_name, 'debug')
        if y is not None:
            self._log(f"Before Split - y shape: {y.shape}", step_name, 'debug')
        else:
            self._log("Before Split - y is None", step_name, 'debug')

        # Determine splitting based on mode
        if self.mode == 'train' and self.model_category in ['classification', 'regression']:
            if self.model_category == 'classification':
                stratify = y if self.options.get('split_dataset', {}).get('stratify_for_classification', False) else None
                test_size = self.options.get('split_dataset', {}).get('test_size', 0.2)
                random_state = self.options.get('split_dataset', {}).get('random_state', 42)
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=test_size,
                    stratify=stratify, 
                    random_state=random_state
                )
                self._log("Performed stratified split for classification.", step_name, 'debug')
            elif self.model_category == 'regression':
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=self.options.get('split_dataset', {}).get('test_size', 0.2),
                    random_state=self.options.get('split_dataset', {}).get('random_state', 42)
                )
                self._log("Performed random split for regression.", step_name, 'debug')
        else:
            # For 'predict' and 'clustering' modes or other categories
            X_train = X.copy()
            X_test = None
            y_train = y.copy() if y is not None else None
            y_test = None
            self.logger.info(f"No splitting performed for mode '{self.mode}' or model category '{self.model_category}'.")

        self.preprocessing_steps.append("Split Dataset into Train and Test")

        # Keep Indices Aligned Through Each Step
        if X_test is not None and y_test is not None:
            # Sort both X_test and y_test by index
            X_test = X_test.sort_index()
            y_test = y_test.sort_index()
            self.logger.debug("Sorted X_test and y_test by index for alignment.")

        # Debugging: Log post-split shapes and index alignment
        self._log(f"After Split - X_train shape: {X_train.shape}, X_test shape: {X_test.shape if X_test is not None else 'N/A'}", step_name, 'debug')
        if self.model_category == 'classification' and y_train is not None and y_test is not None:
            self.logger.debug(f"Class distribution in y_train:\n{y_train.value_counts(normalize=True)}")
            self.logger.debug(f"Class distribution in y_test:\n{y_test.value_counts(normalize=True)}")
        elif self.model_category == 'regression' and y_train is not None and y_test is not None:
            self.logger.debug(f"y_train statistics:\n{y_train.describe()}")
            self.logger.debug(f"y_test statistics:\n{y_test.describe()}")

        # Check index alignment
        if y_train is not None and X_train.index.equals(y_train.index):
            self.logger.debug("X_train and y_train indices are aligned.")
        else:
            self.logger.warning("X_train and y_train indices are misaligned.")

        if X_test is not None and y_test is not None and X_test.index.equals(y_test.index):
            self.logger.debug("X_test and y_test indices are aligned.")
        elif X_test is not None and y_test is not None:
            self.logger.warning("X_test and y_test indices are misaligned.")

        return X_train, X_test, y_train, y_test

    def handle_missing_values(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Handle missing values for numerical and categorical features based on user options.
        """
        step_name = "handle_missing_values"
        self.logger.info("Step: Handle Missing Values")

        # Fetch user-defined imputation options or set defaults
        impute_options = self.options.get('handle_missing_values', {})
        numerical_strategy = impute_options.get('numerical_strategy', {})
        categorical_strategy = impute_options.get('categorical_strategy', {})

        # Numerical Imputation
        numerical_imputer = None
        new_columns = []
        if self.numericals:
            if self.model_category in ['regression', 'classification', 'clustering']:
                default_num_strategy = 'median'  # Changed to median as per preprocessor_config.yaml
            else:
                default_num_strategy = 'median'
            num_strategy = numerical_strategy.get('strategy', default_num_strategy)
            num_imputer_type = numerical_strategy.get('imputer', 'SimpleImputer')  # Can be 'SimpleImputer', 'KNNImputer', etc.

            self._log(f"Numerical Imputation Strategy: {num_strategy.capitalize()}, Imputer Type: {num_imputer_type}", step_name, 'debug')

            # Initialize numerical imputer based on user option
            if num_imputer_type == 'SimpleImputer':
                numerical_imputer = SimpleImputer(strategy=num_strategy)
            elif num_imputer_type == 'KNNImputer':
                knn_neighbors = numerical_strategy.get('knn_neighbors', 5)
                numerical_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                self.logger.error(f"Numerical imputer type '{num_imputer_type}' is not supported.")
                raise ValueError(f"Numerical imputer type '{num_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[self.numericals] = numerical_imputer.fit_transform(X_train[self.numericals])
            self.numerical_imputer = numerical_imputer  # Assign to self for saving
            self.feature_reasons.update({col: self.feature_reasons.get(col, '') + f'Numerical: {num_strategy.capitalize()} Imputation | ' for col in self.numericals})
            new_columns.extend(self.numericals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[self.numericals] = numerical_imputer.transform(X_test[self.numericals])

        # Categorical Imputation
        categorical_imputer = None
        all_categoricals = self.ordinal_categoricals + self.nominal_categoricals
        if all_categoricals:
            default_cat_strategy = 'most_frequent'
            cat_strategy = categorical_strategy.get('strategy', default_cat_strategy)
            cat_imputer_type = categorical_strategy.get('imputer', 'SimpleImputer')

            self._log(f"Categorical Imputation Strategy: {cat_strategy.capitalize()}, Imputer Type: {cat_imputer_type}", step_name, 'debug')

            # Initialize categorical imputer based on user option
            if cat_imputer_type == 'SimpleImputer':
                categorical_imputer = SimpleImputer(strategy=cat_strategy)
            elif cat_imputer_type == 'ConstantImputer':
                fill_value = categorical_strategy.get('fill_value', 'Missing')
                categorical_imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
            else:
                self.logger.error(f"Categorical imputer type '{cat_imputer_type}' is not supported.")
                raise ValueError(f"Categorical imputer type '{cat_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[all_categoricals] = categorical_imputer.fit_transform(X_train[all_categoricals])
            self.categorical_imputer = categorical_imputer  # Assign to self for saving
            self.feature_reasons.update({
                col: self.feature_reasons.get(col, '') + (f'Categorical: Constant Imputation (Value={categorical_strategy.get("fill_value", "Missing")}) | ' if cat_imputer_type == 'ConstantImputer' else f'Categorical: {cat_strategy.capitalize()} Imputation | ')
                for col in all_categoricals
            })
            new_columns.extend(all_categoricals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[all_categoricals] = categorical_imputer.transform(X_test[all_categoricals])

        self.preprocessing_steps.append("Handle Missing Values")

        # Debugging: Log post-imputation shapes and missing values
        self._log(f"Completed: Handle Missing Values. Dataset shape after imputation: {X_train.shape}", step_name, 'debug')
        self._log(f"Missing values after imputation in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        self._log(f"New columns handled: {new_columns}", step_name, 'debug')

        return X_train, X_test

    def handle_outliers(self, X_train: pd.DataFrame, y_train: Optional[pd.Series] = None) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """
        Handle outliers based on the model's sensitivity and user options.
        For time_series models, apply a custom outlier handling using a rolling median filter
        to replace extreme values rather than dropping rows (to preserve temporal alignment).

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series, optional): Training target.

        Returns:
            tuple: X_train with outliers handled and corresponding y_train.
        """
        step_name = "handle_outliers"
        self.logger.info("Step: Handle Outliers")
        self._log("Starting outlier handling.", step_name, 'debug')
        debug_flag = self.get_debug_flag('debug_handle_outliers')
        initial_shape = X_train.shape[0]
        outlier_options = self.options.get('handle_outliers', {})
        zscore_threshold = outlier_options.get('zscore_threshold', 3)
        iqr_multiplier = outlier_options.get('iqr_multiplier', 1.5)
        isolation_contamination = outlier_options.get('isolation_contamination', 0.05)

        # ----- NEW: Custom outlier handling branch for time series -----
        if self.model_category == 'time_series':
            self.logger.info("Applying custom outlier handling for time_series using rolling median filter.")
            # For time series, do not drop rows—instead, replace outliers with the rolling median.
            for col in self.numericals:
                # Compute rolling statistics with a window of 5 (centered)
                rolling_median = X_train[col].rolling(window=5, center=True, min_periods=1).median()
                rolling_q1 = X_train[col].rolling(window=5, center=True, min_periods=1).quantile(0.25)
                rolling_q3 = X_train[col].rolling(window=5, center=True, min_periods=1).quantile(0.75)
                rolling_iqr = rolling_q3 - rolling_q1
                # Identify outliers as those deviating more than the multiplier times the rolling IQR
                outlier_mask = abs(X_train[col] - rolling_median) > (iqr_multiplier * rolling_iqr)
                num_outliers = outlier_mask.sum()
                # Replace outlier values with the corresponding rolling median
                X_train.loc[outlier_mask, col] = rolling_median[outlier_mask]
                self.logger.debug(f"Replaced {num_outliers} outliers in column '{col}' with rolling median.")
            self.preprocessing_steps.append("Handle Outliers (time_series custom)")
            self._log(f"Completed: Handle Outliers for time_series. Initial samples: {initial_shape}, Final samples: {X_train.shape[0]}", step_name, 'debug')
            return X_train, y_train
        # -----------------------------------------------------------------

        # Existing outlier handling for regression and classification
        if self.model_category in ['regression', 'classification']:
            self.logger.info(f"Applying univariate outlier detection for {self.model_category}.")
            for col in self.numericals:
                # Z-Score Filtering
                apply_zscore = outlier_options.get('apply_zscore', True)
                if apply_zscore:
                    z_scores = np.abs((X_train[col] - X_train[col].mean()) / X_train[col].std())
                    mask_z = z_scores < zscore_threshold
                    removed_z = (~mask_z).sum()
                    X_train = X_train[mask_z]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with Z-Score Filtering (threshold={zscore_threshold}) | '
                    self._log(f"Removed {removed_z} outliers from '{col}' using Z-Score Filtering.", step_name, 'debug')

                # IQR Filtering
                apply_iqr = outlier_options.get('apply_iqr', True)
                if apply_iqr:
                    Q1 = X_train[col].quantile(0.25)
                    Q3 = X_train[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_multiplier * IQR
                    upper_bound = Q3 + iqr_multiplier * IQR
                    mask_iqr = (X_train[col] >= lower_bound) & (X_train[col] <= upper_bound)
                    removed_iqr = (~mask_iqr).sum()
                    X_train = X_train[mask_iqr]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with IQR Filtering (multiplier={iqr_multiplier}) | '
                    self._log(f"Removed {removed_iqr} outliers from '{col}' using IQR Filtering.", step_name, 'debug')

        elif self.model_category == 'clustering':
            self.logger.info("Applying multivariate IsolationForest for clustering.")
            contamination = isolation_contamination
            iso_forest = IsolationForest(contamination=contamination, random_state=42)
            preds = iso_forest.fit_predict(X_train[self.numericals])
            mask_iso = preds != -1
            removed_iso = (preds == -1).sum()
            X_train = X_train[mask_iso]
            if y_train is not None:
                y_train = y_train.loc[X_train.index]
            self.feature_reasons['all_numericals'] += f'Outliers handled with Multivariate IsolationForest (contamination={contamination}) | '
            self._log(f"Removed {removed_iso} outliers using Multivariate IsolationForest.", step_name, 'debug')
        else:
            self.logger.warning(f"Model category '{self.model_category}' not recognized for outlier handling.")

        self.preprocessing_steps.append("Handle Outliers")
        self._log(f"Completed: Handle Outliers. Initial samples: {initial_shape}, Final samples: {X_train.shape[0]}", step_name, 'debug')
        self._log(f"Missing values after outlier handling in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        return X_train, y_train


    def test_normality(self, X_train: pd.DataFrame) -> Dict[str, Dict]:
        """
        Test normality for numerical features based on normality tests and user options.

        Args:
            X_train (pd.DataFrame): Training features.

        Returns:
            Dict[str, Dict]: Dictionary with normality test results for each numerical feature.
        """
        step_name = "Test for Normality"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_test_normality')
        normality_results = {}

        # Fetch user-defined normality test options or set defaults
        normality_options = self.options.get('test_normality', {})
        p_value_threshold = normality_options.get('p_value_threshold', 0.05)
        skewness_threshold = normality_options.get('skewness_threshold', 1.0)
        additional_tests = normality_options.get('additional_tests', [])  # e.g., ['anderson-darling']

        for col in self.numericals:
            data = X_train[col].dropna()
            skewness = data.skew()
            kurtosis = data.kurtosis()

            # Determine which normality test to use based on sample size and user options
            test_used = 'Shapiro-Wilk'
            p_value = 0.0

            if len(data) <= 5000:
                from scipy.stats import shapiro
                stat, p_val = shapiro(data)
                test_used = 'Shapiro-Wilk'
                p_value = p_val
            else:
                from scipy.stats import anderson
                result = anderson(data)
                test_used = 'Anderson-Darling'
                # Determine p-value based on critical values
                p_value = 0.0  # Default to 0
                for cv, sig in zip(result.critical_values, result.significance_level):
                    if result.statistic < cv:
                        p_value = sig / 100
                        break

            # Apply user-defined or default criteria
            if self.model_category in ['regression', 'classification', 'clustering']:
                # Linear, Logistic Regression, and Clustering: Use p-value and skewness
                needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
            else:
                # Other models: Use skewness, and optionally p-values based on options
                use_p_value = normality_options.get('use_p_value_other_models', False)
                if use_p_value:
                    needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
                else:
                    needs_transform = abs(skewness) > skewness_threshold

            normality_results[col] = {
                'skewness': skewness,
                'kurtosis': kurtosis,
                'p_value': p_value,
                'test_used': test_used,
                'needs_transform': needs_transform
            }

            # Conditional Detailed Logging
            if debug_flag:
                self._log(f"Feature '{col}': p-value={p_value:.4f}, skewness={skewness:.4f}, needs_transform={needs_transform}", step_name, 'debug')

        self.normality_results = normality_results
        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Normality results computed.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Normality results computed.")

        return normality_results

    def encode_categorical_variables(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Encode categorical variables using user-specified encoding strategies.
        """
        step_name = "encode_categorical_variables"
        self.logger.info("Step: Encode Categorical Variables")
        self._log("Starting categorical variable encoding.", step_name, 'debug')

        # Fetch user-defined encoding options or set defaults
        encoding_options = self.options.get('encode_categoricals', {})
        ordinal_encoding = encoding_options.get('ordinal_encoding', 'OrdinalEncoder')  # Options: 'OrdinalEncoder', 'None'
        nominal_encoding = encoding_options.get('nominal_encoding', 'OneHotEncoder')  # Changed from 'OneHotEncoder' to 'OrdinalEncoder'
        handle_unknown = encoding_options.get('handle_unknown', 'use_encoded_value')  # Adjusted for OrdinalEncoder

        # Determine if SMOTENC is being used
        smote_variant = self.options.get('implement_smote', {}).get('variant', None)
        if smote_variant == 'SMOTENC':
            nominal_encoding = 'OrdinalEncoder'  # Ensure compatibility

        transformers = []
        new_columns = []
        if self.ordinal_categoricals and ordinal_encoding != 'None':
            if ordinal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('ordinal', OrdinalEncoder(), self.ordinal_categoricals)
                )
                self._log(f"Added OrdinalEncoder for features: {self.ordinal_categoricals}", step_name, 'debug')
            else:
                self.logger.error(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
                raise ValueError(f"Ordinal encoding method '{ordinal_encoding}' is not supported.")
        if self.nominal_categoricals and nominal_encoding != 'None':
            if nominal_encoding == 'OrdinalEncoder':
                transformers.append(
                    ('nominal', OrdinalEncoder(handle_unknown=handle_unknown), self.nominal_categoricals)
                )
                self._log(f"Added OrdinalEncoder for features: {self.nominal_categoricals}", step_name, 'debug')
            elif nominal_encoding == 'FrequencyEncoder':
                # Custom Frequency Encoding
                for col in self.nominal_categoricals:
                    freq = X_train[col].value_counts(normalize=True)
                    X_train[col] = X_train[col].map(freq)
                    if X_test is not None:
                        X_test[col] = X_test[col].map(freq).fillna(0)
                    self.feature_reasons[col] += 'Encoded with Frequency Encoding | '
                    self._log(f"Applied Frequency Encoding to '{col}'.", step_name, 'debug')
            else:
                self.logger.error(f"Nominal encoding method '{nominal_encoding}' is not supported.")
                raise ValueError(f"Nominal encoding method '{nominal_encoding}' is not supported.")

        if not transformers and 'FrequencyEncoder' not in nominal_encoding:
            self.logger.info("No categorical variables to encode.")
            self.preprocessing_steps.append("Encode Categorical Variables")
            self._log(f"Completed: Encode Categorical Variables. No encoding was applied.", step_name, 'debug')
            return X_train, X_test

        if transformers:
            self.preprocessor = ColumnTransformer(
                transformers=transformers,
                remainder='passthrough',
                verbose_feature_names_out=False  # Disable prefixing
            )

            # Fit and transform training data
            X_train_encoded = self.preprocessor.fit_transform(X_train)
            self._log("Fitted and transformed X_train with ColumnTransformer.", step_name, 'debug')

            # Transform testing data
            if X_test is not None:
                X_test_encoded = self.preprocessor.transform(X_test)
                self._log("Transformed X_test with fitted ColumnTransformer.", step_name, 'debug')
            else:
                X_test_encoded = None

            # Retrieve feature names after encoding
            encoded_feature_names = []
            if self.ordinal_categoricals and ordinal_encoding == 'OrdinalEncoder':
                encoded_feature_names += self.ordinal_categoricals
            if self.nominal_categoricals and nominal_encoding == 'OrdinalEncoder':
                encoded_feature_names += self.nominal_categoricals
            elif self.nominal_categoricals and nominal_encoding == 'FrequencyEncoder':
                encoded_feature_names += self.nominal_categoricals
            passthrough_features = [col for col in X_train.columns if col not in self.ordinal_categoricals + self.nominal_categoricals]
            encoded_feature_names += passthrough_features
            new_columns.extend(encoded_feature_names)

            # Convert numpy arrays back to DataFrames
            X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names, index=X_train.index)
            if X_test_encoded is not None:
                X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names, index=X_test.index)
            else:
                X_test_encoded_df = None

            # Store encoders for inverse transformation
            self.ordinal_encoder = self.preprocessor.named_transformers_.get('ordinal', None)
            self.nominal_encoder = self.preprocessor.named_transformers_.get('nominal', None)

            self.preprocessing_steps.append("Encode Categorical Variables")
            self._log(f"Completed: Encode Categorical Variables. X_train_encoded shape: {X_train_encoded_df.shape}", step_name, 'debug')
            self._log(f"Columns after encoding: {encoded_feature_names}", step_name, 'debug')
            self._log(f"Sample of encoded X_train:\n{X_train_encoded_df.head()}", step_name, 'debug')
            self._log(f"New columns added: {new_columns}", step_name, 'debug')

            return X_train_encoded_df, X_test_encoded_df

    def generate_recommendations(self) -> pd.DataFrame:
        """
        Generate a table of preprocessing recommendations based on the model type, data, and user options.

        Returns:
            pd.DataFrame: DataFrame containing recommendations for each feature.
        """
        step_name = "Generate Preprocessor Recommendations"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_generate_recommendations')

        # Generate recommendations based on feature reasons
        recommendations = {}
        for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals:
            reasons = self.feature_reasons.get(col, '').strip(' | ')
            recommendations[col] = reasons

        recommendations_table = pd.DataFrame.from_dict(
            recommendations, 
            orient='index', 
            columns=['Preprocessing Reason']
        )
        if debug_flag:
            self.logger.debug(f"Preprocessing Recommendations:\n{recommendations_table}")
        else:
            self.logger.info("Preprocessing Recommendations generated.")

        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Recommendations generated.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Recommendations generated.")

        return recommendations_table

    def save_transformers(self):
        step_name = "Save Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_save_transformers')
        
        # Ensure the transformers directory exists
        os.makedirs(self.transformers_dir, exist_ok=True)
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')  # Consistent file path
        
        transformers = {
            'numerical_imputer': getattr(self, 'numerical_imputer', None),
            'categorical_imputer': getattr(self, 'categorical_imputer', None),
            'preprocessor': self.pipeline,   # Includes all preprocessing steps
            'smote': self.smote,
            'final_feature_order': self.final_feature_order,
            'categorical_indices': self.categorical_indices
        }
        try:
            joblib.dump(transformers, transformers_path)
            if debug_flag:
                self._log(f"Transformers saved at '{transformers_path}'.", step_name, 'debug')
            else:
                self.logger.info(f"Transformers saved at '{transformers_path}'.")
        except Exception as e:
            self.logger.error(f"❌ Failed to save transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)

    def load_transformers(self) -> dict:
        step_name = "Load Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_load_transformers')  # Assuming a step-specific debug flag
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')  # Correct path

        # Debug log
        self.logger.debug(f"Loading transformers from: {transformers_path}")

        if not os.path.exists(transformers_path):
            self.logger.error(f"❌ Transformers file not found at '{transformers_path}'. Cannot proceed with prediction.")
            raise FileNotFoundError(f"Transformers file not found at '{transformers_path}'.")

        try:
            transformers = joblib.load(transformers_path)

            # Extract transformers
            numerical_imputer = transformers.get('numerical_imputer')
            categorical_imputer = transformers.get('categorical_imputer')
            preprocessor = transformers.get('preprocessor')
            smote = transformers.get('smote', None)
            final_feature_order = transformers.get('final_feature_order', [])
            categorical_indices = transformers.get('categorical_indices', [])
            self.categorical_indices = categorical_indices  # Set the attribute

            # **Post-Loading Debugging:**
            if preprocessor is not None:
                try:
                    # Do not attempt to transform dummy data here
                    self.logger.debug(f"Pipeline loaded. Ready to transform new data.")
                except AttributeError as e:
                    self.logger.error(f"Pipeline's get_feature_names_out is not available: {e}")
                    expected_features = []
            else:
                self.logger.error("❌ Preprocessor is not loaded.")
                raise AttributeError("Preprocessor is not loaded.")

        except Exception as e:
            self.logger.error(f"❌ Failed to load transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)

        # Additional checks
        if preprocessor is None:
            self.logger.error("❌ Preprocessor is not loaded.")

        if debug_flag:
            self._log(f"Transformers loaded successfully from '{transformers_path}'.", step_name, 'debug')
        else:
            self.logger.info(f"Transformers loaded successfully from '{transformers_path}'.")

        # Set the pipeline
        self.pipeline = preprocessor

        # Return the transformers as a dictionary
        return {
            'numerical_imputer': numerical_imputer,
            'categorical_imputer': categorical_imputer,
            'preprocessor': preprocessor,
            'smote': smote,
            'final_feature_order': final_feature_order,
            'categorical_indices': categorical_indices
        }

    def apply_scaling(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Apply scaling based on the model type and user options.

        Args:
            X_train (pd.DataFrame): Training features.
            X_test (Optional[pd.DataFrame]): Testing features.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame]]: Scaled X_train and X_test.
        """
        step_name = "Apply Scaling (If Needed by Model)"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_apply_scaling')

        # Fetch user-defined scaling options or set defaults
        scaling_options = self.options.get('apply_scaling', {})
        scaling_method = scaling_options.get('method', None)  # 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'None'
        features_to_scale = scaling_options.get('features', self.numericals)

        scaler = None
        scaling_type = 'None'

        if scaling_method is None:
            # Default scaling based on model category
            if self.model_category in ['regression', 'classification', 'clustering']:
                # For clustering, MinMaxScaler is generally preferred
                if self.model_category == 'clustering':
                    scaler = MinMaxScaler()
                    scaling_type = 'MinMaxScaler'
                else:
                    scaler = StandardScaler()
                    scaling_type = 'StandardScaler'
            else:
                scaler = None
                scaling_type = 'None'
        else:
            # Normalize the scaling_method string to handle case-insensitivity
            scaling_method_normalized = scaling_method.lower()
            if scaling_method_normalized == 'standardscaler':
                scaler = StandardScaler()
                scaling_type = 'StandardScaler'
            elif scaling_method_normalized == 'minmaxscaler':
                scaler = MinMaxScaler()
                scaling_type = 'MinMaxScaler'
            elif scaling_method_normalized == 'robustscaler':
                scaler = RobustScaler()
                scaling_type = 'RobustScaler'
            elif scaling_method_normalized == 'none':
                scaler = None
                scaling_type = 'None'
            else:
                self.logger.error(f"Scaling method '{scaling_method}' is not supported.")
                raise ValueError(f"Scaling method '{scaling_method}' is not supported.")

        # Apply scaling if scaler is defined
        if scaler is not None and features_to_scale:
            self.scaler = scaler
            if debug_flag:
                self._log(f"Features to scale: {features_to_scale}", step_name, 'debug')

            # Check if features exist in the dataset
            missing_features = [feat for feat in features_to_scale if feat not in X_train.columns]
            if missing_features:
                self.logger.error(f"The following features specified for scaling are missing in the dataset: {missing_features}")
                raise KeyError(f"The following features specified for scaling are missing in the dataset: {missing_features}")

            X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
            if X_test is not None:
                X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

            for col in features_to_scale:
                self.feature_reasons[col] += f'Scaling Applied: {scaling_type} | '

            self.preprocessing_steps.append(step_name)
            if debug_flag:
                self._log(f"Applied {scaling_type} to features: {features_to_scale}", step_name, 'debug')
                if hasattr(scaler, 'mean_'):
                    self._log(f"Scaler Parameters: mean={scaler.mean_}", step_name, 'debug')
                if hasattr(scaler, 'scale_'):
                    self._log(f"Scaler Parameters: scale={scaler.scale_}", step_name, 'debug')
                self._log(f"Sample of scaled X_train:\n{X_train[features_to_scale].head()}", step_name, 'debug')
                if X_test is not None:
                    self._log(f"Sample of scaled X_test:\n{X_test[features_to_scale].head()}", step_name, 'debug')
            else:
                self.logger.info(f"Step '{step_name}' completed: Applied {scaling_type} to features: {features_to_scale}")
        else:
            self.logger.info("No scaling applied based on user options or no features specified.")
            self.preprocessing_steps.append(step_name)
            if debug_flag:
                self._log(f"Completed: {step_name}. No scaling was applied.", step_name, 'debug')
            else:
                self.logger.info(f"Step '{step_name}' completed: No scaling was applied.")

        return X_train, X_test

    def determine_n_neighbors(self, minority_count: int, default_neighbors: int = 5) -> int:
        """
        Determine the appropriate number of neighbors for SMOTE based on minority class size.

        Args:
            minority_count (int): Number of samples in the minority class.
            default_neighbors (int): Default number of neighbors to use if possible.

        Returns:
            int: Determined number of neighbors for SMOTE.
        """
        if minority_count <= 1:
            raise ValueError("SMOTE cannot be applied when the minority class has less than 2 samples.")
        
        # Ensure n_neighbors does not exceed minority_count - 1
        n_neighbors = min(default_neighbors, minority_count - 1)
        return n_neighbors

    def implement_smote(self, X_train: pd.DataFrame, y_train: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Implement SMOTE or its variants based on class imbalance with automated n_neighbors selection.

        Args:
            X_train (pd.DataFrame): Training features (transformed).
            y_train (pd.Series): Training target.

        Returns:
            Tuple[pd.DataFrame, pd.Series]: Resampled X_train and y_train.
        """
        step_name = "Implement SMOTE (Train Only)"
        self.logger.info(f"Step: {step_name}")

        # Check if classification
        if self.model_category != 'classification':
            self.logger.info("SMOTE not applicable: Not a classification model.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        # Calculate class distribution
        class_counts = y_train.value_counts()
        if len(class_counts) < 2:
            self.logger.warning("SMOTE not applicable: Only one class present.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        majority_class = class_counts.idxmax()
        minority_class = class_counts.idxmin()
        majority_count = class_counts.max()
        minority_count = class_counts.min()
        imbalance_ratio = minority_count / majority_count
        self.logger.info(f"Class Distribution before SMOTE: {class_counts.to_dict()}")
        self.logger.info(f"Imbalance Ratio (Minority/Majority): {imbalance_ratio:.4f}")

        # Determine SMOTE variant based on dataset composition
        has_numericals = len(self.numericals) > 0
        has_categoricals = len(self.ordinal_categoricals) + len(self.nominal_categoricals) > 0

        # Automatically select SMOTE variant
        if has_numericals and has_categoricals:
            smote_variant = 'SMOTENC'
            self.logger.info("Dataset contains both numerical and categorical features. Using SMOTENC.")
        elif has_numericals and not has_categoricals:
            smote_variant = 'SMOTE'
            self.logger.info("Dataset contains only numerical features. Using SMOTE.")
        elif has_categoricals and not has_numericals:
            smote_variant = 'SMOTEN'
            self.logger.info("Dataset contains only categorical features. Using SMOTEN.")
        else:
            smote_variant = 'SMOTE'  # Fallback
            self.logger.info("Feature composition unclear. Using SMOTE as default.")

        # Initialize SMOTE based on the variant
        try:
            if smote_variant == 'SMOTENC':
                if not self.categorical_indices:
                    # Determine categorical indices if not already set
                    categorical_features = []
                    for name, transformer, features in self.pipeline.transformers_:
                        if 'ord' in name or 'nominal' in name:
                            if isinstance(transformer, Pipeline):
                                encoder = transformer.named_steps.get('ordinal_encoder') or transformer.named_steps.get('onehot_encoder')
                                if hasattr(encoder, 'categories_'):
                                    # Calculate indices based on transformers order
                                    # This can be complex; for simplicity, assuming categorical features are the first
                                    categorical_features.extend(range(len(features)))
                    self.categorical_indices = categorical_features
                    self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTENC(categorical_features=self.categorical_indices, random_state=42, k_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTENC with categorical features indices: {self.categorical_indices} and n_neighbors={n_neighbors}")
            elif smote_variant == 'SMOTEN':
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTEN(random_state=42, n_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTEN with n_neighbors={n_neighbors}")
            else:
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTE(random_state=42, k_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTE with n_neighbors={n_neighbors}")
        except ValueError as ve:
            self.logger.error(f"❌ SMOTE initialization failed: {ve}")
            raise
        except Exception as e:
            self.logger.error(f"❌ Unexpected error during SMOTE initialization: {e}")
            raise

        # Apply SMOTE
        try:
            X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
            self.logger.info(f"Applied {smote_variant}. Resampled dataset shape: {X_resampled.shape}")
            self.preprocessing_steps.append("Implement SMOTE")
            self.smote = smote  # Assign to self for saving
            self.logger.debug(f"Selected n_neighbors for SMOTE: {n_neighbors}")
            return X_resampled, y_resampled
        except Exception as e:
            self.logger.error(f"❌ SMOTE application failed: {e}")
            raise

    def inverse_transform_data(self, X_transformed: np.ndarray, original_data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Perform inverse transformation on the transformed data to reconstruct original feature values.

        Args:
            X_transformed (np.ndarray): The transformed feature data.
            original_data (Optional[pd.DataFrame]): The original data before transformation.

        Returns:
            pd.DataFrame: The inverse-transformed DataFrame including passthrough columns.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")

        preprocessor = self.pipeline
        logger = logging.getLogger('InverseTransform')
        if self.debug or self.get_debug_flag('debug_final_inverse_transformations'):
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

        logger.debug(f"[DEBUG Inverse] Starting inverse transformation. Input shape: {X_transformed.shape}")

        # Initialize variables
        inverse_data = {}
        transformations_applied = False  # Flag to check if any transformations are applied
        start_idx = 0  # Starting index for slicing

        # Iterate over each transformer in the ColumnTransformer
        for name, transformer, features in preprocessor.transformers_:
            if name == 'remainder':
                logger.debug(f"[DEBUG Inverse] Skipping 'remainder' transformer (passthrough columns).")
                continue  # Skip passthrough columns

            end_idx = start_idx + len(features)
            logger.debug(f"[DEBUG Inverse] Transformer '{name}' handling features {features} with slice {start_idx}:{end_idx}")

            # Check if the transformer has an inverse_transform method
            if hasattr(transformer, 'named_steps'):
                # Access the last step in the pipeline (e.g., scaler or encoder)
                last_step = list(transformer.named_steps.keys())[-1]
                inverse_transformer = transformer.named_steps[last_step]

                if hasattr(inverse_transformer, 'inverse_transform'):
                    transformed_slice = X_transformed[:, start_idx:end_idx]
                    inverse_slice = inverse_transformer.inverse_transform(transformed_slice)

                    # Assign inverse-transformed data to the corresponding feature names
                    for idx, feature in enumerate(features):
                        inverse_data[feature] = inverse_slice[:, idx]

                    logger.debug(f"[DEBUG Inverse] Applied inverse_transform on transformer '{last_step}' for features {features}.")
                    transformations_applied = True
                else:
                    logger.debug(f"[DEBUG Inverse] Transformer '{last_step}' does not support inverse_transform. Skipping.")
            else:
                logger.debug(f"[DEBUG Inverse] Transformer '{name}' does not have 'named_steps'. Skipping.")

            start_idx = end_idx  # Update starting index for next transformer

        # Convert the inverse_data dictionary to a DataFrame
        if transformations_applied:
            inverse_df = pd.DataFrame(inverse_data, index=original_data.index if original_data is not None else None)
            logger.debug(f"[DEBUG Inverse] Inverse DataFrame shape (transformed columns): {inverse_df.shape}")
            logger.debug(f"[DEBUG Inverse] Sample of inverse-transformed data:\n{inverse_df.head()}")
        else:
            if original_data is not None:
                logger.warning("⚠️ No reversible transformations were applied. Returning original data.")
                inverse_df = original_data.copy()
                logger.debug(f"[DEBUG Inverse] Returning a copy of original_data with shape: {inverse_df.shape}")
            else:
                logger.error("❌ No transformations were applied and original_data was not provided. Cannot perform inverse transformation.")
                raise ValueError("No transformations were applied and original_data was not provided.")

        # Identify passthrough columns by excluding transformed features
        if original_data is not None and transformations_applied:
            transformed_features = set(inverse_data.keys())
            all_original_features = set(original_data.columns)
            passthrough_columns = list(all_original_features - transformed_features)
            logger.debug(f"[DEBUG Inverse] Inverse DataFrame columns before pass-through merge: {inverse_df.columns.tolist()}")
            logger.debug(f"[DEBUG Inverse] all_original_features: {list(all_original_features)}")
            logger.debug(f"[DEBUG Inverse] passthrough_columns: {passthrough_columns}")

            if passthrough_columns:
                logger.debug(f"[DEBUG Inverse] Passthrough columns to merge: {passthrough_columns}")
                passthrough_data = original_data[passthrough_columns].copy()
                inverse_df = pd.concat([inverse_df, passthrough_data], axis=1)

                # Ensure the final DataFrame has the same column order as original_data
                inverse_df = inverse_df[original_data.columns]
                logger.debug(f"[DEBUG Inverse] Final inverse DataFrame shape: {inverse_df.shape}")
                
                # Check for missing columns after inverse transform
                expected_columns = set(original_data.columns)
                final_columns = set(inverse_df.columns)
                missing_after_inverse = expected_columns - final_columns

                if missing_after_inverse:
                    err_msg = (
                    f"Inverse transform error: The following columns are missing "
                    f"after inverse transform: {missing_after_inverse}"
                    )
                    logger.error(err_msg)
                    raise ValueError(err_msg)
            else:
                logger.debug("[DEBUG Inverse] No passthrough columns to merge.")
        else:
            logger.debug("[DEBUG Inverse] Either no original_data provided or no transformations were applied.")

        return inverse_df



    def build_pipeline(self, X_train: pd.DataFrame) -> ColumnTransformer:
        transformers = []

        # Handle Numerical Features
        if self.numericals:
            numerical_strategy = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('strategy', 'median')
            numerical_imputer = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('imputer', 'SimpleImputer')

            if numerical_imputer == 'SimpleImputer':
                num_imputer = SimpleImputer(strategy=numerical_strategy)
            elif numerical_imputer == 'KNNImputer':
                knn_neighbors = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('knn_neighbors', 5)
                num_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                raise ValueError(f"Unsupported numerical imputer type: {numerical_imputer}")

            # Determine scaling method
            scaling_method = self.options.get('apply_scaling', {}).get('method', None)
            if scaling_method is None:
                # Default scaling based on model category
                if self.model_category in ['regression', 'classification', 'clustering']:
                    # For clustering, MinMaxScaler is generally preferred
                    if self.model_category == 'clustering':
                        scaler = MinMaxScaler()
                        scaling_type = 'MinMaxScaler'
                    else:
                        scaler = StandardScaler()
                        scaling_type = 'StandardScaler'
                else:
                    scaler = 'passthrough'
                    scaling_type = 'None'
            else:
                # Normalize the scaling_method string to handle case-insensitivity
                scaling_method_normalized = scaling_method.lower()
                if scaling_method_normalized == 'standardscaler':
                    scaler = StandardScaler()
                    scaling_type = 'StandardScaler'
                elif scaling_method_normalized == 'minmaxscaler':
                    scaler = MinMaxScaler()
                    scaling_type = 'MinMaxScaler'
                elif scaling_method_normalized == 'robustscaler':
                    scaler = RobustScaler()
                    scaling_type = 'RobustScaler'
                elif scaling_method_normalized == 'none':
                    scaler = 'passthrough'
                    scaling_type = 'None'
                else:
                    raise ValueError(f"Unsupported scaling method: {scaling_method}")

            numerical_transformer = Pipeline(steps=[
                ('imputer', num_imputer),
                ('scaler', scaler)
            ])

            transformers.append(('num', numerical_transformer, self.numericals))
            self.logger.debug(f"Numerical transformer added with imputer '{numerical_imputer}' and scaler '{scaling_type}'.")

        # Handle Ordinal Categorical Features
        if self.ordinal_categoricals:
            ordinal_strategy = self.options.get('encode_categoricals', {}).get('ordinal_encoding', 'OrdinalEncoder')
            if ordinal_strategy == 'OrdinalEncoder':
                ordinal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('ordinal_encoder', OrdinalEncoder())
                ])
                transformers.append(('ord', ordinal_transformer, self.ordinal_categoricals))
                self.logger.debug("Ordinal transformer added with OrdinalEncoder.")
            else:
                raise ValueError(f"Unsupported ordinal encoding strategy: {ordinal_strategy}")

        # Handle Nominal Categorical Features
        if self.nominal_categoricals:
            nominal_strategy = self.options.get('encode_categoricals', {}).get('nominal_encoding', 'OneHotEncoder')
            if nominal_strategy == 'OneHotEncoder':
                nominal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
                ])
                transformers.append(('nominal', nominal_transformer, self.nominal_categoricals))
                self.logger.debug("Nominal transformer added with OneHotEncoder.")
            elif nominal_strategy == 'OrdinalEncoder':
                nominal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('ordinal_encoder', OrdinalEncoder())
                ])
                transformers.append(('nominal_ord', nominal_transformer, self.nominal_categoricals))
                self.logger.debug("Nominal transformer added with OrdinalEncoder.")
            elif nominal_strategy == 'FrequencyEncoder':
                # Implement custom Frequency Encoding
                for feature in self.nominal_categoricals:
                    freq = X_train[feature].value_counts(normalize=True)
                    X_train[feature] = X_train[feature].map(freq)
                    self.feature_reasons[feature] += 'Frequency Encoding applied | '
                    self.logger.debug(f"Frequency Encoding applied to '{feature}'.")
            else:
                raise ValueError(f"Unsupported nominal encoding strategy: {nominal_strategy}")

        if not transformers and 'FrequencyEncoder' not in nominal_strategy:
            self.logger.error("No transformers added to the pipeline. Check feature categorization and configuration.")
            raise ValueError("No transformers added to the pipeline. Check feature categorization and configuration.")

        preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
        self.logger.debug("ColumnTransformer constructed with the following transformers:")
        for t in transformers:
            self.logger.debug(t)

        preprocessor.fit(X_train)
        self.logger.info("✅ Preprocessor fitted on training data.")

        # Determine categorical feature indices for SMOTENC if needed
        if self.options.get('implement_smote', {}).get('variant', None) == 'SMOTENC':
            if not self.categorical_indices:
                categorical_features = []
                for name, transformer, features in preprocessor.transformers_:
                    if 'ord' in name or 'nominal' in name:
                        if isinstance(transformer, Pipeline):
                            encoder = transformer.named_steps.get('ordinal_encoder') or transformer.named_steps.get('onehot_encoder')
                            if hasattr(encoder, 'categories_'):
                                # Calculate indices based on transformers order
                                # This can be complex; for simplicity, assuming categorical features are the first
                                categorical_features.extend(range(len(features)))
                self.categorical_indices = categorical_features
                self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")

        return preprocessor

    def phase_scaling(self, df: pd.DataFrame, numeric_cols: List[str], group_column: str) -> Tuple[pd.DataFrame, Dict]:
        """
        Normalize numeric features within each group (e.g. each phase) using RobustScaler.
        Logs summary statistics before and after scaling.

        Args:
            df (pd.DataFrame): Input DataFrame.
            numeric_cols (List[str]): List of numeric columns to scale.
            group_column (str): The column used for grouping (e.g., 'phase').

        Returns:
            Tuple[pd.DataFrame, Dict]: The DataFrame with scaled values and a dictionary of fitted scalers per group.
        """
        from sklearn.preprocessing import RobustScaler

        scalers = {}
        groups = df[group_column].unique()
        self.logger.info(f"Starting phase-aware normalization on column '{group_column}' for groups: {groups}")
        for grp in groups:
            phase_mask = df[group_column] == grp
            df_grp = df.loc[phase_mask, numeric_cols]
            # Log before scaling
            self.logger.debug(f"Before scaling for group '{grp}':\n{df_grp.describe()}")
            scaler = RobustScaler().fit(df_grp)
            df.loc[phase_mask, numeric_cols] = scaler.transform(df_grp)
            scalers[grp] = scaler
            # Log after scaling
            self.logger.debug(f"After scaling for group '{grp}':\n{df.loc[phase_mask, numeric_cols].describe()}")
        return df, scalers


    @staticmethod
    def calculate_phase_window(phase_data: pd.DataFrame, base_size: int = 100, std_dev: int = 2) -> int:
        """
        Estimate an optimal window size for a given phase based on its duration statistics.
        Clamps the result between base_size and an upper limit (here 300).
        """
        # Assuming 'pitch_trial_id' exists to group duration lengths
        durations = phase_data.groupby('pitch_trial_id').size()
        avg = durations.mean()
        std = durations.std()
        window_size = int(np.clip(avg + std_dev * std, base_size, 300))
        return window_size



    def check_target_alignment(self, X_seq: Any, y_seq: Any, horizon: int) -> bool:
        """
        Verify that for each sequence the target length matches expectations.
        For 'set_window' mode, the target should have 'horizon' rows;
        otherwise, it should equal the sequence length.
        """
        for idx, (seq, target) in enumerate(zip(X_seq, y_seq)):
            seq_length = seq.shape[0] if hasattr(seq, 'shape') else len(seq)
            expected_length = horizon if self.time_series_sequence_mode == "set_window" else seq_length
            actual_length = target.shape[0] if hasattr(target, 'shape') else len(target)
            self.logger.debug(
                f"Sequence {idx}: sequence length = {seq_length}, expected target length = {expected_length}, actual target length = {actual_length}"
            )
            if actual_length != expected_length:
                self.logger.error(
                    f"Alignment error in sequence {idx}: expected target length {expected_length} but got {actual_length}"
                )
                return False
        return True







    @staticmethod
    def validate_phase_transitions(sequences: list, phase_column: str, valid_transitions: Dict[str, List[str]]) -> bool:
        """
        Check that sequences contain only biomechanically valid phase transitions.
        Returns True if the error rate is below a defined tolerance.
        """
        errors = 0
        total_checks = 0
        for seq in sequences:
            # Extract phases from the sequence (assumes seq is an array or DataFrame)
            phases = pd.Series(seq[:, phase_column]) if isinstance(seq, np.ndarray) else seq[phase_column]
            phases = phases.unique()
            for i in range(len(phases) - 1):
                current = phases[i]
                next_phase = phases[i + 1]
                total_checks += 1
                if next_phase not in valid_transitions.get(current, []):
                    errors += 1
        tolerance = 0.01  # less than 1% error allowed
        if total_checks == 0:
            return True
        error_rate = errors / total_checks
        return error_rate < tolerance



    # ---------------------------------------------------------------------
    def preprocess_time_series(self, data: pd.DataFrame) -> Tuple[Any, None, Any, None, pd.DataFrame, None]:
        """
        Preprocess data specifically for time series models.
        Steps:
        1. Handle missing values and outliers.
        2. Sort data by time.
        3. Optionally apply phase-aware normalization.
        4. Group data by top-level sequences.
        5. For each group, segment into sub-phases.
        6. Align each sub-phase using _align_phase.
        7. Validate overall sequence lengths and filter sequences.
        8. Optionally apply SMOTE-TS.
        9. Generate recommendations and save transformers.
        """
        # 1. Missing values and outlier handling
        data_clean, _ = self.handle_missing_values(data)
        X_temp = data_clean.drop(columns=self.y_variable)
        y_temp = data_clean[self.y_variable]
        X_temp, y_temp = self.handle_outliers(X_temp, y_temp)
        data_clean = pd.concat([X_temp, y_temp], axis=1)
        
        # 2. Sort by time column
        if self.time_column is None:
            raise ValueError("For time series models, 'time_column' must be specified.")
        data_clean['__time__'] = pd.to_datetime(data_clean[self.time_column])
        data_sorted = data_clean.sort_values(by='__time__').drop(columns=['__time__'])
        
        # 3. Optionally apply phase-aware normalization
        phase_norm_opts = self.options.get('phase_aware_normalization', {})
        if phase_norm_opts.get('enabled', False):
            group_col = phase_norm_opts.get('group_column', 'phase')
            num_cols = phase_norm_opts.get('numeric_columns', self.numericals)
            self.logger.info(f"Phase-aware normalization enabled on group '{group_col}'.")
            data_sorted, _ = self.phase_scaling(data_sorted, num_cols, group_col)
        
        # 4. Split features and target
        X_clean = data_sorted.drop(columns=self.y_variable)
        y_clean = data_sorted[self.y_variable]
        
        # 5. Build and fit the preprocessing pipeline
        self.pipeline = self.build_pipeline(X_clean)
        X_preprocessed = self.pipeline.fit_transform(X_clean)
        
    # 6. Group by top-level sequences and segment sub-phases
    if self.sequence_categorical is not None:
        grouped = self._group_top_level(data_sorted)
        aligned_groups = {}
        for group_key, group_data in grouped:
            subphases = self._segment_subphases(group_data)
            aligned_subphases = {}
            # --- Determine the "Longest" Phase ---
            target_length = max([len(phase) for phase in subphases.values()])
            self.logger.debug(f"Longest sub-phase: {target_length} (phase: {max(subphases, key=lambda x: len(subphases[x]))})")
            # Align each sub-phase and plot alignment.
            for phase, phase_data in subphases.items():
                try:
                    aligned = self._align_phase(phase_data, target_length)
                    aligned_subphases[phase] = aligned
                    self._debug_plot_alignment(phase_data, aligned, phase, group_key)
                except Exception as e:
                    self.logger.error(f"Alignment failed for group {group_key}, phase {phase}: {e}")
                    aligned_subphases[phase] = None
            # Verify that all sub-phases were aligned.
            valid = all(aligned is not None for aligned in aligned_subphases.values())
            if not valid:
                self.logger.warning(f"Skipping invalid group {group_key}")
                continue
            aligned_groups[group_key] = aligned_subphases
        # Validate and dynamically filter sequences.
        validated = self._validate_sequences(aligned_groups)
        filtered = self._filter_sequences(validated)
        # Optionally apply SMOTE-TS.
        final_aligned = self._apply_smote_ts(filtered)
        X_seq = final_aligned
        y_seq = None
    elif self.time_series_sequence_mode == "set_window":
        X_seq, y_seq = self.create_sequences(X_preprocessed, y_clean.values)
    else:
        raise ValueError(f"Invalid time_series_sequence_mode: {self.time_series_sequence_mode}")
    
    # 9. Validate target alignment.
    if y_seq is not None and not self.check_target_alignment(X_seq, y_seq, self.horizon):
        self.logger.warning("Target alignment check failed: Some sequences may not have matching target lengths.")
    
    # NEW: Flag extreme Follow-Throughs using the metadata from all groups.
    self._flag_extreme_phases(self.follow_through_stats)
    # Log the top 5 longest Follow-Through durations.
    self._log_top_outliers()
    # 10. Generate recommendations and save transformers.
    recommendations = self.generate_recommendations()
    self.final_feature_order = list(self.pipeline.get_feature_names_out())
    self.save_transformers()
    
    # NEW: Post-processing report.
    self.post_processing_report()
    
    return X_seq, None, y_seq, None, recommendations, None




    def preprocess_train(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess training data for various model types.
        For time series models, delegate to preprocess_time_series.
        
        Returns:
            - For standard models: X_train_final, X_test_final, y_train_smoted, y_test, recommendations, X_test_inverse.
            - For time series models: X_seq, None, y_seq, None, recommendations, None.
        """
        # If the model is time series, use the dedicated time series preprocessing flow.
        if self.model_category == 'time_series':
            return self.preprocess_time_series(X, y)
        
        # Standard preprocessing flow for classification/regression/clustering
        X_train_original, X_test_original, y_train_original, y_test = self.split_dataset(X, y)
        X_train_missing_values, X_test_missing_values = self.handle_missing_values(X_train_original, X_test_original)
        
        # Only perform normality tests if applicable
        if self.model_category in ['regression', 'classification', 'clustering']:
            self.test_normality(X_train_missing_values)
        
        X_train_outliers_handled, y_train_outliers_handled = self.handle_outliers(X_train_missing_values, y_train_original)
        X_test_outliers_handled = X_test_missing_values.copy() if X_test_missing_values is not None else None
        recommendations = self.generate_recommendations()
        self.pipeline = self.build_pipeline(X_train_outliers_handled)
        X_train_preprocessed = self.pipeline.fit_transform(X_train_outliers_handled)
        X_test_preprocessed = self.pipeline.transform(X_test_outliers_handled) if X_test_outliers_handled is not None else None

        if self.model_category == 'classification':
            try:
                X_train_smoted, y_train_smoted = self.implement_smote(X_train_preprocessed, y_train_outliers_handled)
            except Exception as e:
                self.logger.error(f"❌ SMOTE application failed: {e}")
                raise
        else:
            X_train_smoted, y_train_smoted = X_train_preprocessed, y_train_outliers_handled
            self.logger.info("⚠️ SMOTE not applied: Not a classification model.")

        self.final_feature_order = list(self.pipeline.get_feature_names_out())
        X_train_final = pd.DataFrame(X_train_smoted, columns=self.final_feature_order)
        X_test_final = pd.DataFrame(X_test_preprocessed, columns=self.final_feature_order, index=X_test_original.index) if X_test_preprocessed is not None else None

        try:
            self.save_transformers()
        except Exception as e:
            self.logger.error(f"❌ Saving transformers failed: {e}")
            raise

        try:
            if X_test_final is not None:
                X_test_inverse = self.inverse_transform_data(X_test_final.values, original_data=X_test_original)
                self.logger.info("✅ Inverse transformations applied successfully.")
            else:
                X_test_inverse = None
        except Exception as e:
            self.logger.error(f"❌ Inverse transformations failed: {e}")
            X_test_inverse = None

        return X_train_final, X_test_final, y_train_smoted, y_test, recommendations, X_test_inverse


    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """
        Transform new data using the fitted preprocessing pipeline.

        Args:
            X (pd.DataFrame): New data to transform.

        Returns:
            np.ndarray: Preprocessed data.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
        self.logger.debug("Transforming new data.")
        X_preprocessed = self.pipeline.transform(X)
        if self.debug:
            self.logger.debug(f"Transformed data shape: {X_preprocessed.shape}")
        else:
            self.logger.info("Data transformed.")
        return X_preprocessed

    def preprocess_predict(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess new data for prediction.

        Args:
            X (pd.DataFrame): New data for prediction.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]: X_preprocessed, recommendations, X_inversed
        """
        step_name = "Preprocess Predict"
        self.logger.info(f"Step: {step_name}")

        # Log initial columns and feature count
        self.logger.debug(f"Initial columns in prediction data: {X.columns.tolist()}")
        self.logger.debug(f"Initial number of features: {X.shape[1]}")

        # Load transformers
        try:
            transformers = self.load_transformers()
            self.logger.debug("Transformers loaded successfully.")
        except Exception as e:
            self.logger.error(f"❌ Failed to load transformers: {e}")
            raise

        # Filter columns based on raw feature names
        try:
            X_filtered = self.filter_columns(X)
            self.logger.debug(f"Columns after filtering: {X_filtered.columns.tolist()}")
            self.logger.debug(f"Number of features after filtering: {X_filtered.shape[1]}")
        except Exception as e:
            self.logger.error(f"❌ Failed during column filtering: {e}")
            raise

        # Handle missing values
        try:
            X_filtered, _ = self.handle_missing_values(X_filtered)
            self.logger.debug(f"Columns after handling missing values: {X_filtered.columns.tolist()}")
            self.logger.debug(f"Number of features after handling missing values: {X_filtered.shape[1]}")
        except Exception as e:
            self.logger.error(f"❌ Failed during missing value handling: {e}")
            raise

        # Ensure all expected raw features are present
        expected_raw_features = self.numericals + self.ordinal_categoricals + self.nominal_categoricals
        provided_features = X_filtered.columns.tolist()

        self.logger.debug(f"Expected raw features: {expected_raw_features}")
        self.logger.debug(f"Provided features: {provided_features}")

        missing_raw_features = set(expected_raw_features) - set(provided_features)
        if missing_raw_features:
            self.logger.error(f"❌ Missing required raw feature columns in prediction data: {missing_raw_features}")
            raise ValueError(f"Missing required raw feature columns in prediction data: {missing_raw_features}")

        # Handle unexpected columns (optional: ignore or log)
        unexpected_features = set(provided_features) - set(expected_raw_features)
        if unexpected_features:
            self.logger.warning(f"⚠️ Unexpected columns in prediction data that will be ignored: {unexpected_features}")

        # Ensure the order of columns matches the pipeline's expectation (optional)
        X_filtered = X_filtered[expected_raw_features]
        self.logger.debug("Reordered columns to match the pipeline's raw feature expectations.")

        # Transform data using the loaded pipeline
        try:
            X_preprocessed_np = self.pipeline.transform(X_filtered)
            self.logger.debug(f"Transformed data shape: {X_preprocessed_np.shape}")
        except Exception as e:
            self.logger.error(f"❌ Transformation failed: {e}")
            raise

        # Retrieve feature names from the pipeline or use stored final_feature_order
        if hasattr(self.pipeline, 'get_feature_names_out'):
            try:
                columns = self.pipeline.get_feature_names_out()
                self.logger.debug(f"Derived feature names from pipeline: {columns.tolist()}")
            except Exception as e:
                self.logger.warning(f"Could not retrieve feature names from pipeline: {e}")
                columns = self.final_feature_order
                self.logger.debug(f"Using stored final_feature_order for column names: {columns}")
        else:
            columns = self.final_feature_order
            self.logger.debug(f"Using stored final_feature_order for column names: {columns}")

        # Convert NumPy array back to DataFrame with correct column names
        try:
            X_preprocessed_df = pd.DataFrame(X_preprocessed_np, columns=columns, index=X_filtered.index)
            self.logger.debug(f"X_preprocessed_df columns: {X_preprocessed_df.columns.tolist()}")
            self.logger.debug(f"Sample of X_preprocessed_df:\n{X_preprocessed_df.head()}")
        except Exception as e:
            self.logger.error(f"❌ Failed to convert transformed data to DataFrame: {e}")
            raise

        # Inverse transform for interpretability (optional, for interpretability)
        try:
            self.logger.debug(f"[DEBUG] Original data shape before inverse transform: {X.shape}")
            X_inversed = self.inverse_transform_data(X_preprocessed_np, original_data=X)
            self.logger.debug(f"[DEBUG] Inversed data shape: {X_inversed.shape}")
        except Exception as e:
            self.logger.error(f"❌ Inverse transformation failed: {e}")
            X_inversed = None

        # Generate recommendations (if applicable)
        try:
            recommendations = self.generate_recommendations()
            self.logger.debug("Generated preprocessing recommendations.")
        except Exception as e:
            self.logger.error(f"❌ Failed to generate recommendations: {e}")
            recommendations = pd.DataFrame()

        # Prepare outputs
        return X_preprocessed_df, recommendations, X_inversed

    def preprocess_clustering(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Preprocess data for clustering mode.

        Args:
            X (pd.DataFrame): Input features for clustering.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: X_processed, recommendations.
        """
        step_name = "Preprocess Clustering"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_handle_missing_values')  # Use relevant debug flags

        # Handle Missing Values
        X_missing, _ = self.handle_missing_values(X, None)
        self.logger.debug(f"After handling missing values: X_missing.shape={X_missing.shape}")

        # Handle Outliers
        X_outliers_handled, _ = self.handle_outliers(X_missing, None)
        self.logger.debug(f"After handling outliers: X_outliers_handled.shape={X_outliers_handled.shape}")

        # Test Normality (optional for clustering)
        if self.model_category in ['clustering']:
            self.logger.info("Skipping normality tests for clustering.")
        else:
            self.test_normality(X_outliers_handled)

        # Generate Preprocessing Recommendations
        recommendations = self.generate_recommendations()

        # Build and Fit the Pipeline
        self.pipeline = self.build_pipeline(X_outliers_handled)
        self.logger.debug("Pipeline built and fitted.")

        # Transform the data
        X_processed = self.pipeline.transform(X_outliers_handled)
        self.logger.debug(f"After pipeline transform: X_processed.shape={X_processed.shape}")

        # Optionally, inverse transformations can be handled if necessary

        # Save Transformers (if needed)
        # Not strictly necessary for clustering unless you plan to apply the same preprocessing on new data
        self.save_transformers()

        self.logger.info("✅ Clustering data preprocessed successfully.")

        return X_processed, recommendations

    def final_preprocessing(self, data: pd.DataFrame) -> Tuple:
        """
        Execute the full preprocessing pipeline based on the mode.

        For 'train' mode:
        - If time series: pass the full filtered DataFrame (which includes the target) 
            to preprocess_time_series.
        - Else: split the data into X and y, then call preprocess_train.
        For 'predict' and 'clustering' modes, the existing flow remains unchanged.

        Returns:
            Tuple: Depending on mode:
                - 'train': For standard models: X_train, X_test, y_train, y_test, recommendations, X_test_inverse.
                            For time series models: X_seq, None, y_seq, None, recommendations, None.
                - 'predict': X_preprocessed, recommendations, X_inverse.
                - 'clustering': X_processed, recommendations.
        """
        self.logger.info(f"Starting: Final Preprocessing Pipeline in '{self.mode}' mode.")
        
        try:
            data = self.filter_columns(data)
            self.logger.info("✅ Column filtering completed successfully.")
        except Exception as e:
            self.logger.error(f"❌ Column filtering failed: {e}")
            raise

        if self.mode == 'train':
            if self.model_category == 'time_series':
                # For time series mode, do not split the DataFrame.
                # Pass the full filtered data (which still contains the target variable)
                # so that the time series preprocessing flow can extract the target after cleaning and sorting.
                return self.preprocess_time_series(data)
            else:
                if not all(col in data.columns for col in self.y_variable):
                    missing_y = [col for col in self.y_variable if col not in data.columns]
                    raise ValueError(f"Target variable(s) {missing_y} not found in the dataset.")
                X = data.drop(self.y_variable, axis=1)
                y = data[self.y_variable].iloc[:, 0] if len(self.y_variable) == 1 else data[self.y_variable]
                return self.preprocess_train(X, y)
        
        elif self.mode == 'predict':
            X = data.copy()
            transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')
            if not os.path.exists(transformers_path):
                self.logger.error(f"❌ Transformers file not found at '{self.transformers_dir}'. Cannot proceed with prediction.")
                raise FileNotFoundError(f"Transformers file not found at '{self.transformers_dir}'.")
            X_preprocessed, recommendations, X_inversed = self.preprocess_predict(X)
            self.logger.info("✅ Preprocessing completed successfully in predict mode.")
            return X_preprocessed, recommendations, X_inversed
        
        elif self.mode == 'clustering':
            X = data.copy()
            return self.preprocess_clustering(X)
        
        else:
            raise NotImplementedError(f"Mode '{self.mode}' is not implemented.")



    # Optionally, implement a method to display column info for debugging
    def _debug_column_info(self, df: pd.DataFrame, step: str = "Debug Column Info"):
        """
        Display information about DataFrame columns for debugging purposes.

        Args:
            df (pd.DataFrame): The DataFrame to inspect.
            step (str, optional): Description of the current step. Defaults to "Debug Column Info".
        """
        self.logger.debug(f"\n📊 {step}: Column Information")
        for col in df.columns:
            self.logger.debug(f"Column '{col}': {df[col].dtype}, Unique Values: {df[col].nunique()}")
        self.logger.debug("\n")

In [None]:




#---------------------------------


import numpy as np

def dtw_path(s1: np.ndarray, s2: np.ndarray) -> list:
    """
    Compute the DTW cost matrix and return the optimal warping path.
    
    Args:
        s1: Sequence 1, shape (n, features)
        s2: Sequence 2, shape (m, features)
    
    Returns:
        path: A list of index pairs [(i, j), ...] indicating the alignment.
    """
    n, m = len(s1), len(s2)
    cost = np.full((n+1, m+1), np.inf)
    cost[0, 0] = 0

    # Build the cost matrix
    for i in range(1, n+1):
        for j in range(1, m+1):
            dist = np.linalg.norm(s1[i-1] - s2[j-1])
            cost[i, j] = dist + min(cost[i-1, j], cost[i, j-1], cost[i-1, j-1])

    # Backtracking to find the optimal path
    i, j = n, m
    path = []
    while i > 0 and j > 0:
        path.append((i-1, j-1))
        directions = [cost[i-1, j], cost[i, j-1], cost[i-1, j-1]]
        min_index = np.argmin(directions)
        if min_index == 0:
            i -= 1
        elif min_index == 1:
            j -= 1
        else:
            i -= 1
            j -= 1
    path.reverse()
    return path

def warp_sequence(seq: np.ndarray, path: list, target_length: int) -> np.ndarray:
    """
    Warp the given sequence to match the target length based on the DTW warping path.
    
    Args:
        seq: Original sequence, shape (n, features)
        path: Warping path from dtw_path (list of tuples)
        target_length: Desired sequence length (typically the reference length)
    
    Returns:
        aligned_seq: Warped sequence with shape (target_length, features)
    """
    aligned_seq = np.zeros((target_length, seq.shape[1]))
    # Create mapping: for each target index, collect corresponding indices from seq
    mapping = {t: [] for t in range(target_length)}
    for (i, j) in path:
        mapping[j].append(i)
    
    for t in range(target_length):
        indices = mapping[t]
        if indices:
            aligned_seq[t] = np.mean(seq[indices], axis=0)
        else:
            # If no alignment, reuse the previous value (or use interpolation)
            aligned_seq[t] = aligned_seq[t-1] if t > 0 else seq[0]
    return aligned_seq

# scripts/model_factory.py

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from sklearn.cluster import KMeans
from pathlib import Path
import joblib
import logging
# from datapreprocessor import DataPreprocessor # Importing the DataPreprocessor class from datapreprocessor.py

logger = logging.getLogger(__name__)

def get_model(model_type: str, model_sub_type: str):
    """Factory function to get model instances based on the model type and subtype."""
    if model_type == "Tree Based Classifier":
        if model_sub_type == "Random Forest":
            return RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        elif model_sub_type == "XGBoost":
            return XGBClassifier(eval_metric='logloss', random_state=42)
        elif model_sub_type == "Decision Tree":
            return DecisionTreeClassifier(random_state=42)
        else:
            raise ValueError(f"Classifier subtype '{model_sub_type}' is not supported under '{model_type}'.")
    
    elif model_type == "Logistic Regression":
        if model_sub_type == "Logistic Regression":
            return LogisticRegression(random_state=42, max_iter=1000)
        else:
            raise ValueError(f"Subtype '{model_sub_type}' is not supported under '{model_type}'.")
    
    elif model_type == "K-Means":
        if model_sub_type == "K-Means":
            return KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=42)
        else:
            raise ValueError(f"Clustering subtype '{model_sub_type}' is not supported under '{model_type}'.")
    
    elif model_type == "Linear Regression":
        if model_sub_type == "Linear Regression":
            return LinearRegression()
        else:
            raise ValueError(f"Subtype '{model_sub_type}' is not supported under '{model_type}'.")
    
    elif model_type == "Tree Based Regressor":
        if model_sub_type == "Random Forest Regressor":
            return RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        elif model_sub_type == "XGBoost Regressor":
            return XGBRegressor(eval_metric='rmse', random_state=42)
        elif model_sub_type == "Decision Tree Regressor":
            return DecisionTreeRegressor(random_state=42)
        else:
            raise ValueError(f"Regressor subtype '{model_sub_type}' is not supported under '{model_type}'.")
    
    elif model_type == "Support Vector Machine":
        if model_sub_type == "Support Vector Machine":
            return SVC(probability=True, random_state=42)
        else:
            raise ValueError(f"SVM subtype '{model_sub_type}' is not supported under '{model_type}'.")
    
    else:
        raise ValueError(f"Model type '{model_type}' is not supported.")

def estimate_optimal_window_size(time_series: pd.Series, threshold: float = 0.5, max_window: int = 100) -> int:
    """
    Estimate an optimal window size based on when the autocorrelation drops below a threshold.
    
    Args:
        time_series: A pandas Series representing the raw time series data.
        threshold: Autocorrelation threshold (default 0.5).
        max_window: Maximum window length to consider.
    
    Returns:
        Optimal window size as an integer.
    """
    for lag in range(1, max_window + 1):
        ac = time_series.autocorr(lag=lag)
        if ac < threshold:
            return lag
    return max_window

import pandas as pd
import numpy as np
import yaml
from pathlib import Path


def load_config(config_path: Path) -> dict:
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config




def main_time_series():
    # Load configuration from the updated YAML file
    config_path = Path("../../dataset/test/preprocessor_config/preprocessor_config_baseball.yaml")
    config = load_config(config_path)
    
    # Extract time series parameters from the config
    ts_params = config.get("time_series", {})
    if not ts_params.get("enabled", False):
        print("[INFO] Time series processing is not enabled in the config.")
        return

    # Set dataset path based on config
    data_dir = Path(config["paths"]["data_dir"])
    raw_data_file = config["paths"]["raw_data"]
    raw_data_path = data_dir / raw_data_file
    
    # Load dataset
    try:
        df = pd.read_parquet(raw_data_path)
        df['pitch_speed_mph_biomech'] = pd.to_numeric(df['pitch_speed_mph_biomech'], errors='raise')
        df['height_meters_biomech'] = pd.to_numeric(df['height_meters_biomech'], errors='raise')
        df['mass_kilograms_biomech'] = pd.to_numeric(df['mass_kilograms_biomech'], errors='raise')
        print(f"[INFO] Dataset loaded from {raw_data_path}. Shape: {df.shape}")
    except Exception as e:
        print(f"[ERROR] Failed to load dataset: {e}")
        return

    # Estimate an optimal window size from the target column (optional)
    target_col = config["features"]["y_variable"][0]
    optimal_window = estimate_optimal_window_size(df[target_col], threshold=0.5, max_window=100)
    print("[INFO] Estimated optimal window size:", optimal_window)
    

    
    # -------------------------------
    # Example : Pitching Motion Segmentation with DTW Enabled
    # -------------------------------
    # In this new example, we use "shooting_motion" as the grouping (categorical) variable.
    # This will test the pipeline's ability to group sequences based on Pitching Motion,
    # while DTW alignment is enabled.
    ts_params_shooting = ts_params.copy()
    ts_params_shooting["use_dtw"] = True  # Enable DTW alignment
    print("\n[INFO] Running Pitching Motion Segmentation Example (grouping by  with DTW enabled)...")
    
    preprocessor_shooting = DataPreprocessor(
        model_type="LSTM",
        y_variable=config["features"]["y_variable"],
        ordinal_categoricals=config["features"].get("ordinal_categoricals", []),
        nominal_categoricals=config["features"].get("nominal_categoricals", []),
        numericals=[col for col in config["features"]["numericals"] if col != target_col],
        mode="train",
        options=ts_params_shooting,
        debug=True,
        graphs_output_dir=str(Path(config["paths"]["plots_output_dir"])),
        transformers_dir=str(Path(config["paths"]["transformers_save_base_dir"])),
        time_column=ts_params_shooting.get("time_column", "ongoing_timestamp_biomech"),
        window_size=ts_params_shooting.get("window_size", optimal_window),
        horizon=ts_params_shooting.get("horizon", 1),
        step_size=ts_params_shooting.get("step_size", 1),
        max_sequence_length=ts_params_shooting.get("max_sequence_length", optimal_window),
        sequence_categorical=["session_biomech", "trial_biomech"],
        sequence_dtw_or_pad_categorical=["pitch_phase_biomech"],
        time_series_sequence_mode="dtw"  # Options: "set_window", "dtw", "pad", "variable_length"

    )
    try:
        X_seq_shooting, _, y_seq_shooting, _, rec_shooting, _ = preprocessor_shooting.final_preprocessing(df)
        print("[INFO] Pitching Motion Segmentation Example complete.")
        print(f"[INFO] X_seq (Pitching Motion segmentation) shape: {X_seq_shooting.shape}")
        print(f"[INFO] y_seq (Pitching Motion segmentation) shape: {y_seq_shooting.shape}")
        print("[INFO] Preprocessing recommendations (Pitching Motion segmentation):")
        print(rec_shooting)
    except Exception as e:
        print(f"[ERROR] Pitching Motion Segmentation Example failed: {e}")




if __name__ == "__main__":
    print("Running updated time series preprocessing main function...")
    main_time_series()



Running updated time series preprocessing main function...


2025-02-22 18:28:35,366 [INFO] Starting: Final Preprocessing Pipeline in 'train' mode.
2025-02-22 18:28:35,366 [INFO] Step: filter_columns
2025-02-22 18:28:35,366 [DEBUG] y_variable provided: ['cumulative_valgus_phase_armcock_acc_biomech']
2025-02-22 18:28:35,397 [DEBUG] Unique values in target column(s): {'cumulative_valgus_phase_armcock_acc_biomech': {0: 0.0, 7143: 0.002532368, 7144: 0.015885554, 7145: 0.054205348, 7146: 0.109250602, 7147: 0.19907955199999997, 7148: 0.289181696, 7149: 0.39329903, 7150: 0.46868508799999997, 7151: 0.523832588, 7152: 0.5380424540000001, 7153: 0.5385843800000001, 12930: 0.000102272, 12931: 0.000559216, 12932: 0.000908248, 12933: 0.0009466, 12943: 0.0012718480000000001, 12944: 0.0069298960000000005, 12945: 0.031025704, 12946: 0.071178216, 12947: 0.137191736, 12948: 0.20546388399999999, 12949: 0.278611692, 12950: 0.32923040800000003, 12951: 0.36000839, 12952: 0.36527222000000004, 16588: 0.000273088, 16589: 0.002744882, 16590: 0.007792544, 16591: 0.01840852

[INFO] Dataset loaded from ..\..\dataset\test\data\final_inner_join_emg_biomech_data.parquet. Shape: (134720, 112)
[INFO] Estimated optimal window size: 6

[INFO] Running Pitching Motion Segmentation Example (grouping by  with DTW enabled)...


2025-02-22 18:28:35,576 [INFO] ✅ Filtered DataFrame to include only specified features. Shape: (134720, 90)
2025-02-22 18:28:35,578 [DEBUG] Selected Features: ['Collection Length (seconds)', 'EMG 1 (mV) - FDS (81770)', 'ACC X (G) - FDS (81770)', 'ACC Y (G) - FDS (81770)', 'ACC Z (G) - FDS (81770)', 'GYRO X (deg/s) - FDS (81770)', 'GYRO Y (deg/s) - FDS (81770)', 'GYRO Z (deg/s) - FDS (81770)', 'EMG 1 (mV) - FCU (81728)', 'ACC X (G) - FCU (81728)', 'ACC Y (G) - FCU (81728)', 'ACC Z (G) - FCU (81728)', 'GYRO X (deg/s) - FCU (81728)', 'GYRO Y (deg/s) - FCU (81728)', 'GYRO Z (deg/s) - FCU (81728)', 'EMG 1 (mV) - FCR (81745)', 'pitch_speed_mph_biomech', 'height_meters_biomech', 'mass_kilograms_biomech', 'shoulder_angle_x_biomech', 'shoulder_angle_y_biomech', 'shoulder_angle_z_biomech', 'elbow_angle_x_biomech', 'elbow_angle_y_biomech', 'elbow_angle_z_biomech', 'torso_angle_x_biomech', 'torso_angle_y_biomech', 'torso_angle_z_biomech', 'pelvis_angle_x_biomech', 'pelvis_angle_y_biomech', 'pelvis

In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os



# -----------------------------
# CONFIGURATION (using the new paths and features)
# -----------------------------
config = {
    "paths": {
        "data_dir": "../../dataset/test/data",
        "raw_data": "final_inner_join_emg_biomech_data.parquet",
        "processed_data_dir": "preprocessor/processed",
        "features_metadata_file": "features_info/features_metadata.pkl",
        "predictions_output_dir": "preprocessor/predictions",
        "config_file": "../../dataset/test/preprocessor_config/preprocessor_config.yaml",
        "log_dir": "../preprocessor/logs",
        "model_save_base_dir": "../preprocessor/models",
        "transformers_save_base_dir": "../preprocessor/transformers",
        "plots_output_dir": "../preprocessor/plots",
        "training_output_dir": "../preprocessor/training_output"
    },
    "features": {
        "ordinal_categoricals": [
            "ACC X (G) - FDS (81770)_spike_flag",
            "ACC X (G) - FCU (81728)_spike_flag",
            "ACC Y (G) - FDS (81770)_spike_flag",
            "ACC Y (G) - FCU (81728)_spike_flag",
            "ACC Z (G) - FDS (81770)_spike_flag",
            "ACC Z (G) - FCU (81728)_spike_flag",
            "GYRO X (deg/s) - FDS (81770)_spike_flag",
            "GYRO X (deg/s) - FCU (81728)_spike_flag",
            "GYRO Y (deg/s) - FDS (81770)_spike_flag",
            "GYRO Y (deg/s) - FCU (81728)_spike_flag",
            "GYRO Z (deg/s) - FDS (81770)_spike_flag",
            "GYRO Z (deg/s) - FCU (81728)_spike_flag",
            "EMG 1 (mV) - FDS (81770)_spike_flag",
            "EMG_high_flag",
            "EMG_low_flag",
            "EMG_extreme_flag",
            "EMG_extreme_flag_dynamic",
            "ThrowingMotion",
            "session_biomech",
            "ongoing_timestamp_biomech",
            "trial_biomech",
            "Date/Time",
            "Timestamp",
            "emg_time",
            "datetime",
            "session_time_biomech",
            "biomech_datetime"
        ],
        "nominal_categoricals": [
            "Application",
            "athlete_name_biomech",
            "athlete_traq_biomech",
            "athlete_level_biomech",
            "lab_biomech",
            "pitch_type_biomech",
            "handedness_biomech",
            "pitch_phase_biomech"
        ],
        "numericals": [
            "Collection Length (seconds)",
            "EMG 1 (mV) - FDS (81770)",
            "ACC X (G) - FDS (81770)",
            "ACC Y (G) - FDS (81770)",
            "ACC Z (G) - FDS (81770)",
            "GYRO X (deg/s) - FDS (81770)",
            "GYRO Y (deg/s) - FDS (81770)",
            "GYRO Z (deg/s) - FDS (81770)",
            "EMG 1 (mV) - FCU (81728)",
            "ACC X (G) - FCU (81728)",
            "ACC Y (G) - FCU (81728)",
            "ACC Z (G) - FCU (81728)",
            "GYRO X (deg/s) - FCU (81728)",
            "GYRO Y (deg/s) - FCU (81728)",
            "GYRO Z (deg/s) - FCU (81728)",
            "EMG 1 (mV) - FCR (81745)",
            "pitch_speed_mph_biomech",
            "height_meters_biomech",
            "mass_kilograms_biomech",
            "shoulder_angle_x_biomech",
            "shoulder_angle_y_biomech",
            "shoulder_angle_z_biomech",
            "elbow_angle_x_biomech",
            "elbow_angle_y_biomech",
            "elbow_angle_z_biomech",
            "torso_angle_x_biomech",
            "torso_angle_y_biomech",
            "torso_angle_z_biomech",
            "pelvis_angle_x_biomech",
            "pelvis_angle_y_biomech",
            "pelvis_angle_z_biomech",
            "shoulder_velo_x_biomech",
            "shoulder_velo_y_biomech",
            "shoulder_velo_z_biomech",
            "elbow_velo_x_biomech",
            "elbow_velo_y_biomech",
            "elbow_velo_z_biomech",
            "torso_velo_x_biomech",
            "torso_velo_y_biomech",
            "torso_velo_z_biomech",
            "trunk_pelvis_dissociation_biomech",
            "shoulder_energy_transfer_biomech",
            "shoulder_energy_generation_biomech",
            "elbow_energy_transfer_biomech",
            "elbow_energy_generation_biomech",
            "lead_knee_energy_transfer_biomech",
            "lead_knee_energy_generation_biomech",
            "elbow_moment_x_biomech",
            "elbow_moment_y_biomech",
            "elbow_moment_z_biomech",
            "shoulder_thorax_moment_x_biomech",
            "shoulder_thorax_moment_y_biomech",
            "shoulder_thorax_moment_z_biomech",
            "max_shoulder_internal_rotational_velo_biomech"
        ]
    }
}

# -----------------------------
# TRAINING PHASE
# -----------------------------
# 1. Load your training data using the configured data_dir and raw_data path.
data_path = os.path.join(config["paths"]["data_dir"], config["paths"]["raw_data"])
data = pd.read_parquet(data_path)
print(f"[INFO] Training data loaded from {data_path}. Shape: {data.shape}")

# 2. Set up time series parameters from the configuration.
ts_params = {
    "enabled": True,
    "time_column": "ongoing_timestamp_biomech",
    "window_size": 50,
    "horizon": 1,
    "step_size": 1,
    "max_sequence_length": 50,
    "time_series_sequence_mode": "dtw",
    "phase_aware_normalization": {"enabled": False}
}

# 3. Create a preprocessor in train mode using the new feature lists.
preprocessor = DataPreprocessor(
    model_type="LSTM",
    y_variable=config["features"].get("y_variable", ["elbow_varus_moment_biomech"]),
    ordinal_categoricals=config["features"]["ordinal_categoricals"],
    nominal_categoricals=config["features"]["nominal_categoricals"],
    numericals=config["features"]["numericals"],
    mode="train",
    options=ts_params,
    debug=True,
    graphs_output_dir=config["paths"]["plots_output_dir"],
    transformers_dir=config["paths"]["transformers_save_base_dir"],
    time_column=ts_params.get("time_column"),
    window_size=ts_params.get("window_size"),
    horizon=ts_params.get("horizon"),
    step_size=ts_params.get("step_size"),
    max_sequence_length=ts_params.get("max_sequence_length"),
    time_series_sequence_mode="dtw",  # Options: "set_window", "dtw", "pad", "variable_length"
    sequence_categorical=["session_biomech", "trial_biomech"]
)

# 4. Preprocess training data to obtain sequences.
X_seq, _, y_seq, _, recommendations, _ = preprocessor.final_preprocessing(data)
print("Preprocessing recommendations:")
print(recommendations)

# Convert lists to numpy arrays if they aren't already
X_seq = np.array(X_seq)
y_seq = np.array(y_seq)

# Add these lines after the preprocessing step
print("Type of X_seq:", type(X_seq))
print("Shape of X_seq:", X_seq.shape)
print("Type of y_seq:", type(y_seq))
print("Shape of y_seq:", y_seq.shape)

# 5. Build a sample LSTM model.
num_sequences, seq_length, num_features = X_seq.shape
model = Sequential()
model.add(LSTM(64, input_shape=(seq_length, num_features), return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse')

# 6. Train the model.
early_stop = EarlyStopping(monitor='loss', patience=5)
model.fit(X_seq, y_seq, epochs=5, batch_size=32, callbacks=[early_stop])

# 7. Save the model and transformers.
model_save_path = os.path.join(config["paths"]["model_save_base_dir"], "lstm_model.h5")
model.save(model_save_path)
print(f"Model saved at {model_save_path}")
# The preprocessor already saves transformers during final_preprocessing.

# -----------------------------
# PREDICTION PHASE
# -----------------------------
# 1. Create a preprocessor in predict mode.
preprocessor_pred = DataPreprocessor(
    model_type="LSTM",
    y_variable=config["features"].get("y_variable", ["elbow_varus_moment_biomech"]),
    ordinal_categoricals=config["features"]["ordinal_categoricals"],
    nominal_categoricals=config["features"]["nominal_categoricals"],
    numericals=config["features"]["numericals"],
    mode="predict",
    options=ts_params,
    debug=True,
    graphs_output_dir=config["paths"]["plots_output_dir"],
    transformers_dir=config["paths"]["transformers_save_base_dir"],
    time_column=ts_params.get("time_column"),
    window_size=ts_params.get("window_size"),
    horizon=ts_params.get("horizon"),
    step_size=ts_params.get("step_size"),
    max_sequence_length=ts_params.get("max_sequence_length"),
    time_series_sequence_mode="dtw",  # Options: "set_window", "dtw", "pad", "variable_length"
    sequence_categorical=["session_biomech", "trial_biomech"]
)

# 2. Load saved transformers.
preprocessor_pred.load_transformers()

# 3. Load the saved model.
model_loaded = load_model(model_save_path)

# 4. Load new prediction data.
new_data_path = os.path.join(config["paths"]["data_dir"], "final_inner_join_emg_biomech_data.parquet")
new_data = pd.read_parquet(new_data_path)
print(f"[INFO] New data loaded from {new_data_path}. Shape: {new_data.shape}")

# 5. Preprocess new data.
X_new_preprocessed, recommendations_pred, _ = preprocessor_pred.preprocess_predict(new_data)

# 6. Make predictions.
predictions = model_loaded.predict(X_new_preprocessed)
print("Predictions:")
print(predictions)


2025-02-22 10:34:48,499 [INFO] Starting: Final Preprocessing Pipeline in 'train' mode.
2025-02-22 10:34:48,500 [INFO] Step: filter_columns
2025-02-22 10:34:48,502 [DEBUG] y_variable provided: ['elbow_varus_moment_biomech']


[INFO] Training data loaded from ../../dataset/test/data\final_inner_join_emg_biomech_data.parquet. Shape: (134720, 112)


2025-02-22 10:34:48,629 [DEBUG] Unique values in target column(s): {'elbow_varus_moment_biomech': {0: 148.65489888840136, 1: 148.65893301364426, 2: 148.66296713888715, 3: 148.66700126413005, 4: 148.67103538937295, 5: 148.67506951461587, 6: 148.67910363985877, 7: 148.68313776510166, 8: 148.68717189034456, 9: 148.69120601558745, 10: 148.69524014083038, 11: 148.69927426607327, 12: 148.70330839131617, 13: 148.70734251655907, 14: 148.7107042875948, 15: 148.71473841283773, 16: 148.71877253808063, 17: 148.72280666332352, 18: 148.72684078856642, 19: 148.73087491380932, 20: 148.7349090390522, 21: 148.73894316429514, 22: 148.74297728953803, 23: 148.74701141478093, 24: 148.75104554002382, 25: 148.75507966526672, 26: 148.75911379050962, 27: 148.76314791575254, 28: 148.76718204099544, 29: 148.77121616623833, 30: 148.77525029148123, 31: 148.77928441672412, 32: 148.78331854196705, 33: 148.78735266720994, 34: 148.79138679245284, 35: 148.79542091769574, 36: 148.79945504293863, 37: 148.80348916818153, 3

Preprocessing recommendations:
                                                                Preprocessing Reason
ACC X (G) - FDS (81770)_spike_flag             Categorical: Most_frequent Imputation
ACC X (G) - FCU (81728)_spike_flag             Categorical: Most_frequent Imputation
ACC Y (G) - FDS (81770)_spike_flag             Categorical: Most_frequent Imputation
ACC Y (G) - FCU (81728)_spike_flag             Categorical: Most_frequent Imputation
ACC Z (G) - FDS (81770)_spike_flag             Categorical: Most_frequent Imputation
...                                                                              ...
elbow_moment_z_biomech                                  Numerical: Median Imputation
shoulder_thorax_moment_x_biomech                        Numerical: Median Imputation
shoulder_thorax_moment_y_biomech                        Numerical: Median Imputation
shoulder_thorax_moment_z_biomech                        Numerical: Median Imputation
max_shoulder_internal_rotational_v

2025-02-22 12:07:28,008 [INFO] Step: Load Transformers
2025-02-22 12:07:28,019 [DEBUG] Loading transformers from: ../preprocessor/transformers\transformers.pkl


Model saved at ../preprocessor/models\lstm_model.h5


2025-02-22 12:07:31,002 [DEBUG] Pipeline loaded. Ready to transform new data.
2025-02-22 12:07:31,003 [INFO] Transformers loaded successfully from '../preprocessor/transformers\transformers.pkl'.
2025-02-22 12:07:32,459 [INFO] Step: Preprocess Predict
2025-02-22 12:07:32,461 [DEBUG] Initial columns in prediction data: ['EMG 1 (mV) - FDS (81770)', 'ACC X (G) - FDS (81770)', 'ACC Y (G) - FDS (81770)', 'ACC Z (G) - FDS (81770)', 'GYRO X (deg/s) - FDS (81770)', 'GYRO Y (deg/s) - FDS (81770)', 'GYRO Z (deg/s) - FDS (81770)', 'EMG 1 (mV) - FCU (81728)', 'ACC X (G) - FCU (81728)', 'ACC Y (G) - FCU (81728)', 'ACC Z (G) - FCU (81728)', 'GYRO X (deg/s) - FCU (81728)', 'GYRO Y (deg/s) - FCU (81728)', 'GYRO Z (deg/s) - FCU (81728)', 'EMG 1 (mV) - FCR (81745)', 'Application', 'Date/Time', 'Collection Length (seconds)', 'Timestamp', 'ACC X (G) - FDS (81770)_spike_flag', 'ACC X (G) - FCU (81728)_spike_flag', 'ACC Y (G) - FDS (81770)_spike_flag', 'ACC Y (G) - FCU (81728)_spike_flag', 'ACC Z (G) - FDS 

[INFO] New data loaded from ../../dataset/test/data\final_inner_join_emg_biomech_data.parquet. Shape: (134720, 112)


2025-02-22 12:07:33,504 [DEBUG] Pipeline loaded. Ready to transform new data.
2025-02-22 12:07:33,505 [INFO] Transformers loaded successfully from '../preprocessor/transformers\transformers.pkl'.
2025-02-22 12:07:33,519 [DEBUG] Transformers loaded successfully.
2025-02-22 12:07:33,520 [INFO] Step: filter_columns
2025-02-22 12:07:33,521 [DEBUG] y_variable provided: ['elbow_varus_moment_biomech']
2025-02-22 12:07:33,687 [DEBUG] Unique values in target column(s): {'elbow_varus_moment_biomech': {0: 148.65489888840136, 1: 148.65893301364426, 2: 148.66296713888715, 3: 148.66700126413005, 4: 148.67103538937295, 5: 148.67506951461587, 6: 148.67910363985877, 7: 148.68313776510166, 8: 148.68717189034456, 9: 148.69120601558745, 10: 148.69524014083038, 11: 148.69927426607327, 12: 148.70330839131617, 13: 148.70734251655907, 14: 148.7107042875948, 15: 148.71473841283773, 16: 148.71877253808063, 17: 148.72280666332352, 18: 148.72684078856642, 19: 148.73087491380932, 20: 148.7349090390522, 21: 148.738

ValueError: in user code:

    File "c:\Users\GeoffreyHadfield\Anaconda3\envs\data_science_ml_preprocessor\lib\site-packages\keras\engine\training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\GeoffreyHadfield\Anaconda3\envs\data_science_ml_preprocessor\lib\site-packages\keras\engine\training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\GeoffreyHadfield\Anaconda3\envs\data_science_ml_preprocessor\lib\site-packages\keras\engine\training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\GeoffreyHadfield\Anaconda3\envs\data_science_ml_preprocessor\lib\site-packages\keras\engine\training.py", line 2111, in predict_step
        return self(x, training=False)
    File "c:\Users\GeoffreyHadfield\Anaconda3\envs\data_science_ml_preprocessor\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\GeoffreyHadfield\Anaconda3\envs\data_science_ml_preprocessor\lib\site-packages\keras\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 25311, 118), found shape=(32, 93)
