In [1]:
# datapreprocessor.py

import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Tuple, Any
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC, SMOTEN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot
import joblib  # For saving/loading transformers
from inspect import signature  # For parameter validation in SMOTE
from functools import wraps
import re
from feature_engine.selection import DropHighPSIFeatures

def dtw_path(s1: np.ndarray, s2: np.ndarray) -> list:
    """
    Compute the DTW cost matrix and return the optimal warping path.
    
    Args:
        s1: Sequence 1, shape (n, features)
        s2: Sequence 2, shape (m, features)
    
    Returns:
        path: A list of index pairs [(i, j), ...] indicating the alignment.
    """
    n, m = len(s1), len(s2)
    cost = np.full((n+1, m+1), np.inf)
    cost[0, 0] = 0

    # Build the cost matrix
    for i in range(1, n+1):
        for j in range(1, m+1):
            dist = np.linalg.norm(s1[i-1] - s2[j-1])
            cost[i, j] = dist + min(cost[i-1, j], cost[i, j-1], cost[i-1, j-1])

    # Backtracking to find the optimal path
    i, j = n, m
    path = []
    while i > 0 and j > 0:
        path.append((i-1, j-1))
        directions = [cost[i-1, j], cost[i, j-1], cost[i-1, j-1]]
        min_index = np.argmin(directions)
        if min_index == 0:
            i -= 1
        elif min_index == 1:
            j -= 1
        else:
            i -= 1
            j -= 1
    path.reverse()
    return path

def warp_sequence(seq: np.ndarray, path: list, target_length: int) -> np.ndarray:
    """
    Warp the given sequence to match the target length based on the DTW warping path.
    
    Args:
        seq: Original sequence, shape (n, features)
        path: Warping path from dtw_path (list of tuples)
        target_length: Desired sequence length (typically the reference length)
    
    Returns:
        aligned_seq: Warped sequence with shape (target_length, features)
    """
    aligned_seq = np.zeros((target_length, seq.shape[1]))
    # Create mapping: for each target index, collect corresponding indices from seq
    mapping = {t: [] for t in range(target_length)}
    for (i, j) in path:
        mapping[j].append(i)
    
    for t in range(target_length):
        indices = mapping[t]
        if indices:
            aligned_seq[t] = np.mean(seq[indices], axis=0)
        else:
            # If no alignment, reuse the previous value (or use interpolation)
            aligned_seq[t] = aligned_seq[t-1] if t > 0 else seq[0]
    return aligned_seq
    


class DataPreprocessor:
    def __init__(
        self,
        model_type: str,
        y_variable: List[str],
        ordinal_categoricals: List[str],
        nominal_categoricals: List[str],
        numericals: List[str],
        mode: str,  # 'train', 'predict', 'clustering'
        options: Optional[Dict] = None,
        debug: bool = False,
        normalize_debug: bool = False,
        normalize_graphs_output: bool = False,
        graphs_output_dir: str = './plots',
        transformers_dir: str = './transformers',
        time_series_sequence_mode: str = "pad", # "set_window", "pad", "dtw", "variable_length"
        # Retained sequence grouping parameters:
        sequence_categorical: Optional[List[str]] = None,
        # NEW: Secondary grouping for sub-phase segmentation (for DTW/pad modes)
        sequence_dtw_or_pad_categorical: Optional[List[str]] = None
    ):
        # --- Process and validate sequence grouping parameters ---
        self.sequence_categorical = list(sequence_categorical) if sequence_categorical else []
        self.sequence_dtw_or_pad_categorical = list(sequence_dtw_or_pad_categorical) if sequence_dtw_or_pad_categorical else []
        
        if set(self.sequence_categorical) & set(self.sequence_dtw_or_pad_categorical):
            conflicting = set(self.sequence_categorical) & set(self.sequence_dtw_or_pad_categorical)
            raise ValueError(f"Categorical conflict in {conflicting}. Top-level and sub-phase groups must form a strict hierarchy")

        # --- Basic attribute assignments ---
        self.model_type = model_type
        self.y_variable = y_variable
        self.ordinal_categoricals = ordinal_categoricals
        self.nominal_categoricals = nominal_categoricals
        self.numericals = numericals
        self.mode = mode.lower()
        if self.mode not in ['train', 'predict', 'clustering']:
            raise ValueError("Mode must be one of 'train', 'predict', or 'clustering'.")
        self.options = options or {}
        # Default outlier method for time series (fallback if not provided in time_series config)
        self.ts_outlier_method = self.options.get('handle_outliers', {}).get('time_series_method', 'median').lower()

        self.debug = debug
        self.normalize_debug = normalize_debug
        self.normalize_graphs_output = normalize_graphs_output
        self.graphs_output_dir = graphs_output_dir
        self.transformers_dir = transformers_dir

        # --- NEW: Extract time series parameters from config ---
        # Pull the 'time_series' block from options
        time_series_config = self.options  # Use the options directly

        # Enabled flag (to turn time series processing on/off)
        self.time_series_enabled = time_series_config.get('enabled', False)
        # Get the required parameters from the config
        self.time_column = time_series_config.get('time_column')
        self.horizon = time_series_config.get('horizon')
        self.step_size = time_series_config.get('step_size')
        self.ts_outlier_method = time_series_config.get('ts_outlier_handling_method', self.ts_outlier_method).lower()
        # Use the sequence mode from the argument instead of config
        self.time_series_sequence_mode = time_series_sequence_mode.lower()

        # Extract additional settings based on the sequence mode
        sequence_modes_config = time_series_config.get('sequence_modes', {})
        if self.time_series_sequence_mode == "set_window":
            set_window_config = sequence_modes_config.get('set_window', {})
            self.window_size = set_window_config.get('window_size')
            self.max_sequence_length = set_window_config.get('max_sequence_length')
            if self.window_size is None or self.step_size is None:
                raise ValueError("Both 'window_size' (from set_window config) and 'step_size' must be provided for 'set_window' mode.")
        elif self.time_series_sequence_mode == "pad":
            pad_config = sequence_modes_config.get('pad', {})
            self.padding_side = pad_config.get('padding_side', 'post')
            self.pad_threshold = pad_config.get('pad_threshold')
            self.max_sequence_length = None  # Not enforced for pad mode
        elif self.time_series_sequence_mode == "dtw":
            dtw_config = sequence_modes_config.get('dtw', {})
            self.use_dtw = dtw_config.get('use_dtw', True)
            self.reference_sequence = dtw_config.get('reference_sequence', 'max')
            self.dtw_threshold = dtw_config.get('dtw_threshold')
            self.max_sequence_length = None
        elif self.time_series_sequence_mode == "variable_length":
            var_config = sequence_modes_config.get('variable_length', {})
            self.window_size = var_config.get('window_size')
            self.max_sequence_length = None  # No fixed maximum in variable_length mode
        else:
            raise ValueError(f"Unknown time series sequence mode: {self.time_series_sequence_mode}")

        # ----------------------------------------------------
        self.max_phase_distortion = self.options.get('max_phase_distortion', 0.3)  # e.g., 30% distortion allowed
        self.max_length_variance = self.options.get('max_length_variance', 5)  # allowable variation in phase lengths

        if self.sequence_categorical and self.sequence_dtw_or_pad_categorical:
            overlap = set(self.sequence_categorical) & set(self.sequence_dtw_or_pad_categorical)
            if overlap:
                raise ValueError(f"Overlapping grouping columns: {overlap}. Top-level and sub-phase groups must be distinct")

        # --- Remaining initialization (unchanged) ---
        self.hierarchical_categories = {}
        model_type_lower = self.model_type.lower()
        if any(kw in model_type_lower for kw in ['lstm', 'rnn', 'time series']):
            self.model_category = 'time_series'
        else:
            self.model_category = self.map_model_type_to_category()

        self.categorical_indices = []
        if self.model_category == 'unknown':
            self.logger = logging.getLogger(self.__class__.__name__)
            self.logger.error(f"Model category for '{self.model_type}' is unknown. Check your configuration.")
            raise ValueError(f"Model category for '{self.model_type}' is unknown. Check your configuration.")
        if self.mode in ['train', 'predict']:
            if not self.y_variable:
                raise ValueError("Target variable 'y_variable' must be specified for supervised models in train/predict mode.")
        elif self.mode == 'clustering':
            self.y_variable = []

        # NEW: Initialize follow-through metadata storage for debugging extreme durations.
        self.follow_through_stats = []  # Will store dicts with keys: group_key, phase, length (in seconds), num_rows
        self.time_step = self.options.get('time_step', 1/60) if self.options else 1/60

        # Initialize other variables (scalers, transformers, etc.)
        self.scaler = None
        self.transformer = None
        self.ordinal_encoder = None
        self.nominal_encoder = None
        self.preprocessor = None
        self.smote = None
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        self.preprocessing_steps = []
        self.normality_results = {}
        self.features_to_transform = []
        self.nominal_encoded_feature_names = []
        self.final_feature_order = []

        # Additional initialization for clustering
        self.cluster_transformers = {}
        self.cluster_model = None
        self.cluster_labels = None
        self.silhouette_score = None

        # Default thresholds for SMOTE recommendations
        self.imbalance_threshold = self.options.get('smote_recommendation', {}).get('imbalance_threshold', 0.1)
        self.noise_threshold = self.options.get('smote_recommendation', {}).get('noise_threshold', 0.1)
        self.overlap_threshold = self.options.get('smote_recommendation', {}).get('overlap_threshold', 0.1)
        self.boundary_threshold = self.options.get('smote_recommendation', {}).get('boundary_threshold', 0.1)

        self.pipeline = None

        # Initialize logging
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        if not self.logger.handlers:
            self.logger.addHandler(handler)

        # Initialize feature_reasons for clustering
        self.feature_reasons = {col: '' for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals}
        if self.model_category == 'clustering':
            self.feature_reasons['all_numericals'] = ''



    def get_debug_flag(self, flag_name: str) -> bool:
        """
        Retrieve the value of a specific debug flag from the options.
        Args:
            flag_name (str): The name of the debug flag.
        Returns:
            bool: The value of the debug flag.
        """
        return self.options.get(flag_name, False)

    def _log(self, message: str, step: str, level: str = 'info'):
        """
        Internal method to log messages based on the step-specific debug flags.
        
        Args:
            message (str): The message to log.
            step (str): The preprocessing step name.
            level (str): The logging level ('info', 'debug', etc.).
        """
        debug_flag = self.get_debug_flag(f'debug_{step}')
        if debug_flag:
            if level == 'debug':
                self.logger.debug(message)
            elif level == 'info':
                self.logger.info(message)
            elif level == 'warning':
                self.logger.warning(message)
            elif level == 'error':
                self.logger.error(message)

    def map_model_type_to_category(self) -> str:
        """
        Map the model_type string to a predefined category based on keywords.

        Returns:
            str: The model category ('classification', 'regression', 'clustering', etc.).
        """
        classification_keywords = ['classifier', 'classification', 'logistic', 'svm', 'support vector machine', 'knn', 'neural network']
        regression_keywords = ['regressor', 'regression', 'linear', 'knn', 'neural network']  # Removed 'svm'
        clustering_keywords = ['k-means', 'clustering', 'dbscan', 'kmodes', 'kprototypes']

        model_type_lower = self.model_type.lower()

        for keyword in classification_keywords:
            if keyword in model_type_lower:
                return 'classification'

        for keyword in regression_keywords:
            if keyword in model_type_lower:
                return 'regression'

        for keyword in clustering_keywords:
            if keyword in model_type_lower:
                return 'clustering'

        return 'unknown'

    def filter_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        step_name = "filter_columns"
        self.logger.info(f"Step: {step_name}")

        # Combine all feature lists from configuration
        desired_features = self.numericals + self.ordinal_categoricals + self.nominal_categoricals

        # For time series models, ensure the time column is included
        if self.model_category == 'time_series' and self.time_column:
            if self.time_column not in df.columns:
                self.logger.error(f"Time column '{self.time_column}' not found in input data.")
                raise ValueError(f"Time column '{self.time_column}' not found in the input data.")
            if self.time_column not in desired_features:
                desired_features.append(self.time_column)

        # Debug log: report target variable info
        self.logger.debug(f"y_variable provided: {self.y_variable}")
        if self.y_variable and all(col in df.columns for col in self.y_variable):
            self.logger.debug(f"First value in target column(s): {df[self.y_variable].iloc[0].to_dict()}")

        # For 'train' mode, ensure the target variable is present and excluded from features
        if self.mode == 'train':
            if not all(col in df.columns for col in self.y_variable):
                missing_y = [col for col in self.y_variable if col not in df.columns]
                self.logger.error(f"Target variable(s) {missing_y} not found in the input data.")
                raise ValueError(f"Target variable(s) {missing_y} not found in the input data.")
            desired_features = [col for col in desired_features if col not in self.y_variable]
            filtered_df = df[desired_features + self.y_variable].copy()
        else:
            filtered_df = df[desired_features].copy()

        # Check that all desired features are present in the input DataFrame
        missing_features = [col for col in desired_features if col not in df.columns]
        if missing_features:
            self.logger.error(f"The following required features are missing in the input data: {missing_features}")
            raise ValueError(f"The following required features are missing in the input data: {missing_features}")

        # Additional numeric type check for expected numeric columns
        for col in self.numericals:
            if col in filtered_df.columns and not np.issubdtype(filtered_df[col].dtype, np.number):
                raise TypeError(f"Numerical column '{col}' has non-numeric dtype {filtered_df[col].dtype}")

        self.logger.info(f"✅ Filtered DataFrame to include only specified features. Shape: {filtered_df.shape}")
        self.logger.debug(f"Selected Features: {desired_features}")
        if self.mode == 'train':
            self.logger.debug(f"Retained Target Variable(s): {self.y_variable}")

        return filtered_df



    def _group_top_level(self, data: pd.DataFrame):
        """
        Group the data based on top-level sequence categorical variables.
        Returns the grouped DataFrames (without converting them to NumPy arrays)
        to ensure that subsequent processing (such as sub-phase segmentation) has access
        to DataFrame methods like .groupby and .columns.
        """
        if not self.sequence_categorical:
            return [('default_group', data)]
        
        groups = data.groupby(self.sequence_categorical)
        self.logger.debug(f"Group keys: {list(groups.groups.keys())}")
        
        validated_groups = []
        for name, group in groups:
            try:
                self.logger.debug(f"Group '{name}' type: {type(group)}, Shape: {group.shape if hasattr(group, 'shape') else 'N/A'}")
            except Exception as e:
                self.logger.error(f"Error obtaining shape for group {name}: {e}")
            if isinstance(group, pd.DataFrame):
                # *** FIX: Return the DataFrame (not group.values) so that it retains the .columns attribute ***
                validated_groups.append((name, group))
            else:
                self.logger.warning(f"Unexpected group type {type(group)} for group {name}")
        return validated_groups




    @staticmethod
    def normalize_phase_key(key: str) -> str:
        """
        Normalize a phase key by:
          - Stripping leading/trailing whitespace.
          - Inserting an underscore between a lowercase letter and an uppercase letter.
          - Replacing spaces with underscores.
          - Converting the whole string to lowercase.
          
        This ensures that keys like "Arm Acceleration" and "ArmAcceleration" both normalize to "arm_acceleration".
        """
        key = key.strip()
        key = re.sub(r'(?<=[a-z])(?=[A-Z])', '_', key)  # insert underscore before uppercase letter if preceded by a lowercase
        key = key.replace(" ", "_")
        return key.lower()

        
    @staticmethod
    def safe_array_conversion(data):
        """
        Convert input data to a NumPy array if it is not already.
        Handles both structured and unstructured arrays.
        Raises a TypeError if the input data is a dictionary.
        """
        if isinstance(data, dict):
            raise TypeError("Input data is a dict. Expected array-like input.")
        if isinstance(data, np.ndarray):
            if data.dtype.names:
                # For structured arrays, view as float32 and reshape to combine fields.
                return data.view(np.float32).reshape(data.shape + (-1,))
            return data
        elif hasattr(data, 'values'):
            arr = data.values
            if arr.dtype.names:
                return arr.view(np.float32).reshape(arr.shape + (-1,))
            return arr
        else:
            return np.array(data)
            
    def _segment_subphases(self, group_data: pd.DataFrame, skip_min_samples=False):
        """
        Segment a group's data into sub-phases based on the secondary grouping.
        For each phase, convert to a NumPy array (after filtering to numeric columns)
        and store a tuple (original key, numeric array). The phase key is normalized
        using normalize_phase_key.
        
        Additional debugging:
        - Logs the raw keys obtained from groupby.
        - Logs the normalized phase keys and compares them with the expected keys.
        
        Args:
            group_data (pd.DataFrame): Data for one group.
            skip_min_samples (bool): If True, do not skip phases with very few samples.
        
        Returns:
            dict: Dictionary mapping normalized phase keys to tuples (original key, array).
        """
        # If no secondary grouping is provided, return a default phase.
        if not self.sequence_dtw_or_pad_categorical:
            if self.numericals:
                group_data = group_data[[col for col in group_data.columns if col in self.numericals]]
            return {"default_phase": ("default_phase", group_data.values)}
        
        phase_groups = list(group_data.groupby(self.sequence_dtw_or_pad_categorical))
        
        # Debug: log raw phase keys from groupby
        raw_keys = [group for group, _ in phase_groups]
        self.logger.debug(f"Raw phase keys in group: {raw_keys}")
        
        subphases = {}
        MIN_PHASE_SAMPLES = 5 if not skip_min_samples else 1  # Option: reduce minimum sample threshold
        
        if self.time_column and self.time_column in group_data.columns:
            try:
                self._validate_timestamps(group_data)
            except Exception as e:
                self.logger.warning(f"Timestamp validation error: {e}")

        for phase_key, phase_df in phase_groups:
            # Normalize key
            if isinstance(phase_key, tuple):
                stable_key = "|".join(map(str, phase_key))
            else:
                stable_key = str(phase_key)
            normalized_key = DataPreprocessor.normalize_phase_key(stable_key)
            self.logger.debug(f"Sub-phase raw key '{stable_key}' normalized to: '{normalized_key}'")
                            
            if not isinstance(phase_df, (pd.DataFrame, np.ndarray)):
                self.logger.error(f"Invalid type {type(phase_df)} for phase '{stable_key}'. Skipping.")
                continue

            phase_length = len(phase_df)
            self.logger.debug(f"Phase '{stable_key}' (normalized: '{normalized_key}') length: {phase_length}")

            if phase_length < MIN_PHASE_SAMPLES:
                self.logger.warning(f"Skipping short phase '{stable_key}' (length {phase_length} < {MIN_PHASE_SAMPLES})")
                continue

            # Convert to numeric array if necessary
            if isinstance(phase_df, pd.DataFrame):
                numeric_phase_df = phase_df[[col for col in phase_df.columns if col in self.numericals]] if self.numericals else phase_df
                try:
                    numeric_phase_array = self.safe_array_conversion(numeric_phase_df)
                except Exception as e:
                    self.logger.error(f"Array conversion failed for phase '{stable_key}': {e}")
                    continue
            elif isinstance(phase_df, np.ndarray):
                numeric_phase_array = phase_df
            else:
                self.logger.error(f"Unexpected type {type(phase_df)} for phase '{stable_key}'. Skipping.")
                continue

            # Ensure the array is 2D
            if numeric_phase_array.ndim == 1:
                numeric_phase_array = numeric_phase_array.reshape(-1, 1)
                self.logger.debug(f"Phase '{stable_key}' reshaped to 2D: {numeric_phase_array.shape}")

            subphases[normalized_key] = (stable_key, numeric_phase_array)
        
        self.logger.debug(f"Normalized phase keys obtained: {list(subphases.keys())}")
        expected = set(self.sequence_dtw_or_pad_categorical)
        self.logger.debug(f"Expected phase keys: {expected}")
        
        if not subphases:
            self.logger.error("No valid subphases detected in this group.")
            raise ValueError("Subphase segmentation produced an empty dictionary.")
        return subphases




    def _validate_timestamps(self, phase_data: pd.DataFrame):
        """
        Validate that timestamps in phase_data have no large discontinuities (>1 second gap).
        Logs a warning if a gap is detected.
        """
        time_col = self.time_column
        if time_col not in phase_data.columns:
            return
        diffs = phase_data[time_col].diff().dropna()
        if (diffs > 1.0).any():
            gap_loc = diffs.idxmax()
            self.logger.warning(
                f"Timestamp jump in group {getattr(phase_data, 'name', 'unknown')}: {diffs[gap_loc]:.2f}s gap at index {gap_loc}"
            )

    def _flag_extreme_phases(self, phase_stats):
        """
        Identify and log any extreme Follow-Through phases (duration > 30 seconds).
        """
        follow_throughs = [s for s in phase_stats if s["phase"] == "Follow Through"]
        if follow_throughs:
            max_ft = max(follow_throughs, key=lambda x: x["length"])
            if max_ft["length"] > 30:
                self.logger.error(
                    f"Extreme Follow-Through: group {max_ft['group_key']} length={max_ft['length']:.3f}s "
                    f"({max_ft['num_rows']} frames)"
                )

    def _log_top_outliers(self):
        """
        Log the top 5 longest Follow-Through durations from the recorded metadata.
        """
        if not self.follow_through_stats:
            self.logger.debug("No Follow-Through stats recorded.")
            return
        sorted_ft = sorted(self.follow_through_stats, key=lambda x: x["length"], reverse=True)[:5]
        self.logger.debug("Top 5 Follow-Through Durations:")
        for i, stats in enumerate(sorted_ft, 1):
            self.logger.debug(f"{i}. Group {stats['group_key']}: {stats['length']:.3f}s ({stats['num_rows']} frames)")



    @staticmethod
    def pad_sequence(seq: np.ndarray, target_length: int) -> np.ndarray:
        """
        Pad or truncate the given sequence to match the target length.
        Ensures that the input is a 2D array. For a 1D input, reshapes it to (-1, 1).
        A minimum target_length of 5 is enforced to avoid degenerate sequences.
        """
        seq = np.array(seq)
        if seq.ndim == 1:
            seq = seq.reshape(-1, 1)  # Ensure the array is 2D
        current_length = seq.shape[0]
        target_length = max(target_length, 5)  # Enforce a minimum target length of 5
        if current_length >= target_length:
            return seq[:target_length]
        else:
            pad_width = target_length - current_length
            padding = np.zeros((pad_width, seq.shape[1]))
            return np.concatenate([seq, padding], axis=0)


    def _align_phase(self, phase_data, target_length: int, phase_name: str) -> np.ndarray:
        """
        Align a sub-phase's sequence to a target length using DTW (if enabled) or padding.
        Validates that the resulting array is 2D and has exactly target_length rows.
        If DTW alignment results in an array of incorrect shape, falls back to padding.
        """
        if isinstance(phase_data, pd.DataFrame):
            phase_data = phase_data[[col for col in phase_data.columns if col in self.numericals]] if self.numericals else phase_data.copy()

        if isinstance(phase_data, dict):
            self.logger.error(f"Received dict instead of array. Keys: {list(phase_data.keys())}")
            raise TypeError("Phase data must be array-like, not dict. Check your grouping logic.")

        phase_array = self.safe_array_conversion(phase_data)
        self.logger.debug(f"Aligning phase '{phase_name}' with input type {type(phase_data)} and shape {phase_array.shape}")

        if phase_array.ndim == 1:
            phase_array = phase_array.reshape(1, -1)
            self.logger.debug(f"Phase '{phase_name}' was 1D and has been reshaped to {phase_array.shape}")

        current_length = phase_array.shape[0]

        if phase_array.ndim != 2:
            self.logger.error(f"Invalid input shape {phase_array.shape} - expected a 2D array")
            raise ValueError("DTW alignment requires a 2D array input")
        
        if not np.issubdtype(phase_array.dtype, np.number):
            self.logger.warning(f"Non-numeric dtype detected: {phase_array.dtype}. Converting to np.float32.")
            try:
                phase_array = phase_array.astype(np.float32)
            except Exception as e:
                self.logger.error(f"Failed conversion to float32: {e}")
                raise

        try:
            if self.time_series_sequence_mode == "dtw":
                distortion = abs(current_length - target_length) / target_length
                self.logger.debug(f"[Distortion Analysis] Phase '{phase_name}': raw length {current_length} vs target {target_length} | Distortion: {distortion:.1%}")
                MAX_DISTORTION = 0.2
                if distortion > MAX_DISTORTION:
                    raise ValueError(f"Excessive DTW distortion {distortion:.1%} exceeds threshold of {MAX_DISTORTION:.0%}")
                alignment_path = dtw_path(phase_array, phase_array)
                aligned_seq = warp_sequence(phase_array, alignment_path, target_length)
            else:
                aligned_seq = self.pad_sequence(phase_array, target_length)
        except Exception as e:
            self.logger.warning(f"Alignment failed for phase '{phase_name}': {e}. Falling back to padding.")
            aligned_seq = self.pad_sequence(phase_array, target_length)

        if aligned_seq.shape[0] != target_length:
            self.logger.error(f"Phase '{phase_name}' alignment resulted in {aligned_seq.shape[0]} steps (expected {target_length}).")
            raise ValueError(f"Alignment for phase '{phase_name}' did not yield exactly {target_length} steps.")

        self.logger.debug(f"Phase '{phase_name}' aligned successfully to shape {aligned_seq.shape}")
        return aligned_seq



    def post_processing_report(self):
        """
        Generate a post-processing report of Follow-Through statistics after filtering.
        """
        ft_lengths = [s["length"] for s in self.follow_through_stats if s["phase"] == "Follow Through"]
        if ft_lengths:
            report = (
                f"Follow-Through Stats After Filtering:\n"
                f"- Min: {min(ft_lengths):.3f}s\n"
                f"- Max: {max(ft_lengths):.3f}s\n"
                f"- σ: {np.std(ft_lengths):.3f}s"
            )
            self.logger.info(report)
        else:
            self.logger.info("No Follow-Through phases recorded.")


    def _apply_smote_ts(self, X_seq, y_seq):
        """
        Apply SMOTE-TS to balance time series data when enabled.
        
        Parameters:
        -----------
        X_seq : array-like
            Input sequences with shape (n_samples, n_timesteps, n_features).
        y_seq : array-like
            Target values with shape (n_samples,) or (n_samples, n_targets).
            
        Returns:
        --------
        X_resampled, y_resampled : Balanced dataset after SMOTE-TS application.
        """
        # Check if SMOTE-TS is enabled via configuration
        if not self.options.get('apply_smote_ts'):
            self.logger.info("SMOTE-TS is disabled in configuration.")
            return X_seq, y_seq
        
        # Only applicable for classification tasks
        if self.model_category != 'classification':
            self.logger.info("SMOTE-TS skipped: not a classification problem.")
            return X_seq, y_seq
        
        # Only apply for supported modes: dtw, pad, and set_window.
        if self.time_series_sequence_mode not in ["dtw", "pad", "set_window"]:
            self.logger.warning(f"SMOTE-TS not recommended for mode '{self.time_series_sequence_mode}'. Skipping.")
            return X_seq, y_seq
        
        self.logger.info("Detecting phase transitions for SMOTE-TS...")
        phase_boundaries = self._detect_phase_transitions(X_seq)
        
        # Get SMOTE_TS parameters from configuration
        smote_config = self.options.get('smote_ts_config', {})
        smote_ts = SMOTE_TS(
            phases=self.sequence_dtw_or_pad_categorical,
            dtw_window=int(self.max_phase_distortion * 100),
            phase_markers=phase_boundaries,
            k_neighbors=smote_config.get('k_neighbors', 5),
            random_state=smote_config.get('random_state', 42)
        )
        
        self.logger.info(f"Applying SMOTE-TS with {smote_config.get('k_neighbors', 5)} neighbors...")
        try:
            X_resampled, y_resampled = smote_ts.fit_resample(X_seq, y_seq)
            self.logger.info(f"SMOTE-TS successful: Original shape {X_seq.shape} → Resampled shape {X_resampled.shape}")
            
            # Log the new class distribution
            unique, counts = np.unique(y_resampled, return_counts=True)
            self.logger.info(f"Resampled class distribution: {dict(zip(unique, counts))}")
            return X_resampled, y_resampled
        except Exception as e:
            self.logger.error(f"SMOTE-TS failed: {str(e)}. Returning original data.")
            return X_seq, y_seq




    def create_sequences(self, X: np.ndarray, y: np.ndarray) -> Tuple[Any, Any]:
        X_seq, y_seq = [], []
        if self.time_series_sequence_mode == "set_window":
            # Sliding window approach
            for i in range(0, len(X) - self.window_size - self.horizon + 1, self.step_size):
                seq_X = X[i:i+self.window_size]
                seq_y = y[i+self.window_size:i+self.window_size+self.horizon]
                if self.max_sequence_length and seq_X.shape[0] < self.max_sequence_length:
                    pad_width = self.max_sequence_length - seq_X.shape[0]
                    seq_X = np.pad(seq_X, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
                X_seq.append(seq_X)
                y_seq.append(seq_y)
        
        elif self.time_series_sequence_mode in ["dtw", "pad", "variable_length"]:
            # Full sequence processing
            X_seq = X
            y_seq = y
        
        else:
            raise ValueError(f"Unsupported time_series_sequence_mode: {self.time_series_sequence_mode}")

        return np.array(X_seq), np.array(y_seq)


    def split_dataset(
        self,
        X: pd.DataFrame,
        y: Optional[pd.Series] = None
    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]:
        """
        Split the dataset into training and testing sets while retaining original indices.

        Args:
            X (pd.DataFrame): Features.
            y (Optional[pd.Series]): Target variable.

        Returns:
            Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.Series], Optional[pd.Series]]: X_train, X_test, y_train, y_test
        """
        step_name = "split_dataset"
        self.logger.info("Step: Split Dataset into Train and Test")

        # Debugging Statements
        self._log(f"Before Split - X shape: {X.shape}", step_name, 'debug')
        if y is not None:
            self._log(f"Before Split - y shape: {y.shape}", step_name, 'debug')
        else:
            self._log("Before Split - y is None", step_name, 'debug')

        # Determine splitting based on mode
        if self.mode == 'train' and self.model_category in ['classification', 'regression']:
            if self.model_category == 'classification':
                stratify = y if self.options.get('split_dataset', {}).get('stratify_for_classification', False) else None
                test_size = self.options.get('split_dataset', {}).get('test_size', 0.2)
                random_state = self.options.get('split_dataset', {}).get('random_state', 42)
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=test_size,
                    stratify=stratify, 
                    random_state=random_state
                )
                self._log("Performed stratified split for classification.", step_name, 'debug')
            elif self.model_category == 'regression':
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, 
                    test_size=self.options.get('split_dataset', {}).get('test_size', 0.2),
                    random_state=self.options.get('split_dataset', {}).get('random_state', 42)
                )
                self._log("Performed random split for regression.", step_name, 'debug')
        else:
            # For 'predict' and 'clustering' modes or other categories
            X_train = X.copy()
            X_test = None
            y_train = y.copy() if y is not None else None
            y_test = None
            self.logger.info(f"No splitting performed for mode '{self.mode}' or model category '{self.model_category}'.")

        self.preprocessing_steps.append("Split Dataset into Train and Test")

        # Keep Indices Aligned Through Each Step
        if X_test is not None and y_test is not None:
            # Sort both X_test and y_test by index
            X_test = X_test.sort_index()
            y_test = y_test.sort_index()
            self.logger.debug("Sorted X_test and y_test by index for alignment.")

        # Debugging: Log post-split shapes and index alignment
        self._log(f"After Split - X_train shape: {X_train.shape}, X_test shape: {X_test.shape if X_test is not None else 'N/A'}", step_name, 'debug')
        if self.model_category == 'classification' and y_train is not None and y_test is not None:
            self.logger.debug(f"Class distribution in y_train:\n{y_train.value_counts(normalize=True)}")
            self.logger.debug(f"Class distribution in y_test:\n{y_test.value_counts(normalize=True)}")
        elif self.model_category == 'regression' and y_train is not None and y_test is not None:
            self.logger.debug(f"y_train statistics:\n{y_train.describe()}")
            self.logger.debug(f"y_test statistics:\n{y_test.describe()}")

        # Check index alignment
        if y_train is not None and X_train.index.equals(y_train.index):
            self.logger.debug("X_train and y_train indices are aligned.")
        else:
            self.logger.warning("X_train and y_train indices are misaligned.")

        if X_test is not None and y_test is not None and X_test.index.equals(y_test.index):
            self.logger.debug("X_test and y_test indices are aligned.")
        elif X_test is not None and y_test is not None:
            self.logger.warning("X_test and y_test indices are misaligned.")

        return X_train, X_test, y_train, y_test

    def handle_missing_values(self, X_train: pd.DataFrame, X_test: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Handle missing values for numerical and categorical features based on user options.
        """
        step_name = "handle_missing_values"
        self.logger.info("Step: Handle Missing Values")

        # Fetch user-defined imputation options or set defaults
        impute_options = self.options.get('handle_missing_values', {})
        numerical_strategy = impute_options.get('numerical_strategy', {})
        categorical_strategy = impute_options.get('categorical_strategy', {})

        # Numerical Imputation
        numerical_imputer = None
        new_columns = []
        if self.numericals:
            if self.model_category in ['regression', 'classification', 'clustering']:
                default_num_strategy = 'median'  # Changed to median as per preprocessor_config_baseball.yaml
            else:
                default_num_strategy = 'median'
            num_strategy = numerical_strategy.get('strategy', default_num_strategy)
            num_imputer_type = numerical_strategy.get('imputer', 'SimpleImputer')  # Can be 'SimpleImputer', 'KNNImputer', etc.

            self._log(f"Numerical Imputation Strategy: {num_strategy.capitalize()}, Imputer Type: {num_imputer_type}", step_name, 'debug')

            # Initialize numerical imputer based on user option
            if num_imputer_type == 'SimpleImputer':
                numerical_imputer = SimpleImputer(strategy=num_strategy)
            elif num_imputer_type == 'KNNImputer':
                knn_neighbors = numerical_strategy.get('knn_neighbors', 5)
                numerical_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                self.logger.error(f"Numerical imputer type '{num_imputer_type}' is not supported.")
                raise ValueError(f"Numerical imputer type '{num_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[self.numericals] = numerical_imputer.fit_transform(X_train[self.numericals])
            self.numerical_imputer = numerical_imputer  # Assign to self for saving
            self.feature_reasons.update({col: self.feature_reasons.get(col, '') + f'Numerical: {num_strategy.capitalize()} Imputation | ' for col in self.numericals})
            new_columns.extend(self.numericals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[self.numericals] = numerical_imputer.transform(X_test[self.numericals])

        # Categorical Imputation
        categorical_imputer = None
        all_categoricals = self.ordinal_categoricals + self.nominal_categoricals
        if all_categoricals:
            default_cat_strategy = 'most_frequent'
            cat_strategy = categorical_strategy.get('strategy', default_cat_strategy)
            cat_imputer_type = categorical_strategy.get('imputer', 'SimpleImputer')

            self._log(f"Categorical Imputation Strategy: {cat_strategy.capitalize()}, Imputer Type: {cat_imputer_type}", step_name, 'debug')

            # Initialize categorical imputer based on user option
            if cat_imputer_type == 'SimpleImputer':
                categorical_imputer = SimpleImputer(strategy=cat_strategy)
            elif cat_imputer_type == 'ConstantImputer':
                fill_value = categorical_strategy.get('fill_value', 'Missing')
                categorical_imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
            else:
                self.logger.error(f"Categorical imputer type '{cat_imputer_type}' is not supported.")
                raise ValueError(f"Categorical imputer type '{cat_imputer_type}' is not supported.")

            # Fit and transform ONLY on X_train
            X_train[all_categoricals] = categorical_imputer.fit_transform(X_train[all_categoricals])
            self.categorical_imputer = categorical_imputer  # Assign to self for saving
            self.feature_reasons.update({
                col: self.feature_reasons.get(col, '') + (f'Categorical: Constant Imputation (Value={categorical_strategy.get("fill_value", "Missing")}) | ' if cat_imputer_type == 'ConstantImputer' else f'Categorical: {cat_strategy.capitalize()} Imputation | ')
                for col in all_categoricals
            })
            new_columns.extend(all_categoricals)

            if X_test is not None:
                # Transform ONLY on X_test without fitting
                X_test[all_categoricals] = categorical_imputer.transform(X_test[all_categoricals])

        self.preprocessing_steps.append("Handle Missing Values")

        # Debugging: Log post-imputation shapes and missing values
        self._log(f"Completed: Handle Missing Values. Dataset shape after imputation: {X_train.shape}", step_name, 'debug')
        self._log(f"Missing values after imputation in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        self._log(f"New columns handled: {new_columns}", step_name, 'debug')

        return X_train, X_test

    def handle_outliers(self, X_train: pd.DataFrame, y_train: Optional[pd.Series] = None) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """
        Handle outliers based on the model's sensitivity and user options.
        For time_series models, apply a custom outlier handling using a rolling statistic (median or mean)
        to replace extreme values rather than dropping rows (to preserve temporal alignment).

        Args:
            X_train (pd.DataFrame): Training features.
            y_train (pd.Series, optional): Training target.

        Returns:
            tuple: X_train with outliers handled and corresponding y_train.
        """
        step_name = "handle_outliers"
        self.logger.info("Step: Handle Outliers")
        self._log("Starting outlier handling.", step_name, 'debug')
        initial_shape = X_train.shape[0]
        outlier_options = self.options.get('handle_outliers', {})
        zscore_threshold = outlier_options.get('zscore_threshold', 3)
        iqr_multiplier = outlier_options.get('iqr_multiplier', 1.5)
        isolation_contamination = outlier_options.get('isolation_contamination', 0.05)

        # ----- NEW: Configurable outlier handling branch for time series -----
        if self.model_category == 'time_series':
            # Check if time series outlier handling is disabled
            if self.ts_outlier_method == 'none':
                self.logger.info("Time series outlier handling disabled per config")
                return X_train, y_train

            # Validate that the method is one of the allowed options
            valid_methods = ['median', 'mean']
            if self.ts_outlier_method not in valid_methods:
                raise ValueError(f"Invalid ts_outlier_method: {self.ts_outlier_method}. Choose from {valid_methods + ['none']}")

            self.logger.info(f"Applying {self.ts_outlier_method}-based outlier replacement for time series")
            
            # Process each numerical column using the selected method
            for col in self.numericals:
                # Dynamic method selection based on configuration
                if self.ts_outlier_method == 'median':
                    rolling_stat = X_train[col].rolling(window=5, center=True, min_periods=1).median()
                elif self.ts_outlier_method == 'mean':
                    rolling_stat = X_train[col].rolling(window=5, center=True, min_periods=1).mean()
                
                # Compute rolling IQR for outlier detection
                rolling_q1 = X_train[col].rolling(window=5, center=True, min_periods=1).quantile(0.25)
                rolling_q3 = X_train[col].rolling(window=5, center=True, min_periods=1).quantile(0.75)
                rolling_iqr = rolling_q3 - rolling_q1
                
                # Create an outlier mask based on deviation from the rolling statistic
                outlier_mask = abs(X_train[col] - rolling_stat) > (iqr_multiplier * rolling_iqr)
                
                # Replace detected outliers with the corresponding rolling statistic
                X_train.loc[outlier_mask, col] = rolling_stat[outlier_mask]
                self.logger.debug(f"Replaced {outlier_mask.sum()} outliers in column '{col}' using {self.ts_outlier_method} method.")
            
            self.preprocessing_steps.append("Handle Outliers (time_series custom)")
            self._log(f"Completed: Handle Outliers for time_series. Initial samples: {initial_shape}, Final samples: {X_train.shape[0]}", step_name, 'debug')
            return X_train, y_train
        # -----------------------------------------------------------------

        # Existing outlier handling for regression and classification models
        if self.model_category in ['regression', 'classification']:
            self.logger.info(f"Applying univariate outlier detection for {self.model_category}.")
            for col in self.numericals:
                # Z-Score Filtering
                apply_zscore = outlier_options.get('apply_zscore', True)
                if apply_zscore:
                    z_scores = np.abs((X_train[col] - X_train[col].mean()) / X_train[col].std())
                    mask_z = z_scores < zscore_threshold
                    removed_z = (~mask_z).sum()
                    X_train = X_train[mask_z]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with Z-Score Filtering (threshold={zscore_threshold}) | '
                    self._log(f"Removed {removed_z} outliers from '{col}' using Z-Score Filtering.", step_name, 'debug')

                # IQR Filtering
                apply_iqr = outlier_options.get('apply_iqr', True)
                if apply_iqr:
                    Q1 = X_train[col].quantile(0.25)
                    Q3 = X_train[col].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_multiplier * IQR
                    upper_bound = Q3 + iqr_multiplier * IQR
                    mask_iqr = (X_train[col] >= lower_bound) & (X_train[col] <= upper_bound)
                    removed_iqr = (~mask_iqr).sum()
                    X_train = X_train[mask_iqr]
                    if y_train is not None:
                        y_train = y_train.loc[X_train.index]
                    self.feature_reasons[col] += f'Outliers handled with IQR Filtering (multiplier={iqr_multiplier}) | '
                    self._log(f"Removed {removed_iqr} outliers from '{col}' using IQR Filtering.", step_name, 'debug')

        elif self.model_category == 'clustering':
            self.logger.info("Applying multivariate IsolationForest for clustering.")
            contamination = isolation_contamination
            iso_forest = IsolationForest(contamination=contamination, random_state=42)
            preds = iso_forest.fit_predict(X_train[self.numericals])
            mask_iso = preds != -1
            removed_iso = (preds == -1).sum()
            X_train = X_train[mask_iso]
            if y_train is not None:
                y_train = y_train.loc[X_train.index]
            self.feature_reasons['all_numericals'] += f'Outliers handled with Multivariate IsolationForest (contamination={contamination}) | '
            self._log(f"Removed {removed_iso} outliers using Multivariate IsolationForest.", step_name, 'debug')
        else:
            self.logger.warning(f"Model category '{self.model_category}' not recognized for outlier handling.")

        self.preprocessing_steps.append("Handle Outliers")
        self._log(f"Completed: Handle Outliers. Initial samples: {initial_shape}, Final samples: {X_train.shape[0]}", step_name, 'debug')
        self._log(f"Missing values after outlier handling in X_train:\n{X_train.isnull().sum()}", step_name, 'debug')
        return X_train, y_train



    def test_normality(self, X_train: pd.DataFrame) -> Dict[str, Dict]:
        """
        Test normality for numerical features based on normality tests and user options.

        Args:
            X_train (pd.DataFrame): Training features.

        Returns:
            Dict[str, Dict]: Dictionary with normality test results for each numerical feature.
        """
        step_name = "Test for Normality"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_test_normality')
        normality_results = {}

        # Fetch user-defined normality test options or set defaults
        normality_options = self.options.get('test_normality', {})
        p_value_threshold = normality_options.get('p_value_threshold', 0.05)
        skewness_threshold = normality_options.get('skewness_threshold', 1.0)
        additional_tests = normality_options.get('additional_tests', [])  # e.g., ['anderson-darling']

        for col in self.numericals:
            data = X_train[col].dropna()
            skewness = data.skew()
            kurtosis = data.kurtosis()

            # Determine which normality test to use based on sample size and user options
            test_used = 'Shapiro-Wilk'
            p_value = 0.0

            if len(data) <= 5000:
                from scipy.stats import shapiro
                stat, p_val = shapiro(data)
                test_used = 'Shapiro-Wilk'
                p_value = p_val
            else:
                from scipy.stats import anderson
                result = anderson(data)
                test_used = 'Anderson-Darling'
                # Determine p-value based on critical values
                p_value = 0.0  # Default to 0
                for cv, sig in zip(result.critical_values, result.significance_level):
                    if result.statistic < cv:
                        p_value = sig / 100
                        break

            # Apply user-defined or default criteria
            if self.model_category in ['regression', 'classification', 'clustering']:
                # Linear, Logistic Regression, and Clustering: Use p-value and skewness
                needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
            else:
                # Other models: Use skewness, and optionally p-values based on options
                use_p_value = normality_options.get('use_p_value_other_models', False)
                if use_p_value:
                    needs_transform = (p_value < p_value_threshold) or (abs(skewness) > skewness_threshold)
                else:
                    needs_transform = abs(skewness) > skewness_threshold

            normality_results[col] = {
                'skewness': skewness,
                'kurtosis': kurtosis,
                'p_value': p_value,
                'test_used': test_used,
                'needs_transform': needs_transform
            }

            # Conditional Detailed Logging
            if debug_flag:
                self._log(f"Feature '{col}': p-value={p_value:.4f}, skewness={skewness:.4f}, needs_transform={needs_transform}", step_name, 'debug')

        self.normality_results = normality_results
        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Normality results computed.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Normality results computed.")

        return normality_results


    def generate_recommendations(self) -> pd.DataFrame:
        """
        Generate a table of preprocessing recommendations based on the model type, data, and user options.

        Returns:
            pd.DataFrame: DataFrame containing recommendations for each feature.
        """
        step_name = "Generate Preprocessor Recommendations"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_generate_recommendations')

        # Generate recommendations based on feature reasons
        recommendations = {}
        for col in self.ordinal_categoricals + self.nominal_categoricals + self.numericals:
            reasons = self.feature_reasons.get(col, '').strip(' | ')
            recommendations[col] = reasons

        recommendations_table = pd.DataFrame.from_dict(
            recommendations, 
            orient='index', 
            columns=['Preprocessing Reason']
        )
        if debug_flag:
            self.logger.debug(f"Preprocessing Recommendations:\n{recommendations_table}")
        else:
            self.logger.info("Preprocessing Recommendations generated.")

        self.preprocessing_steps.append(step_name)

        # Completion Logging
        if debug_flag:
            self._log(f"Completed: {step_name}. Recommendations generated.", step_name, 'debug')
        else:
            self.logger.info(f"Step '{step_name}' completed: Recommendations generated.")

        return recommendations_table

    def save_transformers(self):
        step_name = "Save Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_save_transformers')
        
        # Ensure the transformers directory exists
        os.makedirs(self.transformers_dir, exist_ok=True)
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')  # Consistent file path
        
        transformers = {
            'numerical_imputer': getattr(self, 'numerical_imputer', None),
            'categorical_imputer': getattr(self, 'categorical_imputer', None),
            'preprocessor': self.pipeline,   # Includes all preprocessing steps
            'smote': self.smote,
            'final_feature_order': self.final_feature_order,
            'categorical_indices': self.categorical_indices
        }
        try:
            joblib.dump(transformers, transformers_path)
            if debug_flag:
                self._log(f"Transformers saved at '{transformers_path}'.", step_name, 'debug')
            else:
                self.logger.info(f"Transformers saved at '{transformers_path}'.")
        except Exception as e:
            self.logger.error(f"❌ Failed to save transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)

    def load_transformers(self) -> dict:
        step_name = "Load Transformers"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_load_transformers')  # Assuming a step-specific debug flag
        transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')  # Correct path

        # Debug log
        self.logger.debug(f"Loading transformers from: {transformers_path}")

        if not os.path.exists(transformers_path):
            self.logger.error(f"❌ Transformers file not found at '{transformers_path}'. Cannot proceed with prediction.")
            raise FileNotFoundError(f"Transformers file not found at '{transformers_path}'.")

        try:
            transformers = joblib.load(transformers_path)

            # Extract transformers
            numerical_imputer = transformers.get('numerical_imputer')
            categorical_imputer = transformers.get('categorical_imputer')
            preprocessor = transformers.get('preprocessor')
            smote = transformers.get('smote', None)
            final_feature_order = transformers.get('final_feature_order', [])
            categorical_indices = transformers.get('categorical_indices', [])
            self.categorical_indices = categorical_indices  # Set the attribute

            # **Post-Loading Debugging:**
            if preprocessor is not None:
                try:
                    # Do not attempt to transform dummy data here
                    self.logger.debug(f"Pipeline loaded. Ready to transform new data.")
                except AttributeError as e:
                    self.logger.error(f"Pipeline's get_feature_names_out is not available: {e}")
                    expected_features = []
            else:
                self.logger.error("❌ Preprocessor is not loaded.")
                raise AttributeError("Preprocessor is not loaded.")

        except Exception as e:
            self.logger.error(f"❌ Failed to load transformers: {e}")
            raise

        self.preprocessing_steps.append(step_name)

        # Additional checks
        if preprocessor is None:
            self.logger.error("❌ Preprocessor is not loaded.")

        if debug_flag:
            self._log(f"Transformers loaded successfully from '{transformers_path}'.", step_name, 'debug')
        else:
            self.logger.info(f"Transformers loaded successfully from '{transformers_path}'.")

        # Set the pipeline
        self.pipeline = preprocessor

        # Return the transformers as a dictionary
        return {
            'numerical_imputer': numerical_imputer,
            'categorical_imputer': categorical_imputer,
            'preprocessor': preprocessor,
            'smote': smote,
            'final_feature_order': final_feature_order,
            'categorical_indices': categorical_indices
        }


    def determine_n_neighbors(self, minority_count: int, default_neighbors: int = 5) -> int:
        """
        Determine the appropriate number of neighbors for SMOTE based on minority class size.

        Args:
            minority_count (int): Number of samples in the minority class.
            default_neighbors (int): Default number of neighbors to use if possible.

        Returns:
            int: Determined number of neighbors for SMOTE.
        """
        if minority_count <= 1:
            raise ValueError("SMOTE cannot be applied when the minority class has less than 2 samples.")
        
        # Ensure n_neighbors does not exceed minority_count - 1
        n_neighbors = min(default_neighbors, minority_count - 1)
        return n_neighbors

    def implement_smote(self, X_train: pd.DataFrame, y_train: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Implement SMOTE or its variants based on class imbalance with automated n_neighbors selection.

        Args:
            X_train (pd.DataFrame): Training features (transformed).
            y_train (pd.Series): Training target.

        Returns:
            Tuple[pd.DataFrame, pd.Series]: Resampled X_train and y_train.
        """
        step_name = "Implement SMOTE (Train Only)"
        self.logger.info(f"Step: {step_name}")

        # Check if classification
        if self.model_category != 'classification':
            self.logger.info("SMOTE not applicable: Not a classification model.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        # Calculate class distribution
        class_counts = y_train.value_counts()
        if len(class_counts) < 2:
            self.logger.warning("SMOTE not applicable: Only one class present.")
            self.preprocessing_steps.append("SMOTE Skipped")
            return X_train, y_train

        majority_class = class_counts.idxmax()
        minority_class = class_counts.idxmin()
        majority_count = class_counts.max()
        minority_count = class_counts.min()
        imbalance_ratio = minority_count / majority_count
        self.logger.info(f"Class Distribution before SMOTE: {class_counts.to_dict()}")
        self.logger.info(f"Imbalance Ratio (Minority/Majority): {imbalance_ratio:.4f}")

        # Determine SMOTE variant based on dataset composition
        has_numericals = len(self.numericals) > 0
        has_categoricals = len(self.ordinal_categoricals) + len(self.nominal_categoricals) > 0

        # Automatically select SMOTE variant
        if has_numericals and has_categoricals:
            smote_variant = 'SMOTENC'
            self.logger.info("Dataset contains both numerical and categorical features. Using SMOTENC.")
        elif has_numericals and not has_categoricals:
            smote_variant = 'SMOTE'
            self.logger.info("Dataset contains only numerical features. Using SMOTE.")
        elif has_categoricals and not has_numericals:
            smote_variant = 'SMOTEN'
            self.logger.info("Dataset contains only categorical features. Using SMOTEN.")
        else:
            smote_variant = 'SMOTE'  # Fallback
            self.logger.info("Feature composition unclear. Using SMOTE as default.")

        # Initialize SMOTE based on the variant
        try:
            if smote_variant == 'SMOTENC':
                if not self.categorical_indices:
                    # Determine categorical indices if not already set
                    categorical_features = []
                    for name, transformer, features in self.pipeline.transformers_:
                        if 'ord' in name or 'nominal' in name:
                            if isinstance(transformer, Pipeline):
                                encoder = transformer.named_steps.get('ordinal_encoder') or transformer.named_steps.get('onehot_encoder')
                                if hasattr(encoder, 'categories_'):
                                    # Calculate indices based on transformers order
                                    # This can be complex; for simplicity, assuming categorical features are the first
                                    categorical_features.extend(range(len(features)))
                    self.categorical_indices = categorical_features
                    self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTENC(categorical_features=self.categorical_indices, random_state=42, k_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTENC with categorical features indices: {self.categorical_indices} and n_neighbors={n_neighbors}")
            elif smote_variant == 'SMOTEN':
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTEN(random_state=42, n_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTEN with n_neighbors={n_neighbors}")
            else:
                n_neighbors = self.determine_n_neighbors(minority_count, default_neighbors=5)
                smote = SMOTE(random_state=42, k_neighbors=n_neighbors)
                self.logger.debug(f"Initialized SMOTE with n_neighbors={n_neighbors}")
        except ValueError as ve:
            self.logger.error(f"❌ SMOTE initialization failed: {ve}")
            raise
        except Exception as e:
            self.logger.error(f"❌ Unexpected error during SMOTE initialization: {e}")
            raise

        # Apply SMOTE
        try:
            X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
            self.logger.info(f"Applied {smote_variant}. Resampled dataset shape: {X_resampled.shape}")
            self.preprocessing_steps.append("Implement SMOTE")
            self.smote = smote  # Assign to self for saving
            self.logger.debug(f"Selected n_neighbors for SMOTE: {n_neighbors}")
            return X_resampled, y_resampled
        except Exception as e:
            self.logger.error(f"❌ SMOTE application failed: {e}")
            raise

    def inverse_transform_data(self, X_transformed: np.ndarray, original_data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Perform inverse transformation on the transformed data to reconstruct original feature values.

        Args:
            X_transformed (np.ndarray): The transformed feature data.
            original_data (Optional[pd.DataFrame]): The original data before transformation.

        Returns:
            pd.DataFrame: The inverse-transformed DataFrame including passthrough columns.
        """
        if self.pipeline is None:
            self.logger.error("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")
            raise AttributeError("Preprocessing pipeline has not been fitted. Cannot perform inverse transformation.")

        preprocessor = self.pipeline
        logger = logging.getLogger('InverseTransform')
        if self.debug or self.get_debug_flag('debug_final_inverse_transformations'):
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)

        logger.debug(f"[DEBUG Inverse] Starting inverse transformation. Input shape: {X_transformed.shape}")

        # Initialize variables
        inverse_data = {}
        transformations_applied = False  # Flag to check if any transformations are applied
        start_idx = 0  # Starting index for slicing

        # Iterate over each transformer in the ColumnTransformer
        for name, transformer, features in preprocessor.transformers_:
            if name == 'remainder':
                logger.debug(f"[DEBUG Inverse] Skipping 'remainder' transformer (passthrough columns).")
                continue  # Skip passthrough columns

            end_idx = start_idx + len(features)
            logger.debug(f"[DEBUG Inverse] Transformer '{name}' handling features {features} with slice {start_idx}:{end_idx}")

            # Check if the transformer has an inverse_transform method
            if hasattr(transformer, 'named_steps'):
                # Access the last step in the pipeline (e.g., scaler or encoder)
                last_step = list(transformer.named_steps.keys())[-1]
                inverse_transformer = transformer.named_steps[last_step]

                if hasattr(inverse_transformer, 'inverse_transform'):
                    transformed_slice = X_transformed[:, start_idx:end_idx]
                    inverse_slice = inverse_transformer.inverse_transform(transformed_slice)

                    # Assign inverse-transformed data to the corresponding feature names
                    for idx, feature in enumerate(features):
                        inverse_data[feature] = inverse_slice[:, idx]

                    logger.debug(f"[DEBUG Inverse] Applied inverse_transform on transformer '{last_step}' for features {features}.")
                    transformations_applied = True
                else:
                    logger.debug(f"[DEBUG Inverse] Transformer '{last_step}' does not support inverse_transform. Skipping.")
            else:
                logger.debug(f"[DEBUG Inverse] Transformer '{name}' does not have 'named_steps'. Skipping.")

            start_idx = end_idx  # Update starting index for next transformer

        # Convert the inverse_data dictionary to a DataFrame
        if transformations_applied:
            inverse_df = pd.DataFrame(inverse_data, index=original_data.index if original_data is not None else None)
            logger.debug(f"[DEBUG Inverse] Inverse DataFrame shape (transformed columns): {inverse_df.shape}")
            logger.debug(f"[DEBUG Inverse] Sample of inverse-transformed data:\n{inverse_df.head()}")
        else:
            if original_data is not None:
                logger.warning("⚠️ No reversible transformations were applied. Returning original data.")
                inverse_df = original_data.copy()
                logger.debug(f"[DEBUG Inverse] Returning a copy of original_data with shape: {inverse_df.shape}")
            else:
                logger.error("❌ No transformations were applied and original_data was not provided. Cannot perform inverse transformation.")
                raise ValueError("No transformations were applied and original_data was not provided.")

        # Identify passthrough columns by excluding transformed features
        if original_data is not None and transformations_applied:
            transformed_features = set(inverse_data.keys())
            all_original_features = set(original_data.columns)
            passthrough_columns = list(all_original_features - transformed_features)
            logger.debug(f"[DEBUG Inverse] Inverse DataFrame columns before pass-through merge: {inverse_df.columns.tolist()}")
            logger.debug(f"[DEBUG Inverse] all_original_features: {list(all_original_features)}")
            logger.debug(f"[DEBUG Inverse] passthrough_columns: {passthrough_columns}")

            if passthrough_columns:
                logger.debug(f"[DEBUG Inverse] Passthrough columns to merge: {passthrough_columns}")
                passthrough_data = original_data[passthrough_columns].copy()
                inverse_df = pd.concat([inverse_df, passthrough_data], axis=1)

                # Ensure the final DataFrame has the same column order as original_data
                inverse_df = inverse_df[original_data.columns]
                logger.debug(f"[DEBUG Inverse] Final inverse DataFrame shape: {inverse_df.shape}")
                
                # Check for missing columns after inverse transform
                expected_columns = set(original_data.columns)
                final_columns = set(inverse_df.columns)
                missing_after_inverse = expected_columns - final_columns

                if missing_after_inverse:
                    err_msg = (
                    f"Inverse transform error: The following columns are missing "
                    f"after inverse transform: {missing_after_inverse}"
                    )
                    logger.error(err_msg)
                    raise ValueError(err_msg)
            else:
                logger.debug("[DEBUG Inverse] No passthrough columns to merge.")
        else:
            logger.debug("[DEBUG Inverse] Either no original_data provided or no transformations were applied.")

        return inverse_df



    def build_pipeline(self, X_train: pd.DataFrame) -> ColumnTransformer:
        transformers = []

        # Handle Numerical Features
        if self.numericals:
            numerical_strategy = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('strategy', 'median')
            numerical_imputer = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('imputer', 'SimpleImputer')

            if numerical_imputer == 'SimpleImputer':
                num_imputer = SimpleImputer(strategy=numerical_strategy)
            elif numerical_imputer == 'KNNImputer':
                knn_neighbors = self.options.get('handle_missing_values', {}).get('numerical_strategy', {}).get('knn_neighbors', 5)
                num_imputer = KNNImputer(n_neighbors=knn_neighbors)
            else:
                raise ValueError(f"Unsupported numerical imputer type: {numerical_imputer}")

            # Determine scaling method
            scaling_method = self.options.get('apply_scaling', {}).get('method', None)
            if scaling_method is None:
                # Default scaling based on model category
                if self.model_category in ['regression', 'classification', 'clustering']:
                    # For clustering, MinMaxScaler is generally preferred
                    if self.model_category == 'clustering':
                        scaler = MinMaxScaler()
                        scaling_type = 'MinMaxScaler'
                    else:
                        scaler = StandardScaler()
                        scaling_type = 'StandardScaler'
                else:
                    scaler = 'passthrough'
                    scaling_type = 'None'
            else:
                # Normalize the scaling_method string to handle case-insensitivity
                scaling_method_normalized = scaling_method.lower()
                if scaling_method_normalized == 'standardscaler':
                    scaler = StandardScaler()
                    scaling_type = 'StandardScaler'
                elif scaling_method_normalized == 'minmaxscaler':
                    scaler = MinMaxScaler()
                    scaling_type = 'MinMaxScaler'
                elif scaling_method_normalized == 'robustscaler':
                    scaler = RobustScaler()
                    scaling_type = 'RobustScaler'
                elif scaling_method_normalized == 'none':
                    scaler = 'passthrough'
                    scaling_type = 'None'
                else:
                    raise ValueError(f"Unsupported scaling method: {scaling_method}")

            numerical_transformer = Pipeline(steps=[
                ('imputer', num_imputer),
                ('scaler', scaler)
            ])

            transformers.append(('num', numerical_transformer, self.numericals))
            self.logger.debug(f"Numerical transformer added with imputer '{numerical_imputer}' and scaler '{scaling_type}'.")

        # Handle Ordinal Categorical Features
        if self.ordinal_categoricals:
            ordinal_strategy = self.options.get('encode_categoricals', {}).get('ordinal_encoding', 'OrdinalEncoder')
            if ordinal_strategy == 'OrdinalEncoder':
                ordinal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('ordinal_encoder', OrdinalEncoder())
                ])
                transformers.append(('ord', ordinal_transformer, self.ordinal_categoricals))
                self.logger.debug("Ordinal transformer added with OrdinalEncoder.")
            else:
                raise ValueError(f"Unsupported ordinal encoding strategy: {ordinal_strategy}")

        # Handle Nominal Categorical Features
        if self.nominal_categoricals:
            nominal_strategy = self.options.get('encode_categoricals', {}).get('nominal_encoding', 'OneHotEncoder')
            if nominal_strategy == 'OneHotEncoder':
                nominal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
                ])
                transformers.append(('nominal', nominal_transformer, self.nominal_categoricals))
                self.logger.debug("Nominal transformer added with OneHotEncoder.")
            elif nominal_strategy == 'OrdinalEncoder':
                nominal_transformer = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                    ('ordinal_encoder', OrdinalEncoder())
                ])
                transformers.append(('nominal_ord', nominal_transformer, self.nominal_categoricals))
                self.logger.debug("Nominal transformer added with OrdinalEncoder.")
            elif nominal_strategy == 'FrequencyEncoder':
                # Implement custom Frequency Encoding
                for feature in self.nominal_categoricals:
                    freq = X_train[feature].value_counts(normalize=True)
                    X_train[feature] = X_train[feature].map(freq)
                    self.feature_reasons[feature] += 'Frequency Encoding applied | '
                    self.logger.debug(f"Frequency Encoding applied to '{feature}'.")
            else:
                raise ValueError(f"Unsupported nominal encoding strategy: {nominal_strategy}")

        if not transformers and 'FrequencyEncoder' not in nominal_strategy:
            self.logger.error("No transformers added to the pipeline. Check feature categorization and configuration.")
            raise ValueError("No transformers added to the pipeline. Check feature categorization and configuration.")

        preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough')
        self.logger.debug("ColumnTransformer constructed with the following transformers:")
        for t in transformers:
            self.logger.debug(t)

        preprocessor.fit(X_train)
        self.logger.info("✅ Preprocessor fitted on training data.")

        # Determine categorical feature indices for SMOTENC if needed
        if self.options.get('implement_smote', {}).get('variant', None) == 'SMOTENC':
            if not self.categorical_indices:
                categorical_features = []
                for name, transformer, features in preprocessor.transformers_:
                    if 'ord' in name or 'nominal' in name:
                        if isinstance(transformer, Pipeline):
                            encoder = transformer.named_steps.get('ordinal_encoder') or transformer.named_steps.get('onehot_encoder')
                            if hasattr(encoder, 'categories_'):
                                # Calculate indices based on transformers order
                                # This can be complex; for simplicity, assuming categorical features are the first
                                categorical_features.extend(range(len(features)))
                self.categorical_indices = categorical_features
                self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")

        return preprocessor




    def check_target_alignment(self, X_seq: Any, y_seq: Any, horizon: int) -> bool:
        """
        Verify that for each sequence the target length matches expectations.
        For 'set_window' mode, the target should have 'horizon' rows;
        otherwise, it should equal the sequence length.
        """
        for idx, (seq, target) in enumerate(zip(X_seq, y_seq)):
            seq_length = seq.shape[0] if hasattr(seq, 'shape') else len(seq)
            
            # [UPDATED] Use current self.horizon value which may have been auto-updated
            expected_length = horizon if self.time_series_sequence_mode == "set_window" else self.horizon
            
            actual_length = target.shape[0] if hasattr(target, 'shape') else len(target)
            self.logger.debug(
                f"Sequence {idx}: sequence length = {seq_length}, expected target length = {expected_length}, actual target length = {actual_length}"
            )
            if actual_length != expected_length:
                self.logger.error(
                    f"Alignment error in sequence {idx}: expected target length {expected_length} but got {actual_length}"
                )
                return False
        return True



    def get_phase_order(self) -> List[str]:
        """Return temporal order of phases based on data or fallback to config."""
        predefined_order = ["windup", "arm_cocking", "arm_acceleration", "follow_through"]
        
        # Priority 1: Use phases detected during global alignment
        if hasattr(self, 'global_target_lengths') and self.global_target_lengths:
            detected_phases = list(self.global_target_lengths.keys())
            
            # Sort phases: predefined first, others alphabetically
            ordered = sorted(
                detected_phases,
                key=lambda x: (
                    predefined_order.index(x) 
                    if x in predefined_order 
                    else len(predefined_order),  # Push unknown phases to end
                    x
                )
            )
            return ordered
        
        # Priority 2: Fallback to configuration if no alignment data
        return self.sequence_dtw_or_pad_categorical or []


    def reassemble_phases(self, aligned_phases: Dict) -> Tuple[Dict, Dict]:
        """
        Concatenate the aligned phase arrays for each group along the temporal axis.
        This function:
        - Checks that all phases in the expected order (from get_phase_order) are present.
        - If any are missing, it raises an error with details.
        - Orders the phases and concatenates them along axis=0.
        - Records metadata (individual phase lengths, total features).
        
        Args:
            aligned_phases (dict): Dictionary mapping group keys to dictionaries of aligned phase arrays.
        
        Returns:
            Tuple[Dict, Dict]: A dictionary of final sequences and metadata.
        """
        phase_order = self.get_phase_order()
        final_seqs = {}
        metadata = {}
        for group_key, phases in aligned_phases.items():
            # Check if all expected phases are present.
            missing = set(phase_order) - set(phases.keys())
            if missing:
                self.logger.error(f"Group {group_key} is missing phases: {missing}")
                raise ValueError(f"Missing phases {missing} in group {group_key}")
            
            # Order the phases accordingly.
            ordered_phases = [phases[name] for name in phase_order]
            # Concatenate along time axis (axis=0)
            full_seq = np.concatenate(ordered_phases, axis=0)
            metadata[group_key] = {
                "phase_lengths": [arr.shape[0] for arr in ordered_phases],
                "total_features": full_seq.shape[1]
            }
            final_seqs[group_key] = full_seq
            self.logger.debug(
                f"Group {group_key} reassembled: shape {full_seq.shape} "
                f"(Phase lengths: {metadata[group_key]['phase_lengths']})"
            )
        return final_seqs, metadata


    def validate_temporal_integrity(self, final_seqs: Dict, metadata: Dict):
        """
        For each group, verify that the concatenated sequence length equals the sum of individual phase lengths.
        Raises a ValueError if a mismatch is found.
        """
        for group_key, seq in final_seqs.items():
            expected_length = sum(metadata[group_key]["phase_lengths"])
            if seq.shape[0] != expected_length:
                raise ValueError(
                    f"Group {group_key}: Expected length {expected_length}, got {seq.shape[0]}. "
                    f"Phase lengths: {metadata[group_key]['phase_lengths']}"
                )


    def validate_feature_space(self, final_seqs: Dict):
        """
        Ensure that all final sequences have the same number of features.
        """
        base_features = next(iter(final_seqs.values())).shape[1]
        for group_key, seq in final_seqs.items():
            if seq.shape[1] != base_features:
                raise ValueError(
                    f"Group {group_key}: Feature dimension mismatch. Expected {base_features}, got {seq.shape[1]}"
                )


    def log_phase_lengths(self, aligned_phases: Dict):
        """
        Logs the dimensions of each phase for each group for debugging purposes.
        """
        for group_key, phases in aligned_phases.items():
            self.logger.debug(f"\nGroup {group_key} phase dimensions:")
            for pname, parr in phases.items():
                self.logger.debug(f"  {pname}: {parr.shape}")
            # Assuming all phases have the same feature dimension:
            any_phase = next(iter(phases.values()))
            self.logger.debug(f"Total features (from a phase): {any_phase[1].shape[1] if isinstance(any_phase, tuple) else any_phase.shape[1]}")


    def sanity_check_concatenation(self, input_phases: List[np.ndarray], output_seq: np.ndarray):
        """
        Perform a sanity check by comparing sample values between the input phases and output sequence.
        Verifies that the very first value of the first phase and the last value of the last phase are preserved.
        """
        phase1_start = input_phases[0][0, 0]
        phaseN_end = input_phases[-1][-1, -1]
        if not np.isclose(output_seq[0, 0], phase1_start):
            raise AssertionError("Start value mismatch in concatenated sequence")
        if not np.isclose(output_seq[-1, -1], phaseN_end):
            raise AssertionError("End value mismatch in concatenated sequence")
        self.logger.debug("Sanity check passed for concatenation.")


    def full_reassembly_pipeline(self, aligned_phases: Dict) -> Tuple[np.ndarray, np.ndarray]:
        """
        Executes the complete reassembly pipeline:
        1. Logs input phase dimensions.
        2. Reassembles phases with reassemble_phases().
        3. Validates temporal integrity and feature consistency.
        4. Performs a sanity check on one sample group.
        5. Returns the final sequences and corresponding group labels.
        
        Args:
            aligned_phases (dict): Dictionary mapping group keys to aligned phase data.
        
        Returns:
            Tuple[np.ndarray, np.ndarray]: Final sequences array and array of group keys.
        """
        self.logger.debug("Starting full reassembly pipeline.")
        # Log the input phase dimensions
        self.logger.debug("Input phase dimensions:")
        self.log_phase_lengths(aligned_phases)
        
        # Reassemble phases
        final_seqs_dict, metadata = self.reassemble_phases(aligned_phases)
        
        # Run validations
        self.validate_temporal_integrity(final_seqs_dict, metadata)
        self.validate_feature_space(final_seqs_dict)

        # Perform a sanity check on one sample group
        sample_group = next(iter(aligned_phases.keys()))
        sample_phases = [aligned_phases[sample_group][p] for p in self.get_phase_order()]
        self.sanity_check_concatenation(sample_phases, final_seqs_dict[sample_group])
        
        # Convert the final sequences to arrays
        group_keys = list(final_seqs_dict.keys())
        sequences = np.array([final_seqs_dict[gk] for gk in group_keys])
        return sequences, np.array(group_keys)


    # ---------------------------------------------------------------------
    def apply_psi_feature_selection(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Apply PSI-based feature selection using feature-engine's DropHighPSIFeatures.
        
        Reads configuration from self.options['psi_feature_selection'] (which is under time_series).
        """
        psi_config = self.options.get('psi_feature_selection', {})
        if not psi_config.get('enabled', False):
            self.logger.info("PSI-based feature selection is disabled. Skipping PSI step.")
            return data

        psi_threshold = psi_config.get('threshold', 0.25)
        split_frac = psi_config.get('split_frac', 0.75)
        split_distinct = psi_config.get('split_distinct', False)
        cut_off = psi_config.get('cut_off', None)

        if self.time_column is None or self.time_column not in data.columns:
            self.logger.warning("No time column specified or found. PSI-based feature selection cannot be applied.")
            return data

        features_to_analyze = [col for col in data.columns if col not in self.y_variable and col != self.time_column]

        try:
            psi_transformer = DropHighPSIFeatures(
                variables=features_to_analyze,
                split_col=self.time_column,
                split_frac=split_frac,
                split_distinct=split_distinct,
                threshold=psi_threshold,
                cut_off=cut_off
            )
            data_reduced = psi_transformer.fit_transform(data)
            dropped_features = set(features_to_analyze) - set(data_reduced.columns)
            if dropped_features:
                self.logger.info(f"Dropped {len(dropped_features)} features due to high PSI: {dropped_features}")
                for feature in dropped_features:
                    if feature in self.feature_reasons:
                        self.feature_reasons[feature] += f"Dropped due to high PSI (threshold={psi_threshold}) | "
                self.psi_values = psi_transformer.psi_values_
            return data_reduced

        except Exception as e:
            self.logger.error(f"Error during PSI-based feature selection: {e}")
            self.logger.warning("Proceeding without PSI-based feature selection.")
            return data



    def split_with_feature_engine(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Split time series data using feature-engine's built-in splitting functionality.
        
        This method reads splitting parameters (split_frac, split_distinct, cut_off) from 
        self.options['feature_engine_split']. It creates a temporary DropHighPSIFeatures 
        transformer (with a dummy threshold so that no features are dropped) to perform the
        chronological split into a reference set (train) and a test set.
        
        Args:
            data (pd.DataFrame): The time series DataFrame to be split.
            
        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: (reference_set, test_set)
        """
        # Get feature-engine splitting configuration
        fe_config = self.options.get('feature_engine_split', {})
        split_frac = fe_config.get('split_frac', 0.75)
        split_distinct = fe_config.get('split_distinct', False)
        cut_off = fe_config.get('cut_off', None)

        # Check for time column availability
        if self.time_column is None or self.time_column not in data.columns:
            self.logger.warning("No time column specified or found. Falling back to standard time series splitting.")
            return self.split_time_series(data)

        try:
            # Initialize a dummy PSI transformer with threshold=1.0 (so that no features are dropped)
            splitter = DropHighPSIFeatures(
                variables=[],  # no features to analyze, we only use its splitting functionality
                split_col=self.time_column,
                split_frac=split_frac,
                split_distinct=split_distinct,
                threshold=1.0,  # set high so nothing is dropped
                cut_off=cut_off
            )
            splitter.fit(data)
            # Assume the transformer stores reference_set_ and test_set_ attributes
            reference_set = splitter.reference_set_.copy()
            test_set = splitter.test_set_.copy()
            self.logger.info(f"Data split using feature-engine method: reference_set shape={reference_set.shape}, test_set shape={test_set.shape}")
            return reference_set, test_set

        except Exception as e:
            self.logger.error(f"Error during feature-engine splitting: {e}")
            self.logger.warning("Falling back to standard time series splitting.")
            return self.split_time_series(data)


    def preprocess_time_series(self, data: pd.DataFrame) -> Tuple[Any, Any, Any, Any, pd.DataFrame, Any]:
        """
        Preprocess data for time series models with PSI-based feature selection and
        feature-engine splitting. The method reads all related options from the time_series section.
        """
        # 1. Handle missing values and outliers.
        data_clean, _ = self.handle_missing_values(data)
        X_temp = data_clean.drop(columns=self.y_variable)
        y_temp = data_clean[self.y_variable]
        X_temp, y_temp = self.handle_outliers(X_temp, y_temp)
        data_clean = pd.concat([X_temp, y_temp], axis=1)

        # 2. Sort data chronologically by self.time_column.
        if self.time_column is None:
            raise ValueError("For time series models, 'time_column' must be specified.")
        data_clean['__time__'] = pd.to_datetime(data_clean[self.time_column])
        data_sorted = data_clean.sort_values(by='__time__').drop(columns=['__time__'])

        # 3. Optionally apply PSI-based feature selection before splitting.
        psi_config = self.options.get('psi_feature_selection', {})
        if psi_config.get('enabled', False) and psi_config.get('apply_before_split', True):
            self.logger.info("Applying PSI-based feature selection before splitting.")
            data_sorted = self.apply_psi_feature_selection(data_sorted)

        # 4. Split data using the method specified in self.options['time_series_split'].
        split_method = self.options.get('time_series_split', {}).get('method', 'standard')
        if split_method == 'feature_engine':
            train_data, test_data = self.split_with_feature_engine(data_sorted)
        else:
            test_size = self.options.get('split_dataset', {}).get('test_size', 0.2)
            random_state = self.options.get('split_dataset', {}).get('random_state', 42)
            train_data, test_data = self.split_time_series(data_sorted, test_size, random_state)
        self.logger.info(f"Data split: train_data shape={train_data.shape}, test_data shape={test_data.shape}")

        # 5. Optionally apply PSI-based selection after splitting if configured.
        if psi_config.get('enabled', False) and not psi_config.get('apply_before_split', True):
            self.logger.info("Applying PSI-based feature selection after splitting on combined data.")
            combined_data = pd.concat([train_data, test_data])
            selected_features = self.apply_psi_feature_selection(combined_data).columns
            train_data = train_data[selected_features.intersection(train_data.columns)]
            test_data = test_data[selected_features.intersection(test_data.columns)]

        # 6. Extract X and y from data_sorted.
        X_clean = data_sorted.drop(columns=self.y_variable)
        y_clean = data_sorted[self.y_variable]

        # 7. Process sequences based on the configured time_series_sequence_mode.
        if self.time_series_sequence_mode == "set_window":
            X_train_seq, y_train_seq = self.process_set_window(train_data)
            X_test_seq, y_test_seq = self.process_set_window(test_data)
        elif self.time_series_sequence_mode in ["dtw", "pad", "variable_length"]:
            X_train_seq, y_train_seq = self.process_dtw_or_pad(train_data)
            X_test_seq, y_test_seq = self.process_dtw_or_pad(test_data)
        else:
            raise ValueError(f"Invalid time_series_sequence_mode: {self.time_series_sequence_mode}")

        # 8. If classification and SMOTE-TS is enabled, apply it.
        if self.model_category == 'classification' and self.options.get('apply_smote_ts', False):
            unique, counts = np.unique(y_train_seq, return_counts=True)
            imbalance_ratio = min(counts) / max(counts)
            threshold = self.options.get('smote_ts_config', {}).get('min_imbalance_ratio', 0.8)
            if imbalance_ratio < threshold:
                self.logger.info(f"Class imbalance ratio {imbalance_ratio:.2f} is below threshold {threshold}; applying SMOTE-TS.")
                X_train_seq, y_train_seq = self._apply_smote_ts(X_train_seq, y_train_seq)
            else:
                self.logger.info("Class imbalance is not significant; skipping SMOTE-TS.")

        # 9. Validate alignment between sequences and targets.
        if y_train_seq is not None and not self.check_target_alignment(X_train_seq, y_train_seq, self.horizon):
            self.logger.warning("Target alignment check failed for training sequences.")
        if y_test_seq is not None and not self.check_target_alignment(X_test_seq, y_test_seq, self.horizon):
            self.logger.warning("Target alignment check failed for test sequences.")

        # 10. Flag extreme phases and log top outliers.
        self._flag_extreme_phases(self.follow_through_stats)
        self._log_top_outliers()

        # 11. Generate recommendations and save transformers.
        recommendations = self.generate_recommendations()
        self.final_feature_order = list(self.pipeline.get_feature_names_out())
        self.save_transformers()

        # 12. Post-processing report.
        self.post_processing_report()

        return X_train_seq, X_test_seq, y_train_seq, y_test_seq, recommendations, None




    def preprocess_train(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess training data for various model types.
        For time series models, delegate to preprocess_time_series.
        
        Returns:
            - For standard models: X_train_final, X_test_final, y_train_smoted, y_test, recommendations, X_test_inverse.
            - For time series models: X_seq, None, y_seq, None, recommendations, None.
        """
        # If the model is time series, use the dedicated time series preprocessing flow.
        if self.model_category == 'time_series':
            return self.preprocess_time_series(X, y)
        
        # Standard preprocessing flow for classification/regression/clustering
        X_train_original, X_test_original, y_train_original, y_test = self.split_dataset(X, y)
        X_train_missing_values, X_test_missing_values = self.handle_missing_values(X_train_original, X_test_original)
        
        # Only perform normality tests if applicable
        if self.model_category in ['regression', 'classification', 'clustering']:
            self.test_normality(X_train_missing_values)
        
        X_train_outliers_handled, y_train_outliers_handled = self.handle_outliers(X_train_missing_values, y_train_original)
        X_test_outliers_handled = X_test_missing_values.copy() if X_test_missing_values is not None else None
        recommendations = self.generate_recommendations()
        self.pipeline = self.build_pipeline(X_train_outliers_handled)
        X_train_preprocessed = self.pipeline.fit_transform(X_train_outliers_handled)
        X_test_preprocessed = self.pipeline.transform(X_test_outliers_handled) if X_test_outliers_handled is not None else None

        if self.model_category == 'classification':
            try:
                X_train_smoted, y_train_smoted = self.implement_smote(X_train_preprocessed, y_train_outliers_handled)
            except Exception as e:
                self.logger.error(f"❌ SMOTE application failed: {e}")
                raise
        else:
            X_train_smoted, y_train_smoted = X_train_preprocessed, y_train_outliers_handled
            self.logger.info("⚠️ SMOTE not applied: Not a classification model.")

        self.final_feature_order = list(self.pipeline.get_feature_names_out())
        X_train_final = pd.DataFrame(X_train_smoted, columns=self.final_feature_order)
        X_test_final = pd.DataFrame(X_test_preprocessed, columns=self.final_feature_order, index=X_test_original.index) if X_test_preprocessed is not None else None

        try:
            self.save_transformers()
        except Exception as e:
            self.logger.error(f"❌ Saving transformers failed: {e}")
            raise

        try:
            if X_test_final is not None:
                X_test_inverse = self.inverse_transform_data(X_test_final.values, original_data=X_test_original)
                self.logger.info("✅ Inverse transformations applied successfully.")
            else:
                X_test_inverse = None
        except Exception as e:
            self.logger.error(f"❌ Inverse transformations failed: {e}")
            X_test_inverse = None

        return X_train_final, X_test_final, y_train_smoted, y_test, recommendations, X_test_inverse


    def preprocess_predict(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Preprocess new data for prediction.

        Args:
            X (pd.DataFrame): New data for prediction.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]: X_preprocessed, recommendations, X_inversed
        """
        step_name = "Preprocess Predict"
        self.logger.info(f"Step: {step_name}")

        # Log initial columns and feature count
        self.logger.debug(f"Initial columns in prediction data: {X.columns.tolist()}")
        self.logger.debug(f"Initial number of features: {X.shape[1]}")

        # Load transformers
        try:
            transformers = self.load_transformers()
            self.logger.debug("Transformers loaded successfully.")
        except Exception as e:
            self.logger.error(f"❌ Failed to load transformers: {e}")
            raise

        # Filter columns based on raw feature names
        try:
            X_filtered = self.filter_columns(X)
            self.logger.debug(f"Columns after filtering: {X_filtered.columns.tolist()}")
            self.logger.debug(f"Number of features after filtering: {X_filtered.shape[1]}")
        except Exception as e:
            self.logger.error(f"❌ Failed during column filtering: {e}")
            raise

        # Handle missing values
        try:
            X_filtered, _ = self.handle_missing_values(X_filtered)
            self.logger.debug(f"Columns after handling missing values: {X_filtered.columns.tolist()}")
            self.logger.debug(f"Number of features after handling missing values: {X_filtered.shape[1]}")
        except Exception as e:
            self.logger.error(f"❌ Failed during missing value handling: {e}")
            raise

        # Ensure all expected raw features are present
        expected_raw_features = self.numericals + self.ordinal_categoricals + self.nominal_categoricals
        provided_features = X_filtered.columns.tolist()

        self.logger.debug(f"Expected raw features: {expected_raw_features}")
        self.logger.debug(f"Provided features: {provided_features}")

        missing_raw_features = set(expected_raw_features) - set(provided_features)
        if missing_raw_features:
            self.logger.error(f"❌ Missing required raw feature columns in prediction data: {missing_raw_features}")
            raise ValueError(f"Missing required raw feature columns in prediction data: {missing_raw_features}")

        # Handle unexpected columns (optional: ignore or log)
        unexpected_features = set(provided_features) - set(expected_raw_features)
        if unexpected_features:
            self.logger.warning(f"⚠️ Unexpected columns in prediction data that will be ignored: {unexpected_features}")

        # Ensure the order of columns matches the pipeline's expectation (optional)
        X_filtered = X_filtered[expected_raw_features]
        self.logger.debug("Reordered columns to match the pipeline's raw feature expectations.")

        # Transform data using the loaded pipeline
        try:
            X_preprocessed_np = self.pipeline.transform(X_filtered)
            self.logger.debug(f"Transformed data shape: {X_preprocessed_np.shape}")
        except Exception as e:
            self.logger.error(f"❌ Transformation failed: {e}")
            raise

        # Retrieve feature names from the pipeline or use stored final_feature_order
        if hasattr(self.pipeline, 'get_feature_names_out'):
            try:
                columns = self.pipeline.get_feature_names_out()
                self.logger.debug(f"Derived feature names from pipeline: {columns.tolist()}")
            except Exception as e:
                self.logger.warning(f"Could not retrieve feature names from pipeline: {e}")
                columns = self.final_feature_order
                self.logger.debug(f"Using stored final_feature_order for column names: {columns}")
        else:
            columns = self.final_feature_order
            self.logger.debug(f"Using stored final_feature_order for column names: {columns}")

        # Convert NumPy array back to DataFrame with correct column names
        try:
            X_preprocessed_df = pd.DataFrame(X_preprocessed_np, columns=columns, index=X_filtered.index)
            self.logger.debug(f"X_preprocessed_df columns: {X_preprocessed_df.columns.tolist()}")
            self.logger.debug(f"Sample of X_preprocessed_df:\n{X_preprocessed_df.head()}")
        except Exception as e:
            self.logger.error(f"❌ Failed to convert transformed data to DataFrame: {e}")
            raise

        # Inverse transform for interpretability (optional, for interpretability)
        try:
            self.logger.debug(f"[DEBUG] Original data shape before inverse transform: {X.shape}")
            X_inversed = self.inverse_transform_data(X_preprocessed_np, original_data=X)
            self.logger.debug(f"[DEBUG] Inversed data shape: {X_inversed.shape}")
        except Exception as e:
            self.logger.error(f"❌ Inverse transformation failed: {e}")
            X_inversed = None

        # Generate recommendations (if applicable)
        try:
            recommendations = self.generate_recommendations()
            self.logger.debug("Generated preprocessing recommendations.")
        except Exception as e:
            self.logger.error(f"❌ Failed to generate recommendations: {e}")
            recommendations = pd.DataFrame()

        # Prepare outputs
        return X_preprocessed_df, recommendations, X_inversed

    def preprocess_clustering(self, X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Preprocess data for clustering mode.

        Args:
            X (pd.DataFrame): Input features for clustering.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: X_processed, recommendations.
        """
        step_name = "Preprocess Clustering"
        self.logger.info(f"Step: {step_name}")
        debug_flag = self.get_debug_flag('debug_handle_missing_values')  # Use relevant debug flags

        # Handle Missing Values
        X_missing, _ = self.handle_missing_values(X, None)
        self.logger.debug(f"After handling missing values: X_missing.shape={X_missing.shape}")

        # Handle Outliers
        X_outliers_handled, _ = self.handle_outliers(X_missing, None)
        self.logger.debug(f"After handling outliers: X_outliers_handled.shape={X_outliers_handled.shape}")

        # Test Normality (optional for clustering)
        if self.model_category in ['clustering']:
            self.logger.info("Skipping normality tests for clustering.")
        else:
            self.test_normality(X_outliers_handled)

        # Generate Preprocessing Recommendations
        recommendations = self.generate_recommendations()

        # Build and Fit the Pipeline
        self.pipeline = self.build_pipeline(X_outliers_handled)
        self.logger.debug("Pipeline built and fitted.")

        # Transform the data
        X_processed = self.pipeline.transform(X_outliers_handled)
        self.logger.debug(f"After pipeline transform: X_processed.shape={X_processed.shape}")

        # Optionally, inverse transformations can be handled if necessary

        # Save Transformers (if needed)
        # Not strictly necessary for clustering unless you plan to apply the same preprocessing on new data
        self.save_transformers()

        self.logger.info("✅ Clustering data preprocessed successfully.")

        return X_processed, recommendations

    def final_preprocessing(self, data: pd.DataFrame) -> Tuple:
        """
        Execute the full preprocessing pipeline based on the mode.

        For 'train' mode:
        - If time series: pass the full filtered DataFrame (which includes the target) 
            to preprocess_time_series.
        - Else: split the data into X and y, then call preprocess_train.
        For 'predict' and 'clustering' modes, the existing flow remains unchanged.

        Returns:
            Tuple: Depending on mode:
                - 'train': For standard models: X_train, X_test, y_train, y_test, recommendations, X_test_inverse.
                            For time series models: X_seq, None, y_seq, None, recommendations, None.
                - 'predict': X_preprocessed, recommendations, X_inverse.
                - 'clustering': X_processed, recommendations.
        """
        self.logger.info(f"Starting: Final Preprocessing Pipeline in '{self.mode}' mode.")
        
        try:
            data = self.filter_columns(data)
            self.logger.info("✅ Column filtering completed successfully.")
        except Exception as e:
            self.logger.error(f"❌ Column filtering failed: {e}")
            raise

        if self.mode == 'train':
            if self.model_category == 'time_series':
                # For time series mode, do not split the DataFrame.
                # Pass the full filtered data (which still contains the target variable)
                # so that the time series preprocessing flow can extract the target after cleaning and sorting.
                return self.preprocess_time_series(data)
            else:
                if not all(col in data.columns for col in self.y_variable):
                    missing_y = [col for col in self.y_variable if col not in data.columns]
                    raise ValueError(f"Target variable(s) {missing_y} not found in the dataset.")
                X = data.drop(self.y_variable, axis=1)
                y = data[self.y_variable].iloc[:, 0] if len(self.y_variable) == 1 else data[self.y_variable]
                return self.preprocess_train(X, y)
        
        elif self.mode == 'predict':
            X = data.copy()
            transformers_path = os.path.join(self.transformers_dir, 'transformers.pkl')
            if not os.path.exists(transformers_path):
                self.logger.error(f"❌ Transformers file not found at '{self.transformers_dir}'. Cannot proceed with prediction.")
                raise FileNotFoundError(f"Transformers file not found at '{self.transformers_dir}'.")
            X_preprocessed, recommendations, X_inversed = self.preprocess_predict(X)
            self.logger.info("✅ Preprocessing completed successfully in predict mode.")
            return X_preprocessed, recommendations, X_inversed
        
        elif self.mode == 'clustering':
            X = data.copy()
            return self.preprocess_clustering(X)
        
        else:
            raise NotImplementedError(f"Mode '{self.mode}' is not implemented.")




import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os

# -----------------------------
# CONFIGURATION (using the new paths and features)
# -----------------------------
import yaml

# Load configuration from YAML file
config_file = "../../dataset/test/preprocessor_config/preprocessor_config_baseball.yaml"
with open(config_file, 'r') as f:
    config = yaml.safe_load(f)

# -----------------------------
# TRAINING PHASE (UPDATED)
# -----------------------------
# 1. Load your training data using the configured data_dir and raw_data path.
data_path = os.path.join(config["paths"]["data_dir"], config["paths"]["raw_data"])
data = pd.read_parquet(data_path)
# Filter out time_step column
data = data.drop('time_step', axis=1, errors='ignore')

# Display columns
print("\nDataset columns:")
for col in data.columns:
    print(f"- {col}")

# Check for null sums in the filtered data
null_sums = data.isnull().sum()
if null_sums.any():
    print("[WARNING] Found null values in the following columns:")
    print(null_sums[null_sums > 0])
else:
    print("[INFO] Dataset contains no null values and is ready for machine learning.")

# Filter out "Follow Through" phase
data = data[data['pitch_phase_biomech'] != 'Follow Through']
print(f"[INFO] Training data loaded from {data_path}. Shape: {data.shape}")
print(f"[INFO] Filtered out 'Follow Through' phase. New shape: {data.shape}")

print("Available config keys:", config.keys())
options = config.get("time_series", {})
if not options:
    raise KeyError("The configuration is missing the 'time_series' key. Please verify the YAML configuration.")

# 3. Create a preprocessor in train mode using the new feature lists.
preprocessor = DataPreprocessor(
    model_type="LSTM",
    y_variable=config["features"]["y_variable"],
    ordinal_categoricals=config["features"]["ordinal_categoricals"],
    nominal_categoricals=config["features"]["nominal_categoricals"],
    numericals=config["features"]["numericals"],
    mode="train",
    options=config["time_series"],  # Pass the whole 'models' section
    debug=True,
    graphs_output_dir=config["paths"]["plots_output_dir"],
    transformers_dir=config["paths"]["transformers_save_base_dir"],
    sequence_categorical=["session_biomech", "trial_biomech"],
    sequence_dtw_or_pad_categorical=["pitch_phase_biomech"],
    time_series_sequence_mode=config["time_series"]["ts_sequence_mode"]
)

# 4. Preprocess training data to obtain sequences.
X_seq, _, y_seq, _, recommendations, _ = preprocessor.final_preprocessing(data)
print("Preprocessing recommendations:")
print(recommendations)

# -----------------------------
# TRAINING THE HYBRID MODEL
# -----------------------------
import os
import numpy as np
import tensorflow as tf  # For mixed precision and optimizer modifications
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
# Import TensorFlow Addons for the R² metric
import tensorflow_addons as tfa
# Import TCN module (make sure to install with: pip install tcn)
from tcn import TCN

def build_hybrid_model(input_shape, use_tcn=True, bidirectional=False):
    """
    Builds a model using a TCN-LSTM hybrid architecture or a pure LSTM architecture.
    
    Parameters:
        input_shape (tuple): Shape of the input data (sequence_length, num_features).
        use_tcn (bool): If True, adds TCN layers before the LSTM.
        bidirectional (bool): If True, wraps the LSTM layer in a Bidirectional wrapper.
                             For TCN-LSTM, bidirectional LSTM is applied after TCN layers.
                             
    Returns:
        model (tf.keras.Model): The compiled model.
    """
    model = Sequential()
    # Preserve original masking for padded sequences
    model.add(Masking(mask_value=0., input_shape=input_shape))
    
    if use_tcn:
        print("[ARCHITECTURE] Building TCN-LSTM hybrid model")
        # Add three TCN layers with increasing dilation rates
        for i in range(3):
            # Each TCN layer uses 64 filters, kernel size 3, and dilations that increase with the layer
            model.add(TCN(nb_filters=64,
                          kernel_size=3,
                          dilations=[2**j for j in range(3)],
                          return_sequences=True,
                          activation='relu'))
        # After TCN, add an LSTM layer.
        if bidirectional:
            # Use bidirectional LSTM with 32 units per direction
            model.add(Bidirectional(LSTM(32, return_sequences=False)))
        else:
            model.add(LSTM(64, return_sequences=False))
    else:
        print("[ARCHITECTURE] Building pure LSTM model")
        # Original logic: either bidirectional or unidirectional LSTM
        if bidirectional:
            model.add(Bidirectional(LSTM(64, return_sequences=False)))
        else:
            model.add(LSTM(64, return_sequences=False))
    
    # Common output layers
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear', dtype='float32'))
    
    return model

def train_lstm_model(X_seq, y_seq, ts_params, config, use_tcn=False, bidirectional=False):
    """
    Processes input data, builds/trains/saves a model with dynamic sequence length handling.
    NEW updates:
      - Reduced learning rate to 1e-5.
      - Added gradient clipping (clipnorm=1.0).
      - Reduced batch size from 64 to 32.
      - Enabled mixed precision for dynamic loss scaling.
      - Adaptive loss function selection based on target standard deviation.
      - Additional regression metrics (MAE, RMSE, R², MAPE) during model compilation.
      - NEW: Optionally uses a TCN-LSTM hybrid architecture based on 'use_tcn' flag.
      - NEW: Supports bidirectional LSTM via the 'bidirectional' flag.
    
    Parameters:
        X_seq (np.ndarray): 3D input sequences.
        y_seq (np.ndarray): Corresponding target values.
        ts_params (dict): Time series parameters.
        config (dict): Configuration dictionary.
        use_tcn (bool): Whether to use TCN layers before LSTM.
        bidirectional (bool): Whether to use a bidirectional LSTM (or after TCN).
        
    Architecture Selection Guidelines:
    ----------------------------------
    1. Pure LSTM (use_tcn=False):
       - Suitable for real-time applications.
       - Unidirectional: 64 units; Bidirectional: 64 units wrapped bidirectionally.
    2. TCN-LSTM Hybrid (use_tcn=True):
       - Suitable for offline analysis with multi-scale temporal context.
       - Uses three TCN layers with dilated convolutions.
       - Followed by an LSTM layer (bidirectional uses 32 units per direction).
    """
    # Enable mixed precision
    from tensorflow.keras.mixed_precision import set_global_policy
    set_global_policy('mixed_float16')
    print("[INFO] Mixed precision enabled with dynamic loss scaling.")
    
    # Debug prints for input data shapes
    print(f"Type of X_seq: {type(X_seq)}")
    print(f"Shape of X_seq: {X_seq.shape}")
    print(f"Type of y_seq: {type(y_seq)}")
    print(f"Shape of y_seq: {y_seq.shape}")
    
    num_features = X_seq.shape[2]
    for feat in range(num_features):
        unique_count = np.unique(X_seq[0, :, feat]).size
        print(f"[DEBUG] Feature {feat} unique values count: {unique_count}")
    
    # Validate input shape
    assert len(X_seq.shape) == 3, f"Expected 3D input for LSTM, got {X_seq.shape}"
    
    num_sequences, detected_seq_length, num_features = X_seq.shape
    if ts_params.get("ts_sequence_mode", "").lower() == "set_window":
        if 'window_size' not in ts_params or ts_params["window_size"] != detected_seq_length:
            original_window_size = ts_params.get("window_size", "undefined")
            ts_params["window_size"] = detected_seq_length
            print(f"Auto-updated window_size: {original_window_size} → {detected_seq_length}")
        
    sequence_lengths = [seq.shape[0] for seq in X_seq]
    unique_lengths = set(sequence_lengths)
    if len(unique_lengths) > 1:
        raise ValueError(f"Mixed sequence lengths detected: {unique_lengths}")
    else:
        print(f"All sequences have {detected_seq_length} frames")
        ts_params["validated_seq_length"] = detected_seq_length
    
    # Scale the input sequences using StandardScaler
    scaler = StandardScaler()
    X_seq_reshaped = X_seq.reshape(-1, num_features)
    X_seq_scaled = scaler.fit_transform(X_seq_reshaped)
    X_seq = X_seq_scaled.reshape(num_sequences, detected_seq_length, num_features)
    
    # Adaptive Loss Function Selection based on target variability
    label_std = np.std(y_seq)
    loss_fn = 'mse'
    if label_std > 1e4:
        loss_fn = 'mae'
        print(f"[INFO] High target standard deviation ({label_std:.2f}) detected; using MAE loss instead of MSE.")
    else:
        print(f"[INFO] Target standard deviation ({label_std:.2f}) within acceptable range; using MSE loss.")
    
    # Build the model using the hybrid builder
    input_shape = (detected_seq_length, num_features)
    model = build_hybrid_model(input_shape, use_tcn=use_tcn, bidirectional=bidirectional)
    
    # Configure optimizer with gradient clipping
    optimizer = Adam(learning_rate=1e-5, clipnorm=1.0)
    # Compile with additional regression metrics
    model.compile(
        optimizer=optimizer,
        loss=loss_fn,
        metrics=[
            tf.keras.metrics.MeanAbsoluteError(name='mae'),
            tf.keras.metrics.RootMeanSquaredError(name='rmse'),
            tfa.metrics.RSquare(name='r2'),
            tf.keras.metrics.MeanAbsolutePercentageError(name='mape')
        ]
    )
    print("[INFO] Model compiled with learning rate=1e-5, gradient clipping (clipnorm=1.0), and additional metrics (MAE, RMSE, R², MAPE).")
    
    early_stop = EarlyStopping(monitor='loss', patience=5)
    history = model.fit(
        X_seq,
        y_seq,
        epochs=5,
        batch_size=32,
        callbacks=[early_stop],
        verbose=2
    )
    
    # Evaluate the model on the training data
    evaluation = model.evaluate(X_seq, y_seq, verbose=0)
    print(f"[INFO] Training evaluation metrics: {evaluation}")
    
    model_path = os.path.join(config["paths"]["model_save_base_dir"], "lstm_model.h5")
    model.save(model_path)
    print(f"Model saved to {model_path}")
    
    return model

# ====================================================
# Configuration Update Example
# ====================================================
# Extend your configuration with architecture options
config["model_architecture"] = {
    "use_tcn": True,  # Set to True to enable TCN-LSTM hybrid architecture
    "bidirectional": False,  # Toggle bidirectionality as needed
    "unidirectional_units": 64,
    "bidirectional_units_per_direction": 32,
    "usage_guidance": {
        "pure_lstm": "Real-time applications, causal predictions",
        "tcn_lstm": "Offline analysis with multi-scale temporal context"
    }
}


# 2. Set up time series parameters from the configuration, including the new outlier handling option.
ts_params = config["time_series"]
# ====================================================
# Training Interface Update Example
# ====================================================
# Update the training call with architectural toggles from config
model = train_lstm_model(X_seq, y_seq, ts_params, config, 
                           use_tcn=config["model_architecture"]["use_tcn"],
                           bidirectional=config["model_architecture"]["bidirectional"])

# # ====================================================
# # Validation & Compatibility Testing Example
# # ====================================================
# # Optionally, test both architectures and compare parameter counts
# for use_tcn_flag in [False, True]:
#     for bidir in [False, True]:
#         print(f"\n{'='*40}\nTesting mode: use_tcn={use_tcn_flag}, bidirectional={bidir}\n{'='*40}")
#         test_model = train_lstm_model(X_seq, y_seq, ts_params, config, use_tcn=use_tcn_flag, bidirectional=bidir)
#         print(f"Parameter count: {test_model.count_params():,}")
#         del test_model  # Clear memory between tests




ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (c:\Users\GeoffreyHadfield\Anaconda3\envs\data_science_ml_preprocessor\lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
import sklearn
print(sklearn.__version__)
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC, SMOTEN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
print("All imblearn modules imported successfully!")



Notes:
dtw is doing best overall but we should add a report to show it vs pad vs set_window

tcn is good with bidirectional not with unidirectional, uni is good by itself. Add this to the report as well. 


In [23]:
# ... existing imports ...
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import json
from datetime import datetime
import os
import tensorflow as tf

def run_sequence_mode_experiment(data, config, sequence_mode, model_architectures):
    """
    Run experiment for a specific sequence mode with different model architectures.
    
    Args:
        data (pd.DataFrame): Input data.
        config (dict): Configuration dictionary.
        sequence_mode (str): One of ["set_window", "dtw", "pad", "variable_length"].
        model_architectures (list): List of dicts containing model architecture settings.
    
    Returns:
        dict: A dictionary of experiment results.
    """
    results = {}
    
    # Update ts_params with time-series configuration for the experiment.
    # For each mode, we want to:
    #   - Always include the time_column.
    #   - For set_window mode: include window_size and step_size.
    #   - For DTW/pad modes: include max_sequence_length.
    #   - For all modes, we set horizon and the time_series_sequence_mode.
    ts_params = {
        "enabled": True,
        "time_column": "ongoing_timestamp_biomech",
        "window_size": 500,          # Only used for set_window mode.
        "horizon": 10,
        "step_size": 1,              # Only used for set_window mode.
        "max_sequence_length": 500,  # Used in pad mode.
        "time_series_sequence_mode": sequence_mode,
        # Optionally, you can also add a pad_threshold here if needed, e.g.:
        # "pad_threshold": 0.5,
        "handle_outliers": {
            "time_series_method": "none",  # Use "none" to disable custom time-series outlier handling.
        }
    }
    
    print(f"\n{'='*80}")
    print(f"Testing sequence mode: {sequence_mode}")
    print(f"{'='*80}")
    
    try:
        # Create preprocessor with current sequence mode and the appropriate parameters.
        preprocessor = DataPreprocessor(
            model_type="LSTM",  # Assume an LSTM-based model for time series.
            y_variable=config["features"]["y_variable"],
            ordinal_categoricals=config["features"]["ordinal_categoricals"],
            nominal_categoricals=config["features"]["nominal_categoricals"],
            numericals=config["features"]["numericals"],
            mode="train",
            options=ts_params,
            debug=True,
            graphs_output_dir=config["paths"]["plots_output_dir"],
            transformers_dir=config["paths"]["transformers_save_base_dir"],
            time_column=ts_params.get("time_column"),
            window_size=ts_params.get("window_size"), # set_window argument
            horizon=ts_params.get("horizon"),
            step_size=ts_params.get("step_size"), # set_window argument
            max_sequence_length=ts_params.get("max_sequence_length"), # set_window argument
            sequence_categorical=["session_biomech", "trial_biomech"],
            sequence_dtw_or_pad_categorical=["pitch_phase_biomech"],
            time_series_sequence_mode=sequence_mode
        )
        
        # Preprocess the data using the final_preprocessing method.
        # This returns:
        #   X_seq: The processed time-series sequences (as a NumPy array).
        #   y_seq: The aligned target sequences.
        #   recommendations: Preprocessing recommendations.
        X_seq, _, y_seq, _, recommendations, _ = preprocessor.final_preprocessing(data)
        
        # Test each provided model architecture.
        for arch in model_architectures:
            arch_name = f"{'TCN-' if arch['use_tcn'] else ''}{'Bi' if arch['bidirectional'] else ''}LSTM"
            print(f"\nTesting architecture: {arch_name}")
            
            try:
                # Train the model (train_lstm_model should be defined elsewhere)
                model = train_lstm_model(
                    X_seq, 
                    y_seq, 
                    ts_params, 
                    config, 
                    use_tcn=arch['use_tcn'],
                    bidirectional=arch['bidirectional']
                )
                
                # Evaluate the model using the same data (for demonstration).
                eval_metrics = model.evaluate(X_seq, y_seq, verbose=0)
                metric_names = ['loss', 'mae', 'rmse', 'r2', 'mape']
                
                # Store results in the dictionary.
                results[arch_name] = {
                    'metrics': dict(zip(metric_names, eval_metrics)),
                    'sequence_shape': X_seq.shape,
                    'architecture': arch
                }
                
                # Clean up model and clear session.
                del model
                tf.keras.backend.clear_session()
                
            except Exception as e:
                print(f"Error with architecture {arch_name}: {str(e)}")
                results[arch_name] = {'error': str(e)}
        
    except Exception as e:
        print(f"Error with sequence mode {sequence_mode}: {str(e)}")
        results['preprocessing_error'] = str(e)
    
    return results

def save_experiment_results(results, config):
    """Save experiment results to a JSON file."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"experiment_results_{timestamp}.json"
    filepath = os.path.join(config["paths"]["training_output_dir"], filename)
    
    def convert_to_serializable(obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return obj
    
    serializable_results = json.loads(json.dumps(results, default=convert_to_serializable))
    
    with open(filepath, 'w') as f:
        json.dump(serializable_results, f, indent=2)
    
    print(f"Results saved to {filepath}")

def print_experiment_summary(all_results):
    """Print a summary of all experiment results."""
    print("\n" + "="*100)
    print("EXPERIMENT SUMMARY")
    print("="*100)
    
    for sequence_mode, results in all_results.items():
        print(f"\nSequence Mode: {sequence_mode}")
        print("-" * 80)
        
        if 'preprocessing_error' in results:
            print(f"ERROR: {results['preprocessing_error']}")
            continue
            
        for arch_name, arch_results in results.items():
            if 'error' in arch_results:
                print(f"{arch_name}: ERROR - {arch_results['error']}")
                continue
                
            metrics = arch_results['metrics']
            print(f"\n{arch_name}:")
            print(f"  Sequence Shape: {arch_results['sequence_shape']}")
            print(f"  MAE: {metrics['mae']:.4f}")
            print(f"  RMSE: {metrics['rmse']:.4f}")
            print(f"  R²: {metrics['r2']:.4f}")
            print(f"  MAPE: {metrics['mape']:.4f}")

# Define model architectures to test
model_architectures = [
    # {'use_tcn': False, 'bidirectional': False},  # LSTM
    # {'use_tcn': False, 'bidirectional': True},   # BiLSTM
    # {'use_tcn': True, 'bidirectional': False},   # TCN-LSTM
    {'use_tcn': True, 'bidirectional': True}     # TCN-BiLSTM
]

# Sequence modes to test
sequence_modes = ["set_window", "dtw", "pad", "variable_length"]

# Run experiments for each sequence mode and collect results.
all_results = {}
for mode in sequence_modes:
    all_results[mode] = run_sequence_mode_experiment(data, config, mode, model_architectures)

# Print summary of all experiments.
print_experiment_summary(all_results)



Testing sequence mode: set_window
Error with sequence mode set_window: DataPreprocessor.__init__() got an unexpected keyword argument 'time_column'

Testing sequence mode: dtw
Error with sequence mode dtw: DataPreprocessor.__init__() got an unexpected keyword argument 'time_column'

Testing sequence mode: pad
Error with sequence mode pad: DataPreprocessor.__init__() got an unexpected keyword argument 'time_column'

Testing sequence mode: variable_length
Error with sequence mode variable_length: DataPreprocessor.__init__() got an unexpected keyword argument 'time_column'

EXPERIMENT SUMMARY

Sequence Mode: set_window
--------------------------------------------------------------------------------
ERROR: DataPreprocessor.__init__() got an unexpected keyword argument 'time_column'

Sequence Mode: dtw
--------------------------------------------------------------------------------
ERROR: DataPreprocessor.__init__() got an unexpected keyword argument 'time_column'

Sequence Mode: pad
-----