In [None]:
# %%writefile ml/feature_selection/data_loader_post_select_features.py

import logging
import pandas as pd
import pickle
from typing import Optional, Dict


def load_feature_names_for_base_data(filepath: str):
    """
    Read a pickled collection of feature names from disk.

    Args:
        filepath (str): Location of the pickle file to read.

    Returns:
        The object stored in the pickle file, returned unchanged.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at
    # files produced by a trusted step of the pipeline.
    with open(filepath, mode='rb') as pickle_handle:
        return pickle.load(pickle_handle)


def load_base_data_for_dataset(filepath: str):
    """
    Read the base dataset from a CSV file.

    Args:
        filepath (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed file contents, using pandas' default
        CSV parsing options.
    """
    base_dataset = pd.read_csv(filepath)
    return base_dataset


def filter_base_data_for_select_features(dataset: pd.DataFrame, feature_names: list, debug: bool = False):
    """
    Filter the dataset to include only the specified feature names.

    Columns are returned in the dataset's own column order, so the result
    is identical from run to run. Requested names that the dataset does not
    contain are skipped and reported through a warning.

    Args:
        dataset (pd.DataFrame): The full dataset.
        feature_names (list): Names of the columns to keep.
        debug (bool): When True, print the head of the filtered dataset.

    Returns:
        pd.DataFrame or None: The filtered dataset, or None when
        `feature_names` is None or empty.
    """
    if feature_names is None or len(feature_names) == 0:
        print("No valid selected features found.")
        return None

    requested = set(feature_names)
    # A positional boolean mask keeps the dataset's column order. Building the
    # column list from a set intersection made the order depend on string
    # hashing, which differs between interpreter runs.
    keep_mask = [column in requested for column in dataset.columns]
    filtered_dataset = dataset.loc[:, keep_mask]

    available = set(dataset.columns)
    missing_features = [name for name in feature_names if name not in available]
    if missing_features:
        logging.warning(
            "Selected features not present in the dataset were skipped: %s",
            missing_features,
        )

    if debug:
        print("Loaded and filtered dataset based on selected features:")
        print(filtered_dataset.head())
    return filtered_dataset


def load_selected_features_data(
    features_path: str,
    dataset_path: str,
    y_variable: str,
    debug: bool = False
) -> pd.DataFrame:
    """
    Load the dataset and keep only the previously selected feature columns.

    Args:
        features_path (str): Path to the file containing feature names.
        dataset_path (str): Path to the main dataset file.
        y_variable (str): The target variable name. It is only checked for
            presence in the filtered result; a warning is logged when it is
            missing.
        debug (bool): Flag to enable detailed debugging information.

    Returns:
        pd.DataFrame: The dataset restricted to the selected features.

    Raises:
        ValueError: If the filtered DataFrame is empty or could not be built.
        Exception: Any error raised while loading or filtering is logged and
            re-raised unchanged.
    """
    log_level = logging.DEBUG if debug else logging.INFO
    # basicConfig only takes effect while the root logger has no handlers, so
    # the level is also set explicitly; otherwise `debug` would be ignored on
    # every call after logging was first configured.
    logging.basicConfig(level=log_level)
    logging.getLogger().setLevel(log_level)

    try:
        # Load the list of selected feature names
        logging.info("Loading selected features...")
        selected_features = load_feature_names_for_base_data(features_path)
        logging.debug("Number of Features Selected = %s", len(selected_features))
        logging.debug("Features Selected = %s", selected_features)

        # Load the dataset
        logging.info("Loading dataset...")
        final_ml_df = load_base_data_for_dataset(dataset_path)

        # Filter the DataFrame using the loaded list of selected feature names
        logging.info("Filtering dataset for selected features...")
        final_ml_df_selected_features = filter_base_data_for_select_features(
            final_ml_df,
            selected_features,
            debug=debug
        )

        if final_ml_df_selected_features is None or final_ml_df_selected_features.empty:
            raise ValueError("Filtered DataFrame is empty or invalid.")

        # Surface a missing target early without changing the return contract.
        if y_variable not in final_ml_df_selected_features.columns:
            logging.warning(
                "Target variable '%s' is not among the selected columns.",
                y_variable,
            )

        logging.info("Data processing complete. Returning processed DataFrame.")
        return final_ml_df_selected_features

    except Exception as e:
        # logging.exception records the traceback alongside the message.
        logging.exception("An error occurred during data processing: %s", e)
        raise


if __name__ == "__main__":
    # Example usage: load the selected-feature view of the processed dataset.
    features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
    dataset_path = '../../data/processed/final_ml_dataset.csv'

    final_ml_df_selected_features = load_selected_features_data(
        features_path=features_path,
        dataset_path=dataset_path,
        y_variable='result',
        debug=True,
    )


In [None]:
# %%writefile ml/classification_preprocessor/smote_automation.py


import pandas as pd
import numpy as np
import logging
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from imblearn.over_sampling import BorderlineSMOTE, ADASYN, SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter

# Configure the root logger at import time: DEBUG level, timestamped
# message format, and a single stream handler. basicConfig does nothing if
# the root logger already has handlers when this module is imported.
logging.basicConfig(
    level=logging.DEBUG,  # Ensure DEBUG level is enabled
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)

# Module-specific logger used for the debug output of the functions below.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # Explicitly set module-specific logger to DEBUG


def check_dataset_for_smote(
    X_train, y_train, debug=False,
    imbalance_threshold=0.2, noise_threshold=0.5,
    overlap_threshold=0.3, boundary_threshold=0.4,
    extreme_imbalance_threshold=0.05
):
    """
    Analyzes a dataset to recommend the best SMOTE variant.

    Only the most frequent and least frequent classes are compared. Each
    analysis step is best-effort: when a step fails, the error is logged and
    its flag is reported as False.

    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target labels.
        debug (bool): Whether to log debug information.
        imbalance_threshold (float): Threshold for severe imbalance.
        noise_threshold (float): Threshold for noise detection.
        overlap_threshold (float): Threshold for class overlap detection.
        boundary_threshold (float): Threshold for boundary concentration detection.
        extreme_imbalance_threshold (float): Threshold for extreme imbalance.

    Returns:
        dict: "recommendations" (list of variant names without duplicates,
        in priority order) and "details" (the four analysis flags).

    Raises:
        TypeError: If X_train is not a DataFrame or y_train is not a Series.
    """
    if not isinstance(X_train, pd.DataFrame) or not isinstance(y_train, pd.Series):
        raise TypeError("X_train must be a DataFrame and y_train must be a Series.")

    # Step 1: Class Distribution
    class_distribution = y_train.value_counts(normalize=True)
    majority_class = class_distribution.idxmax()
    minority_class = class_distribution.idxmin()
    if minority_class == majority_class and len(class_distribution) > 1:
        # All classes are equally frequent, so idxmin and idxmax both return
        # the first label. Pick a different class so the analysis below does
        # not compare a class with itself.
        minority_class = class_distribution.index[-1]

    minority_share = class_distribution[minority_class]
    severe_imbalance = minority_share < imbalance_threshold
    extreme_imbalance = minority_share < extreme_imbalance_threshold

    if debug:
        logger.debug(f"X_train Shape: {X_train.shape}")
        logger.debug(f"Class Distribution: {class_distribution.to_dict()}")
        if extreme_imbalance:
            logging.warning(f"Extreme imbalance detected: {minority_share:.2%}")

    # Step 2: Noise Analysis
    minority_samples = X_train[y_train == minority_class]
    majority_samples = X_train[y_train == majority_class]

    # `distances` stays None when this step fails, so Step 4 can tell
    # explicitly that it has nothing to work with.
    distances = None
    noisy_data = False
    try:
        # Never ask for more neighbours than there are majority samples.
        n_neighbors = min(5, len(majority_samples))
        knn = NearestNeighbors(n_neighbors=n_neighbors).fit(majority_samples)
        distances, _ = knn.kneighbors(minority_samples)
        median_distance = np.median(distances)
        # NOTE(review): the share of values strictly below their own median
        # cannot exceed 0.5, so with the default noise_threshold of 0.5 this
        # flag is never set. The metric probably needs a redesign; confirm
        # the intended definition of "noise".
        noise_ratio = np.mean(distances < median_distance)
        noisy_data = noise_ratio > noise_threshold

        if debug:
            logger.debug(f"Median Distance to Nearest Neighbors: {median_distance}")
            logger.debug(f"Noise Ratio: {noise_ratio:.2%}")
    except ValueError as e:
        logging.error(f"Noise analysis error: {e}")
        noisy_data = False

    # Step 3: Overlap Analysis
    overlapping_classes = False
    try:
        pdistances = pairwise_distances(minority_samples, majority_samples)
        # Share of minority/majority pairs closer than 1.0 in feature space.
        overlap_metric = np.mean(pdistances < 1.0)
        overlapping_classes = overlap_metric > overlap_threshold

        if debug:
            logger.debug(f"Overlap Metric: {overlap_metric:.2%}")
    except ValueError as e:
        logging.error(f"Overlap analysis error: {e}")
        overlapping_classes = False

    # Step 4: Boundary Concentration
    boundary_concentration = False
    if distances is None:
        logging.error(
            "Boundary concentration error: nearest-neighbour distances are unavailable."
        )
    else:
        try:
            # Share of minority samples whose nearest majority neighbour is
            # closer than the 25th percentile of all neighbour distances.
            boundary_ratio = np.mean(np.min(distances, axis=1) < np.percentile(distances, 25))
            boundary_concentration = boundary_ratio > boundary_threshold

            if debug:
                logger.debug(f"Boundary Concentration Ratio: {boundary_ratio:.2%}")
        except Exception as e:
            logging.error(f"Boundary concentration error: {e}")
            boundary_concentration = False

    # Step 5: Recommendations (order matters: callers may take the first one)
    candidates = []
    if severe_imbalance:
        candidates.append("ADASYN" if not noisy_data else "SMOTEENN")
    if noisy_data:
        candidates.append("SMOTEENN")
    if overlapping_classes:
        candidates.append("SMOTETomek")
    if boundary_concentration:
        candidates.append("BorderlineSMOTE")
    if not candidates:
        candidates.append("SMOTE")
    # Drop duplicates while keeping priority order ("SMOTEENN" could be added
    # twice when the data is both severely imbalanced and noisy).
    recommendations = list(dict.fromkeys(candidates))

    if debug:
        logger.debug("SMOTE Analysis Complete.")
        logger.debug(f"Recommendations: {recommendations}")

    return {
        "recommendations": recommendations,
        "details": {
            "severe_imbalance": severe_imbalance,
            "noisy_data": noisy_data,
            "overlapping_classes": overlapping_classes,
            "boundary_concentration": boundary_concentration
        }
    }

def apply_smote(X_train, y_train, recommendations, debug=False, smote_params=None):
    """
    Applies the recommended SMOTE variant to the dataset.

    Only the selected sampler is instantiated, so `smote_params` needs to be
    valid for that variant alone (for example `k_neighbors` for SMOTE).

    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target labels.
        recommendations (list or str): Recommended SMOTE variants or a single SMOTE variant.
            When a list holds several names, the first one is used.
        debug (bool): Whether to log debug information.
        smote_params (dict): Keyword arguments for the selected sampler.
            Defaults to {"random_state": 42}.

    Returns:
        pd.DataFrame, pd.Series: Resampled features and target labels.
        str: The applied SMOTE technique (None, with the inputs returned
        unchanged, when `recommendations` is an empty list).

    Raises:
        ValueError: If `recommendations` is neither a list nor a string.
        KeyError: If the requested variant is not supported.
    """
    if smote_params is None:
        smote_params = {"random_state": 42}

    # Determine SMOTE technique
    if isinstance(recommendations, list):
        if not recommendations:
            logging.warning("Empty SMOTE recommendations. Skipping SMOTE.")
            return X_train, y_train, None
        if len(recommendations) > 1:
            logging.info("Multiple SMOTE variants recommended. Choosing the first.")
        smote_technique = recommendations[0]
    elif isinstance(recommendations, str):
        smote_technique = recommendations
    else:
        logging.error("Recommendations must be a list or string.")
        raise ValueError("Recommendations must be a list or string.")

    # Supported variants, mapped to their classes. Instantiation is deferred
    # until one is chosen: building every sampler up front with the same
    # kwargs fails as soon as a parameter is not accepted by all of them.
    smote_variants = {
        "SMOTE": SMOTE,
        "ADASYN": ADASYN,
        "BorderlineSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETomek": SMOTETomek
    }

    logger.debug(f"SMOTE Technique Requested: {smote_technique}")
    logger.debug(f"Available SMOTE Variants: {list(smote_variants.keys())}")

    # Ensure the technique exists
    if smote_technique not in smote_variants:
        logging.error(f"SMOTE variant '{smote_technique}' is not recognized. Available variants: {list(smote_variants.keys())}")
        raise KeyError(f"SMOTE variant '{smote_technique}' is not recognized.")

    smote_instance = smote_variants[smote_technique](**smote_params)
    X_resampled, y_resampled = smote_instance.fit_resample(X_train, y_train)

    if debug:
        logger.debug(f"Applied SMOTE Technique: {smote_technique}")
        logger.debug(f"Original X_train Shape: {X_train.shape}")
        logger.debug(f"Resampled X_train Shape: {X_resampled.shape}")
        logger.debug(f"Original Class Distribution: {Counter(y_train)}")
        logger.debug(f"Resampled Class Distribution: {Counter(y_resampled)}")

    return X_resampled, y_resampled, smote_technique


if __name__ == "__main__":
    # Example run: load the selected features, split, scale, analyse the
    # training split and resample it with a SMOTE variant.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer, OneHotEncoder, OrdinalEncoder, LabelEncoder
    from ml.feature_selection.data_loader_post_select_features import load_selected_features_data

    y_variable = "result"
    debug = False

    final_ml_df_selected_features = load_selected_features_data(
        features_path='../../data/model/pipeline/final_ml_df_selected_features_columns.pkl',
        dataset_path='../../data/processed/final_ml_dataset.csv',
        y_variable=y_variable,
        debug=debug
    )

    print("\n[Initial Dataset Info]")
    print(f"Columns to work with: {final_ml_df_selected_features.columns.tolist()}")

    # Step 1: Split dataset into features (X) and target (y)
    X = final_ml_df_selected_features.drop(columns=[y_variable])
    y = final_ml_df_selected_features[y_variable]

    # Step 2: Train-test split (stratified on the target)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Step 3: Scale. Scalers are fitted on the training split only and then
    # applied to the test split.
    scaler_standard = StandardScaler()
    scaler_minmax = MinMaxScaler()

    # Features requiring StandardScaler
    standard_features = [
        'release_ball_velocity_z', 'knee_release_angle', 'wrist_release_angle',
        'knee_max_angle', 'release_ball_direction_z', 'wrist_max_angle'
    ]

    # Features requiring MinMaxScaler
    minmax_features = [
        'elbow_max_angle', 'elbow_release_angle', 'release_ball_direction_y',
        'release_ball_speed', 'release_ball_direction_x', 'release_ball_velocity_x',
        'release_ball_velocity_y', 'calculated_release_angle'
    ]

    # Apply StandardScaler
    X_train_standard = scaler_standard.fit_transform(X_train[standard_features])
    X_test_standard = scaler_standard.transform(X_test[standard_features])

    # Apply MinMaxScaler
    X_train_minmax = scaler_minmax.fit_transform(X_train[minmax_features])
    X_test_minmax = scaler_minmax.transform(X_test[minmax_features])

    # Combine scaled features, keeping the original row index so the frames
    # stay aligned with y_train / y_test.
    X_train_scaled = pd.DataFrame(
        data=np.hstack((X_train_standard, X_train_minmax)),
        columns=standard_features + minmax_features,
        index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        data=np.hstack((X_test_standard, X_test_minmax)),
        columns=standard_features + minmax_features,
        index=X_test.index
    )

    # Step 4: SMOTE is applied to the TRAINING split only.

    # Analyze dataset for SMOTE
    smote_analysis = check_dataset_for_smote(X_train_scaled, y_train, debug=True)
    print("SMOTE Analysis Recommendations:", smote_analysis["recommendations"])

    # Apply SMOTE to the same scaled frame that was analysed above. The
    # samplers are neighbour-based, so resampling the unscaled X_train would
    # let the features with the largest ranges dominate the distances.
    X_resampled, y_resampled, smote_used = apply_smote(
        X_train_scaled,
        y_train,
        "SMOTEENN",  # Can also select individual: ADASYN, SMOTEENN, SMOTETomek, BorderlineSMOTE, and SMOTE
        debug=True
    )
    print("Applied SMOTE Variant:", smote_used)
    print("Resampled Class Distribution:", Counter(y_resampled))

    logging.info(f"SMOTE Technique Used: {smote_used}")


In [None]:

# %%writefile ../../src/freethrow_predictions/ml/classification_preprocessor/preprocessor_encoding_filtered_datasets.py
# Filter into different Preprocessing/Encoding Datasets for Automated Preprocessing

from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
from ml.classification_preprocessor.smote_automation import check_dataset_for_smote, apply_smote
import pandas as pd

def _features_with_suggestion(info_df, suggestion_column, keyword):
    """Return the 'Feature' values whose `suggestion_column` text contains `keyword`."""
    if info_df.empty:
        # An empty analysis frame may not even carry the expected columns.
        return []
    matches = info_df[suggestion_column].str.contains(keyword, na=False, regex=False)
    return info_df.loc[matches, 'Feature'].tolist()


def filter_features(numerical_info_df, categorical_info_df, dataset, y_variable, debug=False):
    """
    Filters features based on the analysis and returns lists for each processing type, excluding the target variable.

    The target is removed from both analysis frames before grouping, so a
    target that also appears in `numerical_info_df` does not end up in a
    feature group.

    Args:
        numerical_info_df (pd.DataFrame): Numerical analysis with 'Feature'
            and 'Preprocessing Suggestion' columns.
        categorical_info_df (pd.DataFrame): Categorical analysis with
            'Feature' and 'Encoding Suggestion' columns.
        dataset (pd.DataFrame): Dataset that contains `y_variable`.
        y_variable (str): Name of the target column.
        debug (bool): When True, print the intermediate feature sets.

    Returns:
        tuple: Seven lists, in this order: StandardScaler, MinMaxScaler,
        KBinsDiscretizer, Dimensionality Reduction, OneHotEncoder and
        LabelEncoder features, followed by the dataset features that matched
        no suggestion (in dataset column order).

    Raises:
        TypeError: If any of the three frames is not a pandas DataFrame.
        ValueError: If `y_variable` is not a dataset column, or if a grouped
            feature is not a column of the dataset.
    """
    # Type checks for input DataFrames
    if not isinstance(numerical_info_df, pd.DataFrame):
        raise TypeError("numerical_info_df must be a pandas DataFrame")
    if not isinstance(categorical_info_df, pd.DataFrame):
        raise TypeError("categorical_info_df must be a pandas DataFrame")
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("dataset must be a pandas DataFrame")
    if y_variable not in dataset.columns:
        raise ValueError(f"Target variable '{y_variable}' not found in the dataset.")

    # Exclude the target variable from the dataset columns
    dataset_features = set(dataset.columns) - {y_variable}

    if debug:
        print(f"[Debug] Dataset Features: {dataset_features}")

    # Drop the target from BOTH analysis frames. Only the categorical frame
    # used to be filtered, so a numeric target with a suggestion leaked into
    # the numerical groups and tripped the validation below.
    if not numerical_info_df.empty:
        numerical_info_df = numerical_info_df[numerical_info_df['Feature'] != y_variable]
    if not categorical_info_df.empty:
        categorical_info_df = categorical_info_df[categorical_info_df['Feature'] != y_variable]

    # Filter numerical features
    numerical_scaler_features = _features_with_suggestion(
        numerical_info_df, 'Preprocessing Suggestion', 'StandardScaler')
    numerical_minmax_features = _features_with_suggestion(
        numerical_info_df, 'Preprocessing Suggestion', 'MinMaxScaler')
    numerical_kbins_features = _features_with_suggestion(
        numerical_info_df, 'Preprocessing Suggestion', 'KBinsDiscretizer')
    numerical_dimred_features = _features_with_suggestion(
        numerical_info_df, 'Preprocessing Suggestion', 'Dimensionality Reduction')

    # Filter categorical features
    categorical_onehot_features = _features_with_suggestion(
        categorical_info_df, 'Encoding Suggestion', 'OneHotEncoder')
    categorical_labelencode_features = _features_with_suggestion(
        categorical_info_df, 'Encoding Suggestion', 'LabelEncoder')

    # Debug: Identify missing preprocessing
    all_preprocessed_features = (
        numerical_scaler_features +
        numerical_minmax_features +
        numerical_kbins_features +
        numerical_dimred_features +
        categorical_onehot_features +
        categorical_labelencode_features
    )
    numerical_feature_names = (
        set(numerical_info_df['Feature']) if not numerical_info_df.empty else set()
    )
    missing_preprocessing = numerical_feature_names - set(all_preprocessed_features)
    if debug and missing_preprocessing:
        print(f"[Debug] Features missing preprocessing suggestions: {missing_preprocessing}")

    # Identify all processed features
    processed_features = set(all_preprocessed_features)

    # Identify unprocessed features
    unprocessed_features = dataset_features - processed_features

    if debug:
        print(f"[Debug] Processed Features: {processed_features}")
        print(f"[Debug] Unprocessed Features: {unprocessed_features}")

    # Validate feature split: the union only outgrows the dataset features
    # when a grouped feature is not a dataset column.
    if len(processed_features | unprocessed_features) != len(dataset_features):
        raise ValueError("Filtered features do not match dataset columns!")

    # Report unprocessed features in dataset column order so the result does
    # not depend on set iteration order.
    ordered_unprocessed = [
        column for column in dataset.columns.unique() if column in unprocessed_features
    ]

    return (
        numerical_scaler_features,
        numerical_minmax_features,
        numerical_kbins_features,
        numerical_dimred_features,
        categorical_onehot_features,
        categorical_labelencode_features,
        ordered_unprocessed
    )


def check_on_processed_datasets(
    dataset,
    numerical_scaler_features,
    numerical_minmax_features,
    numerical_kbins_features,
    numerical_dimred_features,
    categorical_onehot_features,
    categorical_labelencode_features,
    unprocessed_features,
    debug=False
):
    """
    Slice `dataset` into one sub-frame per feature group and verify that the
    group sizes add up to the dataset's column count.

    Args:
        dataset: DataFrame holding the feature columns. The caller is
            expected to have dropped the target column already.
        numerical_scaler_features: Columns for the StandardScaler group.
        numerical_minmax_features: Columns for the MinMaxScaler group.
        numerical_kbins_features: Columns for the KBinsDiscretizer group.
        numerical_dimred_features: Columns for the dimensionality-reduction group.
        categorical_onehot_features: Columns for the OneHotEncoder group.
        categorical_labelencode_features: Columns for the LabelEncoder group.
        unprocessed_features: Columns that receive no preprocessing.
        debug: When True, print the shape of every sub-frame and the totals.

    Returns:
        dict: Sub-frames keyed by group name.

    Raises:
        ValueError: If the summed group sizes differ from the number of
            dataset columns.
    """
    # (result key, label used in the debug summary, column names)
    feature_groups = (
        ('numerical_scaler_data', 'Numerical Scaler', numerical_scaler_features),
        ('numerical_minmax_data', 'Numerical MinMax', numerical_minmax_features),
        ('numerical_kbins_data', 'Numerical KBins', numerical_kbins_features),
        ('numerical_dimred_data', 'Numerical DimRed', numerical_dimred_features),
        ('onehot_data', 'OneHot', categorical_onehot_features),
        ('labelencode_data', 'LabelEncode', categorical_labelencode_features),
        ('unprocessed_data', 'Unprocessed', unprocessed_features),
    )

    # Selecting the columns first surfaces unknown column names before the
    # count validation runs.
    subsets = {key: dataset[columns] for key, _, columns in feature_groups}

    total_features_count = sum(len(columns) for _, _, columns in feature_groups)
    dataset_column_count = dataset.shape[1]

    if debug:
        print("[Debug] Processed Data Summary:")
        for key, label, _ in feature_groups:
            print(f"  {label} Data Shape: {subsets[key].shape}")
        print(f"  Total Processed Features: {total_features_count}")
        print(f"  Original Dataset Features (Excluding Target): {dataset_column_count}")

    if total_features_count != dataset_column_count:
        raise ValueError(
            f"Feature count mismatch! Processed: {total_features_count}, Original: {dataset_column_count}"
        )

    return subsets

if __name__ == "__main__":
    # Example run: load the selected-feature dataset, analyse its categorical
    # and numerical columns, group the features by suggested preprocessing
    # and validate the resulting split.

    # NOTE(review): filter_features_by_type, analyze_categorical_features and
    # analyze_numerical_features_enhanced_v2 are called below, but no import
    # for them is visible here (the import that follows is commented out).
    # They presumably live in preprocessor_recommendations - confirm and
    # import them, otherwise this script raises NameError when run standalone.
    # from preprocessor_recommendations import filter_features_by_type, analyze_categorical_features, analyze_numerical_features_enhanced_v2, 
    # from preprocessor_encoding_filtered_datasets import filter_features, check_on_processed_datasets
    debug = True
    # Example parameter tuning
    # NOTE(review): only zscore_threshold and tukey_threshold are used in the
    # visible code; the remaining values below are assigned but never read.
    zscore_threshold = 3
    tukey_threshold = 1.5
    max_rows_shapiro = 5000
    min_rows_normality_percentage = 0.05
    high_outlier_percentage = 5
    correlation_threshold = 0.8  # Threshold for multicollinearity check

    # File paths
    features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
    dataset_path = "../../data/processed/final_ml_dataset.csv"
    
    # Load the dataset restricted to the selected feature columns
    final_ml_df_selected_features = load_selected_features_data(
        features_path=features_path,
        dataset_path=dataset_path,
        y_variable='result',
        debug=True
    )
    

    # Step 1: Split the columns into categorical and numerical frames
    categorical_df, numerical_df = filter_features_by_type(final_ml_df_selected_features, debug=debug)

    # Step 2: Analyze categorical features
    categorical_info_df = analyze_categorical_features(
        categorical_df,
        low_cardinality_threshold=10,
        high_cardinality_threshold=50,
        missing_threshold=0.3,
        debug=False
    )

    # Step 3: Analyze numerical features and handle outliers automatically
    numerical_info_df = analyze_numerical_features_enhanced_v2(
        numerical_df,
        y_feature=None,
        zscore_threshold=zscore_threshold,
        tukey_threshold=tukey_threshold,
        missing_threshold=0.5,
        high_cardinality_threshold=1000,
        debug=False
    )

    # Display results
    print("\nCategorical Features Analysis:")
    print(categorical_info_df.to_string(index=False))

    print("\nNumerical Features Analysis:")
    print(numerical_info_df.to_string(index=False))


    # The analysis frames and the loaded dataset feed the grouping step below
    y_variable = 'result'
    print("columns to work with =", final_ml_df_selected_features.columns)
    print("categorical_info_df columns to work with =", categorical_info_df['Feature'])
    print("numerical_info_df columns to work with =", numerical_info_df['Feature'])

    # Step 4: Group features by suggested preprocessing (target excluded)
    (
        numerical_scaler_features,
        numerical_minmax_features,  # Added MinMaxScaler
        numerical_kbins_features,
        numerical_dimred_features,
        onehot_features,
        labelencode_features,
        unprocessed_features
    ) = filter_features(
        numerical_info_df, categorical_info_df, final_ml_df_selected_features, y_variable, debug=debug
    )

    # Debugging outputs
    print(f"[Debug] Numerical Scaler Features: {numerical_scaler_features}")
    print(f"[Debug] Numerical MinMax Features: {numerical_minmax_features}")
    print(f"[Debug] Unprocessed Features: {unprocessed_features}")

    # Step 5: Slice the dataset per group; the target column is dropped first
    # so the column count matches the summed group sizes
    checks_on_processed_datasets = check_on_processed_datasets(
        final_ml_df_selected_features.drop(columns=[y_variable]),  # Exclude target
        numerical_scaler_features,
        numerical_minmax_features,  # Added MinMaxScaler
        numerical_kbins_features,
        numerical_dimred_features,
        onehot_features,
        labelencode_features,
        unprocessed_features,  # Ensure this argument is passed
        debug=debug
    )

    # Display results
    print("\n[Final Validation] Processed Data Shapes:")
    for key, df in checks_on_processed_datasets.items():
        print(f"  {key}: {df.shape}")

In [None]:
# %%writefile ../../src/freethrow_predictions/ml/classification_preprocessor/datapreprocessor_class.py

import pandas as pd
import pickle
import joblib
from sklearn.model_selection import train_test_split
# from data_loader_post_select_features import load_feature_names, load_dataset, filter_base_data_for_select_features
# from preprocessor_recommendations import filter_features_by_type, analyze_categorical_features, analyze_numerical_features_enhanced_v2, 
# from preprocessor_encoding_filtered_datasets import filter_features, check_on_processed_datasets
    # from preprocessor import (
    #     create_numerical_scaler_pipeline, 
    #     create_numerical_minmax_pipeline, 
    #     create_numerical_kbins_pipeline, 
    #     create_categorical_onehot_pipeline, 
    #     create_categorical_labelencode_pipeline, 
    #     create_numerical_dimred_pipeline, 
    #     process_feature_groups, 
    #     preprocess_target, 
    #     combine_transformed_data_with_index,
    #      save_pipeline_and_assets
    # )
import logging

# Configure logging at import time: root logger at INFO, plus a
# module-level logger for this file. basicConfig does nothing if the root
# logger already has handlers.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
from ml.classification_preprocessor.smote_automation import check_dataset_for_smote, apply_smote

class DataPreprocessor:
    """
    Orchestrates preprocessing of the selected-features dataset: grouping features by
    transformation type, optional train/test splitting, optional SMOTE resampling of the
    training split, feature transformation, target encoding and asset persistence.

    Attributes:
        features_path (str): Path to the selected features pickle file.
        dataset_path (str): Path to the processed dataset CSV.
        assets_path (str): Path the preprocessing assets pickle is written to.
        y_variable (str): The target variable name.
        optimization_columns (list): Columns to consider for optimization ranges.
        optimization_ranges (dict or None): Min/max per optimization column in the
            untransformed data; populated by ``extract_optimization_ranges``.
        debug (bool): If True, enables detailed logging and validation outputs.
        preprocessor_train_test_split (bool): Whether to perform a train-test split.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Random state for reproducibility.
        stratify (bool): Whether to stratify the split based on the target variable.
        specific_smote (str or None): SMOTE variant to apply ('SMOTE', 'ADASYN', 'recommended', or None).
            Only consulted when ``preprocessor_train_test_split`` is True.
        original_index (pd.Index): The original index of the dataset before preprocessing.
    """

    def __init__(
        self,
        features_path: str,
        dataset_path: str,
        assets_path: str,
        y_variable: str = 'result',
        optimization_columns: Optional[list] = None,
        debug: bool = True,
        preprocessor_train_test_split: bool = False,
        test_size: float = 0.2,
        random_state: int = 42,
        stratify: bool = True,
        specific_smote: Optional[str] = 'recommended',
    ) -> None:
        """Store the configuration; no file is read until ``load_data`` or ``run`` is called."""
        # Input / output locations
        self.features_path = features_path
        self.dataset_path = dataset_path
        self.assets_path = assets_path
        self.y_variable = y_variable
        self.debug = debug

        # Parameters for train-test split
        self.preprocessor_train_test_split = preprocessor_train_test_split
        self.test_size = test_size
        self.random_state = random_state
        self.stratify = stratify

        # SMOTE handling parameter
        self.specific_smote = specific_smote

        # Optimization-specific parameters; copy so later edits to the caller's list
        # do not silently change this instance.
        self.optimization_columns = list(optimization_columns) if optimization_columns else []
        self.optimization_ranges = None  # Filled in by extract_optimization_ranges()

        # Thresholds forwarded to the feature analysis functions
        self.zscore_threshold = 3
        self.tukey_threshold = 1.5
        self.high_cardinality_threshold = 1000
        self.missing_threshold = 0.5
        self.low_cardinality_threshold = 10
        self.high_cardinality_threshold_categorical = 50

        self.original_index = None  # Set by load_data()

    def _load_original_dataset(self) -> pd.DataFrame:
        """
        Read the CSV at ``dataset_path`` and keep it on ``self.original_data``.

        Returns:
            pd.DataFrame: The dataset exactly as read from disk.

        Raises:
            Exception: Whatever ``pd.read_csv`` raises; it is logged and re-raised.
        """
        try:
            original_df = pd.read_csv(self.dataset_path)
        except Exception as e:
            logger.error(f"Error loading original dataset: {e}")
            raise
        self.original_data = original_df
        if self.debug:
            logger.debug(f"Original dataset loaded with index:\n{original_df.index}")
        return original_df

    def load_data(self):
        """
        Load the selected-features dataset and the unfiltered original dataset.

        Sets ``filtered_data``, ``original_df``, ``original_data`` and ``original_index``.
        """
        self.filtered_data = load_selected_features_data(
            features_path=self.features_path,
            dataset_path=self.dataset_path,
            y_variable=self.y_variable,
            debug=self.debug,
        )
        self.original_df = self._load_original_dataset()
        self.original_index = self.original_df.index
        if self.debug:
            logger.debug(f"Original index in DataPreprocessor:\n{self.original_index}")
        print("Selected features loaded and data processing complete.")

    def preprocess_data(self, data):
        """
        Analyse ``data`` and assign every feature to a transformation group.

        Args:
            data (pd.DataFrame): Dataset to analyse.

        Returns:
            tuple: Seven feature collections, in this order: scaler, min-max, k-bins,
            dimensionality-reduction, one-hot, label-encode and unprocessed features.
        """
        # Step 1: Filter features by type
        categorical_df, numerical_df = filter_features_by_type(data, debug=self.debug)

        # Step 2: Analyze categorical features
        categorical_info = analyze_categorical_features(
            categorical_df,
            low_cardinality_threshold=self.low_cardinality_threshold,
            high_cardinality_threshold=self.high_cardinality_threshold_categorical,
            missing_threshold=self.missing_threshold,
            debug=self.debug,
        )

        # Step 3: Analyze numerical features
        numerical_info = analyze_numerical_features_enhanced_v2(
            numerical_df,
            y_feature=None,
            zscore_threshold=self.zscore_threshold,
            tukey_threshold=self.tukey_threshold,
            missing_threshold=self.missing_threshold,
            high_cardinality_threshold=self.high_cardinality_threshold,
            debug=self.debug,
        )

        # Step 4: Filter features for processing. Unpacking into seven names keeps the
        # arity check: a helper returning a different number of groups fails here.
        (
            numerical_scaler_features,
            numerical_minmax_features,
            numerical_kbins_features,
            numerical_dimred_features,
            onehot_features,
            labelencode_features,
            unprocessed_features,
        ) = filter_features(
            numerical_info, categorical_info, data, y_variable=self.y_variable, debug=self.debug
        )

        return (
            numerical_scaler_features,
            numerical_minmax_features,
            numerical_kbins_features,
            numerical_dimred_features,
            onehot_features,
            labelencode_features,
            unprocessed_features,
        )

    def _combine_transformed_data(self, transformed_data_dict, original_index):
        """
        Combine transformed feature arrays into a single DataFrame with prefixed column names.

        Columns are named ``<group>_<i>`` and appear in the iteration order of
        ``transformed_data_dict``.

        Args:
            transformed_data_dict (dict): Dictionary of transformed feature arrays.
                A ``None`` value is treated as a group with zero columns.
            original_index (pd.Index): Original index to align the combined DataFrame.

        Returns:
            pd.DataFrame: Unified DataFrame with all transformed features. When there are
            no groups at all, an empty (zero-column) frame carrying ``original_index``.
        """
        group_frames = []
        for group_name, array in transformed_data_dict.items():
            if array is None:
                logger.warning(f"[Combine Transformed Data] Transformed data for group '{group_name}' is None. Filling with empty array.")
                array = np.zeros((len(original_index), 0))  # Fallback to an empty array
            column_names = [f"{group_name}_{i}" for i in range(array.shape[1])]
            group_frames.append(pd.DataFrame(array, columns=column_names))

        if group_frames:
            combined_df = pd.concat(group_frames, axis=1)
            combined_df.index = original_index
        else:
            # pd.concat raises on an empty sequence; return a zero-column frame instead.
            combined_df = pd.DataFrame(index=original_index)

        if self.debug:
            logger.debug("\n[Combine Transformed Data]")
            logger.debug(f"Combined Shape: {combined_df.shape}")
            logger.debug(f"Combined Columns: {list(combined_df.columns)[:10]}...")  # Show first 10 columns

        return combined_df

    def process_feature_groups(self, data, use_existing_pipelines=False):
        """
        Transform ``data`` group by group and rebuild the positional feature maps.

        Args:
            data (pd.DataFrame): Features to transform.
            use_existing_pipelines (bool): When True, reuse ``self.fitted_pipelines``
                instead of fitting; when False, fit and store new pipelines and
                feature indices on the instance.

        Returns:
            tuple: ``(transformed_data, transformed_data_df)`` - the per-group arrays
            and the same data combined into one DataFrame indexed like ``data``.
        """
        group_args = (
            data,
            self.numerical_scaler_features,
            self.numerical_minmax_features,
            self.numerical_kbins_features,
            self.numerical_dimred_features,
            self.onehot_features,
            self.labelencode_features,
            self.unprocessed_features,
        )
        # The bare name below is the module-level function, not this method.
        if use_existing_pipelines:
            transformed_data, _, _ = process_feature_groups(
                *group_args,
                fitted_pipelines=self.fitted_pipelines,
                debug=self.debug,
            )
        else:
            transformed_data, self.fitted_pipelines, self.feature_indices = process_feature_groups(
                *group_args,
                debug=self.debug,
            )

        # Generate flattened feature indices: running position -> feature name
        self.flattened_feature_indices = {}
        start_idx = 0
        for group_name, array in transformed_data.items():
            if array is None or array.shape[1] == 0:
                logger.warning(f"[Process Feature Groups] Transformed data for group '{group_name}' is empty or None. Skipping.")
                continue
            if group_name not in self.feature_indices or self.feature_indices[group_name] is None:
                logger.warning(f"[Process Feature Groups] Feature indices for group '{group_name}' are missing. Skipping.")
                continue
            num_features = array.shape[1]
            feature_names = self.feature_indices[group_name]['feature_names']
            if len(feature_names) < num_features:
                # Fewer recorded names than output columns: fall back to the positional
                # '<group>_<i>' name rather than failing with an IndexError.
                logger.warning(
                    f"[Process Feature Groups] Group '{group_name}' has {num_features} columns "
                    f"but only {len(feature_names)} feature names. Using positional names for the rest."
                )
            for i in range(num_features):
                if i < len(feature_names):
                    feature_name = feature_names[i]
                else:
                    feature_name = f"{group_name}_{i}"
                self.flattened_feature_indices[start_idx] = feature_name
                start_idx += 1

        # Create reverse mapping from feature name to index
        self.feature_name_to_index = {
            feature_name: index for index, feature_name in self.flattened_feature_indices.items()
        }

        transformed_data_df = self._combine_transformed_data(transformed_data, data.index)
        return transformed_data, transformed_data_df

    def finalize_data(self, transformed_data, original_data):
        """
        Finalizes transformed data by combining it with the index of ``original_data``.

        Args:
            transformed_data (dict): Per-group transformed arrays.
            original_data (pd.DataFrame): Frame whose index the result should carry.

        Returns:
            The value returned by ``combine_transformed_data_with_index``.
        """
        return combine_transformed_data_with_index(
            transformed_data, self.feature_indices, original_data.index, debug=self.debug
        )

    def preprocess_target(self, y_train, y_test=None):
        """
        Encode the target through the module-level ``preprocess_target`` function.

        Returns:
            tuple: ``(y_train_encoded, y_test_encoded, label_encoder)``.
        """
        y_train_encoded, y_test_encoded, label_encoder = preprocess_target(
            y_train, y_test, debug=self.debug
        )
        return y_train_encoded, y_test_encoded, label_encoder

    def save_assets(self):
        """
        Pickle the fitted pipelines, feature maps and label encoder to ``assets_path``.

        Raises:
            RuntimeError: If any asset is missing because preprocessing has not run yet.
        """
        asset_names = (
            'fitted_pipelines',
            'feature_indices',
            'flattened_feature_indices',
            'feature_name_to_index',
            'label_encoder',
        )
        # Check every attribute that gets pickled, not just the first two, so an
        # incomplete run fails with one clear error instead of an AttributeError.
        missing = [name for name in asset_names if not hasattr(self, name)]
        if missing:
            raise RuntimeError(
                "Assets are not available for saving. Ensure preprocessing is complete before saving."
                f" Missing: {missing}"
            )
        assets = {name: getattr(self, name) for name in asset_names}
        with open(self.assets_path, 'wb') as f:
            pickle.dump(assets, f)
        logger.debug(f"Pipeline and assets saved to {self.assets_path}")

    def extract_transformed_optimization_ranges(self):
        """
        Extract min and max ranges for the optimization columns from the transformed dataset.

        Columns are located positionally in ``self.X_transformed`` through
        ``self.feature_name_to_index``; unknown columns are skipped with a warning.

        Returns:
            dict: ``{column: (min, max)}``.

        Raises:
            ValueError: If no optimization columns were configured.
        """
        if not self.optimization_columns:
            raise ValueError("No optimization columns specified.")

        self.optimization_transformed_ranges = {}
        for col in self.optimization_columns:
            if col not in self.feature_name_to_index:
                logger.warning(f"Optimization column '{col}' not found in feature indices.")
                continue
            col_index = self.feature_name_to_index[col]
            transformed_values = self.X_transformed.iloc[:, col_index]

            # An empty column has no meaningful range.
            if transformed_values.empty:
                logger.warning(f"No values found for column '{col}'.")
                continue

            self.optimization_transformed_ranges[col] = (transformed_values.min(), transformed_values.max())

        return self.optimization_transformed_ranges

    def _preprocess_and_transform(self, X_train, X_test=None):
        """
        Assign feature groups, fit the pipelines on ``X_train`` and transform the data.

        With ``X_test`` the fitted pipelines are also applied to the test split
        (``X_test_transformed``). Without it the training data doubles as the full
        dataset and ``transformed_data``, ``transformed_df`` and ``X_transformed`` are set.
        """
        # Feature groups are derived from the full filtered dataset.
        (
            self.numerical_scaler_features,
            self.numerical_minmax_features,
            self.numerical_kbins_features,
            self.numerical_dimred_features,
            self.onehot_features,
            self.labelencode_features,
            self.unprocessed_features,
        ) = self.preprocess_data(self.filtered_data)

        # Fit on and finalize the training data
        self.transformed_train, self.transformed_train_df = self.process_feature_groups(X_train)
        self.X_train_transformed = self.finalize_data(self.transformed_train, X_train)

        if self.debug:
            logger.debug(f"X_train_transformed index:\n{self.X_train_transformed.index}")
            logger.debug(f"Original X_train index:\n{X_train.index}")

        if X_test is not None:
            # Apply the already fitted pipelines to the test split
            self.transformed_test, self.transformed_test_df = self.process_feature_groups(
                X_test, use_existing_pipelines=True
            )
            self.X_test_transformed = self.finalize_data(self.transformed_test, X_test)
            if self.debug:
                logger.debug(f"X_test_transformed index:\n{self.X_test_transformed.index}")
                logger.debug(f"Original X_test index:\n{X_test.index}")
        else:
            # No split: expose the full-dataset outputs under their own names.
            # NOTE(review): this re-transforms X_train with the fitted pipelines, while
            # X_transformed is finalized from the fit-time output; both describe the same rows.
            self.transformed_data, self.transformed_df = self.process_feature_groups(
                X_train, use_existing_pipelines=True
            )
            self.X_transformed = self.finalize_data(self.transformed_train, X_train)
            if self.debug:
                logger.debug(f"X_transformed index:\n{self.X_transformed.index}")
                logger.debug(f"Original X index:\n{X_train.index}")

    def _preprocess_target(self, y_train, y_test=None):
        """Encode the target(s) and store the encoded values and encoder on the instance."""
        self.y_train_encoded, self.y_test_encoded, self.label_encoder = self.preprocess_target(
            y_train, y_test
        )
        if not self.preprocessor_train_test_split:
            # In the no-split case, assign y_encoded for consistency
            self.y_encoded = self.y_train_encoded

    def extract_optimization_ranges(self):
        """
        Extract min and max ranges for the optimization columns from ``self.filtered_data``.

        Columns missing from the dataset are skipped with a warning.

        Returns:
            dict: ``{column: (min, max)}``.

        Raises:
            ValueError: If no optimization columns were configured.
        """
        if not self.optimization_columns:
            raise ValueError("No optimization columns specified.")

        self.optimization_ranges = {}
        for col in self.optimization_columns:
            if col not in self.filtered_data.columns:
                logger.warning(f"Optimization column '{col}' not found in the dataset.")
                continue
            values = self.filtered_data[col]

            # An empty column has no meaningful range.
            if values.empty:
                logger.warning(f"No values found for column '{col}'.")
                continue

            self.optimization_ranges[col] = (values.min(), values.max())

        return self.optimization_ranges

    def transform_new_data(self, input_df):
        """
        Transform new optimization data using ``self.pipeline``.

        Nothing in this class assigns ``self.pipeline``; the caller must attach a fitted
        transformer to the instance before using this method.

        Args:
            input_df (pd.DataFrame): DataFrame with new data to transform.

        Returns:
            The result of ``self.pipeline.transform`` on the optimization columns.

        Raises:
            ValueError: If ``input_df`` lacks any of the optimization columns.
            AttributeError: If no ``pipeline`` attribute has been set on the instance.
        """
        # Ensure optimization columns are present
        missing_features = set(self.optimization_columns) - set(input_df.columns)
        if missing_features:
            raise ValueError(f"Input data is missing optimization columns: {missing_features}")

        pipeline = getattr(self, "pipeline", None)
        if pipeline is None:
            # Same exception type as the bare attribute access, but with an actionable message.
            raise AttributeError(
                "DataPreprocessor has no fitted 'pipeline' attribute; "
                "assign one before calling transform_new_data()."
            )

        # Reorder columns to match the configured optimization columns
        input_df = input_df[self.optimization_columns]

        return pipeline.transform(input_df)

    def _resample_training_data(self, X_train, y_train):
        """
        Apply SMOTE to the training split according to ``specific_smote``.

        Returns:
            tuple: ``(X_resampled, y_resampled, smote_used)``; ``smote_used`` is None
            when the training data is returned unchanged.
        """
        smote_analysis = check_dataset_for_smote(X_train, y_train, debug=self.debug)
        # Read the key once with .get so logging and the branch below agree on a missing key.
        recommendations = smote_analysis.get('recommendations', None)
        logger.info(f"SMOTE Analysis Recommendations: {recommendations}")

        if self.specific_smote is None:
            logger.info("SMOTE not applied.")
            return X_train, y_train, None

        if self.specific_smote.lower() == 'recommended':
            if recommendations is None:
                logger.warning("No SMOTE recommendations available. Skipping SMOTE.")
                return X_train, y_train, None
            smote_variant = recommendations
        else:
            # Use the specified SMOTE variant
            smote_variant = self.specific_smote

        X_resampled, y_resampled, smote_used = apply_smote(
            X_train, y_train, smote_variant, debug=self.debug
        )
        logger.info(f"Applied SMOTE Variant: {smote_used}")
        return X_resampled, y_resampled, smote_used

    def _run_with_split(self, X, y):
        """Split, resample the training part, preprocess both parts and save the assets."""
        stratify = y if self.stratify else None
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=stratify,
        )
        if self.debug:
            logger.debug("Index after train-test split:")
            logger.debug(f"X_train index:\n{X_train.index}")
            logger.debug(f"X_test index:\n{X_test.index}")
            logger.debug(f"y_train index:\n{y_train.index}")
            logger.debug(f"y_test index:\n{y_test.index}")

        # SMOTE automation on TRAINING ONLY
        X_train_resampled, y_train_resampled, smote_used = self._resample_training_data(
            X_train, y_train
        )
        if smote_used:
            logger.info(f"Resampled Class Distribution: {Counter(y_train_resampled)}")
        else:
            logger.info(f"Original Class Distribution: {Counter(y_train_resampled)}")

        # Proceed with preprocessing using resampled data
        self._preprocess_and_transform(X_train_resampled, X_test)
        self._preprocess_target(y_train_resampled, y_test)

        # Validate index consistency
        assert self.X_train_transformed.index.equals(X_train_resampled.index), "Index mismatch in X_train_transformed"
        assert self.X_test_transformed.index.equals(X_test.index), "Index mismatch in X_test_transformed"

        if self.debug:
            logger.debug("Index consistency verified after preprocessing with train-test split.")

        self.save_assets()
        return (
            self.X_train_transformed,
            self.X_test_transformed,
            self.y_train_encoded,
            self.y_test_encoded,
            self.transformed_train,
            self.transformed_train_df,
            X_train_resampled,
            y_train_resampled,
        )

    def _run_without_split(self, X, y, return_optimization_ranges):
        """Preprocess the full dataset, save the assets and optionally append the ranges."""
        self._preprocess_and_transform(X)
        self._preprocess_target(y)

        # Validate index consistency
        assert self.X_transformed.index.equals(X.index), "Index mismatch in X_transformed"

        if self.debug:
            logger.debug("Index consistency verified after preprocessing without train-test split.")

        self.save_assets()

        outputs = (
            self.X_transformed,
            self.y_encoded,
            self.transformed_data,
            self.transformed_df,
            self.original_data,
            X,
            y,
        )
        if return_optimization_ranges and self.optimization_columns:
            self.extract_optimization_ranges()
            self.extract_transformed_optimization_ranges()
            outputs += (self.optimization_ranges, self.optimization_transformed_ranges)
        return outputs

    def run(self, return_optimization_ranges=False):
        """
        Main entry point for preprocessing.

        Args:
            return_optimization_ranges (bool): Whether to include optimization ranges in
                the output. Only honoured without a train-test split and when
                optimization columns were configured.

        Returns:
            tuple or None:
                * With a train-test split: ``(X_train_transformed, X_test_transformed,
                  y_train_encoded, y_test_encoded, transformed_train, transformed_train_df,
                  X_train_resampled, y_train_resampled)``.
                * Without a split: ``(X_transformed, y_encoded, transformed_data,
                  transformed_df, original_data, X, y)``, followed by
                  ``(optimization_ranges, optimization_transformed_ranges)`` when requested.
                * ``None`` when the data loader produced no dataset.

        Raises:
            ValueError: If the target variable is not a column of the loaded dataset.
        """
        self.load_data()
        if self.filtered_data is None:
            # Keep returning None for existing callers, but say why.
            logger.error("No filtered dataset was loaded; preprocessing skipped.")
            return None

        # Split features and target
        y_variable = self.y_variable
        if y_variable not in self.filtered_data.columns:
            raise ValueError(f"Target variable '{y_variable}' not found in dataset.")

        X = self.filtered_data.drop(columns=[y_variable])
        y = self.filtered_data[y_variable]

        if self.preprocessor_train_test_split:
            return self._run_with_split(X, y)
        return self._run_without_split(X, y, return_optimization_ranges)



# Demo driver: runs DataPreprocessor three times (with split, without split, and
# without split but returning optimization ranges) and prints the resulting shapes.
if __name__ == "__main__":
    # File paths (relative to the directory the script or notebook is run from)
    features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
    dataset_path = "../../data/processed/final_ml_dataset.csv"
    # NOTE: all three examples below share this path, and run() saves its assets there,
    # so each run overwrites the pickle written by the previous one.
    assets_path = '../../data/model/pipeline/preprocessing_assets.pkl'

    # Example 1: With train-test split
    dp_split = DataPreprocessor(
        features_path=features_path,
        dataset_path=dataset_path,
        assets_path=assets_path,
        y_variable='result',
        preprocessor_train_test_split=True,
        test_size=0.3,
        random_state=123,
        stratify=True,
        specific_smote='SMOTEENN',  # Explicit variant; passed to apply_smote as-is
    )
    # run() returns an 8-tuple when a train-test split is requested.
    (
        X_train_transformed,
        X_test_transformed,
        y_train_encoded,
        y_test_encoded,
        transformed_data_train,
        transformed_train_df,  # Per-group arrays combined into one DataFrame
        X_train,  # Training features after the SMOTE step (untransformed)
        y_train,  # Training target after the SMOTE step (unencoded)
    ) = dp_split.run()


    print(f"X_train_transformed shape: {X_train_transformed.shape}")
    print(f"X_test_transformed shape: {X_test_transformed.shape}")
    print(f"Transformed Data Train: {transformed_data_train.keys()}")
    print(f"Original X_train shape: {X_train.shape}")
    print(f"Original y_train shape: {y_train.shape}")

    # Example 2: Without train-test split
    dp_no_split = DataPreprocessor(
        features_path=features_path,
        dataset_path=dataset_path,
        assets_path=assets_path,
        y_variable='result',
        preprocessor_train_test_split=False,
        specific_smote='recommended',  # Not consulted by run() when no split is performed
    )
    # Without a split (and without optimization ranges) run() returns a 7-tuple.
    X_transformed, y_encoded, transformed_data, transformed_data_df, original_data, X, y = dp_no_split.run()

    print(f"X_transformed shape: {X_transformed.shape}")
    print(f"Transformed Data: {transformed_data.keys()}")
    print(f"Transformed Data: {transformed_data_df.columns}")
    print(f"Original X shape: {X.shape}")
    print(f"Original y shape: {y.shape}")

    #-------------Preprocessing with Optimization Ranges------------
    dp = DataPreprocessor(
        features_path=features_path,
        dataset_path=dataset_path,
        assets_path=assets_path,
        y_variable="result",
        optimization_columns=["knee_max_angle", "wrist_max_angle", "elbow_max_angle"], # Any feature columns may be listed; per the author, all X columns are used for the Bayesian optimization test
        preprocessor_train_test_split=False,
        specific_smote='recommended',  # Not consulted by run() when no split is performed
    )

    # With return_optimization_ranges=True and optimization columns set, run() appends
    # the raw and the transformed (min, max) ranges, giving a 9-tuple.
    results = dp.run(return_optimization_ranges=True)
    X_transformed, y_encoded, transformed_data, transformed_data_df, original_data, X, y, optimization_ranges, optimization_transformed_ranges = results

    print(f"Optimization Ranges: {optimization_ranges}")
    print(f"optimization_transformed_ranges: {optimization_transformed_ranges}")
    print(f"original_data shape: {original_data.shape}")
    print(f"X shape: {X.shape}")
    print(f"X_transformed shape: {X_transformed.shape}")

In [None]:
# %%writefile ../../src/freethrow_predictions/ml/classification_processors/inverse_preprocessor_class.py

from typing import Dict, Optional, Any, List
import pandas as pd
import numpy as np
import logging
from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
# Import necessary inverse preprocessor functions
# from inverse_preprocessor_functions import (load_pipeline_and_assets, 
#                                            inverse_transform_feature_groups,
#                                            combine_inverse_transformed_data,
#                                            prepare_final_dataset_with_target)

# Configure logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

class InversePreprocessor:
    """
    Reverse the transformations applied during preprocessing so the reconstructed
    dataset can be compared with, or interpreted in, its original form.

    Attributes:
        pipelines (Optional[Dict[str, Any]]): Fitted pipelines keyed by feature-group name.
        feature_indices (Optional[Dict[str, Any]]): Per-group metadata; every entry is
            read through its "feature_names" key.
        flattened_feature_indices (Optional[Dict[int, str]]): Stored and logged only;
            no method of this class reads it after construction.
        label_encoder (Optional[Any]): Object exposing ``inverse_transform``; used to
            decode the target. Decoding is skipped when it is falsy.
        assets_path (Optional[str]): File the assets are loaded from when they are
            not all passed in explicitly.
        debug (bool): Enables verbose debug logging.

    Methods:
        transform(original_data, transformed_data, y_encoded):
            Performs the inverse transformation on the transformed data.
            - Validates that the index remains consistent throughout inverse preprocessing.
            - Logs detailed debug information if debug=True.
        append_columns_from_original(final_dataset, original_data, columns_to_append, debug=False):
            Appends specified columns from original_data to final_dataset after validating index consistency.
        inverse_transform_features(transformed_data):
            Inverse-transforms every feature group.
        inverse_transform_optimization_params(params, optimization_columns):
            Maps individual scaled columns back to their original scale.
        decode_target(y_encoded):
            Decodes the encoded target with the label encoder, if one is available.
        combine_features_and_targets(original_data, inverse_transformed_data, y_encoded):
            Joins the inverse-transformed groups and attaches the target.
    """

    def __init__(self,
                 pipelines: Optional[Dict[str, Any]] = None,
                 feature_indices: Optional[Dict[str, Any]] = None,
                 flattened_feature_indices: Optional[Dict[int, str]] = None,
                 label_encoder: Optional[Any] = None,
                 assets_path: Optional[str] = None,
                 debug: bool = False):
        """
        Store the preprocessing assets, loading them from ``assets_path`` if needed.

        If any of ``pipelines``, ``feature_indices`` or ``flattened_feature_indices``
        is missing/empty, ALL four assets (including ``label_encoder``) are replaced
        by the ones loaded from ``assets_path``.

        Raises:
            ValueError: If assets are missing and no ``assets_path`` was given.
        """
        self.pipelines = pipelines
        self.feature_indices = feature_indices
        self.flattened_feature_indices = flattened_feature_indices
        self.label_encoder = label_encoder
        self.assets_path = assets_path
        self.debug = debug

        assets_missing = (
            not self.pipelines
            or not self.feature_indices
            or not self.flattened_feature_indices
        )
        if assets_missing:
            if not self.assets_path:
                raise ValueError("Pipelines and feature indices are not provided and no assets_path is specified.")
            if self.debug:
                logger.debug("[InversePreprocessor] Loading pipelines and assets from: %s", self.assets_path)
            (
                self.pipelines,
                self.feature_indices,
                self.flattened_feature_indices,
                self.label_encoder,
            ) = load_pipeline_and_assets(self.assets_path)

        if self.debug:
            logger.debug("[InversePreprocessor] Initialized InversePreprocessor with assets_path: %s", assets_path)
            logger.debug("[InversePreprocessor] Loaded Pipelines: %s", list(self.pipelines.keys()))
            logger.debug("[InversePreprocessor] Feature Indices: %s", self.feature_indices)
            logger.debug("[InversePreprocessor] Flattened Feature Indices: %s", self.flattened_feature_indices)
            logger.debug("[InversePreprocessor] Label Encoder: %s", self.label_encoder)

    def append_columns_from_original(self,
                                     final_dataset: pd.DataFrame,
                                     original_data: pd.DataFrame,
                                     columns_to_append: List[str],
                                     debug: bool = False) -> pd.DataFrame:
        """
        Appends specified columns from original_data to final_dataset after ensuring index alignment.

        Requested columns are skipped (with a warning) when the name is not a
        non-blank string, when the column does not exist in ``original_data``,
        or when it is already present in ``final_dataset`` / was already
        requested, because concatenating it would create duplicate column labels.

        Args:
            final_dataset (pd.DataFrame): The dataset to which columns will be appended.
            original_data (pd.DataFrame): The original dataset containing the columns to append.
            columns_to_append (List[str]): List of column names to append from original_data.
            debug (bool, optional): If True, provides detailed debug information. Defaults to False.
                This flag is independent of the instance-level ``debug`` attribute.

        Returns:
            pd.DataFrame: The updated final_dataset with appended columns. If no
            requested column is valid, the (possibly reindexed) final_dataset is
            returned without additions.
        """
        if debug:
            logger.debug("[append_columns_from_original] Starting index consistency check.")

        # Step 1: Index Consistency Check
        if not original_data.index.equals(final_dataset.index):
            logger.warning("[append_columns_from_original] Index mismatch between original_data and final_dataset. Reindexing to align.")
            final_dataset = final_dataset.reindex(original_data.index)
            logger.debug("[append_columns_from_original] Reindexing completed.")

        # Step 2: Column Selection Validation
        if debug:
            logger.debug("[append_columns_from_original] Columns requested to append: %s", columns_to_append)

        valid_columns: List[str] = []
        for col in columns_to_append:
            if not isinstance(col, str) or not col.strip():
                logger.warning("[append_columns_from_original] Invalid column name detected: '%s'. Skipping.", col)
                continue
            if col not in original_data.columns:
                logger.warning("[append_columns_from_original] Column '%s' not found in original_data. Skipping.", col)
                continue
            if col in final_dataset.columns or col in valid_columns:
                # pd.concat(axis=1) would otherwise yield two columns with the same label.
                logger.warning(
                    "[append_columns_from_original] Column '%s' is already present in final_dataset "
                    "or was requested more than once. Skipping to avoid duplicate column labels.",
                    col,
                )
                continue
            valid_columns.append(col)

        if not valid_columns:
            logger.error("[append_columns_from_original] No valid columns to append. Returning the final_dataset unchanged.")
            return final_dataset

        if debug:
            logger.debug("[append_columns_from_original] Valid columns to append: %s", valid_columns)

        # Step 3: Append Columns
        columns_to_append_df = original_data[valid_columns]
        if debug:
            logger.debug("[append_columns_from_original] Shape of columns to append: %s", columns_to_append_df.shape)
            logger.debug("[append_columns_from_original] Columns before concatenation: %s", final_dataset.columns.tolist())

        final_dataset_updated = pd.concat([final_dataset, columns_to_append_df], axis=1)

        # Step 4: Output the Final Dataset
        if debug:
            logger.debug("[append_columns_from_original] Shape after concatenation: %s", final_dataset_updated.shape)
            logger.debug("[append_columns_from_original] Columns after concatenation: %s", final_dataset_updated.columns.tolist())
            logger.debug("[append_columns_from_original] Sample of appended data:\n%s", columns_to_append_df.head())
            logger.debug("[append_columns_from_original] Successfully appended columns from original_data.")
        else:
            logger.info("[append_columns_from_original] Columns appended successfully.")

        return final_dataset_updated

    def inverse_transform_features(self, transformed_data: Dict[str, np.ndarray]) -> Dict[str, pd.DataFrame]:
        """
        Inverse-transform every feature group via ``inverse_transform_feature_groups``.

        Args:
            transformed_data (Dict[str, np.ndarray]): Transformed values keyed by feature-group name.

        Returns:
            Dict[str, pd.DataFrame]: One DataFrame per feature group.

        Raises:
            RuntimeError: If the underlying inverse transformation fails (original
                exception attached as ``__cause__``).
        """
        if self.debug:
            logger.debug("[InversePreprocessor] Performing inverse transformation on feature groups.")
        try:
            inverse_transformed_data = inverse_transform_feature_groups(
                transformed_data,
                self.pipelines,
                self.feature_indices,
                debug=self.debug
            )
            if self.debug:
                for group, data in inverse_transformed_data.items():
                    logger.debug("[InversePreprocessor] Inverse transformed group '%s' with shape %s", group, data.shape)
                    logger.debug("[InversePreprocessor] Inverse transformed group '%s' index:\n%s", group, data.index)
            return inverse_transformed_data
        except Exception as e:
            logger.error(f"Error during feature inverse transformation: {e}")
            raise RuntimeError(f"Error during feature inverse transformation: {e}") from e

    def inverse_transform_optimization_params(self, params: pd.DataFrame, optimization_columns: list) -> pd.DataFrame:
        """
        Inverse transform optimization parameters to the original scale.

        Only the LAST step of the owning pipeline is inverted, and only when it is
        a StandardScaler or a MinMaxScaler.

        Args:
            params (pd.DataFrame): Transformed parameter values as a DataFrame.
            optimization_columns (list): List of columns to inverse-transform.

        Returns:
            pd.DataFrame: Inverse-transformed parameter values as a DataFrame.

        Raises:
            RuntimeError: On any failure, e.g. an unknown column, a feature group
                without a fitted pipeline, or an unsupported transformer (original
                exception attached as ``__cause__``).
        """
        # Function-scope import: this module does not import the scaler classes
        # at file level, so the isinstance checks below would otherwise depend on
        # another notebook cell having imported them.
        from sklearn.preprocessing import MinMaxScaler, StandardScaler

        if self.debug:
            logger.debug("[InversePreprocessor] Inverse-transforming optimization parameters for columns: %s", optimization_columns)

        try:
            # Map each feature to (group name, pipeline or None, position within the group).
            # Groups without a fitted pipeline are tolerated here and only rejected
            # if one of the requested columns actually belongs to them.
            feature_locations = {}
            for group_name, group in self.feature_indices.items():
                group_pipeline = self.pipelines.get(group_name)
                for position, feature in enumerate(group["feature_names"]):
                    feature_locations[feature] = (group_name, group_pipeline, position)

            # Ensure optimization columns exist among the known features
            missing_columns = [col for col in optimization_columns if col not in feature_locations]
            if missing_columns:
                raise ValueError(f"Optimization columns not found in feature indices: {missing_columns}")

            # Perform inverse transformation for each column
            inverse_transformed = {}
            for col in optimization_columns:
                group_name, pipeline, feature_index = feature_locations[col]
                if pipeline is None:
                    raise ValueError(
                        f"No fitted pipeline available for feature group '{group_name}' (column '{col}')"
                    )
                transformed_values = params[col].to_numpy()
                # Extract the final transformer of the pipeline
                transformer = pipeline.steps[-1][1]
                if isinstance(transformer, StandardScaler):
                    # Respect the scaler configuration: transform() only centers /
                    # scales when with_mean / with_std are enabled.
                    scale = transformer.scale_[feature_index] if transformer.with_std else 1.0
                    mean = transformer.mean_[feature_index] if transformer.with_mean else 0.0
                    inverse_transformed[col] = transformed_values * scale + mean
                elif isinstance(transformer, MinMaxScaler):
                    data_min = transformer.data_min_[feature_index]
                    data_max = transformer.data_max_[feature_index]
                    feature_range_min, feature_range_max = transformer.feature_range
                    unit_values = (transformed_values - feature_range_min) / (feature_range_max - feature_range_min)
                    inverse_transformed[col] = unit_values * (data_max - data_min) + data_min
                else:
                    raise NotImplementedError(f"Transformer {transformer} not supported for inverse_transform")

            # Combine results into a DataFrame
            inverse_transformed_df = pd.DataFrame(inverse_transformed)
            if self.debug:
                logger.debug("[InversePreprocessor] Inverse-transformed optimization parameters:\n%s", inverse_transformed_df)

            return inverse_transformed_df
        except Exception as e:
            logger.error(f"Error during inverse transformation of optimization parameters: {e}")
            raise RuntimeError(f"Error during inverse transformation of optimization parameters: {e}") from e

    def decode_target(self, y_encoded: np.ndarray) -> Optional[np.ndarray]:
        """
        Decode the encoded target with the label encoder.

        Returns:
            Optional[np.ndarray]: Decoded targets, or None when no label encoder is set.

        Raises:
            RuntimeError: If the label encoder fails to decode.
        """
        if not self.label_encoder:
            if self.debug:
                logger.debug("[InversePreprocessor] No label encoder provided; skipping target decoding.")
            return None
        if self.debug:
            logger.debug("[InversePreprocessor] Decoding target variable.")
        try:
            decoded_targets = self.label_encoder.inverse_transform(y_encoded)
            if self.debug:
                logger.debug("[InversePreprocessor] Decoded targets (first 5): %s", decoded_targets[:5])
            return decoded_targets
        except Exception as e:
            logger.error(f"Error decoding target variable: {e}")
            raise RuntimeError(f"Error decoding target variable: {e}") from e

    def combine_features_and_targets(self,
                                    original_data: pd.DataFrame,
                                    inverse_transformed_data: Dict[str, pd.DataFrame],
                                    y_encoded: np.ndarray) -> pd.DataFrame:
        """
        Join the inverse-transformed feature groups and attach the target.

        Both the encoded and the decoded target are wrapped in Series carrying
        ``original_data.index`` before being handed to
        ``prepare_final_dataset_with_target``; the decoded Series is None when no
        label encoder is available.

        Raises:
            RuntimeError: If combining the features or attaching the target fails.
        """
        decoded_targets = self.decode_target(y_encoded)
        y_encoded_series = pd.Series(y_encoded, index=original_data.index, name='Encoded_Target')
        decoded_targets_series = None
        if decoded_targets is not None:
            decoded_targets_series = pd.Series(decoded_targets, index=original_data.index, name='Decoded_Target')
        try:
            combined_features = combine_inverse_transformed_data(
                inverse_transformed_data,
                original_data.index,
                original_data.columns,
                debug=self.debug
            )
            return prepare_final_dataset_with_target(
                original_data,
                combined_features,
                y_encoded_series,
                decoded_targets_series,
                debug=self.debug
            )
        except Exception as e:
            logger.error(f"Error combining features and targets: {e}")
            raise RuntimeError(f"Error combining features and targets: {e}") from e

    def transform(self,
                  original_data: pd.DataFrame,
                  transformed_data: Dict[str, np.ndarray],
                  y_encoded: np.ndarray) -> pd.DataFrame:
        """
        Run the full inverse transformation and recombine features with the target.

        Args:
            original_data (pd.DataFrame): Untransformed features; supplies the index
                and column order of the result.
            transformed_data (Dict[str, np.ndarray]): Transformed values per feature group.
            y_encoded (np.ndarray): Encoded target values.

        Returns:
            pd.DataFrame: Inverse-transformed features plus the target column,
            indexed like ``original_data``.
        """
        if self.debug:
            logger.debug("[InversePreprocessor] Starting full inverse transformation and recombination process.")
            logger.debug("Original data index:\n%s", original_data.index)
            logger.debug("Transformed data keys: %s", list(transformed_data.keys()))

        # Per-group shapes and indices are logged inside inverse_transform_features.
        inverse_transformed_data = self.inverse_transform_features(transformed_data)

        # Combine inverse transformed data
        final_dataset = self.combine_features_and_targets(
            original_data,
            inverse_transformed_data,
            y_encoded
        )

        # Validate index consistency
        if not final_dataset.index.equals(original_data.index):
            logger.warning("Index mismatch after inverse transformation. Reindexing the final dataset.")
            final_dataset = final_dataset.reindex(original_data.index)

        if self.debug:
            logger.debug("[InversePreprocessor] Final dataset index:\n%s", final_dataset.index)
            logger.debug("[InversePreprocessor] Original dataset index:\n%s", original_data.index)

        return final_dataset


if __name__ == "__main__":
    # Demo driver for InversePreprocessor. It relies on DataPreprocessor being
    # available in the current namespace (its import is left commented out).
    # from datapreprocessor_class import DataPreprocessor
    # File paths
    features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
    dataset_path = "../../data/processed/final_ml_dataset.csv"
    assets_path = '../../data/model/pipeline/preprocessing_assets.pkl'

    # Single source of truth for the columns used in Example 3, so the
    # preprocessing call and the inverse transformation cannot drift apart.
    optimization_columns = ["knee_max_angle", "wrist_max_angle", "elbow_max_angle"]

    # Example 1: With train-test split
    dp_split = DataPreprocessor(
        features_path=features_path,
        dataset_path=dataset_path,
        assets_path=assets_path,
        y_variable='result',
        preprocessor_train_test_split=True,
        test_size=0.3,
        random_state=123,
        stratify=True,
    )
    (
        X_train_transformed,
        X_test_transformed,
        y_train_encoded,
        y_test_encoded,
        transformed_data_train,
        transformed_train_df,  # Include the additional output
        X_train,
        y_train,
    ) = dp_split.run()

    print(f"X_train_transformed shape: {X_train_transformed.shape}")
    print(f"X_test_transformed shape: {X_test_transformed.shape}")
    print(f"Transformed Data Train: {transformed_data_train.keys()}")
    print(f"Original X_train shape: {X_train.shape}")
    print(f"Original y_train shape: {y_train.shape}")

    # Initialize InversePreprocessor with assets_path (no need to preload assets manually)
    inverse_transformer = InversePreprocessor(
        assets_path=assets_path,  # Assets will be loaded automatically
        debug=True
    )

    # Perform inverse transformation and combine with targets
    final_dataset = inverse_transformer.transform(
        original_data=X_train,
        transformed_data=transformed_data_train,
        y_encoded=y_train_encoded
    )

    # Example: Append specified columns from original_data to final_dataset
    columns_to_append = ['player_height_in_meters', 'player_weight__in_kg']  # Example columns
    final_dataset = inverse_transformer.append_columns_from_original(
        final_dataset=final_dataset,
        original_data=X_train,
        columns_to_append=columns_to_append,
        debug=True
    )

    # Display the resulting dataset
    print("[Final Dataset]:")
    print(final_dataset.head())
    print(final_dataset.shape)

    # ---------------Example 2: Without train-test split------------------
    dp_no_split = DataPreprocessor(
        features_path=features_path,
        dataset_path=dataset_path,
        assets_path=assets_path,
        y_variable='result',
        preprocessor_train_test_split=False,
    )
    X_transformed, y_encoded, transformed_data, transformed_data_df, original_data, X, y = dp_no_split.run()

    print(f"X_transformed shape: {X_transformed.shape}")
    print(f"Transformed Data: {transformed_data.keys()}")
    print(f"Transformed Data: {transformed_data_df.columns}")
    print(f"Original X shape: {X.shape}")
    print(f"Original y shape: {y.shape}")

    # Inverse transformation for Example 2: Without train-test split
    print("\n[Example 2: Without Train-Test Split]")
    inverse_transformer = InversePreprocessor(
        assets_path=assets_path,  # Assets will be loaded automatically
        debug=True
    )

    # Perform inverse transformation and combine with targets
    final_dataset_no_split = inverse_transformer.transform(
        original_data=X,
        transformed_data=transformed_data,
        y_encoded=y_encoded
    )

    # Example: Append specified columns from original_data to final_dataset_no_split
    columns_to_append_no_split = ['trial_id']  # Example columns
    final_dataset_no_split = inverse_transformer.append_columns_from_original(
        final_dataset=final_dataset_no_split,
        original_data=original_data,
        columns_to_append=columns_to_append_no_split,
        debug=True
    )

    # Display the resulting dataset
    print("[Final Dataset without Train-Test Split]:")
    print(f"Final Dataset shape (No Split): {final_dataset_no_split.shape}")
    # Label fixed: this is the no-split example, not the optimization-ranges one.
    print(f"Original Dataset shape (No Split): {original_data.shape}")

    #-------------example 3: Preprocessing with Optimization Ranges------------
    dp = DataPreprocessor(
        features_path=features_path,
        dataset_path=dataset_path,
        assets_path=assets_path,
        y_variable="result",
        optimization_columns=optimization_columns,
        preprocessor_train_test_split=False,
    )

    results = dp.run(return_optimization_ranges=True)
    (
        X_transformed,
        y_encoded,
        transformed_data,
        transformed_data_df,
        original_data,
        X,
        y,
        optimization_ranges,
        optimization_transformed_ranges,
    ) = results

    print(f"Optimization Ranges: {optimization_ranges}")
    print(f"optimization_transformed_ranges: {optimization_transformed_ranges}")

    # Inverse transformation for Example 3: Preprocessing with Optimization Ranges
    print("\n[Example 3: Preprocessing with Optimization Ranges]")
    inverse_transformer = InversePreprocessor(
        assets_path=assets_path,  # Assets will be loaded automatically
        debug=True
    )

    # Perform inverse transformation and combine with targets
    final_dataset_with_optimization = inverse_transformer.transform(
        original_data=X,
        transformed_data=transformed_data,
        y_encoded=y_encoded
    )

    # Example: Append specified columns from original_data to final_dataset_with_optimization
    columns_to_append_with_opt = ["trial_id"]  # Example columns
    final_dataset_with_optimization = inverse_transformer.append_columns_from_original(
        final_dataset=final_dataset_with_optimization,
        original_data=original_data,
        columns_to_append=columns_to_append_with_opt,
        debug=True
    )

    # Inverse transform optimization parameters
    inverse_optimization_params = inverse_transformer.inverse_transform_optimization_params(
        params=pd.DataFrame(optimization_transformed_ranges),
        optimization_columns=optimization_columns
    )

    # Display the resulting datasets
    print(f"Final Dataset shape (With Optimization Ranges): {final_dataset_with_optimization.shape}")
    print(f"Original Dataset shape (With Optimization Ranges): {original_data.shape}")

    print("\n[Inverse-Transformed Optimization Parameters]:")
    print(inverse_optimization_params)
    print(f"Inverse-Transformed Optimization Parameters shape: {inverse_optimization_params.shape}")


In [None]:
# %%writefile ../../src/freethrow_predictions/ml/inverse_preprocessor_functions.py
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import joblib
from ml.feature_selection.data_loader_post_select_features import load_selected_features_data

def inverse_transform_feature_groups(
    transformed_data,
    fitted_pipelines,
    feature_indices,
    debug=False
):
    """
    Undo the preprocessing of every feature group.

    For each non-None entry of ``transformed_data``:
      * if a fitted pipeline exists for the group and exposes
        ``inverse_transform``, its output is used;
      * if the pipeline has no ``inverse_transform``, or no pipeline exists for
        the group, the transformed values are passed through unchanged.

    Column names are taken from ``feature_indices[group]['feature_names']``.
    Groups whose data is None are left out of the result.

    Returns:
        dict: group name -> pd.DataFrame (default RangeIndex).
    """
    restored_groups = {}

    for group_name, group_values in transformed_data.items():
        if group_values is None:
            continue

        if group_name not in fitted_pipelines:
            # Unprocessed data, or no pipeline available for this group.
            column_names = feature_indices[group_name]['feature_names']
            restored_groups[group_name] = pd.DataFrame(group_values, columns=column_names)
            if debug:
                print(f"\n[Unprocessed {group_name}] Features: {column_names}")
                print(f"Data Shape: {group_values.shape}")
            continue

        fitted = fitted_pipelines[group_name]
        if hasattr(fitted, 'inverse_transform'):
            recovered = fitted.inverse_transform(group_values)
        else:
            # No inverse available: keep the transformed values as they are.
            recovered = group_values
            if debug:
                print(f"\n[{group_name}] inverse_transform not available. Using original transformed data.")

        column_names = feature_indices[group_name]['feature_names']
        restored_groups[group_name] = pd.DataFrame(recovered, columns=column_names)
        if debug:
            print(f"\n[Inverse {group_name}] Features: {column_names}")
            print(f"Inverse Transformed Data Shape: {recovered.shape}")

    return restored_groups

def combine_inverse_transformed_data(inverse_transformed_data, original_index, original_columns, debug=False):
    """
    Concatenate the per-group DataFrames column-wise, then give the result the
    original index and the original column order.

    Raises:
        ValueError: If the row count differs from ``len(original_index)`` or the
            set of columns differs from ``original_columns``.
    """
    merged = pd.concat(inverse_transformed_data.values(), axis=1)
    row_count = merged.shape[0]
    expected_rows = len(original_index)

    if debug:
        print(f"[Debug] Combined Data Shape (Before Index Assignment): {merged.shape}")
        print(f"[Debug] Original Index Length: {expected_rows}")

    # The index can only be attached when the lengths agree.
    if expected_rows != row_count:
        raise ValueError(
            f"Index length mismatch: Combined Data has {row_count} rows, but original index has {expected_rows} elements"
        )
    merged.index = original_index

    if debug:
        print(f"[Debug] Combined Data Columns (Before Alignment): {list(merged.columns)}")
        print(f"[Debug] Original Columns: {list(original_columns)}")

    # Same set of columns is required before reordering to the original layout.
    if set(original_columns) != set(merged.columns):
        raise ValueError(
            f"Column mismatch: Combined Data Columns={list(merged.columns)}, Original Columns={list(original_columns)}"
        )

    return merged[original_columns]



def load_pipeline_and_assets(path):
    """
    Load the saved preprocessing assets and return them as a 4-tuple.

    The file is read with ``joblib.load`` and is expected to hold a dict with the
    keys "fitted_pipelines", "feature_indices" and "label_encoder" (required)
    and "flattened_feature_indices" (optional; a warning is printed and None is
    returned in its place when absent). A short summary of every entry is
    printed unconditionally.

    NOTE(review): joblib.load unpickles arbitrary objects; only use it on
    trusted asset files.

    Args:
        path (str): Path to the saved file.

    Returns:
        tuple: (fitted_pipelines, feature_indices, flattened_feature_indices, label_encoder)
    """
    assets = joblib.load(path)
    print(f"Pipeline and assets loaded from {path}")

    # Summarise what was loaded: key, type and a small sample per entry.
    print(f"Loaded asset keys: {list(assets.keys())}")
    for asset_name, asset_value in assets.items():
        print(f"[Debug] Key: {asset_name}, Type: {type(asset_value)}")
        if isinstance(asset_value, dict):
            sample = list(asset_value.keys())[:5]
            print(f"[Debug] Sample from {asset_name}: {sample}")
        elif isinstance(asset_value, list):
            sample = asset_value[:5]
            print(f"[Debug] Sample from {asset_name}: {sample}")
        elif isinstance(asset_value, pd.DataFrame):
            print(f"[Debug] DataFrame {asset_name} Columns: {asset_value.columns}")

    # The flattened indices are optional; the other three keys are required.
    flattened = assets.get("flattened_feature_indices", None)
    if flattened is None:
        print("[Warning] Flattened feature indices not found in the loaded assets.")

    pipelines = assets["fitted_pipelines"]
    indices = assets["feature_indices"]
    encoder = assets["label_encoder"]
    return (pipelines, indices, flattened, encoder)


def _matching_rows(original, restored, atol):
    """Return a boolean array that is True where two columns agree, row by row (positional)."""
    if pd.api.types.is_numeric_dtype(original):
        try:
            left = original.astype(float).to_numpy()
            right = restored.astype(float).to_numpy()
        except (TypeError, ValueError):
            # Restored column is not numeric after all; fall back to exact comparison.
            pass
        else:
            # Values that are missing on both sides count as a match.
            return np.isclose(left, right, atol=atol, equal_nan=True)

    # Non-numeric data: exact equality, ignoring the index labels.
    left = original.astype(object).reset_index(drop=True)
    right = restored.astype(object).reset_index(drop=True)
    same = left.eq(right) | (left.isna() & right.isna())
    return same.to_numpy(dtype=bool)


def _report_feature_alignment(original, restored, atol=1e-6):
    """Print whether the restored features match the originals and detail any mismatching columns."""
    if original.shape != restored.shape:
        print("\nThe inverse-transformed features DO NOT align with the original features even within tolerance.")
        print(f"Shape mismatch: original {original.shape} vs inverse-transformed {restored.shape}")
        return

    # Columns are compared by position, like the original `.values` comparison.
    mismatched = []
    for position, col in enumerate(original.columns):
        agrees = _matching_rows(original.iloc[:, position], restored.iloc[:, position], atol)
        if not agrees.all():
            mismatched.append((position, col, ~agrees))

    if not mismatched:
        print("\nThe inverse-transformed features align with the original features within tolerance.")
        return

    print("\nThe inverse-transformed features DO NOT align with the original features even within tolerance.")
    print("\n[Misaligned Columns]")
    print([col for _, col, _ in mismatched])

    # Show how values differ in mismatched columns
    for position, col, differs in mismatched:
        print(f"\nColumn: {col}")
        print(f"Original Values (First 5 mismatches):\n{original.iloc[:, position][differs].head()}")
        print(f"Inverse Transformed Values (First 5 mismatches):\n{restored.iloc[:, position][differs].head()}")


def prepare_final_dataset_with_target(
    X_train, X_train_inverse_transformed, y_train, y_train_decoded, debug=False
):
    """
    Prepares the final dataset by checking alignment and adding the target variable.

    ``y_train`` is only used to verify that its index equals the index of the
    inverse-transformed features; the values written to the new
    'y_variable_original' column come from ``y_train_decoded``.

    With ``debug=True`` the original and inverse-transformed features are
    compared column by column (numeric columns within an absolute tolerance of
    1e-6 plus NumPy's default relative tolerance, other columns exactly) and
    the result is printed. The comparison is diagnostic only and works for
    frames that contain non-numeric columns.

    Args:
        X_train (pd.DataFrame): Original training features.
        X_train_inverse_transformed (pd.DataFrame): Inverse-transformed training features.
        y_train (pd.Series): Target variable whose index is checked for alignment.
        y_train_decoded (array-like): Decoded target variable.
        debug (bool): Whether to print debug statements.

    Returns:
        pd.DataFrame: Copy of the inverse-transformed features with the target
        variable added as 'y_variable_original'.

    Raises:
        ValueError: If the indices of ``X_train_inverse_transformed`` and ``y_train`` differ.
    """
    if debug:
        print("\n[Inverse Transformed Data Shapes]")
        print(f"X_train_inverse_transformed Shape: {X_train_inverse_transformed.shape}")
        print(f"X_train_inverse_transformed Index:\n{X_train_inverse_transformed.index[:5]}")
        print(f"y_train Index:\n{y_train.index[:5]}")

    # Check alignment of indices
    if not X_train_inverse_transformed.index.equals(y_train.index):
        raise ValueError("Index mismatch between X_train_inverse_transformed and y_train")

    # Check if the original and inverse-transformed datasets are identical
    if debug:
        _report_feature_alignment(X_train, X_train_inverse_transformed, atol=1e-6)

    # Add the target variable to a copy so the caller's DataFrame is not mutated
    X_train_with_target = X_train_inverse_transformed.copy()
    X_train_with_target['y_variable_original'] = y_train_decoded

    if debug:
        print("\n[Final Dataset with Target Variable]")
        print(f"X_train_with_target Shape: {X_train_with_target.shape}")
        print(X_train_with_target.head())

    return X_train_with_target


# Main execution
if __name__ == "__main__":
    # Demo driver for the inverse-preprocessing helpers. It relies on
    # DataPreprocessor being available in the current namespace (its import is
    # left commented out).
    # from datapreprocessor_class import DataPreprocessor

    # Verbosity of the helper calls below. This name was previously used
    # without being defined, which raised NameError.
    debug = True

    # File paths
    features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
    dataset_path = "../../data/processed/final_ml_dataset.csv"
    assets_path = '../../data/model/pipeline/preprocessing_assets.pkl'

    # Example 1: With train-test split
    dp_split = DataPreprocessor(
        features_path=features_path,
        dataset_path=dataset_path,
        assets_path=assets_path,
        y_variable='result',
        preprocessor_train_test_split=True,
        test_size=0.3,
        random_state=123,
        stratify=True,
    )
    (
        X_train_transformed,
        X_test_transformed,
        y_train_encoded,
        y_test_encoded,
        transformed_data_train,
        transformed_train_df,  # Include the additional output
        X_train,
        y_train,
    ) = dp_split.run()

    print(f"X_train_transformed shape: {X_train_transformed.shape}")
    print(f"X_test_transformed shape: {X_test_transformed.shape}")
    print(f"Transformed Data Train: {transformed_data_train.keys()}")
    print(f"Original X_train shape: {X_train.shape}")
    print(f"Original y_train shape: {y_train.shape}")

    #------------------------------------------
    # Saving/Loading Pipelines/Feature Lists
    # Define base paths
    BASE_DATA_PATH = '../../data/model/pipeline/preprocessing_assets.pkl'

    # Load the preprocessing assets (simulate decoding in a different context)
    fitted_pipelines, feature_indices_train, flattened_feature_indices, label_encoder = load_pipeline_and_assets(path=BASE_DATA_PATH)

    #------------------------------------------
    # Decoding examples:

    # Optional: Decode target variable for inverse transformation.
    # Initialised to None so the name is always bound, even without an encoder.
    y_train_decoded = None
    if label_encoder:
        y_train_decoded = label_encoder.inverse_transform(y_train_encoded)
        print(f"\n[Decoded y_train (First 5)]: {y_train_decoded[:5]}")

    # Inverse transform the training data
    inverse_transformed_data_train = inverse_transform_feature_groups(
        transformed_data_train,
        fitted_pipelines,
        feature_indices_train,
        debug=debug
    )

    # Combine inverse transformed data
    X_train_inverse_transformed = combine_inverse_transformed_data(
        inverse_transformed_data_train,
        X_train.index,       # Original index
        X_train.columns,     # Original columns
        debug=True
    )

    # Call the function with debug enabled
    X_train_with_target = prepare_final_dataset_with_target(
        X_train,
        X_train_inverse_transformed,
        y_train,
        y_train_decoded,
        debug=True
    )




In [None]:

# %%writefile ../../src/freethrow_predictions/ml/classification_preprocessor/preprocessor_functions.py
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import joblib
from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
from ml.classification_preprocessor.smote_automation import check_dataset_for_smote, apply_smote

# Define pipelines for feature preprocessing
def create_numerical_scaler_pipeline():
    """Build the unfitted pipeline used for standard-scaled numerical features.

    Returns:
        Pipeline: Step ``'imputer'`` (mean ``SimpleImputer`` created with
        ``add_indicator=True``) followed by step ``'scaler'``
        (``StandardScaler``).
    """
    imputer = SimpleImputer(strategy='mean', add_indicator=True)
    scaler = StandardScaler()
    return Pipeline(steps=[('imputer', imputer), ('scaler', scaler)])

def create_numerical_minmax_pipeline():
    """Build the unfitted pipeline used for min-max scaled numerical features.

    Returns:
        Pipeline: Step ``'imputer'`` (mean ``SimpleImputer`` created with
        ``add_indicator=True``) followed by step ``'scaler'``
        (``MinMaxScaler``).
    """
    imputer = SimpleImputer(strategy='mean', add_indicator=True)
    scaler = MinMaxScaler()
    return Pipeline(steps=[('imputer', imputer), ('scaler', scaler)])

def create_numerical_kbins_pipeline():
    """Build the unfitted pipeline used for binned numerical features.

    Returns:
        Pipeline: Step ``'imputer'`` (mean ``SimpleImputer`` created with
        ``add_indicator=True``) followed by step ``'kbins'``
        (``KBinsDiscretizer`` with ordinal encoding and uniform-width bins).
    """
    imputer = SimpleImputer(strategy='mean', add_indicator=True)
    discretizer = KBinsDiscretizer(encode='ordinal', strategy='uniform')
    return Pipeline(steps=[('imputer', imputer), ('kbins', discretizer)])

def create_categorical_onehot_pipeline():
    """Build the unfitted pipeline used for one-hot encoded categorical features.

    Returns:
        Pipeline: Step ``'imputer'`` (most-frequent ``SimpleImputer`` created
        with ``add_indicator=True``) followed by step ``'onehot'``
        (``OneHotEncoder`` producing dense output and ignoring categories it
        did not see during fit).
    """
    imputer = SimpleImputer(strategy='most_frequent', add_indicator=True)
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    return Pipeline(steps=[('imputer', imputer), ('onehot', encoder)])

def create_categorical_labelencode_pipeline():
    """Build the unfitted pipeline used for ordinal-encoded categorical features.

    Returns:
        Pipeline: Step ``'imputer'`` (most-frequent ``SimpleImputer`` created
        with ``add_indicator=True``) followed by step ``'encoder'``
        (``OrdinalEncoder``).
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)),
        # Categories that were not present when the pipeline was fitted are
        # encoded as -1 instead of raising during transform, mirroring the
        # handle_unknown='ignore' tolerance of the one-hot pipeline.
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

def create_numerical_dimred_pipeline(n_components=2):
    """Build the unfitted pipeline used for dimensionality-reduced numerical features.

    Args:
        n_components: Forwarded to ``PCA(n_components=...)``.

    Returns:
        Pipeline: Step ``'imputer'`` (mean ``SimpleImputer`` created with
        ``add_indicator=True``) followed by step ``'dimred'`` (``PCA``).
    """
    imputer = SimpleImputer(strategy='mean', add_indicator=True)
    reducer = PCA(n_components=n_components)
    return Pipeline(steps=[('imputer', imputer), ('dimred', reducer)])

# Process feature groups individually
def _build_group_pipeline(group_key):
    """Create the unfitted pipeline that belongs to ``group_key``."""
    if group_key == 'numerical_scaler':
        return create_numerical_scaler_pipeline()
    if group_key == 'numerical_minmax':
        return create_numerical_minmax_pipeline()
    if group_key == 'numerical_kbins':
        return create_numerical_kbins_pipeline()
    if group_key == 'numerical_dimred':
        return create_numerical_dimred_pipeline(n_components=3)
    if group_key == 'categorical_onehot':
        return create_categorical_onehot_pipeline()
    if group_key == 'categorical_labelencode':
        return create_categorical_labelencode_pipeline()
    raise ValueError(f"Unknown feature group: {group_key}")


def _run_group_pipeline(group_key, features, dataset, fitted_pipelines):
    """Transform ``features`` with the group's pipeline, fitting and storing it if absent."""
    columns = dataset[features]
    if group_key in fitted_pipelines:
        pipeline = fitted_pipelines[group_key]
        return pipeline, pipeline.transform(columns)
    pipeline = _build_group_pipeline(group_key)
    transformed = pipeline.fit_transform(columns)
    fitted_pipelines[group_key] = pipeline
    return pipeline, transformed


def _onehot_output_names(pipeline, features):
    """Expand each one-hot input feature into ``<feature>_<category>`` names."""
    categories = pipeline.named_steps['onehot'].categories_
    return [
        f"{feature}_{cat}"
        for feature, cats in zip(features, categories)
        for cat in cats
    ]


def _register_group(feature_indices, group_key, feature_names, n_columns, start_index):
    """Record names and column positions for one group; return the next free index."""
    names = list(feature_names)
    if len(names) != n_columns:
        # The pipeline produced a different number of columns than we have
        # names for (e.g. PCA components, or extra columns added by the
        # imputer's add_indicator=True). Fall back to positional names so the
        # name list always lines up with the data columns.
        names = [f"{group_key}_{i}" for i in range(n_columns)]
    feature_indices[group_key] = {
        'feature_names': names,
        'indices': list(range(start_index, start_index + n_columns)),
    }
    return start_index + n_columns


def process_feature_groups(
    dataset,
    numerical_scaler_features,
    numerical_minmax_features,
    numerical_kbins_features,
    numerical_dimred_features,
    categorical_onehot_features,
    categorical_labelencode_features,
    unprocessed_features,
    fitted_pipelines=None,
    debug=False
):
    """
    Run every feature group through its preprocessing pipeline.

    Groups are processed in a fixed order (scaler, minmax, kbins, dimred,
    onehot, labelencode, unprocessed). For each non-empty group the pipeline
    found in ``fitted_pipelines`` is applied with ``transform``; when no
    pipeline is stored for the group, a new one is created, fitted with
    ``fit_transform`` and stored in ``fitted_pipelines``.

    Args:
        dataset: DataFrame holding the columns named in the feature lists.
        numerical_scaler_features: Columns for the standard-scaler pipeline.
        numerical_minmax_features: Columns for the min-max pipeline.
        numerical_kbins_features: Columns for the KBins pipeline.
        numerical_dimred_features: Columns for the PCA pipeline
            (created with ``n_components=3``).
        categorical_onehot_features: Columns for the one-hot pipeline.
        categorical_labelencode_features: Columns for the ordinal-encoder pipeline.
        unprocessed_features: Columns copied through unchanged.
        fitted_pipelines: Optional dict of already fitted pipelines keyed by
            group name. It is updated in place with any pipeline fitted here.
        debug: Print the features and output shape of every processed group.

    Returns:
        tuple: ``(transformed_data, fitted_pipelines, feature_indices)``.
            ``transformed_data`` maps group name to the transformed array
            (``'unprocessed'`` is ``None`` when that list is empty).
            ``feature_indices`` maps every processed group to a dict with
            ``'feature_names'`` and ``'indices'``; the indices are the group's
            column positions when the arrays in ``transformed_data`` are
            stacked horizontally in insertion order.
    """
    transformed_data = {}
    feature_indices = {}  # group name -> {'feature_names': [...], 'indices': [...]}
    if fitted_pipelines is None:
        fitted_pipelines = {}

    current_index = 0  # Next free column position in the stacked output

    # (group key, label used in debug output, input feature list)
    pipeline_groups = [
        ('numerical_scaler', 'Numerical Scaler', numerical_scaler_features),
        ('numerical_minmax', 'Numerical MinMax', numerical_minmax_features),
        ('numerical_kbins', 'Numerical KBins', numerical_kbins_features),
        ('numerical_dimred', 'Numerical DimRed', numerical_dimred_features),
        ('categorical_onehot', 'Categorical OneHot', categorical_onehot_features),
        ('categorical_labelencode', 'Categorical LabelEncode', categorical_labelencode_features),
    ]

    for group_key, label, features in pipeline_groups:
        if not features:
            continue

        pipeline, transformed = _run_group_pipeline(
            group_key, features, dataset, fitted_pipelines
        )
        transformed_data[group_key] = transformed

        if group_key == 'categorical_onehot':
            output_names = _onehot_output_names(pipeline, features)
        elif group_key == 'numerical_dimred':
            # Components do not correspond to input columns; an empty list
            # makes _register_group generate positional names.
            output_names = []
        else:
            output_names = features

        # Every group advances the running index; otherwise the positions
        # recorded for later groups would not match the stacked columns.
        current_index = _register_group(
            feature_indices, group_key, output_names, transformed.shape[1], current_index
        )

        if debug:
            print(f"\n[{label}] Features: {features}")
            print(f"Transformed Data Shape: {transformed.shape}")

    # Unprocessed Features
    if unprocessed_features:
        transformed = dataset[unprocessed_features].values
        transformed_data['unprocessed'] = transformed
        _register_group(
            feature_indices, 'unprocessed', unprocessed_features,
            transformed.shape[1], current_index
        )
        if debug:
            print(f"\n[Unprocessed] Features: {unprocessed_features}")
            print(f"Data Shape: {transformed.shape}")
    else:
        transformed_data['unprocessed'] = None

    return transformed_data, fitted_pipelines, feature_indices

# Encode/Decode target variable (y)
def preprocess_target(y_train, y_test=None, debug=False):
    """
    Fit a LabelEncoder on the training target and encode the target split(s).

    Args:
        y_train: Training target values; the encoder is fitted on these.
        y_test: Optional test target values, encoded with the fitted encoder.
        debug: Print the learned classes and the first encoded values.

    Returns:
        tuple: (y_train_encoded, y_test_encoded, label_encoder), where
        y_test_encoded is None when no y_test was given. The returned encoder
        can be used to inverse-transform encoded labels.
    """
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = None if y_test is None else label_encoder.transform(y_test)

    if not debug:
        return y_train_encoded, y_test_encoded, label_encoder

    print("\n[Target Variable Encoding]")
    print(f"Classes: {label_encoder.classes_}")
    print(f"Encoded y_train (First 5): {y_train_encoded[:5]}")
    if y_test_encoded is not None:
        print(f"Encoded y_test (First 5): {y_test_encoded[:5]}")
    return y_train_encoded, y_test_encoded, label_encoder



# Combine transformed data into final arrays
def combine_transformed_data_with_index(transformed_data, feature_indices, original_index, debug=False):
    """
    Stack the transformed feature-group arrays into one DataFrame.

    Groups are stacked horizontally in the iteration order of
    ``transformed_data``; groups whose value is ``None`` are skipped.

    Args:
        transformed_data (dict): Group name -> 2-D array (or None).
        feature_indices (dict): Group name -> dict with a ``'feature_names'``
            list. Groups missing here get positional names
            ``"<group>_<column position>"``.
        original_index: Index to attach to the combined DataFrame.
        debug (bool): Print what is being combined and the resulting shape.

    Returns:
        pd.DataFrame: Combined data with the collected column names and
        ``original_index`` as its index.

    Raises:
        ValueError: If no group contains data to combine.
    """
    combined_data = []
    combined_feature_names = []
    for feature_type, data in transformed_data.items():
        if data is None:
            continue
        combined_data.append(data)
        # Prefer the recorded feature names; otherwise generate positional ones
        if feature_type in feature_indices:
            feature_names = feature_indices[feature_type]['feature_names']
        else:
            feature_names = [f"{feature_type}_{i}" for i in range(data.shape[1])]
        combined_feature_names.extend(feature_names)
        if debug:
            print(f"[Combine] Adding {feature_type}: Shape {data.shape} Features: {feature_names}")

    if not combined_data:
        # np.hstack would raise an opaque ValueError on an empty list
        raise ValueError("No transformed feature groups to combine.")

    combined_array = np.hstack(combined_data)
    combined_df = pd.DataFrame(combined_array, columns=combined_feature_names)
    combined_df.index = original_index  # Reattach original index

    if debug:
        print("\n[Combine Transformed Data]")
        print(f"Combined Shape: {combined_df.shape}")
        # Slice inside the braces so only the first 10 columns are shown
        print(f"Combined Columns: {list(combined_df.columns)[:10]}...")

    return combined_df


def save_pipeline_and_assets(fitted_pipelines, feature_indices, label_encoder, path):
    """
    Persist the preprocessing assets to a single file with joblib.

    The three objects are bundled into one dict with the keys
    ``"fitted_pipelines"``, ``"feature_indices"`` and ``"label_encoder"``
    and written with ``joblib.dump``.

    Args:
        fitted_pipelines (dict): Fitted pipelines keyed by feature group.
        feature_indices (dict): Feature names and indices per feature group.
        label_encoder (LabelEncoder): Fitted encoder for the target variable.
        path (str): Destination file path.
    """
    joblib.dump(
        dict(
            fitted_pipelines=fitted_pipelines,
            feature_indices=feature_indices,
            label_encoder=label_encoder,
        ),
        path,
    )
    print(f"Pipeline and assets saved to {path}")





# Main execution
if __name__ == "__main__":
    # Script entry point: load the selected-features dataset, split it,
    # resample the training split, preprocess every feature group and save
    # the fitted preprocessing assets.
    #
    # Imports needed only by the script are kept local to this block.
    import logging
    from collections import Counter
    from ml.classification_preprocessor.preprocessor_recommendations import (
        filter_features_by_type,
        analyze_categorical_features,
        analyze_numerical_features_enhanced_v2,
    )
    # NOTE(review): package path assumed to match the sibling modules imported
    # at the top of this file (the original import was commented out) — confirm.
    from ml.classification_preprocessor.preprocessor_encoding_filtered_datasets import filter_features

    # Settings are defined before their first use further down.
    debug = True
    zscore_threshold = 3
    tukey_threshold = 1.5

    # File paths
    features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
    dataset_path = "../../data/processed/final_ml_dataset.csv"

    # Example usage:
    final_ml_df_selected_features = load_selected_features_data(
        features_path=features_path,
        dataset_path=dataset_path,
        y_variable='result',
        debug=False
    )

    # Step 1: Filter features
    categorical_df, numerical_df = filter_features_by_type(final_ml_df_selected_features, debug=debug)

    # Step 2: Analyze categorical features
    categorical_info_df = analyze_categorical_features(
        categorical_df,
        low_cardinality_threshold=10,
        high_cardinality_threshold=50,
        missing_threshold=0.3,
        debug=False
    )

    # Step 3: Analyze numerical features and handle outliers automatically
    numerical_info_df = analyze_numerical_features_enhanced_v2(
        numerical_df,
        y_feature=None,
        zscore_threshold=zscore_threshold,
        tukey_threshold=tukey_threshold,
        missing_threshold=0.5,
        high_cardinality_threshold=1000,
        debug=False
    )

    y_variable = 'result'

    print("\n[Initial Dataset Info]")
    print(f"Columns to work with: {final_ml_df_selected_features.columns.tolist()}")
    print(f"Categorical Features: {categorical_info_df['Feature'].tolist()}")
    print(f"Numerical Features: {numerical_info_df['Feature'].tolist()}")

    # Step 1: Split dataset into features (X) and target (y)
    X = final_ml_df_selected_features.drop(columns=[y_variable])
    y = final_ml_df_selected_features[y_variable]

    # Step 2: Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"\n[Train-Test Split]")
    print(f"X_train Shape: {X_train.shape}")
    print(f"X_test Shape: {X_test.shape}")
    print(f"y_train Shape: {y_train.shape}")
    print(f"y_test Shape: {y_test.shape}")

    # SMOTE is applied to the training split only, so the test split keeps
    # its original class distribution.

    # Analyze dataset for SMOTE
    smote_analysis = check_dataset_for_smote(X_train, y_train, debug=True)
    print("SMOTE Analysis Recommendations:", smote_analysis["recommendations"])

    # Apply SMOTE
    X_train_resampled, y_train_resampled, smote_used = apply_smote(X_train, y_train, smote_analysis["recommendations"], debug=True)
    print("Applied SMOTE Variant:", smote_used)
    print("Resampled Class Distribution:", Counter(y_train_resampled))

    logging.info(f"SMOTE Technique Used: {smote_used}")

    # Step 3: Filter features
    (
        numerical_scaler_features,
        numerical_minmax_features,
        numerical_kbins_features,
        numerical_dimred_features,
        onehot_features,
        labelencode_features,
        unprocessed_features
    ) = filter_features(
        numerical_info_df, categorical_info_df, final_ml_df_selected_features, y_variable, debug=debug
    )

    # Save original feature names
    original_feature_names = {
        'numerical_scaler': numerical_scaler_features,
        'numerical_minmax': numerical_minmax_features,
        'numerical_kbins': numerical_kbins_features,
        'numerical_dimred': numerical_dimred_features,
        'onehot': onehot_features,
        'labelencode': labelencode_features,
        'unprocessed': unprocessed_features
        }

    # Step 4: Preprocess feature groups on training data
    transformed_data_train, fitted_pipelines, feature_indices_train = process_feature_groups(
        X_train_resampled,
        numerical_scaler_features,
        numerical_minmax_features,
        numerical_kbins_features,
        numerical_dimred_features,
        onehot_features,
        labelencode_features,
        unprocessed_features,
        debug=debug
    )

    # Step 5: Preprocess feature groups on testing data using fitted pipelines
    transformed_data_test, _, _ = process_feature_groups(
        X_test,
        numerical_scaler_features,
        numerical_minmax_features,
        numerical_kbins_features,
        numerical_dimred_features,
        onehot_features,
        labelencode_features,
        unprocessed_features,
        fitted_pipelines=fitted_pipelines,
        debug=debug
    )

    # Debugging before combining transformed data
    print("\n[Debug] Transformed Data Keys and Shapes:")
    for key, value in transformed_data_train.items():
        if value is not None:
            print(f"{key}: {value.shape}")

    # Step 6: Combine transformed features and collect feature names
    X_train_transformed = combine_transformed_data_with_index(transformed_data_train, feature_indices_train, X_train_resampled.index, debug=debug)
    X_test_transformed = combine_transformed_data_with_index(transformed_data_test, feature_indices_train, X_test.index, debug=debug)

    print(f"\n[Transformed Data Shapes]")
    print(f"X_train_transformed Shape: {X_train_transformed.shape}")
    print(f"X_test_transformed Shape: {X_test_transformed.shape}")

    # Step 7: Preprocess target variable
    y_train_encoded, y_test_encoded, label_encoder = preprocess_target(y_train_resampled, y_test, debug=debug)

    # Final data shapes
    print(f"\n[Final Data Shapes]")
    print(f"X_train_transformed: {X_train_transformed.shape}")
    print(f"X_test_transformed: {X_test_transformed.shape}")
    print(f"y_train_encoded: {y_train_encoded.shape}")
    print(f"y_test_encoded: {y_test_encoded.shape}")

    #------------------------------------------
    # Saving/Loading Pipelines/Feature Lists
    # Define base paths
    preprocessing_assets_path = '../../data/model/pipeline/preprocessing_assets.pkl'

    # Save pipelines, feature indices, and label encoder
    save_pipeline_and_assets(fitted_pipelines, feature_indices_train, label_encoder, path=preprocessing_assets_path)



In [None]:
# %%writefile ../../src/freethrow_predictions/ml/classification_preprocessor/preprocessor_recommendations.py
# Data Preprocessing Recommendation Code


import pandas as pd
import numpy as np
import pickle
import logging
from scipy.stats import shapiro, normaltest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import Normalizer, PolynomialFeatures, KBinsDiscretizer, Binarizer, QuantileTransformer
from sklearn.decomposition import PCA
from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
from ml.classification_preprocessor.smote_automation import check_dataset_for_smote, apply_smote


def filter_features_by_type(final_ml_df, debug=False):
    """
    Split a DataFrame's columns into categorical and numerical feature frames.

    A column is treated as categorical when its dtype is object or pandas
    categorical, or when it has fewer than 10 distinct non-null values.
    Every other column is treated as numerical. The ID columns
    'player_participant_id', 'trial_id' and 'shot_id' are left out of both
    results.

    Args:
        final_ml_df (pd.DataFrame): Dataset whose columns are classified.
        debug (bool): Print the categorical, numerical and excluded column lists.

    Returns:
        tuple: (categorical_df, numerical_df), both column subsets of
        final_ml_df in the original column order.
    """
    id_columns = {'player_participant_id', 'trial_id', 'shot_id'}
    categorical_features = []
    numerical_features = []
    excluded_features = []

    for col in final_ml_df.columns:
        # Exclude known ID columns or irrelevant features
        if col in id_columns:
            excluded_features.append(col)
            continue

        series = final_ml_df[col]
        # Use pandas' dtype predicates instead of comparing the dtype to
        # strings, which depends on how the dtype implements equality.
        has_categorical_dtype = (
            pd.api.types.is_object_dtype(series.dtype)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )
        # The dtype check runs first so nunique() is only computed when needed
        if has_categorical_dtype or series.nunique() < 10:
            categorical_features.append(col)
        else:
            numerical_features.append(col)

    if debug:
        print(f"Categorical Features: {categorical_features}")
        print(f"Numerical Features: {numerical_features}")
        print(f"Excluded Features: {excluded_features}")

    return final_ml_df[categorical_features], final_ml_df[numerical_features]


# Rules for Numerical Preprocessing

#     Scaling/Normalization:
#         Use StandardScaler for features that follow a normal distribution.
#         Use MinMaxScaler for non-normal distributed features with a wide range of values.
#         Use Normalizer for features where relative magnitudes are critical (e.g., vectors).

#     Feature Transformation:
#         Use KBinsDiscretizer to bin numeric features that take fewer than 10 unique values (e.g., age bands, income categories).
#         Use Binarizer for threshold-based classification (e.g., binary labels or thresholding a numeric feature like probability).
#         Use PolynomialFeatures for generating interaction terms if the feature shows a high correlation with the target.

#     Outlier Handling:
#         Apply Z-Score for outlier detection in normally distributed data.
#         Use Tukey's Method (IQR-based) for skewed or non-normal distributions.

#     Imputation:
#         Use SimpleImputer with strategy 'mean' or 'median' for low missingness (<30%).
#         Use IterativeImputer for multivariate datasets with moderate missingness (30-50%).
#         Exclude features with high missingness (>50%).

#     High Cardinality:
#         Apply dimensionality reduction (e.g., PCA) or a quantile-based transform (e.g., QuantileTransformer) to numerical features with a high number of unique values (>1000).

#     Special Cases:
#         Use Log Transformation for skewed distributions (e.g., long tails).
#         Apply clipping for extreme outliers based on percentiles (e.g., clipping to the 5th and 95th percentiles).
        



def analyze_numerical_features_enhanced_v2(
    numerical_df,
    y_feature=None,
    zscore_threshold=3,
    tukey_threshold=1.5,
    missing_threshold=0.5,
    high_cardinality_threshold=1000,
    debug=False
):
    """
    Analyzes numerical features and recommends preprocessing based on explicit rules.

    For every column except ``y_feature`` the function computes the number of
    unique values and the missing ratio, runs a Shapiro-Wilk normality test
    (only when more than 8 non-null values exist) and derives a preprocessing
    suggestion, an imputation recommendation and an outlier method.

    Args:
        numerical_df (pd.DataFrame): Numerical features to analyze.
        y_feature (str, optional): Column to skip (the target).
        zscore_threshold: Accepted for interface compatibility; not used by
            the analysis.
        tukey_threshold: Accepted for interface compatibility; not used by
            the analysis.
        missing_threshold (float): Missing ratio above which a feature is
            recommended for exclusion.
        high_cardinality_threshold (int): Unique-value count above which
            dimensionality reduction is suggested.
        debug (bool): Also print per-feature analysis errors.

    Returns:
        pd.DataFrame: One row per analyzed feature. The columns are always
        present, even when no feature could be analyzed. A feature whose
        analysis raises is skipped and a warning is logged.
    """
    result_columns = [
        "Feature",
        "Data Type",
        "Preprocessing Suggestion",
        "Reason",
        "Imputation Recommendation",
        "Imputation Reason",
        "Outlier Method",
        "Outlier Reason",
    ]
    feature_info = []
    total_rows = len(numerical_df)

    for col in numerical_df.columns:
        if col == y_feature:
            continue
        try:
            # Core Metrics
            unique_values = numerical_df[col].nunique()
            missing_values = numerical_df[col].isnull().sum()
            missing_ratio = missing_values / total_rows
            valid_values = numerical_df[col].dropna()

            # Normality test; stays None (treated as "not normal" below)
            # when there are too few samples to test.
            is_normal_shapiro = None
            if len(valid_values) > 8:  # Minimum samples for normality tests
                _, shapiro_p = shapiro(valid_values)
                is_normal_shapiro = shapiro_p > 0.05

            # Outlier Detection
            if is_normal_shapiro:
                outlier_method = "Z-Score"
                outlier_reason = "Assumes normal distribution for outlier detection."
            else:
                outlier_method = "Tukey's Method"
                outlier_reason = "Handles non-normal distributions effectively."

            # Preprocessing Suggestion
            if unique_values > high_cardinality_threshold:
                preprocessing = "Dimensionality Reduction"
                preprocessing_reason = "High cardinality can lead to overfitting; dimensionality reduction avoids it."
            elif unique_values < 10:
                preprocessing = "KBinsDiscretizer"
                preprocessing_reason = "Low cardinality; binning simplifies representation."
            elif is_normal_shapiro:
                preprocessing = "StandardScaler"
                preprocessing_reason = "Feature is normally distributed; scaling to zero mean and unit variance is recommended."
            elif len(valid_values) < 100:
                preprocessing = "Normalizer"
                preprocessing_reason = "Low sample size; normalization avoids over-scaling."
            else:
                preprocessing = "MinMaxScaler"
                preprocessing_reason = "Feature is not normally distributed; MinMax scaling normalizes values to [0,1]."

            # Imputation Recommendation
            if missing_ratio == 0:
                imputation = "No Imputation"
                imputation_reason = "No missing values."
            elif missing_ratio < 0.3:
                imputation = "SimpleImputer"
                imputation_reason = "Moderate missingness; mean or median imputation suffices."
            elif missing_ratio <= missing_threshold:
                imputation = "IterativeImputer"
                imputation_reason = "High missingness; multivariate imputation preserves feature relationships."
            else:
                imputation = "Exclude Feature"
                imputation_reason = "Exceeds missingness threshold."

            # Record Results
            feature_info.append({
                "Feature": col,
                "Data Type": str(numerical_df[col].dtype),
                "Preprocessing Suggestion": preprocessing,
                "Reason": f"Unique Values: {unique_values}, Missing Ratio: {missing_ratio:.2%}, {preprocessing_reason}",
                "Imputation Recommendation": imputation,
                "Imputation Reason": imputation_reason,
                "Outlier Method": outlier_method,
                "Outlier Reason": outlier_reason,
            })

        except Exception as e:
            # Best effort: keep analyzing the remaining features, but never
            # drop a feature from the report without leaving a trace.
            logging.warning("Skipping numerical feature %r: %s", col, e)
            if debug:
                print(f"Error analyzing feature {col}: {e}")

    # Explicit columns keep the schema stable when feature_info is empty
    return pd.DataFrame(feature_info, columns=result_columns)


def analyze_categorical_features(
    categorical_df,
    low_cardinality_threshold=10,
    high_cardinality_threshold=50,
    missing_threshold=0.3,
    debug=False
):
    """
    Analyzes categorical features for preprocessing and recommends imputers and encoders.

    For every column the function computes the number of unique values and
    the missing ratio, then derives an encoding suggestion from the
    cardinality thresholds and an imputation recommendation from the
    missing ratio.

    Args:
        categorical_df (pd.DataFrame): Categorical features to analyze.
        low_cardinality_threshold (int): Up to this many unique values the
            suggestion is "LabelEncoder".
        high_cardinality_threshold (int): Up to this many unique values the
            suggestion is "OneHotEncoder"; above it "Group or Target Encoding".
        missing_threshold (float): Missing ratio at or above which a feature
            is recommended for exclusion.
        debug (bool): Also print per-feature analysis errors.

    Returns:
        pd.DataFrame: One row per analyzed feature. The columns are always
        present, even when no feature could be analyzed. A feature whose
        analysis raises is skipped and a warning is logged.
    """
    result_columns = [
        "Feature",
        "Data Type",
        "Encoding Suggestion",
        "Reason",
        "Imputation Recommendation",
        "Imputation Reason",
    ]
    feature_info = []

    for col in categorical_df.columns:
        try:
            unique_values = categorical_df[col].nunique()
            missing_values = categorical_df[col].isnull().sum()
            missing_ratio = missing_values / len(categorical_df)

            # Encoding Suggestion
            if unique_values == 1:
                encoding = "Drop Column"
                encoding_reason = "Only one unique value; not useful for modeling."
            elif unique_values <= low_cardinality_threshold:
                encoding = "LabelEncoder"
                encoding_reason = "Low cardinality; integer encoding is efficient."
            elif unique_values <= high_cardinality_threshold:
                encoding = "OneHotEncoder"
                encoding_reason = "Moderate cardinality; one-hot encoding balances simplicity and precision."
            else:
                encoding = "Group or Target Encoding"
                encoding_reason = "High cardinality; direct encoding may lead to inefficiencies."

            # Imputation Recommendation
            if missing_ratio == 0:
                imputation = "No Imputation"
                imputation_reason = "No missing values."
            elif missing_ratio < missing_threshold:
                imputation = "SimpleImputer"
                imputation_reason = "Moderate missingness; most_frequent strategy is effective."
            else:
                imputation = "Exclude Feature"
                imputation_reason = "Exceeds missingness threshold."

            # Record Results
            feature_info.append({
                "Feature": col,
                "Data Type": str(categorical_df[col].dtype),
                "Encoding Suggestion": encoding,
                "Reason": f"Unique Values: {unique_values}, Missing Ratio: {missing_ratio:.2%}, {encoding_reason}",
                "Imputation Recommendation": imputation,
                "Imputation Reason": imputation_reason,
            })

        except Exception as e:
            # Best effort: keep analyzing the remaining features, but never
            # drop a feature from the report without leaving a trace.
            logging.warning("Skipping categorical feature %r: %s", col, e)
            if debug:
                print(f"Error analyzing feature {col}: {e}")

    # Explicit columns keep the schema stable when feature_info is empty
    return pd.DataFrame(feature_info, columns=result_columns)






if __name__ == "__main__":
    # Script entry point: load the selected-features dataset, split its
    # columns into categorical and numerical frames, run both analyses and
    # print the resulting recommendation tables.

    debug = True
    # Example parameter tuning
    zscore_threshold = 3
    tukey_threshold = 1.5
    # The four settings below are not referenced anywhere else in this block.
    max_rows_shapiro = 5000
    min_rows_normality_percentage = 0.05
    high_outlier_percentage = 5
    correlation_threshold = 0.8  # Threshold for multicollinearity check

    # File paths
    features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
    dataset_path = "../../data/processed/final_ml_dataset.csv"
    
    # from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
    # Example usage:
    final_ml_df_selected_features = load_selected_features_data(
        features_path=features_path,
        dataset_path=dataset_path,
        y_variable='result',
        debug=False
    )
    # The full dataset is read separately only so its shape can be printed
    # next to the shape of the selected-features frame.
    final_ml_df = pd.read_csv(dataset_path)
    print(" final_ml_df shape = ", final_ml_df.shape)
    print(" final_ml_df_selected_features shape = ", final_ml_df_selected_features.shape)
    # Step 1: Filter features
    categorical_df, numerical_df = filter_features_by_type(final_ml_df_selected_features, debug=debug)

    # Step 2: Analyze categorical features
    categorical_info_df = analyze_categorical_features(
        categorical_df,
        low_cardinality_threshold=10,
        high_cardinality_threshold=50,
        missing_threshold=0.3,
        debug=False
    )

    # Step 3: Analyze numerical features and handle outliers automatically
    # y_feature=None means no column is skipped as the target here.
    numerical_info_df = analyze_numerical_features_enhanced_v2(
        numerical_df,
        y_feature=None,
        zscore_threshold=zscore_threshold,
        tukey_threshold=tukey_threshold,
        missing_threshold=0.5,
        high_cardinality_threshold=1000,
        debug=False
    )

    # Display results
    print("\nCategorical Features Analysis:")
    print(categorical_info_df.to_string(index=False))

    print("\nNumerical Features Analysis:")
    print(numerical_info_df.to_string(index=False))