### Main function used for testing prior to use in Streamlit
Give the option of processing new data, selecting new features, or reviewing the categoricals and how they are handled:
- processing the ml dataset
  - option of supplying new data when processing the basketball data
- selecting new features:
  - options for how to handle categorical features
  - multi collinearity handling
  - feature importance review 
  - then select the features
- Preprocessing
  - SMOTE selection if prompted (otherwise use the automation)
    - option of selecting the SMOTE variant manually, or of adjusting the metrics that drive the automated SMOTE selection
    - option of checking the benefit of the smote selected
  - Great output of the selections chosen
  - Option of including the basic outlier methods
- Train:
  - option of training which model or all
  - option of adjusting the bayesian ranges for the models
- Predict:
  - options: which model to use, plus any other settings we can expose
- Shap Dashboard:
  - Options of which metrics to view for the graphs specifically and global recommendations for SHAP
- Bayesian Dashboard separately to find the best metrics globally for their probabilities as a concept
- 

In [None]:
import logging 
import pandas as pd
import numpy as np
import pickle


#----------Loading and prepping the dataset-----------------
from ml.load_and_feature_engineer_data import process_basketball_data

# Destination CSVs for the two datasets produced by the processing step.
# NOTE(review): these point at "../../../data/processed", while the later steps of this
# script read "../../data/processed/final_ml_dataset.csv" — confirm both resolve to the
# same directory for the working directory this is run from.
output_granular_path = "../../../data/processed/final_granular_dataset.csv"
output_ml_path = "../../../data/processed/final_ml_dataset.csv"

# Raw inputs: the participant metadata file and the trial directory of participant P0001.
player_info_path = "../../../SPL-Open-Data/basketball/freethrow/participant_information.json"
directory_path = "../../../SPL-Open-Data/basketball/freethrow/data/P0001"

# Build final_ml_dataset and final_granular_dataset.
# Purpose: feature engineer on a per-trial basis so the result can be verified on one example.
process_basketball_data(
    new_data=False,           # True appends to existing output, False overwrites it
    log_level=logging.DEBUG,  # most verbose logging for this test run
    debug=True,               # detailed debug logs from the processing step
    output_granular_path=output_granular_path,
    output_ml_path=output_ml_path,
    player_info_path=player_info_path,
    directory_path=directory_path,
)




#---------- feature selection (on entire dataset)-----------------
from ml.feature_selection.multicollinearity_checker import check_multicollinearity
from ml.feature_selection.feature_importance_calculator import calculate_feature_importance


# Load the ML dataset produced by the processing step.
file_path = "../../data/processed/final_ml_dataset.csv"
final_ml_df = pd.read_csv(file_path)

# Feature selection based on multicollinearity and random forest importance selection
target_variable = 'result'
correlation_threshold = 0.8
debug = True

# Hand-curated list of columns to remove to address collinearity
# (identifiers, landing/entry measurements, and per-joint power/energy statistics).
drop_features = [
    'trial_id', 'player_participant_id', 'landing_y', 'landing_x', 'entry_angle', 'shot_id',
    'L_KNEE_avg_power', 'L_WRIST_energy_std', 'R_WRIST_energy_max', 
    'R_ANKLE_energy_mean', 'R_5THFINGER_energy_std', 'R_KNEE_avg_power', 'L_1STFINGER_max_power', 
    'L_5THFINGER_energy_max', 'L_WRIST_max_power', 'R_HIP_energy_std', 'L_1STFINGER_energy_max', 
    'R_ANKLE_energy_max', 'R_ELBOW_energy_max', 'R_ANKLE_energy_std', 'L_WRIST_energy_max', 
    'player_estimated_hand_length_cm', 'player_estimated_standing_reach_cm', 
    'player_estimated_wingspan_cm', 'player_weight__in_kg', 'L_KNEE_energy_std', 'L_HIP_energy_max', 
    'L_ANKLE_energy_max', 'L_WRIST_std_power', 'L_ELBOW_std_power', 'R_KNEE_max_power', 
    'L_ELBOW_avg_power', 'R_ELBOW_min_power', 'L_WRIST_min_power', 'R_HIP_energy_mean', 
    'L_ELBOW_energy_max', 'L_ELBOW_min_power', 'R_1STFINGER_min_power', 'L_ANKLE_min_power', 
    'L_1STFINGER_avg_power', 'R_ANKLE_std_power', 'R_5THFINGER_avg_power', 'L_1STFINGER_energy_mean', 
    'R_HIP_max_power', 'R_WRIST_avg_power', 'R_ELBOW_energy_mean', 'L_WRIST_avg_power', 
    'L_1STFINGER_std_power', 'L_KNEE_energy_max', 'L_WRIST_energy_mean', 'R_KNEE_energy_std', 
    'L_HIP_energy_std', 'L_KNEE_energy_mean', 'R_WRIST_energy_mean', 'L_ELBOW_max_power', 
    'R_WRIST_energy_std', 'L_ANKLE_std_power', 'L_HIP_energy_mean', 'L_ELBOW_energy_mean', 
    'R_HIP_avg_power', 'L_HIP_std_power', 'R_KNEE_std_power', 'L_ANKLE_energy_std', 
    'release_frame_time', 'R_ANKLE_avg_power', 'L_ANKLE_max_power', 'L_5THFINGER_energy_std', 
    'R_WRIST_min_power', 'R_1STFINGER_energy_mean', 'R_ELBOW_energy_std', 'R_HIP_std_power', 
    'R_KNEE_energy_max', 'R_WRIST_std_power', 'L_1STFINGER_energy_std', 'L_HIP_avg_power', 
    'R_5THFINGER_energy_mean', 'R_ANKLE_max_power', 'L_ANKLE_avg_power', 'R_5THFINGER_max_power', 
    'R_5THFINGER_energy_max', 'L_5THFINGER_min_power', 'L_ELBOW_energy_std', 
    'R_1STFINGER_energy_max', 'R_KNEE_min_power', 'R_1STFINGER_energy_std', 
    'R_5THFINGER_std_power', 'L_1STFINGER_min_power', 'R_ELBOW_max_power', 'L_HIP_min_power', 
    'L_5THFINGER_std_power', 'R_1STFINGER_max_power', 'R_KNEE_energy_mean', 'L_5THFINGER_avg_power', 
    'L_5THFINGER_max_power', 'R_HIP_min_power', 'L_KNEE_max_power', 'R_5THFINGER_min_power', 
    'R_1STFINGER_std_power', 'R_ELBOW_avg_power', 'L_ANKLE_energy_mean', 'R_ELBOW_std_power', 
    'L_5THFINGER_energy_mean', 'R_1STFINGER_avg_power', 'R_HIP_energy_max', 'L_KNEE_std_power',
    'R_ANKLE_min_power', 'L_KNEE_min_power', 'L_HIP_max_power'
]

# Step 1: Check for multicollinearity
print("\nChecking for Multicollinearity...")
multicollinearity_df = check_multicollinearity(final_ml_df, threshold=correlation_threshold, debug=debug)

# Step 2: Handle multicollinearity
if not multicollinearity_df.empty:
    # Report every highly correlated pair that was found.
    for index, row in multicollinearity_df.iterrows():
        feature1, feature2, correlation = row['Feature1'], row['Feature2'], row['Correlation']
        print(f"High correlation ({correlation}) between '{feature1}' and '{feature2}'.")

    # Drop the curated columns once, after the report. The drop list does not depend on
    # the individual pair, so repeating the drop (and its message) per pair was redundant.
    # Only columns actually present are dropped so the reported count is accurate.
    dropped_columns = [column for column in drop_features if column in final_ml_df.columns]
    final_ml_df = final_ml_df.drop(columns=dropped_columns)

    print(f"Dropped {len(dropped_columns)} features: {', '.join(dropped_columns)}")
else:
    print("No multicollinearity issues detected.")

# Step 3: Calculate feature importance
print("\nCalculating Feature Importance...")
feature_importances = calculate_feature_importance(
    final_ml_df, target_variable=target_variable, n_estimators=100, random_state=42, debug=debug
)

print("\nFinal Feature Importances:")
print(feature_importances.to_string(index=False))


# Final Decisions:
# Features recommended for dropping. No explicit drop is performed for these here;
# they are excluded because they are absent from final_keep_list below.
features_to_drop = [
    'peak_height_relative'
]
print(f"Dropped features (for redundancy or multicollinearity): {', '.join(features_to_drop)}")

# Final features to retain for classification (includes the target column 'result')
final_keep_list = [
    'release_ball_direction_x' ,'release_ball_direction_z', 'release_ball_direction_y',
    'elbow_release_angle', 'elbow_max_angle',
    'wrist_release_angle', 'wrist_max_angle',
    'knee_release_angle', 'knee_max_angle',
    'result', 'release_ball_speed',
    'release_ball_velocity_x', 'release_ball_velocity_y','release_ball_velocity_z',
    'calculated_release_angle'
]
# Apply the filter to keep only these columns
final_ml_df_selected_features = final_ml_df[final_keep_list]
print(f"Retained {len(final_keep_list)} features: {', '.join(final_keep_list)}")

# Save feature names to a file so later pipeline steps can reload the same column list
with open('../../data/model/pipeline/final_ml_df_selected_features_columns.pkl', 'wb') as f:
    pickle.dump(final_ml_df_selected_features.columns.tolist(), f)

#---------STOP: CHECK ON CATEGORICALS AND IF YOU WANT TO BIN, WILL BE REMINDED AFTER DATAPREPROCESSOR, DO IT HERE OR DO IT WITH FEATURE-ENGINEERING PACKAGE--------
from ml.feature_selection.categorize_categoricals import transform_features_with_bins
# Bin configuration: for each player measurement, three half-open ranges
# [0, first_cut, second_cut, inf] paired with three category labels.
category_bin_config = {
    column_name: {'bins': [0, first_cut, second_cut, np.inf], 'labels': tier_labels}
    for column_name, first_cut, second_cut, tier_labels in (
        ('player_height_in_meters', 1.80, 2.00, ["Short", "Average", "Tall"]),
        ('player_weight__in_kg', 75, 95, ["Lightweight", "Average", "Heavy"]),
        ('player_estimated_wingspan_cm', 190, 220, ["Small", "Medium", "Large"]),
        ('player_estimated_standing_reach_cm', 230, 250, ["Short", "Average", "Tall"]),
        ('player_estimated_hand_length_cm', 20, 25, ["Small", "Medium", "Large"]),
    )
}

# Persist the configuration so other pipeline steps can reuse it ...
with open('../../data/model/pipeline/category_bin_config.pkl', 'wb') as f:
    f.write(pickle.dumps(category_bin_config))

# ... and read it straight back, so the transform below runs on the stored version.
with open('../../data/model/pipeline/category_bin_config.pkl', 'rb') as f:
    loaded_category_bin_config = pickle.loads(f.read())

# Reload the full ML dataset written by the data-processing step.
file_path = "../../data/processed/final_ml_dataset.csv"
final_ml_df = pd.read_csv(file_path)

# Step 1: derive the categorized columns from the stored bin configuration.
# (`debug` is the flag defined earlier in this script.)
categorized_columns_df = transform_features_with_bins(
    final_ml_df,
    loaded_category_bin_config,
    debug=debug,
)

# Step 2: place the categorized columns alongside the original ones.
final_ml_df_categoricals = pd.concat((final_ml_df, categorized_columns_df), axis="columns")


# from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
# # Example usage:
# final_ml_df_selected_features = load_selected_features_data(
#     features_path='../../data/model/pipeline/final_ml_df_selected_features_columns.pkl',
#     dataset_path='../../data/processed/final_ml_dataset.csv',
#     category_bin_config_path='../../data/model/pipeline/category_bin_config.pkl',
#     y_variable='result',
#     debug=False
# )





# #-----------------Preprocessing (within Predict and Training so Commented Out)---------------------
# #--Preprocessing--
# # Additions 
#     # - add in the option to change the smote technique to the datapreprocessor if chosen to not automate
# from ml.classification_preprocessor.datapreprocessor_class import DataPreprocessor

# # File paths
# features_path = '../../data/model/pipeline/final_ml_df_selected_features_columns.pkl'
# dataset_path = "../../data/processed/final_ml_dataset.csv"
# assets_path = '../../data/model/pipeline/preprocessing_assets.pkl'
# category_bin_config_path = '../../data/model/pipeline/category_bin_config.pkl'

# # Example 1: With train-test split
# dp_split = DataPreprocessor(
#     features_path=features_path,
#     dataset_path=dataset_path,
#     assets_path=assets_path,
#     category_bin_config_path=category_bin_config_path,
#     y_variable='result',
#     perform_split=True,
#     test_size=0.3,
#     random_state=123,
#     stratify=True,
# )
# (
#     X_train_transformed,
#     X_test_transformed,
#     y_train_encoded,
#     y_test_encoded,
#     transformed_data_train,
#     transformed_train_df,  # Include the additional output
#     X_train,
#     y_train,
# ) = dp_split.run()


# print(f"X_train_transformed shape: {X_train_transformed.shape}")
# print(f"X_test_transformed shape: {X_test_transformed.shape}")
# print(f"Transformed Data Train: {transformed_data_train.keys()}")
# print(f"Original X_train shape: {X_train.shape}")
# print(f"Original y_train shape: {y_train.shape}")

# # Example 2: Without train-test split
# dp_no_split = DataPreprocessor(
#     features_path=features_path,
#     dataset_path=dataset_path,
#     assets_path=assets_path,
#     category_bin_config_path=category_bin_config_path,
#     y_variable='result',
#     perform_split=False,
# )
# X_transformed, y_encoded, transformed_data, transformed_data_df, original_data, X, y = dp_no_split.run()

# print(f"X_transformed shape: {X_transformed.shape}")
# print(f"Transformed Data: {transformed_data.keys()}")
# print(f"Transformed Data: {transformed_data_df.columns}")
# print(f"Original X shape: {X.shape}")
# print(f"Original y shape: {y.shape}")

# #-------------Preprocessing with Optimization Ranges------------
# dp = DataPreprocessor(
#     features_path=features_path,
#     dataset_path=dataset_path,
#     assets_path=assets_path,
#     category_bin_config_path=category_bin_config_path,
#     y_variable="result",
#     optimization_columns=["knee_max_angle", "wrist_max_angle", "elbow_max_angle"],
#     perform_split=False,
# )

# results = dp.run(return_optimization_ranges=True)
# X_transformed, y_encoded, transformed_data, transformed_data_df, original_data, X, y, optimization_ranges, optimization_transformed_ranges = results

# print(f"Optimization Ranges: {optimization_ranges}")
# print(f"optimization_transformed_ranges: {optimization_transformed_ranges}")
# print(f"original_data shape: {original_data.shape}")
# print(f"X shape: {X.shape}")
# print(f"X_transformed shape: {X_transformed.shape}")

# #-----------------Inverse Preprocessing(within Predict and Training so Commented Out)---------------------
# from ml.classification_processors.inverse_preprocessor_class import InversePreprocessor



# # Example 1: With train-test split
# dp_split = DataPreprocessor(
#     features_path=features_path,
#     dataset_path=dataset_path,
#     assets_path=assets_path,
#     category_bin_config_path=category_bin_config_path,
#     y_variable='result',
#     perform_split=True,
#     test_size=0.3,
#     random_state=123,
#     stratify=True,
# )
# (
#     X_train_transformed,
#     X_test_transformed,
#     y_train_encoded,
#     y_test_encoded,
#     transformed_data_train,
#     transformed_train_df,  # Include the additional output
#     X_train,
#     y_train,
# ) = dp_split.run()


# print(f"X_train_transformed shape: {X_train_transformed.shape}")
# print(f"X_test_transformed shape: {X_test_transformed.shape}")
# print(f"Transformed Data Train: {transformed_data_train.keys()}")
# print(f"Original X_train shape: {X_train.shape}")
# print(f"Original y_train shape: {y_train.shape}")


# # Initialize InversePreprocessor with assets_path (no need to preload assets manually)
# inverse_transformer = InversePreprocessor(
#     assets_path=assets_path,  # Assets will be loaded automatically
#     debug=True
# )

# # Perform inverse transformation and combine with targets
# final_dataset = inverse_transformer.transform(
#     original_data=X_train,
#     transformed_data=transformed_data_train,
#     y_encoded=y_train_encoded
# )

# # Example: Append specified columns from original_data to final_dataset
# columns_to_append = ['player_height_in_meters', 'player_weight__in_kg']  # Example columns
# final_dataset = inverse_transformer.append_columns_from_original(
#     final_dataset=final_dataset,
#     original_data=X_train,
#     columns_to_append=columns_to_append,
#     debug=True
# )

# # Display the resulting dataset
# print("[Final Dataset]:")
# print(final_dataset.head())
# print(final_dataset.shape)

# # ---------------Example 2: Without train-test split------------------
# dp_no_split = DataPreprocessor(
#     features_path=features_path,
#     dataset_path=dataset_path,
#     assets_path=assets_path,
#     category_bin_config_path=category_bin_config_path,
#     y_variable='result',
#     perform_split=False,
# )
# X_transformed, y_encoded, transformed_data, transformed_data_df, original_data, X, y = dp_no_split.run()

# print(f"X_transformed shape: {X_transformed.shape}")
# print(f"Transformed Data: {transformed_data.keys()}")
# print(f"Transformed Data: {transformed_data_df.columns}")
# print(f"Original X shape: {X.shape}")
# print(f"Original y shape: {y.shape}")

# # Inverse transformation for Example 2: Without train-test split
# print("\n[Example 2: Without Train-Test Split]")
# inverse_transformer = InversePreprocessor(
#     assets_path=assets_path,  # Assets will be loaded automatically
#     debug=True
# )

# # Perform inverse transformation and combine with targets
# final_dataset_no_split = inverse_transformer.transform(
#     original_data=X,
#     transformed_data=transformed_data,
#     y_encoded=y_encoded
# )

# # Example: Append specified columns from original_data to final_dataset_no_split
# columns_to_append_no_split = ['trial_id']  # Example columns
# final_dataset_no_split = inverse_transformer.append_columns_from_original(
#     final_dataset=final_dataset_no_split,
#     original_data=original_data,
#     columns_to_append=columns_to_append_no_split,
#     debug=True
# )

# # Display the resulting dataset
# print("[Final Dataset without Train-Test Split]:")
# print(f"Final Dataset shape (No Split): {final_dataset_no_split.shape}")
# print(f"Original Dataset shape (With Optimization Ranges): {original_data.shape}")


# #-------------example 3: Preprocessing with Optimization Ranges------------
# dp = DataPreprocessor(
#     features_path=features_path,
#     dataset_path=dataset_path,
#     assets_path=assets_path,
#     category_bin_config_path=category_bin_config_path,
#     y_variable="result",
#     optimization_columns=["knee_max_angle", "wrist_max_angle", "elbow_max_angle"],
#     perform_split=False,
# )

# results = dp.run(return_optimization_ranges=True)
# X_transformed, y_encoded, transformed_data, transformed_data_df, original_data, X, y, optimization_ranges, optimization_transformed_ranges = results

# print(f"Optimization Ranges: {optimization_ranges}")
# print(f"optimization_transformed_ranges: {optimization_transformed_ranges}")

# # Inverse transformation for Example 3: Preprocessing with Optimization Ranges
# print("\n[Example 3: Preprocessing with Optimization Ranges]")
# inverse_transformer = InversePreprocessor(
#     assets_path=assets_path,  # Assets will be loaded automatically
#     debug=True
# )

# # Perform inverse transformation and combine with targets
# final_dataset_with_optimization = inverse_transformer.transform(
#     original_data=X,
#     transformed_data=transformed_data,
#     y_encoded=y_encoded
# )

# # Example: Append specified columns from original_data to final_dataset_with_optimization
# columns_to_append_with_opt = ["trial_id"]  # Example columns
# final_dataset_with_optimization = inverse_transformer.append_columns_from_original(
#     final_dataset=final_dataset_with_optimization,
#     original_data=original_data,
#     columns_to_append=columns_to_append_with_opt,
#     debug=True
# )

# # Inverse transform optimization parameters
# inverse_optimization_params = inverse_transformer.inverse_transform_optimization_params(
#     params=pd.DataFrame(optimization_transformed_ranges),
#     optimization_columns=["knee_max_angle", "wrist_max_angle", "elbow_max_angle"]
# )

# # Display the resulting datasets
# print(f"Final Dataset shape (With Optimization Ranges): {final_dataset_with_optimization.shape}")
# print(f"Original Dataset shape (With Optimization Ranges): {original_data.shape}")

# print("\n[Inverse-Transformed Optimization Parameters]:")
# print(inverse_optimization_params)
# print(f"Inverse-Transformed Optimization Parameters shape: {inverse_optimization_params.shape}")

#-----------------mlflow functions---------------------
from ml.mlflow.mlflow_logger import MLflowLogger
    
#-----------------Training---------------------



#-----------------Predicting---------------------




#-----------------Shap Dashboard---------------------
#-----------------Experimental: Generalized Bayesian Optimized Metrics Dashboard---------------------
#-----------------Animation---------------------




### eliminate repeated code

In [None]:
# main_script.py

import logging
import os
import pickle
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from ml.load_and_feature_engineer_data import process_basketball_data
from ml.feature_selection.categorize_categoricals import transform_features_with_bins
from ml.feature_selection.multicollinearchk_featimportancechk import (
    check_multicollinearity,
    calculate_feature_importance
)
from ml.feature_selection.data_loader_post_select_features import load_selected_features_data
from ml.classification_preprocessor.smote_automation import (
    check_dataset_for_smote,
    apply_smote
)
from ml.classification_preprocessor.datapreprocessor_class import DataPreprocessor
from ml.classification_processors.inverse_preprocessor_class import InversePreprocessor
from ml.mlflow.mlflow_logger import MLflowLogger
from ml.train import bayes_best_model_train
from shap.shap_utils import compute_shap_values, plot_shap_summary
from utils.naming_utils import generate_output_filename

# Configure the root logger (INFO level, timestamped records) and create this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def process_ml_dataset(base_dir, participant_info_path, processed_data_dir, pipeline_assets_dir, debug=False):
    """
    Run the basketball data processing step and report where its outputs were written.

    Output file names come from ``generate_output_filename`` and are placed inside
    ``processed_data_dir``; existing output is overwritten (``new_data=False``).

    Parameters:
        base_dir (str): Directory path to the raw basketball data.
        participant_info_path (str): Path to the participant information JSON file.
        processed_data_dir (str): Directory that receives the processed datasets.
        pipeline_assets_dir (str): Directory for pipeline assets (accepted but not
            used by this function).
        debug (bool): When True, run the processing step at DEBUG log level and log
            the output paths; when False, print a one-line completion message.

    Returns:
        tuple: (ML dataset path, granular dataset path).
    """
    # Resolve both destination paths; names are generated in ML-then-granular order.
    ml_csv_path, granular_csv_path = (
        os.path.join(processed_data_dir, generate_output_filename(stem))
        for stem in ("final_ml_dataset", "final_granular_dataset")
    )

    verbosity = logging.INFO
    if debug:
        verbosity = logging.DEBUG

    process_basketball_data(
        directory_path=base_dir,
        player_info_path=participant_info_path,
        output_ml_path=ml_csv_path,
        output_granular_path=granular_csv_path,
        debug=debug,
        log_level=verbosity,
        new_data=False,  # True would append to existing output; False overwrites
    )

    if not debug:
        print("Step [Process ML Dataset] completed.")
    else:
        logger.debug(f"Processed ML Dataset saved at: {ml_csv_path}")
        logger.debug(f"Processed Granular Dataset saved at: {granular_csv_path}")

    return ml_csv_path, granular_csv_path

def select_new_features(processed_ml_path, pipeline_assets_dir, debug=False):
    """
    Select new features by handling categoricals, checking multicollinearity, and reviewing feature importance.

    Steps performed:
        1. Save the category bin configuration to ``pipeline_assets_dir`` and load it back.
        2. Load the processed ML dataset and append the binned (categorized) columns.
        3. Check multicollinearity and drop one feature of each highly correlated pair,
           never dropping the target or a column from the final keep list.
        4. Calculate feature importance (printed when ``debug`` is True).
        5. Keep the fixed final feature list and pickle its column names.

    Parameters:
        processed_ml_path (str): Path to the processed ML dataset CSV.
        pipeline_assets_dir (str): Directory to save pipeline assets.
        debug (bool): Enable detailed logging and validation outputs.

    Returns:
        str: Path to the selected features pickle file.

    Raises:
        KeyError: If a column of the final keep list is missing from the dataset.
    """
    target_variable = 'result'
    correlation_threshold = 0.8

    # Final features retained for classification (includes the target column).
    final_keep_list = [
        'release_ball_direction_x', 'release_ball_direction_z', 'release_ball_direction_y',
        'elbow_release_angle', 'elbow_max_angle', 'wrist_release_angle', 'wrist_max_angle',
        'knee_release_angle', 'knee_max_angle', 'result', 'release_ball_speed',
        'release_ball_velocity_x', 'release_ball_velocity_y', 'release_ball_velocity_z',
        'calculated_release_angle'
    ]
    # Columns the multicollinearity step must never remove: dropping any of them would
    # make the final selection below fail with a KeyError.
    protected_columns = set(final_keep_list) | {target_variable}

    # Define bin configuration
    category_bin_config = {
        'player_height_in_meters': {
            'bins': [0, 1.80, 2.00, np.inf],
            'labels': ["Short", "Average", "Tall"]
        },
        'player_weight__in_kg': {
            'bins': [0, 75, 95, np.inf],
            'labels': ["Lightweight", "Average", "Heavy"]
        },
        'player_estimated_wingspan_cm': {
            'bins': [0, 190, 220, np.inf],
            'labels': ["Small", "Medium", "Large"]
        },
        'player_estimated_standing_reach_cm': {
            'bins': [0, 230, 250, np.inf],
            'labels': ["Short", "Average", "Tall"]
        },
        'player_estimated_hand_length_cm': {
            'bins': [0, 20, 25, np.inf],
            'labels': ["Small", "Medium", "Large"]
        }
    }

    # Save bin configuration with dynamic naming
    bin_config_filename = generate_output_filename("category_bin_config")
    bin_config_path = os.path.join(pipeline_assets_dir, bin_config_filename)
    with open(bin_config_path, 'wb') as f:
        pickle.dump(category_bin_config, f)
    logger.info("Category bin configuration saved as %s", bin_config_filename)

    # Load bin configuration back so the transform uses the persisted version
    with open(bin_config_path, 'rb') as f:
        loaded_category_bin_config = pickle.load(f)
    logger.debug("Category bin configuration loaded.")

    # Load the processed ML dataset
    final_ml_df = pd.read_csv(processed_ml_path)
    logger.debug("ML dataset loaded with shape: %s", final_ml_df.shape)

    # Transform features with bins
    categorized_columns_df = transform_features_with_bins(
        final_ml_df, loaded_category_bin_config, debug=debug
    )

    # Combine original DataFrame with categorized columns
    final_ml_df_categoricals = pd.concat([final_ml_df, categorized_columns_df], axis=1)
    logger.debug("Combined DataFrame shape: %s", final_ml_df_categoricals.shape)

    # Fail fast, before the expensive analysis, if a required column is absent.
    missing_columns = [
        column for column in final_keep_list if column not in final_ml_df_categoricals.columns
    ]
    if missing_columns:
        raise KeyError(f"Required feature columns missing from dataset: {missing_columns}")

    # Multicollinearity and Feature Importance
    multicollinearity_df = check_multicollinearity(
        final_ml_df_categoricals, threshold=correlation_threshold, debug=debug
    )

    # Features to drop based on the multicollinearity analysis
    drop_features = []

    if not multicollinearity_df.empty:
        for _, row in multicollinearity_df.iterrows():
            feature1, feature2, correlation = row['Feature1'], row['Feature2'], row['Correlation']
            logger.debug(
                "High correlation (%s) between '%s' and '%s'.", correlation, feature1, feature2
            )
            # Prefer dropping the second feature; fall back to the first when the second
            # is protected; keep both when both are protected.
            if feature2 not in protected_columns:
                drop_features.append(feature2)
            elif feature1 not in protected_columns:
                drop_features.append(feature1)
            else:
                logger.debug(
                    "Keeping both '%s' and '%s': both are required features.", feature1, feature2
                )
        # Remove duplicates while keeping a deterministic (first-seen) order
        drop_features = list(dict.fromkeys(drop_features))
        final_ml_df_categoricals = final_ml_df_categoricals.drop(
            columns=drop_features, errors='ignore'
        )
        logger.info(
            "Dropped %d features due to multicollinearity: %s",
            len(drop_features), ', '.join(drop_features)
        )
    else:
        logger.info("No multicollinearity issues detected.")

    # Calculate feature importance
    feature_importances = calculate_feature_importance(
        final_ml_df_categoricals, target_variable=target_variable, n_estimators=100,
        random_state=42, debug=debug
    )
    logger.debug("Feature importance calculated.")

    if debug:
        print("\nFinal Feature Importances:")
        print(feature_importances.to_string(index=False))
    else:
        print("Step [Feature Selection] completed.")

    # Final feature selection
    final_ml_df_selected_features = final_ml_df_categoricals[final_keep_list]
    logger.info("Retained %d features for classification.", len(final_keep_list))

    # Save selected feature names with dynamic naming
    selected_features_filename = generate_output_filename("final_ml_df_selected_features_columns", "selected_features", "pkl")
    selected_features_path = os.path.join(pipeline_assets_dir, selected_features_filename)
    with open(selected_features_path, 'wb') as f:
        pickle.dump(final_ml_df_selected_features.columns.tolist(), f)
    logger.info("Selected feature names saved as %s", selected_features_filename)

    return selected_features_path

def preprocess_data(selected_features_path, processed_ml_path, pipeline_assets_dir, debug=False):
    """
    Split the selected-features dataset and rebalance the training split with SMOTE.

    The test split is returned untouched; only the training split is resampled.
    Outlier handling is not implemented here yet.

    Parameters:
        selected_features_path (str): Path to the selected features pickle file.
        processed_ml_path (str): Path to the processed ML dataset CSV.
        pipeline_assets_dir (str): Directory containing pipeline assets (the category
            bin configuration is looked up here).
        debug (bool): Enable detailed logging and validation outputs.

    Returns:
        tuple: (X_resampled, X_test, y_resampled, y_test) — resampled training
        features, untouched test features, resampled training target, untouched
        test target.
    """
    # Load selected features data
    # NOTE(review): the bin-config path is rebuilt via generate_output_filename —
    # presumably it returns the same name that was used when the config was saved; confirm.
    final_ml_df_selected_features = load_selected_features_data(
        features_path=selected_features_path,
        dataset_path=processed_ml_path,
        category_bin_config_path=os.path.join(pipeline_assets_dir, generate_output_filename("category_bin_config")),
        y_variable='result',
        debug=debug
    )

    # Initial Dataset Info
    logger.info("[Initial Dataset Info]")
    logger.info("Columns to work with: %s", final_ml_df_selected_features.columns.tolist())

    # Split dataset into features (X) and target (y)
    X = final_ml_df_selected_features.drop(columns=['result'])
    y = final_ml_df_selected_features['result']

    # Train-test split (stratified so both splits keep the class proportions)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    logger.info("[Train-Test Split]")
    logger.info("X_train Shape: %s", X_train.shape)
    logger.info("X_test Shape: %s", X_test.shape)
    logger.info("y_train Shape: %s", y_train.shape)
    logger.info("y_test Shape: %s", y_test.shape)

    # Analyze the training split and apply the recommended SMOTE variant to it only
    smote_analysis = check_dataset_for_smote(X_train, y_train, debug=debug)
    logger.info("SMOTE Analysis Recommendations: %s", smote_analysis['recommendations'])

    X_resampled, y_resampled, smote_used = apply_smote(
        X_train, y_train, smote_analysis["recommendations"], debug=debug
    )
    logger.info("Applied SMOTE Variant: %s", smote_used)
    logger.info("Resampled Class Distribution: %s", Counter(y_resampled))

    # Record the SMOTE technique used (logged only; nothing is written to disk).
    # Uses the module logger, consistent with every other message in this file.
    logger.info("SMOTE Technique Used: %s", smote_used)

    # Optionally include outlier detection methods here
    # (Implementation depends on specific requirements)

    if debug:
        print("Preprocessing completed with SMOTE applied.")
    else:
        print("Step [Preprocessing] completed.")

    return X_resampled, X_test, y_resampled, y_test

def train_model(selected_features_path, processed_ml_path, pipeline_assets_dir, model_save_dir, tuning_results_save,
                classification_save_path, debug=False, selected_models="XGBoost",
                mlflow_tracking_uri="file:///absolute/path/to/mlruns"):
    """
    Train machine learning models with hyperparameter tuning.

    Runs the DataPreprocessor with a 70/30 stratified split, then hands the
    transformed splits to ``bayes_best_model_train`` using "Log Loss" as the
    selection metric and PCA enabled.

    Parameters:
        selected_features_path (str): Path to the selected features pickle file.
        processed_ml_path (str): Path to the processed ML dataset CSV.
        pipeline_assets_dir (str): Directory containing pipeline assets.
        model_save_dir (str): Directory to save trained models.
        tuning_results_save (str): Path to save hyperparameter tuning results.
        classification_save_path (str): Path to save classification reports.
        debug (bool): Enable detailed logging and validation outputs.
        selected_models: Passed through to ``bayes_best_model_train``. Defaults to
            "XGBoost"; can choose decision tree or random forest, or set to None to
            find the best of all 3.
        mlflow_tracking_uri (str): MLflow tracking URI. The default is a placeholder
            path — pass a real location (and ensure it exists) for actual runs.

    Returns:
        None
    """
    # Initialize MLflow Logger
    mlflow_logger = MLflowLogger(
        tracking_uri=mlflow_tracking_uri,  # Ensure this path exists
        experiment_name="Model_Tuning_and_Evaluation",
        enable_mlflow=True
    )

    # Data Preprocessing
    # NOTE(review): asset and bin-config paths are rebuilt via generate_output_filename —
    # presumably it yields the same names used when those files were saved; confirm.
    dp = DataPreprocessor(
        features_path=selected_features_path,
        dataset_path=processed_ml_path,
        assets_path=os.path.join(pipeline_assets_dir, generate_output_filename("preprocessing_assets")),
        category_bin_config_path=os.path.join(pipeline_assets_dir, generate_output_filename("category_bin_config")),
        y_variable='result',
        perform_split=True,
        test_size=0.3,
        random_state=123,
        stratify=True,
        optimization_columns=["knee_max_angle", "wrist_max_angle", "elbow_max_angle"],
        debug=debug
    )

    # With a split and optimization ranges requested, run() is unpacked as a 10-tuple;
    # only the first four items (transformed splits and encoded targets) are needed here.
    results = dp.run(return_optimization_ranges=True)
    X_train_transformed, X_test_transformed, y_train_encoded, y_test_encoded, _, _, _, _, _, _ = results
    logger.debug("Data preprocessing for model training completed.")

    # Model Training and Hyperparameter Tuning
    logger.info("Starting hyperparameter tuning and evaluation...")
    bayes_best_model_train(
        X_train=X_train_transformed,
        y_train=y_train_encoded,
        X_test=X_test_transformed,
        y_test=y_test_encoded,
        selection_metric="Log Loss",
        use_pca=True,
        save_dir=tuning_results_save,
        model_save_dir=model_save_dir,
        classification_save_path=classification_save_path,
        selected_models=selected_models,
        mlflow_logger=mlflow_logger
    )

    if debug:
        print("Model training and hyperparameter tuning completed.")
    else:
        print("Step [Train Model] completed.")

def predict_and_evaluate(model_save_dir, pipeline_assets_dir, processed_ml_path, debug=False, model_name="XGBoost"):
    """
    Load a saved model, predict on a held-out split, rebuild an interpretable
    dataset through the inverse preprocessor, and plot a SHAP summary.

    Parameters:
        model_save_dir (str): Directory where models are saved.
        pipeline_assets_dir (str): Directory containing pipeline assets.
        processed_ml_path (str): Path to the processed ML dataset CSV.
        debug (bool): Enable detailed logging and validation outputs.
        model_name (str): Name of the saved model to load. Defaults to
            "XGBoost", the value that used to be hard-coded in this function.

    Returns:
        None
    """
    target_column = 'result'

    # Locate the pickled list of selected feature columns
    selected_features_filename = generate_output_filename(
        "final_ml_df_selected_features_columns", "selected_features", "pkl"
    )
    selected_features_path = os.path.join(pipeline_assets_dir, selected_features_filename)

    # Load the processed ML dataset restricted to the selected features
    final_ml_df_selected_features = load_selected_features_data(
        features_path=selected_features_path,
        dataset_path=processed_ml_path,
        category_bin_config_path=os.path.join(
            pipeline_assets_dir, generate_output_filename("category_bin_config")
        ),
        y_variable=target_column,
        debug=debug
    )

    # Split dataset into features (X) and target (y)
    X = final_ml_df_selected_features.drop(columns=[target_column])
    y = final_ml_df_selected_features[target_column]

    # NOTE(review): this split (test_size=0.2, random_state=42) is not the one the
    # training step configures on DataPreprocessor (test_size=0.3, random_state=123),
    # so rows in X_test may have been seen during training - confirm and align.
    # The held-out target is not used yet, so it is discarded here.
    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Load the requested trained model
    logger.info("Loading model: %s", model_name)
    loaded_model = load_model(model_name, save_dir=model_save_dir)

    # NOTE(review): the training step fits on transformed features, while X_test
    # here is untransformed - presumably the saved model must embed its own
    # preprocessing for this call to be valid; verify against load_model.
    y_pred = loaded_model.predict(X_test)
    logger.info("Predictions from loaded model: %s", y_pred)

    # Initialize Inverse Preprocessor (assets are loaded from assets_path)
    preprocessing_assets_filename = generate_output_filename("preprocessing_assets")
    preprocessing_assets_path = os.path.join(pipeline_assets_dir, preprocessing_assets_filename)
    inverse_transformer = InversePreprocessor(
        assets_path=preprocessing_assets_path,
        debug=debug
    )

    # Perform inverse transformation and combine with targets
    final_dataset = inverse_transformer.transform(
        original_data=X_train,
        transformed_data=None,  # Adjust as per your pipeline
        y_encoded=y_train  # Adjust encoding as necessary
    )

    # Append specified columns taken from the original (untransformed) data
    columns_to_append = ['player_height_in_meters', 'player_weight__in_kg']
    final_dataset = inverse_transformer.append_columns_from_original(
        final_dataset=final_dataset,
        original_data=X_train,
        columns_to_append=columns_to_append,
        debug=debug
    )

    # Display the resulting dataset
    print("[Final Dataset]:")
    print(final_dataset.head())
    print(final_dataset.shape)

    # Compute SHAP values on the held-out features and plot the summary
    shap_values = compute_shap_values(loaded_model, X_test)
    plot_shap_summary(shap_values, X_test)

    if debug:
        print("Prediction and evaluation completed.")
    else:
        print("Step [Predict and Evaluate] completed.")

def main():
    """
    Run the machine learning pipeline end to end.

    Steps, in order: process the ML dataset, select features, preprocess,
    train the model, then predict and evaluate.
    """
    # Verbosity switch shared by every step (False minimizes outputs)
    verbose = True

    # Raw inputs
    raw_trials_dir = "../../../SPL-Open-Data/basketball/freethrow/data/P0001"
    participants_json = "../../../SPL-Open-Data/basketball/freethrow/participant_information.json"

    # Output and artifact locations
    processed_dir = "../../data/processed/"
    assets_dir = '../../data/model/pipeline/'
    models_dir = "../../data/model"
    tuning_json = "../../data/model/tuning_results/tuning_results.json"
    reports_txt = "../../data/model/classification_reports/classification_reports.txt"

    # Step 1: Process ML Dataset
    ml_csv_path, _granular_csv_path = process_ml_dataset(
        base_dir=raw_trials_dir,
        participant_info_path=participants_json,
        processed_data_dir=processed_dir,
        pipeline_assets_dir=assets_dir,
        debug=verbose,
    )

    # Keyword arguments that every later step receives unchanged
    shared_kwargs = {
        "processed_ml_path": ml_csv_path,
        "pipeline_assets_dir": assets_dir,
        "debug": verbose,
    }

    # Step 2: Select New Features
    features_pkl_path = select_new_features(**shared_kwargs)

    # Step 3: Preprocess Data (the four returned splits are not consumed below)
    _X_resampled, _X_test, _y_resampled, _y_test = preprocess_data(
        selected_features_path=features_pkl_path,
        **shared_kwargs,
    )

    # Step 4: Train Model
    train_model(
        selected_features_path=features_pkl_path,
        model_save_dir=models_dir,
        tuning_results_save=tuning_json,
        classification_save_path=reports_txt,
        **shared_kwargs,
    )

    # Step 5: Predict and Evaluate
    predict_and_evaluate(model_save_dir=models_dir, **shared_kwargs)

# Entry-point guard: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
