# In [None]:
        
# main.py

import pandas as pd
import logging
import os
import yaml
import joblib
# from data_preprocessor import DataPreprocessor
# from clustering_module import ClusteringModule  # Ensure this is implemented
# from feature_manager import FeatureManager  # Ensure this is implemented

def load_dataset(path: str) -> pd.DataFrame:
    """
    Read a CSV dataset from disk into a DataFrame.

    Args:
        path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: Contents of the CSV file.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    dataset_exists = os.path.exists(path)
    if dataset_exists:
        return pd.read_csv(path)
    raise FileNotFoundError(f"Dataset not found at {path}")

def load_config(config_path: str) -> dict:
    """
    Load and parse the YAML configuration file.

    Args:
        config_path (str): Path to the preprocessor_config.yaml file.

    Returns:
        dict: Parsed configuration. An empty (or comment-only) file yields
        an empty dict so callers can always use ``.get(...)`` on the result.

    Raises:
        FileNotFoundError: If nothing exists at ``config_path``.
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Configuration file not found at {config_path}")
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    # safe_load returns None for an empty document; normalise that to {} to
    # honour the declared return type.
    return {} if config is None else config

def construct_filepath(output_dir: str, identifier: str, dataset_key: str) -> str:
    """
    Build the ``.pkl`` path used when saving models and preprocessors.

    Args:
        output_dir (str): Directory the file path should point into.
        identifier (str): Identifier for the file (e.g., 'trained_model', 'preprocessor').
        dataset_key (str): Key representing the dataset type; used as the filename prefix.

    Returns:
        str: ``output_dir`` joined with ``<dataset_key>_<identifier>.pkl``.
    """
    filename = "{}_{}.pkl".format(dataset_key, identifier)
    return os.path.join(output_dir, filename)

def main():
    """
    Run the configured preprocessing workflow for every model type in the config.

    Steps, in order:
      1. Load the YAML configuration from a fixed relative path. If that fails,
         the error is printed and the function returns.
      2. Configure logging from the ``logging`` section of the config.
      3. Read the feature column lists from the ``features`` section.
      4. For each entry of ``model_types``, look up its options under
         ``models``, derive the mode from the model type name, and run the
         matching branch (train / predict / clustering).

    A failure while handling one model type is logged and processing moves on
    to the next model type.

    NOTE(review): ``DataPreprocessor``, ``FeatureManager`` and
    ``ClusteringModule`` are used here but their imports are commented out at
    the top of the file; they must already exist in the enclosing namespace.

    Returns:
        None
    """
    # Imported locally so the placeholder prediction step below can resolve
    # `np` without depending on a file-level import.
    import numpy as np

    # ----------------------------
    # Step 1: Load Configuration
    # ----------------------------
    config_path = '../../dataset/test/preprocessor_config/preprocessor_config.yaml'
    try:
        config = load_config(config_path)
        logger_config = config.get('logging', {})
        logger_level = logger_config.get('level', 'INFO').upper()
        logger_format = logger_config.get('format', '%(asctime)s [%(levelname)s] %(message)s')
    except Exception as e:
        # Logging is not configured yet, so report through print.
        print(f"❌ Failed to load configuration: {e}")
        return  # Exit if config loading fails

    # ----------------------------
    # Step 2: Configure Logging
    # ----------------------------
    # The debug flag overrides the configured level; an unknown level name
    # falls back to INFO.
    debug_flag = logger_config.get('debug', False)
    logging.basicConfig(
        level=logging.DEBUG if debug_flag else getattr(logging, logger_level, logging.INFO),
        format=logger_format,
        handlers=[
            logging.StreamHandler()
        ]
    )
    logger = logging.getLogger('main_preprocessing')

    # ----------------------------
    # Step 3: Extract Feature Assets
    # ----------------------------
    features_config = config.get('features', {})
    column_assets = {
        'y_variable': features_config.get('y_variable', []),
        'ordinal_categoricals': features_config.get('ordinal_categoricals', []),
        'nominal_categoricals': features_config.get('nominal_categoricals', []),
        'numericals': features_config.get('numericals', [])
    }

    # ----------------------------
    # Step 4: Extract Execution Parameters
    # ----------------------------
    execution = config.get('execution', {})
    shared_execution = execution.get('shared', {})

    # ----------------------------
    # Step 5: Get List of Model Types
    # ----------------------------
    model_types = config.get('model_types', ['Tree Based Classifier'])  # Default to one model if not specified

    # Model types that are routed to the clustering branch; every other model
    # type is handled in train mode.
    clustering_model_types = ('K-Means', 'Hierarchical Clustering', 'DBSCAN', 'KModes', 'KPrototypes')

    for current_model_type in model_types:
        logger.info(f"---\nProcessing Model: {current_model_type}\n---")

        # Step 6: Extract options and mode for the current model
        model_config = config.get('models', {}).get(current_model_type, {})
        if not model_config:
            logger.error(f"No configuration found for model '{current_model_type}'. Skipping.")
            continue

        # The mode is derived purely from the model type name.
        # NOTE(review): nothing maps to 'predict' here, so the predict branch
        # below is only reachable once this dispatch is extended.
        current_mode = 'clustering' if current_model_type in clustering_model_types else 'train'

        # ----------------------------
        # Step 7: Handle Modes for Each Model
        # ----------------------------
        if current_mode == 'train':
            execution_train = execution.get('train', {})

            train_input_path = execution_train.get('input_path', '')
            base_output_dir = execution_train.get('output_dir', './processed_data')
            # One output sub-directory per model type prevents models from
            # overwriting each other's files.
            model_output_dir = os.path.join(base_output_dir, current_model_type.replace(" ", "_"))
            # Transformers directory is shared across models (no model name in the path).
            transformers_dir = execution_train.get('save_transformers_path', './transformers')
            normalize_debug = execution_train.get('normalize_debug', False)
            normalize_graphs_output = execution_train.get('normalize_graphs_output', False)

            # Validate essential paths
            if not train_input_path:
                logger.error("❌ 'input_path' for training mode is not specified in the configuration.")
                continue
            if not os.path.exists(train_input_path):
                logger.error(f"❌ Training input dataset not found at {train_input_path}.")
                continue

            # Initialize DataPreprocessor
            preprocessor = DataPreprocessor(
                model_type=current_model_type,
                y_variable=column_assets.get('y_variable', []),
                ordinal_categoricals=column_assets.get('ordinal_categoricals', []),
                nominal_categoricals=column_assets.get('nominal_categoricals', []),
                numericals=column_assets.get('numericals', []),
                mode='train',
                options=model_config,
                debug=debug_flag,
                normalize_debug=normalize_debug,
                normalize_graphs_output=normalize_graphs_output,
                graphs_output_dir=shared_execution.get('plot_output_dir', './plots'),
                transformers_dir=transformers_dir
            )

            # Initialize FeatureManager
            save_path = shared_execution.get('features_metadata_path', '../../dataset/test/features_info/features_metadata.pkl')
            feature_manager = FeatureManager(save_path=save_path)

            # Load Training Dataset via FeatureManager
            # NOTE(review): this rebinds `column_assets`, so preprocessors built
            # for later model types use the FeatureManager's assets, while the
            # preprocessor for the current model was already built above from
            # the previous value. Confirm this ordering is intended.
            try:
                filtered_df, column_assets = feature_manager.load_features_and_dataset(
                    debug=True  # Set to False to reduce verbosity
                )
                logger.info("✅ Features loaded and dataset filtered successfully.")
            except Exception as e:
                logger.error(f"❌ Failed to load features and dataset: {e}")
                continue

            # Execute Preprocessing
            try:
                X_train, X_test, y_train, y_test, recommendations, X_test_inverse = preprocessor.final_preprocessing(filtered_df)
                logger.info("✅ Preprocessing completed successfully in train mode.")
            except Exception as e:
                logger.error(f"❌ Preprocessing failed in train mode: {e}")
                continue

            # Save Preprocessed Data
            try:
                os.makedirs(model_output_dir, exist_ok=True)
                X_train.to_csv(os.path.join(model_output_dir, 'X_train.csv'), index=False)
                y_train.to_csv(os.path.join(model_output_dir, 'y_train.csv'), index=False)
                X_test.to_csv(os.path.join(model_output_dir, 'X_test.csv'), index=False)
                y_test.to_csv(os.path.join(model_output_dir, 'y_test.csv'), index=False)
                recommendations.to_csv(os.path.join(model_output_dir, 'preprocessing_recommendations.csv'), index=False)
                logger.info(f"✅ Preprocessed data saved to '{model_output_dir}'.")
            except Exception as e:
                logger.error(f"❌ Failed to save preprocessed data: {e}")
                continue

            # Optional: Visualize Inverse Transformations
            try:
                if X_test_inverse is not None:
                    print(f"Inverse Transformed Test Data for {current_model_type}:")
                    print(X_test_inverse.head())
            except Exception as e:
                logger.error(f"❌ Error during visualization: {e}")
                continue

            logger.info(f"✅ All preprocessing tasks completed successfully for model '{current_model_type}'.")

        elif current_mode == 'predict':
            execution_predict = execution.get('predict', {})

            predict_input_path = execution_predict.get('prediction_input_path', '')
            predictions_output_path = execution_predict.get('predictions_output_path', './predictions')
            transformers_dir = execution_predict.get('load_transformers_path', './transformers')
            trained_model_path = execution_predict.get('trained_model_path', './models/trained_model.pkl')
            normalize_debug = execution_predict.get('normalize_debug', False)
            normalize_graphs_output = execution_predict.get('normalize_graphs_output', False)

            # Validate essential paths
            if not predict_input_path:
                logger.error("❌ 'prediction_input_path' for predict mode is not specified in the configuration.")
                continue
            if not os.path.exists(predict_input_path):
                logger.error(f"❌ Prediction input dataset not found at {predict_input_path}.")
                continue
            if not os.path.exists(trained_model_path):
                logger.error(f"❌ Trained model not found at {trained_model_path}.")
                continue
            if not os.path.exists(transformers_dir):
                logger.error(f"❌ Transformers directory not found at {transformers_dir}.")
                continue

            # Initialize DataPreprocessor
            preprocessor = DataPreprocessor(
                model_type=current_model_type,
                y_variable=column_assets.get('y_variable', []),
                ordinal_categoricals=column_assets.get('ordinal_categoricals', []),
                nominal_categoricals=column_assets.get('nominal_categoricals', []),
                numericals=column_assets.get('numericals', []),
                mode='predict',
                options=model_config,
                debug=debug_flag,
                normalize_debug=normalize_debug,
                normalize_graphs_output=normalize_graphs_output,
                graphs_output_dir=shared_execution.get('plot_output_dir', './plots'),
                transformers_dir=transformers_dir
            )

            # Load Prediction Dataset
            try:
                df_predict = load_dataset(predict_input_path)
                logger.info(f"✅ Prediction input data loaded from '{predict_input_path}'.")
            except Exception as e:
                logger.error(f"❌ Failed to load prediction input data: {e}")
                continue

            # Execute Preprocessing for Prediction
            try:
                X_preprocessed, recommendations, X_inversed = preprocessor.preprocess_predict(X=df_predict)
                logger.info("✅ Preprocessing completed successfully in predict mode.")
            except Exception as e:
                logger.error(f"❌ Preprocessing failed in predict mode: {e}")
                continue

            # Guard before touching X_inversed: its row count is needed to
            # size the predictions, so a None value must be rejected first.
            if X_inversed is None:
                logger.error("❌ Inversed data is None. Cannot attach predictions.")
                continue

            # TODO: replace with the real model. The trained model is not
            # loaded yet (only its path is validated above); predictions are
            # random placeholder labels for a binary target.
            y_new_pred = np.random.choice(['1', '0'], size=X_inversed.shape[0])

            # Attach Predictions to Inversed Data
            if len(y_new_pred) != len(X_inversed):
                logger.error("❌ Predictions length does not match inversed data length.")
                continue
            X_inversed['predictions'] = y_new_pred
            logger.info("✅ Predictions attached to inversed data successfully.")

            # Debugging output AFTER attaching predictions
            print(f"\nUpdated INVERSED DATA with Predictions for {current_model_type}:")
            print(X_inversed.head())  # Shows predictions column included

            # Save Predictions
            try:
                os.makedirs(predictions_output_path, exist_ok=True)
                predictions_filename = os.path.join(predictions_output_path, f'predictions_{current_model_type.replace(" ", "_")}.csv')
                X_inversed.to_csv(predictions_filename, index=False)
                logger.info(f"✅ Predictions saved to '{predictions_filename}'.")
            except Exception as e:
                logger.error(f"❌ Failed to save predictions: {e}")
                continue

            logger.info(f"✅ All prediction tasks completed successfully for model '{current_model_type}'.")

        elif current_mode == 'clustering':
            execution_clustering = execution.get('clustering', {})

            clustering_input_path = execution_clustering.get('clustering_input_path', '')
            clustering_output_dir = execution_clustering.get('clustering_output_dir', './clustering_output')
            normalize_debug = execution_clustering.get('normalize_debug', False)
            normalize_graphs_output = execution_clustering.get('normalize_graphs_output', False)

            # Validate essential paths
            if not clustering_input_path:
                logger.error("❌ 'clustering_input_path' for clustering mode is not specified in the configuration.")
                continue
            if not os.path.exists(clustering_input_path):
                logger.error(f"❌ Clustering input dataset not found at {clustering_input_path}.")
                continue

            # Initialize DataPreprocessor
            preprocessor = DataPreprocessor(
                model_type=current_model_type,
                y_variable=column_assets.get('y_variable', []),
                ordinal_categoricals=column_assets.get('ordinal_categoricals', []),
                nominal_categoricals=column_assets.get('nominal_categoricals', []),
                numericals=column_assets.get('numericals', []),
                mode='clustering',
                options=model_config,
                debug=debug_flag,
                normalize_debug=normalize_debug,
                normalize_graphs_output=normalize_graphs_output,
                graphs_output_dir=shared_execution.get('plot_output_dir', './plots'),
                transformers_dir=execution_clustering.get('save_transformers_path', './transformers')
            )

            # Load Clustering Dataset
            try:
                df_clustering = load_dataset(clustering_input_path)
                logger.info(f"✅ Clustering input data loaded from '{clustering_input_path}'.")
            except Exception as e:
                logger.error(f"❌ Failed to load clustering input data: {e}")
                continue

            # Execute Preprocessing for Clustering
            try:
                X_processed, recommendations = preprocessor.final_preprocessing(df_clustering)
                logger.info("✅ Preprocessing completed successfully in clustering mode.")
            except Exception as e:
                logger.error(f"❌ Preprocessing failed in clustering mode: {e}")
                continue

            # Initialize and Train Clustering Model
            try:
                clustering_module = ClusteringModule(
                    model_type=current_model_type,
                    model_params=model_config.get('clustering_model_params', {}),
                    debug=debug_flag
                )

                clustering_module.fit(X_processed)
                clustering_module.evaluate(X_processed)
                # Create the output directory before anything is handed that
                # directory to write into (plots first, then the model file).
                os.makedirs(clustering_output_dir, exist_ok=True)
                clustering_module.plot_clusters(X_processed, clustering_output_dir)
                # Save the clustering model
                clustering_model_path = os.path.join(clustering_output_dir, f"{current_model_type.replace(' ', '_')}_model.pkl")
                clustering_module.save_model(clustering_model_path)
                logger.info(f"✅ Clustering model saved to '{clustering_model_path}'.")
            except Exception as e:
                logger.error(f"❌ Clustering tasks failed: {e}")
                continue

            logger.info(f"✅ All clustering tasks completed successfully for model '{current_model_type}'.")

        else:
            logger.error(f"❌ Unsupported mode '{current_mode}'. Supported modes are 'train', 'predict', and 'clustering'.")
            continue

    logger.info("✅ All model processing completed successfully.")

# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == '__main__':
    main()
