# In [None]:
import logging
import os
from collections import Counter
from typing import Any, Dict, List, Optional

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


# ----------------------------
# Step 1: Configure Logging
# ----------------------------
def configure_logging(debug: bool = False, logger_name: str = 'PreprocessingPipeline') -> logging.Logger:
    """
    Return the named logger, attaching a file and a stream handler on first use.

    The file handler writes to ``<logger_name lowercased>.log`` in the current
    working directory. Handlers are created only if the logger has none, so
    repeated calls do not duplicate output. The level, however, is applied on
    every call, so a later call with a different ``debug`` flag takes effect
    (useful when re-running notebook cells).

    Args:
        debug (bool): Use DEBUG level when True, INFO otherwise.
        logger_name (str): Name of the logger.

    Returns:
        logging.Logger: Configured logger instance.
    """
    log_level = logging.DEBUG if debug else logging.INFO
    logger = logging.getLogger(logger_name)

    if not logger.handlers:
        formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(name)s: %(message)s')
        # Explicit UTF-8: messages logged in this file contain non-ASCII
        # symbols, which a locale-default encoding may be unable to write.
        file_handler = logging.FileHandler(f"{logger_name.lower()}.log", encoding="utf-8")
        stream_handler = logging.StreamHandler()
        for handler in (file_handler, stream_handler):
            handler.setFormatter(formatter)
            logger.addHandler(handler)

    # Re-apply the requested level even when the handlers already existed;
    # otherwise the first call's level would stick for the whole session.
    logger.setLevel(log_level)
    for handler in logger.handlers:
        handler.setLevel(log_level)
    return logger


# ----------------------------
# Step 2: Define the PreprocessingPipeline Class
# ----------------------------
class PreprocessingPipeline:
    """
    Preprocess a tabular classification dataset and rebalance its training split.

    Numerical columns are mean-imputed and standardized, ordinal columns are
    mode-imputed and ordinal-encoded, and every nominal column is mode-imputed
    and one-hot encoded by its own transformer (named ``onehot_enc_<column>``).
    The fitted ``ColumnTransformer`` is kept on ``self.pipeline`` and the
    output-column positions of all categorical (ordinal and one-hot) columns
    are kept on ``self.categorical_indices`` for SMOTENC.
    """

    def __init__(
        self,
        numericals: List[str],
        ordinal_categoricals: List[str],
        nominal_categoricals: List[str],
        y_variable: str,
        smote_params: Optional[Dict] = None,
        debug: bool = False
    ):
        """
        Initialize the PreprocessingPipeline with specified columns.

        Args:
            numericals (List[str]): List of numerical feature names.
            ordinal_categoricals (List[str]): List of ordinal categorical feature names.
            nominal_categoricals (List[str]): List of nominal categorical feature names.
            y_variable (str): Name of the target variable.
            smote_params (Dict, optional): Keyword arguments forwarded to the selected
                SMOTE variant's constructor.
            debug (bool): Flag to enable detailed debugging.
        """
        self.debug = debug
        self.logger = configure_logging(debug=self.debug, logger_name='PreprocessingPipeline')
        self.pipeline = None
        self.categorical_indices = []
        self.numericals = numericals
        self.ordinal_categoricals = ordinal_categoricals
        self.nominal_categoricals = nominal_categoricals
        self.y_variable = y_variable
        self.smote_params = smote_params or {}
        self.logger.debug(f"Initialized PreprocessingPipeline with numericals: {self.numericals}, "
                          f"ordinal_categoricals: {self.ordinal_categoricals}, "
                          f"nominal_categoricals: {self.nominal_categoricals}, "
                          f"y_variable: {self.y_variable}")

    def _require_fitted_pipeline(self):
        """Return the fitted preprocessor, or raise RuntimeError if none is set."""
        if self.pipeline is None:
            message = ("Preprocessing pipeline is not fitted. "
                       "Call fit_transform() or load_pipeline() first.")
            self.logger.error(message)
            raise RuntimeError(message)
        return self.pipeline

    def categorize_features(self, df: pd.DataFrame, include_target: bool = True) -> Dict[str, List[str]]:
        """
        Check that every configured column is present and return the feature groups.

        Extra columns in ``df`` are allowed; only missing ones are an error.

        Args:
            df (pd.DataFrame): The input DataFrame.
            include_target (bool): Whether the target column must be present too.

        Returns:
            Dict[str, List[str]]: Feature names under 'numerical', 'ordinal' and 'nominal'.

        Raises:
            ValueError: If any configured column is missing from ``df``.
        """
        self.logger.debug("Starting feature categorization based on provided column lists.")

        expected_columns = self.numericals + self.ordinal_categoricals + self.nominal_categoricals
        if include_target:
            expected_columns = expected_columns + [self.y_variable]

        missing_columns = [col for col in expected_columns if col not in df.columns]
        if missing_columns:
            message = f"The following specified columns are missing in the DataFrame: {missing_columns}"
            self.logger.error(message)
            raise ValueError(message)

        feature_types = {
            'numerical': self.numericals,
            'ordinal': self.ordinal_categoricals,
            'nominal': self.nominal_categoricals
        }

        if self.debug:
            self.logger.debug(f"Categorized Numerical Features: {feature_types['numerical']}")
            self.logger.debug(f"Categorized Ordinal Features: {feature_types['ordinal']}")
            self.logger.debug(f"Categorized Nominal Features: {feature_types['nominal']}")
            if include_target:
                self.logger.debug(f"Target Variable: {self.y_variable}")
        else:
            self.logger.info(f"Features categorized: Numerical={len(feature_types['numerical'])}, "
                             f"Ordinal={len(feature_types['ordinal'])}, Nominal={len(feature_types['nominal'])}.")

        return feature_types

    def _compute_categorical_indices(self, fitted_transformers) -> List[int]:
        """
        Locate the output columns that hold categorical data.

        Walks ``(name, transformer, features)`` triples in output order and
        collects the positions of ordinal-encoded and one-hot-encoded columns.
        Ordinal columns are included because their values are category codes:
        leaving them out makes SMOTENC interpolate them like continuous values
        and leaves the list empty when there are no nominal columns.

        Args:
            fitted_transformers: The ``transformers_`` list of the fitted ColumnTransformer.

        Returns:
            List[int]: Sorted output-column indices of all categorical columns.
        """
        categorical_indices: List[int] = []
        start_idx = 0
        for name, transformer, features in fitted_transformers:
            if name == 'remainder':
                # The preprocessor is built with remainder='drop', so this
                # entry contributes no output columns.
                continue

            if name.startswith('onehot_enc_'):
                ohe = transformer.named_steps['onehot_encoder']
                n_categories = len(ohe.categories_[0])
                block = list(range(start_idx, start_idx + n_categories))
                categorical_indices.extend(block)
                self.logger.debug(f"OneHotEncoder for '{features[0]}' has {n_categories} categories; indices {block}.")
                start_idx += n_categories
            elif name == 'ord':
                n_features = len(features)
                block = list(range(start_idx, start_idx + n_features))
                categorical_indices.extend(block)
                self.logger.debug(f"Ordinal transformer processes {n_features} features; indices {block}.")
                start_idx += n_features
            elif name == 'num':
                n_features = len(features)
                self.logger.debug(f"Transformer '{name}' processes {n_features} features; advancing start index by {n_features}.")
                start_idx += n_features
            else:
                self.logger.warning(f"Unknown transformer '{name}'. Skipping index calculation.")

        return categorical_indices

    def create_preprocessing_pipeline(self, X_train: pd.DataFrame) -> "ColumnTransformer":
        """
        Create and fit the preprocessing pipeline.

        Side effects: sets ``self.pipeline`` and ``self.categorical_indices``.

        Args:
            X_train (pd.DataFrame): Training features.

        Returns:
            ColumnTransformer: Fitted preprocessing pipeline.

        Raises:
            ValueError: If a configured column is missing or no feature list is non-empty.
        """
        self.logger.debug("Creating preprocessing pipeline with specified transformers.")

        feature_types = self.categorize_features(X_train, include_target=False)

        total_features = sum(len(columns) for columns in feature_types.values())
        self.logger.debug(f"Pipeline will be fitted on {total_features} features: {feature_types}")

        transformers = []

        # Numerical columns: mean imputation followed by standardization.
        if feature_types['numerical']:
            numerical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ])
            transformers.append(('num', numerical_transformer, feature_types['numerical']))
            self.logger.debug("Numerical transformer added to pipeline.")
        else:
            self.logger.debug("No numerical features to transform.")

        # Ordinal columns: mode imputation followed by ordinal encoding.
        if feature_types['ordinal']:
            ordinal_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('ordinal_encoder', OrdinalEncoder())
            ])
            transformers.append(('ord', ordinal_transformer, feature_types['ordinal']))
            self.logger.debug("Ordinal categorical transformer added to pipeline.")
        else:
            self.logger.debug("No ordinal categorical features to transform.")

        # Nominal columns: one transformer per column so each column's one-hot
        # block can be located and inverted on its own.
        for feature in feature_types['nominal']:
            transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))
            ])
            transformers.append((f'onehot_enc_{feature}', transformer, [feature]))
            self.logger.debug(f"Nominal categorical transformer for '{feature}' added to pipeline.")

        if not transformers:
            message = "No transformers added to the pipeline. Please check the feature lists."
            self.logger.error(message)
            raise ValueError(message)

        # sparse_threshold=0 forces a dense result. With the default threshold
        # the output turns sparse once one-hot density is low, and the scaler /
        # ordinal-encoder inverse transforms used below reject sparse input.
        preprocessor = ColumnTransformer(transformers=transformers, remainder='drop', sparse_threshold=0)
        preprocessor.fit(X_train)
        self.logger.info("✅ Preprocessing pipeline created and fitted on training data.")

        feature_names = preprocessor.get_feature_names_out()
        self.logger.debug(f"Feature names after preprocessing: {feature_names}")

        self.categorical_indices = self._compute_categorical_indices(preprocessor.transformers_)
        self.logger.debug(f"Categorical feature indices for SMOTENC: {self.categorical_indices}")

        self.pipeline = preprocessor
        return preprocessor

    def select_smote_technique(self) -> Any:
        """
        Select the appropriate SMOTE technique based on feature types.

        SMOTENC is used for mixed data (with ``self.categorical_indices``),
        SMOTEN for categorical-only data and SMOTE for numerical-only data.

        Returns:
            An instance of SMOTE, SMOTENC, or SMOTEN built with ``self.smote_params``.

        Raises:
            ValueError: If no feature list is non-empty.
        """
        self.logger.debug("Selecting SMOTE technique based on feature types.")

        has_numerical = bool(self.numericals)
        has_categorical = bool(self.ordinal_categoricals) or bool(self.nominal_categoricals)

        if not (has_numerical or has_categorical):
            self.logger.error("No features available for SMOTE.")
            raise ValueError("At least one feature type must be present for SMOTE.")

        if has_numerical and has_categorical:
            self.logger.debug("Both numerical and categorical features present. Using SMOTENC.")
            smote = SMOTENC(categorical_features=self.categorical_indices, **self.smote_params)
            selected_smote = 'SMOTENC'
        elif has_categorical:
            self.logger.debug("Only categorical features present. Using SMOTEN.")
            smote = SMOTEN(**self.smote_params)
            selected_smote = 'SMOTEN'
        else:
            self.logger.debug("Only numerical features present. Using SMOTE.")
            smote = SMOTE(**self.smote_params)
            selected_smote = 'SMOTE'

        self.logger.info(f"✅ Selected SMOTE Technique: {selected_smote}")
        return smote

    def fit_transform(
        self,
        df: pd.DataFrame,
        test_size: float = 0.25,
        random_state: Optional[int] = 42
    ) -> Dict[str, Any]:
        """
        Split the data, fit the preprocessor on the training split, and apply SMOTE.

        Only the training split is resampled; the test split is transformed
        with the preprocessor fitted on the training split.

        Args:
            df (pd.DataFrame): The entire dataset including features and target.
            test_size (float): Fraction of rows held out for testing.
            random_state (int, optional): Seed for the stratified split.

        Returns:
            Dict[str, Any]: 'X_train_preprocessed' and 'y_train' (both resampled),
            'X_test_preprocessed', 'y_test', 'X_test_inverse' (test features mapped
            back to original values) and 'selected_smote' (class name of the sampler).

        Raises:
            ValueError: If a configured column (including the target) is missing.
        """
        self.logger.debug("Starting fit_transform process.")
        self.categorize_features(df, include_target=True)  # validation only

        X = df.drop(columns=[self.y_variable])
        y = df[self.y_variable]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=random_state,
            stratify=y
        )
        self.logger.debug(f"Data split into training ({X_train.shape}) and testing ({X_test.shape}) sets.")

        if self.debug:
            self.logger.debug(f"Training set shape: {X_train.shape}")
            self.logger.debug(f"Testing set shape: {X_test.shape}")
            self.logger.debug(f"Training target distribution: {Counter(y_train)}")
        else:
            self.logger.info("✅ Data split into training and testing sets.")

        # Fit on the training split only so no test information leaks in.
        self.create_preprocessing_pipeline(X_train)

        X_train_preprocessed = self.pipeline.transform(X_train)
        self.logger.info("✅ Training data preprocessed.")

        smote = self.select_smote_technique()
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)
        self.logger.info("✅ SMOTE applied to training data.")

        if self.debug:
            self.logger.debug(f"Resampled training set shape: {X_train_resampled.shape}")
            self.logger.debug(f"Resampled training target distribution: {Counter(y_train_resampled)}")
        else:
            self.logger.info("✅ Training data resampled.")

        X_test_preprocessed = self.pipeline.transform(X_test)
        self.logger.info("✅ Testing data preprocessed.")

        X_test_inverse = self.inverse_transform_data(X_test_preprocessed)
        self.logger.info("✅ Inverse transformation applied to testing data.")

        outputs = {
            'X_train_preprocessed': X_train_resampled,
            'X_test_preprocessed': X_test_preprocessed,
            'y_train': y_train_resampled,
            'y_test': y_test,
            'X_test_inverse': X_test_inverse,
            'selected_smote': smote.__class__.__name__
        }

        if self.debug:
            self.logger.debug(f"Outputs: {list(outputs.keys())}")

        return outputs

    def transform_new_data(self, df_new: pd.DataFrame) -> Dict[str, Any]:
        """
        Transform new data for prediction.

        Args:
            df_new (pd.DataFrame): New data for prediction (no target column required).

        Returns:
            Dict[str, Any]: 'X_preprocessed', 'X_inversed', and 'recommendations'
            (currently always an empty list).

        Raises:
            ValueError: If a configured feature column is missing.
            RuntimeError: If no preprocessing pipeline has been fitted or loaded.
        """
        self.logger.debug("Starting transformation of new data for prediction.")
        self.categorize_features(df_new, include_target=False)  # validation only
        preprocessor = self._require_fitted_pipeline()

        X_preprocessed = preprocessor.transform(df_new)
        self.logger.info("✅ New data preprocessed.")

        X_inversed = self.inverse_transform_data(X_preprocessed)
        self.logger.info("✅ Inverse transformation applied to new data.")

        # Placeholder: no recommendation logic exists in this class yet.
        recommendations = []

        outputs = {
            'X_preprocessed': X_preprocessed,
            'recommendations': recommendations,
            'X_inversed': X_inversed
        }

        if self.debug:
            self.logger.debug(f"Outputs: {list(outputs.keys())}")

        return outputs

    def inverse_transform_data(self, X_transformed: np.ndarray) -> pd.DataFrame:
        """
        Map transformed features back to original values, one column per input feature.

        Numerical columns are un-scaled (imputation cannot be undone), ordinal
        and one-hot blocks are decoded back to their category labels.

        Args:
            X_transformed (np.ndarray): Output of ``self.pipeline.transform``.

        Returns:
            pd.DataFrame: The inverse-transformed DataFrame with a default index.

        Raises:
            RuntimeError: If no preprocessing pipeline has been fitted or loaded.
        """
        self.logger.debug("Starting inverse transformation.")
        preprocessor = self._require_fitted_pipeline()

        # A preprocessor saved with the default sparse threshold may still emit
        # a sparse matrix; the per-block inverse transforms below need dense data.
        if hasattr(X_transformed, "toarray"):
            X_transformed = X_transformed.toarray()

        inverse_data = {}
        start_idx = 0

        for name, transformer, features in preprocessor.transformers_:
            if name == 'remainder':
                continue  # dropped columns have no transformed counterpart

            if not features:
                self.logger.debug(f"Transformer '{name}' has no features. Skipping.")
                continue

            if name == 'num':
                end_idx = start_idx + len(features)
                numerical_inverse = transformer.named_steps['scaler'].inverse_transform(
                    X_transformed[:, start_idx:end_idx]
                )
                for idx, feature in enumerate(features):
                    inverse_data[feature] = numerical_inverse[:, idx]
                self.logger.debug(f"Numerical features {features} inverse transformed.")
                start_idx = end_idx
            elif name == 'ord':
                end_idx = start_idx + len(features)
                ordinal_inverse = transformer.named_steps['ordinal_encoder'].inverse_transform(
                    X_transformed[:, start_idx:end_idx]
                )
                for idx, feature in enumerate(features):
                    inverse_data[feature] = ordinal_inverse[:, idx]
                self.logger.debug(f"Ordinal features {features} inverse transformed.")
                start_idx = end_idx
            elif name.startswith('onehot_enc_'):
                ohe = transformer.named_steps['onehot_encoder']
                end_idx = start_idx + len(ohe.categories_[0])
                nominal_inverse = ohe.inverse_transform(X_transformed[:, start_idx:end_idx])
                feature = features[0]
                inverse_data[feature] = nominal_inverse[:, 0]
                self.logger.debug(f"Nominal feature '{feature}' inverse transformed.")
                start_idx = end_idx
            else:
                self.logger.warning(f"Unknown transformer '{name}'. Skipping inverse transformation.")

        inverse_df = pd.DataFrame(inverse_data)
        self.logger.debug("Inverse-transformed DataFrame constructed.")

        return inverse_df

    def save_pipeline(self, filepath: str):
        """
        Save the fitted preprocessing pipeline to disk.

        Args:
            filepath (str): Path to save the pipeline.
        """
        joblib.dump(self.pipeline, filepath)
        self.logger.info(f"✅ Preprocessing pipeline saved to '{filepath}'.")

    def load_pipeline(self, filepath: str):
        """
        Load a fitted preprocessing pipeline from disk.

        Args:
            filepath (str): Path to load the pipeline from.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        if not os.path.exists(filepath):
            message = f"Preprocessing pipeline file '{filepath}' does not exist."
            self.logger.error(message)
            raise FileNotFoundError(message)
        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # files that come from a trusted source.
        self.pipeline = joblib.load(filepath)
        self.logger.info(f"✅ Preprocessing pipeline loaded from '{filepath}'.")


# ----------------------------
# Step 3: Define the TrainingModule Class
# ----------------------------
class TrainingModule:
    """
    Thin wrapper around a scikit-learn classifier: build, train, evaluate, persist, predict.

    Supported ``model_type`` values are 'LogisticRegression',
    'RandomForestClassifier' and 'GradientBoostingClassifier'.
    """

    def __init__(self, model_type: str = 'LogisticRegression', model_params: Optional[Dict] = None, debug: bool = False):
        """
        Initialize the TrainingModule with the specified model.

        Args:
            model_type (str): Type of the classifier ('LogisticRegression', 'RandomForestClassifier', etc.).
            model_params (dict, optional): Keyword arguments for the classifier constructor.
            debug (bool): Flag to enable detailed debugging.

        Raises:
            ValueError: If ``model_type`` is not supported.
        """
        self.debug = debug
        self.logger = configure_logging(debug=self.debug, logger_name='TrainingModule')
        self.model_type = model_type
        self.model_params = model_params or {}
        self.model = self._initialize_model()

    def _initialize_model(self):
        """
        Initialize the classifier based on the specified type and parameters.

        Returns:
            classifier: An unfitted instance of the specified classifier.

        Raises:
            ValueError: If ``self.model_type`` is not supported.
        """
        if self.model_type == 'LogisticRegression':
            model_class = LogisticRegression
        elif self.model_type == 'RandomForestClassifier':
            model_class = RandomForestClassifier
        elif self.model_type == 'GradientBoostingClassifier':
            model_class = GradientBoostingClassifier
        else:
            # Raised outside the try below so the error is logged exactly once.
            self.logger.error(f"Unsupported model type: {self.model_type}")
            raise ValueError(f"Unsupported model type: {self.model_type}")

        try:
            model = model_class(**self.model_params)
        except Exception as e:
            self.logger.error(f"Error initializing model: {e}")
            raise  # bare raise keeps the original traceback

        self.logger.debug(f"Initialized {self.model_type} with parameters: {self.model_params}")
        return model

    def train(self, X_train: np.ndarray, y_train: pd.Series):
        """
        Train the classifier on the training data.

        Args:
            X_train (np.ndarray): Preprocessed training features.
            y_train (pd.Series): Training target variable.

        Returns:
            self
        """
        self.logger.debug("Starting model training.")
        self.model.fit(X_train, y_train)
        self.logger.info("✅ Model trained successfully.")
        return self

    def evaluate(self, X_test: np.ndarray, y_test: pd.Series) -> Dict[str, float]:
        """
        Evaluate the trained model on the test data.

        All metrics are computed from the already-fitted ``self.model``;
        nothing is refitted here. Precision, recall and F1 use scikit-learn's
        binary defaults (positive label 1), and ROC-AUC uses the predicted
        probability of the second entry in ``model.classes_``, so a binary
        target is expected.

        Side effects: shows a confusion-matrix plot and writes the metrics to
        ``performance_metrics_<model_type>.csv`` in the working directory.

        Args:
            X_test (np.ndarray): Preprocessed test features.
            y_test (pd.Series): Test target variable.

        Returns:
            dict: 'accuracy', 'precision', 'recall', 'f1_score' and 'roc_auc'.
        """
        self.logger.debug("Starting model evaluation.")
        y_pred = self.model.predict(X_test)

        # Score the fitted model directly. Cross-validating on the test set
        # would refit fresh clones on test folds and report their scores
        # instead of this model's.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        positive_scores = self.model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, positive_scores)

        self.logger.info(f"✅ Model Accuracy on Test Set: {accuracy:.2f}")
        self.logger.info(f"✅ Model Precision: {precision:.2f}")
        self.logger.info(f"✅ Model Recall: {recall:.2f}")
        self.logger.info(f"✅ Model F1-Score: {f1:.2f}")
        self.logger.info(f"✅ Model ROC-AUC: {roc_auc:.2f}")

        self.logger.debug("Classification Report:")
        self.logger.debug(f"\n{classification_report(y_test, y_pred, zero_division=0)}")

        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()
        plt.title("Confusion Matrix")
        plt.show()

        performance_metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        }

        metrics_filename = f'performance_metrics_{self.model_type}.csv'
        pd.DataFrame([performance_metrics]).to_csv(metrics_filename, index=False)
        self.logger.info(f"✅ Performance metrics saved to '{metrics_filename}'.")

        return performance_metrics

    def save_model(self, filepath: str):
        """
        Save the trained model to disk.

        Args:
            filepath (str): Path to save the model.
        """
        joblib.dump(self.model, filepath)
        self.logger.info(f"✅ Model saved to '{filepath}'.")

    def load_model(self, filepath: str):
        """
        Load a trained model from disk, replacing ``self.model``.

        Args:
            filepath (str): Path to load the model from.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        if not os.path.exists(filepath):
            message = f"Model file '{filepath}' does not exist."
            self.logger.error(message)
            raise FileNotFoundError(message)
        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # files that come from a trusted source.
        self.model = joblib.load(filepath)
        self.logger.info(f"✅ Model loaded from '{filepath}'.")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Make predictions using the trained model.

        Args:
            X (np.ndarray): Preprocessed feature data.

        Returns:
            np.ndarray: Predicted labels.
        """
        self.logger.debug("Making predictions.")
        predictions = self.model.predict(X)
        self.logger.info("✅ Predictions made successfully.")
        return predictions


# ----------------------------
# Step 3: Define Utility Functions
# ----------------------------
def construct_filepath(base_name: str, dataset_key: str, extension: str = '.joblib') -> str:
    """
    Build the artifact filename ``<base_name>_<dataset_key><extension>``.

    Args:
        base_name (str): Leading part of the filename (e.g., 'trained_model').
        dataset_key (str): Dataset identifier appended after an underscore.
        extension (str): Suffix appended verbatim, including the dot (default '.joblib').

    Returns:
        str: The assembled filename.
    """
    filename_template = "{}_{}{}"
    return filename_template.format(base_name, dataset_key, extension)


# ----------------------------
# Step 4: Define the Main Execution Logic
# ----------------------------
def main():
    """
    Run the end-to-end demo on synthetic data.

    Training phase: for each of three dataset layouts (numerical-only,
    categorical-only, mixed) generate 300 seeded random rows, preprocess and
    resample them, train and evaluate a logistic regression, and save the
    model, the preprocessor and the inverse-transformed test set to disk.

    Prediction phase: reload the 'mixed' preprocessor and model, predict for
    two hand-written rows, and save/print the readable result.
    """
    # debug=True for detailed outputs, debug=False for minimal outputs
    debug = True  # Change to False for production

    # Column layout of each demo dataset.
    schemas = {
        'numerical_only': {
            'numericals': ['age', 'income'],
            'ordinal_categoricals': [],
            'nominal_categoricals': [],
            'y_variable': 'salary'
        },
        'categorical_only': {
            'numericals': [],
            'ordinal_categoricals': ['education_level', 'experience'],
            'nominal_categoricals': ['gender', 'city', 'department'],
            'y_variable': 'salary'
        },
        'mixed': {
            'numericals': ['age', 'income'],
            'ordinal_categoricals': ['education_level', 'experience'],
            'nominal_categoricals': ['gender', 'city', 'department'],
            'y_variable': 'salary'
        }
    }

    n_rows = 300

    # One sampler per feature column. The insertion order is the order in
    # which columns are drawn from the seeded generator, so it must not change.
    samplers = {
        'age': lambda: np.random.randint(20, 60, n_rows),
        'income': lambda: np.random.randint(30000, 100000, n_rows),
        'gender': lambda: np.random.choice(['Male', 'Female'], n_rows),
        'city': lambda: np.random.choice(['New York', 'Chicago', 'Los Angeles', 'Houston'], n_rows),
        'department': lambda: np.random.choice(['Sales', 'Engineering', 'HR', 'Marketing', 'Finance'], n_rows),
        'education_level': lambda: np.random.choice(['Bachelors', 'Masters', 'PhD'], n_rows),
        'experience': lambda: np.random.choice(['Junior', 'Mid', 'Senior'], n_rows),
    }

    # ----------------------------
    # Training Phase (one pass per dataset layout)
    # ----------------------------
    for dataset_key in ('numerical_only', 'categorical_only', 'mixed'):
        print(f"\n--- Training on Dataset: {dataset_key} ---")
        schema = schemas[dataset_key]

        preprocessing = PreprocessingPipeline(
            numericals=schema['numericals'],
            ordinal_categoricals=schema['ordinal_categoricals'],
            nominal_categoricals=schema['nominal_categoricals'],
            y_variable=schema['y_variable'],
            smote_params={},  # Add any specific SMOTE parameters if needed
            debug=debug
        )

        # Re-seed for every dataset so each one is reproducible on its own.
        np.random.seed(42)
        wanted = {
            *schema['numericals'],
            *schema['ordinal_categoricals'],
            *schema['nominal_categoricals'],
        }
        synthetic = {column: draw() for column, draw in samplers.items() if column in wanted}
        # The target is drawn last; roughly 70/30 class imbalance.
        synthetic['salary'] = np.random.choice([0, 1], n_rows, p=[0.7, 0.3])

        df = pd.DataFrame(synthetic)
        preprocessing.logger.info(f"✅ DataFrame for dataset '{dataset_key}' with {n_rows} rows created.")

        # Split, preprocess and resample.
        prepared = preprocessing.fit_transform(df)
        selected_smote = prepared['selected_smote']

        # Persist the test features in their original, readable form.
        inverse_filename = f'X_test_inverse_{dataset_key}.csv'
        prepared['X_test_inverse'].to_csv(inverse_filename, index=False)
        preprocessing.logger.info(f"✅ Inverse-transformed test set saved as '{inverse_filename}'.")

        trainer = TrainingModule(
            model_type='LogisticRegression',
            model_params={'max_iter': 1000},
            debug=debug
        )
        trainer.train(prepared['X_train_preprocessed'], prepared['y_train'])
        trainer.evaluate(prepared['X_test_preprocessed'], prepared['y_test'])

        # Save the model first, then the preprocessor.
        trainer.save_model(construct_filepath('trained_model', dataset_key))
        preprocessing.save_pipeline(construct_filepath('preprocessor', dataset_key))

        print(f"Selected SMOTE Technique for '{dataset_key}': {selected_smote}")

    # ----------------------------
    # Prediction Phase
    # ----------------------------
    print("\n--- Prediction Phase ---")
    prediction_dataset_key = 'mixed'  # Choose the dataset for prediction
    schema = schemas[prediction_dataset_key]

    preprocessing = PreprocessingPipeline(
        numericals=schema['numericals'],
        ordinal_categoricals=schema['ordinal_categoricals'],
        nominal_categoricals=schema['nominal_categoricals'],
        y_variable=schema['y_variable'],
        debug=debug
    )

    # Restore the preprocessor fitted during training; stop if it is missing.
    try:
        preprocessing.load_pipeline(construct_filepath('preprocessor', prediction_dataset_key))
    except FileNotFoundError as e:
        print(f"❌ {e}")
        return

    trainer = TrainingModule(
        model_type='LogisticRegression',
        model_params={'max_iter': 1000},
        debug=debug
    )

    # Restore the trained model; stop if it is missing.
    try:
        trainer.load_model(construct_filepath('trained_model', prediction_dataset_key))
    except FileNotFoundError as e:
        print(f"❌ {e}")
        return

    # Two example rows to score.
    X_new = pd.DataFrame({
        'age': [28, 40],
        'income': [50000, 80000],
        'gender': ['Female', 'Male'],
        'city': ['Chicago', 'Houston'],
        'department': ['Engineering', 'Sales'],
        'education_level': ['Masters', 'Bachelors'],
        'experience': ['Mid', 'Senior']
    })
    preprocessing.logger.info("✅ New prediction data created.")

    transformed = preprocessing.transform_new_data(X_new)
    readable = transformed['X_inversed']

    # Predict on the encoded features and attach the result to the readable frame.
    readable['predictions'] = trainer.predict(transformed['X_preprocessed'])
    preprocessing.logger.debug("Predictions attached to inverse-transformed DataFrame.")

    prediction_inverse_filename = f'X_inverse_prediction_{prediction_dataset_key}.csv'
    readable.to_csv(prediction_inverse_filename, index=False)
    preprocessing.logger.info(f"✅ Inverse-transformed prediction data saved as '{prediction_inverse_filename}'.")

    print("\nInverse Transformed Prediction DataFrame with Predictions:")
    print(readable)


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
