# In [None]:
import importlib
import logging
import unittest
from typing import List, Optional

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# ----------------------------
# Step 1: Configure Logging
# ----------------------------
def configure_logging(debug: bool = False):
    """
    Configure logging settings and return the 'PreprocessingPipeline' logger.

    Root handlers (a log file plus the console) are installed only when the
    root logger has none yet, which is the same condition under which
    ``logging.basicConfig`` does anything. The requested level is also set on
    the returned logger itself, so the ``debug`` flag takes effect on every
    call, not just the first one.

    Args:
        debug (bool): Flag to enable detailed debugging.

    Returns:
        logging.Logger: Configured logger instance.
    """
    log_level = logging.DEBUG if debug else logging.INFO

    # Build the handlers only when they will actually be installed; creating a
    # FileHandler opens the log file, so constructing one that basicConfig
    # would discard leaks an open file handle.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            level=log_level,
            format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
            handlers=[
                # Explicit UTF-8: log messages in this file contain non-ASCII characters.
                logging.FileHandler("preprocessing_pipeline.log", encoding="utf-8"),
                logging.StreamHandler()
            ]
        )

    logger = logging.getLogger('PreprocessingPipeline')
    # Set the level on the named logger so a later call with a different
    # `debug` value is honoured even though the root configuration is unchanged.
    logger.setLevel(log_level)
    return logger

# ----------------------------
# Step 2: Define the Preprocessing Pipeline Class (feature validation, preprocessing, training, prediction)
# ----------------------------
class PreprocessingPipeline:
    """
    Preprocessing, SMOTENC resampling, and a classifier, configured from a YAML file.

    The feature lists given to the constructor decide how each column is handled:
    numerical columns are mean-imputed and standard-scaled, ordinal columns are
    mode-imputed and ordinal-encoded, and each nominal column gets its own
    mode-imputation + one-hot encoding transformer.
    """

    # Modules searched, in order, when resolving the classifier type named in the
    # configuration. 'sklearn.linear_model' comes first so existing configs
    # resolve exactly as before.
    _CLASSIFIER_MODULES = (
        'sklearn.linear_model',
        'sklearn.ensemble',
        'sklearn.tree',
        'sklearn.svm',
        'sklearn.neighbors',
        'sklearn.naive_bayes',
    )

    def __init__(self, config_path: Optional[str] = None,
                 numerical_features: Optional[List[str]] = None,
                 ordinal_features: Optional[List[str]] = None,
                 nominal_features: Optional[List[str]] = None,
                 target_variable: Optional[List[str]] = None,
                 debug: bool = False):
        """
        Initialize the PreprocessingPipeline with optional debugging and configuration.

        Args:
            config_path (str): Path to the YAML configuration file. Required.
            numerical_features (List[str]): Columns to impute (mean) and scale.
            ordinal_features (List[str]): Columns to impute (mode) and ordinal-encode.
            nominal_features (List[str]): Columns to impute (mode) and one-hot encode.
            target_variable (List[str]): Target column name(s); used only when the
                configuration file has no 'target_variable' entry.
            debug (bool): Flag to enable detailed debugging; used only when the
                configuration file has no 'debug' entry.

        Raises:
            ValueError: If no configuration file path is provided.
            AttributeError: If the configured classifier type cannot be found.

        Example:
            pipeline = PreprocessingPipeline(config_path='config.yaml')
        """
        self.debug = debug
        self.logger = configure_logging(debug=self.debug)
        self.pipeline = None
        # None means "no features of this kind"; store an empty list so the
        # other methods can iterate without special-casing None.
        self.numerical_features = numerical_features if numerical_features is not None else []
        self.ordinal_features = ordinal_features if ordinal_features is not None else []
        self.nominal_features = nominal_features if nominal_features is not None else []
        self.target_variable = target_variable
        self.classifier = None

        if not config_path:
            self.logger.error("No configuration file provided.")
            raise ValueError("Configuration file path must be provided.")

        with open(config_path, 'r') as file:
            # An empty YAML document loads as None; treat it as an empty mapping.
            config = yaml.safe_load(file) or {}

        # Values from the file win; constructor arguments are the fallback.
        self.target_variable = config.get('target_variable', target_variable)
        classifier_info = config.get('classifier') or {}
        classifier_type = classifier_info.get('type', 'LogisticRegression')
        classifier_params = classifier_info.get('parameters') or {}

        self.classifier = self._build_classifier(classifier_type, classifier_params)

        self.debug = config.get('debug', debug)
        self.logger = configure_logging(debug=self.debug)
        # Set the level explicitly: logging may already be configured from the
        # call above, in which case reconfiguring the root logger has no effect.
        self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO)
        self.logger.debug(f"Pipeline initialized with classifier: {self.classifier}")

    def _build_classifier(self, classifier_type: str, classifier_params: dict):
        """
        Instantiate the estimator class named `classifier_type`.

        Args:
            classifier_type (str): Class name, e.g. 'LogisticRegression'.
            classifier_params (dict): Keyword arguments for the class constructor.

        Returns:
            The instantiated estimator.

        Raises:
            AttributeError: If no module in `_CLASSIFIER_MODULES` defines the class.
        """
        for module_name in self._CLASSIFIER_MODULES:
            try:
                module = importlib.import_module(module_name)
            except ImportError:
                continue
            estimator_cls = getattr(module, classifier_type, None)
            if estimator_cls is not None:
                return estimator_cls(**classifier_params)

        message = (f"Classifier '{classifier_type}' was not found in any of: "
                   f"{', '.join(self._CLASSIFIER_MODULES)}")
        self.logger.error(message)
        raise AttributeError(message)

    def categorize_features(self, df: pd.DataFrame):
        """
        Verify that every column named in the numerical, ordinal, and nominal
        feature lists exists in the DataFrame.

        Args:
            df (pd.DataFrame): The input DataFrame.

        Raises:
            KeyError: If any specified columns are missing from the DataFrame.

        Example:
            pipeline.categorize_features(df)
        """
        feature_lists = {
            'numerical_features': self.numerical_features,
            'ordinal_features': self.ordinal_features,
            'nominal_features': self.nominal_features
        }

        # Collect, per category, the configured columns the DataFrame lacks.
        available = set(df.columns)
        missing_columns = {}
        for category, columns in feature_lists.items():
            # A category left as None simply has no columns to check.
            absent = [col for col in (columns or []) if col not in available]
            if absent:
                missing_columns[category] = absent

        if missing_columns:
            for category, cols in missing_columns.items():
                self.logger.error(f"Missing columns in category '{category}': {cols}")
            raise KeyError("One or more columns specified in feature lists are missing from the DataFrame.")

        self.logger.info("✅ All specified columns exist in the DataFrame.")

        # Optionally, log the categorized features
        if self.debug:
            self.logger.debug(f"Numerical Features: {self.numerical_features}")
            self.logger.debug(f"Ordinal Features: {self.ordinal_features}")
            self.logger.debug(f"Nominal Features: {self.nominal_features}")

    def _iter_transformer_slices(self, preprocessor, action: str):
        """
        Walk the fitted ColumnTransformer and yield the output-column span of
        each transformer this class creates.

        Args:
            preprocessor: A fitted ColumnTransformer built by
                `create_preprocessing_pipeline`.
            action (str): Text used in the warning for unrecognised transformers.

        Yields:
            tuple: (name, transformer, features, start_idx, end_idx), where the
            transformer's output occupies columns start_idx up to, but not
            including, end_idx.
        """
        start_idx = 0
        for name, transformer, features in preprocessor.transformers_:
            if name == 'remainder':
                # remainder='drop' contributes no output columns.
                continue
            if name.startswith('onehot_enc_'):
                # One output column per category of the single encoded feature.
                n_columns = len(transformer.named_steps['onehot_encoder'].categories_[0])
            elif name in ('num', 'ord'):
                # Scaling and ordinal encoding keep one output column per input column.
                n_columns = len(features)
            else:
                self.logger.warning(f"Unknown transformer '{name}'. Skipping {action}.")
                continue
            yield name, transformer, features, start_idx, start_idx + n_columns
            start_idx += n_columns

    def create_preprocessing_pipeline(self, X_train: pd.DataFrame) -> "ImbPipeline":
        """
        Create a preprocessing, resampling, and modeling pipeline compatible with SMOTENC.

        The preprocessor is fitted on `X_train` here only to learn how many
        one-hot columns each nominal feature produces; those column positions are
        passed to SMOTENC as its categorical features.

        Args:
            X_train (pd.DataFrame): Training feature set for fitting transformers.

        Returns:
            ImbPipeline: An imblearn Pipeline object including preprocessing, SMOTENC, and the model.

        Example:
            pipeline = pipeline.create_preprocessing_pipeline(X_train)
        """
        self.logger.debug("Creating preprocessing, resampling, and modeling pipeline with numerical, ordinal, and nominal transformers.")

        # Numerical Transformer
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])
        self.logger.debug("Numerical transformer created.")

        # Ordinal Categorical Transformer
        ordinal_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal_encoder', OrdinalEncoder())
        ])
        self.logger.debug("Ordinal transformer created.")

        # Nominal Categorical Transformers: one OneHotEncoder pipeline per feature,
        # so each feature's block of output columns can be located by name.
        nominal_transformers = []
        for feature in (self.nominal_features or []):
            transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))
            ])
            nominal_transformers.append((f'onehot_enc_{feature}', transformer, [feature]))
            self.logger.debug(f"Nominal transformer with OneHotEncoder for '{feature}' created.")

        # Combine Transformers using ColumnTransformer
        preprocessor = ColumnTransformer(transformers=[
            ('num', numerical_transformer, self.numerical_features or []),
            ('ord', ordinal_transformer, self.ordinal_features or []),
            *nominal_transformers  # Unpack nominal transformers
        ], remainder='drop')
        self.logger.debug("ColumnTransformer created with numerical, ordinal, and nominal transformers.")

        # Fit the preprocessor to determine feature indices
        preprocessor.fit(X_train)
        self.logger.info("✅ Preprocessor fitted on training data.")

        feature_names = preprocessor.get_feature_names_out()
        self.logger.debug(f"Feature names after preprocessing: {feature_names}")

        # Identify indices of all categorical features (from OneHotEncoder).
        # NOTE(review): ordinal-encoded columns are left out and so are treated
        # as continuous by SMOTENC; confirm this is intended.
        categorical_indices = []
        for name, _, _, start_idx, end_idx in self._iter_transformer_slices(
                preprocessor, action='index calculation'):
            if name.startswith('onehot_enc_'):
                n_categories = end_idx - start_idx
                indices = list(range(start_idx, end_idx))
                categorical_indices.extend(indices)
                self.logger.debug(f"Feature '{name}' has {n_categories} categories; indices {indices}.")
            else:
                n_features = end_idx - start_idx
                self.logger.debug(f"Feature '{name}' has {n_features} features; advancing start index by {n_features}.")

        self.logger.debug(f"Categorical feature indices for SMOTENC: {categorical_indices}")

        # Initialize SMOTENC with correct categorical feature indices
        smote_nc = SMOTENC(
            categorical_features=categorical_indices,
            sampling_strategy='auto',
            k_neighbors=3,
            random_state=42
        )
        self.logger.debug("SMOTENC initialized with correct categorical_features.")

        # Create the full imblearn pipeline with preprocessing, SMOTENC, and the classifier
        pipeline = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote_nc', smote_nc),
            ('classifier', self.classifier)
        ])
        self.logger.debug("Full preprocessing, SMOTENC, and classifier pipeline created.")

        return pipeline

    def inverse_transform_data(
        self,
        X_transformed: np.ndarray
    ) -> pd.DataFrame:
        """
        Perform inverse transformation on the transformed data to reconstruct original feature values.

        Numerical columns are un-scaled (imputed values are not undone), ordinal
        columns are mapped back to their categories, and each one-hot block is
        collapsed back to a single category column.

        Args:
            X_transformed (np.ndarray): Output of the fitted preprocessor; a
                sparse matrix is accepted and converted to a dense array.

        Returns:
            pd.DataFrame: The inverse-transformed DataFrame.

        Raises:
            AttributeError: If the pipeline has not been trained yet.

        Example:
            X_inverse = pipeline.inverse_transform_data(X_transformed)
        """
        if self.pipeline is None:
            raise AttributeError("Pipeline has not been trained. Please run the training process first.")

        preprocessor = self.pipeline.named_steps['preprocessor']
        self.logger.debug("Starting inverse transformation.")

        # ColumnTransformer may return a sparse matrix when one-hot columns
        # dominate; a mean-centred scaler cannot be inverted on sparse input.
        if hasattr(X_transformed, 'toarray'):
            X_transformed = X_transformed.toarray()

        inverse_data = {}
        for name, transformer, features, start_idx, end_idx in self._iter_transformer_slices(
                preprocessor, action='inversion'):
            if start_idx == end_idx:
                # Empty feature group: its transformer was never fitted.
                continue

            block = X_transformed[:, start_idx:end_idx]
            if name == 'num':
                numerical_inverse = transformer.named_steps['scaler'].inverse_transform(block)
                inverse_data.update({feature: numerical_inverse[:, idx] for idx, feature in enumerate(features)})
                self.logger.debug(f"Numerical features {features} inverse transformed.")
            elif name == 'ord':
                ordinal_inverse = transformer.named_steps['ordinal_encoder'].inverse_transform(block)
                inverse_data.update({feature: ordinal_inverse[:, idx] for idx, feature in enumerate(features)})
                self.logger.debug(f"Ordinal features {features} inverse transformed.")
            else:
                # One-hot block: several columns collapse back into one feature.
                nominal_inverse = transformer.named_steps['onehot_encoder'].inverse_transform(block)
                inverse_data.update({feature: nominal_inverse[:, 0] for feature in features})
                self.logger.debug(f"Nominal features {features} inverse transformed.")

        # Create the inverse-transformed DataFrame
        inverse_df = pd.DataFrame(inverse_data)
        self.logger.debug("Inverse-transformed DataFrame constructed.")
        self.logger.info("✅ Inverse transformation completed successfully.")

        return inverse_df

    def predict(self, X_new: pd.DataFrame) -> dict:
        """
        Preprocess new data, perform predictions, and inverse transform for interpretability.

        Args:
            X_new (pd.DataFrame): New data for prediction.

        Returns:
            dict: 'X_preprocessed' holds the preprocessor output; 'X_inverse' holds
            the inverse-transformed DataFrame, indexed like `X_new`, with a
            'predictions' column.

        Raises:
            AttributeError: If the pipeline has not been trained yet.

        Example:
            prediction_output = pipeline.predict(X_new)
        """
        self.logger.debug("Starting prediction process.")

        if self.pipeline is None:
            self.logger.error("Pipeline has not been trained. Please run the training process first.")
            raise AttributeError("Pipeline has not been trained. Please run the training process first.")

        # Make predictions
        y_pred = self.pipeline.predict(X_new)
        self.logger.info("✅ Predictions made on new data.")

        # Transform and inverse transform the new data
        X_preprocessed = self.pipeline.named_steps['preprocessor'].transform(X_new)
        X_inverse = self.inverse_transform_data(
            X_transformed=X_preprocessed
        )
        self.logger.debug("Inverse transformation applied to new data.")

        # Keep the caller's row labels, as train() does for the test set.
        X_inverse.index = X_new.index

        # Attach predictions to the inverse-transformed DataFrame
        X_inverse['predictions'] = y_pred
        self.logger.debug("Predictions attached to inverse-transformed DataFrame.")

        return {
            'X_preprocessed': X_preprocessed,
            'X_inverse': X_inverse
        }

    def train(self, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> dict:
        """
        Train the preprocessing, resampling, and classification pipeline.

        Accuracy is measured on the test set. Precision, recall, F1, and ROC-AUC
        are means over 5-fold cross-validation on the training set.
        NOTE(review): the scorer names used ('precision', 'recall', 'f1',
        'roc_auc') appear to assume a binary target; confirm before using a
        multiclass target.

        Side effects: prints the class distribution and classification report,
        shows a confusion-matrix plot, and writes 'performance_metrics.csv'.

        Args:
            X_train (pd.DataFrame): Training feature set.
            y_train (pd.Series): Training target variable.
            X_test (pd.DataFrame): Testing feature set.
            y_test (pd.Series): Testing target variable.

        Returns:
            dict: Dictionary containing preprocessed and inverse-transformed datasets and performance metrics.

        Example:
            training_output = pipeline.train(X_train, y_train, X_test, y_test)
        """
        self.logger.debug("Starting training process.")

        # Print class distribution in training set
        print("Training Set Class Distribution:")
        print(y_train.value_counts())

        # Create and fit the preprocessing, resampling, and modeling pipeline
        self.pipeline = self.create_preprocessing_pipeline(X_train)
        self.pipeline.fit(X_train, y_train)
        classifier_name = type(self.classifier).__name__
        self.logger.info(f"✅ Preprocessing, SMOTENC, and {classifier_name} pipeline fitted on training data.")

        # Make predictions on the test set
        y_pred = self.pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # One cross-validation pass scores all four metrics on the same folds,
        # instead of refitting the whole pipeline once per metric.
        cv_scores = cross_validate(
            self.pipeline, X_train, y_train,
            scoring=['precision', 'recall', 'f1', 'roc_auc'],
            cv=5
        )
        precision = cv_scores['test_precision'].mean()
        recall = cv_scores['test_recall'].mean()
        f1 = cv_scores['test_f1'].mean()
        roc_auc = cv_scores['test_roc_auc'].mean()

        # Log performance metrics
        self.logger.info(f"✅ Model Accuracy on Test Set: {accuracy:.2f}")
        self.logger.info(f"✅ Model Precision (5-fold CV on training set): {precision:.2f}")
        self.logger.info(f"✅ Model Recall (5-fold CV on training set): {recall:.2f}")
        self.logger.info(f"✅ Model F1-Score (5-fold CV on training set): {f1:.2f}")
        self.logger.info(f"✅ Model ROC-AUC (5-fold CV on training set): {roc_auc:.2f}")

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()
        plt.title("Confusion Matrix")
        plt.show()

        # Store performance metrics
        performance_metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        }

        # Transform the training and test sets
        fitted_preprocessor = self.pipeline.named_steps['preprocessor']
        X_train_preprocessed = fitted_preprocessor.transform(X_train)
        X_test_preprocessed = fitted_preprocessor.transform(X_test)

        # Inverse transform the test set for interpretability
        X_test_inverse = self.inverse_transform_data(
            X_transformed=X_test_preprocessed
        )
        self.logger.info("✅ Inverse transformation applied to test data.")

        # Preserve the original indexing
        X_test_inverse.index = X_test.index
        self.logger.debug("Index preserved for inverse-transformed test data.")

        # Optionally, save performance metrics to a file for monitoring
        performance_df = pd.DataFrame([performance_metrics])
        performance_df.to_csv('performance_metrics.csv', index=False)
        self.logger.info("✅ Performance metrics saved to 'performance_metrics.csv'.")

        return {
            'X_train_preprocessed': X_train_preprocessed,
            'X_test_preprocessed': X_test_preprocessed,
            'y_train_preprocessed': y_train,
            'y_test_preprocessed': y_test,
            'X_test_inverse': X_test_inverse,
            'performance_metrics': performance_metrics
        }

# ----------------------------
# Step 3: Define the Main Execution Logic
# ----------------------------
def main():
    """
    Main function to demonstrate training and prediction using the PreprocessingPipeline.

    Builds a small synthetic dataset from a fixed seed, trains the pipeline,
    saves it with joblib, reloads it, and runs a prediction on two example rows.
    Expects a 'config.yaml' file in the working directory and writes several
    output files there: the saved pipeline and the inverse-transformed CSVs.
    """
    # ----------------------------
    # Training Phase
    # ----------------------------
    numerical_features = ['age']  # Update as needed
    ordinal_features = ['education_level', 'experience']
    nominal_features = ['gender', 'city', 'department']
    target_variable = ["salary"]
    target_column = target_variable[0]

    # Initialize the PreprocessingPipeline with configuration file
    pipeline = PreprocessingPipeline(
        config_path='config.yaml',
        numerical_features=numerical_features,
        ordinal_features=ordinal_features,
        nominal_features=nominal_features,
        target_variable=target_variable,
    )

    # Sample Training Data; replace with your actual data
    data = {
        'age': [25, 32, 47, 51, 22, 33, 45, 52],
        'salary': [0, 1, 1, 1, 0, 1, 1, 1],  # Binary target for Logistic Regression
        'education_level': ['Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters', 'PhD', 'Bachelors', 'Masters'],
        'experience': ['Junior', 'Mid', 'Senior', 'Senior', 'Mid', 'Senior', 'Junior', 'Mid'],
        'gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female'],
        'city': ['New York', 'Chicago', 'Los Angeles', 'Houston', 'Chicago', 'Houston', 'New York', 'Los Angeles'],
        'department': ['Sales', 'Engineering', 'HR', 'Engineering', 'Sales', 'HR', 'Sales', 'Engineering']
    }

    # Expanding the dataset with more rows
    additional_rows = 50

    # A local, seeded generator makes the demo reproducible (matching the
    # random_state=42 used for the split) without touching NumPy's global state.
    rng = np.random.default_rng(42)
    expanded_data = {
        'age': rng.integers(20, 60, additional_rows),
        'salary': rng.choice([0, 1], additional_rows),
        'education_level': rng.choice(['Bachelors', 'Masters', 'PhD'], additional_rows),
        'experience': rng.choice(['Junior', 'Mid', 'Senior'], additional_rows),
        'gender': rng.choice(['Male', 'Female'], additional_rows),
        'city': rng.choice(['New York', 'Chicago', 'Los Angeles', 'Houston'], additional_rows),
        'department': rng.choice(['Sales', 'Engineering', 'HR', 'Marketing', 'Finance'], additional_rows),
    }

    # Create new DataFrame and append to existing data
    df_pre_expansion = pd.DataFrame(data)
    df = pd.concat([df_pre_expansion, pd.DataFrame(expanded_data)], ignore_index=True)

    # Define target and features
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.25,
        random_state=42,
        stratify=y  # Ensure proportional representation of classes
    )
    pipeline.logger.info("✅ Data split into training and testing sets successfully.")

    # Fail early, with a clear message, if a configured column is missing;
    # the prediction phase below performs the same check on its input.
    pipeline.categorize_features(X_train)

    # Train the pipeline and model
    training_output = pipeline.train(X_train, y_train, X_test, y_test)
    pipeline.logger.info("✅ Training completed.")

    # Save the pipeline and model
    joblib.dump(pipeline, 'preprocessing_pipeline.joblib')
    pipeline.logger.info("✅ Preprocessing pipeline and model saved as 'preprocessing_pipeline.joblib'.")

    # Save the inverse-transformed test set for interpretability
    training_output['X_test_inverse'].to_csv('X_test_inverse.csv')
    pipeline.logger.info("✅ Inverse-transformed test set saved as 'X_test_inverse.csv'.")

    # ----------------------------
    # Prediction Phase
    # ----------------------------
    # Load the trained pipeline and model
    loaded_pipeline = joblib.load('preprocessing_pipeline.joblib')
    loaded_pipeline.logger.info("✅ Preprocessing pipeline and model loaded from 'preprocessing_pipeline.joblib'.")

    # Example Prediction Data
    prediction_data = {
        'age': [28, 40],
        'education_level': ['Masters', 'Bachelors'],
        'experience': ['Mid', 'Senior'],
        'gender': ['Female', 'Male'],
        'city': ['Chicago', 'Houston'],
        'department': ['Engineering', 'Sales']
    }
    X_new = pd.DataFrame(prediction_data)

    # Categorize features and verify
    loaded_pipeline.categorize_features(X_new)

    # Perform prediction preprocessing and inverse transformation
    prediction_output = loaded_pipeline.predict(X_new)
    loaded_pipeline.logger.info("✅ Prediction data preprocessed and predictions made successfully.")

    # Save the inverse-transformed prediction data for interpretability
    prediction_output['X_inverse'].to_csv('X_inverse_prediction.csv')
    loaded_pipeline.logger.info("✅ Inverse-transformed prediction data saved as 'X_inverse_prediction.csv'.")

    # Display the inverse-transformed prediction data
    print("\nInverse Transformed Prediction DataFrame with Predictions:")
    print(prediction_output['X_inverse'])


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()