## **Detailed Implementation of Inverse Transformation**

To effectively implement inverse transformations within your preprocessing pipeline, follow these detailed steps:

### **A. Configure Logging (Optional but Recommended)**
Setting up logging aids in tracking the transformation steps and debugging.

```python
import logging

def configure_logging(debug: bool = False):
    """Configure root logging with a single stream handler.

    Args:
        debug: When truthy the root level is DEBUG, otherwise INFO.

    Note:
        ``logging.basicConfig`` only takes effect while the root logger has
        no handlers attached.
    """
    if debug:
        verbosity = logging.DEBUG
    else:
        verbosity = logging.INFO

    # Records are rendered as "<timestamp> [<LEVEL>] <message>".
    stream_handler = logging.StreamHandler()
    logging.basicConfig(
        level=verbosity,
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[stream_handler],
    )
```

### **B. Define the Preprocessing Pipeline**

Utilize Scikit-learn’s `Pipeline` and `ColumnTransformer` to structure your preprocessing steps systematically.

```python
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer

def create_preprocessing_pipeline(numerical_features, ordinal_features, nominal_features):
    """Build an unfitted ``ColumnTransformer`` with one sub-pipeline per feature group.

    Args:
        numerical_features: Columns routed to the 'num' branch
            (mean imputation, standard scaling, Yeo-Johnson transform).
        ordinal_features: Columns routed to the 'ord_cat' branch
            (most-frequent imputation, ordinal encoding).
        nominal_features: Columns routed to the 'nom_cat' branch
            (most-frequent imputation, one-hot encoding).

    Returns:
        The assembled ``ColumnTransformer``; columns not listed are dropped.
    """
    # Step names are looked up later during inverse transformation.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        # Optional, depending on skewness; no second standardisation here.
        ('power_transform', PowerTransformer(method='yeo-johnson', standardize=False)),
    ]
    ordinal_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder', OrdinalEncoder()),
    ]
    nominal_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during fit are ignored instead of raising.
        ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Branch order fixes the column order of the transformed matrix.
    column_branches = [
        ('num', Pipeline(steps=numeric_steps), numerical_features),
        ('ord_cat', Pipeline(steps=ordinal_steps), ordinal_features),
        ('nom_cat', Pipeline(steps=nominal_steps), nominal_features),
    ]
    # Adjust remainder to 'passthrough' if unlisted columns must be kept.
    return ColumnTransformer(transformers=column_branches, remainder='drop')
```

### **C. Fit the Pipeline to Training Data**

Ensure that all transformations are fitted only on the training data to prevent data leakage.

```python
def fit_pipeline(preprocessor, X_train):
    """Wrap ``preprocessor`` in a one-step ``Pipeline`` and fit it on ``X_train``.

    Args:
        preprocessor: The transformer stored under the step name 'preprocessor'.
        X_train: Training features used for fitting.

    Returns:
        The fitted ``Pipeline``.
    """
    # Further steps (e.g. SMOTE or a model) could be appended to this list.
    steps = [('preprocessor', preprocessor)]
    # ``Pipeline.fit`` returns the pipeline itself.
    return Pipeline(steps=steps).fit(X_train)
```

### **D. Implement the Inverse Transformation Function**

Encapsulate the inverse transformation logic within a dedicated function to automate the process.

```python
def inverse_transform_pipeline(pipeline, X_transformed, numerical_features, ordinal_features, nominal_features):
    """Map a transformed feature matrix back to the original feature space.

    The matrix is sliced positionally: the first ``len(numerical_features)``
    columns are treated as numerical, the next ``len(ordinal_features)`` as
    ordinal, and every remaining column as one-hot output. This matches the
    order in which the 'num', 'ord_cat' and 'nom_cat' transformers are
    registered.

    Args:
        pipeline: Fitted pipeline exposing a 'preprocessor' step whose
            ``named_transformers_`` contain 'num', 'ord_cat' and 'nom_cat'.
        X_transformed: Output of ``pipeline.transform``; a dense 2-D array or
            a scipy sparse matrix.
        numerical_features: Names of the numerical columns.
        ordinal_features: Names of the ordinal columns.
        nominal_features: Names of the nominal columns.

    Returns:
        A DataFrame with the ordinal, nominal and numerical columns (in that
        order) and a default integer index.

    Note:
        Only the power transform, scaler and encoders are inverted; the
        imputer steps are not, so imputed values stay imputed.
    """
    logger = logging.getLogger('InverseTransform')

    # ColumnTransformer stacks its output as a scipy sparse matrix when the
    # overall density is low (many one-hot columns). The inverse steps below
    # need a dense array, so densify before slicing.
    if hasattr(X_transformed, 'toarray'):
        logger.debug("Converting sparse transformed matrix to a dense array.")
        X_transformed = X_transformed.toarray()

    transformers = pipeline.named_steps['preprocessor'].named_transformers_

    # Column boundaries of the three groups
    num_len = len(numerical_features)
    ord_end = num_len + len(ordinal_features)

    # Inverse transform numerical features: undo the steps in reverse order
    # (power transform was applied last, so it is inverted first).
    numeric_steps = transformers['num'].named_steps
    unpowered = numeric_steps['power_transform'].inverse_transform(X_transformed[:, :num_len])
    numerical_inverse = numeric_steps['scaler'].inverse_transform(unpowered)

    # Inverse transform ordinal categorical features
    ordinal_encoder = transformers['ord_cat'].named_steps['ordinal_encoder']
    ordinal_inverse = ordinal_encoder.inverse_transform(X_transformed[:, num_len:ord_end])

    # Inverse transform nominal categorical features (all remaining columns)
    onehot_encoder = transformers['nom_cat'].named_steps['onehot_encoder']
    nominal_inverse = onehot_encoder.inverse_transform(X_transformed[:, ord_end:])

    # Reconstruct the DataFrame
    inverse_df = pd.DataFrame(numerical_inverse, columns=numerical_features)
    inverse_ord_df = pd.DataFrame(ordinal_inverse, columns=ordinal_features)
    inverse_nom_df = pd.DataFrame(nominal_inverse, columns=nominal_features)

    # Combine all inverse transformed data
    combined_df = pd.concat([inverse_ord_df, inverse_nom_df, inverse_df], axis=1)

    logger.info("Inverse transformation completed successfully.")

    return combined_df
```

### **E. Validate the Inverse Transformation**

Ensure that the inverse-transformed data closely matches the original data, accounting for minor numerical discrepancies.

```python
def validate_inverse(original_df: pd.DataFrame, inverse_df: pd.DataFrame, numericals: list, categorical_features: list, tolerance: float = 1e-4):
    """Print a per-column report of mismatches between original and inverse data.

    Rows are compared by position. Categorical columns are compared as
    strings; numerical columns differ when the absolute gap exceeds
    ``tolerance`` or when exactly one side is missing.

    Args:
        original_df: Data before transformation.
        inverse_df: Data obtained from the inverse transformation.
        numericals: Names of numerical columns to compare.
        categorical_features: Names of categorical columns to compare.
        tolerance: Largest absolute numerical difference treated as equal.

    Returns:
        None; the report is written with ``print``.
    """
    indices_aligned = original_df.index.equals(inverse_df.index)
    if not indices_aligned:
        if len(original_df) != len(inverse_df):
            # A row-wise comparison is impossible with different row counts.
            print("Warning: Indices of original and inverse transformed data do not match.")
            return
        # Label-based alignment would raise for the string comparison and
        # yield all-NaN gaps for the numeric one, so compare by position.
        original_df = original_df.reset_index(drop=True)
        inverse_df = inverse_df.reset_index(drop=True)

    # One boolean mask per column, reused for both the summary and the detail.
    masks = {}
    for col in categorical_features:
        masks[col] = original_df[col].astype(str) != inverse_df[col].astype(str)
    for col in numericals:
        expected, actual = original_df[col], inverse_df[col]
        # NaN > tolerance is False, so a value missing on one side only
        # (e.g. an imputed entry) has to be flagged explicitly.
        masks[col] = ((expected - actual).abs() > tolerance) | (expected.isna() ^ actual.isna())

    row_count = len(original_df)
    differences = {}
    for col, mask in masks.items():
        mismatches = int(mask.sum())
        differences[col] = {
            'total_differences': mismatches,
            # Guard against division by zero on empty frames.
            'percentage_differences': (mismatches / row_count) * 100 if row_count else 0.0
        }

    # Display the differences
    for col, stats in differences.items():
        print(f"Column: {col}")
        print(f" - Total Differences: {stats['total_differences']}")
        print(f" - Percentage Differences: {stats['percentage_differences']:.2f}%\n")

    # Detailed differences
    for col, stats in differences.items():
        if stats['total_differences'] > 0:
            print(f"Differences found in column '{col}':")
            mask = masks[col]
            comparison = pd.concat([
                original_df.loc[mask, col].reset_index(drop=True).rename('Original'),
                inverse_df.loc[mask, col].reset_index(drop=True).rename('Inverse Transformed')
            ], axis=1)
            print(comparison)
            print("\n")

    # Check if indices are aligned
    if not indices_aligned:
        print("Warning: Indices of original and inverse transformed data do not match.")
    else:
        print("Success: Indices of original and inverse transformed data are aligned.")
```

### **F. Comprehensive Example**

Putting it all together, here's a complete example demonstrating the integration of inverse transformation into your preprocessing pipeline.

```python
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
import logging

# Configure logging
def configure_logging(debug: bool = False):
    """Initialise root logging; DEBUG level when ``debug`` is truthy, else INFO.

    Output goes through one ``StreamHandler`` using the format
    ``<asctime> [<levelname>] <message>``.
    """
    level_by_mode = {True: logging.DEBUG, False: logging.INFO}
    logging.basicConfig(
        level=level_by_mode[bool(debug)],
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[logging.StreamHandler()],
    )

# Define the Preprocessing Pipeline
def create_preprocessing_pipeline(numerical_features, ordinal_features, nominal_features):
    """Assemble the unfitted column-wise preprocessor.

    Each feature group gets its own sub-pipeline, registered under the names
    'num', 'ord_cat' and 'nom_cat'. Columns outside the three lists are
    dropped.

    Args:
        numerical_features: Columns to impute (mean), scale and power-transform.
        ordinal_features: Columns to impute (most frequent) and ordinal-encode.
        nominal_features: Columns to impute (most frequent) and one-hot encode.

    Returns:
        The configured ``ColumnTransformer``.
    """
    # (branch name, columns, ordered steps) for every feature group
    branch_specs = (
        ('num', numerical_features, [
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            # Optional based on skewness
            ('power_transform', PowerTransformer(method='yeo-johnson', standardize=False)),
        ]),
        ('ord_cat', ordinal_features, [
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal_encoder', OrdinalEncoder()),
        ]),
        ('nom_cat', nominal_features, [
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
        ]),
    )

    transformers = [
        (name, Pipeline(steps=steps), columns)
        for name, columns, steps in branch_specs
    ]
    # Adjust 'drop' or 'passthrough' as needed
    return ColumnTransformer(transformers=transformers, remainder='drop')

# Fit the Pipeline
def fit_pipeline(preprocessor, X_train):
    """Fit a ``Pipeline`` whose only step, 'preprocessor', is the given transformer.

    Args:
        preprocessor: Transformer to wrap.
        X_train: Data the pipeline is fitted on.

    Returns:
        The fitted ``Pipeline`` instance.
    """
    # More steps (SMOTE, a model, ...) can be added to this list if needed.
    pipeline_steps = [('preprocessor', preprocessor)]
    fitted = Pipeline(steps=pipeline_steps)
    fitted.fit(X_train)
    return fitted

# Perform Inverse Transformation
def inverse_transform_pipeline(pipeline, X_transformed, numerical_features, ordinal_features, nominal_features):
    """Map a transformed feature matrix back to the original feature space.

    The matrix is sliced positionally: the first ``len(numerical_features)``
    columns are treated as numerical, the next ``len(ordinal_features)`` as
    ordinal, and every remaining column as one-hot output. This matches the
    order in which the 'num', 'ord_cat' and 'nom_cat' transformers are
    registered.

    Args:
        pipeline: Fitted pipeline exposing a 'preprocessor' step whose
            ``named_transformers_`` contain 'num', 'ord_cat' and 'nom_cat'.
        X_transformed: Output of ``pipeline.transform``; a dense 2-D array or
            a scipy sparse matrix.
        numerical_features: Names of the numerical columns.
        ordinal_features: Names of the ordinal columns.
        nominal_features: Names of the nominal columns.

    Returns:
        A DataFrame with the ordinal, nominal and numerical columns (in that
        order) and a default integer index.

    Note:
        Only the power transform, scaler and encoders are inverted; the
        imputer steps are not, so imputed values stay imputed.
    """
    logger = logging.getLogger('InverseTransform')

    # ColumnTransformer stacks its output as a scipy sparse matrix when the
    # overall density is low (many one-hot columns). The inverse steps below
    # need a dense array, so densify before slicing.
    if hasattr(X_transformed, 'toarray'):
        logger.debug("Converting sparse transformed matrix to a dense array.")
        X_transformed = X_transformed.toarray()

    transformers = pipeline.named_steps['preprocessor'].named_transformers_

    # Column boundaries of the three groups
    num_len = len(numerical_features)
    ord_end = num_len + len(ordinal_features)

    # Inverse transform numerical features: undo the steps in reverse order
    # (power transform was applied last, so it is inverted first).
    numeric_steps = transformers['num'].named_steps
    unpowered = numeric_steps['power_transform'].inverse_transform(X_transformed[:, :num_len])
    numerical_inverse = numeric_steps['scaler'].inverse_transform(unpowered)

    # Inverse transform ordinal categorical features
    ordinal_encoder = transformers['ord_cat'].named_steps['ordinal_encoder']
    ordinal_inverse = ordinal_encoder.inverse_transform(X_transformed[:, num_len:ord_end])

    # Inverse transform nominal categorical features (all remaining columns)
    onehot_encoder = transformers['nom_cat'].named_steps['onehot_encoder']
    nominal_inverse = onehot_encoder.inverse_transform(X_transformed[:, ord_end:])

    # Reconstruct the DataFrame
    inverse_df = pd.DataFrame(numerical_inverse, columns=numerical_features)
    inverse_ord_df = pd.DataFrame(ordinal_inverse, columns=ordinal_features)
    inverse_nom_df = pd.DataFrame(nominal_inverse, columns=nominal_features)

    # Combine all inverse transformed data
    combined_df = pd.concat([inverse_ord_df, inverse_nom_df, inverse_df], axis=1)

    logger.info("Inverse transformation completed successfully.")

    return combined_df

# Validation Function
def validate_inverse(original_df: pd.DataFrame, inverse_df: pd.DataFrame, numericals: list, categorical_features: list, tolerance: float = 1e-4):
    """Print a per-column report of mismatches between original and inverse data.

    Rows are compared by position. Categorical columns are compared as
    strings; numerical columns differ when the absolute gap exceeds
    ``tolerance`` or when exactly one side is missing.

    Args:
        original_df: Data before transformation.
        inverse_df: Data obtained from the inverse transformation.
        numericals: Names of numerical columns to compare.
        categorical_features: Names of categorical columns to compare.
        tolerance: Largest absolute numerical difference treated as equal.

    Returns:
        None; the report is written with ``print``.
    """
    indices_aligned = original_df.index.equals(inverse_df.index)
    if not indices_aligned:
        if len(original_df) != len(inverse_df):
            # A row-wise comparison is impossible with different row counts.
            print("Warning: Indices of original and inverse transformed data do not match.")
            return
        # Label-based alignment would raise for the string comparison and
        # yield all-NaN gaps for the numeric one, so compare by position.
        original_df = original_df.reset_index(drop=True)
        inverse_df = inverse_df.reset_index(drop=True)

    # One boolean mask per column, reused for both the summary and the detail.
    masks = {}
    for col in categorical_features:
        masks[col] = original_df[col].astype(str) != inverse_df[col].astype(str)
    for col in numericals:
        expected, actual = original_df[col], inverse_df[col]
        # NaN > tolerance is False, so a value missing on one side only
        # (e.g. an imputed entry) has to be flagged explicitly.
        masks[col] = ((expected - actual).abs() > tolerance) | (expected.isna() ^ actual.isna())

    row_count = len(original_df)
    differences = {}
    for col, mask in masks.items():
        mismatches = int(mask.sum())
        differences[col] = {
            'total_differences': mismatches,
            # Guard against division by zero on empty frames.
            'percentage_differences': (mismatches / row_count) * 100 if row_count else 0.0
        }

    # Display the differences
    for col, stats in differences.items():
        print(f"Column: {col}")
        print(f" - Total Differences: {stats['total_differences']}")
        print(f" - Percentage Differences: {stats['percentage_differences']:.2f}%\n")

    # Detailed differences
    for col, stats in differences.items():
        if stats['total_differences'] > 0:
            print(f"Differences found in column '{col}':")
            mask = masks[col]
            comparison = pd.concat([
                original_df.loc[mask, col].reset_index(drop=True).rename('Original'),
                inverse_df.loc[mask, col].reset_index(drop=True).rename('Inverse Transformed')
            ], axis=1)
            print(comparison)
            print("\n")

    # Check if indices are aligned
    if not indices_aligned:
        print("Warning: Indices of original and inverse transformed data do not match.")
    else:
        print("Success: Indices of original and inverse transformed data are aligned.")
```

### **G. Execute the Preprocessing and Inverse Transformation**

```python
def main():
    """Run the demo: split, fit on the training rows, invert and validate.

    'salary' is treated as the prediction target, so it is removed from the
    feature frame and is not listed among the numerical features.
    """
    # Configure logging
    configure_logging(debug=True)

    # Sample DataFrame
    data = {
        'age': [25, 32, 47, 51],
        'salary': [50000, 60000, 80000, 90000],
        'gender': ['Male', 'Female', 'Female', 'Male'],
        'education_level': ['Bachelors', 'Masters', 'PhD', 'Bachelors'],
        'city': ['New York', 'Chicago', 'Los Angeles', 'Houston']
    }

    df = pd.DataFrame(data)

    # Define feature lists. The target column must not appear here: it is
    # dropped from X below, and the ColumnTransformer fails on a missing column.
    target = 'salary'
    numerical_features = ['age']
    ordinal_features = ['education_level']  # Assuming 'education_level' has an order
    nominal_features = ['gender', 'city']

    # Create Preprocessing Pipeline
    preprocessor = create_preprocessing_pipeline(numerical_features, ordinal_features, nominal_features)

    # Split the dataset into training and testing sets.
    # No stratification: the target is continuous with one row per value, and
    # stratified splitting requires at least two members per class.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=[target]),
        df[target],
        test_size=0.2,
        random_state=42
    )

    # Fit the Pipeline
    pipeline = fit_pipeline(preprocessor, X_train)

    # Transform the training data
    X_train_transformed = pipeline.transform(X_train)

    # Perform Inverse Transformation
    inverse_train = inverse_transform_pipeline(
        pipeline=pipeline,
        X_transformed=X_train_transformed,
        numerical_features=numerical_features,
        ordinal_features=ordinal_features,
        nominal_features=nominal_features
    )

    # The inverse frame has a fresh 0..n-1 index, so compare against the
    # training rows with their shuffled index reset.
    original_train = X_train.reset_index(drop=True)

    # Display Original and Inverse Transformed DataFrames
    print("Original Training DataFrame:")
    print(original_train)

    print("\nInverse Transformed Training DataFrame:")
    print(inverse_train)

    # Validate the Inverse Transformation
    print("\nValidation Results:")
    validate_inverse(
        original_df=original_train,
        inverse_df=inverse_train,
        numericals=numerical_features,
        categorical_features=ordinal_features + nominal_features
    )
    
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
```

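**Example output** (illustrative; the exact rows and columns depend on the train/test split and on which columns are passed to the pipeline as features):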

```
Original Training DataFrame:
   age  gender education_level         city
0   25    Male      Bachelors     New York
1   32  Female        Masters      Chicago
2   47  Female             PhD  Los Angeles
3   51    Male      Bachelors      Houston

Inverse Transformed Training DataFrame:
  education_level  gender         city   age  salary
0      Bachelors    Male     New York  25.0  50000.0
1        Masters  Female      Chicago  32.0  60000.0
2             PhD  Female  Los Angeles  47.0  80000.0
3      Bachelors    Male      Houston  51.0  90000.0

Validation Results:
Column: education_level
 - Total Differences: 0
 - Percentage Differences: 0.00%

Column: gender
 - Total Differences: 0
 - Percentage Differences: 0.00%

Column: city
 - Total Differences: 0
 - Percentage Differences: 0.00%

Column: age
 - Total Differences: 0
 - Percentage Differences: 0.00%

Column: salary
 - Total Differences: 0
 - Percentage Differences: 0.00%

Success: Indices of original and inverse transformed data are aligned.
```

**Interpretation:**

- **Zero Differences:** Indicates that the inverse transformation successfully reconstructed the original data without discrepancies.
- **Aligned Indices:** Confirms that the inverse-transformed data maintains the correct row ordering.

---

## **Key Best Practices for Inverse Transformation**

1. **Leverage Named Transformers:**
   - Assign meaningful names to each transformer within the `ColumnTransformer` to facilitate easy access during inverse transformations.

2. **Maintain Column Order:**
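   - `ColumnTransformer` concatenates its outputs in the order the transformers are listed, so slice the transformed matrix in that same order (numerical, then ordinal, then one-hot columns) when applying the inverse transformations; otherwise the reconstructed DataFrame will be wrong.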

3. **Handle OneHotEncoded Features Carefully:**
   - `OneHotEncoder` can generate multiple columns for a single categorical feature. Ensure that all categories are captured during encoding to enable accurate inverse transformations.

4. **Encapsulate Inverse Logic:**
   - Create dedicated functions or methods to handle inverse transformations, promoting code reusability and reducing the risk of manual errors.

5. **Validate Inverses Rigorously:**
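   - Always compare the original and inverse-transformed data to ensure integrity, allowing for minimal numerical discrepancies due to scaling and floating-point precision. Note that imputation is not reversible: values filled in by `SimpleImputer` stay filled in after the inverse transformation, so originally missing entries cannot be recovered.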

6. **Preserve Transformers for Inversion:**
   - Keep all fitted transformers accessible for inverse transformations. Avoid re-instantiating transformers after fitting, as this can disrupt the inverse process.

7. **Automate the Process:**
   - Integrate inverse transformation steps into your pipeline or as separate functions to streamline the workflow and ensure consistency.

---

## **Conclusion**

Integrating a comprehensive inverse transformation step into your preprocessing pipeline enhances interpretability and facilitates a deeper understanding of your machine learning models. By following the detailed steps and best practices outlined above, you can ensure that your pipeline not only preprocesses data effectively but also allows for accurate reversibility when needed.

**Next Steps:**

1. **Implement the Provided Example:**
   - Test the comprehensive example in your environment to ensure it aligns with your specific data and preprocessing requirements.

2. **Extend the Pipeline:**
   - Incorporate additional preprocessing steps or modeling components as needed, ensuring that inverse transformations remain manageable.

3. **Automate Further:**
   - Consider integrating these steps into reusable functions or classes to streamline your workflow and enhance scalability.

4. **Monitor and Validate:**
   - Continuously validate inverse transformations, especially when modifying the pipeline or introducing new transformers, to maintain data integrity.

Feel free to reach out if you need further assistance or have specific questions about any step in the process!




In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
import logging

# Configure logging
def configure_logging(debug: bool = False):
    """Configure the root logger with one stream handler.

    Args:
        debug: Selects DEBUG level when truthy; INFO is used otherwise.
    """
    settings = {
        'level': logging.INFO,
        'format': '%(asctime)s [%(levelname)s] %(message)s',
        'handlers': [logging.StreamHandler()],
    }
    if debug:
        settings['level'] = logging.DEBUG
    logging.basicConfig(**settings)

# Define the Preprocessing Pipeline
def create_preprocessing_pipeline(numerical_features, ordinal_features, nominal_features):
    """Create the unfitted ``ColumnTransformer`` used for preprocessing.

    Args:
        numerical_features: Columns standard-scaled by the 'num' branch.
        ordinal_features: Columns ordinal-encoded by the 'ord_cat' branch.
        nominal_features: Columns one-hot encoded by the 'nom_cat' branch
            (categories unseen during fit are ignored).

    Returns:
        The ``ColumnTransformer``; columns in none of the lists are dropped.
    """
    # Each branch is a one-step pipeline so the step can be fetched by name.
    scaling = Pipeline(steps=[('scaler', StandardScaler())])
    ordinal_encoding = Pipeline(steps=[('ordinal_encoder', OrdinalEncoder())])
    onehot_encoding = Pipeline(steps=[('onehot_encoder', OneHotEncoder(handle_unknown='ignore'))])

    # Adjust 'drop' or 'passthrough' as needed
    return ColumnTransformer(
        transformers=[
            ('num', scaling, numerical_features),
            ('ord_cat', ordinal_encoding, ordinal_features),
            ('nom_cat', onehot_encoding, nominal_features),
        ],
        remainder='drop',
    )

# Fit the Pipeline
def fit_pipeline(preprocessor, X_train):
    """Return a fitted single-step ``Pipeline`` built around ``preprocessor``.

    Args:
        preprocessor: Transformer registered as the 'preprocessor' step.
        X_train: Data passed to ``Pipeline.fit``.

    Returns:
        The fitted ``Pipeline``.
    """
    # Extra steps such as SMOTE or a model would be appended after this one.
    wrapped = Pipeline(steps=[('preprocessor', preprocessor)])
    wrapped.fit(X_train)
    return wrapped

# Perform Inverse Transformation
def inverse_transform_pipeline(pipeline, X_transformed, numerical_features, ordinal_features, nominal_features):
    """Map a transformed feature matrix back to the original feature space.

    The matrix is sliced positionally: the first ``len(numerical_features)``
    columns are treated as numerical, the next ``len(ordinal_features)`` as
    ordinal, and every remaining column as one-hot output. This matches the
    order in which the 'num', 'ord_cat' and 'nom_cat' transformers are
    registered.

    Args:
        pipeline: Fitted pipeline exposing a 'preprocessor' step whose
            ``named_transformers_`` contain 'num', 'ord_cat' and 'nom_cat'.
        X_transformed: Output of ``pipeline.transform``; a dense 2-D array or
            a scipy sparse matrix.
        numerical_features: Names of the numerical columns.
        ordinal_features: Names of the ordinal columns.
        nominal_features: Names of the nominal columns.

    Returns:
        A DataFrame with the ordinal, nominal and numerical columns (in that
        order) and a default integer index.
    """
    logger = logging.getLogger('InverseTransform')

    # ColumnTransformer stacks its output as a scipy sparse matrix when the
    # overall density is low (many one-hot columns). The inverse steps below
    # need a dense array, so densify before slicing.
    if hasattr(X_transformed, 'toarray'):
        logger.debug("Converting sparse transformed matrix to a dense array.")
        X_transformed = X_transformed.toarray()

    transformers = pipeline.named_steps['preprocessor'].named_transformers_

    # Column boundaries of the three groups
    num_len = len(numerical_features)
    ord_end = num_len + len(ordinal_features)

    # Inverse transform numerical features
    scaler = transformers['num'].named_steps['scaler']
    numerical_inverse = scaler.inverse_transform(X_transformed[:, :num_len])

    # Inverse transform ordinal categorical features
    ordinal_encoder = transformers['ord_cat'].named_steps['ordinal_encoder']
    ordinal_inverse = ordinal_encoder.inverse_transform(X_transformed[:, num_len:ord_end])

    # Inverse transform nominal categorical features (all remaining columns)
    onehot_encoder = transformers['nom_cat'].named_steps['onehot_encoder']
    nominal_inverse = onehot_encoder.inverse_transform(X_transformed[:, ord_end:])

    # Reconstruct the DataFrame
    inverse_df = pd.DataFrame(numerical_inverse, columns=numerical_features)
    inverse_ord_df = pd.DataFrame(ordinal_inverse, columns=ordinal_features)
    inverse_nom_df = pd.DataFrame(nominal_inverse, columns=nominal_features)

    # Combine all inverse transformed data
    combined_df = pd.concat([inverse_ord_df, inverse_nom_df, inverse_df], axis=1)

    logger.info("Inverse transformation completed successfully.")

    return combined_df

# Validation Function
def validate_inverse(original_df: pd.DataFrame, inverse_df: pd.DataFrame, numericals: list, categorical_features: list, tolerance: float = 1e-4):
    """Print a per-column report of mismatches between original and inverse data.

    Rows are compared by position. Categorical columns are compared as
    strings; numerical columns differ when the absolute gap exceeds
    ``tolerance`` or when exactly one side is missing.

    Args:
        original_df: Data before transformation.
        inverse_df: Data obtained from the inverse transformation.
        numericals: Names of numerical columns to compare.
        categorical_features: Names of categorical columns to compare.
        tolerance: Largest absolute numerical difference treated as equal.

    Returns:
        None; the report is written with ``print``.
    """
    indices_aligned = original_df.index.equals(inverse_df.index)
    if not indices_aligned:
        if len(original_df) != len(inverse_df):
            # A row-wise comparison is impossible with different row counts.
            print("Warning: Indices of original and inverse transformed data do not match.")
            return
        # Label-based alignment would raise for the string comparison and
        # yield all-NaN gaps for the numeric one, so compare by position.
        original_df = original_df.reset_index(drop=True)
        inverse_df = inverse_df.reset_index(drop=True)

    # One boolean mask per column, reused for both the summary and the detail.
    masks = {}
    for col in categorical_features:
        masks[col] = original_df[col].astype(str) != inverse_df[col].astype(str)
    for col in numericals:
        expected, actual = original_df[col], inverse_df[col]
        # NaN > tolerance is False, so a value missing on one side only
        # (e.g. an imputed entry) has to be flagged explicitly.
        masks[col] = ((expected - actual).abs() > tolerance) | (expected.isna() ^ actual.isna())

    row_count = len(original_df)
    differences = {}
    for col, mask in masks.items():
        mismatches = int(mask.sum())
        differences[col] = {
            'total_differences': mismatches,
            # Guard against division by zero on empty frames.
            'percentage_differences': (mismatches / row_count) * 100 if row_count else 0.0
        }

    # Display the differences
    for col, stats in differences.items():
        print(f"Column: {col}")
        print(f" - Total Differences: {stats['total_differences']}")
        print(f" - Percentage Differences: {stats['percentage_differences']:.2f}%\n")

    # Detailed differences
    for col, stats in differences.items():
        if stats['total_differences'] > 0:
            print(f"Differences found in column '{col}':")
            mask = masks[col]
            comparison = pd.concat([original_df.loc[mask, col].reset_index(drop=True).rename('Original'),
                                    inverse_df.loc[mask, col].reset_index(drop=True).rename('Inverse Transformed')],
                                   axis=1)
            print(comparison)
            print("\n")

    # Check if indices are aligned
    if not indices_aligned:
        print("Warning: Indices of original and inverse transformed data do not match.")
    else:
        print("Success: Indices of original and inverse transformed data are aligned.")

# Comprehensive Example
def main():
    """Run the demo on the full sample frame: fit, transform, invert, validate.

    All four sample rows are used for fitting; there is no train/test split.
    """
    # Configure logging
    configure_logging(debug=True)
    log = logging.getLogger('Main')

    # Sample DataFrame
    df = pd.DataFrame({
        'age': [25, 32, 47, 51],
        'salary': [50000, 60000, 80000, 90000],
        'gender': ['Male', 'Female', 'Female', 'Male'],
        'education_level': ['Bachelors', 'Masters', 'PhD', 'Bachelors'],
        'city': ['New York', 'Chicago', 'Los Angeles', 'Houston'],
    })

    # Feature groups ('education_level' is assumed to have an order)
    numerical_features = ['age', 'salary']
    ordinal_features = ['education_level']
    nominal_features = ['gender', 'city']
    categorical = ordinal_features + nominal_features

    # Build and fit the preprocessing pipeline, then transform the same data
    fitted = fit_pipeline(
        create_preprocessing_pipeline(numerical_features, ordinal_features, nominal_features),
        df,
    )
    encoded = fitted.transform(df)

    # Map the transformed matrix back to the original feature space
    restored = inverse_transform_pipeline(
        pipeline=fitted,
        X_transformed=encoded,
        numerical_features=numerical_features,
        ordinal_features=ordinal_features,
        nominal_features=nominal_features,
    )

    # Display Original and Inverse Transformed DataFrames
    print("Original DataFrame:")
    print(df)

    print("\nInverse Transformed DataFrame:")
    print(restored)

    # Validate the Inverse Transformation
    print("\nValidation Results:")
    validate_inverse(
        original_df=df,
        inverse_df=restored,
        numericals=numerical_features,
        categorical_features=categorical,
    )

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
