In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class OrdinalMerger(BaseEstimator, TransformerMixin):
    """
    Custom transformer that merges sparse adjacent levels of ordinal variables.

    For every column, the distinct values are walked in ascending index order
    and consecutive values are pooled until the pool holds at least
    ``min_obs`` observations. Every value in a pool is relabelled with the
    lowest value of that pool.

    Parameters:
    - min_obs: Minimum number of observations required for each merged group.
               Categories are pooled with their higher neighbours until the
               threshold is met; an undersized group left over at the top of
               the scale is folded into the group below it.

    Attributes:
    - mapping_: Set by ``fit``. ``{column: {original value: merged label}}``.
    """

    def __init__(self, min_obs=10):
        # Only hyper-parameters are stored here. The learned ``mapping_`` is
        # created by ``fit`` so that scikit-learn's fitted-state check can tell
        # a fresh instance from a fitted one.
        self.min_obs = min_obs

    @staticmethod
    def _build_mapping(value_counts, min_obs):
        """Return ``{category: merged label}`` for one column's sorted counts."""
        categories = value_counts.index.tolist()
        counts = value_counts.tolist()

        groups = []      # closed groups, each a list of categories
        pending = []     # categories accumulated for the group being built
        pending_obs = 0  # observations accumulated in ``pending``

        for cat, count in zip(categories, counts):
            pending.append(cat)
            pending_obs += count
            if pending_obs >= min_obs:
                groups.append(pending)
                pending = []
                pending_obs = 0

        if pending:
            if groups:
                # The highest categories never reached ``min_obs`` on their
                # own; there is no higher neighbour left, so merge downwards.
                groups[-1].extend(pending)
            else:
                # The whole column has fewer than ``min_obs`` observations.
                groups.append(pending)

        # A group is labelled by its first (lowest) category.
        return {cat: group[0] for group in groups for cat in group}

    def fit(self, X, y=None):
        """
        Fit the transformer to the data.

        Parameters:
        - X: A pandas DataFrame containing ordinal variables.
        - y: Ignored (compatibility with scikit-learn pipelines).

        Returns:
        - self: Fitted transformer.

        Raises:
        - ValueError: If X is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")

        # ``value_counts`` skips missing values, so a column without any
        # non-null entry simply yields an empty mapping.
        self.mapping_ = {
            col: self._build_mapping(
                X[col].value_counts().sort_index(), self.min_obs
            )
            for col in X.columns
        }

        return self

    def transform(self, X):
        """
        Transform the data by merging categories as determined during fitting.

        Columns that were not present during fitting are returned unchanged.
        Values that were not seen during fitting (including missing values)
        become NaN, because ``Series.map`` has no entry for them.

        Parameters:
        - X: A pandas DataFrame containing ordinal variables.

        Returns:
        - X_transformed: Transformed DataFrame with merged categories.

        Raises:
        - ValueError: If X is not a pandas DataFrame.
        - sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
        """
        # Imported locally so the class does not depend on extra file-level imports.
        from sklearn.utils.validation import check_is_fitted

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")

        check_is_fitted(self, "mapping_")

        X_transformed = X.copy()

        for col in X.columns:
            if col in self.mapping_:
                X_transformed[col] = X[col].map(self.mapping_[col])

        return X_transformed

# Demo: collapse sparse levels in two small ordinal columns.
data = pd.DataFrame(
    {
        "col1": [0, 1, 1, 2, 2, 2, 3, 3, 3, 3],
        "col2": [1, 2, 2, 2, 3, 3, 4, 4, 4, 4],
    }
)

transformer = OrdinalMerger(min_obs=3)
# fit() returns the transformer itself, so fitting and transforming can be chained.
transformed_data = transformer.fit(data).transform(data)

# One print call per section; sep="\n" puts the frame under its heading.
print("Original Data:", data, sep="\n")
print("\nTransformed Data:", transformed_data, sep="\n")


Original Data:
   col1  col2
0     0     1
1     1     2
2     1     2
3     2     2
4     2     3
5     2     3
6     3     4
7     3     4
8     3     4
9     3     4

Transformed Data:
   col1  col2
0     0     1
1     0     1
2     0     1
3     2     1
4     2     3
5     2     3
6     3     3
7     3     3
8     3     3
9     3     3
