Add MeanEncoderTransform (#413)
* add MeanEncoderTransform

* fix fit

* fix code

* final

* fix mode name

* resolve changelog

* resolve changelog

* fix all

* add comments

* satisfy mypy

* add tests, fix docs

* fix

* fix

---------

Co-authored-by: Egor Baturin <egoriyaa@github.com>
egoriyaa and Egor Baturin committed Jul 12, 2024
1 parent 1bd32a7 commit 12f19fb
Showing 8 changed files with 684 additions and 1 deletion.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
- Add `get_anomalies_mad` function for anomaly detection ([#398](https://github.com/etna-team/etna/pull/398))
- Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405))
- Add `MADOutlierTransform` class for anomaly detection ([#415](https://github.com/etna-team/etna/pull/415))
-
- Add `MeanEncoderTransform` ([#413](https://github.com/etna-team/etna/pull/413))

### Changed
- Allow to change `device`, `batch_size` and `num_workers` of embedding models ([#396](https://github.com/etna-team/etna/pull/396))
1 change: 1 addition & 0 deletions docs/source/api_reference/transforms.rst
@@ -52,6 +52,7 @@ Categorical encoding transforms:
   :template: class.rst

   SegmentEncoderTransform
   MeanEncoderTransform
   MeanSegmentEncoderTransform
   LabelEncoderTransform
   OneHotEncoderTransform
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
@@ -21,6 +21,7 @@
from etna.transforms.embeddings import EmbeddingSegmentTransform
from etna.transforms.embeddings import EmbeddingWindowTransform
from etna.transforms.encoders import LabelEncoderTransform
from etna.transforms.encoders import MeanEncoderTransform
from etna.transforms.encoders import MeanSegmentEncoderTransform
from etna.transforms.encoders import OneHotEncoderTransform
from etna.transforms.encoders import SegmentEncoderTransform
1 change: 1 addition & 0 deletions etna/transforms/encoders/__init__.py
@@ -1,4 +1,5 @@
from etna.transforms.encoders.categorical import LabelEncoderTransform
from etna.transforms.encoders.categorical import OneHotEncoderTransform
from etna.transforms.encoders.mean_encoder import MeanEncoderTransform
from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform
from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform
313 changes: 313 additions & 0 deletions etna/transforms/encoders/mean_encoder.py
@@ -0,0 +1,313 @@
import reprlib
from enum import Enum
from typing import Dict
from typing import List
from typing import Optional
from typing import Union
from typing import cast

import numpy as np
import pandas as pd
from bottleneck import nanmean

from etna.datasets import TSDataset
from etna.distributions import BaseDistribution
from etna.distributions import FloatDistribution
from etna.transforms import IrreversibleTransform


class EncoderMode(str, Enum):
    """Enum for different encoding strategies."""

    per_segment = "per-segment"
    macro = "macro"

    @classmethod
    def _missing_(cls, value):
        raise ValueError(f"The strategy '{value}' doesn't exist")


class MissingMode(str, Enum):
    """Enum for strategies of handling missing values."""

    category = "category"
    global_mean = "global_mean"

    @classmethod
    def _missing_(cls, value):
        raise NotImplementedError(
            f"{value} is not a valid {cls.__name__}. Supported types: {', '.join([repr(m.value) for m in cls])}"
        )


class MeanEncoderTransform(IrreversibleTransform):
    """
    Makes encoding of categorical feature.

    For timestamps that are before the last timestamp seen in ``fit``, transformations are made using the formula below:

    .. math::
        \\frac{TargetSum + RunningMean \\cdot Smoothing}{FeatureCount + Smoothing}

    where

    * :math:`TargetSum` is the sum of the target over previous occurrences of the current category, not including the current timestamp;
    * :math:`RunningMean` is the mean of the target up to the current timestamp, not including the current timestamp;
    * :math:`FeatureCount` is the number of previous occurrences of the current category value, not including the current timestamp.

    For future timestamps:

    * for known categories the encoding is filled with the global mean of the target for these categories calculated during ``fit``;
    * for unknown categories the encoding is filled with the global mean of the target in the whole dataset calculated during ``fit``.

    All types of NaN values are considered as one category.
    """

    idx = pd.IndexSlice

    def __init__(
        self,
        in_column: str,
        out_column: str,
        mode: Union[EncoderMode, str] = "per-segment",
        handle_missing: str = MissingMode.category,
        smoothing: int = 1,
    ):
        """
        Init MeanEncoderTransform.

        Parameters
        ----------
        in_column:
            categorical column to apply the transform to
        out_column:
            name of the added column
        mode:
            mode to encode segments

            * 'per-segment' - statistics are calculated for each segment individually

            * 'macro' - statistics are calculated across all segments; in this mode the transform can work with new segments that were not seen during ``fit``
        handle_missing:
            mode to handle missing values in ``in_column``

            * 'category' - NaNs are interpreted as a separate categorical feature

            * 'global_mean' - NaNs are filled with the running mean
        smoothing:
            smoothing parameter
        """
        super().__init__(required_features=["target", in_column])
        self.in_column = in_column
        self.out_column = out_column
        self.mode = EncoderMode(mode)
        self.handle_missing = MissingMode(handle_missing)
        self.smoothing = smoothing

        self._global_means: Optional[Union[float, Dict[str, float]]] = None
        self._global_means_category: Optional[Union[Dict[str, float], Dict[str, Dict[str, float]]]] = None
        self._last_timestamp: Union[pd.Timestamp, int, None] = None

    def _fit(self, df: pd.DataFrame) -> "MeanEncoderTransform":
        """
        Fit encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the expanding mean target encoder

        Returns
        -------
        :
            Fitted transform
        """
        df.loc[:, pd.IndexSlice[:, self.in_column]] = df.loc[:, pd.IndexSlice[:, self.in_column]].fillna(np.NaN)

        if self.mode is EncoderMode.per_segment:
            axis = 0
            segments = df.columns.get_level_values("segment").unique().tolist()
            global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=axis)
            global_means = dict(zip(segments, global_means))

            global_means_category = {}
            for segment in segments:
                segment_df = TSDataset.to_flatten(df.loc[:, pd.IndexSlice[segment, :]])
                global_means_category[segment] = (
                    segment_df[[self.in_column, "target"]]
                    .groupby(self.in_column, dropna=False)
                    .mean()
                    .to_dict()["target"]
                )
        else:
            axis = None
            global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=axis)

            segment_df = TSDataset.to_flatten(df)
            global_means_category = (
                segment_df[[self.in_column, "target"]].groupby(self.in_column, dropna=False).mean().to_dict()["target"]
            )

        self._global_means = global_means
        self._global_means_category = global_means_category
        self._last_timestamp = df.index[-1]

        return self

    @staticmethod
    def _count_macro_running_mean(df, n_segments):
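        """Compute the expanding mean of the target across all segments.

        Expects ``df`` to be a flattened dataframe sorted by (timestamp, segment).
        Returns a series aligned with ``df`` holding, for each row, the mean of the target
        over all segments at all earlier timestamps; the shift by ``n_segments`` excludes
        the current timestamp, so values at the first timestamp are NaN.
        """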
        y = df["target"]
        timestamp_count = y.groupby(df["timestamp"]).transform("count")
        timestamp_sum = y.groupby(df["timestamp"]).transform("sum")
        expanding_mean = timestamp_sum.iloc[::n_segments].cumsum() / timestamp_count.iloc[::n_segments].cumsum()
        expanding_mean = expanding_mean.repeat(n_segments)
        # first timestamp is NaN
        expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments)
        return expanding_mean

    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get encoded values for the segment.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        :
            result dataframe

        Raises
        ------
        ValueError:
            If transform isn't fitted.
        NotImplementedError:
            If there are segments that weren't present during training.
        """
        if self._global_means is None:
            raise ValueError("The transform isn't fitted!")

        segments = df.columns.get_level_values("segment").unique().tolist()
        n_segments = len(segments)
        if self.mode is EncoderMode.per_segment:
            self._global_means = cast(Dict[str, float], self._global_means)
            new_segments = set(segments) - self._global_means.keys()
            if len(new_segments) > 0:
                raise NotImplementedError(
                    f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
                )
        df.loc[:, self.idx[:, self.in_column]] = df.loc[:, self.idx[:, self.in_column]].fillna(np.NaN)

        future_timestamps = df.index[df.index > self._last_timestamp]
        intersected_timestamps = df.index[df.index <= self._last_timestamp]

        intersected_df = df.loc[intersected_timestamps, self.idx[:, :]]
        future_df = df.loc[future_timestamps, self.idx[:, :]]

        if len(intersected_df) > 0:
            if self.mode is EncoderMode.per_segment:
                for segment in segments:
                    segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]])
                    y = segment_df["target"]
                    # first timestamp is NaN
                    expanding_mean = y.expanding().mean().shift()
                    # cumcount not including current timestamp
                    cumcount = y.groupby(segment_df[self.in_column].astype(str)).agg("cumcount")
                    # cumsum not including current timestamp
                    cumsum = (
                        y.groupby(segment_df[self.in_column].astype(str))
                        .transform(lambda x: x.shift().cumsum())
                        .fillna(0)
                    )
                    feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing)
                    if self.handle_missing is MissingMode.global_mean:
                        nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index
                        feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index]
                    intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values

            else:
                flatten = TSDataset.to_flatten(intersected_df)
                flatten = flatten.sort_values(["timestamp", "segment"])
                running_mean = self._count_macro_running_mean(flatten, n_segments)

                temp = pd.DataFrame(index=flatten.index, columns=["cumsum", "cumcount"], dtype=float)

                timestamps = intersected_df.index
                categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel())

                cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories})
                cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps))
                for _ in range(len(timestamps)):
                    timestamp_df = flatten.loc[cur_timestamp_idx]
                    # statistics from previous timestamp
                    cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values)
                    cumcount_dict = dict(cumstats[[self.in_column, "count"]].values)
                    # map categories for current timestamp to statistics
                    temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict)
                    temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict)
                    # count statistics for current timestamp
                    stats = (
                        timestamp_df["target"]
                        .groupby(timestamp_df[self.in_column], dropna=False)
                        .agg(["count", "sum"])
                        .reset_index()
                    )
                    # sum current and previous statistics
                    cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum()
                    cur_timestamp_idx += 1

                feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing)
                if self.handle_missing is MissingMode.global_mean:
                    nan_feature_index = flatten[flatten[self.in_column].isnull()].index
                    feature.loc[nan_feature_index] = running_mean.loc[nan_feature_index]

                feature = pd.DataFrame(
                    feature.values.reshape(len(timestamps), n_segments),
                    columns=pd.MultiIndex.from_product([segments, [self.out_column]]),
                    index=intersected_df.index,
                )
                intersected_df = pd.concat([intersected_df, feature], axis=1)

        if len(future_df) > 0:
            n_timestamps = len(future_df.index)
            if self.mode is EncoderMode.per_segment:
                self._global_means_category = cast(Dict[str, Dict[str, float]], self._global_means_category)
                self._global_means = cast(Dict[str, float], self._global_means)
                for segment in segments:
                    segment_df = TSDataset.to_flatten(future_df.loc[:, self.idx[segment, :]])
                    feature = segment_df[self.in_column].map(self._global_means_category[segment])
                    feature = feature.fillna(self._global_means[segment])
                    future_df.loc[:, self.idx[segment, self.out_column]] = feature.values
            else:
                flatten = TSDataset.to_flatten(future_df)
                feature = flatten[self.in_column].map(self._global_means_category)
                feature = feature.fillna(self._global_means)
                feature = pd.DataFrame(
                    feature.values.reshape(len(segments), n_timestamps).T,
                    columns=pd.MultiIndex.from_product([segments, [self.out_column]]),
                    index=future_df.index,
                )
                future_df = pd.concat([future_df, feature], axis=1)

        intersected_df = intersected_df.sort_index(axis=1)
        future_df = future_df.sort_index(axis=1)
        transformed_df = pd.concat((intersected_df, future_df), axis=0)
        return transformed_df

    def get_regressors_info(self) -> List[str]:
        """Return the list with regressors created by the transform."""
        return [self.out_column]

    def params_to_tune(self) -> Dict[str, BaseDistribution]:
        """Get default grid for tuning hyperparameters.

        This grid tunes the ``smoothing`` parameter. Other parameters are expected to be set by the user.

        Returns
        -------
        :
            Grid to tune.
        """
        return {"smoothing": FloatDistribution(low=0, high=2)}