diff --git a/CHANGELOG.md b/CHANGELOG.md index c9aef0dfd..546fce1f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `EmbeddingWindowTransform` ([#265](https://github.com/etna-team/etna/pull/265)) - Add `TSTCCEmbeddingModel` ([#294](https://github.com/etna-team/etna/pull/294)) - Add `210-embedding_models` example notebook ([#304](https://github.com/etna-team/etna/pull/304)) -- +- Add parameter `drop_zero` into `MRMRFeatureSelectionTransform` ([#308](https://github.com/etna-team/etna/issues/308)) - - - @@ -44,7 +44,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fix FordA download url in classification notebook ([#309](https://github.com/etna-team/etna/pull/309)) - Allow `seaborn` dependency to have higher version ([#319](https://github.com/etna-team/etna/pull/319)) -- +- Fix `MRMRFeatureSelectionTransform` to correctly handle less-is-better `relevance_table` ([#308](https://github.com/etna-team/etna/issues/308)) - - - diff --git a/etna/analysis/feature_selection/mrmr_selection.py b/etna/analysis/feature_selection/mrmr_selection.py index f861a045b..aa1036a18 100644 --- a/etna/analysis/feature_selection/mrmr_selection.py +++ b/etna/analysis/feature_selection/mrmr_selection.py @@ -35,6 +35,7 @@ def mrmr( regressors: pd.DataFrame, top_k: int, fast_redundancy: bool = False, + drop_zero: bool = False, relevance_aggregation_mode: str = AggregationMode.mean, redundancy_aggregation_mode: str = AggregationMode.mean, atol: float = 1e-10, @@ -59,6 +60,10 @@ def mrmr( fast_redundancy: * True: compute redundancy only inside the the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)` * False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)` + drop_zero: + * True: use only features with relevance > 0 in calculations, if 
their number is less than ``top_k`` + randomly selects features with zero relevance so that the total number of selected features is ``top_k`` + * False: use all features in calculations relevance_aggregation_mode: the method for relevance values per-segment aggregation redundancy_aggregation_mode: @@ -88,16 +93,26 @@ def mrmr( relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0) all_features = relevance.index.to_list() + + if top_k >= len(all_features): + return all_features.copy() + segments = set(regressors.columns.get_level_values("segment")) selected_features: List[str] = [] not_selected_features = all_features.copy() - redundancy_table = pd.DataFrame(np.inf, index=all_features, columns=all_features) - top_k = min(top_k, len(all_features)) + if drop_zero: + not_relevant_features = list(filter(lambda feature: relevance.loc[feature] == 0, not_selected_features)) + relevant_features = list(set(all_features) - set(not_relevant_features)) + if top_k >= len(relevant_features): + return relevant_features + not_relevant_features[: (top_k - len(relevant_features))] + not_selected_features = relevant_features + + redundancy_table = pd.DataFrame(1, index=all_features, columns=all_features) for i in range(top_k): score_numerator = relevance.loc[not_selected_features] - score_denominator = pd.Series(1, index=not_selected_features) + score_denominator = pd.Series(0, index=not_selected_features) if i > 0: last_selected_feature = selected_features[-1] last_selected_regressor = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, last_selected_feature]] @@ -124,14 +139,13 @@ def mrmr( redundancy_table.loc[not_selected_features, last_selected_feature] = ( segment_redundancy.agg(redundancy_aggregation_fn) .clip(atol) - .fillna(np.inf) + .fillna(1) .loc[not_selected_features] .values.squeeze() ) - score_denominator = redundancy_table.loc[not_selected_features, selected_features].mean(axis=1) - score_denominator[np.isclose(score_denominator, 1, atol=atol)] = 
np.inf - score = score_numerator / score_denominator + score_denominator = redundancy_table.loc[not_selected_features, selected_features].max(axis=1) + score = score_numerator * (1 - score_denominator) best_feature = score.index[score.argmax()] selected_features.append(best_feature) not_selected_features.remove(best_feature) diff --git a/etna/transforms/feature_selection/feature_importance.py b/etna/transforms/feature_selection/feature_importance.py index d1eef3053..1825e11a3 100644 --- a/etna/transforms/feature_selection/feature_importance.py +++ b/etna/transforms/feature_selection/feature_importance.py @@ -172,6 +172,7 @@ def __init__( top_k: int, features_to_use: Union[List[str], Literal["all"]] = "all", fast_redundancy: bool = False, + drop_zero: bool = False, relevance_aggregation_mode: str = AggregationMode.mean, redundancy_aggregation_mode: str = AggregationMode.mean, atol: float = 1e-10, @@ -190,6 +191,10 @@ def __init__( features_to_use: columns of the dataset to select from if "all" value is given, all columns are used + drop_zero: + * True: use only features with relevance > 0 in calculations, if their number is less than ``top_k``, + randomly selects features with zero relevance so that the total number of selected features is ``top_k`` + * False: use all features in calculations fast_redundancy: * True: compute redundancy only inside the segments, time complexity :math:`O(top\_k \\cdot n\_segments \\cdot n\_features \\cdot history\_len)` * False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k \\cdot n\_segments^2 \\cdot n\_features \\cdot history\_len)` @@ -209,6 +214,7 @@ def __init__( self.relevance_table = relevance_table self.top_k = top_k self.fast_redundancy = fast_redundancy + self.drop_zero = drop_zero self.relevance_aggregation_mode = relevance_aggregation_mode self.redundancy_aggregation_mode = redundancy_aggregation_mode self.atol = atol @@ -236,12 +242,16 @@ def _fit(self, df: pd.DataFrame) -> 
"MRMRFeatureSelectionTransform": relevance_table = self.relevance_table(df_target, df_features, **self.relevance_params) if not self.relevance_table.greater_is_better: - relevance_table *= -1 + min_relevance = relevance_table.values.min() + max_relevance = relevance_table.values.max() + relevance_table = max_relevance + min_relevance - relevance_table + self.selected_features = mrmr( relevance_table=relevance_table, regressors=df_features, top_k=self.top_k, fast_redundancy=self.fast_redundancy, + drop_zero=self.drop_zero, relevance_aggregation_mode=self.relevance_aggregation_mode, redundancy_aggregation_mode=self.redundancy_aggregation_mode, atol=self.atol, diff --git a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py index c1a7dd409..2920c066d 100644 --- a/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_selection/test_feature_importance_transform.py @@ -319,6 +319,94 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors, fast_redunda assert set(selected_regressors) == {"regressor_useful_0", "regressor_useful_1", "regressor_useful_2"} +@pytest.mark.parametrize("fast_redundancy", ([True, False])) +@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()])) +def test_mrmr_top_k_greater_than_number_of_regressors(relevance_table, ts_with_regressors, fast_redundancy): + """Check that transform selects all regressors if top_k greater than number of regressors.""" + ts = ts_with_regressors + + mrmr = MRMRFeatureSelectionTransform( + relevance_table=relevance_table, + top_k=20, + model=CatBoostRegressor(iterations=1), + fast_redundancy=fast_redundancy, + ) + df_selected = mrmr.fit_transform(ts).to_pandas() + selected_regressors = set() + for column in df_selected.columns.get_level_values("feature"): + if column.startswith("regressor"): + 
selected_regressors.add(column) + assert len(selected_regressors) == len(ts.regressors) + + +@pytest.mark.parametrize("fast_redundancy", ([True, False])) +@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()])) +def test_mrmr_select_top_k_regressors_in_drop_zero_mode(relevance_table, ts_with_regressors, fast_redundancy): + """Check that transform selects top_k regressors in drop_zero mode + if number of regressors with positive relevance less than top_k.""" + ts = ts_with_regressors + + mrmr = MRMRFeatureSelectionTransform( + relevance_table=relevance_table, + top_k=10, + model=CatBoostRegressor(iterations=1), + drop_zero=True, + fast_redundancy=fast_redundancy, + ) + df_selected = mrmr.fit_transform(ts).to_pandas() + selected_regressors = set() + for column in df_selected.columns.get_level_values("feature"): + if column.startswith("regressor"): + selected_regressors.add(column) + assert len(selected_regressors) == 10 + + +@pytest.mark.parametrize("fast_redundancy", ([True, False])) +@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()])) +def test_mrmr_drop_zero_mode_sanity_check(relevance_table, ts_with_regressors, fast_redundancy): + """Check that transform selects right top_k regressors in drop_zero mode.""" + ts = ts_with_regressors + + mrmr = MRMRFeatureSelectionTransform( + relevance_table=relevance_table, + top_k=3, + model=RandomForestRegressor(), + drop_zero=True, + fast_redundancy=fast_redundancy, + ) + + df_selected = mrmr.fit_transform(ts).to_pandas() + selected_regressors = set() + for column in df_selected.columns.get_level_values("feature"): + if column.startswith("regressor"): + selected_regressors.add(column) + + assert set(selected_regressors) == {"regressor_useful_0", "regressor_useful_1", "regressor_useful_2"} + + +@pytest.mark.parametrize("fast_redundancy", ([True, False])) +@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()])) +def 
test_mrmr_drop_zero_mode_top_k_less_than_relevant(relevance_table, ts_with_regressors, fast_redundancy): + """Check that transform selects exact top_k regressors if number of relevant regressors greater than top_k.""" + ts = ts_with_regressors + + mrmr = MRMRFeatureSelectionTransform( + relevance_table=relevance_table, + top_k=2, + model=RandomForestRegressor(), + drop_zero=True, + fast_redundancy=fast_redundancy, + ) + + df_selected = mrmr.fit_transform(ts).to_pandas() + selected_regressors = set() + for column in df_selected.columns.get_level_values("feature"): + if column.startswith("regressor"): + selected_regressors.add(column) + + assert set(selected_regressors) == {"regressor_useful_0", "regressor_useful_1"} + + @pytest.mark.parametrize( "transform", [ diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index df2dc0f01..7b0303112 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -221,14 +221,14 @@ def _test_inverse_transform_train(self, ts, transform, expected_changes): fast_redundancy=True, ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -679,14 +679,14 @@ def test_inverse_transform_train_fail_resample(self, transform, dataset_name, ex fast_redundancy=True, ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, 
return_features=True, fast_redundancy=False ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -1710,14 +1710,14 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True ), "ts_with_exog", - {"create": {"monthday", "positive", "weekday"}}, + {"create": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False ), "ts_with_exog", - {"create": {"monthday", "positive", "weekday"}}, + {"create": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -2651,14 +2651,14 @@ def _test_inverse_transform_future_with_target( fast_redundancy=True, ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -3527,14 +3527,14 @@ def test_inverse_transform_future_without_target_fail_resample( relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False ), "ts_with_exog", - {"create": {"weekday", "monthday", "positive"}}, + {"create": {"positive", 
"weekday", "year"}}, ), ( TreeFeatureSelectionTransform( diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index 981daa7a7..b4015d4d3 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ b/tests/test_transforms/test_inference/test_transform.py @@ -186,14 +186,14 @@ def _test_transform_train(self, ts, transform, expected_changes): relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -600,14 +600,14 @@ def test_transform_train_datetime_timestamp(self, transform, dataset_name, expec relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -1616,14 +1616,14 @@ def _test_transform_train_new_segments(self, ts, transform, train_segments, expe relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False ), "ts_with_exog", - {"remove": {"weekday", 
"monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -1995,14 +1995,14 @@ def _test_transform_future_new_segments(self, ts, transform, train_segments, exp relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -2461,14 +2461,14 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2), @@ -2897,14 +2897,14 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes, relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=True ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( MRMRFeatureSelectionTransform( relevance_table=StatisticsRelevanceTable(), top_k=2, fast_redundancy=False ), "ts_with_exog", - {"remove": {"weekday", "monthday", "positive"}}, + {"remove": {"positive", "weekday", "year"}}, ), ( TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), 
top_k=2),