Skip to content

Commit

Permalink
Fix MRMRFeatureSelectionTransform: change less_is_better logic, a…
Browse files Browse the repository at this point in the history
…dd `drop_zero` mode (#314)

* Fix MRMRFeatureSelectionTransform, change score formula and add drop_zero mode

* change tests

* change test_for_linux

* fix comments

* fix codestyle

* fix first iteration of mrmr

* final change in tests

* fix comments

* fix_comments

* new_test

* fix_comments

* fix docstring, catboost test and changelog

* correct_catboost

* fix changelog

* fix changelog
  • Loading branch information
yellowssnake committed May 22, 2024
1 parent fc628d3 commit b011ec8
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 32 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `EmbeddingWindowTransform` ([#265](https://github.com/etna-team/etna/pull/265))
- Add `TSTCCEmbeddingModel` ([#294](https://github.com/etna-team/etna/pull/294))
- Add `210-embedding_models` example notebook ([#304](https://github.com/etna-team/etna/pull/304))
-
- Add parameter `drop_zero` into `MRMRFeatureSelectionTransform` ([#308](https://github.com/etna-team/etna/issues/308))
-
-
-
Expand Down Expand Up @@ -44,7 +44,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Fix FordA download url in classification notebook ([#309](https://github.com/etna-team/etna/pull/309))
- Allow `seaborn` dependency to have higher version ([#319](https://github.com/etna-team/etna/pull/319))
-
- Fix `MRMRFeatureSelectionTransform` to correctly handle less-is-better `relevance_table` ([#308](https://github.com/etna-team/etna/issues/308))
-
-
-
Expand Down
28 changes: 21 additions & 7 deletions etna/analysis/feature_selection/mrmr_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def mrmr(
regressors: pd.DataFrame,
top_k: int,
fast_redundancy: bool = False,
drop_zero: bool = False,
relevance_aggregation_mode: str = AggregationMode.mean,
redundancy_aggregation_mode: str = AggregationMode.mean,
atol: float = 1e-10,
Expand All @@ -59,6 +60,10 @@ def mrmr(
fast_redundancy:
* True: compute redundancy only inside the segments, time complexity :math:`O(top\_k * n\_segments * n\_features * history\_len)`
* False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k * n\_segments^2 * n\_features * history\_len)`
drop_zero:
* True: use only features with relevance > 0 in calculations, if their number is less than ``top_k``,
randomly selects features with zero relevance so that the total number of selected features is ``top_k``
* False: use all features in calculations
relevance_aggregation_mode:
the method for relevance values per-segment aggregation
redundancy_aggregation_mode:
Expand Down Expand Up @@ -88,16 +93,26 @@ def mrmr(
relevance = relevance_table.apply(relevance_aggregation_fn).fillna(0)

all_features = relevance.index.to_list()

if top_k >= len(all_features):
return all_features.copy()

segments = set(regressors.columns.get_level_values("segment"))
selected_features: List[str] = []
not_selected_features = all_features.copy()

redundancy_table = pd.DataFrame(np.inf, index=all_features, columns=all_features)
top_k = min(top_k, len(all_features))
if drop_zero:
not_relevant_features = list(filter(lambda feature: not relevance.loc[feature] == 0, not_selected_features))
relevant_features = list(set(all_features) - set(not_relevant_features))
if top_k >= len(relevant_features):
return relevant_features + not_relevant_features[: (top_k - len(relevant_features))]
not_selected_features = relevant_features

redundancy_table = pd.DataFrame(1, index=all_features, columns=all_features)

for i in range(top_k):
score_numerator = relevance.loc[not_selected_features]
score_denominator = pd.Series(1, index=not_selected_features)
score_denominator = pd.Series(0, index=not_selected_features)
if i > 0:
last_selected_feature = selected_features[-1]
last_selected_regressor = regressors.loc[pd.IndexSlice[:], pd.IndexSlice[:, last_selected_feature]]
Expand All @@ -124,14 +139,13 @@ def mrmr(
redundancy_table.loc[not_selected_features, last_selected_feature] = (
segment_redundancy.agg(redundancy_aggregation_fn)
.clip(atol)
.fillna(np.inf)
.fillna(1)
.loc[not_selected_features]
.values.squeeze()
)

score_denominator = redundancy_table.loc[not_selected_features, selected_features].mean(axis=1)
score_denominator[np.isclose(score_denominator, 1, atol=atol)] = np.inf
score = score_numerator / score_denominator
score_denominator = redundancy_table.loc[not_selected_features, selected_features].max(axis=1)
score = score_numerator * (1 - score_denominator)
best_feature = score.index[score.argmax()]
selected_features.append(best_feature)
not_selected_features.remove(best_feature)
Expand Down
12 changes: 11 additions & 1 deletion etna/transforms/feature_selection/feature_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ def __init__(
top_k: int,
features_to_use: Union[List[str], Literal["all"]] = "all",
fast_redundancy: bool = False,
drop_zero: bool = False,
relevance_aggregation_mode: str = AggregationMode.mean,
redundancy_aggregation_mode: str = AggregationMode.mean,
atol: float = 1e-10,
Expand All @@ -190,6 +191,10 @@ def __init__(
features_to_use:
columns of the dataset to select from
if "all" value is given, all columns are used
drop_zero:
* True: use only features with relevance > 0 in calculations, if their number is less than ``top_k``,
randomly selects features with zero relevance so that the total number of selected features is ``top_k``
* False: use all features in calculations
fast_redundancy:
* True: compute redundancy only inside the segments, time complexity :math:`O(top\_k \\cdot n\_segments \\cdot n\_features \\cdot history\_len)`
* False: compute redundancy for all the pairs of segments, time complexity :math:`O(top\_k \\cdot n\_segments^2 \\cdot n\_features \\cdot history\_len)`
Expand All @@ -209,6 +214,7 @@ def __init__(
self.relevance_table = relevance_table
self.top_k = top_k
self.fast_redundancy = fast_redundancy
self.drop_zero = drop_zero
self.relevance_aggregation_mode = relevance_aggregation_mode
self.redundancy_aggregation_mode = redundancy_aggregation_mode
self.atol = atol
Expand Down Expand Up @@ -236,12 +242,16 @@ def _fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":
relevance_table = self.relevance_table(df_target, df_features, **self.relevance_params)

if not self.relevance_table.greater_is_better:
relevance_table *= -1
min_relevance = relevance_table.values.min()
max_relevance = relevance_table.values.max()
relevance_table = max_relevance + min_relevance - relevance_table

self.selected_features = mrmr(
relevance_table=relevance_table,
regressors=df_features,
top_k=self.top_k,
fast_redundancy=self.fast_redundancy,
drop_zero=self.drop_zero,
relevance_aggregation_mode=self.relevance_aggregation_mode,
redundancy_aggregation_mode=self.redundancy_aggregation_mode,
atol=self.atol,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,94 @@ def test_mrmr_right_regressors(relevance_table, ts_with_regressors, fast_redunda
assert set(selected_regressors) == {"regressor_useful_0", "regressor_useful_1", "regressor_useful_2"}


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()]))
def test_mrmr_top_k_greater_than_number_of_regressors(relevance_table, ts_with_regressors, fast_redundancy):
    """Check that every regressor is kept when ``top_k`` exceeds the number of regressors."""
    transform = MRMRFeatureSelectionTransform(
        relevance_table=relevance_table,
        top_k=20,
        model=CatBoostRegressor(iterations=1),
        fast_redundancy=fast_redundancy,
    )
    transformed = transform.fit_transform(ts_with_regressors).to_pandas()

    # Collect the distinct regressor columns that survived the selection.
    feature_names = transformed.columns.get_level_values("feature")
    kept_regressors = {name for name in feature_names if name.startswith("regressor")}

    assert len(kept_regressors) == len(ts_with_regressors.regressors)


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()]))
def test_mrmr_select_top_k_regressors_in_drop_zero_mode(relevance_table, ts_with_regressors, fast_redundancy):
    """Check that exactly ``top_k`` regressors are selected in drop_zero mode.

    Covers the case where fewer than ``top_k`` regressors have positive relevance.
    """
    transform = MRMRFeatureSelectionTransform(
        relevance_table=relevance_table,
        top_k=10,
        model=CatBoostRegressor(iterations=1),
        drop_zero=True,
        fast_redundancy=fast_redundancy,
    )
    transformed = transform.fit_transform(ts_with_regressors).to_pandas()

    # Collect the distinct regressor columns that survived the selection.
    feature_names = transformed.columns.get_level_values("feature")
    kept_regressors = {name for name in feature_names if name.startswith("regressor")}

    assert len(kept_regressors) == 10


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()]))
def test_mrmr_drop_zero_mode_sanity_check(relevance_table, ts_with_regressors, fast_redundancy):
    """Check that drop_zero mode with ``top_k=3`` selects the three useful regressors."""
    transform = MRMRFeatureSelectionTransform(
        relevance_table=relevance_table,
        top_k=3,
        model=RandomForestRegressor(),
        drop_zero=True,
        fast_redundancy=fast_redundancy,
    )
    transformed = transform.fit_transform(ts_with_regressors).to_pandas()

    # Collect the distinct regressor columns that survived the selection.
    feature_names = transformed.columns.get_level_values("feature")
    kept_regressors = {name for name in feature_names if name.startswith("regressor")}

    expected_regressors = {"regressor_useful_0", "regressor_useful_1", "regressor_useful_2"}
    assert set(kept_regressors) == expected_regressors


@pytest.mark.parametrize("fast_redundancy", ([True, False]))
@pytest.mark.parametrize("relevance_table", ([ModelRelevanceTable()]))
def test_mrmr_drop_zero_mode_top_k_less_than_relevant(relevance_table, ts_with_regressors, fast_redundancy):
    """Check the drop_zero selection when ``top_k`` is smaller than the number of relevant regressors."""
    transform = MRMRFeatureSelectionTransform(
        relevance_table=relevance_table,
        top_k=2,
        model=RandomForestRegressor(),
        drop_zero=True,
        fast_redundancy=fast_redundancy,
    )
    transformed = transform.fit_transform(ts_with_regressors).to_pandas()

    # Collect the distinct regressor columns that survived the selection.
    feature_names = transformed.columns.get_level_values("feature")
    kept_regressors = {name for name in feature_names if name.startswith("regressor")}

    expected_regressors = {"regressor_useful_0", "regressor_useful_1"}
    assert set(kept_regressors) == expected_regressors


@pytest.mark.parametrize(
"transform",
[
Expand Down
20 changes: 10 additions & 10 deletions tests/test_transforms/test_inference/test_inverse_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,14 @@ def _test_inverse_transform_train(self, ts, transform, expected_changes):
fast_redundancy=True,
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
MRMRFeatureSelectionTransform(
relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2),
Expand Down Expand Up @@ -679,14 +679,14 @@ def test_inverse_transform_train_fail_resample(self, transform, dataset_name, ex
fast_redundancy=True,
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
MRMRFeatureSelectionTransform(
relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2),
Expand Down Expand Up @@ -1710,14 +1710,14 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen
relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True
),
"ts_with_exog",
{"create": {"monthday", "positive", "weekday"}},
{"create": {"positive", "weekday", "year"}},
),
(
MRMRFeatureSelectionTransform(
relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False
),
"ts_with_exog",
{"create": {"monthday", "positive", "weekday"}},
{"create": {"positive", "weekday", "year"}},
),
(
TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2),
Expand Down Expand Up @@ -2651,14 +2651,14 @@ def _test_inverse_transform_future_with_target(
fast_redundancy=True,
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
MRMRFeatureSelectionTransform(
relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
TreeFeatureSelectionTransform(model=DecisionTreeRegressor(random_state=42), top_k=2),
Expand Down Expand Up @@ -3527,14 +3527,14 @@ def test_inverse_transform_future_without_target_fail_resample(
relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=True
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
MRMRFeatureSelectionTransform(
relevance_table=StatisticsRelevanceTable(), top_k=2, return_features=True, fast_redundancy=False
),
"ts_with_exog",
{"create": {"weekday", "monthday", "positive"}},
{"create": {"positive", "weekday", "year"}},
),
(
TreeFeatureSelectionTransform(
Expand Down

0 comments on commit b011ec8

Please sign in to comment.