Skip to content

Commit

Permalink
FIX validate handle_unknown strategies in OrdinalEncoder (scikit-learn…
Browse files Browse the repository at this point in the history
  • Loading branch information
glemaitre committed Apr 22, 2021
1 parent ff4240e commit aedc5b6
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 25 deletions.
18 changes: 18 additions & 0 deletions doc/whats_new/v0.24.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,24 @@

.. currentmodule:: sklearn

.. _changes_0_24_2:

Version 0.24.2
==============

**TBD 2021**

Changelog
---------

:mod:`sklearn.preprocessing`
............................

- |Fix| Validate the constructor parameter `handle_unknown` in
:class:`preprocessing.OrdinalEncoder` so that only the `'error'` and
`'use_encoded_value'` strategies are accepted; any other value now raises a
`ValueError` in `fit`.
:pr:`19234` by `Guillaume Lemaitre <glemaitre>`.

.. _changes_0_24_1:

Version 0.24.1
Expand Down
7 changes: 7 additions & 0 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,13 @@ def fit(self, X, y=None):
-------
self
"""
handle_unknown_strategies = ("error", "use_encoded_value")
if self.handle_unknown not in handle_unknown_strategies:
raise ValueError(
f"handle_unknown should be either 'error' or "
f"'use_encoded_value', got {self.handle_unknown}."
)

if self.handle_unknown == 'use_encoded_value':
if is_scalar_nan(self.unknown_value):
if np.dtype(self.dtype).kind != 'f':
Expand Down
65 changes: 40 additions & 25 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,33 +624,48 @@ def test_ordinal_encoder_handle_unknowns_numeric(dtype):
assert_array_equal(X_trans_inv, inv_exp)


def test_ordinal_encoder_handle_unknowns_raise():
@pytest.mark.parametrize(
"params, err_type, err_msg",
[
(
{"handle_unknown": "use_encoded_value"},
TypeError,
"unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got None.",
),
(
{"unknown_value": -2},
TypeError,
"unknown_value should only be set when handle_unknown is "
"'use_encoded_value', got -2.",
),
(
{"handle_unknown": "use_encoded_value", "unknown_value": "bla"},
TypeError,
"unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got bla.",
),
(
{"handle_unknown": "use_encoded_value", "unknown_value": 1},
ValueError,
"The used value for unknown_value (1) is one of the values "
"already used for encoding the seen categories.",
),
(
{"handle_unknown": "ignore"},
ValueError,
"handle_unknown should be either 'error' or 'use_encoded_value', "
"got ignore.",
),
],
)
def test_ordinal_encoder_handle_unknowns_raise(params, err_type, err_msg):
# Check error message when validating input parameters
X = np.array([['a', 'x'], ['b', 'y']], dtype=object)

enc = OrdinalEncoder(handle_unknown='use_encoded_value')
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got None.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(unknown_value=-2)
msg = ("unknown_value should only be set when handle_unknown is "
"'use_encoded_value', got -2.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(handle_unknown='use_encoded_value',
unknown_value='bla')
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
"is 'use_encoded_value', got bla.")
with pytest.raises(TypeError, match=msg):
enc.fit(X)

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=1)
msg = ("The used value for unknown_value (1) is one of the values already "
"used for encoding the seen categories.")
with pytest.raises(ValueError, match=msg):
enc.fit(X)
encoder = OrdinalEncoder(**params)
with pytest.raises(err_type, match=err_msg):
encoder.fit(X)


def test_ordinal_encoder_handle_unknowns_nan():
Expand Down

0 comments on commit aedc5b6

Please sign in to comment.