From b3612dd2a12431344c6efb496eed4bc1e83d01cd Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 27 Feb 2019 22:12:35 -0600 Subject: [PATCH 01/10] Add drop option to OneHotEncoder --- dask_ml/preprocessing/_encoders.py | 11 ++++++++++- tests/preprocessing/test_encoders.py | 29 ++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py index c6e937bca..1e05ac7c0 100644 --- a/dask_ml/preprocessing/_encoders.py +++ b/dask_ml/preprocessing/_encoders.py @@ -107,13 +107,22 @@ def __init__( n_values=None, categorical_features=None, categories="auto", + drop=None, sparse=True, dtype=np.float64, handle_unknown="error", ): super(OneHotEncoder, self).__init__( - n_values, categorical_features, categories, sparse, dtype, handle_unknown + n_values, + categorical_features, + categories, + drop, + sparse, + dtype, + handle_unknown, ) + if drop is not None: + raise NotImplementedError("drop != None is not implemented yet.") def fit(self, X, y=None): if self.handle_unknown == "ignore": diff --git a/tests/preprocessing/test_encoders.py b/tests/preprocessing/test_encoders.py index a33e655e1..8ce6b3f97 100644 --- a/tests/preprocessing/test_encoders.py +++ b/tests/preprocessing/test_encoders.py @@ -34,7 +34,15 @@ def test_basic_array(sparse, method, categories): result = b.fit_transform(dX) assert_estimator_equal( - a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"} + a, + b, + exclude={ + "n_values_", + "feature_indices_", + "active_features_", + "dtypes_", + "drop_idx_", + }, ) assert isinstance(result, da.Array) @@ -71,7 +79,9 @@ def test_basic_array(sparse, method, categories): @pytest.mark.parametrize("dtype", [np.float, np.uint8]) def test_basic_dataframe(sparse, method, dask_data, dtype): a = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype) + print(f"\na = {a}") b = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype) + print(f"b = {b}") if method == "fit": a.fit(df) @@ -83,7 +93,15 @@ def test_basic_dataframe(sparse, method, dask_data, dtype): result = b.fit_transform(dask_data) assert_estimator_equal( - a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"} + a, + b, + exclude={ + "n_values_", + "feature_indices_", + "active_features_", + "dtypes_", + "drop_idx_", + }, ) assert isinstance(result, type(dask_data)) @@ -106,6 +124,13 @@ def test_invalid_handle_input(): enc.fit(dX) +def test_onehotencoder_drop_raises(): + # drop is not currently supported + dask_ml.preprocessing.OneHotEncoder() + with pytest.raises(NotImplementedError): + dask_ml.preprocessing.OneHotEncoder(drop="first") + + def test_handles_numpy(): enc = dask_ml.preprocessing.OneHotEncoder() enc.fit(X) From 930938b7e0ab172322dca527cb0d13b9419cb1d8 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 27 Feb 2019 22:17:26 -0600 Subject: [PATCH 02/10] Update QuantileTransformer internals --- dask_ml/preprocessing/data.py | 47 ++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py index 89bb6b8f0..1363e450f 100644 --- a/dask_ml/preprocessing/data.py +++ b/dask_ml/preprocessing/data.py @@ -268,11 +268,7 @@ def _transform(self, X, inverse=False): return da.vstack(transformed).T def _transform_col(self, X_col, quantiles, inverse): - if self.output_distribution == "normal": - output_distribution = "norm" - else: - output_distribution = self.output_distribution - output_distribution = getattr(stats, output_distribution) + output_distribution = self.output_distribution if not inverse: lower_bound_x = quantiles[0] @@ -284,10 +280,18 @@ def _transform_col(self, X_col, quantiles, inverse): upper_bound_x = 1 lower_bound_y = quantiles[0] upper_bound_y = quantiles[-1] - X_col = X_col.map_blocks(output_distribution.cdf) + # for inverse transform, match a uniform distribution + if output_distribution == "normal": + X_col = X_col.map_blocks(stats.norm.cdf) + # else output distribution is already a uniform distribution + + if output_distribution == "normal": + lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x + upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x + if output_distribution == "uniform": + lower_bounds_idx = X_col == lower_bound_x + upper_bounds_idx = X_col == upper_bound_x - lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x - upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x if not inverse: # See the note in scikit-learn. This trick is to avoid # repeated extreme values @@ -304,12 +308,27 @@ def _transform_col(self, X_col, quantiles, inverse): X_col[lower_bounds_idx] = lower_bound_y if not inverse: - X_col = X_col.map_blocks(output_distribution.ppf) - clip_min = output_distribution.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1)) - clip_max = output_distribution.ppf( - 1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1)) - ) - X_col = da.clip(X_col, clip_min, clip_max) + + if output_distribution == "normal": + X_col = X_col.map_blocks(stats.norm.ppf) + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent + # clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) + # clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - + # np.spacing(1))) + # X_col = np.clip(X_col, clip_min, clip_max) + + clip_min = output_distribution.ppf( + skdata.BOUNDS_THRESHOLD - np.spacing(1) + ) + clip_max = output_distribution.ppf( + 1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1)) + ) + X_col = da.clip(X_col, clip_min, clip_max) + + # else output distribution is uniform and the ppf is the + # identity function so we let X_col unchanged return X_col From 32199a3bf9ec3e5b157af2e55840f571fec07bea Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 27 Feb 2019 22:30:44 -0600 Subject: [PATCH 03/10] Fix commented out code --- dask_ml/preprocessing/data.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py index 1363e450f..ce6465089 100644 --- a/dask_ml/preprocessing/data.py +++ b/dask_ml/preprocessing/data.py @@ -314,17 +314,8 @@ def _transform_col(self, X_col, quantiles, inverse): # find the value to clip the data to avoid mapping to # infinity. Clip such that the inverse transform will be # consistent - # clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) - # clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - - # np.spacing(1))) - # X_col = np.clip(X_col, clip_min, clip_max) - - clip_min = output_distribution.ppf( - skdata.BOUNDS_THRESHOLD - np.spacing(1) - ) - clip_max = output_distribution.ppf( - 1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1)) - ) + clip_min = stats.norm.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1)) + clip_max = stats.norm.ppf(1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1))) X_col = da.clip(X_col, clip_min, clip_max) # else output distribution is uniform and the ppf is the From 0cd9a80090e1c85ceeb00c6f525fc29710ca4fef Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Thu, 28 Feb 2019 10:57:40 -0600 Subject: [PATCH 04/10] Remove print lines in test --- tests/preprocessing/test_encoders.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/preprocessing/test_encoders.py b/tests/preprocessing/test_encoders.py index 8ce6b3f97..64bae26f7 100644 --- a/tests/preprocessing/test_encoders.py +++ b/tests/preprocessing/test_encoders.py @@ -79,9 +79,7 @@ def test_basic_array(sparse, method, categories): @pytest.mark.parametrize("dtype", [np.float, np.uint8]) def test_basic_dataframe(sparse, method, dask_data, dtype): a = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype) - print(f"\na = {a}") b = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype) - print(f"b = {b}") if method == "fit": a.fit(df) From c6fd7d4e1cc478056a27e4cc4529fbe480895752 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Thu, 28 Feb 2019 13:02:48 -0600 Subject: [PATCH 05/10] Add sklearn version check for OneHotEncoder --- dask_ml/preprocessing/_encoders.py | 26 +++++++++++++++++--------- tests/preprocessing/test_encoders.py | 1 - 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py index 1e05ac7c0..6e19fdad8 100644 --- a/dask_ml/preprocessing/_encoders.py +++ b/dask_ml/preprocessing/_encoders.py @@ -1,10 +1,12 @@ import dask import dask.array as da import numpy as np +import packaging.version import pandas as pd import sklearn.preprocessing from .label import _encode, _encode_dask_array +from .._compat import SK_VERSION from ..utils import check_array @@ -51,6 +53,9 @@ class OneHotEncoder(sklearn.preprocessing.OneHotEncoder): The used categories can be found in the ``categories_`` attribute. + drop : None, default=None + The option to drop one of the categories per feature is not yet supported. + sparse : boolean, default=True Will return sparse matrix if set True else will return an array. @@ -112,17 +117,20 @@ def __init__( dtype=np.float64, handle_unknown="error", ): - super(OneHotEncoder, self).__init__( - n_values, - categorical_features, - categories, - drop, - sparse, - dtype, - handle_unknown, - ) if drop is not None: raise NotImplementedError("drop != None is not implemented yet.") + signature = { + "n_values": n_values, + "categorical_features": categorical_features, + "categories": categories, + "drop": drop, + "sparse": sparse, + "dtype": dtype, + "handle_unknown": handle_unknown, + } + if SK_VERSION <= packaging.version.parse("0.20.2"): + del signature["drop"] + super(OneHotEncoder, self).__init__(**signature) def fit(self, X, y=None): if self.handle_unknown == "ignore": diff --git a/tests/preprocessing/test_encoders.py b/tests/preprocessing/test_encoders.py index 64bae26f7..3763f2792 100644 --- a/tests/preprocessing/test_encoders.py +++ b/tests/preprocessing/test_encoders.py @@ -123,7 +123,6 @@ def test_invalid_handle_input(): def test_onehotencoder_drop_raises(): - # drop is not currently supported dask_ml.preprocessing.OneHotEncoder() with pytest.raises(NotImplementedError): dask_ml.preprocessing.OneHotEncoder(drop="first") From 7199a4272971510920d8e947b5a308b0cb38d1b2 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Fri, 1 Mar 2019 14:16:40 -0600 Subject: [PATCH 06/10] Add allowed tolerance for QuantileTransformer test --- tests/preprocessing/test_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py index 07584bdfa..9a8577d43 100644 --- a/tests/preprocessing/test_data.py +++ b/tests/preprocessing/test_data.py @@ -218,7 +218,7 @@ def test_basic(self): # set the quantiles, so that from here out, we're exact a.quantiles_ = b.quantiles_ - assert_eq_ar(a.transform(X), b.transform(X)) + assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7) assert_eq_ar(X, a.inverse_transform(a.transform(X))) @pytest.mark.parametrize( From 655abbcbc3ec0a0eeee3e4becd4b99f532ce7117 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Sat, 2 Mar 2019 20:46:47 -0600 Subject: [PATCH 07/10] Update OneHotEncoder drop sklearn version to 0.21.0 --- dask_ml/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py index 6e19fdad8..4bf6d588f 100644 --- a/dask_ml/preprocessing/_encoders.py +++ b/dask_ml/preprocessing/_encoders.py @@ -128,7 +128,7 @@ def __init__( "dtype": dtype, "handle_unknown": handle_unknown, } - if SK_VERSION <= packaging.version.parse("0.20.2"): + if SK_VERSION < packaging.version.parse("0.21.0"): del signature["drop"] super(OneHotEncoder, self).__init__(**signature) From 9e55d4c77fcd648dd60ddf59af2844413564ae3a Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Sat, 2 Mar 2019 21:01:37 -0600 Subject: [PATCH 08/10] Increase test data size for TestQuantileTransformer --- tests/preprocessing/test_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py index 9a8577d43..0a52462ed 100644 --- a/tests/preprocessing/test_data.py +++ b/tests/preprocessing/test_data.py @@ -211,7 +211,7 @@ def test_basic(self): a = dpp.QuantileTransformer() b = spp.QuantileTransformer() - X = rs.uniform(size=(100, 3), chunks=50) + X = rs.uniform(size=(1000, 3), chunks=50) a.fit(X) b.fit(X) assert_estimator_equal(a, b, atol=0.02) @@ -225,13 +225,13 @@ def test_basic(self): "type_, kwargs", [ (np.array, {}), - (da.from_array, {"chunks": 10}), + (da.from_array, {"chunks": 100}), (pd.DataFrame, {"columns": ["a", "b", "c"]}), (dd.from_array, {"columns": ["a", "b", "c"]}), ], ) def test_types(self, type_, kwargs): - X = np.random.uniform(size=(20, 3)) + X = np.random.uniform(size=(1000, 3)) dX = type_(X, **kwargs) qt = spp.QuantileTransformer() qt.fit(X) From 341baaf4eb5e3f9aeadd93963a4d53286c2738f1 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Sat, 2 Mar 2019 21:35:03 -0600 Subject: [PATCH 09/10] Increase QuantileTransformer test coverage --- tests/preprocessing/test_data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py index 0a52462ed..998d82395 100644 --- a/tests/preprocessing/test_data.py +++ b/tests/preprocessing/test_data.py @@ -221,6 +221,17 @@ def test_basic(self): assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7) assert_eq_ar(X, a.inverse_transform(a.transform(X))) + @pytest.mark.parametrize("output_distribution", ["uniform", "normal"]) + def test_output_distribution(self, output_distribution): + rs = da.random.RandomState(0) + a = dpp.QuantileTransformer(output_distribution=output_distribution) + b = spp.QuantileTransformer(output_distribution=output_distribution) + + X = rs.uniform(size=(1000, 3), chunks=50) + a.fit(X) + b.fit(X) + assert_estimator_equal(a, b, atol=0.02) + @pytest.mark.parametrize( "type_, kwargs", [ From 2eda3abb9b9117a2db622fa364616010632c8e3b Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Sat, 2 Mar 2019 22:03:14 -0600 Subject: [PATCH 10/10] Include transform in test --- tests/preprocessing/test_data.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py index 998d82395..968d0bc0a 100644 --- a/tests/preprocessing/test_data.py +++ b/tests/preprocessing/test_data.py @@ -206,10 +206,11 @@ def test_df_values(self): class TestQuantileTransformer(object): - def test_basic(self): + @pytest.mark.parametrize("output_distribution", ["uniform", "normal"]) + def test_basic(self, output_distribution): rs = da.random.RandomState(0) - a = dpp.QuantileTransformer() - b = spp.QuantileTransformer() + a = dpp.QuantileTransformer(output_distribution=output_distribution) + b = spp.QuantileTransformer(output_distribution=output_distribution) X = rs.uniform(size=(1000, 3), chunks=50) a.fit(X) @@ -221,17 +222,6 @@ def test_basic(self): assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7) assert_eq_ar(X, a.inverse_transform(a.transform(X))) - @pytest.mark.parametrize("output_distribution", ["uniform", "normal"]) - def test_output_distribution(self, output_distribution): - rs = da.random.RandomState(0) - a = dpp.QuantileTransformer(output_distribution=output_distribution) - b = spp.QuantileTransformer(output_distribution=output_distribution) - - X = rs.uniform(size=(1000, 3), chunks=50) - a.fit(X) - b.fit(X) - assert_estimator_equal(a, b, atol=0.02) - @pytest.mark.parametrize( "type_, kwargs", [