
Fix sklearn dev tests #474

Merged
merged 10 commits on Mar 4, 2019
23 changes: 20 additions & 3 deletions dask_ml/preprocessing/_encoders.py
@@ -1,10 +1,12 @@
 import dask
 import dask.array as da
 import numpy as np
+import packaging.version
 import pandas as pd
 import sklearn.preprocessing

 from .label import _encode, _encode_dask_array
+from .._compat import SK_VERSION
 from ..utils import check_array


@@ -51,6 +53,9 @@ class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):

     The used categories can be found in the ``categories_`` attribute.

+    drop : None, default=None
+        The option to drop one of the categories per feature is not yet supported.
+
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.

@@ -107,13 +112,25 @@ def __init__(
         n_values=None,
         categorical_features=None,
         categories="auto",
+        drop=None,
         sparse=True,
         dtype=np.float64,
         handle_unknown="error",
     ):
-        super(OneHotEncoder, self).__init__(
-            n_values, categorical_features, categories, sparse, dtype, handle_unknown
-        )
+        if drop is not None:
+            raise NotImplementedError("drop != None is not implemented yet.")
+        signature = {
+            "n_values": n_values,
+            "categorical_features": categorical_features,
+            "categories": categories,
+            "drop": drop,
+            "sparse": sparse,
+            "dtype": dtype,
+            "handle_unknown": handle_unknown,
+        }
+        if SK_VERSION < packaging.version.parse("0.21.0"):
+            del signature["drop"]
+        super(OneHotEncoder, self).__init__(**signature)

     def fit(self, X, y=None):
         if self.handle_unknown == "ignore":
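The version gate above is the heart of this fix: scikit-learn only added the `drop` keyword in 0.21.0, so forwarding it unconditionally to `super().__init__` would raise a `TypeError` on older releases. A minimal sketch of the same pattern, assuming only `packaging` and `sklearn` are installed (the `build_kwargs` helper is illustrative, not dask-ml API):

```python
import packaging.version
import sklearn

SK_VERSION = packaging.version.parse(sklearn.__version__)


def build_kwargs(**kwargs):
    # scikit-learn grew the ``drop`` keyword in 0.21.0; strip it on older
    # releases so the parent ``__init__`` never sees an unknown keyword.
    if SK_VERSION < packaging.version.parse("0.21.0"):
        kwargs.pop("drop", None)
    return kwargs
```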
38 changes: 24 additions & 14 deletions dask_ml/preprocessing/data.py
@@ -268,11 +268,7 @@ def _transform(self, X, inverse=False):
         return da.vstack(transformed).T

     def _transform_col(self, X_col, quantiles, inverse):
-        if self.output_distribution == "normal":
-            output_distribution = "norm"
-        else:
-            output_distribution = self.output_distribution
-        output_distribution = getattr(stats, output_distribution)
+        output_distribution = self.output_distribution

         if not inverse:
             lower_bound_x = quantiles[0]
@@ -284,10 +280,18 @@ def _transform_col(self, X_col, quantiles, inverse):
             upper_bound_x = 1
             lower_bound_y = quantiles[0]
             upper_bound_y = quantiles[-1]
-            X_col = X_col.map_blocks(output_distribution.cdf)
+            # for inverse transform, match a uniform distribution
+            if output_distribution == "normal":
+                X_col = X_col.map_blocks(stats.norm.cdf)
+            # else output distribution is already a uniform distribution

-        lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x
-        upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x
+        if output_distribution == "normal":
+            lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x
+            upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x
+        if output_distribution == "uniform":
+            lower_bounds_idx = X_col == lower_bound_x
+            upper_bounds_idx = X_col == upper_bound_x
+
         if not inverse:
             # See the note in scikit-learn. This trick is to avoid
             # repeated extreme values
@@ -304,12 +308,18 @@ def _transform_col(self, X_col, quantiles, inverse):
             X_col[lower_bounds_idx] = lower_bound_y

         if not inverse:
-            X_col = X_col.map_blocks(output_distribution.ppf)
-            clip_min = output_distribution.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1))
-            clip_max = output_distribution.ppf(
-                1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1))
-            )
-            X_col = da.clip(X_col, clip_min, clip_max)
+            if output_distribution == "normal":
+                X_col = X_col.map_blocks(stats.norm.ppf)
+                # find the value to clip the data to avoid mapping to
+                # infinity. Clip such that the inverse transform will be
+                # consistent
+                clip_min = stats.norm.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1))
+                clip_max = stats.norm.ppf(1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1)))
+                X_col = da.clip(X_col, clip_min, clip_max)
+
+            # else output distribution is uniform and the ppf is the
+            # identity function, so we leave X_col unchanged

         return X_col

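For intuition on the `normal` branch: `stats.norm.ppf` maps 0 to `-inf` and 1 to `+inf`, so the boundary quantile values must be clipped to stay finite and keep the inverse transform consistent. A small NumPy/SciPy sketch, assuming `BOUNDS_THRESHOLD = 1e-7` (scikit-learn's value at the time; hard-coded here rather than imported):

```python
import numpy as np
from scipy import stats

BOUNDS_THRESHOLD = 1e-7  # assumption: matches sklearn.preprocessing.data

# Uniform quantile values, including the exact boundaries 0 and 1.
u = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

# Clip bounds chosen just inside (0, 1) so the inverse transform
# round-trips consistently instead of producing infinities.
clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1)))

out = np.clip(stats.norm.ppf(u), clip_min, clip_max)
assert np.isfinite(out).all()  # no +/- inf despite the boundary inputs
```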
15 changes: 8 additions & 7 deletions tests/preprocessing/test_data.py
@@ -206,32 +206,33 @@ def test_df_values(self):


 class TestQuantileTransformer(object):
-    def test_basic(self):
+    @pytest.mark.parametrize("output_distribution", ["uniform", "normal"])
+    def test_basic(self, output_distribution):
         rs = da.random.RandomState(0)
-        a = dpp.QuantileTransformer()
-        b = spp.QuantileTransformer()
+        a = dpp.QuantileTransformer(output_distribution=output_distribution)
+        b = spp.QuantileTransformer(output_distribution=output_distribution)

-        X = rs.uniform(size=(100, 3), chunks=50)
+        X = rs.uniform(size=(1000, 3), chunks=50)
         a.fit(X)
         b.fit(X)
         assert_estimator_equal(a, b, atol=0.02)

         # set the quantiles, so that from here out, we're exact
         a.quantiles_ = b.quantiles_
-        assert_eq_ar(a.transform(X), b.transform(X))
+        assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7)
         assert_eq_ar(X, a.inverse_transform(a.transform(X)))

     @pytest.mark.parametrize(
         "type_, kwargs",
         [
             (np.array, {}),
-            (da.from_array, {"chunks": 10}),
+            (da.from_array, {"chunks": 100}),
             (pd.DataFrame, {"columns": ["a", "b", "c"]}),
             (dd.from_array, {"columns": ["a", "b", "c"]}),
         ],
     )
     def test_types(self, type_, kwargs):
-        X = np.random.uniform(size=(20, 3))
+        X = np.random.uniform(size=(1000, 3))
         dX = type_(X, **kwargs)
         qt = spp.QuantileTransformer()
         qt.fit(X)
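What the parametrized `test_basic` asserts, as a standalone sketch (`dpp` and `spp` mirror the test module's aliases for `dask_ml.preprocessing` and `sklearn.preprocessing`; the tolerance mirrors the test's `atol=1e-7`):

```python
import numpy as np
import dask.array as da
import dask_ml.preprocessing as dpp
import sklearn.preprocessing as spp

X = da.random.RandomState(0).uniform(size=(1000, 3), chunks=50)

a = dpp.QuantileTransformer(output_distribution="normal")
b = spp.QuantileTransformer(output_distribution="normal")
a.fit(X)             # dask-ml: approximate quantiles, computed blockwise
b.fit(X.compute())   # scikit-learn: exact quantiles on the realized array

# Once the quantiles are shared, the two transforms agree almost exactly.
a.quantiles_ = b.quantiles_
assert np.allclose(a.transform(X).compute(), b.transform(X.compute()), atol=1e-7)
```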
26 changes: 24 additions & 2 deletions tests/preprocessing/test_encoders.py
@@ -34,7 +34,15 @@ def test_basic_array(sparse, method, categories):
     result = b.fit_transform(dX)

     assert_estimator_equal(
-        a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"}
+        a,
+        b,
+        exclude={
+            "n_values_",
+            "feature_indices_",
+            "active_features_",
+            "dtypes_",
+            "drop_idx_",
+        },
     )

     assert isinstance(result, da.Array)
@@ -83,7 +91,15 @@ def test_basic_dataframe(sparse, method, dask_data, dtype):
     result = b.fit_transform(dask_data)

     assert_estimator_equal(
-        a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"}
+        a,
+        b,
+        exclude={
+            "n_values_",
+            "feature_indices_",
+            "active_features_",
+            "dtypes_",
+            "drop_idx_",
+        },
     )

     assert isinstance(result, type(dask_data))
@@ -106,6 +122,12 @@ def test_invalid_handle_input():
     enc.fit(dX)


+def test_onehotencoder_drop_raises():
+    dask_ml.preprocessing.OneHotEncoder()
+    with pytest.raises(NotImplementedError):
+        dask_ml.preprocessing.OneHotEncoder(drop="first")
+
+
 def test_handles_numpy():
     enc = dask_ml.preprocessing.OneHotEncoder()
     enc.fit(X)
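From a user's perspective, the guard pinned down by `test_onehotencoder_drop_raises` behaves like this (a usage sketch, not an additional test in the PR):

```python
import pytest
import dask_ml.preprocessing

# The default drop=None is accepted.
dask_ml.preprocessing.OneHotEncoder()

# Any other value fails fast with NotImplementedError, before any fitting.
with pytest.raises(NotImplementedError, match="drop"):
    dask_ml.preprocessing.OneHotEncoder(drop="first")
```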