From b3612dd2a12431344c6efb496eed4bc1e83d01cd Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 27 Feb 2019 22:12:35 -0600
Subject: [PATCH 01/10] Add drop option to OneHotEncoder

---
 dask_ml/preprocessing/_encoders.py   | 11 ++++++++++-
 tests/preprocessing/test_encoders.py | 29 ++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py
index c6e937bca..1e05ac7c0 100644
--- a/dask_ml/preprocessing/_encoders.py
+++ b/dask_ml/preprocessing/_encoders.py
@@ -107,13 +107,22 @@ def __init__(
         n_values=None,
         categorical_features=None,
         categories="auto",
+        drop=None,
         sparse=True,
         dtype=np.float64,
         handle_unknown="error",
     ):
         super(OneHotEncoder, self).__init__(
-            n_values, categorical_features, categories, sparse, dtype, handle_unknown
+            n_values,
+            categorical_features,
+            categories,
+            drop,
+            sparse,
+            dtype,
+            handle_unknown,
         )
+        if drop is not None:
+            raise NotImplementedError("drop != None is not implemented yet.")
 
     def fit(self, X, y=None):
         if self.handle_unknown == "ignore":
diff --git a/tests/preprocessing/test_encoders.py b/tests/preprocessing/test_encoders.py
index a33e655e1..8ce6b3f97 100644
--- a/tests/preprocessing/test_encoders.py
+++ b/tests/preprocessing/test_encoders.py
@@ -34,7 +34,15 @@ def test_basic_array(sparse, method, categories):
         result = b.fit_transform(dX)
 
     assert_estimator_equal(
-        a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"}
+        a,
+        b,
+        exclude={
+            "n_values_",
+            "feature_indices_",
+            "active_features_",
+            "dtypes_",
+            "drop_idx_",
+        },
     )
 
     assert isinstance(result, da.Array)
@@ -71,7 +79,9 @@ def test_basic_array(sparse, method, categories):
 @pytest.mark.parametrize("dtype", [np.float, np.uint8])
 def test_basic_dataframe(sparse, method, dask_data, dtype):
     a = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
+    print(f"\na = {a}")
     b = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
+    print(f"b = {b}")
 
     if method == "fit":
         a.fit(df)
@@ -83,7 +93,15 @@ def test_basic_dataframe(sparse, method, dask_data, dtype):
         result = b.fit_transform(dask_data)
 
     assert_estimator_equal(
-        a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"}
+        a,
+        b,
+        exclude={
+            "n_values_",
+            "feature_indices_",
+            "active_features_",
+            "dtypes_",
+            "drop_idx_",
+        },
     )
 
     assert isinstance(result, type(dask_data))
@@ -106,6 +124,13 @@ def test_invalid_handle_input():
         enc.fit(dX)
 
 
+def test_onehotencoder_drop_raises():
+    # drop is not currently supported
+    dask_ml.preprocessing.OneHotEncoder()
+    with pytest.raises(NotImplementedError):
+        dask_ml.preprocessing.OneHotEncoder(drop="first")
+
+
 def test_handles_numpy():
     enc = dask_ml.preprocessing.OneHotEncoder()
     enc.fit(X)

From 930938b7e0ab172322dca527cb0d13b9419cb1d8 Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 27 Feb 2019 22:17:26 -0600
Subject: [PATCH 02/10] Update QuantileTransformer internals

---
 dask_ml/preprocessing/data.py | 47 ++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py
index 89bb6b8f0..1363e450f 100644
--- a/dask_ml/preprocessing/data.py
+++ b/dask_ml/preprocessing/data.py
@@ -268,11 +268,7 @@ def _transform(self, X, inverse=False):
         return da.vstack(transformed).T
 
     def _transform_col(self, X_col, quantiles, inverse):
-        if self.output_distribution == "normal":
-            output_distribution = "norm"
-        else:
-            output_distribution = self.output_distribution
-        output_distribution = getattr(stats, output_distribution)
+        output_distribution = self.output_distribution
 
         if not inverse:
             lower_bound_x = quantiles[0]
@@ -284,10 +280,18 @@ def _transform_col(self, X_col, quantiles, inverse):
             upper_bound_x = 1
             lower_bound_y = quantiles[0]
             upper_bound_y = quantiles[-1]
-            X_col = X_col.map_blocks(output_distribution.cdf)
+            # for the inverse transform, first map onto a uniform distribution
+            if output_distribution == "normal":
+                X_col = X_col.map_blocks(stats.norm.cdf)
+            # else output distribution is already a uniform distribution
+
+        if output_distribution == "normal":
+            lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x
+            upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x
+        if output_distribution == "uniform":
+            lower_bounds_idx = X_col == lower_bound_x
+            upper_bounds_idx = X_col == upper_bound_x
 
-        lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x
-        upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x
         if not inverse:
             # See the note in scikit-learn. This trick is to avoid
             # repeated extreme values
@@ -304,12 +308,27 @@ def _transform_col(self, X_col, quantiles, inverse):
         X_col[lower_bounds_idx] = lower_bound_y
 
         if not inverse:
-            X_col = X_col.map_blocks(output_distribution.ppf)
-            clip_min = output_distribution.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1))
-            clip_max = output_distribution.ppf(
-                1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1))
-            )
-            X_col = da.clip(X_col, clip_min, clip_max)
+
+            if output_distribution == "normal":
+                X_col = X_col.map_blocks(stats.norm.ppf)
+                # find the value to clip the data to avoid mapping to
+                # infinity. Clip such that the inverse transform will be
+                # consistent
+                # clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
+                # clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD -
+                #                                 np.spacing(1)))
+                # X_col = np.clip(X_col, clip_min, clip_max)
+
+                clip_min = output_distribution.ppf(
+                    skdata.BOUNDS_THRESHOLD - np.spacing(1)
+                )
+                clip_max = output_distribution.ppf(
+                    1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1))
+                )
+                X_col = da.clip(X_col, clip_min, clip_max)
+
+            # else output distribution is uniform and the ppf is the
+            # identity function, so we leave X_col unchanged
 
         return X_col
 

From 32199a3bf9ec3e5b157af2e55840f571fec07bea Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Wed, 27 Feb 2019 22:30:44 -0600
Subject: [PATCH 03/10] Fix commented out code

---
 dask_ml/preprocessing/data.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py
index 1363e450f..ce6465089 100644
--- a/dask_ml/preprocessing/data.py
+++ b/dask_ml/preprocessing/data.py
@@ -314,17 +314,8 @@ def _transform_col(self, X_col, quantiles, inverse):
                 # find the value to clip the data to avoid mapping to
                 # infinity. Clip such that the inverse transform will be
                 # consistent
-                # clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
-                # clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD -
-                #                                 np.spacing(1)))
-                # X_col = np.clip(X_col, clip_min, clip_max)
-
-                clip_min = output_distribution.ppf(
-                    skdata.BOUNDS_THRESHOLD - np.spacing(1)
-                )
-                clip_max = output_distribution.ppf(
-                    1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1))
-                )
+                clip_min = stats.norm.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1))
+                clip_max = stats.norm.ppf(1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1)))
                 X_col = da.clip(X_col, clip_min, clip_max)
 
             # else output distribution is uniform and the ppf is the

From 0cd9a80090e1c85ceeb00c6f525fc29710ca4fef Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Thu, 28 Feb 2019 10:57:40 -0600
Subject: [PATCH 04/10] Remove print lines in test

---
 tests/preprocessing/test_encoders.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/preprocessing/test_encoders.py b/tests/preprocessing/test_encoders.py
index 8ce6b3f97..64bae26f7 100644
--- a/tests/preprocessing/test_encoders.py
+++ b/tests/preprocessing/test_encoders.py
@@ -79,9 +79,7 @@ def test_basic_array(sparse, method, categories):
 @pytest.mark.parametrize("dtype", [np.float, np.uint8])
 def test_basic_dataframe(sparse, method, dask_data, dtype):
     a = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
-    print(f"\na = {a}")
     b = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
-    print(f"b = {b}")
 
     if method == "fit":
         a.fit(df)

From c6fd7d4e1cc478056a27e4cc4529fbe480895752 Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Thu, 28 Feb 2019 13:02:48 -0600
Subject: [PATCH 05/10] Add sklearn version check for OneHotEncoder

---
 dask_ml/preprocessing/_encoders.py   | 26 +++++++++++++++++---------
 tests/preprocessing/test_encoders.py |  1 -
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py
index 1e05ac7c0..6e19fdad8 100644
--- a/dask_ml/preprocessing/_encoders.py
+++ b/dask_ml/preprocessing/_encoders.py
@@ -1,10 +1,12 @@
 import dask
 import dask.array as da
 import numpy as np
+import packaging.version
 import pandas as pd
 import sklearn.preprocessing
 
 from .label import _encode, _encode_dask_array
+from .._compat import SK_VERSION
 from ..utils import check_array
 
 
@@ -51,6 +53,9 @@ class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):
 
         The used categories can be found in the ``categories_`` attribute.
 
+    drop : None, default=None
+        The option to drop one of the categories per feature is not yet supported.
+
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.
 
@@ -112,17 +117,20 @@ def __init__(
         dtype=np.float64,
         handle_unknown="error",
     ):
-        super(OneHotEncoder, self).__init__(
-            n_values,
-            categorical_features,
-            categories,
-            drop,
-            sparse,
-            dtype,
-            handle_unknown,
-        )
         if drop is not None:
             raise NotImplementedError("drop != None is not implemented yet.")
+        signature = {
+            "n_values": n_values,
+            "categorical_features": categorical_features,
+            "categories": categories,
+            "drop": drop,
+            "sparse": sparse,
+            "dtype": dtype,
+            "handle_unknown": handle_unknown,
+        }
+        if SK_VERSION <= packaging.version.parse("0.20.2"):
+            del signature["drop"]
+        super(OneHotEncoder, self).__init__(**signature)
 
     def fit(self, X, y=None):
         if self.handle_unknown == "ignore":
diff --git a/tests/preprocessing/test_encoders.py b/tests/preprocessing/test_encoders.py
index 64bae26f7..3763f2792 100644
--- a/tests/preprocessing/test_encoders.py
+++ b/tests/preprocessing/test_encoders.py
@@ -123,7 +123,6 @@ def test_invalid_handle_input():
 
 
 def test_onehotencoder_drop_raises():
-    # drop is not currently supported
     dask_ml.preprocessing.OneHotEncoder()
     with pytest.raises(NotImplementedError):
         dask_ml.preprocessing.OneHotEncoder(drop="first")

From 7199a4272971510920d8e947b5a308b0cb38d1b2 Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Fri, 1 Mar 2019 14:16:40 -0600
Subject: [PATCH 06/10] Add allowed tolerance for QuantileTransformer test

---
 tests/preprocessing/test_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index 07584bdfa..9a8577d43 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -218,7 +218,7 @@ def test_basic(self):
 
         # set the quantiles, so that from here out, we're exact
         a.quantiles_ = b.quantiles_
-        assert_eq_ar(a.transform(X), b.transform(X))
+        assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7)
         assert_eq_ar(X, a.inverse_transform(a.transform(X)))
 
     @pytest.mark.parametrize(

From 655abbcbc3ec0a0eeee3e4becd4b99f532ce7117 Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Sat, 2 Mar 2019 20:46:47 -0600
Subject: [PATCH 07/10] Update OneHotEncoder drop sklearn version to 0.21.0

---
 dask_ml/preprocessing/_encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_ml/preprocessing/_encoders.py b/dask_ml/preprocessing/_encoders.py
index 6e19fdad8..4bf6d588f 100644
--- a/dask_ml/preprocessing/_encoders.py
+++ b/dask_ml/preprocessing/_encoders.py
@@ -128,7 +128,7 @@ def __init__(
             "dtype": dtype,
             "handle_unknown": handle_unknown,
         }
-        if SK_VERSION <= packaging.version.parse("0.20.2"):
+        if SK_VERSION < packaging.version.parse("0.21.0"):
             del signature["drop"]
         super(OneHotEncoder, self).__init__(**signature)
 

From 9e55d4c77fcd648dd60ddf59af2844413564ae3a Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Sat, 2 Mar 2019 21:01:37 -0600
Subject: [PATCH 08/10] Increase test data size for TestQuantileTransformer

---
 tests/preprocessing/test_data.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index 9a8577d43..0a52462ed 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -211,7 +211,7 @@ def test_basic(self):
         a = dpp.QuantileTransformer()
         b = spp.QuantileTransformer()
 
-        X = rs.uniform(size=(100, 3), chunks=50)
+        X = rs.uniform(size=(1000, 3), chunks=50)
         a.fit(X)
         b.fit(X)
         assert_estimator_equal(a, b, atol=0.02)
@@ -225,13 +225,13 @@ def test_basic(self):
         "type_, kwargs",
         [
             (np.array, {}),
-            (da.from_array, {"chunks": 10}),
+            (da.from_array, {"chunks": 100}),
             (pd.DataFrame, {"columns": ["a", "b", "c"]}),
             (dd.from_array, {"columns": ["a", "b", "c"]}),
         ],
     )
     def test_types(self, type_, kwargs):
-        X = np.random.uniform(size=(20, 3))
+        X = np.random.uniform(size=(1000, 3))
         dX = type_(X, **kwargs)
         qt = spp.QuantileTransformer()
         qt.fit(X)

From 341baaf4eb5e3f9aeadd93963a4d53286c2738f1 Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Sat, 2 Mar 2019 21:35:03 -0600
Subject: [PATCH 09/10] Increase QuantileTransformer test coverage

---
 tests/preprocessing/test_data.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index 0a52462ed..998d82395 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -221,6 +221,17 @@ def test_basic(self):
         assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7)
         assert_eq_ar(X, a.inverse_transform(a.transform(X)))
 
+    @pytest.mark.parametrize("output_distribution", ["uniform", "normal"])
+    def test_output_distribution(self, output_distribution):
+        rs = da.random.RandomState(0)
+        a = dpp.QuantileTransformer(output_distribution=output_distribution)
+        b = spp.QuantileTransformer(output_distribution=output_distribution)
+
+        X = rs.uniform(size=(1000, 3), chunks=50)
+        a.fit(X)
+        b.fit(X)
+        assert_estimator_equal(a, b, atol=0.02)
+
     @pytest.mark.parametrize(
         "type_, kwargs",
         [

From 2eda3abb9b9117a2db622fa364616010632c8e3b Mon Sep 17 00:00:00 2001
From: James Bourbeau <jrbourbeau@gmail.com>
Date: Sat, 2 Mar 2019 22:03:14 -0600
Subject: [PATCH 10/10] Include transform in test

---
 tests/preprocessing/test_data.py | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index 998d82395..968d0bc0a 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -206,10 +206,11 @@ def test_df_values(self):
 
 
 class TestQuantileTransformer(object):
-    def test_basic(self):
+    @pytest.mark.parametrize("output_distribution", ["uniform", "normal"])
+    def test_basic(self, output_distribution):
         rs = da.random.RandomState(0)
-        a = dpp.QuantileTransformer()
-        b = spp.QuantileTransformer()
+        a = dpp.QuantileTransformer(output_distribution=output_distribution)
+        b = spp.QuantileTransformer(output_distribution=output_distribution)
 
         X = rs.uniform(size=(1000, 3), chunks=50)
         a.fit(X)
@@ -221,17 +222,6 @@ def test_basic(self):
         assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7)
         assert_eq_ar(X, a.inverse_transform(a.transform(X)))
 
-    @pytest.mark.parametrize("output_distribution", ["uniform", "normal"])
-    def test_output_distribution(self, output_distribution):
-        rs = da.random.RandomState(0)
-        a = dpp.QuantileTransformer(output_distribution=output_distribution)
-        b = spp.QuantileTransformer(output_distribution=output_distribution)
-
-        X = rs.uniform(size=(1000, 3), chunks=50)
-        a.fit(X)
-        b.fit(X)
-        assert_estimator_equal(a, b, atol=0.02)
-
     @pytest.mark.parametrize(
         "type_, kwargs",
         [