
Fix sklearn dev tests #474

Merged
merged 10 commits on Mar 4, 2019
23 changes: 20 additions & 3 deletions dask_ml/preprocessing/_encoders.py
@@ -1,10 +1,12 @@
 import dask
 import dask.array as da
 import numpy as np
+import packaging.version
 import pandas as pd
 import sklearn.preprocessing

 from .label import _encode, _encode_dask_array
+from .._compat import SK_VERSION
 from ..utils import check_array


@@ -51,6 +53,9 @@ class OneHotEncoder(sklearn.preprocessing.OneHotEncoder):

     The used categories can be found in the ``categories_`` attribute.

+    drop : None, default=None
+        The option to drop one of the categories per feature is not yet supported.
+
     sparse : boolean, default=True
         Will return sparse matrix if set True else will return an array.

@@ -107,13 +112,25 @@ def __init__(
         n_values=None,
         categorical_features=None,
         categories="auto",
+        drop=None,
         sparse=True,
         dtype=np.float64,
         handle_unknown="error",
     ):
-        super(OneHotEncoder, self).__init__(
-            n_values, categorical_features, categories, sparse, dtype, handle_unknown
-        )
+        if drop is not None:
+            raise NotImplementedError("drop != None is not implemented yet.")
+        signature = {
+            "n_values": n_values,
+            "categorical_features": categorical_features,
+            "categories": categories,
+            "drop": drop,
+            "sparse": sparse,
+            "dtype": dtype,
+            "handle_unknown": handle_unknown,
+        }
+        if SK_VERSION < packaging.version.parse("0.21.0"):
+            del signature["drop"]
+        super(OneHotEncoder, self).__init__(**signature)

     def fit(self, X, y=None):
         if self.handle_unknown == "ignore":
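The version gate above is the heart of this fix: scikit-learn only added the `drop` keyword in 0.21.0, so forwarding it unconditionally to `super().__init__` would raise a `TypeError` on older releases. A minimal sketch of the same pattern, assuming only `packaging` and `sklearn` are installed (the `build_kwargs` helper is illustrative, not dask-ml API):

```python
import packaging.version
import sklearn

SK_VERSION = packaging.version.parse(sklearn.__version__)


def build_kwargs(**kwargs):
    # scikit-learn grew the ``drop`` keyword in 0.21.0; strip it on older
    # releases so the parent ``__init__`` never sees an unknown keyword.
    if SK_VERSION < packaging.version.parse("0.21.0"):
        kwargs.pop("drop", None)
    return kwargs
```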
38 changes: 24 additions & 14 deletions dask_ml/preprocessing/data.py
@@ -268,11 +268,7 @@ def _transform(self, X, inverse=False):
         return da.vstack(transformed).T

     def _transform_col(self, X_col, quantiles, inverse):
-        if self.output_distribution == "normal":
-            output_distribution = "norm"
-        else:
-            output_distribution = self.output_distribution
-        output_distribution = getattr(stats, output_distribution)
+        output_distribution = self.output_distribution

         if not inverse:
             lower_bound_x = quantiles[0]
@@ -284,10 +280,18 @@ def _transform_col(self, X_col, quantiles, inverse):
             upper_bound_x = 1
             lower_bound_y = quantiles[0]
             upper_bound_y = quantiles[-1]
-            X_col = X_col.map_blocks(output_distribution.cdf)
+            # for inverse transform, match a uniform distribution
+            if output_distribution == "normal":
+                X_col = X_col.map_blocks(stats.norm.cdf)
+            # else output distribution is already a uniform distribution

-        lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x
-        upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x
+        if output_distribution == "normal":
+            lower_bounds_idx = X_col - skdata.BOUNDS_THRESHOLD < lower_bound_x
+            upper_bounds_idx = X_col + skdata.BOUNDS_THRESHOLD > upper_bound_x
+        if output_distribution == "uniform":
+            lower_bounds_idx = X_col == lower_bound_x
+            upper_bounds_idx = X_col == upper_bound_x
+
         if not inverse:
             # See the note in scikit-learn. This trick is to avoid
             # repeated extreme values
@@ -304,12 +308,18 @@ def _transform_col(self, X_col, quantiles, inverse):
             X_col[lower_bounds_idx] = lower_bound_y

         if not inverse:
-            X_col = X_col.map_blocks(output_distribution.ppf)
-            clip_min = output_distribution.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1))
-            clip_max = output_distribution.ppf(
-                1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1))
-            )
-            X_col = da.clip(X_col, clip_min, clip_max)
+            if output_distribution == "normal":
+                X_col = X_col.map_blocks(stats.norm.ppf)
+                # find the value to clip the data to avoid mapping to
+                # infinity. Clip such that the inverse transform will be
+                # consistent
+                clip_min = stats.norm.ppf(skdata.BOUNDS_THRESHOLD - np.spacing(1))
+                clip_max = stats.norm.ppf(1 - (skdata.BOUNDS_THRESHOLD - np.spacing(1)))
+                X_col = da.clip(X_col, clip_min, clip_max)
+
+            # else output distribution is uniform and the ppf is the
+            # identity function, so we leave X_col unchanged

         return X_col

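For intuition on the `normal` branch: `stats.norm.ppf` maps 0 to `-inf` and 1 to `+inf`, so the boundary quantile values must be clipped to stay finite and keep the inverse transform consistent. A small NumPy/SciPy sketch, assuming `BOUNDS_THRESHOLD = 1e-7` (scikit-learn's value at the time; hard-coded here rather than imported):

```python
import numpy as np
from scipy import stats

BOUNDS_THRESHOLD = 1e-7  # assumption: matches sklearn.preprocessing.data

# Uniform quantile values, including the exact boundaries 0 and 1.
u = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

# Clip bounds chosen just inside (0, 1) so the inverse transform
# round-trips consistently instead of producing infinities.
clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1)))

out = np.clip(stats.norm.ppf(u), clip_min, clip_max)
assert np.isfinite(out).all()  # no +/- inf despite the boundary inputs
```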
15 changes: 8 additions & 7 deletions tests/preprocessing/test_data.py
@@ -206,32 +206,33 @@ def test_df_values(self):


 class TestQuantileTransformer(object):
-    def test_basic(self):
+    @pytest.mark.parametrize("output_distribution", ["uniform", "normal"])
+    def test_basic(self, output_distribution):
         rs = da.random.RandomState(0)
-        a = dpp.QuantileTransformer()
-        b = spp.QuantileTransformer()
+        a = dpp.QuantileTransformer(output_distribution=output_distribution)
+        b = spp.QuantileTransformer(output_distribution=output_distribution)

-        X = rs.uniform(size=(100, 3), chunks=50)
+        X = rs.uniform(size=(1000, 3), chunks=50)
         a.fit(X)
         b.fit(X)
         assert_estimator_equal(a, b, atol=0.02)

         # set the quantiles, so that from here out, we're exact
         a.quantiles_ = b.quantiles_
-        assert_eq_ar(a.transform(X), b.transform(X))
+        assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7)
         assert_eq_ar(X, a.inverse_transform(a.transform(X)))

     @pytest.mark.parametrize(
         "type_, kwargs",
         [
             (np.array, {}),
-            (da.from_array, {"chunks": 10}),
+            (da.from_array, {"chunks": 100}),
             (pd.DataFrame, {"columns": ["a", "b", "c"]}),
             (dd.from_array, {"columns": ["a", "b", "c"]}),
         ],
     )
     def test_types(self, type_, kwargs):
-        X = np.random.uniform(size=(20, 3))
+        X = np.random.uniform(size=(1000, 3))
         dX = type_(X, **kwargs)
         qt = spp.QuantileTransformer()
         qt.fit(X)
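What the parametrized `test_basic` asserts, as a standalone sketch (`dpp` and `spp` mirror the test module's aliases for `dask_ml.preprocessing` and `sklearn.preprocessing`; the tolerance mirrors the test's `atol=1e-7`):

```python
import numpy as np
import dask.array as da
import dask_ml.preprocessing as dpp
import sklearn.preprocessing as spp

X = da.random.RandomState(0).uniform(size=(1000, 3), chunks=50)

a = dpp.QuantileTransformer(output_distribution="normal")
b = spp.QuantileTransformer(output_distribution="normal")
a.fit(X)             # dask-ml: approximate quantiles, computed blockwise
b.fit(X.compute())   # scikit-learn: exact quantiles on the realized array

# Once the quantiles are shared, the two transforms agree almost exactly.
a.quantiles_ = b.quantiles_
assert np.allclose(a.transform(X).compute(), b.transform(X.compute()), atol=1e-7)
```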
26 changes: 24 additions & 2 deletions tests/preprocessing/test_encoders.py
@@ -34,7 +34,15 @@ def test_basic_array(sparse, method, categories):
     result = b.fit_transform(dX)

     assert_estimator_equal(
-        a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"}
+        a,
+        b,
+        exclude={
+            "n_values_",
+            "feature_indices_",
+            "active_features_",
+            "dtypes_",
+            "drop_idx_",
+        },
     )

     assert isinstance(result, da.Array)
@@ -83,7 +91,15 @@ def test_basic_dataframe(sparse, method, dask_data, dtype):
     result = b.fit_transform(dask_data)

     assert_estimator_equal(
-        a, b, exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"}
+        a,
+        b,
+        exclude={
+            "n_values_",
+            "feature_indices_",
+            "active_features_",
+            "dtypes_",
+            "drop_idx_",
+        },
     )

     assert isinstance(result, type(dask_data))
@@ -106,6 +122,12 @@ def test_invalid_handle_input():
     enc.fit(dX)


+def test_onehotencoder_drop_raises():
+    dask_ml.preprocessing.OneHotEncoder()
+    with pytest.raises(NotImplementedError):
+        dask_ml.preprocessing.OneHotEncoder(drop="first")
+
+
 def test_handles_numpy():
     enc = dask_ml.preprocessing.OneHotEncoder()
     enc.fit(X)
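From a user's perspective, the guard pinned down by `test_onehotencoder_drop_raises` behaves like this (a usage sketch, not an additional test in the PR):

```python
import pytest
import dask_ml.preprocessing

# The default drop=None is accepted.
dask_ml.preprocessing.OneHotEncoder()

# Any other value fails fast with NotImplementedError, before any fitting.
with pytest.raises(NotImplementedError, match="drop"):
    dask_ml.preprocessing.OneHotEncoder(drop="first")
```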