-
-
Notifications
You must be signed in to change notification settings - Fork 256
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Integrate dask-expr and make CI happy #980
Integrate dask-expr and make CI happy #980
Conversation
bb68436
to
2481557
Compare
569cedc
to
d7d91b3
Compare
6240910
to
400446b
Compare
d488738
to
963567e
Compare
mark = pytest.mark.xfail( | ||
DASK_EXPR_ENABLED, | ||
reason="dask-expr: NotImplementedError in assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)", | ||
) | ||
|
||
@pytest.mark.parametrize("daskify", [pytest.param(True, marks=mark), False]) | ||
def test_df_transform_index(self, daskify): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
traceback
___________________________ TestPolynomialFeatures.test_df_transform_index[True] ___________________________
self = <tests.preprocessing.test_data.TestPolynomialFeatures object at 0x7f0c5bf00bb0>, daskify = True
@pytest.mark.parametrize("daskify", [pytest.param(True, marks=mark), False])
def test_df_transform_index(self, daskify):
frame = copy(df)
if not daskify:
frame = frame.compute()
frame = frame.sample(frac=1.0)
res_df = dpp.PolynomialFeatures(
preserve_dataframe=True, degree=1
).fit_transform(frame)
> assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)
tests/preprocessing/test_data.py:649:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <dask_expr._indexing.ILocIndexer object at 0x7f0c691fb400>
key = (slice(None, None, None), slice(1, None, None))
def __getitem__(self, key):
msg = (
"'DataFrame.iloc' only supports selecting columns. "
"It must be used like 'df.iloc[:, column_indexer]'."
)
if not isinstance(key, tuple):
raise NotImplementedError(msg)
if len(key) > 2:
raise ValueError("Too many indexers")
iindexer, cindexer = key
if iindexer != slice(None):
raise NotImplementedError(msg)
if len(self.obj.columns) == len(set(self.obj.columns)):
col_names = self.obj.columns[cindexer]
if not is_scalar(col_names):
col_names = list(col_names)
return new_collection(Projection(self.obj, col_names))
else:
> raise NotImplementedError
E NotImplementedError
../../mambaforge/envs/dask-ml/lib/python3.10/site-packages/dask_expr/_indexing.py:51: NotImplementedError
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does the DataFRame have duplicated columns? Then yes, we don't care for now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, okay the output res_df
has duplicate columns, but seems expected as that's what the iloc is for, but you're saying b/c of that we don't care right now because it's not implemented.
(Pdb) p frame.columns
Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
'13', '14', '15', '16', '17', '18', '19'],
dtype='object')
(Pdb) p res_df.columns
Index(['1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
'13', '14', '15', '16', '17', '18', '19'],
dtype='object')
if DASK_EXPR_ENABLED and daskify == "unknown" and categories != ["b", "a"]: | ||
with pytest.raises(AssertionError, match="Arrays are not equal"): | ||
np.testing.assert_array_equal(a.classes_, categories) | ||
return |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
traceback
def test_categorical(self, categories, transformed, daskify, ordered):
cat = pd.Series(
["a", "b", "a"],
dtype=pd.api.types.CategoricalDtype(categories=categories, ordered=ordered),
)
if daskify:
cat = dd.from_pandas(cat, npartitions=2)
transformed = da.from_array(transformed, chunks=(2, 1))
if daskify == "unknown":
cat = cat.cat.as_unknown()
a = dpp.LabelEncoder().fit(cat)
if daskify != "unknown":
assert a.dtype_ == cat.dtype
> np.testing.assert_array_equal(a.classes_, categories)
tests/preprocessing/test_label.py:125:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
args = (<built-in function eq>, array(['b', 'a'], dtype=object), ['a', 'b'])
kwds = {'err_msg': '', 'header': 'Arrays are not equal', 'strict': False, 'verbose': True}
@wraps(func)
def inner(*args, **kwds):
with self._recreate_cm():
> return func(*args, **kwds)
E AssertionError:
E Arrays are not equal
E
E Mismatched elements: 2 / 2 (100%)
E x: array(['b', 'a'], dtype=object)
E y: array(['a', 'b'], dtype='<U1')
../../mambaforge/envs/dask-ml/lib/python3.10/contextlib.py:79: AssertionError
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could be legit, could you look into why the order is not correct?
if not DASK_EXPR_ENABLED: | ||
# dask-expr cannot set _meta | ||
dd_X._meta = pd.DataFrame({"c_0": [5]}) | ||
|
||
# Failure when not proving predict_meta | ||
# because of value dependent model | ||
wrap = ParallelPostFit(base) | ||
with pytest.raises(ValueError): | ||
wrap.predict(dd_X) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@phofl I kinda assume we can safely ignore this test for dask-expr? Fails b/c we cannot set _meta
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm this is not great since it seems like the behaviour actually depends on meta?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a 'good' way to modify meta? Otherwise I can try mocking it here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lets chat about this in the standup
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
okay lets move ahead then
98d150c
to
db50cea
Compare
I can't merge, don't know who can @fjetter probably? |
I can as well :) Thanks for handling this! |
First commit is only skipping tests that fail currently w/o dask-expr, can be moved to it's own PR if desired. It skips tests that fail from mostly deprecations/future warnings, and some other things due to outdated code from updated dependencies.
other commits are adding dask-expr logic and marking tests that currently fail using skipif dask-expr is enabled.