-
-
Notifications
You must be signed in to change notification settings - Fork 710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle null partitions in P2P shuffling #8116
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
from distributed.shuffle._core import ShuffleId, ShuffleRun, barrier_key | ||
from distributed.worker import Status | ||
|
||
np = pytest.importorskip("numpy") | ||
pd = pytest.importorskip("pandas") | ||
dd = pytest.importorskip("dask.dataframe") | ||
|
||
|
@@ -2057,6 +2058,28 @@ async def test_handle_null_partitions_p2p_shuffling(c, s, *workers): | |
await check_scheduler_cleanup(s) | ||
|
||
|
||
@gen_cluster(client=True) | ||
async def test_handle_null_partitions_p2p_shuffling_2(c, s, a, b): | ||
def make_partition(i): | ||
"""Return null column for one partition""" | ||
if i % 2 == 1: | ||
return pd.DataFrame({"a": np.random.random(10), "b": [None] * 10}) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would make the None column null explicitly; otherwise LGTM. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This fails unless I replace [inline code lost in extraction]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK. |
||
return pd.DataFrame({"a": np.random.random(10), "b": np.random.random(10)}) | ||
|
||
ddf = dd.from_map(make_partition, range(50)) | ||
out = ddf.shuffle(on="a", shuffle="p2p", ignore_index=True) | ||
result, expected = c.compute([ddf, out]) | ||
del out | ||
result = await result | ||
expected = await expected | ||
dd.assert_eq(result, expected) | ||
del result | ||
|
||
await check_worker_cleanup(a) | ||
await check_worker_cleanup(b) | ||
await check_scheduler_cleanup(s) | ||
|
||
|
||
@gen_cluster(client=True) | ||
async def test_set_index_p2p(c, s, *workers): | ||
df = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": 1}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test only fails 2/3 of the time on `main`, but that's good enough for me.