Warn if tasks are submitted with identical keys but different run_spec
crusaderky committed Feb 16, 2024
1 parent 045dc64 commit cad1467
Showing 4 changed files with 167 additions and 5 deletions.
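For context, the kind of key collision this commit detects can be reproduced with a sketch like the following (a hypothetical client session, not part of the diff; assumes a default local cluster):

    from dask import delayed
    from distributed import Client

    client = Client()

    # First submission binds key "y" to one computation...
    y1 = client.submit(lambda x: x + 1, 1, key="y")

    # ...then a second graph reuses key "y" for a different computation.
    y2 = delayed(lambda x: x * 10)(2, dask_key_name="y")
    fut = client.compute(y2)
    # The scheduler keeps the old run_spec for "y" and, with this commit,
    # logs: "Detected different `run_spec` for key 'y' between two
    # consecutive calls to `update_graph`. ..."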
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -154,7 +154,7 @@ jobs:
           # Increase this value to reset cache if
           # continuous_integration/environment-${{ matrix.environment }}.yaml has not
           # changed. See also same variable in .pre-commit-config.yaml
-          CACHE_NUMBER: 0
+          CACHE_NUMBER: 1
         id: cache

       - name: Update environment
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -69,4 +69,4 @@ repos:

 # Increase this value to clear the cache on GitHub actions if nothing else in this file
 # has changed. See also same variable in .github/workflows/test.yaml
-# CACHE_NUMBER: 0
+# CACHE_NUMBER: 1
46 changes: 43 additions & 3 deletions distributed/scheduler.py
@@ -53,6 +53,7 @@

 import dask
 import dask.utils
+from dask.base import TokenizationError, normalize_token, tokenize
 from dask.core import get_deps, iskey, validate_key
 from dask.typing import Key, no_default
 from dask.utils import (
@@ -4752,6 +4753,7 @@ def _generate_taskstates(
         stack = list(keys)
         touched_keys = set()
         touched_tasks = []
+        tgs_with_bad_run_spec = set()
         while stack:
             k = stack.pop()
             if k in touched_keys:
@@ -4772,9 +4774,47 @@ def _generate_taskstates(
             # run_spec in the submitted graph may be None. This happens
             # when an already persisted future is part of the graph
             elif k in dsk:
-                # TODO run a health check to verify that run_spec and dependencies
-                # did not change. See https://github.com/dask/distributed/pull/8185
-                pass
+                try:
+                    tok_lhs: Any = tokenize(ts.run_spec, ensure_deterministic=True)
+                    tok_rhs: Any = tokenize(dsk[k], ensure_deterministic=True)
+                except TokenizationError:
+                    # Non-deterministic tokens; skip the comparison
+                    tok_lhs = tok_rhs = None
+
+                # Additionally check dependency names. This should only be necessary
+                # if run_specs can't be tokenized deterministically.
+                deps_lhs = {dts.key for dts in ts.dependencies}
+                deps_rhs = dependencies.get(k, set())
+
+                # FIXME It would be healthier to make this a hard failure.
+                # However, that is not possible at the moment because of
+                # https://github.com/dask/dask/issues/9888
+                if (
+                    tok_lhs != tok_rhs or deps_lhs != deps_rhs
+                ) and ts.group not in tgs_with_bad_run_spec:
+                    tgs_with_bad_run_spec.add(ts.group)
+                    logger.warning(
+                        f"Detected different `run_spec` for key {ts.key!r} between two "
+                        "consecutive calls to `update_graph`. This can cause failures "
+                        "and deadlocks down the line. Please ensure unique key names. "
+                        "If you are using standard dask collections, consider "
+                        "releasing all the data before resubmitting another "
+                        "computation. More details and help can be found at "
+                        "https://github.com/dask/dask/issues/9888. "
+                        + textwrap.dedent(
+                            f"""
+                            Debugging information
+                            ---------------------
+                            old task state: {ts.state}
+                            old run_spec: {ts.run_spec!r}
+                            new run_spec: {dsk[k]!r}
+                            old token: {normalize_token(ts.run_spec)!r}
+                            new token: {normalize_token(dsk[k])!r}
+                            old dependencies: {deps_lhs}
+                            new dependencies: {deps_rhs}
+                            """
+                        )
+                    )

             if ts.run_spec:
                 runnable.append(ts)
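The try/except above leans on `tokenize(..., ensure_deterministic=True)`, which raises `TokenizationError` for objects that have no stable token instead of silently falling back to a random one. A minimal illustrative sketch (assumes a dask version of this era, in which a bare `object()` is not deterministically tokenizable):

    from dask.base import TokenizationError, tokenize

    # Plain data tokenizes deterministically: equal inputs, equal tokens.
    assert tokenize(("inc", 1)) == tokenize(("inc", 1))

    # A bare object() has no stable token; with ensure_deterministic=True,
    # tokenize raises rather than returning a random fallback token.
    try:
        tokenize(object(), ensure_deterministic=True)
    except TokenizationError:
        # The scheduler lands here and skips the run_spec comparison,
        # falling back to comparing dependency names only.
        pass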
122 changes: 122 additions & 0 deletions distributed/tests/test_scheduler.py
@@ -4702,3 +4702,125 @@ async def test_html_repr(c, s, a, b):
         await asyncio.sleep(0.01)

     await f
+
+
+@pytest.mark.parametrize("add_deps", [False, True])
+@gen_cluster(client=True, nthreads=[])
+async def test_resubmit_different_task_same_key(c, s, add_deps):
+    """If a task is resubmitted under the same key with a different run_spec (either
+    the callable or the dependencies / arguments), the scheduler should warn, since
+    the mismatch can otherwise break in many different places and cause either
+    spurious exceptions or deadlocks.
+
+    For a real-world example where this can trigger, see
+    https://github.com/dask/dask/issues/9888
+    """
+    y1 = c.submit(inc, 1, key="y")
+
+    x = delayed(inc)(1, dask_key_name="x") if add_deps else 2
+    y2 = delayed(inc)(x, dask_key_name="y")
+    z = delayed(inc)(y2, dask_key_name="z")
+
+    if add_deps:  # add_deps=True corrupts the state machine
+        s.validate = False
+
+    with captured_logger("distributed.scheduler", level=logging.WARNING) as log:
+        fut = c.compute(z)
+        await wait_for_state("z", "waiting", s)
+
+    assert "Detected different `run_spec` for key 'y'" in log.getvalue()
+
+    async with Worker(s.address):
+        if not add_deps:  # add_deps=True hangs
+            assert await y1 == 2
+            assert await fut == 3
+
+
+@gen_cluster(client=True, nthreads=[])
+async def test_resubmit_different_task_same_key_many_clients(c, s):
+    """Two different clients submit a task with the same key but different run_specs."""
+    async with Client(s.address, asynchronous=True) as c2:
+        with captured_logger("distributed.scheduler", level=logging.WARNING) as log:
+            x1 = c.submit(inc, 1, key="x")
+            x2 = c2.submit(inc, 2, key="x")
+
+            await wait_for_state("x", ("no-worker", "queued"), s)
+            who_wants = s.tasks["x"].who_wants
+            await async_poll_for(
+                lambda: {cs.client_key for cs in who_wants} == {c.id, c2.id}, timeout=5
+            )
+
+        assert "Detected different `run_spec` for key 'x'" in log.getvalue()
+
+        async with Worker(s.address):
+            assert await x1 == 2
+            assert await x2 == 2  # kept old run_spec
+
+
+@gen_cluster(client=True, nthreads=[])
+async def test_resubmit_nondeterministic_task_same_deps(c, s):
+    """Some run_specs can't be tokenized deterministically. Silently skip the
+    run_spec comparison in those cases; the dependencies must still be the same.
+    """
+    o = object()
+    # Round-tripping `o` through two separate cloudpickle.dumps() calls generates two
+    # different object instances, which yield different tokens.
+    x1 = c.submit(lambda x: x, o, key="x")
+    x2 = delayed(lambda x: x)(o, dask_key_name="x")
+    y = delayed(lambda x: x)(x2, dask_key_name="y")
+    fut = c.compute(y)
+    await async_poll_for(lambda: "y" in s.tasks, timeout=5)
+    async with Worker(s.address):
+        assert type(await fut) is object
+
+
+@pytest.mark.parametrize("add_deps", [False, True])
+@gen_cluster(client=True, nthreads=[])
+async def test_resubmit_nondeterministic_task_different_deps(c, s, add_deps):
+    """Some run_specs can't be tokenized deterministically. Silently skip the
+    run_spec comparison in those cases, but warn anyway if the dependencies
+    have changed.
+    """
+    o = object()
+    x1 = c.submit(inc, 1, key="x1") if not add_deps else 2
+    x2 = c.submit(inc, 2, key="x2")
+    y1 = delayed(lambda i, j: i)(x1, o, dask_key_name="y").persist()
+    y2 = delayed(lambda i, j: i)(x2, o, dask_key_name="y")
+    z = delayed(inc)(y2, dask_key_name="z")
+
+    if add_deps:  # add_deps=True corrupts the state machine and hangs
+        s.validate = False
+
+    with captured_logger("distributed.scheduler", level=logging.WARNING) as log:
+        fut = c.compute(z)
+        await wait_for_state("z", "waiting", s)
+    assert "Detected different `run_spec` for key 'y'" in log.getvalue()
+
+    if not add_deps:  # add_deps=True corrupts the state machine and hangs
+        async with Worker(s.address):
+            assert await fut == 3
+
+
+@gen_cluster(client=True, nthreads=[])
+async def test_resubmit_different_task_same_key_warns_only_once(c, s):
+    """If all tasks of a layer are affected by the same run_spec collision, warn
+    only once.
+    """
+    x1s = c.map(inc, [0, 1, 2], key=[("x", 0), ("x", 1), ("x", 2)])
+    dsk = {
+        ("x", 0): 3,
+        ("x", 1): 4,
+        ("x", 2): 5,
+        ("y", 0): (inc, ("x", 0)),
+        ("y", 1): (inc, ("x", 1)),
+        ("y", 2): (inc, ("x", 2)),
+    }
+    with captured_logger("distributed.scheduler", level=logging.WARNING) as log:
+        ys = c.get(dsk, [("y", 0), ("y", 1), ("y", 2)], sync=False)
+        await wait_for_state(("y", 2), "waiting", s)
+
+    assert (
+        len(re.findall("Detected different `run_spec` for key ", log.getvalue())) == 1
+    )
+
+    async with Worker(s.address):
+        assert await c.gather(ys) == [2, 3, 4]
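As the warning text advises, callers can avoid the collision by using unique key names, or by releasing the old futures so the scheduler forgets the key before it is reused. A sketch of the latter (hypothetical session; forgetting happens asynchronously, so a real program may need to wait briefly before reusing the key):

    from distributed import Client

    client = Client()
    f1 = client.submit(lambda x: x + 1, 1, key="y")
    assert f1.result() == 2

    del f1  # drop the last reference so the scheduler can forget key "y"
    # ...once "y" has been forgotten, the key can be bound to a new run_spec:
    f2 = client.submit(lambda x: x * 10, 2, key="y")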
