Skip to content

Commit

Permalink
CI job running tests with queuing on (#6989)
Browse files Browse the repository at this point in the history
  • Loading branch information
gjoseph92 committed Sep 5, 2022
1 parent 6120b75 commit b133009
Show file tree
Hide file tree
Showing 10 changed files with 107 additions and 44 deletions.
32 changes: 25 additions & 7 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,24 @@ jobs:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.8", "3.9", "3.10"]
queuing: [no_queue]
# Cherry-pick test modules to split the overall runtime roughly in half
partition: [ci1, not ci1]
exclude:
- os: macos-latest
python-version: 3.9
include:
- partition: "ci1"
partition-label: "ci1"
- partition: "not ci1"
partition-label: "notci1"
- os: ubuntu-latest
python-version: 3.9
queuing: queue
partition: "ci1"
- os: ubuntu-latest
python-version: 3.9
queuing: queue
partition: "not ci1"

# Uncomment to stress-test the test suite for random failures.
# Must also change env.TEST_ID below.
# Must also change `export TEST_ID` in first step below.
# This will take a LONG time and delay all PRs across the whole github.com/dask!
# To avoid hamstringing other people, change 'on: [push, pull_request]' above
# to just 'on: [push]'; this way the stress test will run exclusively in your
Expand All @@ -45,10 +50,18 @@ jobs:

env:
CONDA_FILE: continuous_integration/environment-${{ matrix.python-version }}.yaml
TEST_ID: ${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.partition-label }}
# TEST_ID: ${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.partition-label }}-${{ matrix.run }}

steps:
- name: Set $TEST_ID
run: |
export PARTITION_LABEL=$( echo "${{ matrix.partition }}" | sed "s/ //" )
export TEST_ID="${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.queuing }}-$PARTITION_LABEL"
# Switch to this version for stress-test:
# export TEST_ID="${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.queuing }}-$PARTITION_LABEL-${{ matrix.run }}"
echo "TEST_ID: $TEST_ID"
echo "TEST_ID=$TEST_ID" >> $GITHUB_ENV
shell: bash

- name: Checkout source
uses: actions/checkout@v2
with:
Expand Down Expand Up @@ -145,6 +158,11 @@ jobs:
export DISABLE_IPV6=1
fi
if [[ "${{ matrix.queuing }}" = "queue" ]]; then
export DASK_DISTRIBUTED__SCHEDULER__WORKER_SATURATION=1.0
echo "worker-saturation: $DASK_DISTRIBUTED__SCHEDULER__WORKER_SATURATION"
fi
source continuous_integration/scripts/set_ulimit.sh
set -o pipefail
mkdir reports
Expand Down
33 changes: 17 additions & 16 deletions continuous_integration/scripts/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,22 +136,23 @@ def get_jobs(run, session):
cache[url] = jobs

df_jobs = pandas.DataFrame.from_records(jobs)
extracted = df_jobs.name.str.extract(
r"\(([\w\-]+), (\d\.\d+),\s([\w|\s]+)\)"
).dropna()
df_jobs["OS"] = extracted[0]
df_jobs["python_version"] = extracted[1]
df_jobs["partition"] = extracted[2]
# We later need to join on this. Somehow the job ID is not part of the workflow schema and we have no other way to join
df_jobs["suite_name"] = (
df_jobs["OS"]
+ "-"
+ df_jobs["python_version"]
+ "-"
+ df_jobs["partition"].str.replace(" ", "")
name_components = (
df_jobs.name.str.extract(r"test \((.+)\)", expand=False)
.dropna()
.str.split(", ", expand=True)
.set_axis(["OS", "python_version", "queuing", "partition"], axis="columns")
.assign(
# We later need to join on this. Somehow the job ID is not part of the workflow schema and we have no other way to join
suite_name=lambda df: df["OS"]
+ "-"
+ df["python_version"]
+ "-"
+ df["queuing"]
+ "-"
+ df["partition"].str.replace(" ", "")
)
)

return df_jobs
return pandas.concat([df_jobs, name_components], axis="columns")


def get_workflow_run_listing(
Expand Down Expand Up @@ -205,7 +206,7 @@ def suite_from_name(name: str) -> str:
can have matrix partitions, pytest marks, etc. Basically,
just lop off the front of the name to get the suite.
"""
return "-".join(name.split("-")[:3])
return "-".join(name.split("-")[:4])


def download_and_parse_artifact(
Expand Down
3 changes: 2 additions & 1 deletion distributed/dashboard/tests/test_scheduler_bokeh.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,13 +281,14 @@ async def test_ProcessingHistogram(c, s, a, b):
ph = ProcessingHistogram(s)
ph.update()
assert (ph.source.data["top"] != 0).sum() == 1
assert ph.source.data["right"][-1] < 2

futures = c.map(slowinc, range(10), delay=0.050)
while not s.tasks:
await asyncio.sleep(0.01)

ph.update()
assert ph.source.data["right"][-1] > 2
assert ph.source.data["right"][-1] >= 2


@gen_cluster(client=True)
Expand Down
6 changes: 5 additions & 1 deletion distributed/deploy/tests/test_adaptive.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,11 @@ def test_basic_no_loop(cleanup):
@gen_test()
async def test_target_duration():
with dask.config.set(
{"distributed.scheduler.default-task-durations": {"slowinc": 1}}
{
"distributed.scheduler.default-task-durations": {"slowinc": 1},
# adaptive target for queued tasks doesn't yet consider default or learned task durations
"distributed.scheduler.worker-saturation": float("inf"),
}
):
async with LocalCluster(
n_workers=0,
Expand Down
7 changes: 7 additions & 0 deletions distributed/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import gc
import inspect
import logging
import math
import operator
import os
import pathlib
Expand Down Expand Up @@ -5515,6 +5516,12 @@ async def close():
loop.run_sync(close) # TODO: client.close() does not unset global client


# FIXME shouldn't consistently fail on windows, may be an actual bug
@pytest.mark.skipif(
WINDOWS
and math.isfinite(dask.config.get("distributed.scheduler.worker-saturation")),
reason="flaky on Windows with queuing active",
)
@pytest.mark.slow
@gen_cluster(client=True, Worker=Nanny, timeout=60, nthreads=[("127.0.0.1", 3)] * 2)
async def test_nested_compute(c, s, a, b):
Expand Down
6 changes: 5 additions & 1 deletion distributed/tests/test_reschedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@
@gen_cluster(
client=True,
nthreads=[("", 1)] * 2,
config={"distributed.scheduler.work-stealing": False},
config={
"distributed.scheduler.work-stealing": False,
# Difficult to get many tasks in processing with scheduler-side queuing
"distributed.scheduler.worker-saturation": float("inf"),
},
)
async def test_scheduler_reschedule(c, s, a, b):
xs = c.map(slowinc, range(100), key="x", delay=0.1)
Expand Down
17 changes: 13 additions & 4 deletions distributed/tests/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
wait,
)
from distributed.comm.addressing import parse_host_port
from distributed.compatibility import LINUX, WINDOWS
from distributed.compatibility import LINUX, MACOS, WINDOWS
from distributed.core import ConnectionPool, Status, clean_exception, connect, rpc
from distributed.metrics import time
from distributed.protocol.pickle import dumps, loads
Expand Down Expand Up @@ -150,6 +150,7 @@ def test_decide_worker_coschedule_order_neighbors(ndeps, nthreads):
nthreads=nthreads,
config={
"distributed.scheduler.work-stealing": False,
"distributed.scheduler.worker-saturation": float("inf"),
},
)
async def test_decide_worker_coschedule_order_neighbors_(c, s, *workers):
Expand Down Expand Up @@ -832,7 +833,7 @@ async def test_ready_remove_worker(s, a, b):
elif math.isinf(s.WORKER_SATURATION):
cmp = operator.gt
else:
pytest.fail(f"{s.WORKER_OVERSATURATION=}, must be 1 or inf")
pytest.fail(f"{s.WORKER_SATURATION=}, must be 1 or inf")

assert all(cmp(len(w.processing), w.nthreads) for w in s.workers.values()), (
list(s.workers.values()),
Expand Down Expand Up @@ -1465,6 +1466,12 @@ async def test_balance_many_workers(c, s, *workers):
assert {len(w.has_what) for w in s.workers.values()} == {0, 1}


# FIXME test is very timing-based; if some threads are consistently slower than others,
# they'll receive fewer tasks from the queue (a good thing).
@pytest.mark.skipif(
MACOS and math.isfinite(dask.config.get("distributed.scheduler.worker-saturation")),
reason="flaky on macOS with queuing active",
)
@nodebug
@gen_cluster(
client=True,
Expand Down Expand Up @@ -3743,7 +3750,9 @@ async def test_Scheduler__to_dict(c, s, a):
assert isinstance(d["workers"][a.address]["memory"]["process"], int)


@gen_cluster(client=True, nthreads=[])
@gen_cluster(
client=True, nthreads=[], config={"distributed.scheduler.worker-saturation": 1.0}
)
async def test_TaskState__to_dict(c, s):
"""tasks that are listed as dependencies of other tasks are dumped as a short repr
and always appear in full under Scheduler.tasks
Expand All @@ -3760,7 +3769,7 @@ async def test_TaskState__to_dict(c, s):
assert isinstance(tasks["y"], dict)
assert isinstance(tasks["z"], dict)
assert tasks["x"]["dependents"] == ["<TaskState 'y' waiting>"]
assert tasks["y"]["dependencies"] == ["<TaskState 'x' no-worker>"]
assert tasks["y"]["dependencies"] == ["<TaskState 'x' queued>"]


def _verify_cluster_state(
Expand Down
30 changes: 22 additions & 8 deletions distributed/tests/test_steal.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import contextlib
import itertools
import logging
import math
import random
import weakref
from operator import mul
Expand Down Expand Up @@ -800,9 +801,14 @@ async def test_steal_twice(c, s, a, b):

while len(s.tasks) < 100: # tasks are all allocated
await asyncio.sleep(0.01)
# Wait for b to start stealing tasks
while len(b.state.tasks) < 30:
await asyncio.sleep(0.01)
if math.isinf(s.WORKER_SATURATION):
# Wait for b to start stealing tasks
while len(b.state.tasks) < 30:
await asyncio.sleep(0.01)
else:
# Wait for b to complete some tasks
while len(b.data) < 8:
await asyncio.sleep(0.01)

# Army of new workers arrives to help
workers = await asyncio.gather(*(Worker(s.address) for _ in range(20)))
Expand All @@ -816,6 +822,8 @@ async def test_steal_twice(c, s, a, b):
), f"Too many workers without keys ({len(empty_workers)} out of {len(s.workers)})"
# This also tests that some tasks were stolen from b
# (see `while len(b.state.tasks) < 30` above)
# If queuing is enabled, then there was nothing to steal from b,
# so this just tests the queue was balanced not-terribly.
assert max(len(ws.has_what) for ws in s.workers.values()) < 30

assert a.state.in_flight_tasks_count == 0
Expand Down Expand Up @@ -1072,6 +1080,11 @@ async def test_steal_concurrent_simple(c, s, *workers):
assert not ws2.has_what


# FIXME shouldn't consistently fail, may be an actual bug?
@pytest.mark.skipif(
math.isfinite(dask.config.get("distributed.scheduler.worker-saturation")),
reason="flaky with queuing active",
)
@gen_cluster(
client=True,
config={
Expand All @@ -1082,12 +1095,13 @@ async def test_steal_reschedule_reset_in_flight_occupancy(c, s, *workers):
# https://github.com/dask/distributed/issues/5370
steal = s.extensions["stealing"]
w0 = workers[0]
futs1 = c.map(
slowinc,
range(10),
key=[f"f1-{ix}" for ix in range(10)],
roots = c.map(
inc,
range(6),
key=[f"r-{ix}" for ix in range(6)],
)
while not w0.state.tasks:
futs1 = [c.submit(slowinc, f, key=f"f1-{ix}") for f in roots for ix in range(4)]
while not w0.state.ready:
await asyncio.sleep(0.01)

# ready is a heap but we don't need last, just not the next
Expand Down
4 changes: 2 additions & 2 deletions distributed/tests/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -1542,7 +1542,7 @@ async def f(ev):
)
await a.close()
assert task.cancelled()
assert s.tasks["f1"].state == "no-worker"
assert s.tasks["f1"].state in ("queued", "no-worker")


@pytest.mark.slow
Expand Down Expand Up @@ -1570,7 +1570,7 @@ async def f(ev):
assert "Failed to cancel asyncio task" in logger.getvalue()
assert time() - start < 5
assert not task.cancelled()
assert s.tasks["f1"].state == "no-worker"
assert s.tasks["f1"].state in ("queued", "no-worker")
task.cancel()
await asyncio.wait({task})

Expand Down
13 changes: 9 additions & 4 deletions distributed/tests/test_worker_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,16 +922,21 @@ def __reduce__(self):
return bool, (paused,)

futs = c.map(SlowSpill, range(N_TOTAL))
while len(a.data.slow) < N_PAUSE + 1:
while len(a.data.slow) < (N_PAUSE + 1 if a.state.ready else N_PAUSE):
await asyncio.sleep(0.01)

assert a.status == Status.paused
# Worker should have become paused after the first `SlowSpill` was evicted, because
# the spill to disk took longer than the memory monitor interval.
assert len(a.data.fast) == 0
assert len(a.data.slow) == N_PAUSE + 1
n_spilled_while_paused = sum(paused is True for paused in a.data.slow.values())
assert N_PAUSE <= n_spilled_while_paused <= N_PAUSE + 1
# With queuing enabled, after the 3rd `SlowSpill` has been created, there's a race
# between the scheduler sending the worker a new task, and the memory monitor
# running and pausing the worker. If the worker gets paused before the 4th task
# lands, only 3 will be in memory. If after, the 4th will block on the semaphore
# until one of the others is spilled.
assert len(a.data.slow) in (N_PAUSE, N_PAUSE + 1)
n_spilled_while_not_paused = sum(paused is False for paused in a.data.slow.values())
assert 0 <= n_spilled_while_not_paused <= 1


@pytest.mark.slow
Expand Down

0 comments on commit b133009

Please sign in to comment.