Skip to content

Commit

Permalink
Reduce local test flakyness (#676)
Browse files Browse the repository at this point in the history
* Remove note about tests not working on M1

* Use `docker_image` in `test_pod_create_and_delete`

Changed as the previously used `gcr.io/google_containers/pause` is not available for arm64

* Ability to re-run to `helm` tests

* Use `utcnow()` instead of `now()` for fake GCP auth

Otherwise tests running in a non-UTC timezone will fail

* Increase timeouts to avoid raceconditions on slower machines

* Use readyness instead of "Running"
  • Loading branch information
bstadlbauer committed Mar 14, 2023
1 parent 1f1c856 commit 363e000
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 23 deletions.
10 changes: 8 additions & 2 deletions dask_kubernetes/aiopykube/tests/test_pykube_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


@pytest.mark.asyncio
async def test_pod_create_and_delete(k8s_cluster):
async def test_pod_create_and_delete(docker_image, k8s_cluster):
api = HTTPClient(KubeConfig.from_env())
name = "test-" + uuid.uuid4().hex[:10]
pod = Pod(
Expand All @@ -19,7 +19,13 @@ async def test_pod_create_and_delete(k8s_cluster):
"metadata": {"name": name},
"spec": {
"containers": [
{"name": "pause", "image": "gcr.io/google_containers/pause"}
# Cannot use `gcr.io/google_containers/pause` as it's not available
# for arm64
{
"name": "pause",
"image": docker_image,
"command": ["sleep", "1000"],
},
]
},
},
Expand Down
2 changes: 1 addition & 1 deletion dask_kubernetes/classic/tests/fake_gcp_auth.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import datetime
import json

expiry = datetime.datetime.now() + datetime.timedelta(seconds=5)
expiry = datetime.datetime.utcnow() + datetime.timedelta(seconds=5)
expiry.replace(tzinfo=datetime.timezone.utc)
expiry_str = expiry.isoformat("T") + "Z"

Expand Down
14 changes: 7 additions & 7 deletions dask_kubernetes/classic/tests/test_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ async def test_pod_from_yaml(k8s_cluster, docker_image):
await cluster
async with Client(cluster, asynchronous=True) as client:
future = client.submit(lambda x: x + 1, 10)
result = await future.result(timeout=10)
result = await future.result(timeout=30)
assert result == 11

await client.wait_for_workers(2)
Expand Down Expand Up @@ -448,7 +448,7 @@ async def test_scale_up_down(cluster, client):
start = time()
while len(cluster.scheduler_info["workers"]) != 2:
await asyncio.sleep(0.1)
assert time() < start + 20
assert time() < start + 60

a, b = list(cluster.scheduler_info["workers"])
x = client.submit(np.ones, 1, workers=a)
Expand All @@ -462,7 +462,7 @@ async def test_scale_up_down(cluster, client):
start = time()
while len(cluster.scheduler_info["workers"]) != 1:
await asyncio.sleep(0.1)
assert time() < start + 20
assert time() < start + 60

# assert set(cluster.scheduler_info["workers"]) == {b}

Expand Down Expand Up @@ -800,7 +800,7 @@ async def get_worker_pods():
start = time()
while len(cluster.scheduler_info["workers"]) != 2:
await asyncio.sleep(0.1)
assert time() < start + 20
assert time() < start + 60

worker_pods = await get_worker_pods()
assert len(worker_pods) == 2
Expand All @@ -814,12 +814,12 @@ async def get_worker_pods():
if to_delete not in worker_pods:
break
await asyncio.sleep(0.1)
assert time() < start + 20
assert time() < start + 60
# test whether adapt will bring it back
start = time()
while len(cluster.scheduler_info["workers"]) != 2:
await asyncio.sleep(0.1)
assert time() < start + 20
assert time() < start + 60
assert len(cluster.scheduler_info["workers"]) == 2


Expand Down Expand Up @@ -888,7 +888,7 @@ async def test_auto_refresh(cluster):
for task in asyncio.all_tasks():
if task.get_name() == "dask_auth_auto_refresh":
loader.auto_refresh = False
await asyncio.wait_for(task, 10)
await asyncio.wait_for(task, 60)
break
else:
assert False
4 changes: 2 additions & 2 deletions dask_kubernetes/classic/tests/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def test_scale_up_down(cluster, client):
start = time()
while len(cluster.scheduler_info["workers"]) != 2:
sleep(0.1)
assert time() < start + 10
assert time() < start + 30

a, b = list(cluster.scheduler_info["workers"])
x = client.submit(np.ones, 1, workers=a)
Expand All @@ -324,7 +324,7 @@ def test_scale_up_down(cluster, client):
start = time()
while len(cluster.scheduler_info["workers"]) != 1:
sleep(0.1)
assert time() < start + 20
assert time() < start + 60

# assert set(cluster.scheduler_info["workers"]) == {b}

Expand Down
18 changes: 15 additions & 3 deletions dask_kubernetes/helm/tests/test_helm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,21 @@
@pytest.fixture(scope="session")
def chart_repo():
repo_name = "dask"
subprocess.run(
["helm", "repo", "add", repo_name, "https://helm.dask.org/"], check=True
)
repo_url = "https://helm.dask.org/"
output = subprocess.run(["helm", "repo", "list"], capture_output=True)
repo_lines = output.stdout.decode().splitlines()[1:] # First line is header
dask_repo_present = False
for repo_line in repo_lines:
repo, url = repo_line.replace(" ", "").split("\t")
if repo == repo_name:
if url.rstrip("/") != repo_url.rstrip("/"):
raise ValueError(f"Dask repo already present with different URL {url}")
dask_repo_present = True
if not dask_repo_present:
subprocess.run(
["helm", "repo", "add", repo_name, repo_url],
check=True,
)
subprocess.run(["helm", "repo", "update"], check=True)
return repo_name

Expand Down
21 changes: 15 additions & 6 deletions dask_kubernetes/operator/controller/tests/test_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,13 @@ async def test_scalesimplecluster(k8s_cluster, kopf_runner, gen_cluster):
await asyncio.sleep(0.1)
while worker_pod_name not in k8s_cluster.kubectl("get", "pods"):
await asyncio.sleep(0.1)
while "Running" not in k8s_cluster.kubectl(
"get", "pods", scheduler_pod_name
):
await asyncio.sleep(0.1)
k8s_cluster.kubectl(
"wait",
"pods",
"--for=condition=Ready",
scheduler_pod_name,
"--timeout=120s",
)
with k8s_cluster.port_forward(f"service/{service_name}", 8786) as port:
async with Client(
f"tcp://localhost:{port}", asynchronous=True
Expand Down Expand Up @@ -155,7 +158,13 @@ async def test_simplecluster(k8s_cluster, kopf_runner, gen_cluster):
await asyncio.sleep(0.1)
while worker_pod_name not in k8s_cluster.kubectl("get", "pods"):
await asyncio.sleep(0.1)

k8s_cluster.kubectl(
"wait",
"pods",
"--for=condition=Ready",
scheduler_pod_name,
"--timeout=120s",
)
with k8s_cluster.port_forward(f"service/{service_name}", 8786) as port:
async with Client(
f"tcp://localhost:{port}", asynchronous=True
Expand All @@ -174,7 +183,7 @@ async def test_simplecluster(k8s_cluster, kopf_runner, gen_cluster):
"-o",
"jsonpath='{.items[0].spec.containers[0].env[0]}'",
)
# Just check if its in the string, no need to parse the json
# Just check if it's in the string, no need to parse the json
assert "SCHEDULER_ENV" in scheduler_env

# Get the first annotation (the only one) of the scheduler
Expand Down
2 changes: 0 additions & 2 deletions doc/source/testing.rst
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
Testing
=======

.. warning:: Tests are not working on aarch64 (Apple M1) at the moment due to an architecture incompatibility between ``pytest-kind-control-plane`` and the docker image built from ``ci/Dockerfile``, similar to `this GitHub issue <https://github.com/kubernetes-sigs/kind/issues/2402>`_.

Running the test suite for ``dask-kubernetes`` doesn't require an existing Kubernetes cluster but does require
`Docker <https://docs.docker.com/get-docker/>`_, `kubectl <https://kubernetes.io/docs/tasks/tools/#kubectl>`_ and `helm <https://helm.sh/docs/intro/install/>`_.

Expand Down

0 comments on commit 363e000

Please sign in to comment.