From 61e10ec46e690c35691eb74b88bc15ce4e79cfb6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 23 Aug 2022 15:49:16 +0100 Subject: [PATCH] Optimise scheduler.get_comm_cost set difference (#6931) --- distributed/scheduler.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/distributed/scheduler.py b/distributed/scheduler.py index a8b5f5b3f76..3367f2fb8ad 100644 --- a/distributed/scheduler.py +++ b/distributed/scheduler.py @@ -2620,7 +2620,17 @@ def get_comm_cost(self, ts: TaskState, ws: WorkerState) -> float: on the given worker. """ dts: TaskState - deps: set = ts.dependencies.difference(ws.has_what) + deps: set + if 10 * len(ts.dependencies) < len(ws.has_what): + # In the common case where the number of dependencies is + # much less than the number of tasks that we have, + # construct the set of deps that require communication in + # O(len(dependencies)) rather than O(len(has_what)) time. + # Factor of 10 is a guess at the overhead of explicit + # iteration as opposed to just calling set.difference + deps = {dep for dep in ts.dependencies if dep not in ws.has_what} + else: + deps = ts.dependencies.difference(ws.has_what) nbytes: int = 0 for dts in deps: nbytes += dts.nbytes