From de166f68488457f867736788691ba1543a691c86 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Fri, 8 Mar 2024 22:32:30 +0000 Subject: [PATCH] Weigh gilknocker Prometheus metric by duration (#8558) --- distributed/dashboard/components/scheduler.py | 7 ++++--- distributed/http/scheduler/prometheus/core.py | 3 ++- .../http/scheduler/tests/test_scheduler_http.py | 2 +- distributed/http/tests/test_core.py | 4 ++-- distributed/http/worker/prometheus/core.py | 3 ++- .../http/worker/tests/test_worker_http.py | 2 +- distributed/system_monitor.py | 13 ++++++------- docs/source/prometheus.rst | 16 ++++++++++------ 8 files changed, 28 insertions(+), 22 deletions(-) diff --git a/distributed/dashboard/components/scheduler.py b/distributed/dashboard/components/scheduler.py index b941216823..3d4e9b6fc5 100644 --- a/distributed/dashboard/components/scheduler.py +++ b/distributed/dashboard/components/scheduler.py @@ -3911,7 +3911,6 @@ def __init__(self, scheduler, **kwargs): @log_errors def update(self): s = self.scheduler - monitor_gil = s.monitor.monitor_gil_contention self.data["values"] = [ s._tick_interval_observed, @@ -3919,11 +3918,13 @@ def update(self): sum(w.metrics["event_loop_interval"] for w in s.workers.values()) / (len(s.workers) or 1), self.gil_contention_workers, - ][:: 1 if monitor_gil else 2] + ][:: 1 if s.monitor.monitor_gil_contention else 2] # Format event loop as time and GIL (if configured) as % self.data["text"] = [ - f"{x * 100:.1f}%" if i % 2 and monitor_gil else format_time(x) + f"{x * 100:.1f}%" + if i % 2 and s.monitor.monitor_gil_contention + else format_time(x) for i, x in enumerate(self.data["values"]) ] update(self.source, self.data) diff --git a/distributed/http/scheduler/prometheus/core.py b/distributed/http/scheduler/prometheus/core.py index adea703f50..b3ac06b6b2 100644 --- a/distributed/http/scheduler/prometheus/core.py +++ b/distributed/http/scheduler/prometheus/core.py @@ -56,7 +56,8 @@ def collect(self) -> Iterator[GaugeMetricFamily | CounterMetricFamily]: yield CounterMetricFamily( self.build_name("gil_contention"), "GIL contention metric", - value=self.server.monitor._cumulative_gil_contention, + value=self.server.monitor.cumulative_gil_contention, + unit="seconds", ) yield CounterMetricFamily( diff --git a/distributed/http/scheduler/tests/test_scheduler_http.py b/distributed/http/scheduler/tests/test_scheduler_http.py index d51ddf33ea..25ede716fb 100644 --- a/distributed/http/scheduler/tests/test_scheduler_http.py +++ b/distributed/http/scheduler/tests/test_scheduler_http.py @@ -132,7 +132,7 @@ async def test_prometheus(c, s, a, b): except ImportError: pass # pragma: nocover else: - expected_metrics.add("dask_scheduler_gil_contention") + expected_metrics.add("dask_scheduler_gil_contention_seconds") assert set(active_metrics.keys()) == expected_metrics assert active_metrics["dask_scheduler_clients"].samples[0].value == 1.0 diff --git a/distributed/http/tests/test_core.py b/distributed/http/tests/test_core.py index 7f43d58115..3bccaf11ce 100644 --- a/distributed/http/tests/test_core.py +++ b/distributed/http/tests/test_core.py @@ -73,8 +73,8 @@ async def test_prometheus_api_doc(c, s, a, _): gil_metrics = set() # Already in worker_metrics except ImportError: gil_metrics = { - "dask_scheduler_gil_contention_total", - "dask_worker_gil_contention_total", + "dask_scheduler_gil_contention_seconds_total", + "dask_worker_gil_contention_seconds_total", } implemented = scheduler_metrics | worker_metrics | crick_metrics | gil_metrics diff --git a/distributed/http/worker/prometheus/core.py b/distributed/http/worker/prometheus/core.py index f62027cf47..5354ee521c 100644 --- a/distributed/http/worker/prometheus/core.py +++ b/distributed/http/worker/prometheus/core.py @@ -63,7 +63,8 @@ def collect(self) -> Iterator[Metric]: yield CounterMetricFamily( self.build_name("gil_contention"), "GIL contention metric", - value=self.server.monitor._cumulative_gil_contention, + value=self.server.monitor.cumulative_gil_contention, + unit="seconds", ) yield GaugeMetricFamily( diff --git a/distributed/http/worker/tests/test_worker_http.py b/distributed/http/worker/tests/test_worker_http.py index 36213589dd..8fc56662bf 100644 --- a/distributed/http/worker/tests/test_worker_http.py +++ b/distributed/http/worker/tests/test_worker_http.py @@ -59,7 +59,7 @@ async def test_prometheus(c, s, a): except ImportError: pass # pragma: nocover else: - expected_metrics.add("dask_worker_gil_contention_total") + expected_metrics.add("dask_worker_gil_contention_seconds_total") try: import crick # noqa: F401 diff --git a/distributed/system_monitor.py b/distributed/system_monitor.py index bccce514ca..c3f3cec1d1 100644 --- a/distributed/system_monitor.py +++ b/distributed/system_monitor.py @@ -31,7 +31,7 @@ class SystemMonitor: _last_host_cpu_counters: Any # dynamically-defined psutil namedtuple _last_gil_contention: float # 0-1 value - _cumulative_gil_contention: float + cumulative_gil_contention: float gpu_name: str | None gpu_memory_total: int @@ -114,12 +114,11 @@ def __init__( self.monitor_gil_contention = False else: self.quantities["gil_contention"] = deque(maxlen=maxlen) - self._cumulative_gil_contention = 0.0 + self.cumulative_gil_contention = 0.0 raw_interval = dask.config.get( "distributed.admin.system-monitor.gil.interval", ) - interval = parse_timedelta(raw_interval, default="us") * 1e6 - + interval = parse_timedelta(raw_interval) * 1e6 self._gilknocker = KnockKnock(polling_interval_micros=int(interval)) self._gilknocker.start() @@ -197,10 +196,10 @@ def update(self) -> dict[str, Any]: self._last_host_cpu_counters = host_cpu if self.monitor_gil_contention: - self._last_gil_contention = self._gilknocker.contention_metric - self._cumulative_gil_contention += self._last_gil_contention - result["gil_contention"] = self._last_gil_contention + gil_contention = self._gilknocker.contention_metric self._gilknocker.reset_contention_metric() + result["gil_contention"] = self._last_gil_contention = gil_contention + self.cumulative_gil_contention += duration * gil_contention # Note: WINDOWS constant doesn't work with `mypy --platform win32` if sys.platform != "win32": diff --git a/docs/source/prometheus.rst b/docs/source/prometheus.rst index 8a97993ed7..8ca3165cd4 100644 --- a/docs/source/prometheus.rst +++ b/docs/source/prometheus.rst @@ -26,9 +26,11 @@ dask_scheduler_clients Number of clients connected dask_scheduler_desired_workers Number of workers scheduler needs for task graph -dask_scheduler_gil_contention_total - Value representing cumulative total of GIL contention, - in the form of summed percentages. +dask_scheduler_gil_contention_seconds_total + Value representing cumulative total of *potential* GIL contention, + in the form of cumulative seconds during which any thread held the GIL locked. + Other threads may or may not have been actually trying to acquire the GIL in the + meantime. .. note:: Requires ``gilknocker`` to be installed, and @@ -128,9 +130,11 @@ dask_worker_tasks Number of tasks at worker dask_worker_threads Number of worker threads -dask_worker_gil_contention_total - Value representing cumulative total GIL contention on worker, - in the form of summed percentages. +dask_worker_gil_contention_seconds_total + Value representing cumulative total of *potential* GIL contention, + in the form of cumulative seconds during which any thread held the GIL locked. + Other threads may or may not have been actually trying to acquire the GIL in the + meantime. .. note:: Requires ``gilknocker`` to be installed, and