|
8 | 8 |
|
9 | 9 | from __future__ import annotations |
10 | 10 |
|
| 11 | +import threading |
11 | 12 | import uuid |
12 | 13 |
|
13 | 14 | import pytest |
@@ -164,6 +165,81 @@ def _boom(*args, **kwargs): |
164 | 165 | ) |
165 | 166 |
|
166 | 167 |
|
| 168 | +# --- Worker run-loop span ----------------------------------------------- |
| 169 | + |
| 170 | + |
def _build_worker_for_loop_test() -> Worker:
    """Return a bare ``Worker`` on which ``_run_loop`` can be driven directly.

    The instance is created with ``Worker.__new__`` so ``__init__`` never runs
    (and no real ProcessPoolExecutor is needed). The attributes below are set
    by hand instead, and the four ``maybe_*`` maintenance hooks are replaced
    with no-ops; individual tests override a hook to decide when the loop
    exits.
    """
    stub = Worker.__new__(Worker)

    # Hand-populated instance state, assigned in a fixed order.
    state = {
        "queues": ["default"],
        "_is_shutting_down": False,
        "_heartbeat_registered": True,
        "_inflight_futures": {},
        "_inflight_lock": threading.Lock(),
        "max_processes": 1,
        "max_pending_per_process": 1,
    }
    for attr_name, attr_value in state.items():
        setattr(stub, attr_name, attr_value)

    # Each maintenance hook gets its own do-nothing callable.
    maintenance_hooks = (
        "maybe_heartbeat",
        "maybe_log_stats",
        "maybe_check_job_results",
        "maybe_schedule_jobs",
    )
    for hook_name in maintenance_hooks:
        setattr(stub, hook_name, lambda: None)

    return stub
| 188 | + |
| 189 | + |
@pytest.mark.usefixtures("db")
def test_worker_loop_emits_internal_span_per_iteration(
    otel_spans: InMemorySpanExporter,
) -> None:
    """One pass through the worker loop produces one `worker loop` span.

    The heartbeat hook flips the shutdown flag, after which exactly one
    finished span named `worker loop` is expected. That span must be of kind
    INTERNAL and, because nothing failed, must have an UNSET status.
    """
    from opentelemetry.trace import StatusCode

    worker = _build_worker_for_loop_test()

    def request_shutdown() -> None:
        # Flag shutdown from inside the first maintenance call.
        worker._is_shutting_down = True

    worker.maybe_heartbeat = request_shutdown  # ty: ignore[invalid-assignment]
    worker._run_loop()

    finished = otel_spans.get_finished_spans()
    matching = [candidate for candidate in finished if candidate.name == "worker loop"]
    assert len(matching) == 1
    (loop_span,) = matching
    assert loop_span.kind == SpanKind.INTERNAL
    assert loop_span.status.status_code == StatusCode.UNSET
| 211 | + |
| 212 | + |
@pytest.mark.usefixtures("db")
def test_worker_loop_records_error_when_maintenance_fails(
    otel_spans: InMemorySpanExporter,
) -> None:
    """A failing maintenance hook is recorded on the span, not propagated.

    The heartbeat hook flips the shutdown flag and then raises
    ``RuntimeError``. ``_run_loop`` is expected to return normally, and the
    single `worker loop` span must carry the canonical failure signal:
    status ERROR, ``error.type == "RuntimeError"``, and at least one
    `exception` event. This is the path that previously swallowed DB
    transients such as the production `psycopg.OperationalError` seen
    escaping `rescue_job_results`.
    """
    from opentelemetry.trace import StatusCode

    worker = _build_worker_for_loop_test()

    def fail_after_requesting_shutdown() -> None:
        # Set the flag first so it is already in place when the error fires.
        worker._is_shutting_down = True
        raise RuntimeError("db transient")

    worker.maybe_heartbeat = fail_after_requesting_shutdown  # ty: ignore[invalid-assignment]
    # Must NOT raise — the loop catches and continues, just like in production.
    worker._run_loop()

    finished = otel_spans.get_finished_spans()
    matching = [candidate for candidate in finished if candidate.name == "worker loop"]
    assert len(matching) == 1
    (loop_span,) = matching
    assert loop_span.status.status_code == StatusCode.ERROR
    assert loop_span.attributes is not None
    assert loop_span.attributes["error.type"] == "RuntimeError"
    assert any(event.name == "exception" for event in loop_span.events)
| 241 | + |
| 242 | + |
167 | 243 | # --- Worker-state observable gauges ------------------------------------- |
168 | 244 | # |
169 | 245 | # Each Worker owns a WorkerMetrics; instantiating one swaps it in as the |
|
0 commit comments