Skip to content

Commit e9c35b8

Browse files
committed
Mark workflow-boundary span exceptions with escaped=True
Set exception.escaped=True on SERVER (5xx response), PRODUCER (failed enqueue) and CONSUMER (failed job) span exception events so downstream tooling can distinguish workflow-level failures from internal child-span exceptions. The OTel SDK's auto-record path uses escaped=False, which made the attribute useless across all 17,692 historical events. For the PRODUCER span, pass record_exception=False to suppress the SDK auto-record and call record_span_error inside the except so the failed send carries a single, correctly-marked event (also stamping error.type on the span, matching SERVER/CONSUMER).
1 parent 93fc384 commit e9c35b8

5 files changed

Lines changed: 106 additions & 34 deletions

File tree

plain-jobs/plain/jobs/jobs.py

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,14 @@
2727
from plain import postgres
2828
from plain.postgres import transaction
2929
from plain.utils import timezone
30-
from plain.utils.otel import format_exception_type
3130

3231
from .locks import postgres_advisory_lock
33-
from .otel import operation_duration_histogram, sent_messages_counter, tracer
32+
from .otel import (
33+
operation_duration_histogram,
34+
record_span_error,
35+
sent_messages_counter,
36+
tracer,
37+
)
3438
from .registry import JobParameters, jobs_registry
3539

3640
if TYPE_CHECKING:
@@ -89,12 +93,16 @@ def run_in_worker(
8993
}
9094
start_time = time.perf_counter()
9195
skipped = False
92-
try:
93-
with tracer.start_as_current_span(
94-
f"send {queue}",
95-
kind=SpanKind.PRODUCER,
96-
attributes={**metric_attributes, MESSAGING_OPERATION_NAME: "send"},
97-
) as span:
96+
with tracer.start_as_current_span(
97+
f"send {queue}",
98+
kind=SpanKind.PRODUCER,
99+
attributes={**metric_attributes, MESSAGING_OPERATION_NAME: "send"},
100+
# We record manually via record_span_error (escaped=True) at the
101+
# workflow boundary; suppress the SDK's escaped=False auto-record
102+
# so failed sends carry a single, correctly-marked event.
103+
record_exception=False,
104+
) as span:
105+
try:
98106
try:
99107
frame = sys._getframe(1)
100108
filename = frame.f_code.co_filename
@@ -176,30 +184,33 @@ def run_in_worker(
176184
)
177185

178186
return job_request
179-
except Exception as e:
180-
metric_attributes[ERROR_TYPE] = format_exception_type(e)
181-
raise
182-
finally:
183-
# Skipped enqueues are visible on the span (`job.enqueue.skipped`)
184-
# but do not fire the messaging counter — no message was sent, so
185-
# there's nothing for `messaging.client.sent.messages` to count.
186-
if not skipped:
187-
duration = time.perf_counter() - start_time
188-
if ERROR_TYPE in metric_attributes:
189-
# No commit is coming — record now so failed sends are visible.
190-
sent_messages_counter.add(1, metric_attributes)
191-
operation_duration_histogram.record(duration, metric_attributes)
192-
else:
193-
# Defer to the outer commit so a caller-level rollback
194-
# doesn't leave a phantom send. Runs immediately if not
195-
# inside a transaction.
196-
attrs = metric_attributes
197-
198-
def _emit() -> None:
199-
sent_messages_counter.add(1, attrs)
200-
operation_duration_histogram.record(duration, attrs)
201-
202-
transaction.on_commit(_emit)
187+
except Exception as e:
188+
# Stamps escaped=True on the span event, ERROR_TYPE on both
189+
# the span and `metric_attributes` (so the finally below
190+
# picks up the failed-send branch).
191+
record_span_error(span, e, metric_attributes)
192+
raise
193+
finally:
194+
# Skipped enqueues are visible on the span (`job.enqueue.skipped`)
195+
# but do not fire the messaging counter — no message was sent, so
196+
# there's nothing for `messaging.client.sent.messages` to count.
197+
if not skipped:
198+
duration = time.perf_counter() - start_time
199+
if ERROR_TYPE in metric_attributes:
200+
# No commit is coming — record now so failed sends are visible.
201+
sent_messages_counter.add(1, metric_attributes)
202+
operation_duration_histogram.record(duration, metric_attributes)
203+
else:
204+
# Defer to the outer commit so a caller-level rollback
205+
# doesn't leave a phantom send. Runs immediately if not
206+
# inside a transaction.
207+
attrs = metric_attributes
208+
209+
def _emit() -> None:
210+
sent_messages_counter.add(1, attrs)
211+
operation_duration_histogram.record(duration, attrs)
212+
213+
transaction.on_commit(_emit)
203214

204215
def get_requested_jobs(
205216
self, *, concurrency_key: str | None = None, include_retries: bool = False

plain-jobs/plain/jobs/otel.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def record_span_error(
6565
metric attribute dict, and return the error.type string so the caller
6666
can forward it to other instruments."""
6767
error_type = format_exception_type(exc)
68-
span.record_exception(exc)
68+
span.record_exception(exc, escaped=True)
6969
span.set_status(trace.StatusCode.ERROR)
7070
span.set_attribute(ERROR_TYPE, error_type)
7171
metric_attributes[ERROR_TYPE] = error_type

plain-jobs/tests/internal/test_otel.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,64 @@ def test_enqueue_skipped_marks_span(otel_spans: InMemorySpanExporter) -> None:
7676
assert span.attributes["job.enqueue.skipped"] is True
7777

7878

79+
@pytest.mark.usefixtures("db")
80+
def test_failed_enqueue_marks_producer_span_exception_as_escaped(
81+
monkeypatch: pytest.MonkeyPatch,
82+
otel_spans: InMemorySpanExporter,
83+
) -> None:
84+
"""A failing enqueue's PRODUCER span must record the exception event with
85+
`exception.escaped=True` — caller-side workflow boundary, same signal as
86+
SERVER/CONSUMER."""
87+
88+
def _boom(*args, **kwargs):
89+
raise RuntimeError("save failed")
90+
91+
from plain.jobs.models import JobRequest
92+
93+
monkeypatch.setattr(JobRequest, "save", _boom)
94+
95+
with pytest.raises(RuntimeError):
96+
_NoopJob().run_in_worker()
97+
98+
producer_spans = [
99+
s for s in otel_spans.get_finished_spans() if s.kind == SpanKind.PRODUCER
100+
]
101+
assert producer_spans, "expected PRODUCER span from run_in_worker()"
102+
span = producer_spans[-1]
103+
exception_events = [e for e in span.events if e.name == "exception"]
104+
# Exactly one event — `record_exception=False` on start_as_current_span
105+
# suppresses the SDK's escaped=False auto-record so the manual call is
106+
# the sole event.
107+
assert len(exception_events) == 1
108+
attrs = exception_events[0].attributes
109+
assert attrs is not None
110+
assert attrs["exception.escaped"] == "True"
111+
112+
113+
@pytest.mark.usefixtures("db")
114+
def test_failing_job_marks_consumer_span_exception_as_escaped(
115+
otel_spans: InMemorySpanExporter,
116+
) -> None:
117+
"""A failing job's CONSUMER span must record the exception event with
118+
`exception.escaped=True` — the workflow-level failure signal that
119+
downstream tools filter on."""
120+
request = _BoomJob().run_in_worker()
121+
assert request is not None
122+
process = request.convert_to_job_process(worker_id=uuid.uuid4())
123+
process.run()
124+
125+
consumer_spans = [
126+
s for s in otel_spans.get_finished_spans() if s.kind == SpanKind.CONSUMER
127+
]
128+
assert consumer_spans, "expected CONSUMER span from JobProcess.run()"
129+
span = consumer_spans[-1]
130+
exception_events = [e for e in span.events if e.name == "exception"]
131+
assert exception_events
132+
attrs = exception_events[0].attributes
133+
assert attrs is not None
134+
assert attrs["exception.escaped"] == "True"
135+
136+
79137
@pytest.mark.usefixtures("db")
80138
def test_enqueue_failure_records_error_type_on_metric(
81139
monkeypatch: pytest.MonkeyPatch,

plain/plain/internal/handlers/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def _finalize_span(self, span: trace.Span, response: Response) -> None:
178178
if response.status_code >= 500:
179179
span.set_status(trace.StatusCode.ERROR)
180180
if response.exception:
181-
span.record_exception(response.exception)
181+
span.record_exception(response.exception, escaped=True)
182182
span.set_attribute(
183183
error_attributes.ERROR_TYPE,
184184
format_exception_type(response.exception),

plain/tests/internal/test_otel_spans.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ def test_500_records_exception_and_error_status(error_client) -> None:
6666
assert span.attributes["error.type"] == "RuntimeError"
6767
exception_events = [e for e in span.events if e.name == "exception"]
6868
assert exception_events
69+
attrs = exception_events[0].attributes
70+
assert attrs is not None
71+
assert attrs["exception.escaped"] == "True"
6972

7073

7174
def _invoke_handler(router_path: str) -> None:

0 commit comments

Comments
 (0)