Add plain-jobs OTel tests and document metric contract

davegaeddert · davegaeddert · commit 2a106fbad26c · 2026-04-27T10:19:16.000-05:00
Tests cover the `Job.run_in_worker()` enqueue path: send-span semconv
attributes, the skipped-enqueue branch, and failure-path metric
recording.

README and a code comment document the two metric contracts that
the tests can't directly assert under the test rollback:

- Successful enqueues defer to `transaction.on_commit`, so a rolled-back
  caller produces no metric — matching OTel's "MUST NOT count messages
  that were created but haven't yet been sent."
- Skipped enqueues are visible on the span (`job.enqueue.skipped`) but
  intentionally do not increment `messaging.client.sent.messages` —
  no message was sent, so there's nothing to count.
diff --git a/plain-jobs/plain/jobs/README.md b/plain-jobs/plain/jobs/README.md
@@ -181,11 +181,21 @@ plain jobs worker --stats-every 60
 
 The worker integrates with OpenTelemetry for distributed tracing. Spans are created for:
 
-- Job scheduling (`run_in_worker`)
-- Job execution
-- Job completion/failure
+- Job scheduling (`run_in_worker`) — emits a `send {queue}` PRODUCER span with the OTel `messaging.*` semconv attributes
+- Job execution — emits a `process {queue}` CONSUMER span linked back to the originating send span
+- Job completion/failure — recorded as the span's status, plus an `error.type` attribute on failure
 
-Jobs can be linked to the originating trace context, allowing you to track jobs initiated from web requests.
+Jobs are linked to the originating trace context, allowing you to follow jobs initiated from web requests.
+
+Two messaging metrics are recorded:
+
+- `messaging.client.sent.messages` — counter incremented for each enqueue that commits or fails (skipped enqueues are not counted)
+- `messaging.client.operation.duration` — histogram of enqueue/process durations
+
+Two contract details to be aware of:
+
+- **Successful enqueues record metrics on transaction commit.** If you call `run_in_worker` inside a transaction that later rolls back, the message was never actually persisted — so the counter and histogram do not fire. This matches the OTel semconv: "MUST NOT count messages that were created but haven't yet been sent." Failed enqueues record immediately so transient errors are still visible.
+- **Skipped enqueues are visible in spans, not in metrics.** When `should_enqueue` returns `False` (e.g., a concurrency-key collision), the span gets `job.enqueue.skipped=True` but no metric is recorded — there was no send to count.
 
 ## Settings
 
diff --git a/plain-jobs/plain/jobs/jobs.py b/plain-jobs/plain/jobs/jobs.py
@@ -180,6 +180,9 @@ def run_in_worker(
             metric_attributes[ERROR_TYPE] = format_exception_type(e)
             raise
         finally:
+            # Skipped enqueues are visible on the span (`job.enqueue.skipped`)
+            # but do not fire the messaging counter — no message was sent, so
+            # there's nothing for `messaging.client.sent.messages` to count.
             if not skipped:
                 duration = time.perf_counter() - start_time
                 if ERROR_TYPE in metric_attributes:
diff --git a/plain-jobs/tests/test_otel.py b/plain-jobs/tests/test_otel.py
@@ -0,0 +1,98 @@
+"""OTel instrumentation tests for the job enqueue path.
+
+The process (consumer) side runs through `JobProcess.convert_to_result()`
+and is exercised by the worker; tests for it would need worker setup and
+are deferred. These tests cover `Job.run_in_worker()`, which is the
+hottest user-facing path.
+"""
+
+from __future__ import annotations
+
+import pytest
+from opentelemetry.sdk.metrics.export import InMemoryMetricReader
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+from opentelemetry.trace import SpanKind
+
+from plain.jobs import Job
+
+
+class _NoopJob(Job):
+    def run(self) -> None:
+        """Do nothing: these tests only exercise the enqueue path."""
+
+
+class _ExclusiveJob(Job):
+    """Job whose `should_enqueue` always returns `False`, so every enqueue
+    takes the skipped branch without any pre-existing rows being needed."""
+
+    def run(self) -> None:
+        """Intentionally empty; only the enqueue decision matters here."""
+
+    def should_enqueue(self, concurrency_key: str) -> bool:
+        return False
+
+
+@pytest.mark.usefixtures("db")
+def test_enqueue_emits_send_span(otel_spans: InMemorySpanExporter) -> None:
+    """An enqueue emits a `send default` PRODUCER span with semconv attributes."""
+    _NoopJob().run_in_worker()
+
+    matches = [s for s in otel_spans.get_finished_spans() if s.name == "send default"]
+    assert matches, "expected a `send default` PRODUCER span"
+    latest = matches[-1]
+    assert latest.kind == SpanKind.PRODUCER
+    attrs = latest.attributes
+    assert attrs is not None
+    for key in ("messaging.operation.type", "messaging.operation.name"):
+        assert attrs[key] == "send"
+    assert attrs["messaging.system"] == "plain.jobs"
+    assert attrs["messaging.destination.name"] == "default"
+    assert {"messaging.message.id", "code.function.name"} <= set(attrs)
+
+
+@pytest.mark.usefixtures("db")
+def test_enqueue_skipped_marks_span(otel_spans: InMemorySpanExporter) -> None:
+    """A skipped enqueue returns None and flags the span `job.enqueue.skipped`."""
+    assert _ExclusiveJob().run_in_worker(concurrency_key="busy") is None
+    spans = [s for s in otel_spans.get_finished_spans() if s.name == "send default"]
+    assert spans, "expected a `send default` span for the skipped enqueue"
+    attrs = spans[-1].attributes  # latest span; exporter may hold earlier sends
+    assert attrs is not None and attrs["job.enqueue.skipped"] is True
+
+
+@pytest.mark.usefixtures("db")
+def test_enqueue_failure_records_error_type_on_metric(
+    monkeypatch: pytest.MonkeyPatch,
+    otel_spans: InMemorySpanExporter,
+    otel_metrics: InMemoryMetricReader,
+) -> None:
+    """A failed enqueue records the sent-messages counter tagged with `error.type`.
+
+    Only the failure path is asserted: successful enqueues defer metric
+    recording to `transaction.on_commit`, which never fires under the test
+    rollback, whereas failures record immediately.
+    """
+    from plain.jobs.models import JobRequest
+
+    def _fail_save(*args, **kwargs):
+        raise RuntimeError("save failed")
+
+    monkeypatch.setattr(JobRequest, "save", _fail_save)
+    with pytest.raises(RuntimeError):
+        _NoopJob().run_in_worker()
+
+    metrics_data = otel_metrics.get_metrics_data()
+    assert metrics_data is not None
+    sent_points = []
+    for resource_metrics in metrics_data.resource_metrics:
+        for scope_metrics in resource_metrics.scope_metrics:
+            for metric in scope_metrics.metrics:
+                if metric.name == "messaging.client.sent.messages":
+                    sent_points.extend(metric.data.data_points)
+
+    assert sent_points, "expected sent_messages counter point on failure"
+    for point in sent_points:
+        assert point.attributes.get("error.type") == "RuntimeError"
+        assert point.attributes.get("messaging.system") == "plain.jobs"