Split plain-jobs consumed counter by terminal outcome and add workers gauge

davegaeddert · davegaeddert · commit 49fa1952eaa3 · 2026-05-06T16:39:34.000-05:00
Move messaging.client.consumed.messages from JobProcess.run()'s finally
block to record_consumed(), called from convert_to_result() and defer().
Adds a plain.jobs.outcome attribute (successful/errored/lost/cancelled/
deferred) so the rescue and cancellation paths show up in throughput
dashboards, and forwards error.type from the live exception path.

New plain.jobs.workers observable gauge splits WorkerHeartbeat rows by
plain.jobs.worker.state (active/stale) using a single cutoff snapshot.
heartbeat_cutoff() is extracted to models.py so rescue, admin, and OTel
agree on which workers are alive.
diff --git a/plain-jobs/plain/jobs/README.md b/plain-jobs/plain/jobs/README.md
@@ -228,15 +228,30 @@ The worker integrates with OpenTelemetry for distributed tracing. Spans are crea
 
 Jobs are linked to the originating trace context, allowing you to follow jobs initiated from web requests.
 
-Two messaging metrics are recorded:
+Messaging metrics:
 
 - `messaging.client.sent.messages` — counter incremented for each enqueue
+- `messaging.client.consumed.messages` — counter incremented for every terminal `JobResult`. Carries a `plain.jobs.outcome` attribute (`successful`, `errored`, `lost`, `cancelled`, `deferred`) so dashboards can split throughput by outcome.
 - `messaging.client.operation.duration` — histogram of enqueue/process durations
+- `plain.jobs.queue.wait.duration` — histogram of how long a job waited in queue before a worker picked it up
+
+Per-worker observable gauges (queryable per `messaging.destination.name` where applicable):
+
+- `plain.jobs.worker.processes` — OS processes spawned by this worker
+- `plain.jobs.queue.depth` — pending `JobRequest`s ready to run
+- `plain.jobs.queue.scheduled` — `JobRequest`s with `start_at` in the future
+- `plain.jobs.queue.oldest.age` — age in seconds of the oldest ready-to-run `JobRequest`
+- `plain.jobs.running` — `JobProcess` rows currently running
+
+Worker-liveness gauge (global, no per-queue dimension):
+
+- `plain.jobs.workers` — `WorkerHeartbeat` row count, split by a `plain.jobs.worker.state` attribute taking `active` (within `JOBS_HEARTBEAT_TIMEOUT`) or `stale` (past it, eligible for rescue on the next tick)
 
-Two contract details to be aware of:
+Contract details to be aware of:
 
 - **Successful enqueues record metrics on transaction commit.** If you call `run_in_worker` inside a transaction that later rolls back, the message was never actually persisted — so the counter and histogram do not fire. This matches the OTel semconv: "MUST NOT count messages that were created but haven't yet been sent." Failed enqueues record immediately so transient errors are still visible.
 - **Skipped enqueues are visible in spans, not in metrics.** When `should_enqueue` returns `False` (e.g., a concurrency-key collision), the span gets `job.enqueue.skipped=True` but no metric is recorded — there was no send to count.
+- **Observable gauges emit once per worker process.** When two workers cover the same queue, each worker emits identical values for that queue's per-queue gauges. `plain.jobs.workers` is global (no per-queue dimension) and likewise emits the full table count from every worker. Aggregate the per-queue gauges and `plain.jobs.workers` with `last_value`/`max`, never `sum`.
 
 ## Settings
 
diff --git a/plain-jobs/plain/jobs/admin.py b/plain-jobs/plain/jobs/admin.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import datetime
 from datetime import timedelta
 
 from plain import postgres
@@ -14,14 +13,14 @@
 from plain.http import RedirectResponse
 from plain.postgres.expressions import Case, When
 from plain.runtime import settings
-from plain.utils import timezone
 
 from .models import (
     JobProcess,
     JobRequest,
     JobResult,
     JobResultQuerySet,
     WorkerHeartbeat,
+    heartbeat_cutoff,
 )
 
 
@@ -133,10 +132,6 @@ def get_metric(self) -> int:
         return JobProcess.query.running().count()
 
 
-def _heartbeat_cutoff() -> datetime.datetime:
-    return timezone.now() - timedelta(seconds=settings.JOBS_HEARTBEAT_TIMEOUT)
-
-
 class ActiveWorkersCard(Card):
     title = "Active workers"
     text = "View"
@@ -147,7 +142,7 @@ def get_description(self) -> str:
 
     def get_metric(self) -> int:
         return WorkerHeartbeat.query.filter(
-            last_heartbeat_at__gte=_heartbeat_cutoff()
+            last_heartbeat_at__gte=heartbeat_cutoff()
         ).count()
 
     def get_link(self) -> str:
@@ -167,7 +162,7 @@ def get_description(self) -> str:
 
     def get_metric(self) -> int:
         return WorkerHeartbeat.query.filter(
-            last_heartbeat_at__lt=_heartbeat_cutoff()
+            last_heartbeat_at__lt=heartbeat_cutoff()
         ).count()
 
     def get_link(self) -> str:
@@ -364,7 +359,7 @@ def get_initial_queryset(self) -> postgres.QuerySet[WorkerHeartbeat]:
             queryset = super().get_initial_queryset()
             return queryset.annotate(
                 stale=Case(
-                    When(last_heartbeat_at__lt=_heartbeat_cutoff(), then=True),
+                    When(last_heartbeat_at__lt=heartbeat_cutoff(), then=True),
                     default=False,
                     output_field=postgres.BooleanField(),
                 ),
@@ -373,7 +368,7 @@ def get_initial_queryset(self) -> postgres.QuerySet[WorkerHeartbeat]:
         def filter_queryset(
             self, queryset: postgres.QuerySet[WorkerHeartbeat]
         ) -> postgres.QuerySet[WorkerHeartbeat]:
-            cutoff = _heartbeat_cutoff()
+            cutoff = heartbeat_cutoff()
             if self.filter == "Active":
                 return queryset.filter(last_heartbeat_at__gte=cutoff)
             if self.filter == "Stale":
diff --git a/plain-jobs/plain/jobs/models.py b/plain-jobs/plain/jobs/models.py
@@ -8,15 +8,8 @@
 
 from opentelemetry.semconv._incubating.attributes.messaging_attributes import (
     MESSAGING_CONSUMER_GROUP_NAME,
-    MESSAGING_DESTINATION_NAME,
     MESSAGING_MESSAGE_ID,
     MESSAGING_OPERATION_NAME,
-    MESSAGING_OPERATION_TYPE,
-    MESSAGING_SYSTEM,
-    MessagingOperationTypeValues,
-)
-from opentelemetry.semconv.attributes.code_attributes import (
-    CODE_FUNCTION_NAME,
 )
 from opentelemetry.trace import Link, SpanContext, SpanKind, TraceFlags
 
@@ -29,9 +22,10 @@
 
 from .exceptions import DeferError, DeferJob
 from .otel import (
-    consumed_messages_counter,
     operation_duration_histogram,
+    process_metric_attributes,
     queue_wait_duration_histogram,
+    record_consumed,
     record_span_error,
     tracer,
 )
@@ -293,12 +287,9 @@ def run(self) -> JobResult:
                     extra={"job_uuid": self.uuid},
                 )
 
-        metric_attributes: dict[str, Any] = {
-            MESSAGING_SYSTEM: "plain.jobs",
-            MESSAGING_OPERATION_TYPE: MessagingOperationTypeValues.PROCESS.value,
-            MESSAGING_DESTINATION_NAME: self.queue,
-            CODE_FUNCTION_NAME: f"{self.job_class}.run",
-        }
+        metric_attributes: dict[str, Any] = process_metric_attributes(
+            self.queue, self.job_class
+        )
         start_time = time.perf_counter()
         try:
             with tracer.start_as_current_span(
@@ -349,10 +340,11 @@ def run(self) -> JobResult:
                         "Defer failed",
                         extra={"job_class": self.job_class, "error": str(e)},
                     )
-                    record_span_error(span, e, metric_attributes)
+                    error_type = record_span_error(span, e, metric_attributes)
                     return self.convert_to_result(
                         status=JobResultStatuses.ERRORED,
                         error=str(e),
+                        error_type=error_type,
                     )
 
                 except Exception as e:
@@ -363,14 +355,14 @@ def run(self) -> JobResult:
                     # second log line. Rare; correct outcome; not worth
                     # pre-checking on every successful job.
                     logger.exception(e)
-                    record_span_error(span, e, metric_attributes)
+                    error_type = record_span_error(span, e, metric_attributes)
                     return self.convert_to_result(
                         status=JobResultStatuses.ERRORED,
                         error="".join(traceback.format_tb(e.__traceback__)),
+                        error_type=error_type,
                     )
         finally:
             duration = time.perf_counter() - start_time
-            consumed_messages_counter.add(1, metric_attributes)
             operation_duration_histogram.record(duration, metric_attributes)
 
     def defer(self, *, job: Job, defer_exception: DeferJob) -> JobResult:
@@ -443,14 +435,29 @@ def defer(self, *, job: Job, defer_exception: DeferJob) -> JobResult:
                 span_id=self.span_id,
             )
 
-            return result
+        # Counter ticks for the DEFERRED outcome too — defer() bypasses
+        # convert_to_result, so without this the deferred path would not
+        # show up in the consumed counter.
+        record_consumed(result)
+        return result
 
     def convert_to_result(
-        self, *, status: str, error: str = "", fire_hook: bool = True
+        self,
+        *,
+        status: str,
+        error: str = "",
+        error_type: str | None = None,
+        fire_hook: bool = True,
     ) -> JobResult:
         """
         Convert this JobProcess to a JobResult.
 
+        error_type, when supplied, is the OTel-style exception name (matching
+        the spec's `error.type` attribute). It rides along to the consumed
+        counter so dashboards can group ERRORED jobs by exception class. Only
+        the live exception-driven paths supply it — rescue (LOST) and direct
+        cancellations have no exception object to derive it from.
+
         fire_hook controls whether on_aborted dispatches synchronously. The
         rescue path passes fire_hook=False so it can dispatch hooks AFTER its
         outer transaction commits — otherwise a hook DB error would mark the
@@ -483,6 +490,13 @@ def convert_to_result(
             # Delete the JobProcess now
             self.delete()
 
+        # Counter ticks for every terminal status — the live SUCCESSFUL/ERRORED
+        # paths plus the LOST/CANCELLED paths that don't flow through
+        # JobProcess.run()'s finally. The outcome attribute lets dashboards
+        # split throughput by final status; error_type is forwarded for ERRORED
+        # jobs caught by the live path.
+        record_consumed(result, error_type=error_type)
+
         # Fire Job.on_aborted outside the atomic block so a raise in user code
         # can't roll back the framework's bookkeeping. Only for terminal
         # statuses run() couldn't observe.
@@ -768,6 +782,15 @@ def __str__(self) -> str:
         return f"WorkerHeartbeat({self.worker_id} on {self.hostname}:{self.pid})"
 
 
+def heartbeat_cutoff() -> datetime.datetime:
+    """The timestamp before which a WorkerHeartbeat is considered stale.
+
+    Single source of truth — rescue, admin display, and OTel gauges all
+    consult this so they agree on which workers are alive.
+    """
+    return timezone.now() - datetime.timedelta(seconds=settings.JOBS_HEARTBEAT_TIMEOUT)
+
+
 def rescue_stale_workers() -> list[JobResult]:
     """
     Convert in-flight JobProcess rows from dead workers to JobResult(LOST).
@@ -786,9 +809,7 @@ def rescue_stale_workers() -> list[JobResult]:
     inherently global: filtering would let one rescuer claim a dead heartbeat
     without converting all of that worker's jobs, stranding the rest forever.
     """
-    cutoff = timezone.now() - datetime.timedelta(
-        seconds=settings.JOBS_HEARTBEAT_TIMEOUT
-    )
+    cutoff = heartbeat_cutoff()
     dead_workers = WorkerHeartbeat.query.filter(last_heartbeat_at__lt=cutoff)
 
     pending_hooks: list[JobResult] = []
diff --git a/plain-jobs/plain/jobs/otel.py b/plain-jobs/plain/jobs/otel.py
@@ -6,8 +6,14 @@
 
 from opentelemetry import metrics, trace
 from opentelemetry.metrics import CallbackOptions, Observation
+from opentelemetry.semconv._incubating.attributes.code_attributes import (
+    CODE_FUNCTION_NAME,
+)
 from opentelemetry.semconv._incubating.attributes.messaging_attributes import (
     MESSAGING_DESTINATION_NAME,
+    MESSAGING_OPERATION_TYPE,
+    MESSAGING_SYSTEM,
+    MessagingOperationTypeValues,
 )
 from opentelemetry.semconv._incubating.metrics.messaging_metrics import (
     create_messaging_client_consumed_messages,
@@ -16,13 +22,21 @@
 )
 from opentelemetry.semconv.attributes.error_attributes import ERROR_TYPE
 
+from plain.postgres import Q
 from plain.postgres.aggregates import Count, Min
 from plain.utils import timezone
 from plain.utils.otel import format_exception_type
 
 if TYPE_CHECKING:
+    from .models import JobResult
     from .workers import Worker
 
+# Attribute key for the terminal-status dimension on the consumed counter.
+PLAIN_JOBS_OUTCOME = "plain.jobs.outcome"
+
+# Attribute key for the worker-liveness dimension on plain.jobs.workers.
+PLAIN_JOBS_WORKER_STATE = "plain.jobs.worker.state"
+
 try:
     _package_version = importlib.metadata.version("plain.jobs")
 except importlib.metadata.PackageNotFoundError:
@@ -46,12 +60,46 @@ def record_span_error(
     span: trace.Span,
     exc: BaseException,
     metric_attributes: dict[str, Any],
-) -> None:
+) -> str:
+    """Mark the span as failed, stamp error.type on it and on the per-call
+    metric attribute dict, and return the error.type string so the caller
+    can forward it to other instruments."""
     error_type = format_exception_type(exc)
     span.record_exception(exc)
     span.set_status(trace.StatusCode.ERROR)
     span.set_attribute(ERROR_TYPE, error_type)
     metric_attributes[ERROR_TYPE] = error_type
+    return error_type
+
+
+def process_metric_attributes(queue: str, job_class: str) -> dict[str, Any]:
+    """Base attribute dict for messaging.client.* process-side metrics.
+
+    Shared by JobProcess.run() (which adds error.type for failed jobs) and
+    record_consumed (which adds the outcome dimension). One builder so
+    keys/values stay in lockstep across the two call sites.
+    """
+    return {
+        MESSAGING_SYSTEM: "plain.jobs",
+        MESSAGING_OPERATION_TYPE: MessagingOperationTypeValues.PROCESS.value,
+        MESSAGING_DESTINATION_NAME: queue,
+        CODE_FUNCTION_NAME: f"{job_class}.run",
+    }
+
+
+def record_consumed(result: JobResult, *, error_type: str | None = None) -> None:
+    """Record one consumed-message metric point per terminal JobResult.
+
+    `plain.jobs.outcome` carries the terminal status (successful/errored/
+    lost/cancelled/deferred). `error.type` is included when known — i.e.,
+    when the live path caught an exception and forwarded it through. The
+    rescue path (LOST) and direct cancellations don't carry an error type
+    because there is no exception object to derive it from."""
+    attrs = process_metric_attributes(result.queue, result.job_class)
+    attrs[PLAIN_JOBS_OUTCOME] = result.status.lower()
+    if error_type is not None:
+        attrs[ERROR_TYPE] = error_type
+    consumed_messages_counter.add(1, attrs)
 
 
 class WorkerMetrics:
@@ -117,6 +165,15 @@ def _register_instruments(cls) -> None:
             unit="{job}",
             description="JobProcess rows currently running, per queue.",
         )
+        meter.create_observable_gauge(
+            name="plain.jobs.workers",
+            callbacks=[cls._gauge_workers],
+            unit="{worker}",
+            description=(
+                "WorkerHeartbeat row count, split by liveness state "
+                "(active=within JOBS_HEARTBEAT_TIMEOUT, stale=past it)."
+            ),
+        )
 
     # --- Callbacks ----------------------------------------------------------
 
@@ -191,6 +248,25 @@ def _gauge_running(cls, options: CallbackOptions) -> Iterable[Observation]:
 
         return _count_per_queue(JobProcess.query.running(), active.worker.queues)
 
+    # The worker-liveness gauge observes the global WorkerHeartbeat table and
+    # doesn't need a calling Worker — emit unconditionally so dashboards keep
+    # reporting even during a full worker drain. One snapshot of the cutoff
+    # is shared across both observations so a row landing exactly at the
+    # boundary can't be counted in both states (or neither).
+    @classmethod
+    def _gauge_workers(cls, options: CallbackOptions) -> Iterable[Observation]:
+        from .models import WorkerHeartbeat, heartbeat_cutoff
+
+        cutoff = heartbeat_cutoff()
+        counts = WorkerHeartbeat.query.aggregate(
+            active=Count("id", filter=Q(last_heartbeat_at__gte=cutoff)),
+            stale=Count("id", filter=Q(last_heartbeat_at__lt=cutoff)),
+        )
+        return [
+            Observation(counts["active"], {PLAIN_JOBS_WORKER_STATE: "active"}),
+            Observation(counts["stale"], {PLAIN_JOBS_WORKER_STATE: "stale"}),
+        ]
+
 
 def _count_per_queue(queryset: Any, queues: list[str]) -> list[Observation]:
     rows = queryset.filter(queue__in=queues).values("queue").annotate(c=Count("*"))
diff --git a/plain-jobs/tests/test_otel.py b/plain-jobs/tests/test_otel.py