plain-jobs: skip silently when defer re-enqueue is blocked

davegaeddert · davegaeddert · commit 3171bb423815 · 2026-05-20T21:13:21.000-05:00
Previously `JobProcess.defer()` raised `DeferError` when the re-enqueue's
`should_enqueue()` returned False, surfacing as an `ERRORED` JobResult and
a per-job exception on the consumer span. That contradicted the framework
convention everywhere else: `run_in_worker()` and `retry_job()` both return
None silently in the same situation. Honor a False result from
`should_enqueue()` the same way in the defer path — record `DEFERRED` with
`retry_job_request_uuid=NULL` and stamp `plain.jobs.defer.skipped=True` on
the consumer span. Remove the now-unraised `DeferError`, including its
export from `plain.jobs` (`__all__`), so any code that imported it must
drop that import.
diff --git a/plain-jobs/plain/jobs/__init__.py b/plain-jobs/plain/jobs/__init__.py
@@ -2,9 +2,9 @@
 
 __version__ = version("plain.jobs")
 
-from .exceptions import DeferError, DeferJob
+from .exceptions import DeferJob
 from .jobs import Job
 from .middleware import JobMiddleware
 from .registry import register_job
 
-__all__ = ["Job", "DeferJob", "DeferError", "JobMiddleware", "register_job"]
+__all__ = ["Job", "DeferJob", "JobMiddleware", "register_job"]
diff --git a/plain-jobs/plain/jobs/exceptions.py b/plain-jobs/plain/jobs/exceptions.py
@@ -21,14 +21,3 @@ def __init__(self, *, delay: int, increment_retries: bool = False):
         self.delay = delay
         self.increment_retries = increment_retries
         super().__init__(f"Job deferred for {delay} seconds")
-
-
-class DeferError(Exception):
-    """Raised when a deferred job cannot be re-enqueued.
-
-    This typically happens when concurrency limits prevent the job from being
-    re-queued. The transaction will be rolled back and the job will remain
-    in its current state, then be converted to ERRORED status for retry.
-    """
-
-    pass
diff --git a/plain-jobs/plain/jobs/models.py b/plain-jobs/plain/jobs/models.py
@@ -20,7 +20,7 @@
 from plain.runtime import settings
 from plain.utils import timezone
 
-from .exceptions import DeferError, DeferJob
+from .exceptions import DeferJob
 from .otel import (
     operation_duration_histogram,
     process_metric_attributes,
@@ -326,25 +326,19 @@ def run(self) -> JobResult:
                                 "job_process_uuid": self.uuid,
                             },
                         )
-                        return self.defer(job=job, defer_exception=e)
+                        result = self.defer(job=job, defer_exception=e)
+                        if result.retry_job_request_uuid is None:
+                            # Re-enqueue was blocked by should_enqueue() —
+                            # either the default uniqueness rule (a peer
+                            # exists) or a user override (rate limit, custom
+                            # rule). Same treatment as the initial-enqueue
+                            # path's `job.enqueue.skipped`: not an error,
+                            # just visibility on the consumer span.
+                            span.set_attribute("plain.jobs.defer.skipped", True)
+                        return result
 
                     return self.convert_to_result(status=JobResultStatuses.SUCCESSFUL)
 
-                except DeferError as e:
-                    # Defer failed (e.g., concurrency limit reached during re-enqueue)
-                    # The transaction was rolled back, so the JobProcess still exists in DB.
-                    # The pk was restored in defer() before raising, so we can proceed normally.
-                    logger.warning(
-                        "Defer failed",
-                        extra={"job_class": self.job_class, "error": str(e)},
-                    )
-                    error_type = record_span_error(span, e, metric_attributes)
-                    return self.convert_to_result(
-                        status=JobResultStatuses.ERRORED,
-                        error=str(e),
-                        error_type=error_type,
-                    )
-
                 except Exception as e:
                     # Note: if a rescuer already wrote JobResult(LOST) for this
                     # row (heartbeat went stale during a long job, then the job
@@ -367,12 +361,17 @@ def defer(self, *, job: Job, defer_exception: DeferJob) -> JobResult:
         """Defer this job by re-enqueueing it for later execution.
 
         Atomically deletes the JobProcess, re-enqueues the job, and creates
-        a JobResult linking to the new request. This ensures the concurrency
-        slot is released before attempting to re-enqueue.
-
-        Raises:
-            DeferError: If the job cannot be re-enqueued (e.g., due to concurrency limits).
-                       The transaction will be rolled back and the JobProcess will remain.
+        a JobResult. The concurrency slot is released before re-enqueue so
+        the new request's own `should_enqueue()` check can pass.
+
+        If `should_enqueue()` blocks the re-enqueue, the framework honors
+        that signal — same convention as `run_in_worker()` and `retry_job()`,
+        which both return `None` silently in the same situation. The
+        JobResult is still `DEFERRED` but `retry_job_request_uuid` is
+        `None`, the error message records that the re-enqueue was skipped,
+        and the caller stamps `plain.jobs.defer.skipped=True` on the
+        consumer span so this case is queryable in APM without surfacing
+        as an exception.
         """
         # Calculate new retry_attempt based on increment_retries
         retry_attempt = (
@@ -383,7 +382,6 @@ def defer(self, *, job: Job, defer_exception: DeferJob) -> JobResult:
 
         with transaction.atomic():
             # 1. Save JobProcess state and delete (releases concurrency slot)
-            saved_id = self.id
             job_process_uuid = self.uuid
             job_request_uuid = self.job_request_uuid
             requested_at = self.requested_at
@@ -400,21 +398,23 @@ def defer(self, *, job: Job, defer_exception: DeferJob) -> JobResult:
                 concurrency_key=self.concurrency_key,
             )
 
-            # Check if re-enqueue failed
             if new_job_request is None:
-                # Restore id since transaction will roll back and object still exists
-                self.id = saved_id
-                raise DeferError(
-                    f"Failed to re-enqueue deferred job {self.job_class}: "
-                    f"concurrency limit reached for key '{self.concurrency_key}'"
+                error = (
+                    f"Deferred for {defer_exception.delay} seconds "
+                    f"(re-enqueue skipped: should_enqueue() returned False "
+                    f"for concurrency_key '{self.concurrency_key}')"
                 )
+                retry_job_request_uuid = None
+            else:
+                error = f"Deferred for {defer_exception.delay} seconds"
+                retry_job_request_uuid = new_job_request.uuid
 
-            # 3. Create JobResult linking to new request
+            # 3. Create JobResult (linking to new request if one was created)
             result = JobResult.query.create(
                 ended_at=timezone.now(),
-                error=f"Deferred for {defer_exception.delay} seconds",
+                error=error,
                 status=JobResultStatuses.DEFERRED,
-                retry_job_request_uuid=new_job_request.uuid,
+                retry_job_request_uuid=retry_job_request_uuid,
                 # From the JobProcess
                 job_process_uuid=job_process_uuid,
                 started_at=started_at,
diff --git a/plain-jobs/tests/internal/test_otel.py b/plain-jobs/tests/internal/test_otel.py
@@ -544,6 +544,34 @@ def test_consumed_counter_records_outcome_for_deferred(
     assert deferred, "expected a consumed counter point with outcome=deferred"
 
 
+@pytest.mark.usefixtures("db")
+def test_defer_skipped_when_reenqueue_blocked() -> None:
+    """When defer()'s re-enqueue is blocked by should_enqueue() returning
+    False, the framework honors the signal silently — same convention as
+    run_in_worker() and retry_job(), which both return None in the same
+    situation. The result is recorded as DEFERRED with no retry uuid so
+    the case is visible in admin without surfacing as an exception."""
+    from plain.jobs.exceptions import DeferJob
+    from plain.jobs.models import JobResultStatuses
+
+    # Seed a JobProcess via a job whose should_enqueue allows it through.
+    request = _NoopJob().run_in_worker(concurrency_key="busy")
+    assert request is not None
+    process = request.convert_to_job_process(worker_id=uuid.uuid4())
+
+    # Defer using a job that always says should_enqueue=False. In
+    # production these would be the same class; using two here lets the
+    # initial enqueue succeed and only the re-enqueue get blocked.
+    result = process.defer(
+        job=_ExclusiveJob(),
+        defer_exception=DeferJob(delay=60),
+    )
+
+    assert result.status == JobResultStatuses.DEFERRED
+    assert result.retry_job_request_uuid is None
+    assert "re-enqueue skipped" in result.error
+
+
 @pytest.mark.usefixtures("db")
 def test_workers_gauge_splits_by_state_attribute(metrics, settings) -> None:
     """One `plain.jobs.workers` gauge with `plain.jobs.worker.state` attribute