diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f584d1..8d90a43 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Bulk retry with delay — "+5s", "+10s", "+30s", and "+1m" stagger buttons on the Failed Jobs page retry all matched jobs with a configurable interval between each; the first job runs immediately, subsequent jobs are scheduled at incremental offsets; uses per-execution `retry` so `scheduled_at` is respected by SolidQueue's dispatcher; buttons only appear when more than one job is present
 - Scheduled job management — "Run Now" promotes a scheduled job to run immediately by back-dating its `scheduled_at`; "+1h", "+24h", and "+7d" buttons push `scheduled_at` forward by the chosen offset; both actions update the execution and the underlying job record; Turbo Stream responses remove the row on "Run Now" and update the `scheduled_at` cell in place on postpone
 
 ## [0.9.0] - 2026-05-20
diff --git a/README.md b/README.md
index ae12182..1503d85 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ SolidQueueWeb surfaces all of this in a browser UI available at any route you ch
 - **Queues** — all queues sorted by name with size; oldest ready job latency (color-coded, with UTC timestamp tooltip); Done (24h) and Failed (24h) throughput counts; a mini 12-bar failure rate sparkline per queue showing failure % per hour over the last 12 hours; pause/resume controls
 - **Jobs** — filterable by status (ready, scheduled, claimed, blocked, failed) and by queue; search by job class name with dynamic auto-submit; time-based period filter (1 h / 24 h / 7 d); discard individual or all jobs; Turbo Frame navigation so only the table updates on filter or search; auto-refreshes every 10 seconds
 - **Scheduled job management** — reschedule a scheduled job to run immediately ("Run Now") or push its `scheduled_at` forward by 1 h, 24 h, or 7 d; Turbo Stream responses update the row in place
-- **Failed jobs** — list of failed executions with error details; search by class name; filter by queue; time-based period filter; retry or discard individually or in bulk
+- **Failed jobs** — list of failed executions with error details; search by class name; filter by queue; time-based period filter; retry or discard individually or in bulk; bulk retry with configurable stagger (+5s / +10s / +30s / +1m) to avoid thundering herd on recovery
 - **Job detail** — full arguments, timestamps, blocked-until date, and error backtrace; action buttons based on job status
 - **Queue management** — pause and resume individual queues; queue-scoped job list with status filter, search, and discard
 - **Recurring tasks** — all configured recurring tasks with cron schedule, next run time, last run time, and static/dynamic classification
@@ -115,7 +115,6 @@ No authentication is enforced by default. When the `authenticate` block returns
 Planned features, roughly ordered by priority:
 
 **Operations**
-- Bulk retry with delay — retry all failed jobs with a configurable stagger to avoid thundering herd
 - Admin audit log — record who retried or discarded which jobs and when (requires host-app user identity)
 
 **Infrastructure**
diff --git a/app/controllers/solid_queue_web/retry_failed_jobs_controller.rb b/app/controllers/solid_queue_web/retry_failed_jobs_controller.rb
index d8eab0c..7950a0f 100644
--- a/app/controllers/solid_queue_web/retry_failed_jobs_controller.rb
+++ b/app/controllers/solid_queue_web/retry_failed_jobs_controller.rb
@@ -1,19 +1,58 @@
 module SolidQueueWeb
+  # Retries failed executions: the one identified by params[:id], or every
+  # execution in the filtered scope, optionally staggering the retries.
   class RetryFailedJobsController < ApplicationController
+    # Accepted params[:stagger] values and the delay each places between retries.
+    STAGGER_INTERVALS = { "5s" => 5.seconds, "10s" => 10.seconds, "30s" => 30.seconds, "1m" => 1.minute }.freeze
+
     before_action :set_filter_params
 
     def create
       executions = params[:id] ? [SolidQueue::FailedExecution.find(params[:id])] : filtered_scope.to_a
       jobs = executions.map(&:job)
-      SolidQueue::FailedExecution.retry_all(jobs)
+
+      # Staggering only applies to bulk retries; a lone job is retried normally.
+      if params[:stagger].present? && executions.size > 1
+        interval = STAGGER_INTERVALS[params[:stagger]]
+        return redirect_to(failed_jobs_path, alert: "Invalid stagger interval.") unless interval
+
+        retry_staggered(executions, interval)
+      else
+        SolidQueue::FailedExecution.retry_all(jobs)
+      end
       redirect_to failed_jobs_path(queue: @queue, q: @search, period: @period),
-        notice: "#{jobs.size} #{"job".pluralize(jobs.size)} queued for retry."
+        notice: retry_notice(jobs.size)
     rescue => e
       redirect_to failed_jobs_path, alert: "Could not retry job: #{e.message}"
     end
 
     private
 
+    # Retries each execution `interval` after the previous one: the first job
+    # gets a nil scheduled_at and later jobs get increasing offsets from a
+    # single base time, so delays do not drift while the loop runs. The batch
+    # runs in one transaction so a failure rolls back the jobs already handled.
+    def retry_staggered(executions, interval)
+      base_time = Time.current
+      SolidQueue::FailedExecution.transaction do
+        executions.each_with_index do |execution, index|
+          run_at = index.zero? ? nil : base_time + (index * interval)
+          execution.job.update!(scheduled_at: run_at)
+          execution.retry
+        end
+      end
+    end
+
+    # Flash notice for a successful retry; mentions the stagger when it applied.
+    def retry_notice(count)
+      label = "#{count} #{"job".pluralize(count)}"
+      if params[:stagger].present? && count > 1
+        "#{label} queued for retry, staggered #{params[:stagger]} apart."
+      else
+        "#{label} queued for retry."
+      end
+    end
+
     def set_filter_params
       @queue = params[:queue].presence
       @search = params[:q].presence
diff --git a/app/views/solid_queue_web/failed_jobs/index.html.erb b/app/views/solid_queue_web/failed_jobs/index.html.erb
index 232b981..b817251 100644
--- a/app/views/solid_queue_web/failed_jobs/index.html.erb
+++ b/app/views/solid_queue_web/failed_jobs/index.html.erb
@@ -9,6 +9,16 @@
             params: { queue: @queue, q: @search, period: @period },
             class: "sqd-btn sqd-btn--primary",
             data: { confirm: "Retry all #{@failed_jobs.size} failed jobs?" } %>
+      <% if @failed_jobs.size > 1 %>
+        <% %w[5s 10s 30s 1m].each do |interval| %>
+          <%= button_to "+#{interval}", retry_all_failed_jobs_path,
+                method: :post,
+                params: { stagger: interval, queue: @queue, q: @search, period: @period },
+                class: "sqd-btn sqd-btn--muted sqd-btn--sm",
+                title: "Retry all, staggered #{interval} apart",
+                data: { confirm: "Retry #{@failed_jobs.size} failed jobs staggered #{interval} apart?" } %>
+        <% end %>
+      <% end %>
       <%= button_to "Discard All", discard_all_failed_jobs_path,
             method: :post,
             params: { queue: @queue, q: @search, period: @period },
diff --git a/spec/requests/solid_queue_web/failed_jobs_spec.rb b/spec/requests/solid_queue_web/failed_jobs_spec.rb
index 9f99982..02f8e71 100644
--- a/spec/requests/solid_queue_web/failed_jobs_spec.rb
+++ b/spec/requests/solid_queue_web/failed_jobs_spec.rb
@@ -221,6 +221,52 @@
     end
   end
 
+  describe "POST /jobs/failed_jobs/retry_all with stagger (RetryFailedJobsController#create)" do
+    let!(:second_execution) do
+      j = SolidQueue::Job.create!(
+        queue_name: "default",
+        class_name: "AnotherJob",
+        arguments: {},
+        active_job_id: SecureRandom.uuid
+      )
+      j.ready_execution&.destroy
+      SolidQueue::FailedExecution.create!(
+        job: j,
+        error: { exception_class: "RuntimeError", message: "oops", backtrace: [] }
+      )
+    end
+
+    it "staggers scheduled_at by the given interval" do
+      post "/jobs/failed_jobs/retry_all", params: { stagger: "10s" }
+      scheduled = SolidQueue::ScheduledExecution.count
+      expect(scheduled).to eq(1)
+    end
+
+    it "retries the first job immediately (no scheduled_at)" do
+      post "/jobs/failed_jobs/retry_all", params: { stagger: "10s" }
+      ready = SolidQueue::ReadyExecution.count
+      expect(ready).to eq(1)
+    end
+
+    it "includes stagger info in the notice" do
+      post "/jobs/failed_jobs/retry_all", params: { stagger: "10s" }
+      follow_redirect!
+      expect(response.body).to include("staggered 10s apart")
+    end
+
+    it "rejects invalid stagger values" do
+      post "/jobs/failed_jobs/retry_all", params: { stagger: "bogus" }
+      expect(response).to redirect_to("/jobs/failed_jobs")
+      follow_redirect!
+      expect(response.body).to include("Invalid stagger interval")
+    end
+
+    it "does not stagger when only one job matches the filter" do
+      post "/jobs/failed_jobs/retry_all", params: { stagger: "10s", q: "TestJob" }
+      expect(SolidQueue::ScheduledExecution.count).to eq(0)
+    end
+  end
+
   describe "GET /jobs/failed_jobs.csv (CSV export)" do
     it "returns a CSV file" do
       get "/jobs/failed_jobs", params: { format: :csv }