From b753baad4664dff8c1e05b2b2631f9e0fb37cd1b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 12:04:25 +0100 Subject: [PATCH 1/9] Undo over-zealous error handling in finally promise --- src/worker.ts | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/worker.ts b/src/worker.ts index 692e8964..a21bd2fd 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -66,11 +66,9 @@ export function makeNewWorker( const promise: Promise & { /** @internal */ worker?: Worker; - } = workerDeferred - .finally(() => { - return hooks.process("stopWorker", { worker, withPgClient }); - }) - .catch(noop); + } = workerDeferred.finally(() => { + return hooks.process("stopWorker", { worker, withPgClient }); + }); promise.then( () => { @@ -428,5 +426,3 @@ export function makeNewWorker( return worker; } - -function noop() {} From 233077655409ab5b7f80fe74e44b9647f12b0cba Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 12:27:08 +0100 Subject: [PATCH 2/9] Add lots of missing detail to RELEASE_NOTES --- RELEASE_NOTES.md | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 7214b5c8..c3f3f636 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -25,29 +25,61 @@ Read more: than the `workerId`. Be sure to upgrade [Worker Pro](https://worker.graphile.org/docs/pro) at the same time if you're using it! +- BREAKING: Graphile Worker Pro users will need to update to 0.2.x at the same + time as updating Graphile Worker. +- New **batching of job completion** can be enabled by setting + `preset.worker.completeJobBatchDelay` and `preset.worker.failJobBatchDelay` to + a number `0` or higher. This can significantly reduce load on the database, at + the cost of potentially leaving more locked jobs if a worker crashes or is + killed without cleanup before the batch executes. 
+- New **local queue** can be enabled by setting `preset.worker.localQueue.size` + to `1` or higher (recommend setting it to `concurrency + 1`). This allows jobs + to be fetched in a batch and distributed locally rather than on-demand fetching + from each worker in the local pool; this significantly reduces load on the + database especially if you're running with high concurrency. The trade-off is + that more jobs will be locked at a time, and when local queues are full higher + priority tasks added to the database won't be "seen" until the local queue is + refetched. - New `addJobs()` JS method to enable efficiently adding a batch of jobs via the - JS API + JS API. +- Prepared statements can now be disabled via + `preset.worker.preparedStatements = false`, useful when connecting to the + database in a way that does not reliably support them. There's a minor + performance penalty, so only turn them off if you need to. +- New `middleware` system enabled by `graphile-config` is now preferred over + hooks (which will be deprecated and removed in later versions). Middleware + better enable activities to be wrapped, e.g. `bootstrap` middleware can take + actions both before and after bootstrapping and share variables between them, + rather than the old `prebootstrap`/`postbootstrap` hooks which need more + complex integration. +- Fixed issue with cron where enough errors (for example due to database + downtime) would cause it to stop trying, leaving a seemingly healthy worker + that wasn't executing cron. Cron will now retry startup indefinitely (with + exponential backoff) _and_ will perform the relevant backfills when the + connection is re-established. - DEPRECATION: `quickAddJob` has been renamed to `addJobAdhoc` to make it clearer that it's for use in one-off locations (some felt the "quick" referred to the speed it executed, rather than the amount of effort required from the - programmer) + programmer). 
- We'll now warn you if you haven't installed error handlers on the pool, and - will only install them ourself if needed + will only install them ourself if needed. - Fixes bug where CLI defaults override `graphile.config.js` settings (by - removing CLI defaults) + removing CLI defaults). - Fix bug where executable tasks had their stdout/stderr ignored; this is now output via logging (thanks @wineTGH). - Fix race condition when multiple workers attempt to initialise the database at - the same time + the same time. - `helpers.abortSignal` is no longer typed as `| undefined`. It is still experimental! - `helpers.abortPromise` added; will reject when `abortSignal` aborts (useful - for `Promise.race()`) + for `Promise.race()`). - `backfillPeriod` is now marked as optional in TypeScript (defaults to 0). - Support for loading tasks from nested folders in crontab. - (`* * * * * nested/folder/task ?jobKey=my_key&jobKeyMode=preserve_run_at`) - Most of our event emitters now trap errors and output a log if such error were to occur - useful for debugging. +- Worker event payloads now include `ctx`, making plugin/event integrations more + consistent. 
## v0.16.6 From d8f09f30b0842583ec22a2da272a9a9a5e7ce19d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 12:29:37 +0100 Subject: [PATCH 3/9] yarn website:update --- website/docs/config.md | 46 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/website/docs/config.md b/website/docs/config.md index 2df9200f..70423ce7 100644 --- a/website/docs/config.md +++ b/website/docs/config.md @@ -156,15 +156,28 @@ Here are the options under the `worker` key as defined by +Options for Graphile Worker + ```ts { + completeJobBatchDelay?: number; concurrentJobs?: number; connectionString?: string; crontabFile?: string; events?: WorkerEvents; + failJobBatchDelay?: number; fileExtensions?: string[]; getQueueNameBatchDelay?: number; gracefulShutdownAbortTimeout?: number; + localQueue?: { + size: number; + ttl?: number; + refetchDelay?: { + durationMs: number; + threshold?: number; + maxAbortThreshold?: number; + }; +}; logger?: Logger<{}>; maxPoolSize?: number; maxResetLockedInterval?: number; @@ -177,9 +190,17 @@ Here are the options under the `worker` key as defined by } ``` -See the -[Graphile Worker source](https://github.com/jcgsville/worker/blob/85c36ac4e684a3a782fc528dca95c8ba6177fa8a/src/config.ts#L13) -for the default `worker` options set by the default Worker Preset. +### worker.completeJobBatchDelay + +Type: `number | undefined` + +The time in milliseconds to wait after a `completeJob` call to see if there are +any other completeJob calls that can be batched together. A setting of `-1` +disables this. + +Enabling this feature increases the time for which jobs are locked past +completion, thus increasing the risk of catastrophic failure resulting in the +jobs being executed again once they expire. ### worker.concurrentJobs @@ -210,6 +231,16 @@ startup. 
(Without this, Worker will provision its own `EventEmitter`, but you can't retrieve it until the promise returned by the API you have called has resolved.) +### worker.failJobBatchDelay + +Type: `number | undefined` + +The time in milliseconds to wait after a `failJob` call to see if there are any +other failJob calls that can be batched together. A setting of `-1` disables +this. + +Enabling this feature increases the time for which jobs are locked past failure. + ### worker.fileExtensions Type: `string[] | undefined` @@ -236,6 +267,15 @@ How long in milliseconds after a gracefulShutdown is triggered should Graphile Worker wait to trigger the AbortController, which should cancel supported asynchronous actions? +### worker.localQueue + +Type: +`{ size: number; ttl?: number; refetchDelay?: { durationMs: number; threshold?: number; maxAbortThreshold?: number; }; } | undefined` + +If you're running in high concurrency, you will likely want to reduce the load +on the database by using a local queue to distribute jobs to workers rather than +having each ask the database directly. + ### worker.logger Type: `Logger<{}> | undefined` From 59143b04d6b3a9edd0b00c98c7821a689a19fb3d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 13:34:01 +0100 Subject: [PATCH 4/9] More detail in docs --- src/index.ts | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/index.ts b/src/index.ts index 6f431c9b..7a3d91e3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -263,9 +263,21 @@ declare global { events?: WorkerEvents; /** - * If you're running in high concurrency, you will likely want to reduce - * the load on the database by using a local queue to distribute jobs to - * workers rather than having each ask the database directly. 
+ * The localQueue enables Graphile Worker to lock and pull down a batch + * of jobs to execute at once, distributing them to individual workers on + * demand without the worker needing a roundtrip to the database to fetch + * the next job. This significantly reduces load on the database and can + * massively improve throughput, but it does have tradeoffs: more jobs + * locked[^1] so job execution latency may increase (e.g. if jobs are + * locked in a local queue on one Worker instance whilst another Worker + * instance sits idle), and newly added high priority jobs will not be + * detected until the local queue is exhausted and triggers a re-fetch. + * + * [^1]: Jobs are locked when they enter the local queue so no other + * Worker instance can grab them, but this means even jobs that aren't + * being worked may be locked. + * + * @see {@link https://worker.graphile.org/docs/performance#batching} */ localQueue?: { /** @@ -278,10 +290,20 @@ declare global { * * This setting can help reduce the load on your database from looking * for jobs, but is only really effective when there are often many jobs - * queued and ready to go, and can increase the latency of job execution + * queued and ready to go. It can increase the latency of job execution * because a single worker may lock jobs into its queue leaving other * workers idle. * + * A good starting point is often `concurrentJobs + 1`, but the correct + * setting will depend very heavily on your setup - if you have very + * high throughput and extremely fast tasks then a much higher number + * may make sense, whereas if you have low throughput or slower tasks + * then a lower value or even disabling may make sense. Even setting + * this to `2` can be impactful as it allows the next job to already be + * available locally when a task completes. 
+ * + * @see {@link https://worker.graphile.org/docs/performance#batching} + * * @default `-1` */ size: number; @@ -362,11 +384,18 @@ declare global { /** * The time in milliseconds to wait after a `completeJob` call to see if * there are any other completeJob calls that can be batched together. A - * setting of `-1` disables this. + * setting of `-1` disables this. This is most impactful when you have + * high throughput (even intermittently) as it can significantly reduce + * database load, though there are trade-offs. Even setting this to `0` + * can be impactful, but we recommend starting with a noticeable fraction + * of a second, such as `250`. * * Enabling this feature increases the time for which jobs are locked - * past completion, thus increasing the risk of catastrophic failure - * resulting in the jobs being executed again once they expire. + * past completion, thus increasing the risk that a catastrophic failure + * (e.g. worker crash or kill) may result in the jobs being executed + * again once they expire (after 4 hours by default). + * + * @see {@link https://worker.graphile.org/docs/performance#batching} * * @default `-1` */ @@ -375,10 +404,13 @@ declare global { /** * The time in milliseconds to wait after a `failJob` call to see if * there are any other failJob calls that can be batched together. A - * setting of `-1` disables this. + * setting of `-1` disables this. See `completeJobBatchDelay` for further + * details. * - * Enabling this feature increases the time for which jobs are locked + * Enabling this feature increases the time for which jobs may be locked * past failure. 
+ * + * @see {@link https://worker.graphile.org/docs/performance#batching} * * @default `-1` */ From 9b355ada071e8ca2990a67963c52ab4b535d9019 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 13:34:17 +0100 Subject: [PATCH 5/9] Update website --- website/docs/config.md | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/website/docs/config.md b/website/docs/config.md index 70423ce7..03420ecb 100644 --- a/website/docs/config.md +++ b/website/docs/config.md @@ -196,11 +196,15 @@ Type: `number | undefined` The time in milliseconds to wait after a `completeJob` call to see if there are any other completeJob calls that can be batched together. A setting of `-1` -disables this. +disables this. This is most impactful when you have high throughput (even +intermittently) as it can significantly reduce database load, though there are +trade-offs. Even setting this to `0` can be impactful, but we recommend starting +with a noticeable fraction of a second, such as `250`. Enabling this feature increases the time for which jobs are locked past -completion, thus increasing the risk of catastrophic failure resulting in the -jobs being executed again once they expire. +completion, thus increasing the risk that a catastrophic failure (e.g. worker +crash or kill) may result in the jobs being executed again once they expire +(after 4 hours by default). ### worker.concurrentJobs @@ -237,9 +241,10 @@ Type: `number | undefined` The time in milliseconds to wait after a `failJob` call to see if there are any other failJob calls that can be batched together. A setting of `-1` disables -this. +this. See `completeJobBatchDelay` for further details. -Enabling this feature increases the time for which jobs are locked past failure. +Enabling this feature increases the time for which jobs may be locked past +failure. ### worker.fileExtensions @@ -272,9 +277,19 @@ asynchronous actions? 
Type: `{ size: number; ttl?: number; refetchDelay?: { durationMs: number; threshold?: number; maxAbortThreshold?: number; }; } | undefined` -If you're running in high concurrency, you will likely want to reduce the load -on the database by using a local queue to distribute jobs to workers rather than -having each ask the database directly. +The localQueue enables Graphile Worker to lock and pull down a batch of jobs to +execute at once, distributing them to individual workers on demand without the +worker needing a roundtrip to the database to fetch the next job. This +significantly reduces load on the database and can massively improve throughput, +but it does have tradeoffs: more jobs locked[^1] so job execution latency may +increase (e.g. if jobs are locked in a local queue on one Worker instance whilst +another Worker instance sits idle), and newly added high priority jobs will not +be detected until the local queue is exhausted and triggers a re-fetch. + +[^1]: + Jobs are locked when they enter the local queue so no other Worker instance + can grab them, but this means even jobs that aren't being worked may be + locked. ### worker.logger From 927598168e11ce58102d8ac68323660a77a83571 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 14:03:01 +0100 Subject: [PATCH 6/9] Add details of localQueue and batching to docs. --- website/docs/performance.md | 79 +++++++++++++++++++++++++++++++------ website/docs/scaling.md | 19 ++++++--- 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/website/docs/performance.md b/website/docs/performance.md index 8e8ed7f3..3bcf878a 100644 --- a/website/docs/performance.md +++ b/website/docs/performance.md @@ -55,21 +55,74 @@ still wish to set it higher and then using Node's `child_process` or `worker_threads` to share the compute load over multiple cores without significantly impacting the main worker's run loop. 
-## Enabling batching for highest performance +## Batching Graphile Worker is limited by the performance of the underlying Postgres -database, and when you hit this limit performance will start to go down (rather -than up) as you add more workers. - -To mitigate this, we've added batching functionality to many of the internal -methods which you can enable via the configuration. For example using a local -queue enables each pool to pull down a configurable number of jobs up front so -its workers can start a new job the moment their previous one completes without -having to request a new job from the database. This batching also reduces load -on the database since there are fewer total queries per second, but it's a -slight trade-off since more jobs are checked out but not necessarily actively -being worked on, so latency may increase and in the event of a crash more jobs -will be locked. +database; when you hit this limit, performance will start to go down (rather +than up) as you add more workers. To mitigate this, batching functionality can +be enabled via the configuration. If you have a high throughput job queue (even +intermittently) we recommend that you enable batching; the settings to use will +depend on your setup, but consider these as a starting point: + +```ts +let concurrentJobs = 10; // Set this to whatever suits +const preset = { + worker: { + concurrentJobs, + + // Sensible default for local queue size: one more than concurrency + localQueue: { size: concurrentJobs + 1 }, + completeJobBatchDelay: 25, + failJobBatchDelay: 25, + }, +}; +``` + +Read on for more details. + +### `localQueue` + +The local queue feature is disabled by default, but it can have a significant +impact on high throughput queues, both reducing database load and increasing +throughput - sometimes by an order of magnitude! 
+ +The local queue enables each pool to pull down a configurable number of jobs up +front so its workers can start a new job the moment their previous one completes +without having to request a new job from the database. This batching also +reduces load on the database since there are fewer total queries per second, and +table scans are allowed to return additional results. However, it's a trade-off +since more jobs are checked out (locked) but not necessarily actively being +worked on, so: + +- if a worker doesn't exit gracefully (e.g. it crashes or is forcefully killed), + more jobs will remain locked and unable to execute until the 4 hour limit + expores. (Mitigation: Graphile Worker Pro.) +- execution latency may increase if jobs exist in one worker's local queue + whilst another worker sits idle. (Mitigation: `preset.worker.localQueue.ttl` + determines how long tasks may sit in the local queue without being worked on.) + +If your tasks are somewhat slow (taking many tens of seconds or more) and your +throughput is very low or you need high priority tasks to be executed ASAP then +you should set your localQueue size to either a low number (2+) or disable it +entirely (`-1`). When doing so, you can leave `completeJobBatchDelay` and +`failJobBatchDelay` enabled. + +### `completeJobBatchDelay` / `failJobBatchDelay` + +These settings cause job releasing (complete/fail) to become asynchronous, +allowing multiple completes/fails in a small window of time to be released via +the same roundtrip to the database, significantly reducing load on the database +and WAL churn. + +The trade-off is that jobs will not be released immediately, so in the event of +a catastrophic failure (worker crash or forced termination) more jobs may be +left in the locked state than otherwise. So long as you ensure that your workers +always exit cleanly/gracefully, these delays can significantly reduce database +load and improve throughput with minimal additional risk. 
+ +In general, we'd advise all users to enable these settings, even if they are set +to `0` for minimal delay. `250` seems a reasonable starting point - release jobs at +most once every quarter of a second. ## Running the performance tests diff --git a/website/docs/scaling.md b/website/docs/scaling.md index 9fd4859d..bfac895b 100644 --- a/website/docs/scaling.md +++ b/website/docs/scaling.md @@ -5,12 +5,19 @@ title: Scaling tips PostgreSQL is not what you'd build a job queue on if you're the size of Facebook... But you're not the size of Facebook, right? -Postgres can get you pretty far, processing over 10,000 jobs per second in our -benchmarks. That's **almost a billion jobs per day**. Using Postgres as your job -queue via Graphile Worker can keep your infrastructure simple, enabling you to -focus less on infrastructure and more on getting your product's features to -market. But to maintain this performance, there's some things you must keep in -mind. +Postgres can get you pretty far, and combined with Node even further — +Graphile Worker can process over 180,000 trivial jobs per second in our +benchmarks. That's around **fifteen _billion_ jobs per day**. Using Postgres as +your job queue via Graphile Worker can keep your infrastructure simple, enabling +you to focus less on infrastructure and more on getting your product's features +to market. But to maintain this performance, there are some things you must keep +in mind. + +## Enable batching + +If you have high throughput (even intermittently), we recommend you consider +[enabling batching](/docs/performance#batching) for highest performance. This +can improve performance by over an order of magnitude. 
## Keep your jobs table small From 10434dbeec20ae9f024c37c8bd2e45751111e0a5 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 14:57:25 +0100 Subject: [PATCH 7/9] Typo --- website/docs/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/performance.md b/website/docs/performance.md index 3bcf878a..94972e48 100644 --- a/website/docs/performance.md +++ b/website/docs/performance.md @@ -96,7 +96,7 @@ worked on, so: - if a worker doesn't exit gracefully (e.g. it crashes or is forcefully killed), more jobs will remain locked and unable to execute until the 4 hour limit - expores. (Mitigation: Graphile Worker Pro.) + expires. (Mitigation: Graphile Worker Pro.) - execution latency may increase if jobs exist in one worker's local queue whilst another worker sits idle. (Mitigation: `preset.worker.localQueue.ttl` determines how long tasks may sit in the local queue without being worked on.) From cfade5190882bb6cf764d74e2d2548b012f320cc Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 14:57:30 +0100 Subject: [PATCH 8/9] Add link --- website/docs/scaling.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/scaling.md b/website/docs/scaling.md index bfac895b..616cc02a 100644 --- a/website/docs/scaling.md +++ b/website/docs/scaling.md @@ -58,7 +58,8 @@ delete from graphile_worker._private_jobs where attempts = max_attempts and lock Jobs scheduled to run in the future can also keep the number of jobs in the jobs table higher, impacting peak performance. Be thoughtful about these tasks, and -consider batching if it becomes an issue. +consider using [batch jobs](/docs/library/add-job#batch-jobs) if it becomes an +issue. 
## Use the latest Graphile Worker release From 15734de2705d84db2d4b601b0ff62fc96dba074b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 5 Jun 2026 14:58:12 +0100 Subject: [PATCH 9/9] Consistency --- website/docs/performance.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/performance.md b/website/docs/performance.md index 94972e48..47b1a093 100644 --- a/website/docs/performance.md +++ b/website/docs/performance.md @@ -72,8 +72,8 @@ const preset = { // Sensible default for local queue size: one more than concurrency localQueue: { size: concurrentJobs + 1 }, - completeJobBatchDelay: 25, - failJobBatchDelay: 25, + completeJobBatchDelay: 250, + failJobBatchDelay: 250, }, }; ```