diff --git a/packages/gatsby-source-drupal/README.md b/packages/gatsby-source-drupal/README.md
index 64e377413a3d2..9db4a3b78f2a4 100644
--- a/packages/gatsby-source-drupal/README.md
+++ b/packages/gatsby-source-drupal/README.md
@@ -34,6 +34,12 @@ module.exports = {
 }
 ```
 
+On the Drupal side, we highly recommend installing [JSON:API
+Extras](https://www.drupal.org/project/jsonapi_extras) and enabling "Include
+count in collection queries" at `/admin/config/services/jsonapi/extras`, as
+that [speeds up fetching data from Drupal by around
+4x](https://github.com/gatsbyjs/gatsby/pull/32883).
+
 ### Filters
 
 You can use the `filters` option to limit the data that is retrieved from Drupal. Filters are applied per JSON API collection. You can use any [valid JSON API filter query](https://www.drupal.org/docs/8/modules/jsonapi/filtering). For large data sets this can reduce the build time of your application by allowing Gatsby to skip content you'll never use.
diff --git a/packages/gatsby-source-drupal/src/gatsby-node.js b/packages/gatsby-source-drupal/src/gatsby-node.js
index 7ed22d79de363..4ad61439251a9 100644
--- a/packages/gatsby-source-drupal/src/gatsby-node.js
+++ b/packages/gatsby-source-drupal/src/gatsby-node.js
@@ -22,7 +22,28 @@ const agent = {
   // http2: new http2wrapper.Agent(),
 }
 
+let start
+let apiRequestCount = 0
+let initialSourcing = true
+let globalReporter
 async function worker([url, options]) {
+  // Log progress during the initial sourcing.
+  if (initialSourcing) {
+    apiRequestCount += 1
+    if (!start) {
+      start = Date.now()
+    }
+    const queueLength = requestQueue.length()
+    if (apiRequestCount % 50 === 0) {
+      globalReporter.verbose(
+        `gatsby-source-drupal has ${queueLength} API requests queued and the current request rate is ${(
+          apiRequestCount /
+          ((Date.now() - start) / 1000)
+        ).toFixed(2)} requests / second`
+      )
+    }
+  }
+
   return got(url, {
     agent,
     cache: false,
@@ -72,6 +93,7 @@ exports.sourceNodes = async (
   },
   pluginOptions
 ) => {
+  globalReporter = reporter
   const {
     baseUrl,
     apiBase = `jsonapi`,
@@ -293,6 +315,7 @@ exports.sourceNodes = async (
   drupalFetchActivity.start()
 
   let allData
+  const typeRequestsQueued = new Set()
   try {
     const res = await requestQueue.push([
       urlJoin(baseUrl, apiBase),
@@ -370,7 +393,39 @@ exports.sourceNodes = async (
       if (d.body.included) {
         dataArray.push(...d.body.included)
       }
-      if (d.body.links && d.body.links.next) {
+
+      // If JSON:API Extras is configured to add the resource count, we can queue
+      // all API requests immediately instead of waiting for each request to
+      // return the next URL. This lets us request resources in parallel rather
+      // than sequentially, which is much faster.
+      if (d.body.meta?.count) {
+        // If we haven't queued URLs for this type yet.
+        if (d.body.links.next?.href && !typeRequestsQueued.has(type)) {
+          typeRequestsQueued.add(type)
+
+          // Get the number of remaining API requests. We round down, as we've
+          // already fetched the first page at this point.
+          const pageSize = Number(
+            new URL(d.body.links.next.href).searchParams.get(`page[limit]`)
+          )
+          const requestsCount = Math.floor(d.body.meta.count / pageSize)
+
+          reporter.verbose(
+            `queueing ${requestsCount} API requests for type ${type} which has ${d.body.meta.count} entities.`
+          )
+
+          const newUrl = new URL(d.body.links.next.href)
+          await Promise.all(
+            _.range(requestsCount).map(pageOffset => {
+              // We start one page ahead, as the first page has already been fetched.
+              pageOffset += 1
+              // Construct the URL for the new page offset.
+              newUrl.searchParams.set(`page[offset]`, pageOffset * pageSize)
+              return getNext(newUrl.toString())
+            })
+          )
+        }
+      } else if (d.body.links?.next) {
         await getNext(d.body.links.next)
       }
     }
@@ -480,6 +535,9 @@ exports.sourceNodes = async (
     createNode(node)
   }
 
+  // We're now done with the initial sourcing.
+  initialSourcing = false
+
   return
 }
 
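
For illustration, the count-based parallel pagination idea used in the `gatsby-node.js` change can be sketched in isolation. This is a minimal sketch, not the plugin's code: `fetchAllPages` and `fetchJson` are hypothetical helpers, and it assumes a JSON:API server that reports the collection total in `meta.count` (as JSON:API Extras does with "Include count in collection queries" enabled) and a fixed `page[limit]` on the `links.next` URL.

```js
// Hedged sketch of count-based parallel pagination. `fetchAllPages` and
// `fetchJson` are hypothetical helpers, not gatsby-source-drupal APIs.
const fetchJson = async url => {
  const res = await fetch(url) // global fetch (Node 18+)
  if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`)
  return res.json()
}

async function fetchAllPages(collectionUrl) {
  // The first request returns page 1 plus, with JSON:API Extras'
  // count option enabled, the collection total in meta.count.
  const first = await fetchJson(collectionUrl)
  const results = [...first.data]

  if (first.meta?.count && first.links?.next?.href) {
    const next = new URL(first.links.next.href)
    const pageSize = Number(next.searchParams.get(`page[limit]`))
    // Pages still to fetch; page 1 is already in hand. Rounding down means
    // an exact multiple of pageSize queues one empty trailing page, which
    // is harmless.
    const remaining = Math.floor(first.meta.count / pageSize)

    // Build every page URL up front and fetch them concurrently, instead
    // of waiting for each response to reveal the next links.next URL.
    const pages = await Promise.all(
      Array.from({ length: remaining }, (_, i) => {
        const pageUrl = new URL(next)
        pageUrl.searchParams.set(`page[offset]`, String((i + 1) * pageSize))
        return fetchJson(pageUrl.toString())
      })
    )
    for (const page of pages) {
      results.push(...page.data)
    }
  } else {
    // Without a count, fall back to sequential pagination.
    let nextUrl = first.links?.next?.href
    while (nextUrl) {
      const page = await fetchJson(nextUrl)
      results.push(...page.data)
      nextUrl = page.links?.next?.href
    }
  }
  return results
}
```

Under those assumptions, a collection of 1,000 entities with `page[limit]=50` costs one initial request plus 20 concurrent page fetches (the last one empty, since 1,000 is an exact multiple of 50), rather than 20 sequential round trips where each response must arrive before the next URL is known.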