diff --git a/packages/vinext/src/entries/pages-server-entry.ts b/packages/vinext/src/entries/pages-server-entry.ts index 0992d9af7..ac4dc7db1 100644 --- a/packages/vinext/src/entries/pages-server-entry.ts +++ b/packages/vinext/src/entries/pages-server-entry.ts @@ -122,6 +122,10 @@ export async function generateServerEntry( rewrites: nextConfig?.rewrites ?? { beforeFiles: [], afterFiles: [], fallback: [] }, headers: nextConfig?.headers ?? [], expireTime: nextConfig?.expireTime, + // Serialized regex source — see `.nextjs-ref/packages/next/src/shared/lib/router/utils/html-bots.ts`. + // Used by the bot-aware fallback path (#1543) so the override stays + // consistent with streaming metadata gating. + htmlLimitedBots: nextConfig?.htmlLimitedBots, i18n: nextConfig?.i18n ?? null, // Mirrors Next.js `experimental.disableOptimizedLoading` — when false // (the default), page scripts are emitted with `defer` in . See @@ -789,6 +793,12 @@ async function _renderPage(request, url, manifest, middlewareHeaders, options) { }, routePattern, routeUrl, + // Bot-aware fallback flip (#1543). When a crawler hits an unlisted + // \`fallback: true\` path we want a synchronous render with real + // props, not the \`Loading...\` shell. Mirrors Next.js's bot check in + // \`.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts\`. + userAgent: request.headers.get("user-agent") || undefined, + htmlLimitedBots: vinextConfig.htmlLimitedBots, runInFreshUnifiedContext(callback) { var revalCtx = _createUnifiedCtx({ executionContext: _getRequestExecutionContext(), diff --git a/packages/vinext/src/index.ts b/packages/vinext/src/index.ts index 9d585a62d..08af5a096 100644 --- a/packages/vinext/src/index.ts +++ b/packages/vinext/src/index.ts @@ -3446,6 +3446,7 @@ export default function vinext(options: VinextOptions = {}): PluginOption[] { nextConfig?.trailingSlash ?? 
false, middlewarePath !== null, nextConfig?.clientTraceMetadata, + nextConfig?.htmlLimitedBots, ); const mwStatus = req.__vinextMiddlewareStatus; diff --git a/packages/vinext/src/server/dev-server.ts b/packages/vinext/src/server/dev-server.ts index 1d779c866..ca643ecf8 100644 --- a/packages/vinext/src/server/dev-server.ts +++ b/packages/vinext/src/server/dev-server.ts @@ -50,6 +50,7 @@ import { buildDefaultPagesNotFoundResponse } from "./pages-default-404.js"; import { resolvePagesPageMethodResponse } from "./pages-page-method.js"; import { isSerializableProps } from "./pages-serializable-props.js"; import { loadUserDocumentInitialProps } from "./pages-document-initial-props.js"; +import { isBotUserAgent } from "../utils/html-limited-bots.js"; /** * Render a React element to a string using renderToReadableStream. @@ -267,6 +268,14 @@ export function createSSRHandler( * `next.config`. When undefined or empty, no meta tags are emitted. */ clientTraceMetadata?: readonly string[], + /** + * Serialized `htmlLimitedBots` regexp source from `next.config`. Used by the + * bot-aware fallback path (#1543): when a crawler hits an unlisted + * `fallback: true` path we render synchronously with real props instead of + * the `Loading...` shell. Mirrors the Next.js production check in + * `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts`. + */ + htmlLimitedBots?: string, ) { const matcher = fileMatcher ?? createValidFileMatcher(); @@ -526,19 +535,30 @@ export function createSSRHandler( // Render the loading shell for `fallback: true` when the path // wasn't pre-rendered. Data requests still resolve real props so // the client can swap in after the shell ships. + // + // Crawler/bot deopt (#1543): if the request is from a known + // crawler, skip the fallback shell and synchronously render the + // full page so the bot indexes real content (not `Loading...`). 
+ // Mirrors Next.js's bot check in + // `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts`. if (fallback === true && !isValidPath && !isDataReq) { - isFallbackRender = true; - if (typeof routerShim.setSSRContext === "function") { - routerShim.setSSRContext({ - pathname: patternToNextFormat(route.pattern), - query, - asPath: url, - locale: locale ?? currentDefaultLocale, - locales: i18nConfig?.locales, - defaultLocale: currentDefaultLocale, - domainLocales, - isFallback: true, - }); + const userAgentHeader = req.headers["user-agent"]; + const userAgent = Array.isArray(userAgentHeader) ? userAgentHeader[0] : userAgentHeader; + const isBotReq = !!userAgent && isBotUserAgent(userAgent, htmlLimitedBots); + if (!isBotReq) { + isFallbackRender = true; + if (typeof routerShim.setSSRContext === "function") { + routerShim.setSSRContext({ + pathname: patternToNextFormat(route.pattern), + query, + asPath: url, + locale: locale ?? currentDefaultLocale, + locales: i18nConfig?.locales, + defaultLocale: currentDefaultLocale, + domainLocales, + isFallback: true, + }); + } } } } diff --git a/packages/vinext/src/server/pages-page-data.ts b/packages/vinext/src/server/pages-page-data.ts index e70c8d322..4359b0470 100644 --- a/packages/vinext/src/server/pages-page-data.ts +++ b/packages/vinext/src/server/pages-page-data.ts @@ -13,6 +13,7 @@ import { } from "./pages-page-response.js"; import { buildDefaultPagesNotFoundResponse } from "./pages-default-404.js"; import { isSerializableProps } from "./pages-serializable-props.js"; +import { isBotUserAgent } from "../utils/html-limited-bots.js"; type PagesRedirectResult = { destination: string; @@ -161,6 +162,25 @@ export type ResolvePagesPageDataOptions = { * trigger lands. */ isOnDemandRevalidate?: boolean; + /** + * Request `User-Agent` header. Used to detect crawlers/bots so that an + * unlisted `fallback: true` path is rendered synchronously (with real + * props) for bots instead of shipping the loading shell. 
Mirrors Next.js + * `pages-handler.ts`: + * + * if ((isIsrFallback && isBot(req.headers['user-agent'])) || isMinimalMode) { + * isIsrFallback = false + * } + * + * See: `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts`. + */ + userAgent?: string; + /** + * Serialized `htmlLimitedBots` regexp source from `next.config`. When set, + * overrides the default HTML-limited bot list used by `isBotUserAgent` for the + * fallback flip — keeps the override consistent with streaming metadata gating. + */ + htmlLimitedBots?: string; pageModule: PagesPageModule; params: Record; query: Record; @@ -395,6 +415,20 @@ export async function resolvePagesPageData( } } + // Crawler/bot deopt: a bot hitting an unlisted `fallback: true` path + // should get a blocking synchronous render (real content) rather than the + // loading shell, so the crawler indexes the actual page and not + // `Loading...`. Mirrors Next.js's bot check in `pages-handler.ts`: + // `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts`. + // Refs #1543. + if ( + isFallback && + options.userAgent && + isBotUserAgent(options.userAgent, options.htmlLimitedBots) + ) { + isFallback = false; + } + let pageProps: Record = {}; let gsspRes: PagesMutableGsspResponse | null = null; diff --git a/packages/vinext/src/utils/html-limited-bots.ts b/packages/vinext/src/utils/html-limited-bots.ts index 3d6adcff0..90f57c838 100644 --- a/packages/vinext/src/utils/html-limited-bots.ts +++ b/packages/vinext/src/utils/html-limited-bots.ts @@ -2,6 +2,13 @@ // packages/next/src/shared/lib/router/utils/html-bots.ts const HTML_LIMITED_BOT_UA_RE_STRING = String.raw`[\w-]+-Google|Google-[\w-]+|Chrome-Lighthouse|Slurp|DuckDuckBot|baiduspider|yandex|sogou|bitlybot|tumblr|vkShare|quora link preview|redditbot|ia_archiver|Bingbot|BingPreview|applebot|facebookexternalhit|facebookcatalog|Twitterbot|LinkedInBot|Slackbot|Discordbot|WhatsApp|SkypeUriPreview|Yeti|googleweblight`; +// Headless browser bot (executes JS). 
Mirrors Next.js +// `HEADLESS_BROWSER_BOT_UA_RE` in +// `.nextjs-ref/packages/next/src/shared/lib/router/utils/is-bot.ts`. +// Matches "Googlebot" but not hyphen-suffixed variants (e.g. "Googlebot-Image"). +// "Mediapartners-Google" / "AdsBot-Google" / other Google crawlers are covered by the HTML-limited list. +const HEADLESS_BROWSER_BOT_UA_RE = /Googlebot(?!-)|Googlebot$/i; + const htmlLimitedBotRegexCache = new Map(); export function getHtmlLimitedBotRegex(htmlLimitedBots: string | undefined): RegExp { @@ -13,3 +20,25 @@ export function getHtmlLimitedBotRegex(htmlLimitedBots: string | undefined): Reg htmlLimitedBotRegexCache.set(source, regex); return regex; } + +/** + * Returns true when the User-Agent matches a known crawler/bot. Combines + * Next.js's "headless browser bot" check (Googlebot proper) with the + * "HTML-limited bot" list (Bingbot, DuckDuckBot, facebookexternalhit, …). + * + * Used by the Pages Router fallback path: a bot hitting an unlisted + * `fallback: true` route should get a synchronous render (real content) and + * not the loading shell, so the crawler indexes the actual page. Mirrors + * Next.js's `isBot()` in `.nextjs-ref/packages/next/src/shared/lib/router/utils/is-bot.ts` + * and the bot-aware fallback flip in + * `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts`. + * + * `htmlLimitedBots` allows next.config to override the HTML-limited list + * (same flag that drives `getHtmlLimitedBotRegex`), so a custom list applies + * to both streaming metadata gating and bot-aware fallback rendering. 
+ */ +export function isBotUserAgent(userAgent: string, htmlLimitedBots?: string): boolean { + if (!userAgent) return false; + if (HEADLESS_BROWSER_BOT_UA_RE.test(userAgent)) return true; + return getHtmlLimitedBotRegex(htmlLimitedBots).test(userAgent); +} diff --git a/tests/pages-page-data.test.ts b/tests/pages-page-data.test.ts index e2ae13a3c..3f7ae8935 100644 --- a/tests/pages-page-data.test.ts +++ b/tests/pages-page-data.test.ts @@ -120,6 +120,70 @@ describe("pages page data", () => { await expect(result.response.text()).resolves.toContain("This page could not be found."); }); + // Refs #1543: a crawler/bot UA hitting an unlisted `fallback: true` path + // must NOT receive the loading shell — it should render synchronously so + // the bot indexes real content. Mirrors Next.js's bot check in + // `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts`. + it("does not set isFallback for bot User-Agent on unlisted fallback: true paths", async () => { + let gspCalled = false; + const result = await resolvePagesPageData( + createOptions({ + pageModule: { + async getStaticPaths() { + return { + fallback: true, + paths: [{ params: { slug: "hello-world" } }], + }; + }, + async getStaticProps({ params }) { + gspCalled = true; + return { props: { slug: params?.slug ?? 
null } }; + }, + }, + params: { slug: "unknown" }, + query: { slug: "unknown" }, + route: { isDynamic: true }, + routeUrl: "/posts/unknown", + userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + }), + ); + + expect(result.kind).toBe("render"); + if (result.kind !== "render") throw new Error("expected render result"); + expect(result.isFallback).toBe(false); + expect(gspCalled).toBe(true); + expect(result.pageProps).toMatchObject({ slug: "unknown" }); + }); + + it("sets isFallback for normal browser User-Agent on unlisted fallback: true paths", async () => { + const result = await resolvePagesPageData( + createOptions({ + pageModule: { + async getStaticPaths() { + return { + fallback: true, + paths: [{ params: { slug: "hello-world" } }], + }; + }, + async getStaticProps() { + throw new Error("getStaticProps should not run on a fallback shell render"); + }, + }, + params: { slug: "unknown" }, + query: { slug: "unknown" }, + route: { isDynamic: true }, + routeUrl: "/posts/unknown", + userAgent: + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36", + }), + ); + + expect(result.kind).toBe("render"); + if (result.kind !== "render") throw new Error("expected render result"); + expect(result.isFallback).toBe(true); + expect(result.pageProps).toEqual({}); + }); + it("short-circuits getServerSideProps responses after res.end()", async () => { const responsePromise = Promise.resolve( new Response('{"ok":true}', { diff --git a/tests/pages-router.test.ts b/tests/pages-router.test.ts index 28978aa1f..a5d4b5c8c 100644 --- a/tests/pages-router.test.ts +++ b/tests/pages-router.test.ts @@ -1486,6 +1486,56 @@ describe("Pages Router integration", () => { expect(json.pageProps).toMatchObject({ pid: "unknown" }); }); + // Refs #1543: bot/crawler requests must bypass the `fallback: true` loading + // shell and synchronously render real content so crawlers index the page, + // not 
`Loading...`. Mirrors Next.js's bot check in + // `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts` + // and the Next.js e2e regression test + // `.nextjs-ref/test/e2e/prerender-crawler.test.ts`. + it("renders synchronously (not the fallback shell) for crawler UAs on unlisted fallback: true paths", async () => { + const userAgents = [ + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)", + "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "facebookexternalhit/1.0 (+http://www.facebook.com/externalhit_uatext.php)", + ]; + for (const userAgent of userAgents) { + const slug = `bot-slug-${Math.random().toString(36).slice(2)}`; + const res = await fetch(`${baseUrl}/products/${slug}`, { + headers: { "user-agent": userAgent }, + }); + expect(res.status, `UA: ${userAgent}`).toBe(200); + const html = await res.text(); + // Bot should see the real rendered page, not the loading shell. + expect(html, `UA: ${userAgent}`).not.toContain("Loading product..."); + expect(html, `UA: ${userAgent}`).toMatch(new RegExp(`Product ID:.*${slug}`)); + const match = html.match(/__NEXT_DATA__\s*=\s*(\{.*?\})\s*[;<]/); + expect(match, `UA: ${userAgent}`).toBeTruthy(); + const nextData = JSON.parse(match![1]); + expect(nextData.isFallback, `UA: ${userAgent}`).toBe(false); + expect(nextData.props.pageProps).toMatchObject({ pid: slug }); + } + }); + + it("still ships the fallback shell for normal browser UAs on unlisted fallback: true paths", async () => { + // Counterpart of the crawler test — the bot-flip must not catch real + // browsers. Plain Chrome UA should still receive the loading shell. 
+ const res = await fetch(`${baseUrl}/products/non-bot-slug`, { + headers: { + "user-agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36", + }, + }); + expect(res.status).toBe(200); + const html = await res.text(); + expect(html).toContain("Loading product..."); + const match = html.match(/__NEXT_DATA__\s*=\s*(\{.*?\})\s*[;<]/); + expect(match).toBeTruthy(); + const nextData = JSON.parse(match![1]); + expect(nextData.isFallback).toBe(true); + }); + it("includes isFallback: false in __NEXT_DATA__", async () => { const res = await fetch(`${baseUrl}/products/widget`); const html = await res.text();