From 5d099c4b8ab1ea345920a7ee7e22f2ba50c33c53 Mon Sep 17 00:00:00 2001 From: EGOIST <0x142857@gmail.com> Date: Sat, 1 Aug 2020 01:31:39 +0800 Subject: [PATCH] feat: expose onBrowserPage option --- README.md | 12 ++++++++++++ package.json | 2 +- src/Crawler.ts | 40 ++++++++++++++++++++++++---------------- src/cli.ts | 3 +++ yarn.lock | 8 ++++---- 5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 15e0318..f7af404 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,18 @@ module.exports = { Now you should call `window.__my_snapshot__()` instead. +### Access Puppeteer browser page + +Access the [`page`](https://pptr.dev/#?product=Puppeteer&version=v5.2.1&show=api-class-page) instance, for example, to expose some functions from Node.js to browser: + +```js +module.exports = { + async onBrowserPage(page) { + await page.exposeFunction('md5', (content) => md5(content)) + }, +} +``` + ### Source directory This is the same as using CLI `presite ./path/to/your/spa`: diff --git a/package.json b/package.json index c3fd364..b6caff1 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,7 @@ "polka": "^0.5.2", "read-pkg-up": "^2.0.0", "serve-static": "^1.14.1", - "taki": "2.2.2", + "taki": "2.3.0", "update-notifier": "^4.1.0" }, "devDependencies": { diff --git a/src/Crawler.ts b/src/Crawler.ts index 6ef1525..9b556a7 100644 --- a/src/Crawler.ts +++ b/src/Crawler.ts @@ -4,6 +4,7 @@ import chalk from 'chalk' import { PromiseQueue } from '@egoist/promise-queue' import { Writer } from './Writer' import { Logger } from './Logger' +import { Page } from 'puppeteer-core' export const SPECIAL_EXTENSIONS_RE = /\.(xml|json)$/ @@ -14,16 +15,12 @@ const routeToFile = (route: string) => { return route.replace(/\/?$/, '/index.html') } -const getHref = (attrs: string) => { - const match = /href\s*=\s*(?:"(.*?)"|'(.*?)'|([^\s>]*))/.exec(attrs) - return match && (match[1] || match[2] || match[3]) -} - type CrawlerOptions = { hostname: string port: number options: { routes: string[] | (() => Promise) + onBrowserPage?: (page: Page) => void | Promise } writer: Writer logger: Logger @@ -48,13 +45,31 @@ export class Crawler { const queue = new PromiseQueue( async (route: string) => { const file = routeToFile(route) + let links: Set | undefined const html = await request({ url: `http://${hostname}:${port}${route}`, onBeforeRequest(url) { logger.log(`Crawling contents from ${chalk.cyan(url)}`) }, + async onBeforeClosingPage(page) { + links = new Set( + await page.evaluate( + ({ hostname, port }: { hostname: string; port: string }) => { + return Array.from(document.querySelectorAll('a')) + .filter((a) => { + return a.hostname === hostname && a.port === port + }) + .map((a) => a.pathname) + }, + { hostname, port: String(port) } + ) + ) + }, manually: SPECIAL_EXTENSIONS_RE.test(route) ? true : undefined, - onCreatedPage(page) { + async onCreatedPage(page) { + if (options.onBrowserPage) { + await options.onBrowserPage(page) + } page.on('console', (e) => { const type = e.type() // @ts-ignore @@ -68,16 +83,9 @@ export class Crawler { }, }) - // find all `` tags in exported html files and export links that are not yet exported - let match: RegExpExecArray | null = null - const LINK_RE = //gm - while ((match = LINK_RE.exec(html))) { - const href = getHref(match[1]) - if (href) { - const parsed = parseUrl(href) - if (!parsed.host && parsed.pathname) { - queue.add(parsed.pathname) - } + if (links && links.size > 0) { + for (const link of links) { + queue.add(link) } } diff --git a/src/cli.ts b/src/cli.ts index 389c933..f1b1985 100755 --- a/src/cli.ts +++ b/src/cli.ts @@ -4,6 +4,7 @@ import { cac } from 'cac' import chalk from 'chalk' import update from 'update-notifier' import JoyCon from 'joycon' +import { Page } from 'puppeteer-core' const pkg: typeof import('../package.json') = require('../package') @@ -36,6 +37,7 @@ async function main() { baseDir?: string outDir?: string routes?: string[] | (() => Promise) + onBrowserPage?: (page: Page) => void | Promise } let config: Required @@ -85,6 +87,7 @@ async function main() { port: server.port!, options: { routes: config.routes, + onBrowserPage: config.onBrowserPage, }, writer, logger, diff --git a/yarn.lock b/yarn.lock index ff79da4..16d583b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3155,10 +3155,10 @@ supports-color@^7.1.0: dependencies: has-flag "^4.0.0" -taki@2.2.2: - version "2.2.2" - resolved "https://registry.yarnpkg.com/taki/-/taki-2.2.2.tgz#79f44c1a04efbd171e7881caa980f667a5486abe" - integrity sha512-lKaHTw5RLFWbB4rs7vVo9O5UsC9/DIl51ijYKzgXQ44nHqgDxqPMHdCb2GnIthfC99lzvNdFup+qxjISKXV5tA== +taki@2.3.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/taki/-/taki-2.3.0.tgz#c5ad67af7e04eaaee8c11ba8736e51aed84fc2f5" + integrity sha512-CP4lDcqHWj2s03HrJUGF2jn+k/ABm4JUu2IZ8zLGyqPO/nhjxQVHBx11UaaNqg2gIoBIUSvEk5KgFE/AnCOAeQ== dependencies: debug "4.1.1" html-minifier "4.0.0"