diff --git a/Readme.md b/Readme.md index 5ae6a53e7e..c90a1d2431 100644 --- a/Readme.md +++ b/Readme.md @@ -134,6 +134,11 @@ The options in the `xml` object are taken directly from [htmlparser2](https://gi For a full list of options and their effects, see [domhandler](https://github.com/fb55/DomHandler) and [htmlparser2's options](https://github.com/fb55/htmlparser2/wiki/Parser-options). +#### Using `htmlparser2` + +Cheerio ships with two parsers, `parse5` and `htmlparser2`. The +former is the default for HTML, the latter the default for XML. + Some users may wish to parse markup with the `htmlparser2` library, and traverse/manipulate the resulting structure with Cheerio. This may be the case for those upgrading from pre-1.0 releases of Cheerio (which relied on @@ -156,6 +161,13 @@ const dom = htmlparser2.parseDocument(document, options); const $ = cheerio.load(dom); ``` +If you want to save some bytes, you can use Cheerio's _slim_ export, which +always uses `htmlparser2`: + +```js +const cheerio = require('cheerio/lib/slim'); +``` + ### Selectors Cheerio's selector implementation is nearly identical to jQuery's, so the API is very similar. @@ -210,21 +222,6 @@ cheerio.html($('.pear')); //=>
<li class="pear">Pear</li>
  • ``` -By default, `html` will leave some tags open. Sometimes you may instead want to render a valid XML document. For example, you might parse the following XML snippet: - -```js -const $ = cheerio.load( - '' -); -``` - -... and later want to render to XML. To do this, you can use the 'xml' utility function: - -```js -$.xml(); -//=> -``` - You may also render the text content of a Cheerio object using the `text` static method: ```js @@ -246,7 +243,7 @@ $.prototype.logHtml = function () { $('body').logHtml(); // logs "Hello, world!" to the console ``` -If you're using TypeScript, you should also add a type definition for your new method: +If you're using TypeScript, you should add a type definition for your new method: ```ts declare module 'cheerio' { diff --git a/src/api/manipulation.ts b/src/api/manipulation.ts index 31a7a339ad..21d3552a71 100644 --- a/src/api/manipulation.ts +++ b/src/api/manipulation.ts @@ -5,8 +5,8 @@ */ import { Node, NodeWithChildren, Element, Text, hasChildren } from 'domhandler'; -import { default as parse, update as updateDOM } from '../parse'; -import { html as staticHtml, text as staticText } from '../static'; +import { update as updateDOM } from '../parse'; +import { text as staticText } from '../static'; import { domEach, cloneDom, isTag, isHtml, isCheerio } from '../utils'; import { removeElement } from 'domutils'; import type { Cheerio } from '../cheerio'; @@ -39,7 +39,7 @@ export function _makeDomArray( ); } if (typeof elem === 'string') { - return parse(elem, this.options, false).children; + return this._parse(elem, this.options, false).children; } return clone ? cloneDom([elem]) : [elem]; } @@ -63,7 +63,7 @@ function _insert( if (!hasChildren(el)) return; const domSrc = typeof elems[0] === 'function' - ? elems[0].call(el, i, staticHtml(el.children)) + ? 
elems[0].call(el, i, this._render(el.children)) : (elems as Node[]); const dom = this._makeDomArray(domSrc, i < lastIdx); @@ -599,7 +599,7 @@ export function after( const domSrc = typeof elems[0] === 'function' - ? elems[0].call(el, i, staticHtml(el.children)) + ? elems[0].call(el, i, this._render(el.children)) : (elems as Node[]); const dom = this._makeDomArray(domSrc, i < lastIdx); @@ -713,7 +713,7 @@ export function before( const domSrc = typeof elems[0] === 'function' - ? elems[0].call(el, i, staticHtml(el.children)) + ? elems[0].call(el, i, this._render(el.children)) : (elems as Node[]); const dom = this._makeDomArray(domSrc, i < lastIdx); @@ -923,7 +923,7 @@ export function html( if (str === undefined) { const el = this[0]; if (!el || !hasChildren(el)) return null; - return staticHtml(el.children, this.options); + return this._render(el.children); } // Keep main options unchanged @@ -939,7 +939,7 @@ export function html( const content = isCheerio(str) ? str.toArray() - : parse(`${str}`, opts, false).children; + : this._parse(`${str}`, opts, false).children; updateDOM(content, el); }); @@ -952,7 +952,7 @@ export function html( * @returns The rendered document. */ export function toString(this: Cheerio): string { - return staticHtml(this, this.options); + return this._render(this); } /** @@ -992,9 +992,9 @@ export function text( } if (typeof str === 'function') { // Function support - return domEach(this, (el, i) => { - text.call(this._make(el), str.call(el, i, staticText([el]))); - }); + return domEach(this, (el, i) => + this._make(el).text(str.call(el, i, staticText([el]))) + ); } // Append text node to each selected elements diff --git a/src/cheerio.ts b/src/cheerio.ts index 96e58fd02c..d8b88c3821 100644 --- a/src/cheerio.ts +++ b/src/cheerio.ts @@ -64,6 +64,30 @@ export abstract class Cheerio implements ArrayLike { dom: ArrayLike | T | string, context?: BasicAcceptedElems ): Cheerio; + + /** + * Parses some content. 
+ * + * @private + * @param content - Content to parse. + * @param options - Options for parsing. + * @param isDocument - Allows parser to be switched to fragment mode. + * @returns A document containing the `content`. + */ + abstract _parse( + content: string | Document | Node | Node[] | Buffer, + options: InternalOptions, + isDocument: boolean + ): Document; + + /** + * Render an element or a set of elements. + * + * @private + * @param dom - DOM to render. + * @returns The rendered DOM. + */ + abstract _render(dom: Node | ArrayLike): string; } export interface Cheerio diff --git a/src/index.ts b/src/index.ts index be3d85ceac..f9fe7c21fd 100644 --- a/src/index.ts +++ b/src/index.ts @@ -4,6 +4,7 @@ * @category Cheerio */ export type { Cheerio } from './cheerio'; + /** * Types used in signatures of Cheerio methods. * @@ -22,8 +23,38 @@ export type { */ export type { Node, NodeWithChildren, Element, Document } from 'domhandler'; -export * from './load'; -import { load } from './load'; +export type { CheerioAPI } from './load'; +import { getLoad } from './load'; +import { getParse } from './parse'; +import { renderWithParse5, parseWithParse5 } from './parsers/parse5-adapter'; +import renderWithHtmlparser2 from 'dom-serializer'; +import { parseDocument as parseWithHtmlparser2 } from 'htmlparser2'; + +const parse = getParse((content, options, isDocument) => + options.xmlMode || options._useHtmlParser2 + ? parseWithHtmlparser2(content, options) + : parseWithParse5(content, options, isDocument) +); + +// Duplicate docs due to https://github.com/TypeStrong/typedoc/issues/1616 +/** + * Create a querying function, bound to a document created from the provided markup. + * + * Note that similar to web browser contexts, this operation may introduce + * `<html>`, `<head>`, and `<body>` elements; set `isDocument` to `false` to + * switch to fragment mode and disable this. + * + * @param content - Markup to be loaded. + * @param options - Options for the created instance. 
+ * @param isDocument - Allows parser to be switched to fragment mode. + * @returns The loaded document. + * @see {@link https://cheerio.js.org#loading} for additional usage information. + */ +export const load = getLoad(parse, (dom, options) => + options.xmlMode || options._useHtmlParser2 + ? renderWithHtmlparser2(dom, options) + : renderWithParse5(dom) +); /** * The default cheerio instance. diff --git a/src/load.ts b/src/load.ts index 481e458651..f58c92a6c1 100644 --- a/src/load.ts +++ b/src/load.ts @@ -7,20 +7,17 @@ import { import * as staticMethods from './static'; import { Cheerio } from './cheerio'; import { isHtml, isCheerio } from './utils'; -import parse from './parse'; import type { Node, Document, Element } from 'domhandler'; -import type * as Load from './load'; import { SelectorType, BasicAcceptedElems } from './types'; type StaticType = typeof staticMethods; -type LoadType = typeof Load; /** * A querying function, bound to a document created from the provided markup. * * Also provides several helper methods for dealing with the document as a whole. */ -export interface CheerioAPI extends StaticType, LoadType { +export interface CheerioAPI extends StaticType { /** * This selector method is the starting point for traversing and manipulating * the document. Like jQuery, it's the primary method for selecting elements @@ -71,145 +68,165 @@ export interface CheerioAPI extends StaticType, LoadType { /** Mimic jQuery's prototype alias for plugin authors. */ fn: typeof Cheerio.prototype; + + load: ReturnType; } -/** - * Create a querying function, bound to a document created from the provided - * markup. Note that similar to web browser contexts, this operation may - * introduce ``, ``, and `` elements; set `isDocument` to - * `false` to switch to fragment mode and disable this. - * - * @param content - Markup to be loaded. - * @param options - Options for the created instance. - * @param isDocument - Allows parser to be switched to fragment mode. 
- * @returns The loaded document. - * @see {@link https://cheerio.js.org#loading} for additional usage information. - */ -export function load( - content: string | Node | Node[] | Buffer, - options?: CheerioOptions | null, - isDocument = true -): CheerioAPI { - if ((content as string | null) == null) { - throw new Error('cheerio.load() expects a string'); - } - - const internalOpts = { ...defaultOptions, ...flattenOptions(options) }; - const initialRoot = parse(content, internalOpts, isDocument); - - /** Create an extended class here, so that extensions only live on one instance. */ - class LoadedCheerio extends Cheerio { - _make( - selector?: ArrayLike | T | string, - context?: BasicAcceptedElems | null - ): Cheerio { - const cheerio = initialize(selector, context); - cheerio.prevObject = this; - - return cheerio; +export function getLoad( + parse: typeof Cheerio.prototype._parse, + render: (dom: Node | ArrayLike, options: InternalOptions) => string +) { + /** + * Create a querying function, bound to a document created from the provided markup. + * + * Note that similar to web browser contexts, this operation may introduce + * ``, ``, and `` elements; set `isDocument` to `false` to + * switch to fragment mode and disable this. + * + * @param content - Markup to be loaded. + * @param options - Options for the created instance. + * @param isDocument - Allows parser to be switched to fragment mode. + * @returns The loaded document. + * @see {@link https://cheerio.js.org#loading} for additional usage information. 
+ */ + return function load( + content: string | Node | Node[] | Buffer, + options?: CheerioOptions | null, + isDocument = true + ): CheerioAPI { + if ((content as string | null) == null) { + throw new Error('cheerio.load() expects a string'); } - } - function initialize( - selector?: ArrayLike | T | S, - context?: BasicAcceptedElems | null, - root: BasicAcceptedElems = initialRoot, - opts?: CheerioOptions - ): Cheerio { - type Result = S extends SelectorType ? Element : T; - - // $($) - if (selector && isCheerio(selector)) return selector; - - const options = { - ...internalOpts, - ...flattenOptions(opts), - }; - const r = - typeof root === 'string' - ? [parse(root, options, false)] - : 'length' in root - ? root - : [root]; - const rootInstance = isCheerio(r) - ? r - : new LoadedCheerio(r, null, options); - // Add a cyclic reference, so that calling methods on `_root` never fails. - rootInstance._root = rootInstance; - - // $(), $(null), $(undefined), $(false) - if (!selector) { - return new LoadedCheerio(undefined, rootInstance, options); + const internalOpts = { ...defaultOptions, ...flattenOptions(options) }; + const initialRoot = parse(content, internalOpts, isDocument); + + /** Create an extended class here, so that extensions only live on one instance. */ + class LoadedCheerio extends Cheerio { + _make( + selector?: ArrayLike | T | string, + context?: BasicAcceptedElems | null + ): Cheerio { + const cheerio = initialize(selector, context); + cheerio.prevObject = this; + + return cheerio; + } + + _parse( + content: string | Document | Node | Node[] | Buffer, + options: InternalOptions, + isDocument: boolean + ) { + return parse(content, options, isDocument); + } + + _render(dom: Node | ArrayLike): string { + return render(dom, this.options); + } } - const elements: Node[] | undefined = - typeof selector === 'string' && isHtml(selector) - ? // $() - parse(selector, options, false).children - : isNode(selector) - ? 
// $(dom) - [selector] - : Array.isArray(selector) - ? // $([dom]) - selector - : undefined; - - const instance = new LoadedCheerio(elements, rootInstance, options); - - if (elements || !selector) { - return instance as any; - } + function initialize( + selector?: ArrayLike | T | S, + context?: BasicAcceptedElems | null, + root: BasicAcceptedElems = initialRoot, + opts?: CheerioOptions + ): Cheerio { + type Result = S extends SelectorType ? Element : T; + + // $($) + if (selector && isCheerio(selector)) return selector; + + const options = { + ...internalOpts, + ...flattenOptions(opts), + }; + const r = + typeof root === 'string' + ? [parse(root, options, false)] + : 'length' in root + ? root + : [root]; + const rootInstance = isCheerio(r) + ? r + : new LoadedCheerio(r, null, options); + // Add a cyclic reference, so that calling methods on `_root` never fails. + rootInstance._root = rootInstance; + + // $(), $(null), $(undefined), $(false) + if (!selector) { + return new LoadedCheerio(undefined, rootInstance, options); + } + + const elements: Node[] | undefined = + typeof selector === 'string' && isHtml(selector) + ? // $() + parse(selector, options, false).children + : isNode(selector) + ? // $(dom) + [selector] + : Array.isArray(selector) + ? // $([dom]) + selector + : undefined; + + const instance = new LoadedCheerio(elements, rootInstance, options); + + if (elements || !selector) { + return instance as any; + } + + if (typeof selector !== 'string') throw new Error(''); + + // We know that our selector is a string now. + let search = selector; + + const searchContext: Cheerio | undefined = !context + ? // If we don't have a context, maybe we have a root, from loading + rootInstance + : typeof context === 'string' + ? isHtml(context) + ? // $('li', '
<ul>...</ul>
    ') + new LoadedCheerio( + [parse(context, options, false)], + rootInstance, + options + ) + : // $('li', 'ul') + ((search = `${context} ${search}` as S), rootInstance) + : isCheerio(context) + ? // $('li', $) + context + : // $('li', node), $('li', [nodes]) + new LoadedCheerio( + Array.isArray(context) ? context : [context], + rootInstance, + options + ); - if (typeof selector !== 'string') throw new Error(''); + // If we still don't have a context, return + if (!searchContext) return instance as any; - // We know that our selector is a string now. - let search = selector; + /* + * #id, .class, tag + */ + return searchContext.find(search) as Cheerio; + } - const searchContext: Cheerio | undefined = !context - ? // If we don't have a context, maybe we have a root, from loading - rootInstance - : typeof context === 'string' - ? isHtml(context) - ? // $('li', '
<ul>...</ul>
    ') - new LoadedCheerio( - [parse(context, options, false)], - rootInstance, - options - ) - : // $('li', 'ul') - ((search = `${context} ${search}` as S), rootInstance) - : isCheerio(context) - ? // $('li', $) - context - : // $('li', node), $('li', [nodes]) - new LoadedCheerio( - Array.isArray(context) ? context : [context], - rootInstance, - options - ); - - // If we still don't have a context, return - if (!searchContext) return instance as any; - - /* - * #id, .class, tag - */ - return searchContext.find(search) as Cheerio; - } - - // Add in static methods & properties - Object.assign(initialize, staticMethods, { - load, - // `_root` and `_options` are used in static methods. - _root: initialRoot, - _options: internalOpts, - // Add `fn` for plugins - fn: LoadedCheerio.prototype, - // Add the prototype here to maintain `instanceof` behavior. - prototype: LoadedCheerio.prototype, - }); - - return initialize as CheerioAPI; + // Add in static methods & properties + Object.assign(initialize, staticMethods, { + load, + // `_root` and `_options` are used in static methods. + _root: initialRoot, + _options: internalOpts, + // Add `fn` for plugins + fn: LoadedCheerio.prototype, + // Add the prototype here to maintain `instanceof` behavior. + prototype: LoadedCheerio.prototype, + }); + + return initialize as CheerioAPI; + }; } function isNode(obj: any): obj is Node { diff --git a/src/parse.spec.ts b/src/parse.spec.ts index b8f9c360bb..a987c151bc 100644 --- a/src/parse.spec.ts +++ b/src/parse.spec.ts @@ -1,7 +1,16 @@ import type { Document, Element } from 'domhandler'; -import parse from './parse'; +import { getParse } from './parse'; import defaultOpts from './options'; +import { parseDocument as parseWithHtmlparser2 } from 'htmlparser2'; +import { parseWithParse5 } from './parsers/parse5-adapter'; + +const parse = getParse((content, options, isDocument) => + options.xmlMode || options._useHtmlParser2 + ? 
parseWithHtmlparser2(content, options) + : parseWithParse5(content, options, isDocument) +); + // Tags const basic = ''; const siblings = '

    '; diff --git a/src/parse.ts b/src/parse.ts index 36e29138a3..25fbb2464d 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -1,6 +1,4 @@ import { removeElement } from 'domutils'; -import { parse as parseWithHtmlparser2 } from './parsers/htmlparser2-adapter'; -import { parse as parseWithParse5 } from './parsers/parse5-adapter'; import { Node, Document, @@ -12,35 +10,41 @@ import type { InternalOptions } from './options'; /* * Parser */ -export default function parse( - content: string | Document | Node | Node[] | Buffer, - options: InternalOptions, - isDocument: boolean -): Document { - if (typeof Buffer !== 'undefined' && Buffer.isBuffer(content)) { - content = content.toString(); - } +export function getParse( + parser: ( + content: string, + options: InternalOptions, + isDocument: boolean + ) => Document +) { + return function parse( + content: string | Document | Node | Node[] | Buffer, + options: InternalOptions, + isDocument: boolean + ): Document { + if (typeof Buffer !== 'undefined' && Buffer.isBuffer(content)) { + content = content.toString(); + } - if (typeof content === 'string') { - return options.xmlMode || options._useHtmlParser2 - ? 
parseWithHtmlparser2(content, options) - : parseWithParse5(content, options, isDocument); - } + if (typeof content === 'string') { + return parser(content, options, isDocument); + } - const doc = content as Node | Node[] | Document; + const doc = content as Node | Node[] | Document; - if (!Array.isArray(doc) && checkIsDocument(doc)) { - // If `doc` is already a root, just return it - return doc; - } + if (!Array.isArray(doc) && checkIsDocument(doc)) { + // If `doc` is already a root, just return it + return doc; + } - // Add conent to new root element - const root = new Document([]); + // Add content to new root element + const root = new Document([]); - // Update the DOM using the root - update(doc, root); + // Update the DOM using the root + update(doc, root); - return root; + return root; + }; } /** diff --git a/src/parsers/htmlparser2-adapter.ts b/src/parsers/htmlparser2-adapter.ts deleted file mode 100644 index af3624041c..0000000000 --- a/src/parsers/htmlparser2-adapter.ts +++ /dev/null @@ -1,2 +0,0 @@ -export { parseDocument as parse } from 'htmlparser2'; -export { default as render } from 'dom-serializer'; diff --git a/src/parsers/parse5-adapter.ts b/src/parsers/parse5-adapter.ts index fbc9c27c1c..1b8f0c7bfe 100644 --- a/src/parsers/parse5-adapter.ts +++ b/src/parsers/parse5-adapter.ts @@ -7,7 +7,7 @@ interface Parse5Options extends InternalOptions { context?: Node; } -export function parse( +export function parseWithParse5( content: string, options: Parse5Options, isDocument?: boolean @@ -30,7 +30,7 @@ export function parse( parseFragment(context, content, opts); } -export function render(dom: Node | ArrayLike): string { +export function renderWithParse5(dom: Node | ArrayLike): string { /* * `dom-serializer` passes over the special "root" node and renders the * node's children in its place. 
To mimic this behavior with `parse5`, an diff --git a/src/slim.ts b/src/slim.ts new file mode 100644 index 0000000000..1dfe839cfa --- /dev/null +++ b/src/slim.ts @@ -0,0 +1,35 @@ +/** @file Alternative Entry point for Cheerio, excluding parse5. */ + +export type { + Cheerio, + CheerioAPI, + CheerioOptions, + HTMLParser2Options, + Node, + NodeWithChildren, + Element, + Document, +} from '.'; + +/** + * Types used in signatures of Cheerio methods. + * + * @category Cheerio + */ +export * from './types'; + +import { getLoad } from './load'; +import { getParse } from './parse'; +import render from 'dom-serializer'; +import { parseDocument } from 'htmlparser2'; + +/** + * Create a querying function, bound to a document created from the provided markup. + * + * @param content - Markup to be loaded. + * @param options - Options for the created instance. + * @param isDocument - Always `false` here, as we are always using `htmlparser2`. + * @returns The loaded document. + * @see {@link https://cheerio.js.org#loading} for additional usage information. + */ +export const load = getLoad(getParse(parseDocument), render); diff --git a/src/static.ts b/src/static.ts index b5b2508fcb..83aff66253 100644 --- a/src/static.ts +++ b/src/static.ts @@ -1,3 +1,4 @@ +import { BasicAcceptedElems } from './types'; import type { CheerioAPI, Cheerio } from '.'; import { Node, Document, isText, hasChildren } from 'domhandler'; import { @@ -6,10 +7,7 @@ import { default as defaultOptions, flatten as flattenOptions, } from './options'; -import { select } from 'cheerio-select'; import { ElementType } from 'htmlparser2'; -import { render as renderWithParse5 } from './parsers/parse5-adapter'; -import { render as renderWithHtmlparser2 } from './parsers/htmlparser2-adapter'; /** * Helper function to render a DOM. @@ -20,21 +18,13 @@ import { render as renderWithHtmlparser2 } from './parsers/htmlparser2-adapter'; * @returns The rendered document. 
*/ function render( - that: CheerioAPI | undefined, - dom: ArrayLike | Node | string | undefined, + that: CheerioAPI, + dom: BasicAcceptedElems | undefined, options: InternalOptions ): string { - const toRender = dom - ? typeof dom === 'string' - ? select(dom, that?._root ?? [], options) - : dom - : that?._root.children; + if (!that) return ''; - if (!toRender) return ''; - - return options.xmlMode || options._useHtmlParser2 - ? renderWithHtmlparser2(toRender, options) - : renderWithParse5(toRender); + return that(dom ?? that._root.children, null, undefined, options).toString(); } /** @@ -44,9 +34,11 @@ function render( * @returns Whether the object is an options object. */ function isOptions( - dom?: string | ArrayLike | Node | InternalOptions | null -): dom is InternalOptions { + dom?: BasicAcceptedElems | CheerioOptions | null, + options?: CheerioOptions +): dom is CheerioOptions { return ( + !options && typeof dom === 'object' && dom != null && !('length' in dom) && @@ -60,7 +52,7 @@ function isOptions( * @param options - Options for the renderer. * @returns The rendered document. */ -export function html(this: CheerioAPI | void, options?: CheerioOptions): string; +export function html(this: CheerioAPI, options?: CheerioOptions): string; /** * Renders the document. * @@ -69,13 +61,13 @@ export function html(this: CheerioAPI | void, options?: CheerioOptions): string; * @returns The rendered document. 
*/ export function html( - this: CheerioAPI | void, - dom?: string | ArrayLike | Node, + this: CheerioAPI, + dom?: BasicAcceptedElems, options?: CheerioOptions ): string; export function html( - this: CheerioAPI | void, - dom?: string | ArrayLike | Node | CheerioOptions, + this: CheerioAPI, + dom?: BasicAcceptedElems | CheerioOptions, options?: CheerioOptions ): string { /* @@ -84,10 +76,7 @@ export function html( * check dom argument for dom element specific properties * assume there is no 'length' or 'type' properties in the options object */ - if (!options && isOptions(dom)) { - options = dom; - dom = undefined; - } + const toRender = isOptions(dom) ? ((options = dom), undefined) : dom; /* * Sometimes `$.html()` is used without preloading html, @@ -95,15 +84,11 @@ export function html( */ const opts = { ...defaultOptions, - ...(this ? this._options : {}), + ...this?._options, ...flattenOptions(options ?? {}), }; - return render( - this || undefined, - dom as string | Cheerio | Node | undefined, - opts - ); + return render(this, toRender, opts); } /** @@ -112,10 +97,7 @@ export function html( * @param dom - Element to render. * @returns THe rendered document. */ -export function xml( - this: CheerioAPI, - dom?: string | ArrayLike | Node -): string { +export function xml(this: CheerioAPI, dom?: BasicAcceptedElems): string { const options = { ...this._options, xmlMode: true }; return render(this, dom, options);