From 78af88d7b480823a1f789c7d5a7e94573694ff65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Fri, 27 Aug 2021 13:31:59 +0100 Subject: [PATCH] fix(tokenizer): Don't lose data on `.pause` (#927) --- src/Tokenizer.spec.ts | 24 ++++++++ src/Tokenizer.ts | 70 ++++++++++++------------ src/__snapshots__/Tokenizer.spec.ts.snap | 16 ++++++ 3 files changed, 75 insertions(+), 35 deletions(-) diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts index 5a71ee48f..8f5672cf4 100644 --- a/src/Tokenizer.spec.ts +++ b/src/Tokenizer.spec.ts @@ -41,4 +41,28 @@ describe("Tokenizer", () => { expect(tokenize("
")).toMatchSnapshot(); }); }); + + it("should not lose data when pausing", () => { + const log: unknown[][] = []; + const tokenizer = new Tokenizer( + {}, + new Proxy({} as any, { + get(_, prop) { + return (...args: unknown[]) => { + if (prop === "ontext") { + tokenizer.pause(); + } + log.push([prop, ...args]); + }; + }, + }) + ); + + tokenizer.write("& it up!"); + tokenizer.resume(); + tokenizer.resume(); + tokenizer.end(); + + expect(log).toMatchSnapshot(); + }); }); diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index a606e3a45..b88278b64 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -817,30 +817,27 @@ export default class Tokenizer { return !this.xmlMode && this.baseState === State.Text; } + /** + * Remove data that has already been consumed from the buffer. + */ private cleanup() { - if (this.sectionStart < 0) { - this.buffer = ""; - this.bufferOffset += this._index; - this._index = 0; - } else if (this.running) { - if (this._state === State.Text) { - if (this.sectionStart !== this._index) { - this.cbs.ontext(this.buffer.substr(this.sectionStart)); - } - this.buffer = ""; - this.bufferOffset += this._index; - this._index = 0; - } else if (this.sectionStart === this._index) { - // The section just started - this.buffer = ""; - this.bufferOffset += this._index; - this._index = 0; - } else { - // Remove everything unnecessary - this.buffer = this.buffer.substr(this.sectionStart); - this._index -= this.sectionStart; - this.bufferOffset += this.sectionStart; - } + // If we are inside of text, emit what we already have. + if ( + this.running && + this._state === State.Text && + this.sectionStart !== this._index + ) { + // TODO: We could emit attribute data here as well. + this.cbs.ontext(this.buffer.substr(this.sectionStart)); + this.sectionStart = this._index; + } + + const start = this.sectionStart < 0 ? this._index : this.sectionStart; + this.buffer = this.buffer.substr(start); + this._index -= start; + this.bufferOffset += start; + + if (this.sectionStart > 0) { this.sectionStart = 0; } } @@ -1000,6 +997,7 @@ export default class Tokenizer { this.cbs.onend(); } + /** Handle any trailing data. */ private handleTrailingData() { const data = this.buffer.substr(this.sectionStart); if ( @@ -1029,21 +1027,23 @@ export default class Tokenizer { this.decodeNumericEntity(16, false); // All trailing data will have been consumed } else if ( - this._state !== State.InTagName && - this._state !== State.BeforeAttributeName && - this._state !== State.BeforeAttributeValue && - this._state !== State.AfterAttributeName && - this._state !== State.InAttributeName && - this._state !== State.InAttributeValueSq && - this._state !== State.InAttributeValueDq && - this._state !== State.InAttributeValueNq && - this._state !== State.InClosingTagName + this._state === State.InTagName || + this._state === State.BeforeAttributeName || + this._state === State.BeforeAttributeValue || + this._state === State.AfterAttributeName || + this._state === State.InAttributeName || + this._state === State.InAttributeValueSq || + this._state === State.InAttributeValueDq || + this._state === State.InAttributeValueNq || + this._state === State.InClosingTagName ) { + /* + * If we are currently in an opening or closing tag, us not calling the + * respective callback signals that the tag should be ignored. + */ + } else { this.cbs.ontext(data); } - /* - * TODO add a way to remove current tag - */ } private getSection(): string { diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index 5b8d9202c..38a60dfff 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -1,5 +1,21 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`Tokenizer should not lose data when pausing 1`] = ` +Array [ + Array [ + "ontext", + "&", + ], + Array [ + "ontext", + " it up!", + ], + Array [ + "onend", + ], +] +`; + exports[`Tokenizer should support self-closing special tags for self-closing script tag 1`] = ` Array [ Array [