diff --git a/lib/htmlparser.js b/lib/htmlparser.js
index fbaa31d..7f8f8b1 100644
--- a/lib/htmlparser.js
+++ b/lib/htmlparser.js
@@ -49,6 +49,7 @@ var Mode = {
     Tag: 'tag',
     Attr: 'attr',
     CData: 'cdata',
+    Doctype: 'doctype',
     Comment: 'comment'
 };
 
@@ -136,6 +137,8 @@ function Parser (builder, options) {
                 return this._parseAttr(this._state);
             case Mode.CData:
                 return this._parseCData(this._state);
+            case Mode.Doctype:
+                return this._parseDoctype(this._state);
             case Mode.Comment:
                 return this._parseComment(this._state);
         }
@@ -224,6 +227,11 @@ function Parser (builder, options) {
             state.pos += 8;
             return;
         }
+        if (!match[1] && match[2].substr(0, 8) === '!DOCTYPE') {
+            state.mode = Mode.Doctype;
+            state.pos += 8;
+            return;
+        }
         if (!state.done && (state.pos + match[0].length) === state.data.length) {
             //We're at the and of the data, might be incomplete
             state.needData = true;
@@ -400,6 +408,46 @@ function Parser (builder, options) {
         }
     };
 
+    /**
+     * Reads the remainder of a doctype declaration, up to the next '>'.
+     * state.pos is expected to sit just past the '!DOCTYPE' keyword: the code
+     * that recognises the keyword advances it by 8 before switching modes.
+     *
+     * With no '>' buffered and more input expected, the text seen so far is
+     * stashed in state.pendingText and state.needData is set. Otherwise one
+     * { type: 'doctype', data: ... } element is written and parsing resumes
+     * in Mode.Text just after the '>'.
+     */
+    Parser.prototype._parseDoctype = function Parser$_parseDoctype () {
+        var state = this._state;
+        var foundPos = state.data.indexOf('>', state.pos);
+        if (foundPos < 0 && state.done) {
+            // Input ended without a closing '>': emit whatever is left
+            foundPos = state.data.length;
+        }
+        if (foundPos < 0) {
+            // Declaration continues in a later chunk: keep this part and wait
+            if (!state.pendingText) {
+                state.pendingText = [];
+            }
+            state.pendingText.push(state.data.substring(state.pos));
+            state.pos = state.data.length;
+            state.needData = true;
+        } else {
+            var text;
+            if (state.pendingText) {
+                state.pendingText.push(state.data.substring(state.pos, foundPos));
+                text = state.pendingText.join('');
+                state.pendingText = null;
+            } else {
+                text = state.data.substring(state.pos, foundPos);
+            }
+            this._write({ type: Mode.Doctype, data: text });
+            state.mode = Mode.Text;
+            state.pos = foundPos + 1;
+        }
+    };
+
     Parser.re_parseComment_findEnding = /\-{1,2}$/;
     Parser.prototype._parseComment = function Parser$_parseComment () {
         var state = this._state;
diff --git a/tests/parser.js b/tests/parser.js
index 
bf70ac1..b2df208 100644 --- a/tests/parser.js +++ b/tests/parser.js @@ -367,6 +367,11 @@ exports['html inside comment'] = { , expected: [{ type: 'comment', data: '
foo
'}] }; +exports['transitional doctype'] = { + data: ['<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'] + , expected: [{ type: 'doctype', data: ' HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"'}] +}; + exports['html inside cdata'] = { data: ['foo ]]>'] , expected: [{ type: 'cdata', data: '
foo
'}]