Permalink
Browse files

Fix for parsing broken processing instructions.

Some HTML contains broken processing instructions, such as this one
encountered in the wild: `<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" />`.

The parser crashes on this since it only accepts `?>` (or end-of-file) as
the end of a processing instruction. This patch fixes it by also
allowing `>` or `/>` to end a processing instruction.
  • Loading branch information...
jkoops authored and etrepum committed Oct 15, 2010
1 parent 3bea608 commit 9e45e2ebf93caba04d8b09272ced7678dc9561a3
Showing with 14 additions and 0 deletions.
  1. +14 −0 src/mochiweb_html.erl
View
@@ -576,6 +576,10 @@ find_qgt(Bin, S=#decoder{offset=O}) ->
case Bin of
<<_:O/binary, "?>", _/binary>> ->
?ADV_COL(S, 2);
+ <<_:O/binary, ">", _/binary>> ->
+ ?ADV_COL(S, 1);
+ <<_:O/binary, "/>", _/binary>> ->
+ ?ADV_COL(S, 2);
%% tokenize_attributes takes care of this state:
%% <<_:O/binary, C, _/binary>> ->
%% find_qgt(Bin, ?INC_CHAR(S, C));
@@ -1236,5 +1240,15 @@ parse_missing_attr_name_test() ->
{<<"html">>, [ { <<"=">>, <<"=">> }, { <<"black">>, <<"black">> } ], [] },
mochiweb_html:parse(D0)),
ok.
+
+parse_broken_pi_test() -> % Regression test: a processing instruction closed by /> instead of ?> must still parse into a pi node carrying its attributes.
+ D0 = <<"<html><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></html>">>,
+ ?assertEqual(
+ {<<"html">>, [], [
+ { pi, <<"xml:namespace">>, [ { <<"prefix">>, <<"o">> },
+ { <<"ns">>, <<"urn:schemas-microsoft-com:office:office">> } ] }
+ ] },
+ mochiweb_html:parse(D0)),
+ ok.
-endif.

0 comments on commit 9e45e2e

Please sign in to comment.