Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Fix for parsing broken processing instructions.

Some HTML contains broken processing instructions, such as this one
encountered in the wild: `<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" />`.

The parser crashes on such input because it accepts only `?>` (or end-of-file)
as the end of a processing instruction. This patch fixes that by also
accepting `>` or `/>` as the end of a processing instruction.
  • Loading branch information...
commit 9e45e2ebf93caba04d8b09272ced7678dc9561a3 1 parent 3bea608
Jeroen Koops jkoops authored etrepum committed

Showing 1 changed file with 14 additions and 0 deletions. Show diff stats Hide diff stats

  1. +14 0 src/mochiweb_html.erl
14 src/mochiweb_html.erl
@@ -576,6 +576,10 @@ find_qgt(Bin, S=#decoder{offset=O}) ->
576 576 case Bin of
577 577 <<_:O/binary, "?>", _/binary>> ->
578 578 ?ADV_COL(S, 2);
  579 + <<_:O/binary, ">", _/binary>> ->
  580 + ?ADV_COL(S, 1);
  581 + <<_:O/binary, "/>", _/binary>> ->
  582 + ?ADV_COL(S, 2);
579 583 %% tokenize_attributes takes care of this state:
580 584 %% <<_:O/binary, C, _/binary>> ->
581 585 %% find_qgt(Bin, ?INC_CHAR(S, C));
@@ -1236,5 +1240,15 @@ parse_missing_attr_name_test() ->
1236 1240 {<<"html">>, [ { <<"=">>, <<"=">> }, { <<"black">>, <<"black">> } ], [] },
1237 1241 mochiweb_html:parse(D0)),
1238 1242 ok.
  1243 +
  1244 +parse_broken_pi_test() ->
  1245 + D0 = <<"<html><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></html>">>,
  1246 + ?assertEqual(
  1247 + {<<"html">>, [], [
  1248 + { pi, <<"xml:namespace">>, [ { <<"prefix">>, <<"o">> },
  1249 + { <<"ns">>, <<"urn:schemas-microsoft-com:office:office">> } ] }
  1250 + ] },
  1251 + mochiweb_html:parse(D0)),
  1252 + ok.
1239 1253
1240 1254 -endif.

0 comments on commit 9e45e2e

Please sign in to comment.
Something went wrong with that request. Please try again.