Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Xml compliance against control characters #62

Merged
merged 3 commits into from
May 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion c_src/exml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ namespace {
ErlNifResourceType *parser_type;

constexpr int default_parse_flags() {
return rapidxml::parse_no_string_terminators;
return rapidxml::parse_no_string_terminators | rapidxml::parse_validate_control_chars;
}

constexpr int parse_one() {
Expand Down
62 changes: 57 additions & 5 deletions c_src/rapidxml.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,11 @@ namespace rapidxml
//! and duplicate attributes (with different prefixes)
const int parse_validate_xmlns = 0x4000;

//! Parse flag instructing the parser to validate control characters.
//! https://www.w3.org/TR/2008/REC-xml-20081126/
//! Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
const int parse_validate_control_chars = 0x8000;

// Compound flags

//! Parse flags which represent default behaviour of the parser.
Expand Down Expand Up @@ -312,6 +317,7 @@ namespace rapidxml
template<int Dummy>
struct lookup_tables
{
static const unsigned char lookup_control_points[256]; // Forbidden control characters
static const unsigned char lookup_whitespace[256]; // Whitespace table
static const unsigned char lookup_node_name[256]; // Node name table
static const unsigned char lookup_element_name[256]; // Element name table
Expand Down Expand Up @@ -1745,6 +1751,15 @@ namespace rapidxml
///////////////////////////////////////////////////////////////////////
// Internal character utility functions

// Detect characters accepted by the control-character table:
// test() yields non-zero for an allowed byte and 0 for a forbidden one
struct control_points_pred
{
    static unsigned char test(Ch ch)
    {
        const unsigned char index = static_cast<unsigned char>(ch);
        return internal::lookup_tables<0>::lookup_control_points[index];
    }
};

// Detect whitespace character
struct whitespace_pred
{
Expand Down Expand Up @@ -1888,11 +1903,22 @@ namespace rapidxml
static void skip(Ch *&text)
{
Ch *tmp = text;
while (StopPred::test(*tmp))
while (StopPred::test(*tmp)) {
check_control<control_points_pred, Flags>(tmp);
++tmp;
}
text = tmp;
}

// Raise a parse error at the current position when control-character
// validation is enabled (parse_validate_control_chars set in Flags) and
// ControlPred rejects the character that text points at.
// The character is only inspected when the flag is set.
template<class ControlPred, int Flags>
static void check_control(Ch *&text)
{
    const bool validation_enabled = (Flags & parse_validate_control_chars) != 0;
    if (validation_enabled && !ControlPred::test(*text))
    {
        RAPIDXML_PARSE_ERROR("unexpected control character", text);
    }
}

// Skip characters until predicate evaluates to true while doing the following:
// - replacing XML character entity references with proper characters (&apos; &amp; &quot; &lt; &gt; &#...;)
// - condensing whitespace sequences to single space character
Expand Down Expand Up @@ -2074,6 +2100,7 @@ namespace rapidxml
while (text[0] != Ch('?') || text[1] != Ch('>'))
{
if (!text[0]) RAPIDXML_PARSE_ERROR("unexpected end of data", text);
check_control<control_points_pred, Flags>(text);
++text;
}
text += 2; // Skip '?>'
Expand Down Expand Up @@ -2107,6 +2134,7 @@ namespace rapidxml
while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>'))
{
if (!text[0]) RAPIDXML_PARSE_ERROR("unexpected end of data", text);
check_control<control_points_pred, Flags>(text);
++text;
}
text += 3; // Skip '-->'
Expand All @@ -2120,6 +2148,7 @@ namespace rapidxml
while (text[0] != Ch('-') || text[1] != Ch('-') || text[2] != Ch('>'))
{
if (!text[0]) RAPIDXML_PARSE_ERROR("unexpected end of data", text);
check_control<control_points_pred, Flags>(text);
++text;
}

Expand Down Expand Up @@ -2328,8 +2357,8 @@ namespace rapidxml
// Skip until end of cdata
while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>'))
{
if (!text[0])
RAPIDXML_PARSE_ERROR("unexpected end of data", text);
if (!text[0]) RAPIDXML_PARSE_ERROR("unexpected end of data", text);
check_control<control_points_pred, Flags>(text);
++text;
}
text += 3; // Skip ]]>
Expand All @@ -2340,8 +2369,8 @@ namespace rapidxml
Ch *value = text;
while (text[0] != Ch(']') || text[1] != Ch(']') || text[2] != Ch('>'))
{
if (!text[0])
RAPIDXML_PARSE_ERROR("unexpected end of data", text);
if (!text[0]) RAPIDXML_PARSE_ERROR("unexpected end of data", text);
check_control<control_points_pred, Flags>(text);
++text;
}

Expand Down Expand Up @@ -2655,6 +2684,29 @@ namespace rapidxml
namespace internal
{

// Byte-indexed acceptance table derived from the XML 1.0 Char production:
// Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
// An entry is 1 when the byte is accepted and 0 when it is a forbidden
// control character (0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F).
// Every byte from 0x20 upwards, including 0x80-0xFF, is accepted, so this
// table on its own cannot reject the multi-byte exclusions of the production
// (#xD800-#xDFFF, #xFFFE, #xFFFF).
template<int Dummy>
const unsigned char lookup_tables<Dummy>::lookup_control_points[256] =
{
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F
};

// Whitespace (space \n \r \t)
template<int Dummy>
const unsigned char lookup_tables<Dummy>::lookup_whitespace[256] =
Expand Down
44 changes: 40 additions & 4 deletions test/exml_properties_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ p(Name, Property) ->
(proper:conjunction([{Name, Property}]),
[100, long_result, {to_file, user}])).

%% A forbidden control character (16#1B) placed in element content
%% must make exml:parse/1 return an error tuple.
vector_1_forbidden_control_char_test() ->
    Doc = <<"<body>", 16#1B, "</body>">>,
    ?assertMatch({error, _}, exml:parse(Doc)).

%% A forbidden control character (16#1B) placed right after the tag name,
%% in both the opening and the closing tag, must make exml:parse/1 return
%% an error tuple.
vector_2_forbidden_control_char_test() ->
    Doc = <<"<body", 16#1B, "></body", 16#1B, ">">>,
    ?assertMatch({error, _}, exml:parse(Doc)).

%% A forbidden control character (16#1B) placed inside an attribute value
%% must make exml:parse/1 return an error tuple.
vector_3_forbidden_control_char_test() ->
    Doc = <<"<body lang='en' bad='", 16#1B, "'></body>">>,
    ?assertMatch({error, _}, exml:parse(Doc)).

%% Property: every document produced by utf8_doc_bad() is reported as not
%% parseable by is_parseable/1.
%% The property label previously repeated the one used by parse_test/0,
%% which described the opposite expectation; it now names what is checked.
fail_forbidden_control_char_test() ->
    p("No xml with forbidden control characters can be parsed",
      ?FORALL(Doc, utf8_doc_bad(),
              not is_parseable(Doc))).

parse_test() ->
p("All valid xml cdata can be parsed",
?FORALL(Doc, utf8_doc(),
Expand Down Expand Up @@ -49,33 +63,55 @@ parse(Doc) ->
utf8_doc() ->
?LET({{ElOpen,ElClose}, Cdata},
{xml_open_close(), xml_cdata()},
unicode:characters_to_binary
(ElOpen ++ Cdata ++ ElClose)).
unicode:characters_to_binary(ElOpen ++ Cdata ++ ElClose)).

%% Generator: a UTF-8 binary made of an opening tag, a text body drawn from
%% utf8_text_bad(), and the matching closing tag. Tags come from
%% xml_open_close_maybe_bad().
utf8_doc_bad() ->
    ?LET({{Open, Close}, Text},
         {xml_open_close_maybe_bad(), utf8_text_bad()},
         unicode:characters_to_binary(lists:append([Open, Text, Close]))).

%% Generator: an {OpeningTag, ClosingTag} pair of flat strings sharing one
%% tag name drawn from tagname_text().
xml_open_close() ->
    ?LET(Name, tagname_text(),
         {lists:flatten(["<", Name, ">"]),
          lists:flatten(["</", Name, ">"])}).

%% Generator: an {OpeningTag, ClosingTag} pair of flat strings sharing one
%% tag name drawn from tagname_text_maybe_bad(), so the name may contain
%% forbidden control characters.
xml_open_close_maybe_bad() ->
    ?LET(Name, tagname_text_maybe_bad(),
         {lists:flatten(["<", Name, ">"]),
          lists:flatten(["</", Name, ">"])}).

%% Generator: a non-empty tag name made only of lowercase letters a-z.
tagname_text() ->
    Letter = choose($a, $z),
    non_empty(list(Letter)).

%% Generator: a non-empty tag name whose characters are either lowercase
%% letters a-z or forbidden C0 control characters.
%% Letters are drawn with choose($a, $z), matching tagname_text/0; listing
%% $a and $z as separate alternatives only ever produced those two letters.
tagname_text_maybe_bad() ->
    Char = oneof([choose($a, $z), xml_c0_forbidden_control()]),
    non_empty(list(Char)).

%% see: https://en.wikipedia.org/wiki/Valid_characters_in_XML#XML_1.0
utf8_char() ->
    %% Alternatives: an escaped entity, an allowed C0 control character,
    %% or a BMP character.
    %% see: https://en.wikipedia.org/wiki/Valid_characters_in_XML#XML_1.0
    Alternatives = [xml_escaped_entity(),
                    xml_c0_control(),
                    xml_utf8_bmp_char()],
    oneof(Alternatives).

%% Generator: one of the three C0 control characters this suite treats as
%% valid: tab (16#09), line feed (16#0A), carriage return (16#0D).
xml_c0_control() ->
    elements([$\t, $\n, $\r]).

%% Generator: one of the C0 code points 16#00-16#1F other than tab (16#09),
%% line feed (16#0A) and carriage return (16#0D).
xml_c0_forbidden_control() ->
    Forbidden = lists:seq(16#00, 16#08)
                ++ [16#0B, 16#0C]
                ++ lists:seq(16#0E, 16#1F),
    elements(Forbidden).

%% Generator: a non-empty list consisting solely of forbidden C0 control
%% characters.
utf8_text_bad() ->
    BadChar = xml_c0_forbidden_control(),
    non_empty(list(BadChar)).

xml_utf8_bmp_char() ->
?SUCHTHAT(C, oneof([choose(16#0020,16#D7FF),
choose(16#E000, 16#FFFD)]),
choose(16#E000, 16#FFFD)]),
not lists:member(C, [$<,$>,$&])).

%% Generator: one of the escaped forms of the characters &, < and >.
xml_escaped_entity() ->
    Entities = ["&amp;", "&lt;", "&gt;"],
    oneof(Entities).

%% Generator: a non-empty list of items drawn from utf8_char().
utf8_text() ->
    Char = utf8_char(),
    non_empty(list(Char)).

Expand Down