Skip to content

Commit

Permalink
Tagfilter extension
Browse files Browse the repository at this point in the history
When we encounter a tag that causes an HTML 5 parser's content model
flag [1] to be changed to RCDATA, CDATA or RAWTEXT [2] [3], we escape
the tag by replacing its opening "<" with "&lt;".  This causes the tag
to appear verbatim in the page it's placed on.

We do this to prevent users from breaking the page content: after such a
tag, the parser would stop interpreting any further tags (including those
inserted by cmark) as HTML until a matching close tag was hit.  (Such a
closing tag could exist if a user entered it themselves, but it'd cause all
cmark-generated markup in between to be rendered raw, which is unlikely to
be desirable behaviour.)

[1] https://www.w3.org/TR/2009/WD-html5-20090423/syntax.html#tokenization
[2] https://www.w3.org/TR/2009/WD-html5-20090212/serializing-html-fragments.html#parsing-html-fragments
[3] https://github.com/google/gumbo-parser/blob/aa91b27b02c0c80c482e24348a457ed7c3c088e0/src/parser.c#L4023-L4053
  • Loading branch information
Yuki Izumi authored and Yuki Izumi committed Jun 27, 2017
1 parent e001c1e commit b5ccb88
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 5 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ afl:
-o test/afl_results \
-x test/afl_dictionary \
-t 100 \
$(CMARK) -e table -e strikethrough -e autolink $(CMARK_OPTS)
$(CMARK) -e table -e strikethrough -e autolink -e tagfilter $(CMARK_OPTS)

# Run ${CLANG_CHECK} in -analyze mode over every C file in src/, passing the
# build/ directory via -p.  Depends on `all`, so the default build runs first.
clang-check: all
${CLANG_CHECK} -p build -analyze src/*.c
Expand Down Expand Up @@ -156,8 +156,8 @@ $(ALLTESTS): $(SPEC) $(EXTENSIONS_SPEC)
# Run $(PROG) under valgrind (--leak-check=full) on $(ALLTESTS) for every
# output format, once with no extra options and once with --smart.  The
# command being checked is echoed first; the first valgrind failure
# (--error-exitcode=1) aborts the whole target via `exit 1`.
leakcheck: $(ALLTESTS)
for format in html man xml latex commonmark; do \
for opts in "" "--smart"; do \
echo "cmark -t $$format -e table -e strikethrough -e autolink $$opts" ; \
valgrind -q --leak-check=full --dsymutil=yes --suppressions=suppressions --error-exitcode=1 $(PROG) -t $$format -e table -e strikethrough -e autolink $$opts $(ALLTESTS) >/dev/null || exit 1;\
echo "cmark -t $$format -e table -e strikethrough -e autolink -e tagfilter $$opts" ; \
valgrind -q --leak-check=full --dsymutil=yes --suppressions=suppressions --error-exitcode=1 $(PROG) -t $$format -e table -e strikethrough -e autolink -e tagfilter $$opts $(ALLTESTS) >/dev/null || exit 1;\
done; \
done;

Expand Down
1 change: 1 addition & 0 deletions extensions/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set(LIBRARY_SOURCES
table.c
strikethrough.c
autolink.c
tagfilter.c
ext_scanners.c
ext_scanners.re
ext_scanners.h
Expand Down
2 changes: 2 additions & 0 deletions extensions/core-extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
#include "autolink.h"
#include "strikethrough.h"
#include "table.h"
#include "tagfilter.h"

/*
 * Register every core syntax extension with `plugin`.
 *
 * Each extension is created and immediately handed to
 * cmark_plugin_register_syntax_extension, in the order table, strikethrough,
 * autolink, tagfilter.  Always returns 1.
 */
int core_extensions_registration(cmark_plugin *plugin) {
  /* Factory functions, listed in registration order. */
  cmark_syntax_extension *(*const factories[])(void) = {
      create_table_extension,
      create_strikethrough_extension,
      create_autolink_extension,
      create_tagfilter_extension,
  };
  const unsigned int count = sizeof(factories) / sizeof(factories[0]);
  unsigned int n;

  for (n = 0; n < count; ++n) {
    cmark_plugin_register_syntax_extension(plugin, factories[n]());
  }

  return 1;
}
59 changes: 59 additions & 0 deletions extensions/tagfilter.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include "tagfilter.h"
#include <parser.h>

/*
 * Lowercase names of the HTML tags rejected by this extension's filter.
 * The array is NULL-terminated so it can be walked without a length.
 */
static const char *blacklist[] = {
    "title",
    "textarea",
    "style",
    "xmp",
    "iframe",
    "noembed",
    "noframes",
    "script",
    "plaintext",
    NULL,
};

/*
 * Report whether the raw HTML tag in tag_data[0 .. tag_size) is an opening
 * or closing tag whose name is `tagname`.
 *
 * tag_data  bytes of the tag, starting at '<'; need not be NUL-terminated.
 * tag_size  number of valid bytes in tag_data.
 * tagname   NUL-terminated tag name to look for; must be lowercase ASCII
 *           (as every entry of `blacklist` is).
 *
 * Returns 1 when the tag name matches and is followed by whitespace, '>' or
 * "/>", and 0 otherwise.  The comparison is ASCII case-insensitive, because
 * HTML parsers treat <XMP>, <Xmp> and <xmp> as the same element; a
 * case-sensitive match would let mixed-case tags bypass the filter.
 */
static int is_tag(const unsigned char *tag_data, size_t tag_size,
                  const char *tagname) {
  size_t i;

  /* The shortest tag is "<x>"; this also guarantees tag_data[1] is valid. */
  if (tag_size < 3 || tag_data[0] != '<')
    return 0;

  i = 1;

  /* Closing tags ("</name") are matched on the same name as opening tags. */
  if (tag_data[i] == '/') {
    i++;
  }

  for (; i < tag_size; ++i, ++tagname) {
    unsigned char c;

    if (*tagname == 0)
      break;

    /* Fold ASCII uppercase to lowercase by hand: locale-independent and
     * needs no extra header. */
    c = tag_data[i];
    if (c >= 'A' && c <= 'Z')
      c = (unsigned char)(c - 'A' + 'a');

    if (c != (unsigned char)*tagname)
      return 0;
  }

  /* Input ended before a delimiter could follow the name (or before the
   * whole name was seen): not a complete match. */
  if (i == tag_size)
    return 0;

  /* The name must end here; otherwise "<xmpfoo>" would match "xmp". */
  if (cmark_isspace(tag_data[i]) || tag_data[i] == '>')
    return 1;

  /* Self-closing form: "<name/>". */
  if (tag_data[i] == '/' && tag_size >= i + 2 && tag_data[i + 1] == '>')
    return 1;

  return 0;
}

/*
 * HTML filter callback installed by create_tagfilter_extension().
 *
 * ext      the owning extension (not used here).
 * tag      bytes of the raw HTML tag under consideration.
 * tag_len  number of bytes in `tag`.
 *
 * Returns 0 if the tag's name matches any entry in `blacklist`, and 1 if it
 * matches none of them.
 */
static int filter(cmark_syntax_extension *ext, const unsigned char *tag,
                  size_t tag_len) {
  size_t idx;

  /* `blacklist` is NULL-terminated; stop at the sentinel. */
  for (idx = 0; blacklist[idx]; ++idx) {
    if (is_tag(tag, tag_len, blacklist[idx]))
      return 0;
  }

  return 1;
}

cmark_syntax_extension *create_tagfilter_extension(void) {
cmark_syntax_extension *ext = cmark_syntax_extension_new("tagfilter");
cmark_syntax_extension_set_html_filter_func(ext, filter);
return ext;
}
8 changes: 8 additions & 0 deletions extensions/tagfilter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/* Public interface of the "tagfilter" syntax extension. */
#ifndef TAGFILTER_H
#define TAGFILTER_H

#include "core-extensions.h"

/*
 * Create the syntax extension registered under the name "tagfilter", which
 * installs an HTML filter function on the returned extension.
 */
cmark_syntax_extension *create_tagfilter_extension(void);

#endif /* TAGFILTER_H */
4 changes: 2 additions & 2 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ IF (PYTHONINTERP_FOUND)
)

# Run spec_tests.py (with --no-normalize) over extensions.txt, using the cmark
# binary from the sibling src/ build directory with extensions enabled via -e.
add_test(extensions_executable
${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/spec_tests.py" "--no-normalize" "--spec" "${CMAKE_CURRENT_SOURCE_DIR}/extensions.txt" "--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink"
${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/spec_tests.py" "--no-normalize" "--spec" "${CMAKE_CURRENT_SOURCE_DIR}/extensions.txt" "--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink -e tagfilter"
)

# Run roundtrip_tests.py over extensions.txt, using the cmark binary from the
# sibling src/ build directory with extensions enabled via -e.
add_test(roundtrip_extensions_executable
${PYTHON_EXECUTABLE}
"${CMAKE_CURRENT_SOURCE_DIR}/roundtrip_tests.py"
"--spec" "${CMAKE_CURRENT_SOURCE_DIR}/extensions.txt"
"--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink"
"--program" "${CMAKE_CURRENT_BINARY_DIR}/../src/cmark -e table -e strikethrough -e autolink -e tagfilter"
)

add_test(regressiontest_executable
Expand Down
6 changes: 6 additions & 0 deletions test/afl_test_cases/test.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,10 @@ google ~~yahoo~~

google.com http://google.com google@google.com

and <xmp> but

<surewhynot>
sure
</surewhynot>

[f]: /u "t"
44 changes: 44 additions & 0 deletions test/extensions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,50 @@ Full stop outside parens shouldn't be included http://google.com/ok.
````````````````````````````````


## HTML tag filter


```````````````````````````````` example
This is <xmp> not okay, but **this** <strong>is</strong>.

<p>This is <xmp> not okay, but **this** <strong>is</strong>.</p>

Nope, I won't have <textarea>.

<p>No <textarea> here either.</p>

<p>This <random /> <thing> is okay</thing> though.</p>

Yep, <totally>okay</totally>.

<!-- HTML comments are okay, though. -->
<!- But we're strict. ->
<! No nonsense. >
<!-- Leave multiline comments the heck alone, though, okay?
Even with {"x":"y"} or 1 > 2 or whatever. Even **markdown**.
-->
<!--- Support everything CommonMark's parser does. -->
<!---->
<!--thistoo-->
.
<p>This is &lt;xmp> not okay, but <strong>this</strong> <strong>is</strong>.</p>
<p>This is &lt;xmp> not okay, but **this** <strong>is</strong>.</p>
<p>Nope, I won't have &lt;textarea>.</p>
<p>No &lt;textarea> here either.</p>
<p>This <random /> <thing> is okay</thing> though.</p>
<p>Yep, <totally>okay</totally>.</p>
<!-- HTML comments are okay, though. -->
<p>&lt;!- But we're strict. -&gt;
&lt;! No nonsense. &gt;</p>
<!-- Leave multiline comments the heck alone, though, okay?
Even with {"x":"y"} or 1 > 2 or whatever. Even **markdown**.
-->
<!--- Support everything CommonMark's parser does. -->
<!---->
<!--thistoo-->
````````````````````````````````


## Interop

Autolink and strikethrough.
Expand Down

0 comments on commit b5ccb88

Please sign in to comment.