From 323990590efd3bfdbec5601a782a429951170cdc Mon Sep 17 00:00:00 2001 From: Isaac Muse Date: Sat, 16 Mar 2019 15:20:22 -0600 Subject: [PATCH] Null character should translate to REPLACEMENT CHARACTER (U+FFFD) (#126) Fixes #124 --- docs/src/markdown/about/changelog.md | 2 ++ soupsieve/css_parser.py | 16 ++++++------- tests/test_level1/test_class.py | 21 ++++++++++++++++ tests/test_level2/test_attribute.py | 36 ++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 8 deletions(-) diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index 37bf3026..c4102d45 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -5,6 +5,8 @@ - **NEW**: Allow `:contans()` to accept a list of text to search for. - **FIX**: Don't install test files when installing the `soupsieve` package. - **FIX**: Improve efficiency of `:contains()` comparison. +- **FIX**: Null characters should translate to the Unicode REPLACEMENT CHARACTER (`U+FFFD`) according to the spec. This +applies to CSS escaped NULL characters as well. ## 1.8.0 diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py index a537d5f7..ba16fc94 100644 --- a/soupsieve/css_parser.py +++ b/soupsieve/css_parser.py @@ -7,6 +7,8 @@ from collections import OrderedDict from .util import SelectorSyntaxError +UNICODE_REPLACEMENT_CHAR = 0xFFFD + # Simple pseudo classes that take no parameters PSEUDO_SIMPLE = { ":any-link", @@ -240,13 +242,11 @@ def css_unescape(content, string=False): def replace(m): """Replace with the appropriate substitute.""" - return util.uchr(int(m.group(1)[1:], 16)) if m.group(1) else m.group(2)[1:] - - def replace_string(m): - """Replace with the appropriate substitute for a string.""" - if m.group(1): - value = util.uchr(int(m.group(1)[1:], 16)) + codepoint = int(m.group(1)[1:], 16) + if codepoint == 0: + codepoint = UNICODE_REPLACEMENT_CHAR + value = util.uchr(codepoint) elif m.group(2): value = m.group(2)[1:] else: @@ -254,7 +254,7 @@ def replace_string(m): return value - return RE_CSS_ESC.sub(replace, content) if not string else RE_CSS_STR_ESC.sub(replace_string, content) + return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content) class SelectorPattern(object): @@ -376,7 +376,7 @@ class CSSParser(object): def __init__(self, selector, custom=None, flags=0): """Initialize.""" - self.pattern = selector + self.pattern = selector.replace('\x00', '\ufffd') self.flags = flags self.debug = self.flags & util.DEBUG self.quirks = self.flags & util._QUIRKS diff --git a/tests/test_level1/test_class.py b/tests/test_level1/test_class.py index 62ea8354..105455a7 100644 --- a/tests/test_level1/test_class.py +++ b/tests/test_level1/test_class.py @@ -15,6 +15,17 @@ class TestClass(util.TestCase): """ + # Browsers normally replace NULL with `\uFFFD`, but some of the parsers + # we test just strip out NULL, so we will simulate and just insert `\uFFFD` directly + # to ensure consistent behavior in our tests across parsers. + MARKUP_NULL = """ +
+

Some text in a paragraph. + Link +

+
+ """ + def test_class(self): """Test class.""" @@ -35,6 +46,16 @@ def test_type_and_class(self): flags=util.HTML ) + def test_type_and_class_escaped_null(self): + """Test type and class with an escaped null character.""" + + self.assert_selector( + self.MARKUP_NULL, + r"a.\0 bar", + ["2"], + flags=util.HTML + ) + def test_malformed_class(self): """Test malformed class.""" diff --git a/tests/test_level2/test_attribute.py b/tests/test_level2/test_attribute.py index 5a919f71..1d39ba0a 100644 --- a/tests/test_level2/test_attribute.py +++ b/tests/test_level2/test_attribute.py @@ -33,6 +33,22 @@ class TestAttribute(util.TestCase): """ + # Browsers normally replace NULL with `\uFFFD`, but some of the parsers + # we test just strip out NULL, so we will simulate and just insert `\uFFFD` directly + # to ensure consistent behavior in our tests across parsers. + MARKUP_NULL = """ +
+

Some text in a paragraph.

+ Link + Direct child +
+    Child 1
+    Child 2
+    Child 3
+    
+
+ """ + def test_attribute(self): """Test attribute.""" @@ -150,6 +166,26 @@ def test_attribute_escaped_newline(self): flags=util.HTML ) + def test_attribute_equal_literal_null(self): + """Test attribute with value that equals specified value with a literal null character.""" + + self.assert_selector( + self.MARKUP_NULL, + '[id="\x00pre"]', + ["\ufffdpre"], + flags=util.HTML + ) + + def test_attribute_equal_escaped_null(self): + """Test attribute with value that equals specified value with an escaped null character.""" + + self.assert_selector( + self.MARKUP_NULL, + r'[id="\0 pre"]', + ["\ufffdpre"], + flags=util.HTML + ) + def test_invalid_tag(self): """ Test invalid tag.