diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index 37bf3026..c4102d45 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -5,6 +5,8 @@ - **NEW**: Allow `:contans()` to accept a list of text to search for. - **FIX**: Don't install test files when installing the `soupsieve` package. - **FIX**: Improve efficiency of `:contains()` comparison. +- **FIX**: Null characters should translate to the Unicode REPLACEMENT CHARACTER (`U+FFFD`) according to the spec. This +applies to CSS escaped NULL characters as well. ## 1.8.0 diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py index a537d5f7..ba16fc94 100644 --- a/soupsieve/css_parser.py +++ b/soupsieve/css_parser.py @@ -7,6 +7,8 @@ from collections import OrderedDict from .util import SelectorSyntaxError +UNICODE_REPLACEMENT_CHAR = 0xFFFD + # Simple pseudo classes that take no parameters PSEUDO_SIMPLE = { ":any-link", @@ -240,13 +242,11 @@ def css_unescape(content, string=False): def replace(m): """Replace with the appropriate substitute.""" - return util.uchr(int(m.group(1)[1:], 16)) if m.group(1) else m.group(2)[1:] - - def replace_string(m): - """Replace with the appropriate substitute for a string.""" - if m.group(1): - value = util.uchr(int(m.group(1)[1:], 16)) + codepoint = int(m.group(1)[1:], 16) + if codepoint == 0: + codepoint = UNICODE_REPLACEMENT_CHAR + value = util.uchr(codepoint) elif m.group(2): value = m.group(2)[1:] else: @@ -254,7 +254,7 @@ def replace_string(m): return value - return RE_CSS_ESC.sub(replace, content) if not string else RE_CSS_STR_ESC.sub(replace_string, content) + return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content) class SelectorPattern(object): @@ -376,7 +376,7 @@ class CSSParser(object): def __init__(self, selector, custom=None, flags=0): """Initialize.""" - self.pattern = selector + self.pattern = selector.replace('\x00', '\ufffd') self.flags = flags self.debug = self.flags & util.DEBUG self.quirks = self.flags & util._QUIRKS diff --git a/tests/test_level1/test_class.py b/tests/test_level1/test_class.py index 62ea8354..105455a7 100644 --- a/tests/test_level1/test_class.py +++ b/tests/test_level1/test_class.py @@ -15,6 +15,17 @@ class TestClass(util.TestCase): """ + # Browsers normally replace NULL with `\uFFFD`, but some of the parsers + # we test just strip out NULL, so we will simulate and just insert `\uFFFD` directly + # to ensure consistent behavior in our tests across parsers. + MARKUP_NULL = """ +
Some text in a paragraph. + Link +
+