Skip to content

Commit

Permalink
Allow :is(), :has(), and :where() to forgive empty slots (#226)
Browse files Browse the repository at this point in the history
Resolves #122
  • Loading branch information
facelessuser committed Sep 10, 2021
1 parent ac996fe commit 30b6089
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 62 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
9 changes: 9 additions & 0 deletions docs/src/markdown/about/changelog.md
@@ -1,5 +1,14 @@
# Changelog

## 2.3

- **NEW**: `:has()`, `:is()`, and `:where()` now use a forgiving selector list. While not as forgiving of syntax
errors as CSS might be, it will forgive such things as empty sets and empty slots due to multiple consecutive
commas, leading commas, or trailing commas. Essentially, these pseudo-classes will match all non-empty selectors and
ignore empty ones. As the scraping environment is different than a browser environment, it was chosen not to
aggressively forgive bad syntax and invalid features to ensure the user is alerted that their program may not perform
as expected.

## 2.2.1

- **FIX**: Fix an issue with namespaces when one of the keys is `self`.
Expand Down
4 changes: 2 additions & 2 deletions mkdocs.yml
Expand Up @@ -105,6 +105,6 @@ plugins:
- search:
separator: '[:\s\-]+'
- git-revision-date-localized
- minify:
minify_html: true
# - minify:
# minify_html: true
- mkdocs_pymdownx_material_extras
1 change: 0 additions & 1 deletion requirements/docs.txt
@@ -1,4 +1,3 @@
mkdocs_pymdownx_material_extras==1.2.2
mkdocs-git-revision-date-localized-plugin
mkdocs-minify-plugin
pyspelling
2 changes: 1 addition & 1 deletion soupsieve/__meta__.py
Expand Up @@ -188,5 +188,5 @@ def parse_version(ver):
return Version(major, minor, micro, release, pre, post, dev)


__version_info__ = Version(2, 2, 1, "final")
__version_info__ = Version(2, 3, 0, ".dev")
__version__ = __version_info__._get_canonical()
3 changes: 3 additions & 0 deletions soupsieve/css_match.py
Expand Up @@ -784,6 +784,9 @@ def match_relations(self, el, relation):

found = False

if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
return found

if relation[0].rel_type.startswith(':'):
found = self.match_future_relations(el, relation)
else:
Expand Down
111 changes: 77 additions & 34 deletions soupsieve/css_parser.py
Expand Up @@ -196,6 +196,7 @@
FLG_IN_RANGE = 0x80
FLG_OUT_OF_RANGE = 0x100
FLG_PLACEHOLDER_SHOWN = 0x200
FLG_FORGIVE = 0x400

# Maximum cached patterns to store
_MAXCACHE = 500
Expand Down Expand Up @@ -715,11 +716,14 @@ def parse_pseudo_open(self, sel, name, has_selector, iselector, index):
flags = FLG_PSEUDO | FLG_OPEN
if name == ':not':
flags |= FLG_NOT
if name == ':has':
flags |= FLG_RELATIVE
elif name == ':has':
flags |= FLG_RELATIVE | FLG_FORGIVE
elif name in (':where', ':is'):
flags |= FLG_FORGIVE

sel.selectors.append(self.parse_selectors(iselector, index, flags))
has_selector = True

return has_selector

def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index):
Expand All @@ -731,12 +735,9 @@ def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index)
if combinator == COMMA_COMBINATOR:
if not has_selector:
# If we've not captured any selector parts, the comma is either at the beginning of the pattern
# or following another comma, both of which are unexpected. Commas must split selectors.
raise SelectorSyntaxError(
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
self.pattern,
index
)
# or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
sel.no_match = True

sel.rel_type = rel_type
selectors[-1].relations.append(sel)
rel_type = ":" + WS_COMBINATOR
Expand All @@ -757,41 +758,50 @@ def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index)
self.pattern,
index
)

# Set the leading combinator for the next selector.
rel_type = ':' + combinator
sel = _Selector()

sel = _Selector()
has_selector = False
return has_selector, sel, rel_type

def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index):
def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index):
"""Parse combinator tokens."""

combinator = m.group('relation').strip()
if not combinator:
combinator = WS_COMBINATOR
if not has_selector:
raise SelectorSyntaxError(
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
self.pattern,
index
)
if not is_forgive or combinator != COMMA_COMBINATOR:
raise SelectorSyntaxError(
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
self.pattern,
index
)

if combinator == COMMA_COMBINATOR:
if not sel.tag and not is_pseudo:
# Implied `*`
sel.tag = ct.SelectorTag('*', None)
sel.relations.extend(relations)
selectors.append(sel)
del relations[:]
# If we are in a forgiving pseudo class, just make the selector a "no match"
if combinator == COMMA_COMBINATOR:
sel.no_match = True
del relations[:]
selectors.append(sel)
else:
sel.relations.extend(relations)
sel.rel_type = combinator
del relations[:]
relations.append(sel)
sel = _Selector()
if combinator == COMMA_COMBINATOR:
if not sel.tag and not is_pseudo:
# Implied `*`
sel.tag = ct.SelectorTag('*', None)
sel.relations.extend(relations)
selectors.append(sel)
del relations[:]
else:
sel.relations.extend(relations)
sel.rel_type = combinator
del relations[:]
relations.append(sel)

sel = _Selector()
has_selector = False

return has_selector, sel

def parse_class_id(self, sel, m, has_selector):
Expand Down Expand Up @@ -862,12 +872,15 @@ def parse_pseudo_dir(self, sel, m, has_selector):
def parse_selectors(self, iselector, index=0, flags=0):
"""Parse selectors."""

# Initialize important variables
sel = _Selector()
selectors = []
has_selector = False
closed = False
relations = []
rel_type = ":" + WS_COMBINATOR

# Setup various flags
is_open = bool(flags & FLG_OPEN)
is_pseudo = bool(flags & FLG_PSEUDO)
is_relative = bool(flags & FLG_RELATIVE)
Expand All @@ -878,7 +891,9 @@ def parse_selectors(self, iselector, index=0, flags=0):
is_in_range = bool(flags & FLG_IN_RANGE)
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
is_forgive = bool(flags & FLG_FORGIVE)

# Print out useful debug stuff
if self.debug: # pragma: no cover
if is_pseudo:
print(' is_pseudo: True')
Expand All @@ -900,7 +915,10 @@ def parse_selectors(self, iselector, index=0, flags=0):
print(' is_out_of_range: True')
if is_placeholder_shown:
print(' is_placeholder_shown: True')
if is_forgive:
print(' is_forgive: True')

# The algorithm for relative selectors require an initial selector in the selector list
if is_relative:
selectors.append(_Selector())

Expand Down Expand Up @@ -929,11 +947,13 @@ def parse_selectors(self, iselector, index=0, flags=0):
is_html = True
elif key == 'pseudo_close':
if not has_selector:
raise SelectorSyntaxError(
"Expected a selector at postion {}".format(m.start(0)),
self.pattern,
m.start(0)
)
if not is_forgive:
raise SelectorSyntaxError(
"Expected a selector at postion {}".format(m.start(0)),
self.pattern,
m.start(0)
)
sel.no_match = True
if is_open:
closed = True
break
Expand All @@ -950,7 +970,7 @@ def parse_selectors(self, iselector, index=0, flags=0):
)
else:
has_selector, sel = self.parse_combinator(
sel, m, has_selector, selectors, relations, is_pseudo, index
sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index
)
elif key == 'attribute':
has_selector = self.parse_attribute_selector(sel, m, has_selector)
Expand All @@ -969,13 +989,15 @@ def parse_selectors(self, iselector, index=0, flags=0):
except StopIteration:
pass

# Handle selectors that are not closed
if is_open and not closed:
raise SelectorSyntaxError(
"Unclosed pseudo-class at position {}".format(index),
self.pattern,
index
)

# Cleanup completed selector piece
if has_selector:
if not sel.tag and not is_pseudo:
# Implied `*`
Expand All @@ -987,8 +1009,28 @@ def parse_selectors(self, iselector, index=0, flags=0):
sel.relations.extend(relations)
del relations[:]
selectors.append(sel)
else:

# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
elif is_forgive:
if is_relative:
# Handle relative selectors pseudo-classes with empty slots like `:has()`
if selectors and selectors[-1].rel_type is None and rel_type == ': ':
sel.rel_type = rel_type
sel.no_match = True
selectors[-1].relations.append(sel)
has_selector = True
else:
# Handle normal pseudo-classes with empty slots
if not selectors or not relations:
# Others like `:is()` etc.
sel.no_match = True
del relations[:]
selectors.append(sel)
has_selector = True

if not has_selector:
# We will always need to finish a selector when `:has()` is used as it leads with combining.
# May apply to others as well.
raise SelectorSyntaxError(
'Expected a selector at position {}'.format(index),
self.pattern,
Expand All @@ -1009,6 +1051,7 @@ def parse_selectors(self, iselector, index=0, flags=0):
if is_placeholder_shown:
selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN

# Return selector list
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)

def selector_iter(self, pattern):
Expand Down
21 changes: 21 additions & 0 deletions tests/test_level3/test_not.py
@@ -1,6 +1,7 @@
"""Test not selectors."""
from .. import util
from bs4 import BeautifulSoup as BS
from soupsieve import SelectorSyntaxError


class TestNot(util.TestCase):
Expand Down Expand Up @@ -55,3 +56,23 @@ def test_none_inputs(self):
soup = BS('<span foo="something">text</span>', 'html.parser')
soup.span['foo'] = None
self.assertEqual(len(soup.select('span:not([foo])')), 0)

def test_invalid_pseudo_empty(self):
"""Test pseudo class group with empty set."""

self.assert_raises(':not()', SelectorSyntaxError)

def test_invalid_pseudo_trailing_comma(self):
"""Test pseudo class group with trailing comma."""

self.assert_raises(':not(.class,)', SelectorSyntaxError)

def test_invalid_pseudo_leading_comma(self):
"""Test pseudo class group with leading comma."""

self.assert_raises(':not(,.class)', SelectorSyntaxError)

def test_invalid_pseudo_multi_comma(self):
"""Test pseudo class group with multiple commas."""

self.assert_raises(':not(.this,,.that)', SelectorSyntaxError)
58 changes: 39 additions & 19 deletions tests/test_level4/test_has.py
Expand Up @@ -129,20 +129,50 @@ def test_has_nested_pseudo(self):
flags=util.HTML
)

def test_invalid_incomplete_has(self):
"""Test `:has()` fails with just a combinator."""
def test_has_empty(self):
"""Test has with empty slot due to multiple commas."""

self.assert_raises(':has(>)', SelectorSyntaxError)
self.assert_selector(
self.MARKUP2,
'div:has()',
[],
flags=util.HTML
)

def test_invalid_has_empty(self):
"""Test `:has()` fails with empty function parameters."""
def test_has_multi_commas(self):
"""Test has with empty slot due to multiple commas."""

self.assert_raises(':has()', SelectorSyntaxError)
self.assert_selector(
self.MARKUP2,
'div:has(> .bbbb, .ffff, , .jjjj)',
['0', '4', '8'],
flags=util.HTML
)

def test_invalid_has_double_comma(self):
"""Test `:has()` fails with consecutive commas."""
def test_has_leading_commas(self):
"""Test has with empty slot due to leading commas."""

self.assert_raises(':has(> has,, a)', SelectorSyntaxError)
self.assert_selector(
self.MARKUP2,
'div:has(, > .bbbb, .ffff, .jjjj)',
['0', '4', '8'],
flags=util.HTML
)

def test_has_trailing_commas(self):
"""Test has with empty slot due to trailing commas."""

self.assert_selector(
self.MARKUP2,
'div:has(> .bbbb, .ffff, .jjjj, )',
['0', '4', '8'],
flags=util.HTML
)

def test_invalid_incomplete_has(self):
"""Test `:has()` fails with just a combinator."""

self.assert_raises(':has(>)', SelectorSyntaxError)

def test_invalid_has_double_combinator(self):
"""Test `:has()` fails with consecutive combinators."""
Expand All @@ -155,13 +185,3 @@ def test_invalid_has_trailing_combinator(self):
"""Test `:has()` fails with trailing combinator."""

self.assert_raises(':has(> has >)', SelectorSyntaxError)

def test_invalid_has_trailing_comma(self):
"""Test `:has()` fails with trailing comma."""

self.assert_raises(':has(> has,)', SelectorSyntaxError)

def test_invalid_has_start_comma(self):
"""Test `:has()` fails with trailing comma."""

self.assert_raises(':has(, p)', SelectorSyntaxError)

0 comments on commit 30b6089

Please sign in to comment.