Skip to content

Commit

Permalink
Update language filter algorithm (#160)
Browse files Browse the repository at this point in the history
Instead of using a regular expression, use the extended filtering
algorithm described in RFC 4647.
  • Loading branch information
facelessuser committed Sep 26, 2019
1 parent 21bd825 commit de2119a
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 16 deletions.
54 changes: 53 additions & 1 deletion soupsieve/css_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
# Matches a `YYYY-MM-DDTHH:MM` string (year of four or more digits) and
# captures each component in a named group.
RE_DATETIME = re.compile(
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
# Matches a run of wildcard (`*`) subtags that follows a dash: either in the
# middle of a string (`-*-`, `-*-*-`, ...) or a single `-*` at the very end.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
FEB = 2
Expand Down Expand Up @@ -545,6 +546,57 @@ def find_bidi(self, el):
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
return None

def extended_language_filter(self, lang_range, lang_tag):
    """
    Check a language tag against an extended language range.

    Implements the extended filtering algorithm of RFC 4647 (section 3.3.2).
    Both arguments are split on `-` into subtags and compared
    case-insensitively.

    - The first subtag of the range must equal the first subtag of the
      tag, unless the range starts with the wildcard `*`.
    - A wildcard `*` anywhere else in the range is skipped, so `de-*-DE`
      behaves like `de-DE` and `de-*` behaves like `de`.
    - Every remaining range subtag must be found, in order, among the
      remaining subtags of the tag. Unmatched tag subtags are skipped
      (implicit wildcard) unless they are singletons (one character),
      which can never be skipped.
    - An empty range subtag (e.g. the middle of `de--DE`) never matches.

    `lang_range` is the range given to the selector and `lang_tag` is the
    language tag found for the element. Returns `True` when the tag
    satisfies the range, otherwise `False`.
    """

    ranges = lang_range.lower().split('-')
    subtags = lang_tag.lower().split('-')

    # Primary tag needs to match unless the range starts with a wildcard
    primary = ranges[0]
    if primary != '*' and primary != subtags[0]:
        return False

    total = len(subtags)
    sindex = 1

    for wanted in ranges[1:]:
        # A wildcard places no constraint on the tag: just move on to the
        # next range subtag without consuming anything from the tag.
        if wanted == '*':
            continue

        # Empty range subtag can never be satisfied
        if not wanted:
            return False

        # Walk the tag until this range subtag is found
        while True:
            # Ran out of subtags, but we still have ranges
            if sindex >= total:
                return False

            current = subtags[sindex]
            sindex += 1

            # Matched range, continue with the next range subtag
            if current == wanted:
                break

            # Implicit wildcard cannot match singletons
            if len(current) == 1:
                return False

    # Every range subtag was accounted for
    return True

def match_attribute_name(self, el, attr, prefix):
"""Match attribute name and return value if it exists."""

Expand Down Expand Up @@ -1100,7 +1152,7 @@ def match_lang(self, el, langs):
for patterns in langs:
match = False
for pattern in patterns:
if pattern.match(found_lang):
if self.extended_language_filter(pattern, found_lang):
match = True
if not match:
break
Expand Down
19 changes: 4 additions & 15 deletions soupsieve/css_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@
RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
RE_WS_END = re.compile('{}*$'.format(WSC))
RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Constants
# List split token
Expand Down Expand Up @@ -842,22 +841,12 @@ def parse_pseudo_lang(self, sel, m, has_selector):
continue
value = token.group('value')
if value.startswith(('"', "'")):
parts = RE_WILD_STRIP.sub('-', css_unescape(value[1:-1], True)).split('-')
value = css_unescape(value[1:-1], True)
else:
parts = RE_WILD_STRIP.sub('-', css_unescape(value)).split('-')

new_parts = []
length = len(parts)
start = 1 if parts and parts[0] == '*' else 0
for index, part in enumerate(parts):
if part == '*':
new_parts.append(
'[a-z0-9]+' if length == 1 else r'(?:(?:[a-z0-9]+)(?:-[a-z0-9]{2,})*?-)'
)
else:
new_parts.append(('' if index == start else r'(?:-[a-z0-9]{2,})*?-') + re.escape(part))
value = css_unescape(value)

patterns.append(value)

patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I))
sel.lang.append(ct.SelectorLang(patterns))
has_selector = True

Expand Down
11 changes: 11 additions & 0 deletions tests/test_level4/test_lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,17 @@ def test_lang(self):
flags=util.HTML
)

def test_lang_missing_range(self):
    """Check that a language range containing an empty subtag selects nothing."""

    # `de--DE` has nothing between its two dashes, so the
    # expected result is an empty list.
    selector = "p:lang(de--DE)"
    expected = []

    self.assert_selector(self.MARKUP, selector, expected, flags=util.HTML)

def test_explicit_wildcard(self):
"""Test language with explicit wildcard (same as implicit)."""

Expand Down

0 comments on commit de2119a

Please sign in to comment.