Update language filter algorithm (#160)

Instead of using a regular expression, use the extended filtering algorithm described in RFC 4647.
facelessuser · Sep 26, 2019 · de2119a · de2119a
1 parent 21bd825
commit de2119a
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 16 deletions.
diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py
@@ -43,6 +43,7 @@
 RE_DATETIME = re.compile(
     r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
 )
+RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')  # Runs of `*` subtags that follow a hyphen
 
 MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
 FEB = 2
@@ -545,6 +546,57 @@ def find_bidi(self, el):
                     return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
         return None
 
+    def extended_language_filter(self, lang_range, lang_tag):
+        """
+        Match a language tag against a language range (RFC 4647, 3.3.2).
+
+        `lang_range` and `lang_tag` are `-` separated strings that are
+        compared case insensitively, one subtag at a time. A leading `*`
+        accepts any primary subtag and any later `*` is ignored. Returns
+        `True` when the range accepts the tag, `False` otherwise.
+        """
+
+        ranges = lang_range.lower().split('-')
+        subtags = lang_tag.lower().split('-')
+
+        # The primary subtags must match, unless the range starts with
+        # a wildcard, which accepts any primary subtag.
+        if ranges[0] != '*' and ranges[0] != subtags[0]:
+            return False
+
+        length = len(ranges)
+        total = len(subtags)
+        rindex = 1
+        sindex = 1
+
+        # Match until we run out of ranges
+        while rindex < length:
+            r = ranges[rindex]
+
+            # A wildcard after the primary subtag is skipped, even when it
+            # is last, so `de-*` accepts the same tags as `de`.
+            if r == '*':
+                rindex += 1
+                continue
+
+            # An empty range (`de--DE`, `de-`) never matches, and neither
+            # does running out of subtags while ranges remain.
+            if not r or sindex >= total:
+                return False
+
+            s = subtags[sindex]
+            if s == r:
+                # Matched range, so advance to the next one
+                rindex += 1
+            elif len(s) == 1:
+                # Implicit wildcard cannot match singletons
+                return False
+
+            # Matched explicitly or implicitly, so grab the next subtag
+            sindex += 1
+
+        return True
+
     def match_attribute_name(self, el, attr, prefix):
         """Match attribute name and return value if it exists."""
 
@@ -1100,7 +1152,7 @@ def match_lang(self, el, langs):
             for patterns in langs:
                 match = False
                 for pattern in patterns:
-                    if pattern.match(found_lang):
+                    if self.extended_language_filter(pattern, found_lang):
                         match = True
                 if not match:
                     break

diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py
@@ -173,7 +173,6 @@
 RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
 RE_WS_END = re.compile('{}*$'.format(WSC))
 RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
-RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
 
 # Constants
 # List split token
@@ -842,22 +841,12 @@ def parse_pseudo_lang(self, sel, m, has_selector):
                 continue
             value = token.group('value')
             if value.startswith(('"', "'")):
-                parts = RE_WILD_STRIP.sub('-', css_unescape(value[1:-1], True)).split('-')
+                value = css_unescape(value[1:-1], True)
             else:
-                parts = RE_WILD_STRIP.sub('-', css_unescape(value)).split('-')
-
-            new_parts = []
-            length = len(parts)
-            start = 1 if parts and parts[0] == '*' else 0
-            for index, part in enumerate(parts):
-                if part == '*':
-                    new_parts.append(
-                        '[a-z0-9]+' if length == 1 else r'(?:(?:[a-z0-9]+)(?:-[a-z0-9]{2,})*?-)'
-                    )
-                else:
-                    new_parts.append(('' if index == start else r'(?:-[a-z0-9]{2,})*?-') + re.escape(part))
+                value = css_unescape(value)
+
+            patterns.append(value)
 
-            patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I))
         sel.lang.append(ct.SelectorLang(patterns))
         has_selector = True
 

diff --git a/tests/test_level4/test_lang.py b/tests/test_level4/test_lang.py
@@ -46,6 +46,17 @@ def test_lang(self):
             flags=util.HTML
         )
 
+    def test_lang_missing_range(self):
+        """Test a language range that contains an empty (missing) range."""
+
+        # `de--DE` has nothing between its two hyphens; the expected
+        # result passed to the assertion is an empty list.
+        selector = "p:lang(de--DE)"
+        expected = []
+        self.assert_selector(
+            self.MARKUP, selector, expected, flags=util.HTML
+        )
+
     def test_explicit_wildcard(self):
         """Test language with explicit wildcard (same as implicit)."""