Update Tokenizer.explain for special cases with whitespace (#13086)
* Update Tokenizer.explain for special cases with whitespace

Update `Tokenizer.explain` to skip special case matches if the exact
text has not been matched due to intervening whitespace.

Enable fuzzy `Tokenizer.explain` tests with additional whitespace
normalization.

* Add unit test for special cases with whitespace, xfail fuzzy tests again
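
To illustrate the behavior described above, here is a minimal sketch using the same special case as the new unit test; the expected outputs are inferred from the test's assertions rather than quoted from the diff:

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # Special case whose rule text ":]" cannot match ": ]" exactly because of the space
    tokenizer = Tokenizer(nlp.vocab, rules={":]": [{"ORTH": ":]"}]})
    text = ": ]"
    print([t.text for t in tokenizer(text)])        # [':', ']'], split on whitespace
    print([t[1] for t in tokenizer.explain(text)])  # with this change: [':', ']'], matching the tokenizer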
adrianeboyd committed Nov 6, 2023
1 parent ff9ddb6 commit 0c25725
Showing 2 changed files with 26 additions and 4 deletions.
17 changes: 16 additions & 1 deletion spacy/tests/tokenizer/test_explain.py
@@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     assert tokens == explain_tokens
 
 
+def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
+    rules = {":]": [{"ORTH": ":]"}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+    )
+    text = ": ]"
+    tokens = [t.text for t in tokenizer(text)]
+    explain_tokens = [t[1] for t in tokenizer.explain(text)]
+    assert tokens == explain_tokens
+
+
 @hypothesis.strategies.composite
 def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
     """
@@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
     """
 
     tokenizer: Tokenizer = spacy.blank(lang).tokenizer
-    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    # Tokenizer.explain is not intended to handle whitespace or control
+    # characters in the same way as Tokenizer
+    sentence = re.sub(r"\s+", " ", sentence).strip()
+    tokens = [t.text for t in tokenizer(sentence)]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
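
The added normalization simply collapses any whitespace hypothesis may generate before the two tokenizations are compared; a small sketch of that step on an illustrative input:

    import re

    sentence = "cat \t  sat\n"   # illustrative input with mixed whitespace
    sentence = re.sub(r"\s+", " ", sentence).strip()
    print(sentence)              # "cat sat"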
13 changes: 10 additions & 3 deletions spacy/tokenizer.pyx
@@ -730,9 +730,16 @@ cdef class Tokenizer:
             if i in spans_by_start:
                 span = spans_by_start[i]
                 exc = [d[ORTH] for d in special_cases[span.label_]]
-                for j, orth in enumerate(exc):
-                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
-                i += len(span)
+                # The phrase matcher can overmatch for tokens separated by
+                # spaces in the text but not in the underlying rule, so skip
+                # cases where the texts aren't identical
+                if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
+                    final_tokens.append(tokens[i])
+                    i += 1
+                else:
+                    for j, orth in enumerate(exc):
+                        final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                    i += len(span)
             else:
                 final_tokens.append(tokens[i])
                 i += 1
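
Taken on its own, the new guard accepts a special case only when the matched span's text equals the concatenation of the rule's ORTH strings; a standalone sketch of that comparison (the function name and sample values are illustrative, not spaCy internals):

    def exact_special_case(span_text, rule_orths):
        # A rule applies only if the matched text is exactly the joined ORTH
        # strings, i.e. no whitespace crept in between them in the input text.
        return span_text == "".join(rule_orths)

    print(exact_special_case(":]", [":]"]))   # True: emit the SPECIAL-n tokens
    print(exact_special_case(": ]", [":]"]))  # False: keep the original tokens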
