From 0c25725359c8898959af10fc11562d5cf0e77308 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 6 Nov 2023 17:29:59 +0100
Subject: [PATCH] Update Tokenizer.explain for special cases with whitespace
 (#13086)

* Update Tokenizer.explain for special cases with whitespace

Update `Tokenizer.explain` to skip special case matches if the exact
text has not been matched due to intervening whitespace.

Enable fuzzy `Tokenizer.explain` tests with additional whitespace
normalization.

* Add unit test for special cases with whitespace, xfail fuzzy tests again
---
 spacy/tests/tokenizer/test_explain.py | 17 ++++++++++++++++-
 spacy/tokenizer.pyx                   | 13 ++++++++++---
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index 5b4eeca1630..78932f6539c 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     assert tokens == explain_tokens
 
 
+def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
+    rules = {":]": [{"ORTH": ":]"}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+    )
+    text = ": ]"
+    tokens = [t.text for t in tokenizer(text)]
+    explain_tokens = [t[1] for t in tokenizer.explain(text)]
+    assert tokens == explain_tokens
+
+
 @hypothesis.strategies.composite
 def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
     """
@@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
     """
 
     tokenizer: Tokenizer = spacy.blank(lang).tokenizer
-    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    # Tokenizer.explain is not intended to handle whitespace or control
+    # characters in the same way as Tokenizer
+    sentence = re.sub(r"\s+", " ", sentence).strip()
+    tokens = [t.text for t in tokenizer(sentence)]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index a239eaf456f..6f2b10734c5 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -730,9 +730,16 @@ cdef class Tokenizer:
             if i in spans_by_start:
                 span = spans_by_start[i]
                 exc = [d[ORTH] for d in special_cases[span.label_]]
-                for j, orth in enumerate(exc):
-                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
-                i += len(span)
+                # The phrase matcher can overmatch for tokens separated by
+                # spaces in the text but not in the underlying rule, so skip
+                # cases where the texts aren't identical
+                if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
+                    final_tokens.append(tokens[i])
+                    i += 1
+                else:
+                    for j, orth in enumerate(exc):
+                        final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                    i += len(span)
             else:
                 final_tokens.append(tokens[i])
                 i += 1
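
Usage sketch (not part of the patch): a minimal example of the behavior this change targets, mirroring the new unit test and assuming a spaCy build with this patch applied. With a special case rule for ":]", the text ": ]" has whitespace inside the would-be match, so Tokenizer.explain falls back to the plain tokens and agrees with the Tokenizer output instead of reporting a SPECIAL entry for the overmatched rule.

    import spacy
    from spacy.tokenizer import Tokenizer

    nlp = spacy.blank("en")
    # Special case: treat ":]" as a single token when it appears verbatim
    tokenizer = Tokenizer(nlp.vocab, rules={":]": [{"ORTH": ":]"}]})

    text = ": ]"  # intervening whitespace prevents the special case from applying
    tokens = [t.text for t in tokenizer(text)]                 # [":", "]"]
    explain_tokens = [t[1] for t in tokenizer.explain(text)]

    # With the patch both views agree; before it, explain() could emit
    # ("SPECIAL-1", ":]") because the phrase matcher overmatched across
    # the intervening space even though the tokenizer never produced ":]".
    assert tokens == explain_tokens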