Update Tokenizer.explain for special cases with whitespace (#13086)
* Update Tokenizer.explain for special cases with whitespace

Update `Tokenizer.explain` to skip special case matches if the exact
text has not been matched due to intervening whitespace.

Enable fuzzy `Tokenizer.explain` tests with additional whitespace
normalization.

* Add unit test for special cases with whitespace, xfail fuzzy tests again
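
To illustrate the behavior described above, here is a minimal sketch using the same special case as the new unit test; the expected outputs are inferred from the test's assertions rather than quoted from the diff:

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # Special case whose rule text ":]" cannot match ": ]" exactly because of the space
    tokenizer = Tokenizer(nlp.vocab, rules={":]": [{"ORTH": ":]"}]})
    text = ": ]"
    print([t.text for t in tokenizer(text)])        # [':', ']'], split on whitespace
    print([t[1] for t in tokenizer.explain(text)])  # with this change: [':', ']'], matching the tokenizer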
adrianeboyd committed Nov 6, 2023
1 parent ff9ddb6 commit 0c25725
Showing 2 changed files with 26 additions and 4 deletions.
17 changes: 16 additions & 1 deletion spacy/tests/tokenizer/test_explain.py
@@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     assert tokens == explain_tokens
 
 
+def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
+    rules = {":]": [{"ORTH": ":]"}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+    )
+    text = ": ]"
+    tokens = [t.text for t in tokenizer(text)]
+    explain_tokens = [t[1] for t in tokenizer.explain(text)]
+    assert tokens == explain_tokens
+
+
 @hypothesis.strategies.composite
 def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
     """
@@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
     """
 
     tokenizer: Tokenizer = spacy.blank(lang).tokenizer
-    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    # Tokenizer.explain is not intended to handle whitespace or control
+    # characters in the same way as Tokenizer
+    sentence = re.sub(r"\s+", " ", sentence).strip()
+    tokens = [t.text for t in tokenizer(sentence)]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
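
The added normalization simply collapses any whitespace hypothesis may generate before the two tokenizations are compared; a small sketch of that step on an illustrative input:

    import re

    sentence = "cat \t  sat\n"   # illustrative input with mixed whitespace
    sentence = re.sub(r"\s+", " ", sentence).strip()
    print(sentence)              # "cat sat"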
13 changes: 10 additions & 3 deletions spacy/tokenizer.pyx
@@ -730,9 +730,16 @@ cdef class Tokenizer:
             if i in spans_by_start:
                 span = spans_by_start[i]
                 exc = [d[ORTH] for d in special_cases[span.label_]]
-                for j, orth in enumerate(exc):
-                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
-                i += len(span)
+                # The phrase matcher can overmatch for tokens separated by
+                # spaces in the text but not in the underlying rule, so skip
+                # cases where the texts aren't identical
+                if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
+                    final_tokens.append(tokens[i])
+                    i += 1
+                else:
+                    for j, orth in enumerate(exc):
+                        final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                    i += len(span)
             else:
                 final_tokens.append(tokens[i])
                 i += 1
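
Taken on its own, the new guard accepts a special case only when the matched span's text equals the concatenation of the rule's ORTH strings; a standalone sketch of that comparison (the function name and sample values are illustrative, not spaCy internals):

    def exact_special_case(span_text, rule_orths):
        # A rule applies only if the matched text is exactly the joined ORTH
        # strings, i.e. no whitespace crept in between them in the input text.
        return span_text == "".join(rule_orths)

    print(exact_special_case(":]", [":]"]))   # True: emit the SPECIAL-n tokens
    print(exact_special_case(": ]", [":]"]))  # False: keep the original tokens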
