Raise error if deps not provided with heads #8335

Merged
merged 10 commits on Jun 15, 2021
Changes from 8 commits
4 changes: 4 additions & 0 deletions spacy/errors.py
@@ -843,6 +843,10 @@ class Errors:
"DependencyMatcher token patterns. The token pattern in "
"RIGHT_ATTR should return matches that are each exactly one token "
"long. Invalid pattern:\n{node}")
E1017 = ("Heads passed without deps. Normally this is a mistake, but if "
"intentional then pass filler deps such as `[\"dep\"] * len(heads)`.")


# Deprecated model shortcuts, only used in errors and warnings
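For reference, the workaround suggested in the new E1017 message looks like this in practice. This is only a minimal sketch of the v3 `Doc` constructor usage: the throwaway `Vocab()` and the example sentence are illustrative stand-ins, not part of the change.

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["I", "like", "ginger"]
heads = [1, 1, 1]  # absolute indices: token 1 ("like") is the root

# Passing heads alone now raises E1017, so supply placeholder labels instead.
doc = Doc(Vocab(), words=words, heads=heads, deps=["dep"] * len(heads))
assert doc[0].head.text == "like"
assert doc[0].dep_ == "dep"
```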
6 changes: 6 additions & 0 deletions spacy/tests/doc/test_creation.py
@@ -63,3 +63,9 @@ def test_create_from_words_and_text(vocab):
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
text = " 'dogs'\n\nrun "
(words, spaces) = util.get_words_and_spaces(words + ["away"], text)

def test_create_with_heads_and_no_deps(vocab):
    words = "I like ginger".split()
    heads = list(range(len(words)))
    with pytest.raises(ValueError):
        doc = Doc(vocab, words=words, heads=heads)
23 changes: 18 additions & 5 deletions spacy/tests/doc/test_retokenize_merge.py
@@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
words = ["The", "players", "start", "."]
lemmas = [t.lower() for t in words]
heads = [1, 2, 2, 2]
deps = ["dep"] * len(heads)
tags = ["DT", "NN", "VBZ", "."]
pos = ["DET", "NOUN", "VERB", "PUNCT"]
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
doc = Doc(
en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
)
assert len(doc) == 4
assert doc[0].text == "The"
assert doc[0].tag_ == "DT"
@@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
    assert doc[0].tag_ == "NN"
    assert doc[0].pos_ == "NOUN"
    assert doc[0].lemma_ == "the players"
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
    doc = Doc(
        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
    )
    assert len(doc) == 4
    assert doc[0].text == "The"
    assert doc[0].tag_ == "DT"
@@ -190,17 +195,19 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):

text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
with doc.retokenize() as retokenizer:
for ent in doc.ents:
attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
retokenizer.merge(ent, attrs=attrs)

text = "One test with entities like New York City so the ents list is not void"
heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
with doc.retokenize() as retokenizer:
for ent in doc.ents:
retokenizer.merge(ent)
@@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
    heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
    deps = ["dep"] * len(heads)
    tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
    ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
    ents = ["O"] * len(heads)
@@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    # fmt: on
    tokens = en_tokenizer(text)
    doc = Doc(
        tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
        tokens.vocab,
        words=[t.text for t in tokens],
        heads=heads,
        deps=deps,
        tags=tags,
        ents=ents,
    )
    assert len(doc) == 17
    with doc.retokenize() as retokenizer:
6 changes: 4 additions & 2 deletions spacy/tests/doc/test_retokenize_split.py
@@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
    # If lemmas are not set, leave unset
    words = ["LosAngeles", "start", "."]
    heads = [1, 2, 2]
    doc = Doc(en_vocab, words=words, heads=heads)
    deps = ["dep"] * len(heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
@@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
    # If lemmas are set, use split orth as default lemma
    words = ["LosAngeles", "start", "."]
    heads = [1, 2, 2]
    doc = Doc(en_vocab, words=words, heads=heads)
    deps = ["dep"] * len(heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    for t in doc:
        t.lemma_ = "a"
    with doc.retokenize() as retokenizer:
5 changes: 3 additions & 2 deletions spacy/tests/doc/test_token_api.py
@@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab):
    # the structure of this sentence depends on the English annotation scheme
    words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
    doc = Doc(en_vocab, words=words, heads=heads)
    deps = ["dep"] * len(heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
    assert [t.text for t in doc[1].ancestors] == ["saw"]
    assert [t.text for t in doc[2].ancestors] == []
@@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab):
    assert doc[4].left_edge.i == 0
    assert doc[2].left_edge.i == 0
    # head token must be from the same document
    doc2 = Doc(en_vocab, words=words, heads=heads)
    doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
    with pytest.raises(ValueError):
        doc[0].head = doc2[0]
    # test sentence starts when two sentences are joined
4 changes: 2 additions & 2 deletions spacy/tests/parser/test_parse_navigate.py
@@ -69,7 +69,7 @@ def heads():


def test_parser_parse_navigate_consistency(en_vocab, words, heads):
    doc = Doc(en_vocab, words=words, heads=heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
    for head in doc:
        for child in head.lefts:
            assert child.head == head
@@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):


def test_parser_parse_navigate_edges(en_vocab, words, heads):
    doc = Doc(en_vocab, words=words, heads=heads)
    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
    for token in doc:
        subtree = list(token.subtree)
        debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
2 changes: 2 additions & 0 deletions spacy/tokens/doc.pyx
@@ -275,6 +275,8 @@ cdef class Doc:
            deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
        if deps and not heads:
            heads = [0] * len(deps)
        if heads and not deps:
            raise ValueError(Errors.E1017)
        if sent_starts is not None:
            for i in range(len(sent_starts)):
                if sent_starts[i] is True:
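Taken together with the existing branch just above it, the constructor still fills in default heads when only deps are given, but now refuses heads without deps. A minimal sketch of the user-facing effect, again using a throwaway `Vocab()` and an illustrative sentence rather than anything from the diff:

```python
import pytest
from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["I", "like", "ginger"]
heads = [1, 1, 1]

# heads without deps: E1017 is raised as a ValueError
with pytest.raises(ValueError):
    Doc(Vocab(), words=words, heads=heads)

# heads plus filler deps: accepted as before
doc = Doc(Vocab(), words=words, heads=heads, deps=["dep"] * len(heads))
assert doc[2].head.text == "like"
```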