Raise error if deps not provided with heads (#8335)
* Fill in deps if not provided with heads

Before this change, if heads were passed without deps, the heads were
silently ignored, which could be confusing; a short sketch of the new
behavior follows the changed-file summary below. See #8334.

* Use "dep" instead of a blank string

This is the customary placeholder dep (the workaround is sketched after
the spacy/errors.py diff below). It might be better to show an error
here instead, though.

* Throw error on heads without deps

* Add a test

* Fix tests

* Formatting

* Fix all tests

* Fix a test I missed

* Revise error message

* Clean up whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
polm and adrianeboyd committed Jun 15, 2021
1 parent 0fd0d94 commit 2c105cd
Showing 7 changed files with 39 additions and 11 deletions.
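
A minimal sketch of the behavior this commit introduces, assuming the patch is applied; the sentence, head indices, and variable names are made up for illustration:

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = ["I", "like", "ginger"]
heads = [1, 1, 1]  # absolute indices: attach every token to "like"

# Before this change the heads were silently dropped; with it, the
# constructor raises ValueError with the new E1017 message instead.
try:
    Doc(vocab, words=words, heads=heads)
except ValueError as err:
    print(err)
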
3 changes: 3 additions & 0 deletions spacy/errors.py
@@ -843,6 +843,9 @@ class Errors:
"DependencyMatcher token patterns. The token pattern in "
"RIGHT_ATTR should return matches that are each exactly one token "
"long. Invalid pattern:\n{node}")
E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
"parses. If no dependency labels are available, provide "
"placeholder deps such as `deps=[\"dep\"]*len(heads)`.")


# Deprecated model shortcuts, only used in errors and warnings
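
The workaround that the new E1017 message recommends, placeholder "dep" labels with no linguistic meaning, might look roughly like this (an illustrative sketch, not part of the diff):

from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["I", "like", "ginger"]
heads = [1, 1, 1]
deps = ["dep"] * len(heads)  # placeholder labels, as the error message suggests
doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
assert doc[0].head.text == "like"  # heads are applied once deps are provided
assert doc[2].dep_ == "dep"
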
7 changes: 7 additions & 0 deletions spacy/tests/doc/test_creation.py
@@ -63,3 +63,10 @@ def test_create_from_words_and_text(vocab):
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
text = " 'dogs'\n\nrun "
(words, spaces) = util.get_words_and_spaces(words + ["away"], text)


def test_create_with_heads_and_no_deps(vocab):
words = "I like ginger".split()
heads = list(range(len(words)))
with pytest.raises(ValueError):
doc = Doc(vocab, words=words, heads=heads)
23 changes: 18 additions & 5 deletions spacy/tests/doc/test_retokenize_merge.py
@@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
words = ["The", "players", "start", "."]
lemmas = [t.lower() for t in words]
heads = [1, 2, 2, 2]
deps = ["dep"] * len(heads)
tags = ["DT", "NN", "VBZ", "."]
pos = ["DET", "NOUN", "VERB", "PUNCT"]
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
doc = Doc(
en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
)
assert len(doc) == 4
assert doc[0].text == "The"
assert doc[0].tag_ == "DT"
@@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
assert doc[0].tag_ == "NN"
assert doc[0].pos_ == "NOUN"
assert doc[0].lemma_ == "the players"
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
doc = Doc(
en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
)
assert len(doc) == 4
assert doc[0].text == "The"
assert doc[0].tag_ == "DT"
@@ -190,17 +195,19 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):

text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
with doc.retokenize() as retokenizer:
for ent in doc.ents:
attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
retokenizer.merge(ent, attrs=attrs)

text = "One test with entities like New York City so the ents list is not void"
heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
with doc.retokenize() as retokenizer:
for ent in doc.ents:
retokenizer.merge(ent)
@@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
# fmt: off
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
deps = ["dep"] * len(heads)
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
ents = ["O"] * len(heads)
@@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
# fmt: on
tokens = en_tokenizer(text)
doc = Doc(
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
tokens.vocab,
words=[t.text for t in tokens],
heads=heads,
deps=deps,
tags=tags,
ents=ents,
)
assert len(doc) == 17
with doc.retokenize() as retokenizer:
6 changes: 4 additions & 2 deletions spacy/tests/doc/test_retokenize_split.py
@@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
# If lemmas are not set, leave unset
words = ["LosAngeles", "start", "."]
heads = [1, 2, 2]
doc = Doc(en_vocab, words=words, heads=heads)
deps = ["dep"] * len(heads)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
with doc.retokenize() as retokenizer:
retokenizer.split(
doc[0],
@@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
# If lemmas are set, use split orth as default lemma
words = ["LosAngeles", "start", "."]
heads = [1, 2, 2]
doc = Doc(en_vocab, words=words, heads=heads)
deps = ["dep"] * len(heads)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
for t in doc:
t.lemma_ = "a"
with doc.retokenize() as retokenizer:
5 changes: 3 additions & 2 deletions spacy/tests/doc/test_token_api.py
@@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab):
# the structure of this sentence depends on the English annotation scheme
words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
doc = Doc(en_vocab, words=words, heads=heads)
deps = ["dep"] * len(heads)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
assert [t.text for t in doc[1].ancestors] == ["saw"]
assert [t.text for t in doc[2].ancestors] == []
@@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab):
assert doc[4].left_edge.i == 0
assert doc[2].left_edge.i == 0
# head token must be from the same document
doc2 = Doc(en_vocab, words=words, heads=heads)
doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
with pytest.raises(ValueError):
doc[0].head = doc2[0]
# test sentence starts when two sentences are joined
4 changes: 2 additions & 2 deletions spacy/tests/parser/test_parse_navigate.py
@@ -69,7 +69,7 @@ def heads():


def test_parser_parse_navigate_consistency(en_vocab, words, heads):
doc = Doc(en_vocab, words=words, heads=heads)
doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
for head in doc:
for child in head.lefts:
assert child.head == head
@@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):


def test_parser_parse_navigate_edges(en_vocab, words, heads):
doc = Doc(en_vocab, words=words, heads=heads)
doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
for token in doc:
subtree = list(token.subtree)
debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
2 changes: 2 additions & 0 deletions spacy/tokens/doc.pyx
@@ -275,6 +275,8 @@ cdef class Doc:
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
if deps and not heads:
heads = [0] * len(deps)
if heads and not deps:
raise ValueError(Errors.E1017)
if sent_starts is not None:
for i in range(len(sent_starts)):
if sent_starts[i] is True:
