Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EntityRuler improve disk load error message #9658

Merged
merged 7 commits into from
Nov 23, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")


# Deprecated model shortcuts, only used in errors and warnings
Expand Down
12 changes: 10 additions & 2 deletions spacy/pipeline/entityruler.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,10 +431,16 @@ def from_disk(
path = ensure_path(path)
self.clear()
depr_patterns_path = path.with_suffix(".jsonl")
if depr_patterns_path.is_file():
if path.suffix == ".jsonl": # user provides a jsonl
if path.is_file:
patterns = srsly.read_jsonl(path)
self.add_patterns(patterns)
else:
raise ValueError(Errors.E1023.format(path=path))
elif depr_patterns_path.is_file():
patterns = srsly.read_jsonl(depr_patterns_path)
self.add_patterns(patterns)
else:
elif path.is_dir(): # path is a valid directory
cfg = {}
deserializers_patterns = {
"patterns": lambda p: self.add_patterns(
Expand All @@ -451,6 +457,8 @@ def from_disk(
self.nlp.vocab, attr=self.phrase_matcher_attr
)
from_disk(path, deserializers_patterns, {})
else: # path is not a valid directory or file
raise ValueError(Errors.E146.format(path=path))
return self

def to_disk(
Expand Down
22 changes: 22 additions & 0 deletions spacy/tests/pipeline/test_entity_ruler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy.errors import MatchPatternError
from spacy.tests.util import make_tempdir

from thinc.api import NumpyOps, get_current_ops


Expand Down Expand Up @@ -238,3 +240,23 @@ def test_entity_ruler_multiprocessing(nlp, n_process):
for doc in nlp.pipe(texts, n_process=2):
for ent in doc.ents:
assert ent.ent_id_ == "1234"


def test_entity_ruler_serialize_jsonl(nlp, patterns):
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
with make_tempdir() as d:
ruler.to_disk(d / "test_ruler.jsonl")
ruler.from_disk(d / "test_ruler.jsonl") # read from an existing jsonl file
with pytest.raises(ValueError):
ruler.from_disk(d / "non_existing.jsonl") # read from an bad jsonl file
svlandeg marked this conversation as resolved.
Show resolved Hide resolved


def test_entity_ruler_serialize_dir(nlp, patterns):
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
with make_tempdir() as d:
ruler.to_disk(d / "test_ruler")
ruler.from_disk(d / "test_ruler") # read from an existing directory
with pytest.raises(ValueError):
ruler.from_disk(d / "non_existing_dir") # read from a bad directory