
[WIP] Support adding pipeline component instances #12710

Open · wants to merge 16 commits into base: v4
2 changes: 2 additions & 0 deletions spacy/__init__.py
@@ -32,6 +32,7 @@ def load(
enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(),
Review comment (Member):
Suggested change:
- pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(),
+ pipe_instances: Dict[str, PipeCallable] = util.SimpleFrozenDict(),

cf the type we're using in add_pipe_instance()

Review comment (Member):
Same comment for the other methods, too.

) -> Language:
"""Load a spaCy model from an installed package or a local path.

Expand All @@ -55,6 +56,7 @@ def load(
enable=enable,
exclude=exclude,
config=config,
pipe_instances=pipe_instances,
)
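
A minimal usage sketch of the new keyword argument (assuming the API in this PR; the path and component name below are illustrative, not taken from the diff):

import spacy

def normalize_doc(doc):
    # Illustrative stateless component that simply passes the Doc through.
    return doc

# A pipeline saved with a component added via add_pipe_instance() records the
# placeholder factory "__added_by_instance__", so the instance has to be
# supplied again by name when loading.
nlp = spacy.load(
    "/path/to/saved_pipeline",  # hypothetical pipeline directory
    pipe_instances={"normalize": normalize_doc},
)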


4 changes: 4 additions & 0 deletions spacy/errors.py
@@ -219,6 +219,9 @@ class Warnings(metaclass=ErrorsWithCodes):
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
"key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'")
W126 = ("Pipe instance '{name}' is being added with a vocab "
"instance that does not match other components. This is "
"usually an error.")


class Errors(metaclass=ErrorsWithCodes):
@@ -981,6 +984,7 @@ class Errors(metaclass=ErrorsWithCodes):
" 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.")
E1055 = ("Cannot create Language instance from config: missing pipeline "
"components. The following components were added by instance (rather "
"than by config) via 'Language.add_pipe_instance()', but are not "
"present in the 'pipe_instances' argument: {names}")


# Deprecated model shortcuts, only used in errors and warnings
127 changes: 123 additions & 4 deletions spacy/language.py
@@ -75,6 +75,9 @@
# This is the base config for the [pretraining] block and currently not included
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
# Factory name indicating that the component wasn't constructed by a factory,
# and was instead passed by instance
INSTANCE_FACTORY_NAME = "__added_by_instance__"

# Type variable for contexts piped with documents
_AnyContext = TypeVar("_AnyContext")
@@ -768,9 +771,13 @@ def add_pipe(
raw_config: Optional[Config] = None,
validate: bool = True,
) -> PipeCallable:
"""Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last".
"""Add a component to the processing pipeline, by factory name and config.
Valid components are callables that take a `Doc` object, modify it, and return it.
Only one of before/after/first/last can be set. Default behaviour is "last".

Using this method tells spaCy how to construct your component, allowing you to load
your pipeline back using generic code. See 'Language.add_pipe_instance' to add a
component object instead, avoiding the need to create a factory.

factory_name (str): Name of the component factory.
name (str): Name of pipeline component. Overwrites existing
@@ -818,12 +825,61 @@ def add_pipe(
raw_config=raw_config,
validate=validate,
)
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
pipe_index = self._get_pipe_index(before, after, first, last)
self._components.insert(pipe_index, (name, pipe_component))
Review comment (Member):
We can revert the changes to add_pipe in this PR, if we're not overloading.

self._link_components()
return pipe_component

def add_pipe_instance(
self,
component: PipeCallable,
name: Optional[str] = None,
*,
before: Optional[Union[str, int]] = None,
after: Optional[Union[str, int]] = None,
first: Optional[bool] = None,
last: Optional[bool] = None,
) -> PipeCallable:
"""Add a component instance to the processing pipeline. Valid components
are callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last".

A limitation of this method is that spaCy will not know how to reconstruct
your pipeline after you save it out (unlike the 'Language.add_pipe()' method,
where you provide a config and let spaCy construct the instance). See 'spacy.load'
for details of how to load back a pipeline with components added by instance.

component (Callable[[Doc], Doc]): The component to add.
name (str): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if a name already exists in the pipeline.
before (Union[str, int]): Name or index of the component to insert new
component directly before.
after (Union[str, int]): Name or index of the component to insert new
component directly after.
first (bool): If True, insert component first in the pipeline.
last (bool): If True, insert component last in the pipeline.
RETURNS (Callable[[Doc], Doc]): The pipeline component.

DOCS: https://spacy.io/api/language#add_pipe_instance
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: add documentation to API page

"""
name = name if name is not None else getattr(component, "name", getattr(component, "__name__", None))
if name is None:
raise ValueError("TODO error")
Review comment on lines +868 to +870 (Member):
Alternatively, we could require name to be a str in this method, and not Optional.

if name in self.component_names:
raise ValueError(Errors.E007.format(name=name, opts=self.component_names))

# It would be possible to take arguments for the FactoryMeta here, but we'll then have
# a problem on deserialization: where will the data be coming from?
# I think if someone wants that, they should register a component function.
self._pipe_meta[name] = FactoryMeta(INSTANCE_FACTORY_NAME)
self._pipe_configs[name] = Config()
pipe_index = self._get_pipe_index(before, after, first, last)
self._components.insert(pipe_index, (name, component))
return component
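
A minimal in-memory sketch of add_pipe_instance() as proposed above (the component, its name, and the sample text are illustrative):

import spacy

def debug_component(doc):
    # Trivial component: report the token count and return the Doc unchanged.
    print("tokens:", len(doc))
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # added by factory name, reconstructable from config
nlp.add_pipe_instance(debug_component, name="debug", last=True)  # added by instance
assert nlp.pipe_names == ["sentencizer", "debug"]
doc = nlp("Components added by instance cannot be rebuilt from the config alone.")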

def _get_pipe_index(
self,
before: Optional[Union[str, int]] = None,
@@ -1735,6 +1791,7 @@ def from_config(
meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True,
validate: bool = True,
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Create the nlp object from a loaded config. Will set up the tokenizer
and language data, add pipeline components etc. If no config is provided,
@@ -1810,6 +1867,11 @@ def from_config(

# Warn about require_gpu usage in jupyter notebook
warn_if_jupyter_cupy()
# If we've been passed pipe instances, check whether
# they have a Vocab instance, and if they do, use
# that one. This also performs some additional checks and
# warns if there's a mismatch.
vocab = _get_instantiated_vocab(vocab, pipe_instances)

# Note that we don't load vectors here, instead they get loaded explicitly
# inside stuff like the spacy train function. If we loaded them here,
@@ -1826,6 +1888,11 @@
interpolated = filled.interpolate() if not filled.is_interpolated else filled
pipeline = interpolated.get("components", {})
sourced = util.get_sourced_components(interpolated)
# Check for components that aren't in the pipe_instances dict, aren't disabled,
# and aren't built by factory.
missing_components = _find_missing_components(pipeline, pipe_instances, exclude)
if missing_components:
raise ValueError(Errors.E1055.format(names=", ".join(missing_components)))
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
@@ -1835,6 +1902,16 @@
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
if pipe_name in pipe_instances:
if pipe_name in exclude:
continue
else:
nlp.add_pipe_instance(pipe_instances[pipe_name])
# Is it important that we instantiate pipes that
# aren't excluded? It seems like we would want
# the exclude check above. I've left it how it
# is though, in case there's some sort of crazy
# load-bearing side-effects someone is relying on?
pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name])
if pipe_name not in exclude:
Review comment on lines 1902 to 1917 (Member):
I agree that it looks like we should move the exclude check to be the first thing in the for loop - we specifically say that excluded components won't be loaded.
Let's move that to a separate PR though?

@@ -2337,3 +2414,45 @@ def step(self) -> None:
if self.count >= self.chunk_size:
self.count = 0
self.send()


def _get_instantiated_vocab(
vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any]
) -> Union[bool, Vocab]:
vocab_instances = {}
for name, instance in pipe_instances.items():
if hasattr(instance, "vocab") and isinstance(instance.vocab, Vocab):
vocab_instances[name] = instance.vocab
if not vocab_instances:
return vocab
elif isinstance(vocab, Vocab):
for name, inst_voc in vocab_instances.items():
if inst_voc is not vocab:
warnings.warn(Warnings.W126.format(name=name))
return vocab
else:
resolved_vocab = None
for name, inst_voc in vocab_instances.items():
if resolved_vocab is None:
resolved_vocab = inst_voc
elif inst_voc is not resolved_vocab:
warnings.warn(Warnings.W126.format(name=name))
# We should guarantee a vocab from the logic above.
assert resolved_vocab is not None
return resolved_vocab
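
A sketch of the resolution behaviour implemented above (StatefulComponent is an illustrative stand-in for a real pipe that owns a Vocab):

from spacy.vocab import Vocab

class StatefulComponent:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, doc):
        return doc

shared = Vocab()
# No vocab resolved yet (True means "create one"): the instance's vocab is adopted.
assert _get_instantiated_vocab(True, {"c": StatefulComponent(shared)}) is shared
# An explicit but different Vocab is kept, and W126 is warned about the mismatch.
other = Vocab()
assert _get_instantiated_vocab(other, {"c": StatefulComponent(shared)}) is other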


def _find_missing_components(
pipeline: Dict[str, Dict[str, Any]],
pipe_instances: Dict[str, Any],
exclude: Iterable[str],
) -> List[str]:
missing = []
for name, config in pipeline.items():
if (
config.get("factory") == INSTANCE_FACTORY_NAME
and name not in pipe_instances
and name not in exclude
):
missing.append(name)
return missing
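
A sketch of the situation this check guards against (the pipeline dict is illustrative; inside from_config it comes from the interpolated config):

pipeline = {
    "tagger": {"factory": "tagger"},
    "custom": {"factory": INSTANCE_FACTORY_NAME},
}
# "custom" was added by instance, so loading it requires an entry in pipe_instances.
assert _find_missing_components(pipeline, pipe_instances={}, exclude=[]) == ["custom"]
# Supplying the instance (or excluding the component) avoids the E1055 error.
assert _find_missing_components(pipeline, pipe_instances={"custom": object()}, exclude=[]) == []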
37 changes: 37 additions & 0 deletions spacy/tests/test_language.py
@@ -800,3 +800,40 @@ def bad_pipe(doc):
nlp.add_pipe("test_component_bad_pipe")
with pytest.raises(ValueError, match="instead of a Doc"):
nlp("text")


@pytest.mark.parametrize(
"components,kwargs,position",
[
(["t1", "t2"], {"before": "t1"}, 0),
(["t1", "t2"], {"after": "t1"}, 1),
(["t1", "t2"], {"after": "t1"}, 1),
(["t1", "t2"], {"first": True}, 0),
(["t1", "t2"], {"last": True}, 2),
(["t1", "t2"], {"last": False}, 2),
(["t1", "t2"], {"first": False}, ValueError),
],
)
def test_add_pipe_instance(components, kwargs, position):
nlp = Language()
for name in components:
nlp.add_pipe("textcat", name=name)
pipe_names = list(nlp.pipe_names)
if isinstance(position, int):
result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs)
assert result is evil_component
pipe_names.insert(position, "new_component")
assert nlp.pipe_names == pipe_names
else:
with pytest.raises(ValueError):
result = nlp.add_pipe_instance(
evil_component, name="new_component", **kwargs
)


def test_add_pipe_instance_to_bytes():
nlp = Language()
nlp.add_pipe("textcat", name="t1")
nlp.add_pipe("textcat", name="t2")
nlp.add_pipe_instance(evil_component, name="new_component")
b = nlp.to_bytes()
25 changes: 24 additions & 1 deletion spacy/util.py
@@ -438,6 +438,7 @@ def load_model(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a package or data path.

@@ -449,6 +450,9 @@
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config).
RETURNS (Language): The loaded nlp object.
"""
kwargs = {
Expand All @@ -457,6 +461,7 @@ def load_model(
"enable": enable,
"exclude": exclude,
"config": config,
"pipe_instances": pipe_instances,
}
if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model
@@ -480,6 +485,7 @@ def load_model_from_package(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package.

@@ -495,10 +501,13 @@
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config).
RETURNS (Language): The loaded nlp object.
"""
cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined]
return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config, pipe_instances=pipe_instances) # type: ignore[attr-defined]


def load_model_from_path(
Expand All @@ -510,6 +519,7 @@ def load_model_from_path(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path.
@@ -527,6 +537,9 @@
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config).
RETURNS (Language): The loaded nlp object.
"""
if not model_path.exists():
Expand All @@ -543,6 +556,7 @@ def load_model_from_path(
enable=enable,
exclude=exclude,
meta=meta,
pipe_instances=pipe_instances,
)
return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)

@@ -557,6 +571,7 @@ def load_model_from_config(
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
auto_fill: bool = False,
validate: bool = True,
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language":
"""Create an nlp object from a config. Expects the full config file including
a section "nlp" containing the settings for the nlp object.
@@ -574,6 +589,9 @@
components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config).
RETURNS (Language): The loaded nlp object.
"""
if "nlp" not in config:
@@ -593,6 +611,7 @@
auto_fill=auto_fill,
validate=validate,
meta=meta,
pipe_instances=pipe_instances,
)
return nlp

@@ -656,6 +675,7 @@ def load_model_from_init_py(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
Review comment (@svlandeg, Member, Jun 26, 2023):
Not having this in load_model_from_init_py broke nlp = spacy.load("en_core_web_sm"), which wasn't caught by the test suite :|

) -> "Language":
"""Helper function to use in the `load()` method of a model package's
__init__.py.
Expand All @@ -671,6 +691,9 @@ def load_model_from_init_py(
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config).
RETURNS (Language): The loaded nlp object.
"""
model_path = Path(init_file).parent