diff --git a/spacy/__init__.py b/spacy/__init__.py index 1a18ad0d580..cd24ec243fc 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -32,6 +32,7 @@ def load( enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(), ) -> Language: """Load a spaCy model from an installed package or a local path. @@ -55,6 +56,7 @@ def load( enable=enable, exclude=exclude, config=config, + pipe_instances=pipe_instances, ) diff --git a/spacy/errors.py b/spacy/errors.py index db1a886aa8f..f39634617d7 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -219,6 +219,9 @@ class Warnings(metaclass=ErrorsWithCodes): W125 = ("The StaticVectors key_attr is no longer used. To set a custom " "key attribute for vectors, configure it through Vectors(attr=) or " "'spacy init vectors --attr'") + W126 = ("Pipe instance '{name}' is being added with a vocab " "instance that does not match other components. This is " "usually an error.") class Errors(metaclass=ErrorsWithCodes): @@ -981,6 +984,7 @@ class Errors(metaclass=ErrorsWithCodes): " 'min_length': {min_length}, 'max_length': {max_length}") E1054 = ("The text, including whitespace, must match between reference and " "predicted docs when training {component}.") + E1055 = ("Cannot create Language instance from config: missing pipeline " "components. The following components were added by instance (rather " "than config) via the 'Language.add_pipe_instance()' method, but are " "not present in the 'pipe_instances' variable: {names}")
# Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/language.py b/spacy/language.py index fd616483be8..adc574d42d7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -75,6 +75,9 @@ # This is the base config for the [pretraining] block and currently not included # in the main config and only added via the 'init fill-config' command DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" +# Factory name indicating that the component wasn't constructed by a factory, +# and was instead passed by instance +INSTANCE_FACTORY_NAME = "__added_by_instance__" # Type variable for contexts piped with documents _AnyContext = TypeVar("_AnyContext") @@ -768,9 +771,13 @@ def add_pipe( raw_config: Optional[Config] = None, validate: bool = True, ) -> PipeCallable: - """Add a component to the processing pipeline. Valid components are - callables that take a `Doc` object, modify it and return it. Only one - of before/after/first/last can be set. Default behaviour is "last". + """Add a component to the processing pipeline, by factory name and config. + Valid components are callables that take a `Doc` object, modify it, and return it. + Only one of before/after/first/last can be set. Default behaviour is "last". + + Using this method tells spaCy how to construct your component, allowing you to load + your pipeline back using generic code. See 'Language.add_pipe_instance' to add a + component object instead, avoiding the need to create a factory. factory_name (str): Name of the component factory. name (str): Name of pipeline component. Overwrites existing
Overwrites existing @@ -818,12 +825,61 @@ def add_pipe( raw_config=raw_config, validate=validate, ) - pipe_index = self._get_pipe_index(before, after, first, last) self._pipe_meta[name] = self.get_factory_meta(factory_name) + pipe_index = self._get_pipe_index(before, after, first, last) self._components.insert(pipe_index, (name, pipe_component)) self._link_components() return pipe_component + def add_pipe_instance( + self, + component: PipeCallable, + name: Optional[str] = None, + *, + before: Optional[Union[str, int]] = None, + after: Optional[Union[str, int]] = None, + first: Optional[bool] = None, + last: Optional[bool] = None, + ) -> PipeCallable: + """Add a component instance to the processing pipeline. Valid components + are callables that take a `Doc` object, modify it and return it. Only one + of before/after/first/last can be set. Default behaviour is "last". + + A limitation of this method is that spaCy will not know how to reconstruct + your pipeline after you save it out (unlike the 'Language.add_pipe()' method, + where you provide a config and let spaCy construct the instance). See 'spacy.load' + for details of how to load back a pipeline with components added by instance. + + component (Callable[[Doc], Doc]): The component to add. + name (str): Name of pipeline component. Overwrites existing + component.name attribute if available. If no name is set and + the component exposes no name attribute, component.__name__ is + used. An error is raised if a name already exists in the pipeline. + before (Union[str, int]): Name or index of the component to insert new + component directly before. + after (Union[str, int]): Name or index of the component to insert new + component directly after. + first (bool): If True, insert component first in the pipeline. + last (bool): If True, insert component last in the pipeline. + RETURNS (Callable[[Doc], Doc]): The pipeline component. 
+ + DOCS: https://spacy.io/api/language#add_pipe_instance + """ + name = name if name is not None else getattr(component, "name", getattr(component, "__name__", None)) + if name is None: + raise ValueError("Could not determine a name for the component: it exposes neither a 'name' nor a '__name__' attribute. Pass an explicit 'name' argument to 'Language.add_pipe_instance'.") + if name in self.component_names: + raise ValueError(Errors.E007.format(name=name, opts=self.component_names)) + + # It would be possible to take arguments for the FactoryMeta here, but we'll then have + # a problem on deserialization: where will the data be coming from? + # I think if someone wants that, they should register a component function. + self._pipe_meta[name] = FactoryMeta(INSTANCE_FACTORY_NAME) + self._pipe_configs[name] = Config() + pipe_index = self._get_pipe_index(before, after, first, last) + self._components.insert(pipe_index, (name, component)) + return component + def _get_pipe_index( self, before: Optional[Union[str, int]] = None, @@ -1735,6 +1791,7 @@ def from_config( meta: Dict[str, Any] = SimpleFrozenDict(), auto_fill: bool = True, validate: bool = True, + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Create the nlp object from a loaded config. Will set up the tokenizer and language data, add pipeline components etc. If no config is provided, @@ -1810,6 +1867,11 @@ def from_config( # Warn about require_gpu usage in jupyter notebook warn_if_jupyter_cupy() + # If we've been passed pipe instances, check whether + # they have a Vocab instance, and if they do, use + # that one. This also performs some additional checks and + # warns if there's a mismatch. + vocab = _get_instantiated_vocab(vocab, pipe_instances) # Note that we don't load vectors here, instead they get loaded explicitly # inside stuff like the spacy train function.
If we loaded them here, @@ -1826,6 +1888,11 @@ interpolated = filled.interpolate() if not filled.is_interpolated else filled pipeline = interpolated.get("components", {}) sourced = util.get_sourced_components(interpolated) + # Check for components that aren't in the pipe_instances dict, aren't disabled, + # and aren't built by factory. + missing_components = _find_missing_components(pipeline, pipe_instances, exclude) + if missing_components: + raise ValueError(Errors.E1055.format(names=", ".join(missing_components))) # If components are loaded from a source (existing models), we cache # them here so they're only loaded once source_nlps = {} @@ -1835,6 +1902,16 @@ if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) + if pipe_name in pipe_instances: + if pipe_name in exclude: + continue + else: + nlp.add_pipe_instance(pipe_instances[pipe_name]) + # Skip the factory/config-based construction below: + # this component was provided ready-made, so there + # is no factory to call. Without this 'continue' the + # component would be processed (and added) a second time. + continue
pipe_cfg = util.copy_config(pipeline[pipe_name]) raw_config = Config(filled["components"][pipe_name]) if pipe_name not in exclude: @@ -2337,3 +2414,45 @@ def step(self) -> None: if self.count >= self.chunk_size: self.count = 0 self.send() + + +def _get_instantiated_vocab( + vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any] +) -> Union[bool, Vocab]: + vocab_instances = {} + for name, instance in pipe_instances.items(): + if hasattr(instance, "vocab") and isinstance(instance.vocab, Vocab): + vocab_instances[name] = instance.vocab + if not vocab_instances: + return vocab + elif isinstance(vocab, Vocab): + for name, inst_voc in vocab_instances.items(): + if inst_voc is not vocab: + warnings.warn(Warnings.W126.format(name=name)) + return vocab + else: + resolved_vocab = None + for name, inst_voc in vocab_instances.items(): + if resolved_vocab is None: + resolved_vocab = inst_voc + elif inst_voc is not resolved_vocab: + warnings.warn(Warnings.W126.format(name=name)) + # We should guarantee a vocab from the logic above. 
+ assert resolved_vocab is not None + return resolved_vocab + + + def _find_missing_components( + pipeline: Dict[str, Dict[str, Any]], + pipe_instances: Dict[str, Any], + exclude: Iterable[str], + ) -> List[str]: + missing = [] + for name, config in pipeline.items(): + if ( + config.get("factory") == INSTANCE_FACTORY_NAME + and name not in pipe_instances + and name not in exclude + ): + missing.append(name) + return missing diff --git a/spacy/tests/test_language.py index 51eec32399c..7d531b22639 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -800,3 +800,40 @@ def bad_pipe(doc): nlp.add_pipe("test_component_bad_pipe") with pytest.raises(ValueError, match="instead of a Doc"): nlp("text") + + +@pytest.mark.parametrize( + "components,kwargs,position", + [ + (["t1", "t2"], {"before": "t1"}, 0), + (["t1", "t2"], {"after": "t1"}, 1), + (["t1", "t2"], {"before": "t2"}, 1), + (["t1", "t2"], {"first": True}, 0), + (["t1", "t2"], {"last": True}, 2), + (["t1", "t2"], {"last": False}, 2), + (["t1", "t2"], {"first": False}, ValueError), + ], +) +def test_add_pipe_instance(components, kwargs, position): + nlp = Language() + for name in components: + nlp.add_pipe("textcat", name=name) + pipe_names = list(nlp.pipe_names) + if isinstance(position, int): + result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs) + assert result is evil_component + pipe_names.insert(position, "new_component") + assert nlp.pipe_names == pipe_names + else: + with pytest.raises(ValueError): + result = nlp.add_pipe_instance( + evil_component, name="new_component", **kwargs + ) + + +def test_add_pipe_instance_to_bytes(): + nlp = Language() + nlp.add_pipe("textcat", name="t1") + nlp.add_pipe("textcat", name="t2") + nlp.add_pipe_instance(evil_component, name="new_component") + assert isinstance(nlp.to_bytes(), bytes) diff --git a/spacy/util.py index 762699a9756..f8b2c6cce14 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -438,6 
+438,7 @@ def load_model( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Load a model from a package or data path. @@ -449,6 +450,9 @@ def load_model( exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. """ kwargs = { @@ -457,6 +461,7 @@ def load_model( "enable": enable, "exclude": exclude, "config": config, + "pipe_instances": pipe_instances, } if isinstance(name, str): # name or string path if name.startswith("blank:"): # shortcut for blank model @@ -480,6 +485,7 @@ def load_model_from_package( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Load a model from an installed package. @@ -495,10 +501,13 @@ def load_model_from_package( components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. 
""" cls = importlib.import_module(name) - return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined] + return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config, pipe_instances=pipe_instances) # type: ignore[attr-defined] def load_model_from_path( @@ -510,6 +519,7 @@ def load_model_from_path( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Load a model from a data directory path. Creates Language class with pipeline from config.cfg and then calls from_disk() with path. @@ -527,6 +537,9 @@ def load_model_from_path( components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. """ if not model_path.exists(): @@ -543,6 +556,7 @@ def load_model_from_path( enable=enable, exclude=exclude, meta=meta, + pipe_instances=pipe_instances, ) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) @@ -557,6 +571,7 @@ def load_model_from_config( exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, auto_fill: bool = False, validate: bool = True, + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Create an nlp object from a config. Expects the full config file including a section "nlp" containing the settings for the nlp object. @@ -574,6 +589,9 @@ def load_model_from_config( components won't be loaded. auto_fill (bool): Whether to auto-fill config with missing defaults. validate (bool): Whether to show config validation errors. 
+ pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. """ if "nlp" not in config: @@ -593,6 +611,7 @@ def load_model_from_config( auto_fill=auto_fill, validate=validate, meta=meta, + pipe_instances=pipe_instances, ) return nlp @@ -656,6 +675,7 @@ def load_model_from_init_py( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Helper function to use in the `load()` method of a model package's __init__.py. @@ -671,6 +691,9 @@ def load_model_from_init_py( components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. """ model_path = Path(init_file).parent