Enable multiprocessing groups within project config #10774

Closed
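The change in brief: the `workflows` section of `project.yml` may now mix plain command names with lists of command names. A list is a "multiprocessing group" whose members run in parallel, and the workflow only continues once every member has finished. A minimal sketch of the parsed structure, with hypothetical command names:

```python
# Parsed equivalent of a hypothetical project.yml using the new syntax.
# Command names ("preprocess", "train", ...) are illustrative only.
config = {
    "workflows": {
        "all": [
            "preprocess",            # runs first, on its own
            ["train", "make-docs"],  # multiprocessing group: members run in parallel
            "package",               # runs only after the whole group has joined
        ]
    }
}
```

The schema, validation, execution, documentation, and DVC changes below all follow from allowing this nesting.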
62 commits
8d08a68
Permit multiprocessing groups in YAML
richardpaulhudson May 9, 2022
12e8600
Basic multiprocessing functionality
richardpaulhudson May 9, 2022
e3b4ee7
Mypy corrections
richardpaulhudson May 9, 2022
a2bd489
Secondary functionality and documentation
richardpaulhudson May 10, 2022
8c8b81a
Fixed formatting issues
richardpaulhudson May 10, 2022
a481698
Corrections
richardpaulhudson May 10, 2022
ae82568
Changes after review
richardpaulhudson May 20, 2022
4daffdd
Changes based on review
richardpaulhudson May 23, 2022
2eb13f2
Readability improvement
richardpaulhudson May 23, 2022
9e665f9
Changes after internal discussions
richardpaulhudson Jun 20, 2022
5de1009
Add max_parallel_processes documentation
richardpaulhudson Jun 20, 2022
1cdb92d
Correction
richardpaulhudson Jun 20, 2022
e6a11b5
Extend scope of test
richardpaulhudson Jun 20, 2022
9d7a79e
First draft of new implementation (incomplete, doesn't run yet)
richardpaulhudson Jul 14, 2022
3c64e82
Correction
richardpaulhudson Jul 14, 2022
ca403f8
Seems to work, not yet tested in a structured way
richardpaulhudson Jul 18, 2022
e9ee680
Saved (intermediate version, doesn't compile yet)
richardpaulhudson Jul 18, 2022
4c2fc56
Refactoring into separate module
richardpaulhudson Jul 19, 2022
83d0738
Remove unnecessary changes
richardpaulhudson Jul 19, 2022
2c1f58e
Formal state machine
richardpaulhudson Jul 19, 2022
012578f
Improvements / corrections
richardpaulhudson Jul 19, 2022
44d51f4
Add failure test
richardpaulhudson Jul 19, 2022
d87fcab
Corrections / improvements
richardpaulhudson Jul 19, 2022
bc79e5a
Fix for Windows
richardpaulhudson Jul 19, 2022
9d005a7
Correct test
richardpaulhudson Jul 20, 2022
5d150f2
Add comment
richardpaulhudson Jul 20, 2022
f8301b4
Correction
richardpaulhudson Jul 20, 2022
d2bde9a
Reverse unnecessary change
richardpaulhudson Jul 20, 2022
91a173f
Add note to projects.md
richardpaulhudson Jul 20, 2022
4614a89
Improve error message
richardpaulhudson Jul 20, 2022
4902dd6
Final touches
richardpaulhudson Jul 20, 2022
e2c2ba4
Fix Mypy
richardpaulhudson Jul 20, 2022
48803e1
Improve document output
richardpaulhudson Jul 20, 2022
fcf7b6b
Use multiprocessing context
richardpaulhudson Jul 21, 2022
6b0ebcd
Improve error handling with hung processes
richardpaulhudson Jul 21, 2022
1650912
Add failure messages
richardpaulhudson Jul 21, 2022
fd8dbfd
Initial changes based on review comments
richardpaulhudson Jul 22, 2022
5fad119
Update spacy/tests/test_parallel.py
richardpaulhudson Jul 22, 2022
c7a8956
Update spacy/cli/_util.py
richardpaulhudson Jul 22, 2022
c6a4a7b
Merge 'origin/master' into feature/projects-multiprocessing
richardpaulhudson Jul 22, 2022
ac81dc9
Revert accidentally checked-in line
richardpaulhudson Jul 22, 2022
6ac15ad
Correct comment
richardpaulhudson Jul 23, 2022
4eb61a7
More updates based on review comments
richardpaulhudson Jul 25, 2022
10513a0
Format with black
richardpaulhudson Jul 25, 2022
567d006
Log to temporary directory
richardpaulhudson Jul 25, 2022
3d16625
Increase timeout to support GPU tests
richardpaulhudson Jul 25, 2022
5393df4
More changes based on review comments
richardpaulhudson Jul 27, 2022
8faf070
Specify new wasabi version
richardpaulhudson Jul 27, 2022
40416b1
Restore previous wasabi peg
richardpaulhudson Jul 27, 2022
1bf82db
Widened errors caught from os.kill()
richardpaulhudson Aug 24, 2022
78ee9c3
Revert to diagnose error
richardpaulhudson Aug 24, 2022
5aa95ce
Merge branch 'master' into feature/projects-multiprocessing
richardpaulhudson Oct 4, 2022
70fa1ce
Copied changes from spaCy/tmp/project-multiprocess
richardpaulhudson Oct 4, 2022
afba051
Improve error logging
richardpaulhudson Oct 4, 2022
b48f2e1
Correction
richardpaulhudson Oct 4, 2022
522b0ed
Handle PermissionError in Windows CI
richardpaulhudson Oct 4, 2022
c8b7912
Correction
richardpaulhudson Oct 4, 2022
786473d
Switch to use TemporaryDirectory
richardpaulhudson Oct 4, 2022
b8a299f
Merge branch 'explosion:master' into feature/projects-multiprocessing
richardpaulhudson Oct 4, 2022
2cc2cc1
Use mkdtemp()
richardpaulhudson Oct 4, 2022
cfaa902
Merge branch 'master' into feature/projects-multiprocessing
richardpaulhudson Jan 23, 2023
b3bcfe5
Empty commit to trigger CI
richardpaulhudson Jan 24, 2023
41 changes: 31 additions & 10 deletions spacy/cli/_util.py
@@ -1,5 +1,5 @@
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
from typing import TYPE_CHECKING, overload
from typing import TYPE_CHECKING, overload, cast
import sys
import shutil
from pathlib import Path
@@ -221,24 +221,45 @@ def validate_project_commands(config: Dict[str, Any]) -> None:

config (Dict[str, Any]): The loaded config.
"""

def verify_workflow_step(workflow_name: str, step: str) -> None:
if step not in command_names:
msg.fail(
f"Unknown command specified in workflow '{workflow_name}': {step}",
f"Workflows can only refer to commands defined in the 'commands' "
f"section of the {PROJECT_FILE}.",
exits=1,
)

command_names = [cmd["name"] for cmd in config.get("commands", [])]
workflows = config.get("workflows", {})
duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
if duplicates:
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
msg.fail(err, exits=1)
for workflow_name, workflow_steps in workflows.items():
for workflow_name, workflow_step_or_lists in workflows.items():
if workflow_name in command_names:
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
msg.fail(err, exits=1)
for step in workflow_steps:
if step not in command_names:
msg.fail(
f"Unknown command specified in workflow '{workflow_name}': {step}",
f"Workflows can only refer to commands defined in the 'commands' "
f"section of the {PROJECT_FILE}.",
exits=1,
)
for step_or_list in workflow_step_or_lists:
if isinstance(step_or_list, str):
verify_workflow_step(workflow_name, step_or_list)
else:
workflow_list = cast(List[str], step_or_list)
if len(workflow_list) < 2:
msg.fail(
f"Invalid multiprocessing group within '{workflow_name}'.",
f"A multiprocessing group must reference at least two commands.",
exits=1,
)
if len(workflow_list) != len(set(workflow_list)):
msg.fail(
f"A multiprocessing group within '{workflow_name}' contains a command more than once.",
f"This is not permitted because it is then not possible to determine when to rerun.",
exits=1,
)
for step in workflow_list:
verify_workflow_step(workflow_name, step)


def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
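Read as a whole, the new validation enforces three rules for a group: every step must be a known command, a group must contain at least two commands, and no command may appear twice (otherwise there is no unambiguous lockfile entry to decide reruns). A condensed, standalone sketch of those checks, raising exceptions in place of `msg.fail()` and using hypothetical command names:

```python
from typing import Any, Dict, List

def validate_workflows(config: Dict[str, Any]) -> None:
    """Condensed version of the group checks in validate_project_commands()."""
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    for name, steps in config.get("workflows", {}).items():
        for step_or_list in steps:
            if isinstance(step_or_list, str):
                if step_or_list not in command_names:
                    raise ValueError(f"Unknown command in '{name}': {step_or_list}")
            else:
                group: List[str] = step_or_list
                if len(group) < 2:
                    raise ValueError(f"Group in '{name}' needs at least two commands")
                if len(group) != len(set(group)):
                    raise ValueError(f"Group in '{name}' repeats a command")
                for step in group:
                    if step not in command_names:
                        raise ValueError(f"Unknown command in '{name}': {step}")

cfg = {
    "commands": [{"name": "train"}, {"name": "evaluate"}],
    "workflows": {"all": [["train", "evaluate"]]},
}
validate_workflows(cfg)  # passes; a group of one, a repeat, or an unknown name would raise
```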
16 changes: 13 additions & 3 deletions spacy/cli/project/document.py
@@ -14,8 +14,8 @@
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
and will run the specified commands in order. Commands grouped within square brackets
are run in parallel. Commands are only re-run if their inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
in the project directory."""
@@ -69,7 +69,17 @@ def project_document(
md.add(md.table(data, ["Command", "Description"]))
# Workflows
wfs = config.get("workflows", {}).items()
data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
data = []
for n, steps in wfs:
rendered_steps = []
for step in steps:
if isinstance(step, str):
rendered_steps.append(md.code(step))
else:
rendered_steps.append(
"[" + ", ".join(md.code(p_step) for p_step in step) + "]"
)
data.append([md.code(n), " &rarr; ".join(rendered_steps)])
if data:
md.add(md.title(3, "Workflows", "⏭"))
md.add(INTRO_WORKFLOWS)
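The documentation generator now renders a group as a bracketed, comma-separated run of commands inside the usual arrow-joined chain. A small sketch reproducing that rendering, with plain backticks standing in for wasabi's `md.code()`:

```python
def render_steps(steps) -> str:
    # Mirrors the new loop in project_document(): single commands are rendered
    # as code, groups as a bracketed, comma-separated list of code spans.
    code = lambda s: f"`{s}`"
    rendered = []
    for step in steps:
        if isinstance(step, str):
            rendered.append(code(step))
        else:
            rendered.append("[" + ", ".join(code(s) for s in step) + "]")
    return " &rarr; ".join(rendered)

print(render_steps(["preprocess", ["train", "make-docs"], "package"]))
# `preprocess` &rarr; [`train`, `make-docs`] &rarr; `package`
```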
10 changes: 8 additions & 2 deletions spacy/cli/project/dvc.py
@@ -1,5 +1,5 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Controk (DVC). https://dvc.org"""
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
@@ -105,7 +105,13 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in workflows[workflow]:
names = []
for cmdOrMultiprocessingGroup in workflows[workflow]:
if isinstance(cmdOrMultiprocessingGroup, str):
names.append(cmdOrMultiprocessingGroup)
else:
names.extend(cmdOrMultiprocessingGroup)
for name in names:
command = config_commands[name]
deps = command.get("deps", [])
outputs = command.get("outputs", [])
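DVC has no notion of parallel groups, so `update_dvc_config()` simply flattens each group back into the linear order of its members before emitting stages (`print_run_help()` in `run.py` later does the same). The flattening pattern in isolation, with hypothetical names:

```python
steps = ["preprocess", ["train", "make-docs"], "package"]
names = []
for cmd_or_group in steps:
    if isinstance(cmd_or_group, str):
        names.append(cmd_or_group)
    else:
        names.extend(cmd_or_group)  # a group contributes its members in order
print(names)  # ['preprocess', 'train', 'make-docs', 'package']
```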
4 changes: 3 additions & 1 deletion spacy/cli/project/pull.py
@@ -1,3 +1,4 @@
import multiprocessing
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
@@ -37,6 +38,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# We use a while loop here because we don't know how the commands
# will be ordered. A command might need dependencies from one that's later
# in the list.
mult_group_mutex = multiprocessing.Lock()
while commands:
for i, cmd in enumerate(list(commands)):
logger.debug(f"CMD: {cmd['name']}.")
@@ -52,7 +54,7 @@

out_locs = [project_dir / out for out in cmd.get("outputs", [])]
if all(loc.exists() for loc in out_locs):
update_lockfile(project_dir, cmd)
update_lockfile(project_dir, cmd, mult_group_mutex=mult_group_mutex)
# We remove the command from the list here, and break, so that
# we iterate over the loop again.
commands.pop(i)
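`project_pull()` now owns a single `multiprocessing.Lock` and threads it through to `update_lockfile()`, so that when commands later run in parallel their read-modify-write cycles on `project.lock` cannot interleave. A minimal sketch of that guarded update, assuming `srsly` as in the spaCy codebase; the file name and payload are illustrative:

```python
from pathlib import Path
import multiprocessing
import srsly  # same YAML helper the spaCy codebase uses

def guarded_update(lock_path: Path, name: str, entry: dict, mutex) -> None:
    # The whole read-modify-write of the lockfile is one critical section,
    # so concurrent writers can never clobber each other's entries.
    with mutex:
        data = srsly.read_yaml(lock_path) if lock_path.exists() else {}
        data[name] = entry
        srsly.write_yaml(lock_path, data)

mutex = multiprocessing.Lock()
guarded_update(Path("project.lock"), "train", {"script": ["python train.py"]}, mutex)
```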
148 changes: 98 additions & 50 deletions spacy/cli/project/run.py
@@ -1,5 +1,7 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable
from typing import Optional, List, Dict, Sequence, Any, Iterable, cast
from pathlib import Path
from multiprocessing import Process, Lock
from multiprocessing.synchronize import Lock as Lock_t
from wasabi import msg
from wasabi.util import locale_escape
import sys
Expand Down Expand Up @@ -50,6 +52,7 @@ def project_run(
force: bool = False,
dry: bool = False,
capture: bool = False,
mult_group_mutex: Optional[Lock_t] = None,
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
Expand All @@ -67,21 +70,44 @@ def project_run(
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
"""
if mult_group_mutex is None:
mult_group_mutex = Lock()
config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
project_run(
project_dir,
cmd,
overrides=overrides,
force=force,
dry=dry,
capture=capture,
)
for cmdOrMultiprocessingGroup in workflows[subcommand]:
if isinstance(cmdOrMultiprocessingGroup, str):
project_run(
project_dir,
cmdOrMultiprocessingGroup,
overrides=overrides,
force=force,
dry=dry,
capture=capture,
mult_group_mutex=mult_group_mutex,
)
else:
processes = [
Process(
target=project_run,
args=(project_dir, cmd),
kwargs={
"overrides": overrides,
"force": force,
"dry": dry,
"capture": capture,
"mult_group_mutex": mult_group_mutex,
},
)
for cmd in cmdOrMultiprocessingGroup
]
for process in processes:
process.start()
for process in processes:
process.join()
else:
cmd = commands[subcommand]
for dep in cmd.get("deps", []):
@@ -93,13 +119,18 @@
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
msg.divider(subcommand)
rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
rerun = check_rerun(
current_dir,
cmd,
check_spacy_commit=check_spacy_commit,
mult_group_mutex=mult_group_mutex,
)
if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed")
else:
run_commands(cmd["script"], dry=dry, capture=capture)
if not dry:
update_lockfile(current_dir, cmd)
update_lockfile(current_dir, cmd, mult_group_mutex=mult_group_mutex)


def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
Expand All @@ -123,7 +154,12 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
if help_text:
print(f"\n{help_text}\n")
elif subcommand in workflows:
steps = workflows[subcommand]
steps = []
for cmdOrMultiprocessingGroup in workflows[subcommand]:
if isinstance(cmdOrMultiprocessingGroup, str):
steps.append(cmdOrMultiprocessingGroup)
else:
steps.extend(cmdOrMultiprocessingGroup)
print(f"\nWorkflow consisting of {len(steps)} commands:")
steps_data = [
(f"{i + 1}. {step}", commands[step].get("help", ""))
@@ -157,7 +193,7 @@ def run_commands(

commands (List[str]): The string commands.
silent (bool): Don't print the commands.
dry (bool): Perform a dry run and don't execut anything.
dry (bool): Perform a dry run and don't execute anything.
capture (bool): Whether to capture the output and errors of individual commands.
If False, the stdout and stderr will not be redirected, and if there's an error,
sys.exit will be called with the return code. You should use capture=False
@@ -212,6 +248,7 @@ def check_rerun(
*,
check_spacy_version: bool = True,
check_spacy_commit: bool = False,
mult_group_mutex: Lock_t,
) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs
changed.
@@ -224,51 +261,62 @@
# Always rerun if no-skip is set
if command.get("no_skip", False):
return True
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists(): # We don't have a lockfile, run command
return True
data = srsly.read_yaml(lock_path)
if command["name"] not in data: # We don't have info about this command
return True
entry = data[command["name"]]
# Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []):
return True
# Always rerun if spaCy version or commit hash changed
spacy_v = entry.get("spacy_version")
commit = entry.get("spacy_git_version")
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
return True
if check_spacy_commit and commit != GIT_VERSION:
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
return True
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
lock_entry = get_lock_entry(project_dir, command)
exclude = ["spacy_version", "spacy_git_version"]
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
with mult_group_mutex:
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists(): # We don't have a lockfile, run command
return True
data = srsly.read_yaml(lock_path)
if command["name"] not in data: # We don't have info about this command
return True
entry = data[command["name"]]
# Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []):
return True
# Always rerun if spaCy version or commit hash changed
spacy_v = entry.get("spacy_version")
commit = entry.get("spacy_git_version")
if check_spacy_version and not is_minor_version_match(
spacy_v, about.__version__
):
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
msg.info(
f"Re-running '{command['name']}': spaCy minor version changed {info}"
)
return True
if check_spacy_commit and commit != GIT_VERSION:
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
return True
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
lock_entry = get_lock_entry(project_dir, command)
exclude = ["spacy_version", "spacy_git_version"]
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)


def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
def update_lockfile(
project_dir: Path,
command: Dict[str, Any],
mult_group_mutex: Lock_t,
) -> None:
"""Update the lockfile after running a command. Will create a lockfile if
it doesn't yet exist and will add an entry for the current command, its
script and dependencies/outputs.

project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
mult_group_mutex: the mutex preventing concurrent writes
"""
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists():
srsly.write_yaml(lock_path, {})
data = {}
else:
data = srsly.read_yaml(lock_path)
data[command["name"]] = get_lock_entry(project_dir, command)
srsly.write_yaml(lock_path, data)
with mult_group_mutex:
lock_path = project_dir / PROJECT_LOCK
if not lock_path.exists():
srsly.write_yaml(lock_path, {})
data = {}
else:
data = srsly.read_yaml(lock_path)
data[command["name"]] = get_lock_entry(project_dir, command)
srsly.write_yaml(lock_path, data)


def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
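The core of the feature is the new branch in `project_run()`: a string step recurses into `project_run()` as before, while a group becomes one `multiprocessing.Process` per member; all are started, then all are joined before the workflow moves on, and every child shares the lock that `check_rerun()` and `update_lockfile()` use to serialize lockfile access. A self-contained sketch of that start/join pattern, with a trivial stand-in worker in place of the real `project_run()`:

```python
import multiprocessing
from multiprocessing import Process

def run_step(name: str, mutex) -> None:
    # Stand-in for project_run(); the real code re-enters project_run() with
    # the same project_dir/overrides and passes the shared mutex along so the
    # children serialize their reads and writes of project.lock.
    with mutex:
        print(f"finished {name}")

if __name__ == "__main__":
    mutex = multiprocessing.Lock()
    workflow = ["preprocess", ["train", "make-docs"], "package"]
    for step_or_group in workflow:
        if isinstance(step_or_group, str):
            run_step(step_or_group, mutex)
        else:
            procs = [
                Process(target=run_step, args=(cmd, mutex))
                for cmd in step_or_group
            ]
            for p in procs:  # start every member of the group...
                p.start()
            for p in procs:  # ...then wait for all of them before moving on
                p.join()
```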
4 changes: 2 additions & 2 deletions spacy/schemas.py
@@ -458,8 +458,8 @@ class ProjectConfigSchema(BaseModel):
vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
workflows: Dict[StrictStr, List[Union[StrictStr, List[StrictStr]]]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
title: Optional[str] = Field(None, title="Project title")
spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
# fmt: on
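The schema change is what makes the nested lists legal in `project.yml` in the first place: each workflow entry is now a union of a string and a list of strings, which also means groups cannot themselves nest. A minimal sketch of the same union type in isolation, assuming the pydantic v1-style API that spaCy used at the time:

```python
from typing import Dict, List, Union
from pydantic import BaseModel, Field, StrictStr, ValidationError

class Workflows(BaseModel):
    # Same shape as the new field in ProjectConfigSchema.
    workflows: Dict[StrictStr, List[Union[StrictStr, List[StrictStr]]]] = Field({})

Workflows(workflows={"all": ["preprocess", ["train", "make-docs"]]})  # valid
try:
    Workflows(workflows={"all": [["train", ["nested"]]]})  # groups can't nest
except ValidationError as err:
    print(err)
```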