diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 5b89bbc61..30c7d3027 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -121,6 +121,11 @@ jobs: matrix: cwl-version: [v1.0, v1.1, v1.2] container: [docker, singularity, podman] + extras: [""] + include: + - cwl-version: v1.2 + container: docker + extras: "--fast-parser" steps: - uses: actions/checkout@v3 @@ -141,6 +146,7 @@ jobs: version: ${{ matrix.cwl-version }} container: ${{ matrix.container }} spec_branch: main + CWLTOOL_OPTIONS: ${{ matrix.extras }} run: ./conformance-test.sh release_test: diff --git a/README.rst b/README.rst index 4a76a5656..87313bff6 100644 --- a/README.rst +++ b/README.rst @@ -667,6 +667,19 @@ given in the following table; all are optional. +----------------+------------------+----------+------------------------------+ +Enabling Fast Parser (experimental) +----------------------------------- + +For very large workflows, ``cwltool`` can spend a lot of time in +initialization, before the first step runs. There is an experimental +flag ``--fast-parser`` which can dramatically reduce the +initialization overhead; however, as of this writing it has several limitations: + +- Error reporting in general is worse than with the standard parser, so you will want to use it only with workflows that you know are already correct. + +- It does not check for dangling links (these will become runtime errors instead of loading errors). + +- Several other cases fail, as documented in https://github.com/common-workflow-language/cwltool/pull/1720 =========== Development diff --git a/conformance-test.sh b/conformance-test.sh index 90075a7fb..385bfd8f7 100755 --- a/conformance-test.sh +++ b/conformance-test.sh @@ -56,6 +56,7 @@ pip3 install -U setuptools wheel pip pip3 uninstall -y cwltool pip3 install -e . 
pip3 install codecov cwltest>=2.1 +root_folder=${PWD} pushd "${repo}-${spec_branch}" || exit 1 # shellcheck disable=SC2043 @@ -71,6 +72,7 @@ cat > "${COVERAGE_RC}" < 0 )); then else EXCLUDE="" fi +export CWLTOOL_OPTIONS +echo CWLTOOL_OPTIONS="${CWLTOOL_OPTIONS}" # shellcheck disable=SC2086 LC_ALL=C.UTF-8 ./run_test.sh --junit-xml=result3.xml ${EXCLUDE} \ RUNNER=${CWLTOOL_WITH_COV} "-j$(nproc)" ${BADGE} \ - ${DRAFT} "${EXTRA}" \ - "--classname=py3_${container}" + ${DRAFT} \ + "--classname=py3_${container}_$(echo ${CWLTOOL_OPTIONS} | tr "[:blank:]-" _)" # LC_ALL=C is to work around junit-xml ASCII only bug # capture return code of ./run_test.sh diff --git a/cwltool/argparser.py b/cwltool/argparser.py index 8a2328fce..2d5246ee0 100644 --- a/cwltool/argparser.py +++ b/cwltool/argparser.py @@ -576,6 +576,13 @@ def arg_parser() -> argparse.ArgumentParser: default=True, help=argparse.SUPPRESS, ) + parser.add_argument( + "--fast-parser", + dest="fast_parser", + action="store_true", + default=False, + help=argparse.SUPPRESS, + ) reggroup = parser.add_mutually_exclusive_group() reggroup.add_argument( diff --git a/cwltool/context.py b/cwltool/context.py index ef0a7e4f6..4afb1ea10 100644 --- a/cwltool/context.py +++ b/cwltool/context.py @@ -4,7 +4,18 @@ import shutil import tempfile import threading -from typing import IO, Any, Callable, Dict, Iterable, List, Optional, TextIO, Union +from typing import ( + IO, + Any, + Callable, + Dict, + Iterable, + List, + Optional, + TextIO, + Tuple, + Union, +) # move to a regular typing import when Python 3.3-3.6 is no longer supported from ruamel.yaml.comments import CommentedMap @@ -23,6 +34,8 @@ from .utils import DEFAULT_TMP_PREFIX, CWLObjectType, HasReqsHints, ResolverType if TYPE_CHECKING: + from cwl_utils.parser.cwl_v1_2 import LoadingOptions + from .process import Process from .provenance import ResearchObject # pylint: disable=unused-import from .provenance_profile import ProvenanceProfile @@ -102,7 +115,10 @@ def __init__(self, 
def _fast_parser_convert_stdstreams_to_files(
    processobj: Union[cwl_v1_2.Process, MutableSequence[cwl_v1_2.Process]]
) -> None:
    """
    Apply ``cwl_v1_2_utils.convert_stdstreams_to_files`` to every tool reachable.

    A CommandLineTool is converted directly, a Workflow is searched through the
    ``run`` field of each of its steps, and a sequence is searched element by
    element. Any other value (for example a ``run`` field that is not a parsed
    process object) is left untouched.

    :param processobj: a parsed process object, or a sequence of them.
    """
    if isinstance(processobj, cwl_v1_2.CommandLineTool):
        cwl_v1_2_utils.convert_stdstreams_to_files(processobj)
    elif isinstance(processobj, cwl_v1_2.Workflow):
        for st in processobj.steps:
            _fast_parser_convert_stdstreams_to_files(st.run)
    elif isinstance(processobj, MutableSequence):
        for p in processobj:
            _fast_parser_convert_stdstreams_to_files(p)


def _fast_parser_expand_hint_class(
    hints: Optional[Any], loadingOptions: cwl_v1_2.LoadingOptions
) -> None:
    """
    Expand namespace-prefixed ``class`` values of hints, in place.

    A hint whose ``class`` is ``prefix:Name`` has that value replaced by the
    namespace IRI registered for ``prefix`` followed by ``Name``. Entries that
    are not mappings, have no ``class`` key, or whose ``class`` is not a string
    are skipped.

    :param hints: the ``hints`` field of a process or step; anything that is
        not a mutable sequence is ignored.
    :param loadingOptions: supplies the ``namespaces`` prefix-to-IRI table.
    """
    if not isinstance(hints, MutableSequence):
        return

    namespaces = loadingOptions.namespaces
    if not namespaces:
        return

    for h in hints:
        if not (isinstance(h, MutableMapping) and "class" in h):
            continue
        hint_class = h["class"]
        if not isinstance(hint_class, str):
            continue
        for prefix, iri in namespaces.items():
            if hint_class.startswith(prefix + ":"):
                h["class"] = iri + hint_class[len(prefix) + 1 :]
                # Expand exactly once: the resulting IRI (e.g. "http://...")
                # must not be matched against the remaining prefixes.
                break


def _fast_parser_handle_hints(
    processobj: Union[cwl_v1_2.Process, MutableSequence[cwl_v1_2.Process]],
    loadingOptions: cwl_v1_2.LoadingOptions,
) -> None:
    """
    Expand hint class names on a process and everything nested inside it.

    CommandLineTool and Workflow objects have their own hints expanded. For a
    Workflow, the hints of every step are expanded as well, and the step's
    ``run`` target is processed recursively. A sequence is processed element
    by element.

    :param processobj: a parsed process object, or a sequence of them.
    :param loadingOptions: supplies the namespace table used for expansion.
    """
    if isinstance(processobj, (cwl_v1_2.CommandLineTool, cwl_v1_2.Workflow)):
        _fast_parser_expand_hint_class(processobj.hints, loadingOptions)

    if isinstance(processobj, cwl_v1_2.Workflow):
        for st in processobj.steps:
            _fast_parser_expand_hint_class(st.hints, loadingOptions)
            _fast_parser_handle_hints(st.run, loadingOptions)
    elif isinstance(processobj, MutableSequence):
        for p in processobj:
            _fast_parser_handle_hints(p, loadingOptions)


def update_index(document_loader: Loader, pr: CommentedMap) -> None:
    """
    Register ``pr`` in the loader's index under its ``id``.

    Objects without an ``id`` key are ignored.
    """
    if "id" in pr:
        document_loader.idx[pr["id"]] = pr


def fast_parser(
    workflowobj: Union[CommentedMap, CommentedSeq, None],
    fileuri: Optional[str],
    uri: str,
    loadingContext: LoadingContext,
) -> Tuple[Union[CommentedMap, CommentedSeq], CommentedMap]:
    """
    Resolve ``uri`` using the cwl-utils generated CWL v1.2 parser.

    If ``uri`` is not yet in ``loadingContext.codegen_idx`` the document is
    loaded with ``cwl_v1_2.load_document_with_metadata``, which is expected to
    populate that index. The indexed object then has its standard streams
    converted and its hint classes expanded, and is saved back to plain data.

    When ``loadingContext.loader`` is set, its graph is extended with the graph
    of the parsed document and its index is updated with every
    CommandLineTool, Workflow and ExpressionTool of the fragment-less document,
    so that both indexes agree.

    :param workflowobj: the raw document; may be None when ``uri`` is already
        in ``loadingContext.codegen_idx``.
    :param fileuri: base URI of the document, passed to the parser.
    :param uri: identifier of the object to return (may carry a fragment).
    :param loadingContext: holds the fast-parser index and, optionally, the
        document loader to keep in sync.
    :returns: the process object and its metadata, both as commented
        containers.
    :raises KeyError: if ``uri`` is absent from the index after loading.
    """
    lopt = cwl_v1_2.LoadingOptions(idx=loadingContext.codegen_idx, fileuri=fileuri)

    if uri not in loadingContext.codegen_idx:
        cwl_v1_2.load_document_with_metadata(
            workflowobj,
            fileuri,
            loadingOptions=lopt,
            addl_metadata_fields=["id", "cwlVersion"],
        )

    objects, loadopt = loadingContext.codegen_idx[uri]

    _fast_parser_convert_stdstreams_to_files(objects)
    _fast_parser_handle_hints(objects, loadopt)

    processobj: Union[MutableMapping[str, Any], MutableSequence[Any], float, str, None]

    processobj = cwl_v1_2.save(objects, relative_uris=False)

    metadata: Dict[str, Any] = {}
    metadata["id"] = loadopt.fileuri

    if loadopt.namespaces:
        metadata["$namespaces"] = loadopt.namespaces
    if loadopt.schemas:
        metadata["$schemas"] = loadopt.schemas
    if loadopt.baseuri:
        metadata["$base"] = loadopt.baseuri
    for k, v in loadopt.addl_metadata.items():
        # Prefer the value carried by the saved process object itself.
        if isinstance(processobj, MutableMapping) and k in processobj:
            metadata[k] = processobj[k]
        else:
            metadata[k] = v

    loader = loadingContext.loader
    if loader is not None:
        loader.graph += loadopt.graph

        # Need to match the document loader's index with the fast parser index
        # Get the base URI (no fragments) for documents that use $graph
        nofrag = urllib.parse.urldefrag(uri)[0]
        file_objects, _ = loadingContext.codegen_idx[nofrag]
        fileobj = cmap(
            cast(
                Union[int, float, str, Dict[str, Any], List[Any], None],
                cwl_v1_2.save(file_objects, relative_uris=False),
            )
        )
        visit_class(
            fileobj,
            ("CommandLineTool", "Workflow", "ExpressionTool"),
            partial(update_index, loader),
        )

    return cast(
        Union[CommentedMap, CommentedSeq],
        cmap(cast(Union[Dict[str, Any], List[Any]], processobj)),
    ), cast(CommentedMap, cmap(metadata))
+ # + # Note that at the moment, fast_parser code path is considered + # functionally the same as resolve_all() for this case. + # + processobj, metadata = document_loader.resolve_ref(uri) + elif loadingContext.fast_parser: + processobj, metadata = fast_parser(workflowobj, fileuri, uri, loadingContext) + else: + document_loader.resolve_all(workflowobj, fileuri) + processobj, metadata = document_loader.resolve_ref(uri) + if not isinstance(processobj, (CommentedMap, CommentedSeq)): raise ValidationException("Workflow must be a CommentedMap or CommentedSeq.") @@ -405,12 +545,12 @@ def resolve_and_validate_document( if isinstance(processobj, CommentedMap): uri = processobj["id"] - _convert_stdstreams_to_files(workflowobj) + if not loadingContext.fast_parser: + _convert_stdstreams_to_files(workflowobj) if isinstance(jobobj, CommentedMap): loadingContext.jobdefaults = jobobj - loadingContext.loader = document_loader loadingContext.avsc_names = avsc_names loadingContext.metadata = metadata @@ -429,12 +569,10 @@ def resolve_and_validate_document( ) document_loader.idx[processobj["id"]] = processobj - def update_index(pr: CommentedMap) -> None: - if "id" in pr: - document_loader.idx[pr["id"]] = pr - visit_class( - processobj, ("CommandLineTool", "Workflow", "ExpressionTool"), update_index + processobj, + ("CommandLineTool", "Workflow", "ExpressionTool"), + partial(update_index, document_loader), ) return loadingContext, uri @@ -446,7 +584,18 @@ def make_tool( """Make a Python CWL object.""" if loadingContext.loader is None: raise ValueError("loadingContext must have a loader") - resolveduri, metadata = loadingContext.loader.resolve_ref(uri) + + resolveduri: Union[float, str, CommentedMap, CommentedSeq, None] + metadata: CWLObjectType + + if ( + loadingContext.fast_parser + and isinstance(uri, str) + and not loadingContext.skip_resolve_all + ): + resolveduri, metadata = fast_parser(None, None, uri, loadingContext) + else: + resolveduri, metadata = 
loadingContext.loader.resolve_ref(uri) processobj = None if isinstance(resolveduri, MutableSequence): diff --git a/cwltool/main.py b/cwltool/main.py index 63e3d8679..e82a8ee6c 100755 --- a/cwltool/main.py +++ b/cwltool/main.py @@ -1011,7 +1011,8 @@ def main( argsl = sys.argv[1:] addl = [] # type: List[str] if "CWLTOOL_OPTIONS" in os.environ: - addl = os.environ["CWLTOOL_OPTIONS"].split(" ") + c_opts = os.environ["CWLTOOL_OPTIONS"].split(" ") + addl = [x for x in c_opts if x != ""] parser = arg_parser() argcomplete.autocomplete(parser) args = parser.parse_args(addl + argsl) diff --git a/cwltool/update.py b/cwltool/update.py index e60c696f9..04a2d95d5 100644 --- a/cwltool/update.py +++ b/cwltool/update.py @@ -124,14 +124,23 @@ def fix_inputBinding(t: CWLObjectType) -> None: upd = upd["$graph"] for proc in aslist(upd): proc.setdefault("hints", CommentedSeq()) - proc["hints"].insert( - 0, CommentedMap([("class", "NetworkAccess"), ("networkAccess", True)]) + na = CommentedMap([("class", "NetworkAccess"), ("networkAccess", True)]) + + if hasattr(proc.lc, "filename"): + comment_filename = proc.lc.filename + else: + comment_filename = "" + na.lc.filename = comment_filename + + proc["hints"].insert(0, na) + + ll = CommentedMap( + [("class", "LoadListingRequirement"), ("loadListing", "deep_listing")] ) + ll.lc.filename = comment_filename proc["hints"].insert( 0, - CommentedMap( - [("class", "LoadListingRequirement"), ("loadListing", "deep_listing")] - ), + ll, ) if "cwlVersion" in proc: del proc["cwlVersion"] diff --git a/mypy-requirements.txt b/mypy-requirements.txt index 9118c91a0..6d7fbd533 100644 --- a/mypy-requirements.txt +++ b/mypy-requirements.txt @@ -1,7 +1,7 @@ mypy==0.971 ruamel.yaml>=0.16.0,<0.17.22 schema-salad>=8.2.20211104054942,<9 -cwl-utils>=0.15 +cwl-utils >=0.19 types-requests types-setuptools types-psutil diff --git a/requirements.txt b/requirements.txt index 61213db5a..f485c88f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,4 @@ 
pydot>=1.4.1 argcomplete>=1.12.0 pyparsing != 3.0.2 # breaks --print-dot (pydot) https://github.com/pyparsing/pyparsing/issues/319 pyparsing < 3;python_version<='3.6' # breaks --print-dot -cwl-utils>=0.15 +cwl-utils>=0.19 diff --git a/setup.py b/setup.py index ddac1aafb..ee681a234 100644 --- a/setup.py +++ b/setup.py @@ -121,7 +121,7 @@ "pyparsing != 3.0.2", # breaks --print-dot (pydot) https://github.com/pyparsing/pyparsing/issues/319 "pyparsing < 3 ;python_version<='3.6'", # breaks --print-dot (pydot) "argcomplete", - "cwl-utils >= 0.15", + "cwl-utils >= 0.19", ], extras_require={ "deps": ["galaxy-tool-util >= 22.1.2, <23"],