Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prov data input output #1989

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e3eabfe
Init for the no listing workflow
jjkoehorst Jun 5, 2023
ab0738f
improved test case with checking if the files are there and if they a…
jjkoehorst Jun 12, 2023
c559952
make clean formatting
jjkoehorst Jun 15, 2023
c62979a
Improved naming for the no_listing test and a creation step of 10.000…
jjkoehorst Jun 15, 2023
bc85a1f
Identifying what it is processing for provenance
jjkoehorst Jun 16, 2023
9a6f4ce
For the provenance a --no-input option to not copy input files into t…
jjkoehorst Jun 16, 2023
456b26c
A work in progress to ensure the test works for no-listing and no-input
jjkoehorst Jun 16, 2023
b278f58
Wokring on specifying input/output for provenance
jjkoehorst Jun 18, 2023
fd14f0d
working on no-input
jjkoehorst Jun 22, 2023
95cff2b
move back from data/input data/output to a data folder to see if RO-C…
jjkoehorst Jun 22, 2023
cdbf106
Likely missed one
jjkoehorst Jun 22, 2023
4d8ae62
Moved back to provenance_constants.DATA with comments for DATAX to kn…
jjkoehorst Jun 22, 2023
29eee5f
Merge branch 'main' into prov_data_input_output for updates
Mar 14, 2024
5fc090b
fix merge err
Mar 14, 2024
18a9bd6
fix loadListing of dir for prov
Mar 22, 2024
fc0e331
misc change of comments
Mar 22, 2024
627026c
fix text, remove vscode space files
Mar 26, 2024
96d1e20
remove redundant texts
Mar 26, 2024
c7963b3
Merge pull request #1986 from ElderMedic/prov_data_input_output
ElderMedic Mar 27, 2024
88b113b
minor changes of logging
Apr 11, 2024
8f52df9
Merge branch 'main' into prov_data_input_output
ElderMedic Apr 11, 2024
55f56b1
Merge branch 'prov_data_input_output' of https://github.com/common-wo…
Apr 11, 2024
c979e29
Result of make cleanup
jjkoehorst Apr 17, 2024
0f7df3c
Merge pull request #2004 from common-workflow-language/main
ElderMedic May 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 18 additions & 0 deletions cwltool/argparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,24 @@ def arg_parser() -> argparse.ArgumentParser:
type=str,
)

# TO DO: Not yet implemented
provgroup.add_argument(
"--no-data", # Maybe change to no-input and no-intermediate to ignore those kind of files?...
default=False,
action="store_true",
help="Disables the storage of input and output data files",
dest="no_data",
)

# TO DO: Not yet implemented
provgroup.add_argument(
"--no-input", # Maybe change to no-input and no-intermediate to ignore those kind of files?...
default=False,
action="store_true",
help="Disables the storage of input data files",
dest="no_input",
)

printgroup = parser.add_mutually_exclusive_group()
printgroup.add_argument(
"--print-rdf",
Expand Down
4 changes: 4 additions & 0 deletions cwltool/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,10 @@ def addsf(
datum = cast(CWLObjectType, datum)
ll = schema.get("loadListing") or self.loadListing
if ll and ll != "no_listing":
# Debug show
for k in datum:
_logger.debug("Datum: %s: %s" % (k, datum[k]))
_logger.debug("----------------------------------------")
get_listing(
self.fs_access,
datum,
Expand Down
39 changes: 37 additions & 2 deletions cwltool/cwlprov/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import re
import uuid
from getpass import getuser
from typing import IO, Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union
from typing import IO, Any, Dict, List, Optional, Tuple, TypedDict, Union

from cwltool.cwlprov.provenance_constants import Hasher

from ..loghandler import _logger


def _whoami() -> Tuple[str, str]:
Expand Down Expand Up @@ -135,7 +139,7 @@
def checksum_copy(
src_file: IO[Any],
dst_file: Optional[IO[Any]] = None,
hasher: Optional[Callable[[], "hashlib._Hash"]] = None,
hasher=Hasher, # type: Callable[[], hashlib._Hash]
buffersize: int = 1024 * 1024,
) -> str:
"""Compute checksums while copying a file."""
Expand All @@ -158,6 +162,37 @@
pass
if os.path.exists(temp_location):
os.rename(temp_location, dst_file.name) # type: ignore

return content_processor(contents, src_file, dst_file, checksum, buffersize)


def checksum_only(
src_file: IO[Any],
dst_file: Optional[IO[Any]] = None,
hasher=Hasher, # type: Callable[[], hashlib._Hash]
buffersize: int = 1024 * 1024,
) -> str:
"""Calculate the checksum only, does not copy the data files."""
if dst_file is not None:
_logger.error(

Check warning on line 177 in cwltool/cwlprov/__init__.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/__init__.py#L177

Added line #L177 was not covered by tests
"[Debug Checksum Only] Destination file should be None but it is %s", dst_file
)
"""Compute checksums while copying a file."""

Check warning on line 180 in cwltool/cwlprov/__init__.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/__init__.py#L180

Added line #L180 was not covered by tests
# TODO: Use hashlib.new(Hasher_str) instead?
checksum = hasher()
contents = src_file.read(buffersize)

Check warning on line 183 in cwltool/cwlprov/__init__.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/__init__.py#L182-L183

Added lines #L182 - L183 were not covered by tests
# TODO Could be a function for both checksum_only and checksum_copy?
return content_processor(contents, src_file, dst_file, checksum, buffersize)

Check warning on line 185 in cwltool/cwlprov/__init__.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/__init__.py#L185

Added line #L185 was not covered by tests


def content_processor(
contents: Any,
src_file: IO[Any],
dst_file: Optional[IO[Any]],
checksum: "hashlib._Hash",
buffersize: int,
) -> str:
"""Calculate the checksum based on the content."""
while contents != b"":
if dst_file is not None:
dst_file.write(contents)
Expand Down
8 changes: 7 additions & 1 deletion cwltool/cwlprov/provenance_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@

# Research Object folders
METADATA = "metadata"
DATA = "data"
# sub-folders for data
DATA = "data/input"
# DATAX = "data"
INPUT_DATA = "data/input"
INTM_DATA = "data/intermediate"
OUTPUT_DATA = "data/output"

WORKFLOW = "workflow"
SNAPSHOT = "snapshot"
# sub-folders
Expand Down
22 changes: 17 additions & 5 deletions cwltool/cwlprov/provenance_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from ..stdfsaccess import StdFsAccess
from ..utils import CWLObjectType, JobsType, get_listing, posix_path, versionstring
from ..workflow_job import WorkflowJob
from .provenance_constants import (
from . import provenance_constants
from .provenance_constants import ( # DATAX,
ACCOUNT_UUID,
CWLPROV,
ENCODING,
Expand Down Expand Up @@ -287,6 +288,7 @@
process_run_id: str,
outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
when: datetime.datetime,
# load_listing: None,
) -> None:
self.generate_output_prov(outputs, process_run_id, process_name)
self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when)
Expand All @@ -300,7 +302,11 @@
if "checksum" in value:
csum = cast(str, value["checksum"])
(method, checksum) = csum.split("$", 1)
if method == SHA1 and self.research_object.has_data_file(checksum):
# TODO Input, intermediate or output file?...
# if provenance_constants.DATA == 'data/input'
if method == SHA1 and self.research_object.has_data_file(
provenance_constants.DATA, checksum # DATAX
):
entity = self.document.entity("data:" + checksum)

if not entity and "location" in value:
Expand Down Expand Up @@ -408,8 +414,10 @@
# a later call to this method will sort that
is_empty = True

if "listing" not in value:
get_listing(self.fsaccess, value)
# get loadlisting, and load the listing if not no_listing, recursively if deep_listing
ll = value.get("loadListing")

Check warning on line 418 in cwltool/cwlprov/provenance_profile.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/provenance_profile.py#L418

Added line #L418 was not covered by tests
if ll and ll != "no_listing":
get_listing(self.fsaccess, value, (ll == "deep_listing"))

Check warning on line 420 in cwltool/cwlprov/provenance_profile.py

View check run for this annotation

Codecov / codecov/patch

cwltool/cwlprov/provenance_profile.py#L420

Added line #L420 was not covered by tests
for entry in cast(MutableSequence[CWLObjectType], value.get("listing", [])):
is_empty = False
# Declare child-artifacts
Expand Down Expand Up @@ -604,6 +612,7 @@
job_order: Union[CWLObjectType, List[CWLObjectType]],
process_run_id: str,
name: Optional[str] = None,
load_listing=None,
) -> None:
"""Add used() for each data artefact."""
if isinstance(job_order, list):
Expand Down Expand Up @@ -634,7 +643,10 @@
process_run_id: Optional[str],
name: Optional[str],
) -> None:
"""Call wasGeneratedBy() for each output,copy the files into the RO."""
"""Call wasGeneratedBy() for each output, copy the files into the RO."""
# TODO: Change INPUT_DATA to OUTPUT_DATA?
provenance_constants.DATA = provenance_constants.OUTPUT_DATA

if isinstance(final_output, MutableSequence):
for entry in final_output:
self.generate_output_prov(entry, process_run_id, name)
Expand Down