Skip to content

Commit

Permalink
type ro:Folder for directories
Browse files Browse the repository at this point in the history
also export ORE resource map as separate annotation
  • Loading branch information
stain committed Aug 16, 2018
1 parent 492d6ee commit 4382b80
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 17 deletions.
100 changes: 87 additions & 13 deletions cwltool/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ class PermissionError(OSError): # pylint: disable=redefined-builtin
PROVENANCE = os.path.join(METADATA, "provenance")
WFDESC = Namespace("wfdesc", 'http://purl.org/wf4ever/wfdesc#')
WFPROV = Namespace("wfprov", 'http://purl.org/wf4ever/wfprov#')
RO = Namespace("ro", 'http://purl.org/wf4ever/ro#')
ORE = Namespace("ore", 'http://www.openarchives.org/ore/terms/')
FOAF = Namespace("foaf", 'http://xmlns.com/foaf/0.1/')
SCHEMA = Namespace("schema", 'http://schema.org/')
CWLPROV = Namespace('cwlprov', 'https://w3id.org/cwl/prov#')
Expand Down Expand Up @@ -373,6 +375,9 @@ def host_provenance(document):

# info only, won't really be used by prov as sub-resources use /
self.document.add_namespace('researchobject', self.research_object.base_uri)
# annotations
self.metadata_ns = self.document.add_namespace('metadata',
self.research_object.base_uri + _posix_path(METADATA) + "/")
# Pre-register provenance directory so we can refer to its files
self.provenance_ns = self.document.add_namespace('provenance',
self.research_object.base_uri + _posix_path(PROVENANCE) + "/")
Expand Down Expand Up @@ -539,8 +544,10 @@ def declare_artefact(self, value):
# FIXME: Make consistent hash URIs for these
# that somehow include the type
# (so "1" != 1 != "1.0" != true)
return self.document.entity(uuid.uuid4().urn,
e = self.document.entity(uuid.uuid4().urn,
{ provM.PROV_VALUE: value })
self.research_object.add_uri(e.identifier.uri)
return e

elif isinstance(value, (Text, str)):
# Save as string in UTF-8
Expand Down Expand Up @@ -598,37 +605,92 @@ def declare_artefact(self, value):
# attempt to keep it inside the value dictionary
dir_id = value.setdefault("id",
uuid.uuid4().urn)

# New annotation file to keep the ORE Folder listing
ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])

coll = self.document.entity(dir_id,
[ (provM.PROV_TYPE, WFPROV["Artifact"]),
(provM.PROV_TYPE, PROV["Collection"]),
(provM.PROV_TYPE, PROV["Dictionary"]),
(provM.PROV_TYPE, CWLPROV["Directory"]),
(provM.PROV_TYPE, RO["Folder"]),
])
coll_attribs = [] # type ( tuple(Identifier, ProvEntity) )
# ORE description of ro:Folder, saved separately
coll_b = dir_bundle.entity(dir_id,
[
(provM.PROV_TYPE, RO["Folder"]),
(provM.PROV_TYPE, ORE["Aggregation"]),
])
self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)

dir_manifest = dir_bundle.entity(dir_bundle.identifier,
{PROV["type"]: ORE["ResourceMap"],
ORE["describes"]: coll_b.identifier}
)

coll_attribs = [ # type ( tuple(Identifier, ProvEntity) )
(ORE["isDescribedBy"], dir_bundle.identifier )
]
coll_b_attribs = [] # type ( tuple(Identifier, ProvEntity) )

# FIXME: .listing might not be populated yet - hopefully
# a later call to this method will sort that
for f in value.get("listing", []):
# Declare child-artifacts
entity = self.declare_artefact(f)
# TODO: Add filename to PROV-dictionary
self.document.membership(coll, entity)
# Membership
m = self.document.entity(uuid.uuid4().urn)
# Note: only support PROV-O style dictionary
# Membership relation aka our ORE Proxy
m_id = uuid.uuid4().urn
m = self.document.entity(m_id)
m_b = dir_bundle.entity(m_id)

# PROV-O style Dictionary
# https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
# as prov.py do not easily allow PROV-N extensions
# ..as prov.py does not currently allow PROV-N extensions
# like hadDictionaryMember(..)
m.add_asserted_type(PROV["KeyEntityPair"])

m.add_attributes({
PROV["pairKey"]: f["basename"],
PROV["pairEntity"]: entity
PROV["pairEntity"]: entity,
})

# As well as being a
# http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
m_b.add_asserted_type(RO["FolderEntry"])
m_b.add_asserted_type(ORE["Proxy"])
m_b.add_attributes({
RO["entryName"]: f["basename"],
ORE["proxyIn"]: coll,
ORE["proxyFor"]: entity,

})
coll_attribs.append(
(PROV["hadDictionaryMember"], m))
coll_b_attribs.append(
(ORE["aggregates"], m_b))

coll.add_attributes(coll_attribs)
coll_b.add_attributes(coll_b_attribs)

# Also Save ORE Folder as annotation metadata
ore_doc = ProvDocument()
ore_doc.add_namespace(ORE)
ore_doc.add_namespace(RO)
ore_doc.add_namespace(UUID)
ore_doc.add_bundle(dir_bundle)
ore_doc = ore_doc.flattened()
ore_doc_path = posixpath.join(_posix_path(METADATA), ore_doc_fn)
with self.research_object.write_bag_file(ore_doc_path) as provenance_file:
ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle")
self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri)

if not coll_attribs:
# Empty directory
coll.add_asserted_type(PROV["EmptyCollection"])
coll.add_asserted_type(PROV["EmptyDictionary"])
self.research_object.add_uri(coll.identifier.uri)
return coll
else:
# some other kind of dictionary?
Expand Down Expand Up @@ -661,6 +723,7 @@ def declare_artefact(self, value):
coll_attribs.append(
(PROV["hadDictionaryMember"], m))
coll.add_attributes(coll_attribs)
self.research_object.add_uri(coll.identifier.uri)
return coll

# some other kind of Collection?
Expand All @@ -686,13 +749,16 @@ def declare_artefact(self, value):
# we would need to use PROV.Dictionary
# with numeric keys
self.document.membership(coll, e)
self.research_object.add_uri(coll.identifier.uri)
return coll
except TypeError:
_logger.warning("Unrecognized type %s of %r" %
(type(value), value))
# Let's just fall back to Python repr()
return self.document.entity(uuid.uuid4().urn,
e = self.document.entity(uuid.uuid4().urn,
{ provM.PROV_LABEL: repr(value) })
self.research_object.add_uri(e.identifier.uri)
return e

def used_artefacts(self,
job_order, # type: Dict
Expand Down Expand Up @@ -909,6 +975,7 @@ def __init__(self, temp_prefix_ro="tmp", orcid=None, full_name=None):
self.bagged_size = {} # type: Dict
self.tagfiles = set() # type: Set
self._file_provenance = {} # type: Dict
self._external_aggregates = [] # type: List[Dict]
self.annotations = [] # type: List[Dict]
self._content_types = {} # type: Dict[Text,str]

Expand Down Expand Up @@ -1093,7 +1160,7 @@ def guess_mediatype(rel_path):
local_aggregate["conformsTo"] = prov_conforms_to[extension]
return local_aggregate

aggregates = []
aggregates = [] # type: List[Dict]
for path in self.bagged_size.keys():
aggregate_dict = {} # type: Dict[str,Any]

Expand Down Expand Up @@ -1133,10 +1200,9 @@ def guess_mediatype(rel_path):
if path == posixpath.join(METADATA, "manifest.json"):
# Should not really be there yet! But anyway, we won't
# aggregate it.

continue

rel_aggregates = {}
rel_aggregates = {} # type: Dict[str,Any]
# These are local paths like metadata/provenance - but
# we need to relativize them against our current directory,
# as we are saved in metadata/manifest.json
Expand All @@ -1152,8 +1218,16 @@ def guess_mediatype(rel_path):
# make new timestamp?
rel_aggregates.update(self._self_made())
aggregates.append(rel_aggregates)
aggregates.extend(self._external_aggregates)
return aggregates

def add_uri(self, uri, when=None):
    # type: (str, Optional[datetime.datetime]) -> Dict
    """Register ``uri`` as an external aggregate of this research object.

    The entry is built from ``self._self_made(when=when)`` with an added
    ``"uri"`` key and is stored in ``self._external_aggregates``.

    Registration is idempotent per URI: if ``uri`` has already been
    registered, the existing entry is returned unchanged and nothing new
    is appended (``when`` is ignored in that case).

    :param uri: identifier of the resource to aggregate.
    :param when: forwarded to ``_self_made``; presumably the creation
        timestamp of the resource -- confirm against ``_self_made``.
    :return: the aggregate dictionary stored for ``uri``.
    """
    # The same artefact can be declared more than once (e.g. a directory
    # whose listing is only populated on a later call), so avoid piling
    # up duplicate entries for one URI.
    for known in self._external_aggregates:
        if known.get("uri") == uri:
            return known

    aggr = self._self_made(when=when)
    aggr["uri"] = uri
    self._external_aggregates.append(aggr)
    return aggr

def add_annotation(self, about, content, motivatedBy="oa:describing"):
# type: (str, List[str], str) -> str

Expand Down
7 changes: 4 additions & 3 deletions tests/test_provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
# RDF namespaces we'll query for later
ORE = Namespace("http://www.openarchives.org/ore/terms/")
PROV = Namespace("http://www.w3.org/ns/prov#")
RO = Namespace("http://purl.org/wf4ever/ro#")
WFDESC = Namespace("http://purl.org/wf4ever/wfdesc#")
WFPROV = Namespace("http://purl.org/wf4ever/wfprov#")
SCHEMA = Namespace("http://schema.org/")
Expand Down Expand Up @@ -123,7 +124,7 @@ def test_directory_workflow(self):
self.assertTrue(os.path.isfile(p),
"Could not find %s as %s" % (l, p))

def check_provenance(self, nested=False, single_tool=False, directory=True):
def check_provenance(self, nested=False, single_tool=False, directory=False):
self.check_folders()
self.check_bagit()
self.check_ro(nested=nested)
Expand Down Expand Up @@ -357,8 +358,8 @@ def check_prov(self, nested=False, single_tool=False, directory=False):
# TODO: Check g2 statements that it's the same UUID activity inside
# as in the outer step
if directory:
# TODO: Test directory
pass
directories = set(g.subjects(RDF.type, RO.Folder))
self.assertTrue(directories)


class TestConvertPath(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion typeshed/2and3/prov/model.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class ProvDocument(ProvBundle):
def has_bundles(self): ...
@property
def bundles(self): ...
def flattened(self): ...
def flattened(self) -> ProvDocument: ...
def unified(self): ...
def update(self, other: Any) -> None: ...
def add_bundle(self, bundle: Any, identifier: Optional[Any] = ...) -> None: ...
Expand Down

0 comments on commit 4382b80

Please sign in to comment.