common-workflow-language · mr-c · Jul 3, 2018 · Feb 27, 2018 · Feb 27, 2018 · Feb 27, 2018
diff --git a/.gitignore b/.gitignore
@@ -30,3 +30,5 @@ typeshed/2and3/ruamel/yaml
 
 #mypy
 .mypy_cache/
+bin/
+lib/
diff --git a/cwltool/argparser.py b/cwltool/argparser.py
@@ -120,6 +120,10 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
                         type=float,
                         default=20)
 
+    parser.add_argument("--provenance",
+                        help="Save provenance to specified folder as a Research Object that capture and aggregate workflow execution and data products.",
+                        type=Text)
+
     exgroup = parser.add_mutually_exclusive_group()
     exgroup.add_argument("--print-rdf", action="store_true",
                          help="Print corresponding RDF graph for workflow and exit")

diff --git a/cwltool/executors.py b/cwltool/executors.py
@@ -3,16 +3,22 @@
 import threading
 
 import os
+import copy
+import uuid
+import datetime
+import time
 from abc import ABCMeta, abstractmethod
-
+import prov.model as prov
 from typing import Dict, Text, Any, Tuple, Set, List
 
+
 from .builder import Builder
 from .errors import WorkflowException
 from .mutation import MutationManager
 from .job import JobBase
-from .process import relocateOutputs, cleanIntermediate, Process
+from .process import relocateOutputs, cleanIntermediate, Process, shortname, uniquename, get_overrides
 from . import loghandler
+from schema_salad.sourceline import SourceLine
 
 _logger = logging.getLogger("cwltool")
 
@@ -36,6 +42,7 @@ def output_callback(self, out, processStatus):
     def run_jobs(self,
                  t,  # type: Process
                  job_order_object,  # type: Dict[Text, Any]
+                 provDoc,
                  logger,
                  **kwargs  # type: Any
                  ):
@@ -44,6 +51,9 @@ def run_jobs(self,
     def execute(self, t,  # type: Process
                 job_order_object,  # type: Dict[Text, Any]
                 logger=_logger,
+                provDoc=None,
+                engineID=None,
+                WorkflowID=None,
                 **kwargs  # type: Any
                 ):
         # type: (...) -> Tuple[Dict[Text, Any], Text]
@@ -66,7 +76,7 @@ def execute(self, t,  # type: Process
             for req in jobReqs:
                 t.requirements.append(req)
 
-        self.run_jobs(t, job_order_object, logger, **kwargs)
+        self.run_jobs(t, job_order_object, provDoc, engineID, WorkflowID, logger, **kwargs)
 
         if self.final_output and self.final_output[0] and finaloutdir:
             self.final_output[0] = relocateOutputs(self.final_output[0], finaloutdir,
@@ -87,22 +97,83 @@ class SingleJobExecutor(JobExecutor):
     def run_jobs(self,
                  t,  # type: Process
                  job_order_object,  # type: Dict[Text, Any]
+                 document,
+                 engineUUID,
+                 WorkflowRunID,
                  logger,
                  **kwargs  # type: Any
                  ):
+        reference_locations={}
+        ProvActivity_dict={}
         jobiter = t.job(job_order_object,
                         self.output_callback,
                         **kwargs)
-
         try:
+            ro = kwargs.get("ro")
             for r in jobiter:
                 if r:
                     builder = kwargs.get("builder", None)  # type: Builder
+
                     if builder is not None:
                         r.builder = builder
                     if r.outdir:
                         self.output_dirs.add(r.outdir)
-                    r.run(**kwargs)
+                    if ro:
+                        #here we are recording provenance of each subprocess of the workflow
+                        if ".cwl" in getattr(r, "name"): #for prospective provenance
+                            steps=[]
+                            for s in r.steps:
+                                stepname="wf:main/"+str(s.name)[5:]
+                                steps.append(stepname)
+                                print("step name is: ", stepname)
+                                document.entity(stepname, {prov.PROV_TYPE: "wfdesc:Process", "prov:type": "prov:Plan"})
+                            #create prospective provenance recording for the workflow
+                            document.entity("wf:main", {prov.PROV_TYPE: "wfdesc:Process", "prov:type": "prov:Plan", "wfdesc:hasSubProcess=":str(steps),  "prov:label":"Prospective provenance"})
+                            customised_job={} #new job object for RO
+                            for e, i in enumerate(r.tool["inputs"]):
+                                with SourceLine(r.tool["inputs"], e, WorkflowException, _logger.isEnabledFor(logging.DEBUG)):
+                                    iid = shortname(i["id"])
+                                    if iid in job_order_object:
+                                        customised_job[iid]= copy.deepcopy(job_order_object[iid]) #add the input element in dictionary for provenance
+                                    elif "default" in i:
+                                        customised_job[iid]= copy.deepcopy(i["default"]) #add the default elements in the dictionary for provenance
+                                    else:
+                                        raise WorkflowException(
+                                            u"Input '%s' not in input object and does not have a default value." % (i["id"]))
+                            #creates master-job.json and returns a dictionary with workflow-level identifiers as keys and the locations or actual values of the attributes as values.
+                            relativised_input_object=ro.create_job(customised_job, kwargs) #call the method to generate a file with customised job
+                            for key, value in relativised_input_object.items():
+                                strvalue=str(value)
+                                if "data" in strvalue:
+                                    shahash="data:"+value.split("/")[-1]
+                                    rel_path=value[3:]
+                                    reference_locations[job_order_object[key]["location"]]=relativised_input_object[key][11:]
+                                    document.entity(shahash, {prov.PROV_TYPE:"wfprov:Artifact"})
+                                    #document.specializationOf(rel_path, shahash) NOTE: THIS NEEDS FIXING, as it requires both params to be entities.
+                                else:
+                                    ArtefactValue="data:"+strvalue
+                                    document.entity(ArtefactValue, {prov.PROV_TYPE:"wfprov:Artifact"})
+                    if ".cwl" not in getattr(r, "name"):
+                        if ro:
+                            ProcessRunID="run:"+str(uuid.uuid4())
+                            #each subprocess is defined as an activity()
+                            provLabel="Run of workflow/packed.cwl#main/"+str(r.name)
+                            ProcessProvActivity = document.activity(ProcessRunID, None, None, {prov.PROV_TYPE: "wfprov:ProcessRun", "prov:label": provLabel})
+                            if hasattr(r, 'name') and ".cwl" not in getattr(r, "name"):
+                                document.wasAssociatedWith(ProcessRunID, engineUUID, str("wf:main/"+r.name))
+                            document.wasStartedBy(ProcessRunID, None, WorkflowRunID, datetime.datetime.now(), None, None)
+                            #this is where each step is run, so this marks the start and end time for the step
+                            r.run(document, WorkflowRunID, ProcessProvActivity, reference_locations, **kwargs)
+                        else:
+                            r.run(**kwargs)
+                        #capture workflow level outputs in the prov doc
+                    if ro:
+                        for eachOutput in self.final_output:
+                            for key, value in eachOutput.items():
+                                outputProvRole="wf:main"+"/"+str(key)
+                                output_checksum="data:"+str(value["checksum"][5:])
+                                document.entity(output_checksum, {prov.PROV_TYPE:"wfprov:Artifact"})
+                                document.wasGeneratedBy(output_checksum, WorkflowRunID, datetime.datetime.now(), None, {"prov:role":outputProvRole })
                 else:
                     logger.error("Workflow cannot make any more progress.")
                     break

diff --git a/cwltool/job.py b/cwltool/job.py
@@ -12,14 +12,20 @@
 import subprocess
 import sys
 import tempfile
+import prov.model as prov
 from abc import ABCMeta, abstractmethod
 from io import open
 from threading import Lock
 
 import shellescape
+
+import time
+import datetime
+from .utils import copytree_with_merge, docker_windows_path_adjust, onWindows
 from typing import (IO, Any, Callable, Dict, Iterable, List, MutableMapping, Text,
                     Union, cast)
 
+
 from .builder import Builder
 from .errors import WorkflowException
 from .pathmapper import PathMapper
@@ -170,11 +176,10 @@ def _setup(self, kwargs):  # type: (Dict) -> None
             _logger.debug(u"[job %s] initial work dir %s", self.name,
                           json.dumps({p: self.generatemapper.mapper(p) for p in self.generatemapper.files()}, indent=4))
 
-    def _execute(self, runtime, env, rm_tmpdir=True, move_outputs="move"):
+    def _execute(self, runtime, env, kwargs, document=None, WorkflowRunID=None, ProcessProvActivity=None,reference_locations=None, rm_tmpdir=True, move_outputs="move"):
         # type: (List[Text], MutableMapping[Text, Text], bool, Text) -> None
-
+        ro = kwargs.get("ro")
         scr, _ = get_feature(self, "ShellCommandRequirement")
-
         shouldquote = None  # type: Callable[[Any], Any]
         if scr:
             shouldquote = lambda x: False
@@ -189,7 +194,19 @@ def _execute(self, runtime, env, rm_tmpdir=True, move_outputs="move"):
                      u' < %s' % self.stdin if self.stdin else '',
                      u' > %s' % os.path.join(self.outdir, self.stdout) if self.stdout else '',
                      u' 2> %s' % os.path.join(self.outdir, self.stderr) if self.stderr else '')
-
+        if hasattr(self, "joborder"):
+            for key, value in getattr(self, "joborder").items():
+                if ro:
+                    provRole=self.name+"/"+str(key)
+                    ProcessRunID=str(ProcessProvActivity._identifier)
+                    if 'location' in str(value):
+                        location=str(value['location'])
+                        if location in reference_locations: #workflow level inputs referenced as hash in prov document
+                            document.used(ProcessRunID, "data:"+str(reference_locations[location]), datetime.datetime.now(), None, {"prov:role":provRole })
+                        else: #add checksum created by cwltool of the intermediate data products. NOTE: will only work if --compute-checksums is enabled.
+                            document.used(ProcessRunID, "data:"+str(value['checksum'][5:]), datetime.datetime.now(),None, {"prov:role":provRole })
+                    else: #add the actual data value in the prov document
+                        document.used(ProcessRunID, "data:"+str(value), datetime.datetime.now(),None, {"prov:role":provRole })
         outputs = {}  # type: Dict[Text,Text]
 
         try:
@@ -214,6 +231,7 @@ def _execute(self, runtime, env, rm_tmpdir=True, move_outputs="move"):
                 stdout_path = absout
 
             commands = [Text(x) for x in (runtime + self.command_line)]
+
             job_script_contents = None  # type: Text
             builder = getattr(self, "builder", None)  # type: Builder
             if builder is not None:
@@ -227,7 +245,6 @@ def _execute(self, runtime, env, rm_tmpdir=True, move_outputs="move"):
                 cwd=self.outdir,
                 job_script_contents=job_script_contents,
             )
-
             if self.successCodes and rcode in self.successCodes:
                 processStatus = "success"
             elif self.temporaryFailCodes and rcode in self.temporaryFailCodes:
@@ -244,6 +261,15 @@ def _execute(self, runtime, env, rm_tmpdir=True, move_outputs="move"):
 
             outputs = self.collect_outputs(self.outdir)
             outputs = bytes2str_in_dicts(outputs)  # type: ignore
+            #creating entities for the outputs produced by each step (in the provenance document) and associating them with
+            #the ProcessRunID
+            if ro:
+                for key, value in outputs.items():
+                    StepOutput_checksum="data:"+str(value["checksum"][5:])
+                    document.entity(StepOutput_checksum, {prov.PROV_TYPE:"wfprov:SubProcessArtifact"})
+                    stepProv="wf:main"+"/"+str(self.name)+"/"+str(key)
+                    ProcessRunID=str(ProcessProvActivity._identifier)
+                    document.wasGeneratedBy(StepOutput_checksum, ProcessRunID, datetime.datetime.now(), None, {"prov:role":stepProv})
 
         except OSError as e:
             if e.errno == 2:
@@ -263,8 +289,12 @@ def _execute(self, runtime, env, rm_tmpdir=True, move_outputs="move"):
 
         if processStatus != "success":
             _logger.warning(u"[job %s] completed %s", self.name, processStatus)
+            if ro:
+                document.wasEndedBy(str(ProcessProvActivity._identifier), None, WorkflowRunID, datetime.datetime.now())
         else:
             _logger.info(u"[job %s] completed %s", self.name, processStatus)
+            if ro:
+                document.wasEndedBy(str(ProcessProvActivity._identifier), None, WorkflowRunID, datetime.datetime.now())
 
         if _logger.isEnabledFor(logging.DEBUG):
             _logger.debug(u"[job %s] %s", self.name, json.dumps(outputs, indent=4))
@@ -283,8 +313,8 @@ def _execute(self, runtime, env, rm_tmpdir=True, move_outputs="move"):
 
 class CommandLineJob(JobBase):
 
-    def run(self, pull_image=True, rm_container=True,
-            rm_tmpdir=True, move_outputs="move", **kwargs):
+    def run(self, document=None, WorkflowRunID=None, ProcessProvActivity=None,reference_locations=None, pull_image=True, rm_container=True,
+            rm_tmpdir=True, move_outputs="move",  **kwargs):
         # type: (bool, bool, bool, Text, **Any) -> None
 
         self._setup(kwargs)
@@ -312,7 +342,7 @@ def run(self, pull_image=True, rm_container=True,
             stageFiles(self.generatemapper, ignoreWritable=self.inplace_update, symLink=True)
             relink_initialworkdir(self.generatemapper, self.outdir, self.builder.outdir, inplace_update=self.inplace_update)
 
-        self._execute([], env, rm_tmpdir=rm_tmpdir, move_outputs=move_outputs)
+        self._execute([], env, kwargs, document, WorkflowRunID, ProcessProvActivity,reference_locations, rm_tmpdir=rm_tmpdir, move_outputs=move_outputs)
 
 
 class ContainerCommandLineJob(JobBase):
@@ -323,17 +353,18 @@ def get_from_requirements(self, r, req, pull_image, dry_run=False):
         # type: (Dict[Text, Text], bool, bool, bool) -> Text
         pass
 
+
+        # type: (bool, bool, bool, Text, **Any) -> None
     @abstractmethod
     def create_runtime(self, env, rm_container, record_container_id, cidfile_dir,
                        cidfile_prefix, **kwargs):
         # type: (MutableMapping[Text, Text], bool, bool, Text, Text, **Any) -> List
         pass
 
-    def run(self, pull_image=True, rm_container=True,
+    def run(self, document=None, WorkflowRunID=None, ProcessProvActivity=None,
+            reference_locations=None, pull_image=True, rm_container=True,
             record_container_id=False, cidfile_dir="",
-            cidfile_prefix="",
-            rm_tmpdir=True, move_outputs="move", **kwargs):
-        # type: (bool, bool, bool, Text, Text, bool, Text, **Any) -> None
+            cidfile_prefix="", rm_tmpdir=True, move_outputs="move", **kwargs):
 
         (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
 
@@ -382,7 +413,7 @@ def run(self, pull_image=True, rm_container=True,
         runtime = self.create_runtime(env, rm_container, record_container_id, cidfile_dir, cidfile_prefix, **kwargs)
         runtime.append(img_id)
 
-        self._execute(runtime, env, rm_tmpdir=rm_tmpdir, move_outputs=move_outputs)
+        self._execute(runtime, env, kwargs, document, WorkflowRunID, ProcessProvActivity, reference_locations, rm_tmpdir=rm_tmpdir, move_outputs=move_outputs) #included kwargs to see if the workflow has been executed using the provenance flag.
 
 
 def _job_popen(
@@ -461,6 +492,7 @@ def _job_popen(
             stderr_path=stderr_path,
             stdin_path=stdin_path,
         )
+
         with open(os.path.join(job_dir, "job.json"), "wb") as f:
             json.dump(job_description, codecs.getwriter('utf-8')(f), ensure_ascii=False)  # type: ignore
         try:

diff --git a/cwltool/lib/python2.7/no-global-site-packages.txt b/cwltool/lib/python2.7/no-global-site-packages.txt