[WIP] Implement Expression tools in Galaxy.

- Tool definition language, plumbing, and datatype for expressing expressions as jobs. - Allow connecting expression tools to parameters in workflows; this delays evaluation of the workflow until the calculated value is available. - Example test expression tools for testing and demonstration. - [WIP] Workflow expression module to allow users to specify arbitrary expressions.
common-workflow-lab · Jan 18, 2018 · a44455a · a44455a
1 parent 8cc6165
commit a44455a
Show file tree

Hide file tree

Showing 22 changed files with 551 additions and 6 deletions.
diff --git a/config/datatypes_conf.xml.sample b/config/datatypes_conf.xml.sample
@@ -385,6 +385,7 @@
     <!-- End RGenetics Datatypes -->
     <datatype extension="ipynb" type="galaxy.datatypes.text:Ipynb" display_in_upload="true" />
     <datatype extension="json" type="galaxy.datatypes.text:Json" display_in_upload="true" />
+    <datatype extension="expression.json" type="galaxy.datatypes.text:ExpressionJson" display_in_upload="true" />
     <!-- graph datatypes -->
     <datatype extension="xgmml" type="galaxy.datatypes.graph:Xgmml" display_in_upload="true"/>
     <datatype extension="sif" type="galaxy.datatypes.graph:Sif" display_in_upload="true"/>

diff --git a/lib/galaxy/datatypes/text.py b/lib/galaxy/datatypes/text.py
@@ -105,6 +105,30 @@ def display_peek(self, dataset):
             return "JSON file (%s)" % (nice_size(dataset.get_size()))
 
 
+class ExpressionJson(Json):
+    """ Represents the non-data input or output to a tool or workflow.
+    """
+    file_ext = "json"  # NOTE(review): datatypes_conf.xml.sample registers this type as "expression.json" - confirm
+    MetadataElement(name="json_type", default=None, desc="JavaScript or JSON type of expression", readonly=True, visible=True, no_value=None)
+
+    def set_meta(self, dataset, **kwd):
+        """ Store the type of the dataset's top-level JSON value as the
+        ``json_type`` metadata; "null" when none of the checked types match.
+        """
+        # Checked in order; bool subclasses int, so JSON true/false map to "int".
+        type_names = (
+            (int, "int"),
+            (float, "float"),
+            (list, "list"),
+            (dict, "object"),
+        )
+        with open(dataset.file_name) as f:
+            parsed = json.load(f)
+
+        matches = [label for py_type, label in type_names if isinstance(parsed, py_type)]
+        dataset.metadata.json_type = matches[0] if matches else "null"
+
+
 class Ipynb(Json):
     file_ext = "ipynb"
 

diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py
@@ -60,6 +60,7 @@
 from galaxy.tools.parameters.grouping import Conditional, ConditionalWhen, Repeat, Section, UploadDataset
 from galaxy.tools.parameters.input_translation import ToolInputTranslator
 from galaxy.tools.parameters.meta import expand_meta_parameters
+from galaxy.tools.parameters.wrapped_json import json_wrap
 from galaxy.tools.parser import (
     get_tool_source,
     get_tool_source_from_representation,
@@ -2104,6 +2105,71 @@ def exec_before_job(self, app, inp_data, out_data, param_dict=None):
         out.close()
 
 
+class ExpressionTool(Tool):
+    """Tool type 'expression': its command is the fixed expression evaluation script call."""
+    tool_type = 'expression'
+    EXPRESSION_INPUTS_NAME = "_expression_inputs_.json"
+
+    def parse_command(self, tool_source):
+        """Set the fixed script call as command and keep the stripped expression."""
+        expression = tool_source.parse_expression()
+        if expression is None:  # parsers return None when no expression is defined
+            raise Exception("Expression tools must define an expression.")
+        self.command = expressions.EXPRESSION_SCRIPT_CALL
+        self.interpreter = None
+        self._expression = expression.strip()
+
+    def parse_outputs(self, tool_source):
+        """Parse outputs; raise Exception for collections or outputs lacking ``from_expression``."""
+        # Setup self.outputs and self.output_collections
+        super(ExpressionTool, self).parse_outputs(tool_source)
+        if len(self.output_collections) != 0:
+            raise Exception("Expression tools may not declare output collections at this time.")
+        for output in self.outputs.values():
+            if not hasattr(output, "from_expression"):
+                raise Exception("Expression tools may not declare output datasets at this time.")
+
+    def exec_before_job(self, app, inp_data, out_data, param_dict=None):
+        """Write the evaluation script and its JSON inputs file into the job's
+        'working' directory; raises Exception when ``param_dict`` is None."""
+        # Checked up front: param_dict is indexed below, so a later check could never fire.
+        if param_dict is None:
+            raise Exception("Internal error - param_dict is empty.")
+        super(ExpressionTool, self).exec_before_job(app, inp_data, out_data, param_dict=param_dict)
+        working_directory = os.path.join(param_dict["__local_working_directory__"], 'working')
+        expression_inputs_path = os.path.join(working_directory, ExpressionTool.EXPRESSION_INPUTS_NAME)
+
+        outputs = []
+        for out_name in out_data:
+            outputs.append(dict(
+                name=out_name,
+                from_expression=self.outputs[out_name].from_expression,
+                # str() of the wrapped output in param_dict is used as its path
+                path=str(param_dict.get(out_name)),
+            ))
+
+        job = {}
+        json_wrap(self.inputs, param_dict, job, handle_files='OBJECT')
+        expression_inputs = {
+            'job': job,
+            'script': self._expression,
+            'outputs': outputs,
+        }
+        expressions.write_evalute_script(working_directory)
+        with open(expression_inputs_path, "w") as f:
+            json.dump(expression_inputs, f)
+
+    def parse_environment_variables(self, tool_source):
+        """ Setup environment variable for inputs file.
+        """
+        environment_variables_raw = super(ExpressionTool, self).parse_environment_variables(tool_source)
+        environment_variables_raw.append(dict(
+            name="GALAXY_EXPRESSION_INPUTS",
+            template=ExpressionTool.EXPRESSION_INPUTS_NAME,
+        ))
+        return environment_variables_raw
+
+
 class DataSourceTool(OutputParameterJSONTool):
     """
     Alternate implementation of Tool for data_source tools -- those that
@@ -2352,6 +2418,7 @@ class UsesExpressions:
     requires_js_runtime = True
 
     def _expression_environment(self, hda):
+        # TODO: use json_wrap HDA stuff for this...
         raw_as_dict = hda.to_dict()
         filtered_as_dict = {}
         # We are more conservative with the API provided to tools
@@ -2737,7 +2804,7 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history
 
 # Populate tool_type to ToolClass mappings
 tool_types = {}
-for tool_class in [Tool, SetMetadataTool, OutputParameterJSONTool,
+for tool_class in [Tool, SetMetadataTool, OutputParameterJSONTool, ExpressionTool,
                    DataManagerTool, DataSourceTool, AsyncDataSourceTool,
                    UnzipCollectionTool, ZipCollectionTool, MergeCollectionTool, RelabelFromFileTool, FilterFromFileTool,
                    DataDestinationTool]:

diff --git a/lib/galaxy/tools/expressions/__init__.py b/lib/galaxy/tools/expressions/__init__.py
@@ -1,12 +1,20 @@
 from .evaluation import evaluate
 from .sandbox import execjs, interpolate
 from .util import jshead, find_engine
+from .script import (
+    write_evalute_script,
+    EXPRESSION_SCRIPT_CALL,
+    EXPRESSION_SCRIPT_NAME,
+)
 
 
 __all__ = (
     'evaluate',
     'execjs',
+    'EXPRESSION_SCRIPT_CALL',
+    'EXPRESSION_SCRIPT_NAME',
     'find_engine',
     'interpolate',
     'jshead',
+    'write_evalute_script',
 )
diff --git a/lib/galaxy/tools/expressions/script.py b/lib/galaxy/tools/expressions/script.py
@@ -0,0 +1,15 @@
+import os
+
+EXPRESSION_SCRIPT_NAME = "_evaluate_expression_.py"
+EXPRESSION_SCRIPT_CALL = "python %s" % EXPRESSION_SCRIPT_NAME
+
+
+def write_evalute_script(in_directory):
+    """Write the expression-evaluating job script into ``in_directory``.
+
+    Returns the path of the script that was written.
+    """
+    script_path = os.path.join(in_directory, EXPRESSION_SCRIPT_NAME)
+    with open(script_path, "w") as handle:
+        handle.write('from galaxy_ext.expressions.handle_job import run; run()')
+    return script_path
diff --git a/lib/galaxy/tools/parameters/wrapped_json.py b/lib/galaxy/tools/parameters/wrapped_json.py
@@ -46,10 +46,18 @@ def _json_wrap_input(input, value, handle_files="SKIP"):
     elif input_type == "data" and input.multiple:
         if handle_files == "SKIP":
             return SKIP_INPUT
+        # TODO: map over _hda_to_object
         raise NotImplementedError()
     elif input_type == "data":
         if handle_files == "SKIP":
             return SKIP_INPUT
+        elif handle_files == "OBJECT":
+            if value:
+                if isinstance(value, list):
+                    value = value[0]
+                return _hda_to_object(value)
+            else:
+                return None
         raise NotImplementedError()
     elif input_type == "data_collection":
         if handle_files == "SKIP":
@@ -72,6 +80,21 @@ def _json_wrap_input(input, value, handle_files="SKIP"):
     return json_value
 
 
+def _hda_to_object(hda):
+    """Reduce an HDA to a dict of its ``file_ext``, ``name`` and metadata.
+
+    Metadata is every ``metadata_*`` key of ``hda.to_dict()``, prefix removed.
+    """
+    prefix = "metadata_"
+    as_dict = hda.to_dict()
+    metadata = dict((k[len(prefix):], v) for k, v in as_dict.items() if k.startswith(prefix))
+    return {
+        'file_ext': as_dict['file_ext'],
+        'name': as_dict['name'],
+        'metadata': metadata,
+    }
+
+
 def _cast_if_not_none(value, cast_to, empty_to_none=False):
     # log.debug("value [%s], type[%s]" % (value, type(value)))
     if value is None or (empty_to_none and str(value) == ''):

diff --git a/lib/galaxy/tools/parser/interface.py b/lib/galaxy/tools/parser/interface.py
@@ -86,6 +86,11 @@ def parse_command(self):
         """ Return string contianing command to run.
         """
 
+    def parse_expression(self):
+        """ Return string containing the expression to evaluate (None by default).
+        """
+        return None
+
     @abstractmethod
     def parse_environment_variables(self):
         """ Return environment variable templates to expose.

diff --git a/lib/galaxy/tools/parser/output_objects.py b/lib/galaxy/tools/parser/output_objects.py
@@ -21,12 +21,13 @@ class ToolOutput(ToolOutputBase):
       (format, metadata_source, parent)
     """
 
-    dict_collection_visible_keys = ['name', 'format', 'label', 'hidden']
+    dict_collection_visible_keys = ['name', 'format', 'label', 'hidden', 'output_type']
 
     def __init__(self, name, format=None, format_source=None, metadata_source=None,
                  parent=None, label=None, filters=None, actions=None, hidden=False,
                  implicit=False):
         super(ToolOutput, self).__init__(name, label=label, filters=filters, hidden=hidden)
+        self.output_type = "data"
         self.format = format
         self.format_source = format_source
         self.metadata_source = metadata_source
@@ -67,6 +68,27 @@ def to_dict(self, view='collection', value_mapper=None, app=None):
         return as_dict
 
 
+class ToolExpressionOutput(ToolOutputBase):
+    """Tool output described by an ``output_type`` and a ``from_expression``."""
+    dict_collection_visible_keys = ('name', 'format', 'label', 'hidden', 'output_type')
+
+    def __init__(self, name, output_type, from_expression,
+                 label=None, filters=None, actions=None, hidden=False):
+        super(ToolExpressionOutput, self).__init__(name, label=label, filters=filters, hidden=hidden)
+        self.from_expression = from_expression
+        self.output_type = output_type  # JSON type...
+        self.actions = actions
+        self.format = "expression.json"  # galaxy.datatypes.text.ExpressionJson.file_ext
+
+        # Remaining attributes are fixed rather than taken from arguments.
+        self.format_source = None
+        self.metadata_source = None
+        self.parent = None
+        self.from_work_dir = None
+        self.change_format = []
+        self.implicit = False
+
+
 class ToolOutputCollection(ToolOutputBase):
     """
     Represents a HistoryDatasetCollectionAssociation of output datasets produced
@@ -82,6 +104,7 @@ class ToolOutputCollection(ToolOutputBase):
       </collection>
     <outputs>
     """
+    dict_collection_visible_keys = ('name', 'format', 'label', 'hidden', 'output_type')
 
     def __init__(
         self,
@@ -97,6 +120,7 @@ def __init__(
         inherit_metadata=False
     ):
         super(ToolOutputCollection, self).__init__(name, label=label, filters=filters, hidden=hidden)
+        self.output_type = "collection"
         self.collection = True
         self.default_format = default_format
         self.structure = structure

diff --git a/lib/galaxy/tools/parser/xml.py b/lib/galaxy/tools/parser/xml.py
@@ -21,6 +21,7 @@
 from .output_actions import ToolOutputActionGroup
 from .output_collection_def import dataset_collector_descriptions_from_elem
 from .output_objects import (
+    ToolExpressionOutput,
     ToolOutput,
     ToolOutputCollection,
     ToolOutputCollectionStructure
@@ -110,6 +111,12 @@ def parse_command(self):
         command_el = self._command_el
         return ((command_el is not None) and command_el.text) or None
 
+    def parse_expression(self):
+        """ Return string containing the expression to evaluate, or None.
+        """
+        found = self.root.find("expression")
+        return (found.text or None) if found is not None else None
+
     def parse_environment_variables(self):
         environment_variables_el = self.root.find("environment_variables")
         if environment_variables_el is None:
@@ -231,9 +238,12 @@ def _parse(data_elem, **kwds):
             data_dict[output_def.name] = output_def
             return output_def
 
-        map(_parse, out_elem.findall("data"))
+        def _parse_expression(output_elem, **kwds):
+            output_def = self._parse_expression_output(output_elem, tool, **kwds)
+            data_dict[output_def.name] = output_def
+            return output_def
 
-        for collection_elem in out_elem.findall("collection"):
+        def _parse_collection(collection_elem):
             name = collection_elem.get("name")
             label = xml_text(collection_elem, "label")
             default_format = collection_elem.get("format", "data")
@@ -287,6 +297,22 @@ def _parse(data_elem, **kwds):
                 output_collection.outputs[output_name] = data
             output_collections[name] = output_collection
 
+        for out_child in out_elem.getchildren():
+            if out_child.tag == "data":
+                _parse(out_child)
+            elif out_child.tag == "collection":
+                _parse_collection(out_child)
+            elif out_child.tag == "output":
+                output_type = out_child.get("type")
+                if output_type == "data":
+                    _parse(out_child)
+                elif output_type == "collection":
+                    _parse_collection(out_child)
+                else:
+                    _parse_expression(out_child)
+            else:
+                log.warn("Unknown output tag encountered [%s]" % out_child.tag)
+
         for output_def in data_dict.values():
             outputs[output_def.name] = output_def
         return outputs, output_collections
@@ -298,6 +324,7 @@ def _parse_output(
         default_format="data",
         default_format_source=None,
         default_metadata_source="",
+        expression_type=None,
     ):
         output = ToolOutput(data_elem.get("name"))
         output_format = data_elem.get("format", default_format)
@@ -322,6 +349,22 @@ def _parse_output(
         output.dataset_collector_descriptions = dataset_collector_descriptions_from_elem(data_elem, legacy=self.legacy_defaults)
         return output
 
+    def _parse_expression_output(self, output_elem, tool, **kwds):
+        """Build a ToolExpressionOutput from an ``<output>`` element.
+
+        ``tool`` and ``kwds`` are accepted but not used.
+        """
+        expression_output = ToolExpressionOutput(
+            output_elem.get("name"), output_elem.get("type"), output_elem.get("from"),
+        )
+        expression_output.path = output_elem.get("value")
+        expression_output.label = xml_text(output_elem, "label")
+        expression_output.hidden = string_as_bool(output_elem.get("hidden", ""))
+        expression_output.actions = ToolOutputActionGroup(expression_output, output_elem.find('actions'))
+        expression_output.dataset_collector_descriptions = []
+
+        return expression_output
+
     def parse_stdio(self):
         command_el = self._command_el
         detect_errors = None

diff --git a/lib/galaxy/tools/parser/yaml.py b/lib/galaxy/tools/parser/yaml.py
@@ -54,6 +54,9 @@ def parse_require_login(self, default):
     def parse_command(self):
         return self.root_dict.get("command")
 
+    def parse_expression(self):
+        return self.root_dict.get("expression")  # the "expression" entry of root_dict
+
     def parse_environment_variables(self):
         return []