Skip to content

Commit

Permalink
DatasetMatcherClean - Implement DatasetMatcherFactory to reason for whole tool.
Browse files Browse the repository at this point in the history

In a subsequent commit I'll use this central store of all the inputs for a tool to determine whether summary data about collections can be used instead of processing individual datasets one at a time.

Even this commit, though, uses the abstraction to optimize datatype checking and cache common checks when possible — this should lead to far fewer objects being created when processing a large history.
  • Loading branch information
jmchilton committed Apr 27, 2018
1 parent ca80232 commit efe5d8b
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 14 deletions.
6 changes: 6 additions & 0 deletions lib/galaxy/tools/__init__.py
Expand Up @@ -56,6 +56,10 @@
ToolParameter,
workflow_building_modes,
)
from galaxy.tools.parameters.dataset_matcher import (
set_dataset_matcher_factory,
unset_dataset_matcher_factory,
)
from galaxy.tools.parameters.grouping import Conditional, ConditionalWhen, Repeat, Section, UploadDataset
from galaxy.tools.parameters.input_translation import ToolInputTranslator
from galaxy.tools.parameters.meta import expand_meta_parameters
Expand Down Expand Up @@ -1826,7 +1830,9 @@ def to_json(self, trans, kwd={}, job=None, workflow_building_mode=False):
# create tool model
tool_model = self.to_dict(request_context)
tool_model['inputs'] = []
set_dataset_matcher_factory(request_context, self, state_inputs)
self.populate_model(request_context, self.inputs, state_inputs, tool_model['inputs'])
unset_dataset_matcher_factory(request_context)

# create tool help
tool_help = ''
Expand Down
28 changes: 14 additions & 14 deletions lib/galaxy/tools/parameters/basic.py
Expand Up @@ -26,8 +26,7 @@
from galaxy.web import url_for
from . import validation
from .dataset_matcher import (
DatasetCollectionMatcher,
DatasetMatcher
get_dataset_matcher_factory,
)
from .sanitize import ToolParameterSanitizer
from ..parameters import (
Expand Down Expand Up @@ -1484,14 +1483,15 @@ def get_initial_value(self, trans, other_values):
return None
history = trans.history
if history is not None:
dataset_matcher = DatasetMatcher(trans, self, other_values)
dataset_matcher_factory = get_dataset_matcher_factory(trans)
dataset_matcher = dataset_matcher_factory.dataset_matcher(self, other_values)
if isinstance(self, DataToolParameter):
for hda in reversed(history.active_datasets_and_roles):
match = dataset_matcher.hda_match(hda)
if match:
return match.hda
else:
dataset_collection_matcher = DatasetCollectionMatcher(dataset_matcher)
dataset_collection_matcher = dataset_matcher_factory.dataset_collection_matcher(dataset_matcher)
for hdca in reversed(history.active_dataset_collections):
if dataset_collection_matcher.hdca_match(hdca, reduction=self.multiple):
return hdca
Expand Down Expand Up @@ -1790,7 +1790,8 @@ def to_dict(self, trans, other_values={}):
return d

# prepare dataset/collection matching
dataset_matcher = DatasetMatcher(trans, self, other_values)
dataset_matcher_factory = get_dataset_matcher_factory(trans)
dataset_matcher = dataset_matcher_factory.dataset_matcher(self, other_values)
multiple = self.multiple

# build and append a new select option
Expand Down Expand Up @@ -1822,7 +1823,7 @@ def append(list, hda, name, src, keep=False):
append(d['options']['hda'], hda, '(%s) %s' % (hda_state, hda.name), 'hda', True)

# add dataset collections
dataset_collection_matcher = DatasetCollectionMatcher(dataset_matcher)
dataset_collection_matcher = dataset_matcher_factory.dataset_collection_matcher(dataset_matcher)
for hdca in history.active_dataset_collections:
if dataset_collection_matcher.hdca_match(hdca, reduction=multiple):
append(d['options']['hdca'], hdca, hdca.name, 'hdca')
Expand Down Expand Up @@ -1859,18 +1860,15 @@ def _history_query(self, trans):
dataset_collection_type_descriptions = trans.app.dataset_collections_service.collection_type_descriptions
return history_query.HistoryQuery.from_parameter(self, dataset_collection_type_descriptions)

def match_collections(self, trans, history, dataset_matcher):
def match_collections(self, trans, history, dataset_collection_matcher):
dataset_collections = trans.app.dataset_collections_service.history_dataset_collections(history, self._history_query(trans))
dataset_collection_matcher = DatasetCollectionMatcher(dataset_matcher)

for dataset_collection_instance in dataset_collections:
if not dataset_collection_matcher.hdca_match(dataset_collection_instance):
continue
yield dataset_collection_instance

def match_multirun_collections(self, trans, history, dataset_matcher):
dataset_collection_matcher = DatasetCollectionMatcher(dataset_matcher)

def match_multirun_collections(self, trans, history, dataset_collection_matcher):
for history_dataset_collection in history.active_dataset_collections:
if not self._history_query(trans).can_map_over(history_dataset_collection):
continue
Expand Down Expand Up @@ -1947,10 +1945,12 @@ def to_dict(self, trans, other_values=None):
return d

# prepare dataset/collection matching
dataset_matcher = DatasetMatcher(trans, self, other_values)
dataset_matcher_factory = get_dataset_matcher_factory(trans)
dataset_matcher = dataset_matcher_factory.dataset_matcher(self, other_values)
dataset_collection_matcher = dataset_matcher_factory.dataset_collection_matcher(dataset_matcher)

# append directly matched collections
for hdca in self.match_collections(trans, history, dataset_matcher):
for hdca in self.match_collections(trans, history, dataset_collection_matcher):
d['options']['hdca'].append({
'id' : trans.security.encode_id(hdca.id),
'hid' : hdca.hid,
Expand All @@ -1960,7 +1960,7 @@ def to_dict(self, trans, other_values=None):
})

# append matching subcollections
for hdca in self.match_multirun_collections(trans, history, dataset_matcher):
for hdca in self.match_multirun_collections(trans, history, dataset_collection_matcher):
subcollection_type = self._history_query(trans).can_map_over(hdca).collection_type
d['options']['hdca'].append({
'id' : trans.security.encode_id(hdca.id),
Expand Down
40 changes: 40 additions & 0 deletions lib/galaxy/tools/parameters/dataset_matcher.py
Expand Up @@ -5,6 +5,46 @@
log = getLogger(__name__)


def set_dataset_matcher_factory(trans, tool, param_values):
    """Attach a tool-scoped ``DatasetMatcherFactory`` to the transaction."""
    factory = DatasetMatcherFactory(trans, tool, param_values)
    trans.dataset_matcher_factory = factory


def unset_dataset_matcher_factory(trans):
    """Clear any tool-scoped matcher factory installed on the transaction."""
    setattr(trans, "dataset_matcher_factory", None)


def get_dataset_matcher_factory(trans):
    """Return the matcher factory installed on ``trans``, or a fresh tool-less one.

    Falls back to a bare ``DatasetMatcherFactory`` when no (truthy) factory has
    been set on the transaction, e.g. outside ``populate_model``.
    """
    if getattr(trans, "dataset_matcher_factory", None):
        return trans.dataset_matcher_factory
    return DatasetMatcherFactory(trans)


class DatasetMatcherFactory(object):
    """Central builder for dataset/collection matchers during one tool request.

    When constructed with a ``tool`` and its ``param_values``, the factory
    walks the tool's inputs and records every data-oriented parameter in
    ``self._data_inputs``, so the whole set of a tool's data inputs can be
    reasoned about at once. Constructed bare (no tool), it simply delegates
    to the per-parameter matcher classes.
    """

    def __init__(self, trans, tool=None, param_values=None):
        self._trans = trans
        # Retained for whole-tool reasoning; not read in this module yet.
        self._tool = tool
        self._data_inputs = []
        if tool is not None and param_values is not None:
            self._collect_data_inputs(tool, param_values)

    def _collect_data_inputs(self, tool, param_values):
        """Record each data/collection parameter encountered in the tool's inputs."""
        def visitor(input, value, prefix, parent=None, **kwargs):
            # Matching on the type name (instead of isinstance) avoids importing
            # basic.py, which itself imports this module.
            type_name = type(input).__name__
            # NOTE(review): the collection parameter class appears to be named
            # DataCollectionToolParameter elsewhere — confirm the second
            # substring below actually matches it.
            if "DataToolParameter" in type_name or "DatasetCollectionToolParameter" in type_name:
                self._data_inputs.append(input)

        tool.visit_inputs(param_values, visitor)

    def dataset_matcher(self, param, other_values):
        """Return a ``DatasetMatcher`` for ``param`` in the current transaction."""
        return DatasetMatcher(self._trans, param, other_values)

    def dataset_collection_matcher(self, dataset_matcher):
        """Wrap ``dataset_matcher`` in a ``DatasetCollectionMatcher``."""
        return DatasetCollectionMatcher(dataset_matcher)


class DatasetMatcher(object):
""" Utility class to aid DataToolParameter and similar classes in reasoning
about what HDAs could match or are selected for a parameter and value.
Expand Down

0 comments on commit efe5d8b

Please sign in to comment.