From 8b76a5a698fd3bc13d4acc38194221def9ed5163 Mon Sep 17 00:00:00 2001 From: Nuwan Goonasekera Date: Thu, 21 Dec 2017 22:59:50 +0530 Subject: [PATCH 1/3] Added support for recursively discovering output datasets --- lib/galaxy/tools/parameters/output_collect.py | 48 +++++++++----- .../tools/parser/output_collection_def.py | 2 + .../functional/tools/multi_output_recurse.xml | 65 +++++++++++++++++++ test/functional/tools/samples_tool_conf.xml | 1 + .../tools/test_collect_primary_datasets.py | 25 +++++++ 5 files changed, 123 insertions(+), 18 deletions(-) create mode 100644 test/functional/tools/multi_output_recurse.xml diff --git a/lib/galaxy/tools/parameters/output_collect.py b/lib/galaxy/tools/parameters/output_collect.py index a452d5805125..6a69cb6863ed 100644 --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -480,37 +480,48 @@ def discover_files(output_name, tool_provided_metadata, extra_file_collectors, j path = os.path.join(target_directory, filename) yield DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path)) else: - for (match, collector) in walk_over_extra_files(extra_file_collectors, job_working_directory, matchable): + for (match, collector) in walk_over_file_collectors(extra_file_collectors, job_working_directory, matchable): yield DiscoveredFile(match.path, collector, match) -def discover_target_directory(extra_file_collector, job_working_directory): - directory = job_working_directory - if extra_file_collector.directory: - directory = os.path.join(directory, extra_file_collector.directory) +def discover_target_directory(dir_name, job_working_directory): + if dir_name: + directory = os.path.join(job_working_directory, dir_name) if not util.in_directory(directory, job_working_directory): raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.") - return directory - + return directory + else: + return job_working_directory -def walk_over_extra_files(extra_file_collectors, job_working_directory, matchable): +def walk_over_file_collectors(extra_file_collectors, job_working_directory, matchable): for extra_file_collector in extra_file_collectors: assert extra_file_collector.discover_via == "pattern" - matches = [] - directory = discover_target_directory(extra_file_collector, job_working_directory) - if not os.path.isdir(directory): - continue - for filename in os.listdir(directory): - path = os.path.join(directory, filename) - if not os.path.isfile(path): - continue + for match in walk_over_extra_files(extra_file_collector.directory, extra_file_collector, job_working_directory, matchable): + yield match, extra_file_collector + + +def walk_over_extra_files(target_dir, extra_file_collector, job_working_directory, matchable): + """ + Walks through all files in a given directory, and returns all files that + match the given collector's match criteria. If the collector has the + recurse flag enabled, will also recursively descend into child folders. + """ + matches = [] + directory = discover_target_directory(target_dir, job_working_directory) + for filename in os.listdir(directory): + path = os.path.join(directory, filename) + if os.path.isdir(path) and extra_file_collector.recurse: + # The current directory is already validated, so use that as the next job_working_directory when recursing + for match in walk_over_extra_files(filename, extra_file_collector, directory, matchable): + yield match + else: match = extra_file_collector.match(matchable, filename, path=path) if match: matches.append(match) - for match in extra_file_collector.sort(matches): - yield match, extra_file_collector + for match in extra_file_collector.sort(matches): + yield match def dataset_collector(dataset_collection_description): @@ -551,6 +562,7 @@ def __init__(self, dataset_collection_description): self.default_visible = dataset_collection_description.default_visible self.directory = dataset_collection_description.directory self.assign_primary_output = dataset_collection_description.assign_primary_output + self.recurse = dataset_collection_description.recurse def _pattern_for_dataset(self, dataset_instance=None): token_replacement = r'\d+' diff --git a/lib/galaxy/tools/parser/output_collection_def.py b/lib/galaxy/tools/parser/output_collection_def.py index d80f52a3042c..e95c140a7d4c 100644 --- a/lib/galaxy/tools/parser/output_collection_def.py +++ b/lib/galaxy/tools/parser/output_collection_def.py @@ -63,6 +63,7 @@ def __init__(self, **kwargs): self.default_visible = asbool(kwargs.get("visible", None)) self.assign_primary_output = asbool(kwargs.get('assign_primary_output', False)) self.directory = kwargs.get("directory", None) + self.recurse = False class ToolProvidedMetadataDatasetCollection(DatasetCollectionDescription): @@ -77,6 +78,7 @@ class FilePatternDatasetCollectionDescription(DatasetCollectionDescription): def __init__(self, **kwargs): super(FilePatternDatasetCollectionDescription, self).__init__(**kwargs) pattern = kwargs.get("pattern", "__default__") + self.recurse = asbool(kwargs.get("recurse", False)) if pattern in NAMED_PATTERNS: pattern = NAMED_PATTERNS.get(pattern) self.pattern = pattern diff --git a/test/functional/tools/multi_output_recurse.xml b/test/functional/tools/multi_output_recurse.xml new file mode 100644 index 000000000000..b397f1a3808f --- /dev/null +++ b/test/functional/tools/multi_output_recurse.xml @@ -0,0 +1,65 @@ + + + echo "Hello" > $report; + mkdir subdir1; + echo "This" > subdir1/this.txt; + echo "That" > subdir1/that.txt; + mkdir subdir2; + echo "1" > subdir2/CUSTOM_1.txt; + echo "2" > subdir2/CUSTOM_2.txt; + mkdir subdir3; + echo "Foo" > subdir3/Foo; + mkdir subdir3/nested1; + echo "Bar" > subdir3/nested1/bar.txt; + echo "Hello" > subdir3/nested1/hello; + echo "1" > sample1.report.txt; + echo "2" > sample2.report.txt; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/functional/tools/samples_tool_conf.xml b/test/functional/tools/samples_tool_conf.xml index 5bd62a7cbb8b..99f4c609e431 100644 --- a/test/functional/tools/samples_tool_conf.xml +++ b/test/functional/tools/samples_tool_conf.xml @@ -17,6 +17,7 @@ + diff --git a/test/unit/tools/test_collect_primary_datasets.py b/test/unit/tools/test_collect_primary_datasets.py index 4e10ad2132f4..9361c773705d 100644 --- a/test/unit/tools/test_collect_primary_datasets.py +++ b/test/unit/tools/test_collect_primary_datasets.py @@ -57,6 +57,31 @@ def test_collect_multiple(self): # didn't result in a dbkey being set. assert created_hda_1.dbkey == "?" + def test_collect_multiple_recurse(self): + self._replace_output_collectors(''' + + + ''') + path1 = self._setup_extra_file(filename="test1", subdir="subdir1") + path2 = self._setup_extra_file(filename="test2", subdir="subdir2/nested1/") + path3 = self._setup_extra_file(filename="test3", subdir="subdir2") + + datasets = self._collect() + assert DEFAULT_TOOL_OUTPUT in datasets + self.assertEquals(len(datasets[DEFAULT_TOOL_OUTPUT]), 3) + + # Test default order of collection. + assert list(datasets[DEFAULT_TOOL_OUTPUT].keys()) == ["test1", "test2", "test3"] + + created_hda_1 = datasets[DEFAULT_TOOL_OUTPUT]["test1"] + self.app.object_store.assert_created_with_path(created_hda_1.dataset, path1) + + created_hda_2 = datasets[DEFAULT_TOOL_OUTPUT]["test2"] + self.app.object_store.assert_created_with_path(created_hda_2.dataset, path2) + + created_hda_3 = datasets[DEFAULT_TOOL_OUTPUT]["test3"] + self.app.object_store.assert_created_with_path(created_hda_3.dataset, path3) + def test_collect_sorted_reverse(self): self._replace_output_collectors(''' From 3c7248a0884588826a4b5cc8fb699ad0e042d7ce Mon Sep 17 00:00:00 2001 From: Nuwan Goonasekera Date: Thu, 21 Dec 2017 23:40:55 +0530 Subject: [PATCH 2/3] Added schema entry for discover_datasets recurse attribute --- lib/galaxy/tools/xsd/galaxy.xsd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/galaxy/tools/xsd/galaxy.xsd b/lib/galaxy/tools/xsd/galaxy.xsd index 506f0c0e8121..7e8136b801f1 100644 --- a/lib/galaxy/tools/xsd/galaxy.xsd +++ b/lib/galaxy/tools/xsd/galaxy.xsd @@ -3929,6 +3929,11 @@ More information can be found on Planemo's documentation for Directory (relative to working directory) to search for files. + + + Indicates that the specified directory should be searched recursively for matching files. + + Format (or datatype) of discovered datasets (an alias with ``ext``). From bae64800f072f84c7bbfda2c29f4330d6ca6552e Mon Sep 17 00:00:00 2001 From: Nuwan Goonasekera Date: Sat, 13 Jan 2018 18:33:43 +0530 Subject: [PATCH 3/3] Fix discover_target_directory and pass correct parameter Fix according to review comment: https://github.com/galaxyproject/galaxy/pull/5240/files#r159138549 --- lib/galaxy/tools/parameters/output_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/tools/parameters/output_collect.py b/lib/galaxy/tools/parameters/output_collect.py index 6a69cb6863ed..3e6ecc645dff 100644 --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -474,7 +474,7 @@ def discover_files(output_name, tool_provided_metadata, extra_file_collectors, j # just load entries from tool provided metadata... assert len(extra_file_collectors) == 1 extra_file_collector = extra_file_collectors[0] - target_directory = discover_target_directory(extra_file_collector, job_working_directory) + target_directory = discover_target_directory(extra_file_collector.directory, job_working_directory) for dataset in tool_provided_metadata.get_new_datasets(output_name): filename = dataset["filename"] path = os.path.join(target_directory, filename)