Navigation Menu

Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for recursively discovering output datasets #5240

Merged
merged 3 commits on Jan 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
50 changes: 31 additions & 19 deletions lib/galaxy/tools/parameters/output_collect.py
Expand Up @@ -474,43 +474,54 @@ def discover_files(output_name, tool_provided_metadata, extra_file_collectors, j
# just load entries from tool provided metadata...
assert len(extra_file_collectors) == 1
extra_file_collector = extra_file_collectors[0]
target_directory = discover_target_directory(extra_file_collector, job_working_directory)
target_directory = discover_target_directory(extra_file_collector.directory, job_working_directory)
for dataset in tool_provided_metadata.get_new_datasets(output_name):
filename = dataset["filename"]
path = os.path.join(target_directory, filename)
yield DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path))
else:
for (match, collector) in walk_over_extra_files(extra_file_collectors, job_working_directory, matchable):
for (match, collector) in walk_over_file_collectors(extra_file_collectors, job_working_directory, matchable):
yield DiscoveredFile(match.path, collector, match)


def discover_target_directory(extra_file_collector, job_working_directory):
directory = job_working_directory
if extra_file_collector.directory:
directory = os.path.join(directory, extra_file_collector.directory)
def discover_target_directory(dir_name, job_working_directory):
if dir_name:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't equivalent to the previous version, and I guess this is why collecting outputs from tool provided metadata fails (see https://jenkins.galaxyproject.org/job/docker-framework/9815/).
In line 477 this function is called. I suppose changing the call in 477 to pass in extra_file_collector.directory instead of extra_file_collector may fix the failing tests.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mvdbeek Thanks for catching that - I'd missed your comment and I've just pushed in the fix as suggested. I changed it around a few times and must have forgotten to update the parameter.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems to me this method may still have problems xref: #5560

directory = os.path.join(job_working_directory, dir_name)
if not util.in_directory(directory, job_working_directory):
raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.")
return directory

return directory
else:
return job_working_directory

def walk_over_extra_files(extra_file_collectors, job_working_directory, matchable):

def walk_over_file_collectors(extra_file_collectors, job_working_directory, matchable):
for extra_file_collector in extra_file_collectors:
assert extra_file_collector.discover_via == "pattern"
matches = []
directory = discover_target_directory(extra_file_collector, job_working_directory)
if not os.path.isdir(directory):
continue
for filename in os.listdir(directory):
path = os.path.join(directory, filename)
if not os.path.isfile(path):
continue
for match in walk_over_extra_files(extra_file_collector.directory, extra_file_collector, job_working_directory, matchable):
yield match, extra_file_collector


def walk_over_extra_files(target_dir, extra_file_collector, job_working_directory, matchable):
"""
Walks through all files in a given directory, and returns all files that
match the given collector's match criteria. If the collector has the
recurse flag enabled, will also recursively descend into child folders.
"""
matches = []
directory = discover_target_directory(target_dir, job_working_directory)
for filename in os.listdir(directory):
path = os.path.join(directory, filename)
if os.path.isdir(path) and extra_file_collector.recurse:
# The current directory is already validated, so use that as the next job_working_directory when recursing
for match in walk_over_extra_files(filename, extra_file_collector, directory, matchable):
yield match
else:
match = extra_file_collector.match(matchable, filename, path=path)
if match:
matches.append(match)

for match in extra_file_collector.sort(matches):
yield match, extra_file_collector
for match in extra_file_collector.sort(matches):
yield match


def dataset_collector(dataset_collection_description):
Expand Down Expand Up @@ -551,6 +562,7 @@ def __init__(self, dataset_collection_description):
self.default_visible = dataset_collection_description.default_visible
self.directory = dataset_collection_description.directory
self.assign_primary_output = dataset_collection_description.assign_primary_output
self.recurse = dataset_collection_description.recurse

def _pattern_for_dataset(self, dataset_instance=None):
token_replacement = r'\d+'
Expand Down
2 changes: 2 additions & 0 deletions lib/galaxy/tools/parser/output_collection_def.py
Expand Up @@ -63,6 +63,7 @@ def __init__(self, **kwargs):
self.default_visible = asbool(kwargs.get("visible", None))
self.assign_primary_output = asbool(kwargs.get('assign_primary_output', False))
self.directory = kwargs.get("directory", None)
self.recurse = False


class ToolProvidedMetadataDatasetCollection(DatasetCollectionDescription):
Expand All @@ -77,6 +78,7 @@ class FilePatternDatasetCollectionDescription(DatasetCollectionDescription):
def __init__(self, **kwargs):
super(FilePatternDatasetCollectionDescription, self).__init__(**kwargs)
pattern = kwargs.get("pattern", "__default__")
self.recurse = asbool(kwargs.get("recurse", False))
if pattern in NAMED_PATTERNS:
pattern = NAMED_PATTERNS.get(pattern)
self.pattern = pattern
Expand Down
5 changes: 5 additions & 0 deletions lib/galaxy/tools/xsd/galaxy.xsd
Expand Up @@ -3929,6 +3929,11 @@ More information can be found on Planemo's documentation for
<xs:documentation xml:lang="en">Directory (relative to working directory) to search for files.</xs:documentation>
</xs:annotation>
</xs:attribute>
<xs:attribute name="recurse" type="xs:boolean" use="optional">
<xs:annotation>
<xs:documentation xml:lang="en">Indicates that the specified directory should be searched recursively for matching files.</xs:documentation>
</xs:annotation>
</xs:attribute>
<xs:attribute name="format" type="xs:string" use="optional">
<xs:annotation>
<xs:documentation xml:lang="en">Format (or datatype) of discovered datasets (an alias with ``ext``).</xs:documentation>
Expand Down
65 changes: 65 additions & 0 deletions test/functional/tools/multi_output_recurse.xml
@@ -0,0 +1,65 @@
<tool id="multi_output_recurse" name="multi_output_recurse" version="0.1.0">
<!-- Functional test tool: the command writes files both at the working-directory
     root and inside nested subdirectories (subdir1, subdir2, subdir3/nested1) so
     that recurse="true" on <discover_datasets> can be exercised end to end. -->
<command>
echo "Hello" > $report;
mkdir subdir1;
echo "This" > subdir1/this.txt;
echo "That" > subdir1/that.txt;
mkdir subdir2;
echo "1" > subdir2/CUSTOM_1.txt;
echo "2" > subdir2/CUSTOM_2.txt;
mkdir subdir3;
echo "Foo" > subdir3/Foo;
<!-- nested1 is two levels deep: only reachable when recursion is enabled. -->
mkdir subdir3/nested1;
echo "Bar" > subdir3/nested1/bar.txt;
echo "Hello" > subdir3/nested1/hello;
echo "1" > sample1.report.txt;
echo "2" > sample2.report.txt;
</command>
<inputs>
<param name="num_param" type="integer" value="7" />
<param name="input" type="data" />
</inputs>
<outputs>
<data auto_format="true" name="report">
<!-- directory="." plus recurse="true": discovery starts at the working
     directory and descends into every child folder created above. -->
<discover_datasets pattern="__designation__" directory="." visible="true" recurse="true" />
</data>
</outputs>
<tests>
<test>
<param name="num_param" value="7" />
<param name="input" ftype="txt" value="simple_line.txt"/>
<output name="report">
<assert_contents>
<has_line line="Hello" />
</assert_contents>
<!-- One <discovered_dataset> per file written by the command, including the
     deeply nested bar.txt and hello; designations are bare filenames, so
     discovery is expected to flatten the directory structure. -->
<discovered_dataset designation="this.txt">
<assert_contents><has_line line="This" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="that.txt">
<assert_contents><has_line line="That" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="CUSTOM_1.txt">
<assert_contents><has_line line="1" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="CUSTOM_2.txt">
<assert_contents><has_line line="2" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="Foo">
<assert_contents><has_line line="Foo" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="bar.txt">
<assert_contents><has_line line="Bar" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="hello">
<assert_contents><has_line line="Hello" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="sample1.report.txt">
<assert_contents><has_line line="1" /></assert_contents>
</discovered_dataset>
<discovered_dataset designation="sample2.report.txt">
<assert_contents><has_line line="2" /></assert_contents>
</discovered_dataset>
</output>
</test>
</tests>
</tool>
1 change: 1 addition & 0 deletions test/functional/tools/samples_tool_conf.xml
Expand Up @@ -17,6 +17,7 @@
<tool file="multi_output.xml" />
<tool file="multi_output_configured.xml" />
<tool file="multi_output_assign_primary.xml" />
<tool file="multi_output_recurse.xml" />
<tool file="tool_provided_metadata_1.xml" />
<tool file="tool_provided_metadata_2.xml" />
<tool file="tool_provided_metadata_3.xml" />
Expand Down
25 changes: 25 additions & 0 deletions test/unit/tools/test_collect_primary_datasets.py
Expand Up @@ -57,6 +57,31 @@ def test_collect_multiple(self):
# didn't result in a dbkey being set.
assert created_hda_1.dbkey == "?"

def test_collect_multiple_recurse(self):
    """Discovered datasets are collected from nested folders when recurse="true"."""
    self._replace_output_collectors('''<output>
<discover_datasets pattern="__name__" directory="subdir1" recurse="true" ext="txt" />
<discover_datasets pattern="__name__" directory="subdir2" recurse="true" ext="txt" />
</output>''')
    path1 = self._setup_extra_file(filename="test1", subdir="subdir1")
    # test2 sits one level below the configured directory; only the
    # recurse flag makes it discoverable.
    path2 = self._setup_extra_file(filename="test2", subdir="subdir2/nested1/")
    path3 = self._setup_extra_file(filename="test3", subdir="subdir2")

    datasets = self._collect()
    assert DEFAULT_TOOL_OUTPUT in datasets
    self.assertEquals(len(datasets[DEFAULT_TOOL_OUTPUT]), 3)

    # Test default order of collection.
    assert list(datasets[DEFAULT_TOOL_OUTPUT].keys()) == ["test1", "test2", "test3"]

    # Each HDA must be backed by the exact file created above, regardless
    # of how deep in the directory tree it was found.
    created_hda_1 = datasets[DEFAULT_TOOL_OUTPUT]["test1"]
    self.app.object_store.assert_created_with_path(created_hda_1.dataset, path1)

    created_hda_2 = datasets[DEFAULT_TOOL_OUTPUT]["test2"]
    self.app.object_store.assert_created_with_path(created_hda_2.dataset, path2)

    created_hda_3 = datasets[DEFAULT_TOOL_OUTPUT]["test3"]
    self.app.object_store.assert_created_with_path(created_hda_3.dataset, path3)

def test_collect_sorted_reverse(self):
self._replace_output_collectors('''<output>
<discover_datasets pattern="__name__" directory="subdir_for_name_discovery" sort_by="reverse_filename" ext="txt" />
Expand Down