Skip to content

Commit

Permalink
Merge pull request #3940 from jmchilton/collection_op_filter_from_file
Browse files Browse the repository at this point in the history
Collection Operation - Filtering from a File
  • Loading branch information
bgruening committed Apr 20, 2017
2 parents 9f4fd7c + 49abfce commit 01b3eb4
Show file tree
Hide file tree
Showing 10 changed files with 341 additions and 1 deletion.
2 changes: 2 additions & 0 deletions config/tool_conf.xml.sample
Expand Up @@ -36,6 +36,8 @@
<tool file="${model_tools_path}/filter_failed_collection.xml" />
<tool file="${model_tools_path}/flatten_collection.xml" />
<tool file="${model_tools_path}/merge_collection.xml" />
<tool file="${model_tools_path}/relabel_from_file.xml" />
<tool file="${model_tools_path}/filter_from_file.xml" />
</section>
<section id="liftOver" name="Lift-Over">
<tool file="extract/liftOver_wrapper.xml" />
Expand Down
89 changes: 88 additions & 1 deletion lib/galaxy/tools/__init__.py
Expand Up @@ -2506,11 +2506,98 @@ def add_elements(collection, prefix=""):
)


class RelabelFromFileTool(DatabaseOperationTool):
    """Collection operation that rebuilds a collection under new element identifiers.

    The new identifiers are read from a dataset, either one identifier per
    line (applied to the elements in order) or as a two-column tab-separated
    mapping from existing identifier to new identifier.
    """
    tool_type = 'relabel_from_file'

    def produce_outputs(self, trans, out_data, output_collections, incoming, history):
        """Create the relabelled output collection.

        :param trans: not used by this implementation.
        :param out_data: not used by this implementation.
        :param output_collections: object whose ``create_collection`` is called
            with the relabelled elements.
        :param incoming: tool state; ``incoming["input"]`` is the source
            collection and ``incoming["how"]`` holds ``how_select``, ``labels``
            and ``strict``.
        :param history: copied elements that report a ``history_content_type``
            of ``"dataset"`` are added to it with ``set_hid=False``.
        :raises Exception: if the labels file does not fit the collection, a
            mapping line is malformed, a label is missing, a new identifier is
            duplicated, or a new identifier contains disallowed characters.
        """
        hdca = incoming["input"]
        how_type = incoming["how"]["how_select"]
        new_labels_dataset_assoc = incoming["how"]["labels"]
        strict = string_as_bool(incoming["how"]["strict"])
        new_elements = odict()

        def add_copied_value_to_new_elements(new_label, dce_object):
            # Copy the element object and register the copy under the new identifier.
            new_label = new_label.strip()
            if new_label in new_elements:
                raise Exception("New identifier [%s] appears twice in resulting collection, these values must be unique." % new_label)
            copied_value = dce_object.copy()
            if getattr(copied_value, "history_content_type", None) == "dataset":
                history.add_dataset(copied_value, set_hid=False)
            new_elements[new_label] = copied_value

        new_labels_path = new_labels_dataset_assoc.file_name
        # Close the file deterministically instead of leaking the handle.
        with open(new_labels_path, "r") as labels_fh:
            new_labels = labels_fh.readlines(1024 * 1000000)
        elements = hdca.collection.elements
        if strict and len(elements) != len(new_labels):
            raise Exception("Relabel mapping file contains incorrect number of identifiers")
        if how_type == "tabular":
            # We have a tabular file, where the first column is an existing element identifier,
            # and the second column is the new element identifier.
            new_labels_dict = {}
            for line in new_labels:
                columns = line.strip().split('\t')
                if len(columns) != 2:
                    # Previously surfaced as an opaque tuple-unpacking ValueError.
                    raise Exception("Relabel mapping file must contain exactly two tab-separated columns per line, found [%s]" % line.strip())
                new_labels_dict[columns[0]] = columns[1]
            for dce in elements:
                dce_object = dce.element_object
                element_identifier = dce.element_identifier
                # NOTE(review): in strict mode an unmapped identifier silently keeps its
                # current name, while in non-strict mode it raises below. The tool help
                # suggests the opposite was intended - confirm before changing, since the
                # bundled tool tests rely on the current behavior.
                default = element_identifier if strict else None
                new_label = new_labels_dict.get(element_identifier, default)
                if not new_label:
                    raise Exception("Failed to find new label for identifier [%s]" % element_identifier)
                add_copied_value_to_new_elements(new_label, dce_object)
        else:
            # If new_labels_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset
            if len(new_labels) < len(elements):
                # Fail before copying anything rather than hitting an IndexError
                # part-way through, after some datasets were already added to the history.
                raise Exception("Relabel file contains fewer identifiers (%d) than the collection has elements (%d)" % (len(new_labels), len(elements)))
            for i, dce in enumerate(elements):
                dce_object = dce.element_object
                add_copied_value_to_new_elements(new_labels[i], dce_object)
        for key in new_elements.keys():
            # Only word characters and hyphens are accepted in new identifiers.
            if not re.match(r"^[\w\-_]+$", key):
                raise Exception("Invalid new collection identifier [%s]" % key)
        output_collections.create_collection(
            next(iter(self.outputs.values())), "output", elements=new_elements
        )


class FilterFromFileTool(DatabaseOperationTool):
    """Collection operation that splits a collection into kept and discarded elements.

    Membership is decided by comparing each element identifier against the
    stripped lines of a supplied dataset.
    """
    tool_type = 'filter_from_file'

    def produce_outputs(self, trans, out_data, output_collections, incoming, history):
        """Create the ``output_filtered`` and ``output_discarded`` collections.

        :param trans: not used by this implementation.
        :param out_data: not used by this implementation.
        :param output_collections: object whose ``create_collection`` is called
            once for each of the two outputs.
        :param incoming: tool state; ``incoming["input"]`` is the source
            collection and ``incoming["how"]`` holds ``how_filter`` and
            ``filter_source``.
        :param history: copied elements that report a ``history_content_type``
            of ``"dataset"`` are added to it with ``set_hid=False``.
        """
        hdca = incoming["input"]
        how_filter = incoming["how"]["how_filter"]
        filter_dataset_assoc = incoming["how"]["filter_source"]
        filtered_elements = odict()
        discarded_elements = odict()

        filtered_path = filter_dataset_assoc.file_name
        # Close the file deterministically; a set gives constant-time membership tests.
        with open(filtered_path, "r") as filter_fh:
            filtered_identifiers = {line.strip() for line in filter_fh.readlines(1024 * 1000000)}

        # Every element is copied and lands in exactly one of the two outputs,
        # depending on whether its identifier is listed in the filter file.
        for dce in hdca.collection.elements:
            dce_object = dce.element_object
            element_identifier = dce.element_identifier
            in_filter_file = element_identifier in filtered_identifiers
            passes_filter = in_filter_file if how_filter == "remove_if_absent" else not in_filter_file

            copied_value = dce_object.copy()
            if getattr(copied_value, "history_content_type", None) == "dataset":
                history.add_dataset(copied_value, set_hid=False)

            if passes_filter:
                filtered_elements[element_identifier] = copied_value
            else:
                discarded_elements[element_identifier] = copied_value

        output_collections.create_collection(
            self.outputs["output_filtered"], "output_filtered", elements=filtered_elements
        )
        output_collections.create_collection(
            self.outputs["output_discarded"], "output_discarded", elements=discarded_elements
        )


# Populate tool_type to ToolClass mappings
# Each class is listed once; the mapping is keyed on the class's tool_type attribute.
tool_types = {}
for tool_class in [ Tool, SetMetadataTool, OutputParameterJSONTool,
                    DataManagerTool, DataSourceTool, AsyncDataSourceTool,
                    UnzipCollectionTool, ZipCollectionTool, MergeCollectionTool,
                    RelabelFromFileTool, FilterFromFileTool,
                    DataDestinationTool ]:
    tool_types[ tool_class.tool_type ] = tool_class

Expand Down
91 changes: 91 additions & 0 deletions lib/galaxy/tools/filter_from_file.xml
@@ -0,0 +1,91 @@
<tool id="__FILTER_FROM_FILE__"
name="Filter List"
version="1.0.0"
tool_type="filter_from_file">
<description>from contents of a file</description>
<type class="FilterFromFileTool" module="galaxy.tools" />
<action module="galaxy.tools.actions.model_operations"
class="ModelOperationToolAction"/>
<inputs>
<param type="data_collection" name="input" label="Input Collection" help="A list whose elements will be filtered."/>
<conditional name="how">
<param type="select" name="how_filter" label="How should the elements to remove be determined?">
<option value="remove_if_absent" selected="true">Remove if identifiers are absent from supplied text file.</option>
<option value="remove_if_present">Remove if identifiers are present in supplied text file.</option>
</param>
<when value="remove_if_absent">
<param type="data" name="filter_source" format="txt" label="Filter out identifiers absent from" />
</when>
<when value="remove_if_present">
<param type="data" name="filter_source" format="txt" label="Filter out identifiers present in" />
</when>
</conditional>
</inputs>
<outputs>
<collection name="output_filtered" format_source="input" type_source="input" label="${on_string} (filtered)" >
</collection>
<collection name="output_discarded" format_source="input" type_source="input" label="${on_string} (discarded)" >
</collection>
</outputs>
<tests>
<test>
<param name="input">
<collection type="list">
<element name="i1" value="simple_line.txt" />
<element name="i2" value="simple_line_alternative.txt" />
</collection>
</param>
<param name="how_filter" value="remove_if_present" />
<param name="filter_source" value="filter_labels_1.txt" ftype="txt" />
<output_collection name="output_filtered" type="list">
<element name="i1">
<assert_contents>
<has_text_matching expression="^This is a line of text.\n$" />
</assert_contents>
</element>
</output_collection>
<output_collection name="output_discarded" type="list">
<element name="i2">
<assert_contents>
<has_text_matching expression="^This is a different line of text.\n$" />
</assert_contents>
</element>
</output_collection>
</test>
<test>
<param name="input">
<collection type="list">
<element name="i1" value="simple_line.txt" />
<element name="i2" value="simple_line_alternative.txt" />
</collection>
</param>
<param name="how_filter" value="remove_if_absent" />
<param name="filter_source" value="filter_labels_1.txt" ftype="txt" />
<output_collection name="output_filtered" type="list">
<element name="i2">
<assert_contents>
<has_text_matching expression="^This is a different line of text.\n$" />
</assert_contents>
</element>
</output_collection>
<output_collection name="output_discarded" type="list">
<element name="i1">
<assert_contents>
<has_text_matching expression="^This is a line of text.\n$" />
</assert_contents>
</element>
</output_collection>
</test>
</tests>
<help><![CDATA[
.. class:: infomark
This tool will take an input list and a text file with identifiers to
filter the list with. It will build two new lists - one "filtered" list of
the elements that pass the selected filter and one of the discarded elements.
This tool will create new history datasets from your collection
but your quota usage will not increase.
]]></help>
</tool>
152 changes: 152 additions & 0 deletions lib/galaxy/tools/relabel_from_file.xml
@@ -0,0 +1,152 @@
<tool id="__RELABEL_FROM_FILE__"
name="Relabel List Identifiers"
version="1.0.0"
tool_type="relabel_from_file">
<description>from contents of a file</description>
<type class="RelabelFromFileTool" module="galaxy.tools" />
<action module="galaxy.tools.actions.model_operations"
class="ModelOperationToolAction"/>
<inputs>
<param type="data_collection" name="input" label="Input Collection" help="A list whose identifiers will be relabelled."/>
<conditional name="how">
<param type="select" name="how_select" label="How should the new labels be specified?">
<option value="txt">Using lines in a simple text file.</option>
<option value="tabular">Map original identifiers to new ones using a two column table.</option>
</param>
<when value="txt">
<param type="data" name="labels" format="txt" label="New Identifiers" />
<param name="strict" type="boolean" label="Ensure strict mapping" help="If selected, the target file must contain exactly the correct number of lines." truevalue="true" falsevalue="false" />
</when>
<when value="tabular">
<param type="data" name="labels" format="tabular" label="New Identifiers" />
<param name="strict" type="boolean" label="Ensure strict mapping" help="If selected, the target file must contain exactly the correct number of lines and each input identifier must match exactly one element of the input collection." truevalue="true" falsevalue="false" />
</when>
</conditional>
</inputs>
<outputs>
<collection name="output" format_source="input" type_source="input" label="${on_string} (relabelled)" >
</collection>
</outputs>
<tests>
<test>
<param name="input">
<collection type="list:paired">
<element name="i1">
<collection type="paired">
<element name="forward" value="simple_line.txt" />
<element name="reverse" value="simple_line_alternative.txt" />
</collection>
</element>
</collection>
</param>
<param name="how_select" value="txt" />
<param name="labels" value="new_labels_1.txt" ftype="txt" />
<output_collection name="output" type="list:paired">
<element name="new_i1">
<element name="forward">
<assert_contents>
<has_text_matching expression="^This is a line of text.\n$" />
</assert_contents>
</element>
<element name="reverse">
<assert_contents>
<has_text_matching expression="^This is a different line of text.\n$" />
</assert_contents>
</element>
</element>
</output_collection>
</test>
<test>
<param name="input">
<collection type="list:paired">
<element name="i1">
<collection type="paired">
<element name="forward" value="simple_line.txt" />
<element name="reverse" value="simple_line_alternative.txt" />
</collection>
</element>
</collection>
</param>
<param name="how_select" value="tabular" />
<param name="labels" value="new_labels_2.txt" ftype="tabular" />
<output_collection name="output" type="list:paired">
<element name="new_i1">
<element name="forward">
<assert_contents>
<has_text_matching expression="^This is a line of text.\n$" />
</assert_contents>
</element>
<element name="reverse">
<assert_contents>
<has_text_matching expression="^This is a different line of text.\n$" />
</assert_contents>
</element>
</element>
</output_collection>
</test>
<!-- test mismatches between the collection and the labels file (note: neither test sets strict) -->
<test expect_failure="true">
<param name="input">
<collection type="list:paired">
<element name="wrongi">
<collection type="paired">
<element name="forward" value="simple_line.txt" />
<element name="reverse" value="simple_line_alternative.txt" />
</collection>
</element>
</collection>
</param>
<param name="how_select" value="tabular" />
<param name="labels" value="new_labels_2.txt" ftype="tabular" />
</test>
<test expect_failure="true">
<param name="input">
<collection type="list">
<element name="i1" value="simple_line.txt" />
<element name="i3" value="simple_line_alternative.txt" />
</collection>
</param>
<param name="how_select" value="txt" />
<param name="labels" value="new_labels_1.txt" ftype="txt" />
</test>
<!-- test label bad characters -->
<test expect_failure="true">
<param name="input">
<collection type="list">
<element name="i1" value="simple_line.txt" />
</collection>
</param>
<param name="how_select" value="txt" />
<param name="labels" value="new_labels_bad_1.txt" ftype="txt" />
</test>
<!-- test label bad because of duplicates -->
<test expect_failure="true">
<param name="input">
<collection type="list">
<element name="i1" value="simple_line.txt" />
<element name="i2" value="simple_line.txt" />
</collection>
</param>
<param name="how_select" value="txt" />
<param name="labels" value="new_labels_bad_2.txt" ftype="txt" />
</test>
</tests>
<help><![CDATA[
.. class:: infomark
This tool will take an input list and a text file with new identifiers
and build a new list with the same datasets but these new identifiers.
The order and number of entries in the text file must match the order
of the items you want to rename in your dataset collection.
Alternatively a tabular file may be supplied, where the first column
is the current identifier that should be renamed, and the second column
contains the new label. This file may contain fewer entries than items
in the collection. In that case only matching list identifiers will be
relabeled.
This tool will create new history datasets from your collection
but your quota usage will not increase.
]]></help>
</tool>
1 change: 1 addition & 0 deletions test-data/filter_labels_1.txt
@@ -0,0 +1 @@
i2
1 change: 1 addition & 0 deletions test-data/new_labels_1.txt
@@ -0,0 +1 @@
new_i1
1 change: 1 addition & 0 deletions test-data/new_labels_2.txt
@@ -0,0 +1 @@
i1 new_i1
1 change: 1 addition & 0 deletions test-data/new_labels_bad_1.txt
@@ -0,0 +1 @@
new_i; rm -rf
2 changes: 2 additions & 0 deletions test-data/new_labels_bad_2.txt
@@ -0,0 +1,2 @@
newi1
newi1
2 changes: 2 additions & 0 deletions test/functional/tools/samples_tool_conf.xml
Expand Up @@ -146,5 +146,7 @@
<tool file="${model_tools_path}/filter_failed_collection.xml" />
<tool file="${model_tools_path}/flatten_collection.xml" />
<tool file="${model_tools_path}/merge_collection.xml" />
<tool file="${model_tools_path}/relabel_from_file.xml" />
<tool file="${model_tools_path}/filter_from_file.xml" />

</toolbox>

0 comments on commit 01b3eb4

Please sign in to comment.