Skip to content

Commit

Permalink
Merge pull request #5180 from bgruening/bam_unsorted
Browse files Browse the repository at this point in the history
Add BamNative datatype
  • Loading branch information
mvdbeek committed Jan 19, 2018
2 parents 33bd540 + 8ecc86c commit 8f65cd4
Show file tree
Hide file tree
Showing 13 changed files with 222 additions and 83 deletions.
5 changes: 5 additions & 0 deletions config/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
<display file="igb/bam.xml" />
<display file="iobio/bam.xml" />
</datatype>
<datatype extension="bam_native" type="galaxy.datatypes.binary:BamNative" mimetype="application/octet-stream" display_in_upload="true" description="A binary file compressed in the BGZF format with a '.bam' file extension." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#BAM">
<converter file="bam_to_bigwig_converter.xml" target_datatype="bigwig"/>
<converter file="bam_native_to_bam_converter.xml" target_datatype="bam"/>
</datatype>
<datatype extension="cram" type="galaxy.datatypes.binary:CRAM" mimetype="application/octet-stream" display_in_upload="true" description="CRAM is a file format for highly efficient and tunable reference-based compression of alignment data." description_url="http://www.ebi.ac.uk/ena/software/cram-usage">
<converter file="cram_to_bam_converter.xml" target_datatype="bam"/>
</datatype>
Expand Down Expand Up @@ -284,6 +288,7 @@
<datatype extension="qual454" type="galaxy.datatypes.qualityscore:QualityScore454" display_in_upload="true"/>
<datatype extension="roadmaps" type="galaxy.datatypes.assembly:Roadmaps" display_in_upload="false"/>
<datatype extension="sam" type="galaxy.datatypes.tabular:Sam" display_in_upload="true">
<converter file="sam_to_bam_native.xml" target_datatype="bam_native"/>
<converter file="sam_to_bam.xml" target_datatype="bam"/>
<converter file="sam_to_bigwig_converter.xml" target_datatype="bigwig"/>
</datatype>
Expand Down
174 changes: 99 additions & 75 deletions lib/galaxy/datatypes/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,11 @@ class GenericAsn1Binary(Binary):
edam_data = "data_0849"


@dataproviders.decorators.has_dataproviders
class Bam(Binary):
"""Class describing a BAM binary file"""
class BamNative(Binary):
"""Class describing a BAM binary file that is not necessarily sorted"""
edam_format = "format_2572"
edam_data = "data_0863"
file_ext = "bam"
track_type = "ReadTrack"
data_sources = {"data": "bai", "index": "bigwig"}
file_ext = "bam_native"

MetadataElement(name="bam_index", desc="BAM Index File", param=metadata.FileParameter, file_ext="bai", readonly=True, no_value=None, visible=False, optional=True)
MetadataElement(name="bam_version", default=None, desc="BAM Version", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=None)
Expand All @@ -217,68 +214,23 @@ def merge(split_files, output_file):
"""
pysam.merge('-O', 'BAM', output_file, *split_files)

def dataset_content_needs_grooming(self, file_name):
"""
Check if file_name is a coordinate-sorted BAM file
"""
# The best way to ensure that BAM files are coordinate-sorted and indexable
# is to actually index them.
index_name = tempfile.NamedTemporaryFile(prefix="bam_index").name
try:
# If pysam fails to index a file it will write to stderr,
# and this causes the set_meta script to fail. So instead
# we start another process and discard stderr.
cmd = ['python', '-c', "import pysam; pysam.index('%s', '%s')" % (file_name, index_name)]
with open(os.devnull, 'w') as devnull:
subprocess.check_call(cmd, stderr=devnull, shell=False)
needs_sorting = False
except subprocess.CalledProcessError:
needs_sorting = True
try:
os.unlink(index_name)
except Exception:
pass
return needs_sorting
def init_meta(self, dataset, copy_from=None):
Binary.init_meta(self, dataset, copy_from=copy_from)

def groom_dataset_content(self, file_name):
"""
Ensures that the BAM file contents are sorted. This function is called
on an output dataset after the content is initially generated.
"""
# Use pysam to sort the BAM file
# This command may also creates temporary files <out.prefix>.%d.bam when the
# whole alignment cannot fit into memory.
# do this in a unique temp directory, because of possible <out.prefix>.%d.bam temp files
if not self.dataset_content_needs_grooming(file_name):
# Don't re-sort if already sorted
return
tmp_dir = tempfile.mkdtemp()
tmp_sorted_dataset_file_name_prefix = os.path.join(tmp_dir, 'sorted')
sorted_file_name = "%s.bam" % tmp_sorted_dataset_file_name_prefix
slots = os.environ.get('GALAXY_SLOTS', 1)
def sniff(self, filename):
# BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
# The first 4 bytes of any bam file is 'BAM\1', and the file is binary.
try:
pysam.sort("-@%s" % slots, file_name, '-T', tmp_sorted_dataset_file_name_prefix, '-O', 'BAM', '-o', sorted_file_name)
header = gzip.open(filename).read(4)
if header == b'BAM\1':
return True
return False
except Exception:
shutil.rmtree(tmp_dir, ignore_errors=True)
raise
# Move samtools_created_sorted_file_name to our output dataset location
shutil.move(sorted_file_name, file_name)
# Remove temp file and empty temporary directory
os.rmdir(tmp_dir)

def init_meta(self, dataset, copy_from=None):
Binary.init_meta(self, dataset, copy_from=copy_from)
return False

def set_meta(self, dataset, overwrite=True, **kwd):
# These metadata values are not accessible by users, always overwrite
index_file = dataset.metadata.bam_index
if not index_file:
index_file = dataset.metadata.spec['bam_index'].param.new_file(dataset=dataset)
pysam.index(dataset.file_name, index_file.file_name)
dataset.metadata.bam_index = index_file
# Now use pysam with BAI index to determine additional metadata
try:
bam_file = pysam.AlignmentFile(dataset.file_name, mode='rb', index_filename=index_file.file_name)
bam_file = pysam.AlignmentFile(dataset.file_name, mode='rb')
# TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
dataset.metadata.reference_names = list(bam_file.references)
dataset.metadata.reference_lengths = list(bam_file.lengths)
Expand All @@ -291,17 +243,6 @@ def set_meta(self, dataset, overwrite=True, **kwd):
# fail metadata to end in the error state
pass

def sniff(self, filename):
    """Return True when ``filename`` looks like a BAM file.

    BAM is compressed in the BGZF format, and must not be uncompressed in
    Galaxy, so only the first 4 decompressed bytes are read and compared
    against the BAM magic number ``BAM\\1``.

    :param filename: path of the file to inspect
    :returns: True if the magic number matches, False otherwise. Any error
        while opening or reading (missing file, not gzip data, ...) is
        reported as False rather than raised.
    """
    try:
        # The context manager closes the handle even when read() raises;
        # previously the GzipFile stayed open until garbage collection.
        with gzip.open(filename) as handle:
            return handle.read(4) == b'BAM\1'
    except Exception:
        return False

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
dataset.peek = "Binary bam alignments file"
Expand All @@ -326,10 +267,9 @@ def to_archive(self, trans, dataset, name=""):
return zip(file_paths, rel_paths)

def get_chunk(self, trans, dataset, offset=0, ck_size=None):
index_file = dataset.metadata.bam_index
if not offset == -1:
try:
with pysam.AlignmentFile(dataset.file_name, "rb", index_filename=index_file.file_name) as bamfile:
with pysam.AlignmentFile(dataset.file_name, "rb") as bamfile:
ck_size = 300 # 300 lines
ck_data = ""
header_line_count = 0
Expand Down Expand Up @@ -382,6 +322,90 @@ def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None
column_names=column_names,
column_types=column_types)


@dataproviders.decorators.has_dataproviders
class Bam(BamNative):
"""Class describing a BAM binary file"""
edam_format = "format_2572"
edam_data = "data_0863"
file_ext = "bam"
track_type = "ReadTrack"
data_sources = {"data": "bai", "index": "bigwig"}

def dataset_content_needs_grooming(self, file_name):
    """
    Check if file_name is a coordinate-sorted BAM file

    :param file_name: path of the BAM file to test
    :returns: False when the file could be indexed, True when the indexing
        subprocess exited with an error (the file needs sorting).
    :raises OSError: if the ``python`` executable cannot be started.
    """
    # The best way to ensure that BAM files are coordinate-sorted and indexable
    # is to actually index them.
    # The throw-away index is written into a private scratch directory, so
    # the path cannot be claimed by another process between choosing the
    # name and writing to it.
    tmp_dir = tempfile.mkdtemp(prefix="bam_index")
    index_name = os.path.join(tmp_dir, "index.bai")
    try:
        # If pysam fails to index a file it will write to stderr,
        # and this causes the set_meta script to fail. So instead
        # we start another process and discard stderr.
        # The paths are handed over as argv entries rather than pasted into
        # the source text, so quotes or backslashes in a path can neither
        # break nor alter the executed code.
        cmd = [
            'python', '-c',
            "import sys, pysam; pysam.index(sys.argv[1], sys.argv[2])",
            file_name, index_name,
        ]
        with open(os.devnull, 'w') as devnull:
            subprocess.check_call(cmd, stderr=devnull, shell=False)
        needs_sorting = False
    except subprocess.CalledProcessError:
        needs_sorting = True
    finally:
        # Best-effort cleanup that also runs when an unexpected error propagates.
        shutil.rmtree(tmp_dir, ignore_errors=True)
    return needs_sorting

def groom_dataset_content(self, file_name):
    """
    Ensures that the BAM file contents are sorted. This function is called
    on an output dataset after the content is initially generated.

    :param file_name: path of the BAM file; it is replaced in place by the
        sorted file when sorting is needed.
    :raises Exception: whatever ``pysam.sort`` or ``shutil.move`` raise; the
        scratch directory is removed before the error propagates.
    """
    if not self.dataset_content_needs_grooming(file_name):
        # Don't re-sort if already sorted
        return
    # Use pysam to sort the BAM file
    # This command may also create temporary files <out.prefix>.%d.bam when the
    # whole alignment cannot fit into memory.
    # do this in a unique temp directory, because of possible <out.prefix>.%d.bam temp files
    tmp_dir = tempfile.mkdtemp()
    try:
        tmp_sorted_dataset_file_name_prefix = os.path.join(tmp_dir, 'sorted')
        sorted_file_name = "%s.bam" % tmp_sorted_dataset_file_name_prefix
        slots = os.environ.get('GALAXY_SLOTS', 1)
        pysam.sort("-@%s" % slots, file_name, '-T', tmp_sorted_dataset_file_name_prefix, '-O', 'BAM', '-o', sorted_file_name)
        # Move the sorted file created by samtools to our output dataset location
        shutil.move(sorted_file_name, file_name)
    finally:
        # Remove the scratch directory on every exit path: a failed sort, a
        # failed move, or success with leftover spill files (where a plain
        # os.rmdir would raise because the directory is not empty).
        shutil.rmtree(tmp_dir, ignore_errors=True)

def set_meta(self, dataset, overwrite=True, **kwd):
    """Index ``dataset`` and fill in the metadata read from its BAM header.

    The BAI index is (re)built with ``pysam.index`` and stored as the
    ``bam_index`` metadata file. The header is then read to set
    ``reference_names``, ``reference_lengths``, ``bam_header``,
    ``read_groups``, ``sort_order`` and ``bam_version``.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated
    :param overwrite: not consulted by this implementation
    :param kwd: accepted for interface compatibility; not consulted here
    :raises Exception: errors from creating the index propagate; errors
        while reading the header are swallowed on purpose (see below).
    """
    # These metadata values are not accessible by users, always overwrite
    index_file = dataset.metadata.bam_index
    if not index_file:
        index_file = dataset.metadata.spec['bam_index'].param.new_file(dataset=dataset)
    pysam.index(dataset.file_name, index_file.file_name)
    dataset.metadata.bam_index = index_file
    # Now use pysam with BAI index to determine additional metadata
    try:
        # Opened as a context manager so the file handle is released even
        # when reading the header fails part-way through.
        with pysam.AlignmentFile(dataset.file_name, mode='rb', index_filename=index_file.file_name) as bam_file:
            # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
            dataset.metadata.reference_names = list(bam_file.references)
            dataset.metadata.reference_lengths = list(bam_file.lengths)
            dataset.metadata.bam_header = bam_file.header
            dataset.metadata.read_groups = [read_group['ID'] for read_group in dataset.metadata.bam_header.get('RG', []) if 'ID' in read_group]
            dataset.metadata.sort_order = bam_file.header.get('HD', {}).get('SO', None)
            dataset.metadata.bam_version = bam_file.header.get('HD', {}).get('VN', None)
    except Exception:
        # Per Dan, don't log here because doing so will cause datasets that
        # fail metadata to end in the error state
        pass

def sniff(self, file_name):
    """Accept ``file_name`` only if the parent sniffer accepts it and it needs no grooming.

    The parent class ``sniff`` is consulted first; when it rejects the file
    its result is returned unchanged and the grooming check is not run.
    Otherwise the result is the negation of
    ``dataset_content_needs_grooming(file_name)``.
    """
    parent_verdict = super(Bam, self).sniff(file_name)
    if not parent_verdict:
        return parent_verdict
    return not self.dataset_content_needs_grooming(file_name)

# ------------- Dataproviders
# pipe through samtools view
# ALSO: (as Sam)
Expand Down
23 changes: 23 additions & 0 deletions lib/galaxy/datatypes/converters/bam_native_to_bam_converter.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<tool id="CONVERTER_bam_native_to_bam" name="Convert BAM native to BAM" version="1.0.0" hidden="true">
<!-- Hidden converter tool: takes a bam_native dataset and writes a bam dataset by running it through samtools sort. -->
<!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> -->
<requirements>
<requirement type="package" version="1.6">samtools</requirement>
</requirements>
<!-- The escaped \${GALAXY_SLOTS:-1} is left for the shell to expand: the thread count comes from GALAXY_SLOTS and falls back to 1 when it is unset. "-T dataset" is the prefix samtools uses for its temporary files. -->
<command><![CDATA[
samtools sort
-@ \${GALAXY_SLOTS:-1}
-o '${output}'
-O bam
-T dataset
'${input}'
]]>
</command>
<inputs>
<param format="bam_native" name="input" type="data" label="Choose a BAM native file"/>
</inputs>
<outputs>
<data format="bam" name="output"/>
</outputs>
<help>
</help>
</tool>
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
> temp.bg && bedGraphToBigWig temp.bg '$chromInfo' '$output']]>
</command>
<inputs>
<param format="bam" name="input" type="data" label="Choose BAM file"/>
<param format="bam,bam_native" name="input" type="data" label="Choose BAM file"/>
</inputs>
<outputs>
<data format="bigwig" name="output"/>
Expand Down
23 changes: 23 additions & 0 deletions lib/galaxy/datatypes/converters/sam_to_bam_native.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<tool id="CONVERTER_sam_to_bam_native" name="Convert SAM to BAM native - without sorting" version="1.0.0">
<!-- Converter tool: takes a sam dataset and writes a bam_native dataset using samtools view; no sort step is run. -->
<!-- NOTE(review): unlike CONVERTER_bam_native_to_bam this tool is not marked hidden="true" and its GALAXY_SLOTS fallback is 2 rather than 1; confirm both differences are intended. -->
<!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> -->
<requirements>
<requirement type="package" version="1.6">samtools</requirement>
</requirements>
<!-- "-b" requests BAM output and "-h" includes the header; the escaped \${GALAXY_SLOTS:-2} is left for the shell to expand. -->
<command><![CDATA[
samtools view
-b
-h
-@ \${GALAXY_SLOTS:-2}
-o '${output}'
'$input'
]]>
</command>
<inputs>
<param name="input" type="data" format="sam" label="SAM file"/>
</inputs>
<outputs>
<data name="output" format="bam_native"/>
</outputs>
<help>
</help>
</tool>
2 changes: 1 addition & 1 deletion lib/galaxy/datatypes/sniff.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def guess_ext(fname, sniff_order):
'bam'
>>> fname = get_test_fname('3unsorted.bam')
>>> guess_ext(fname, sniff_order)
'bam'
'bam_native'
>>> fname = get_test_fname('test.idpDB')
>>> guess_ext(fname, sniff_order)
'idpdb'
Expand Down
2 changes: 2 additions & 0 deletions scripts/functional_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ class FrameworkToolsGalaxyTestDriver(DefaultGalaxyTestDriver):
"""Galaxy-style nose TestDriver for testing framework Galaxy tools."""

framework_tool_and_types = True
conda_auto_init = True
conda_auto_install = True


class DataManagersGalaxyTestDriver(driver_util.GalaxyTestDriver):
Expand Down
Binary file added test-data/bam_native_from_sam.bam
Binary file not shown.
8 changes: 4 additions & 4 deletions test-data/sam_with_header.sam
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
@SQ SN:ref LN:45
@SQ SN:ref2 LN:40
r003 16 ref 29 30 6H5M * 0 0 TAGGC *
r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112
r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
r003 0 ref 9 30 5H6M * 0 0 AGCTAA *
r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
r003 16 ref 29 30 6H5M * 0 0 TAGGC *
r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
7 changes: 6 additions & 1 deletion test/base/driver_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ def setup_galaxy_config(
update_integrated_tool_panel=False,
prefer_template_database=False,
log_format=None,
conda_auto_init=False,
conda_auto_install=False
):
"""Setup environment and build config for test Galaxy instance."""
if not os.path.exists(tmpdir):
Expand Down Expand Up @@ -188,7 +190,8 @@ def setup_galaxy_config(
api_allow_run_as='test@bx.psu.edu',
auto_configure_logging=logging_config_file is None,
check_migrate_tools=False,
conda_auto_init=False,
conda_auto_init=conda_auto_init,
conda_auto_install=conda_auto_install,
cleanup_job='onsuccess',
data_manager_config_file=data_manager_config_file,
enable_beta_tool_formats=True,
Expand Down Expand Up @@ -851,6 +854,8 @@ def setup(self, config_object=None):
datatypes_conf=datatypes_conf_override,
prefer_template_database=getattr(config_object, "prefer_template_database", False),
log_format=log_format,
conda_auto_init=getattr(config_object, "conda_auto_init", False),
conda_auto_install=getattr(config_object, "conda_auto_install", False),
)
galaxy_config = setup_galaxy_config(
galaxy_db_path,
Expand Down
Loading

0 comments on commit 8f65cd4

Please sign in to comment.