add BamNative datatype
bgruening committed Dec 10, 2017
1 parent e198dd7 commit f5af953
Showing 1 changed file with 102 additions and 76 deletions.
lib/galaxy/datatypes/binary.py (178 changes: 102 additions & 76 deletions)
@@ -187,14 +187,11 @@ class GenericAsn1Binary(Binary):
     edam_data = "data_0849"


-@dataproviders.decorators.has_dataproviders
-class Bam(Binary):
-    """Class describing a BAM binary file"""
+class BamNative(Binary):
+    """Class describing a BAM binary file that is not necessarily sorted"""
     edam_format = "format_2572"
     edam_data = "data_0863"
-    file_ext = "bam"
-    track_type = "ReadTrack"
-    data_sources = {"data": "bai", "index": "bigwig"}
+    file_ext = "bam_native"

     MetadataElement(name="bam_index", desc="BAM Index File", param=metadata.FileParameter, file_ext="bai", readonly=True, no_value=None, visible=False, optional=True)
     MetadataElement(name="bam_version", default=None, desc="BAM Version", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=None)
@@ -210,75 +207,19 @@ class Bam(Binary):
     @staticmethod
     def merge(split_files, output_file):
         """
-        Merges BAM files
+        Merges Bam files
         :param split_files: List of bam file paths to merge
         :param output_file: Write merged bam file to this location
         """
         pysam.merge('-O', 'BAM', output_file, *split_files)

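Editorial aside, not part of the commit: a minimal sketch of driving this merge entry point by hand. The file names are hypothetical; pysam.merge wraps samtools merge, so the inputs should themselves be coordinate-sorted BAM files.

    import pysam

    split_files = ['part1.bam', 'part2.bam']              # hypothetical split outputs
    pysam.merge('-O', 'BAM', 'merged.bam', *split_files)  # the same call the datatype makes
    print(pysam.view('-H', 'merged.bam'))                 # inspect the merged header
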
-    def dataset_content_needs_grooming(self, file_name):
-        """
-        Check if file_name is a coordinate-sorted BAM file
-        """
-        # The best way to ensure that BAM files are coordinate-sorted and indexable
-        # is to actually index them.
-        index_name = tempfile.NamedTemporaryFile(prefix="bam_index").name
-        try:
-            # If pysam fails to index a file it will write to stderr,
-            # and this causes the set_meta script to fail. So instead
-            # we start another process and discard stderr.
-            cmd = ['python', '-c', "import pysam; pysam.index('%s', '%s')" % (file_name, index_name)]
-            with open(os.devnull, 'w') as devnull:
-                subprocess.check_call(cmd, stderr=devnull, shell=False)
-            needs_sorting = False
-        except subprocess.CalledProcessError:
-            needs_sorting = True
-        try:
-            os.unlink(index_name)
-        except Exception:
-            pass
-        return needs_sorting

-    def groom_dataset_content(self, file_name):
-        """
-        Ensures that the BAM file contents are sorted. This function is called
-        on an output dataset after the content is initially generated.
-        """
-        # Use pysam to sort the BAM file
-        # This command may also create temporary files <out.prefix>.%d.bam when the
-        # whole alignment cannot fit into memory.
-        # do this in a unique temp directory, because of possible <out.prefix>.%d.bam temp files
-        if not self.dataset_content_needs_grooming(file_name):
-            # Don't re-sort if already sorted
-            return
-        tmp_dir = tempfile.mkdtemp()
-        tmp_sorted_dataset_file_name_prefix = os.path.join(tmp_dir, 'sorted')
-        sorted_file_name = "%s.bam" % tmp_sorted_dataset_file_name_prefix
-        slots = os.environ.get('GALAXY_SLOTS', 1)
-        try:
-            pysam.sort("-@%s" % slots, file_name, '-T', tmp_sorted_dataset_file_name_prefix, '-O', 'BAM', '-o', sorted_file_name)
-        except Exception:
-            shutil.rmtree(tmp_dir, ignore_errors=True)
-            raise
-        # Move sorted_file_name to our output dataset location
-        shutil.move(sorted_file_name, file_name)
-        # Remove the now-empty temporary directory
-        os.rmdir(tmp_dir)

     def init_meta(self, dataset, copy_from=None):
         Binary.init_meta(self, dataset, copy_from=copy_from)

     def set_meta(self, dataset, overwrite=True, **kwd):
         # These metadata values are not accessible by users, always overwrite
-        index_file = dataset.metadata.bam_index
-        if not index_file:
-            index_file = dataset.metadata.spec['bam_index'].param.new_file(dataset=dataset)
-        pysam.index(dataset.file_name, index_file.file_name)
-        dataset.metadata.bam_index = index_file
-        # Now use pysam with BAI index to determine additional metadata
         try:
-            bam_file = pysam.AlignmentFile(dataset.file_name, mode='rb', index_filename=index_file.file_name)
+            bam_file = pysam.AlignmentFile(dataset.file_name, mode='rb')
             # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
             dataset.metadata.reference_names = list(bam_file.references)
             dataset.metadata.reference_lengths = list(bam_file.lengths)
@@ -291,17 +232,6 @@ def set_meta(self, dataset, overwrite=True, **kwd):
             # fail metadata to end in the error state
             pass

-    def sniff(self, filename):
-        # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
-        # The first 4 bytes of any BAM file are 'BAM\1', and the file is binary.
-        try:
-            header = gzip.open(filename).read(4)
-            if header == b'BAM\1':
-                return True
-            return False
-        except Exception:
-            return False

     def set_peek(self, dataset, is_multi_byte=False):
         if not dataset.dataset.purged:
             dataset.peek = "Binary bam alignments file"
@@ -327,7 +257,11 @@ def to_archive(self, trans, dataset, name=""):

     def get_chunk(self, trans, dataset, offset=0, ck_size=None):
         index_file = dataset.metadata.bam_index
-        with pysam.AlignmentFile(dataset.file_name, "rb", index_filename=index_file.file_name) as bamfile:
+        if index_file:
+            index_filename = index_file.file_name
+        else:
+            index_filename = None
+        with pysam.AlignmentFile(dataset.file_name, "rb", index_filename=index_filename) as bamfile:
             ck_size = 300  # 300 lines
             ck_data = ""
             header_line_count = 0
@@ -374,6 +308,98 @@ def display_data(self, trans, dataset, preview=False, filename=None, to_ext=None
                                 column_names=column_names,
                                 column_types=column_types)

+
+@dataproviders.decorators.has_dataproviders
+class Bam(BamNative):
+    """Class describing a BAM binary file"""
+    edam_format = "format_2572"
+    edam_data = "data_0863"
+    file_ext = "bam"
+    track_type = "ReadTrack"
+    data_sources = {"data": "bai", "index": "bigwig"}
+
+    def dataset_content_needs_grooming(self, file_name):
+        """
+        Check if file_name is a coordinate-sorted BAM file
+        """
+        # The best way to ensure that BAM files are coordinate-sorted and indexable
+        # is to actually index them.
+        index_name = tempfile.NamedTemporaryFile(prefix="bam_index").name
+        try:
+            # If pysam fails to index a file it will write to stderr,
+            # and this causes the set_meta script to fail. So instead
+            # we start another process and discard stderr.
+            cmd = ['python', '-c', "import pysam; pysam.index('%s', '%s')" % (file_name, index_name)]
+            with open(os.devnull, 'w') as devnull:
+                subprocess.check_call(cmd, stderr=devnull, shell=False)
+            needs_sorting = False
+        except subprocess.CalledProcessError:
+            needs_sorting = True
+        try:
+            os.unlink(index_name)
+        except Exception:
+            pass
+        return needs_sorting
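
Editorial aside, not part of the commit: the check above treats "can pysam build a BAI?" as the ground truth for coordinate order, since samtools refuses to index unsorted input. A cheaper but less trustworthy alternative is to read the @HD SO header tag, the same field set_meta below stores as sort_order; a minimal sketch, assuming a pysam version where the header behaves like a dict (as the code here does) and a hypothetical path example.bam.

    import pysam

    def header_says_sorted(path):
        # Trust the @HD SO tag instead of attempting to index. A stale or
        # missing tag gives a wrong answer, which is presumably why
        # dataset_content_needs_grooming() indexes for real instead.
        with pysam.AlignmentFile(path, 'rb') as bam:
            return bam.header.get('HD', {}).get('SO') == 'coordinate'

    print(header_says_sorted('example.bam'))  # hypothetical input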
+
+    def groom_dataset_content(self, file_name):
+        """
+        Ensures that the BAM file contents are sorted. This function is called
+        on an output dataset after the content is initially generated.
+        """
+        # Use pysam to sort the BAM file
+        # This command may also create temporary files <out.prefix>.%d.bam when the
+        # whole alignment cannot fit into memory.
+        # do this in a unique temp directory, because of possible <out.prefix>.%d.bam temp files
+        if not self.dataset_content_needs_grooming(file_name):
+            # Don't re-sort if already sorted
+            return
+        tmp_dir = tempfile.mkdtemp()
+        tmp_sorted_dataset_file_name_prefix = os.path.join(tmp_dir, 'sorted')
+        sorted_file_name = "%s.bam" % tmp_sorted_dataset_file_name_prefix
+        slots = os.environ.get('GALAXY_SLOTS', 1)
+        try:
+            pysam.sort("-@%s" % slots, file_name, '-T', tmp_sorted_dataset_file_name_prefix, '-O', 'BAM', '-o', sorted_file_name)
+        except Exception:
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+            raise
+        # Move sorted_file_name to our output dataset location
+        shutil.move(sorted_file_name, file_name)
+        # Remove the now-empty temporary directory
+        os.rmdir(tmp_dir)
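
Editorial aside, not in the commit: a standalone sketch of the same sort invocation with hypothetical paths, showing how GALAXY_SLOTS feeds the samtools thread count.

    import os
    import pysam

    slots = os.environ.get('GALAXY_SLOTS', 1)  # set per job by Galaxy; default to one thread
    # Coordinate-sort input.bam into sorted.bam; '-T' gives samtools a
    # prefix for the spill files it writes when reads exceed memory.
    pysam.sort('-@%s' % slots, '-T', '/tmp/sort_prefix', '-O', 'BAM', '-o', 'sorted.bam', 'input.bam')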
+
+    def set_meta(self, dataset, overwrite=True, **kwd):
+        # These metadata values are not accessible by users, always overwrite
+        index_file = dataset.metadata.bam_index
+        if not index_file:
+            index_file = dataset.metadata.spec['bam_index'].param.new_file(dataset=dataset)
+        pysam.index(dataset.file_name, index_file.file_name)
+        dataset.metadata.bam_index = index_file
+        # Now use pysam with BAI index to determine additional metadata
+        try:
+            bam_file = pysam.AlignmentFile(dataset.file_name, mode='rb', index_filename=index_file.file_name)
+            # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
+            dataset.metadata.reference_names = list(bam_file.references)
+            dataset.metadata.reference_lengths = list(bam_file.lengths)
+            dataset.metadata.bam_header = bam_file.header
+            dataset.metadata.read_groups = [read_group['ID'] for read_group in dataset.metadata.bam_header.get('RG', []) if 'ID' in read_group]
+            dataset.metadata.sort_order = bam_file.header.get('HD', {}).get('SO', None)
+            dataset.metadata.bam_version = bam_file.header.get('HD', {}).get('VN', None)
+        except Exception:
+            # Per Dan, don't log here because doing so will cause datasets that
+            # fail metadata to end in the error state
+            pass
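
Editorial aside, not in the commit: once set_meta has built the BAI, the same fields can be read back and region queries work; a minimal sketch with a hypothetical example.bam whose index sits alongside it.

    import pysam

    with pysam.AlignmentFile('example.bam', 'rb') as bam:       # hypothetical path
        print(list(bam.references)[:3], list(bam.lengths)[:3])  # reference_names / reference_lengths
        print(bam.header.get('HD', {}).get('SO'))               # sort_order: 'coordinate' after grooming
        for read in bam.fetch(bam.references[0], 0, 10000):     # random access requires the index
            print(read.query_name)
            break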
+
+    def sniff(self, filename):
+        # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
+        # The first 4 bytes of any BAM file are 'BAM\1', and the file is binary.
+        try:
+            header = gzip.open(filename).read(4)
+            if header == b'BAM\1':
+                return True
+            return False
+        except Exception:
+            return False
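
Editorial aside, not in the commit: the sniff works because BGZF blocks are valid gzip members, so Python's gzip module can decompress the start of the stream and expose the BAM magic; a self-contained sketch of the same check (b'BAM\x01' is the same byte string as b'BAM\1').

    import gzip

    def looks_like_bam(path):
        # BGZF is a gzip-compatible container, so gzip.open() can read the
        # beginning of the decompressed stream, which for BAM is b'BAM\x01'.
        try:
            with gzip.open(path, 'rb') as handle:
                return handle.read(4) == b'BAM\x01'
        except IOError:
            return False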
+
     # ------------- Dataproviders
     # pipe through samtools view
     # ALSO: (as Sam)
