Skip to content

Commit

Permalink
Use subprocess to check if pysam.index succeeds
Browse files Browse the repository at this point in the history
Checking if pysam.index succeeds tests whether a file is coordinate sorted.
If pysam.index fails to index htslib writes to stderr and this fails set meta
tool. To prevent this we run this in a subprocess and discard stderr.
  • Loading branch information
mvdbeek committed Dec 8, 2017
1 parent 369485a commit ae72e56
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 66 deletions.
69 changes: 18 additions & 51 deletions lib/galaxy/datatypes/binary.py
Expand Up @@ -216,49 +216,33 @@ def merge(split_files, output_file):
"""
pysam.merge('-O', 'BAM', output_file, *split_files)

@staticmethod
def _is_coordinate_sorted(file_name):
def dataset_content_needs_grooming(self, file_name):
"""
Check if the input BAM file is sorted from the header information.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> bamfile = get_test_fname('1.bam')
>>> Bam._is_coordinate_sorted(bamfile)
True
>>> bamfile = get_test_fname('1.unsorted.bam')
>>> Bam._is_coordinate_sorted(bamfile)
False
Check if file_name is a coordinate-sorted BAM file
"""
f = pysam.AlignmentFile(file_name)
coordinate_sorted = False
# The best way to ensure that BAM files are coordinate-sorted and indexable
# is to actually index them.
index_name = tempfile.NamedTemporaryFile(prefix="bam_index").name
try:
coordinate_sorted = f.header['HD']['SO'] == 'coordinate'
# If pysam fails to index a file it will write to stderr,
# and this causes the set_meta script to fail. So instead
# we start another process and discard stderr.
cmd = ['python', '-c', "import pysam; pysam.index('%s', '%s')" % (file_name, index_name)]
with open(os.devnull, 'w') as devnull:
subprocess.check_call(cmd, stderr=devnull, shell=False)
needs_sorting = False
except Exception:
needs_sorting = True
try:
os.unlink(index_name)
except Exception:
pass
return coordinate_sorted

def dataset_content_needs_grooming(self, file_name):
"""
Check if file_name is a coordinate-sorted BAM file
"""
# We check if the input BAM file is coordinate-sorted from the header information.
return not self._is_coordinate_sorted(file_name)
return needs_sorting

def groom_dataset_content(self, file_name):
"""
Ensures that the Bam file contents are sorted. This function is called
on an output dataset after the content is initially generated.
>>> from galaxy.datatypes.sniff import get_test_fname
>>> bamfile = get_test_fname('1.unsorted.bam')
>>> # Grooming happens in-place, so we copy the test dataset to a new location
>>> _, temp_path = tempfile.mkstemp()
>>> shutil.copy(bamfile, temp_path)
>>> b = Bam()
>>> b.groom_dataset_content(temp_path)
>>> b.dataset_content_needs_grooming(temp_path)
False
>>> os.remove(temp_path)
"""
# Use pysam to sort the Bam file
# This command may also creates temporary files <out.prefix>.%d.bam when the
Expand Down Expand Up @@ -500,24 +484,7 @@ def get_cram_version(self, filename):

def set_index_file(self, dataset, index_file):
try:
# @todo when pysam 1.2.1 or pysam 1.3.0 gets released and becomes
# a dependency of galaxy, use pysam.index(alignment, target_idx)
# This currently gives coredump in the current release but is
# fixed in the dev branch:
# xref: https://github.com/samtools/samtools/issues/199

dataset_symlink = os.path.join(os.path.dirname(index_file.file_name), '__dataset_%d_%s' % (dataset.id, os.path.basename(index_file.file_name)))
os.symlink(dataset.file_name, dataset_symlink)
pysam.index(dataset_symlink)

tmp_index = dataset_symlink + ".crai"
if os.path.isfile(tmp_index):
shutil.move(tmp_index, index_file.file_name)
return index_file.file_name
else:
os.unlink(dataset_symlink)
log.warning('%s, expected crai index not created for: %s', self, dataset.file_name)
return False
pysam.index(dataset.file_name, index_file.file_name)
except Exception as exc:
log.warning('%s, set_index_file Exception: %s', self, exc)
return False
Expand Down
33 changes: 18 additions & 15 deletions test/unit/datatypes/test_bam.py
Expand Up @@ -19,37 +19,40 @@ def test_merge_bam():

def test_dataset_content_needs_grooming():
b = Bam()
with get_input_files('1.bam') as input_files:
assert b.dataset_content_needs_grooming(input_files[0]) is False
with get_tmp_path() as qname_sorted:
pysam.sort('-n', input_files[0], '-o', qname_sorted)
assert b.dataset_content_needs_grooming(qname_sorted) is True
with get_input_files('1.unsorted.bam') as input_files:
assert b.dataset_content_needs_grooming(input_files[0]) is True
with get_input_files('1.bam', '2.shuffled.bam') as input_files:
sorted_bam, shuffled_bam = input_files
assert b.dataset_content_needs_grooming(sorted_bam) is False
assert b.dataset_content_needs_grooming(shuffled_bam) is True


def test_groom_dataset_content():
b = Bam()
with get_input_files('1.unsorted.bam') as input_files:
with get_input_files('2.shuffled.bam') as input_files:
b.groom_dataset_content(input_files[0])
assert b.dataset_content_needs_grooming(input_files[0]) is False


def test_set_meta():
def test_set_meta_presorted():
b = Bam()
dataset = Bunch()
with get_input_files('1.bam') as input_files, get_tmp_path() as index_path:
dataset.file_name = input_files[0]
dataset.metadata = Bunch()
dataset.metadata.bam_index = Bunch()
dataset.metadata.bam_index.file_name = index_path
with get_dataset('1.bam') as dataset:
b.set_meta(dataset=dataset)
assert dataset.metadata.sort_order == 'coordinate'
bam_file = pysam.AlignmentFile(dataset.file_name, mode='rb',
index_filename=dataset.metadata.bam_index.file_name)
assert bam_file.has_index() is True


@contextmanager
def get_dataset(file):
dataset = Bunch()
dataset.metadata = Bunch()
dataset.metadata.bam_index = Bunch()
with get_input_files(file) as input_files, get_tmp_path() as index_path:
dataset.file_name = input_files[0]
dataset.metadata.bam_index.file_name = index_path
yield dataset


@contextmanager
def get_tmp_path():
_, path = tempfile.mkstemp()
Expand Down

0 comments on commit ae72e56

Please sign in to comment.