Skip to content

Commit

Permalink
Remove the IndexHandlingMode enum; defer errors about unindexed files…
Browse files Browse the repository at this point in the history
… to Query-invocation time. Eliminate use_index idiom.

This should improve ease-of-use and uniformity of reader interfaces.

PiperOrigin-RevId: 202998434
  • Loading branch information
Genomics team in Google Brain authored and Copybara-Service committed Jul 2, 2018
1 parent 92a3566 commit d10b746
Show file tree
Hide file tree
Showing 23 changed files with 111 additions and 252 deletions.
2 changes: 1 addition & 1 deletion deepvariant/labeler/labeled_examples_to_vcf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_sample_name_flag(self):

labeled_examples_to_vcf.main(0)

with vcf.VcfReader(FLAGS.output_vcf, use_index=False) as vcf_reader:
with vcf.VcfReader(FLAGS.output_vcf) as vcf_reader:
self.assertEqual(
list(vcf_reader.header.sample_names), [FLAGS.sample_name])

Expand Down
2 changes: 1 addition & 1 deletion deepvariant/postprocess_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,7 +856,7 @@ def main(argv=()):
FLAGS.nonvariant_site_tfrecord_path,
key=_get_contig_based_variant_sort_keyfn(contigs),
proto=variants_pb2.Variant)
with vcf.VcfReader(FLAGS.outfile, use_index=False) as variant_reader:
with vcf.VcfReader(FLAGS.outfile) as variant_reader:
lessthanfn = _get_contig_based_lessthan(contigs)
gvcf_variants = (
_transform_to_gvcf_record(variant)
Expand Down
5 changes: 0 additions & 5 deletions third_party/nucleus/io/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ py_library(
":genomics_writer",
"//third_party/nucleus/io/python:vcf_reader",
"//third_party/nucleus/io/python:vcf_writer",
"//third_party/nucleus/protos:index_py_pb2",
"//third_party/nucleus/protos:variants_py_pb2",
"//third_party/nucleus/util:ranges",
"//third_party/nucleus/util:variant_utils",
Expand Down Expand Up @@ -91,7 +90,6 @@ py_library(
":genomics_reader",
":genomics_writer",
"//third_party/nucleus/io/python:sam_reader",
"//third_party/nucleus/protos:index_py_pb2",
"//third_party/nucleus/protos:reads_py_pb2",
"//third_party/nucleus/util:py_utils",
"//third_party/nucleus/util:ranges",
Expand Down Expand Up @@ -383,7 +381,6 @@ cc_library(
":reader_base",
"//third_party/nucleus/platform:types",
"//third_party/nucleus/protos:cigar_cc_pb2",
"//third_party/nucleus/protos:index_cc_pb2",
"//third_party/nucleus/protos:position_cc_pb2",
"//third_party/nucleus/protos:range_cc_pb2",
"//third_party/nucleus/protos:reads_cc_pb2",
Expand All @@ -405,7 +402,6 @@ cc_test(
data = ["//third_party/nucleus/testdata"],
deps = [
":sam_reader",
"//third_party/nucleus/protos:index_cc_pb2",
"//third_party/nucleus/testing:cpp_test_utils",
"//third_party/nucleus/testing:gunit_extras",
"//third_party/nucleus/util:cpp_utils",
Expand All @@ -424,7 +420,6 @@ cc_library(
":reader_base",
":vcf_conversion",
"//third_party/nucleus/platform:types",
"//third_party/nucleus/protos:index_cc_pb2",
"//third_party/nucleus/protos:range_cc_pb2",
"//third_party/nucleus/protos:reference_cc_pb2",
"//third_party/nucleus/protos:variants_cc_pb2",
Expand Down
2 changes: 1 addition & 1 deletion third_party/nucleus/io/genomics_io_noplugin_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class PluginAbsenceTest(absltest.TestCase):
def test_tfbam_plugin_does_not_load(self):
with self.assertRaisesRegexp(
ImportError, 'tfbam_lib module not found, cannot read .tfbam files.'):
_ = sam.SamReader('mouse@25.tfbam', use_index=True)
_ = sam.SamReader('mouse@25.tfbam')


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion third_party/nucleus/io/genomics_io_plugins_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class PluginTest(absltest.TestCase):
"""Test that the plugin loads correctly."""

def test_tfbam_plugin_loads(self):
reader = sam.SamReader('mouse@25.tfbam', use_index=True)
reader = sam.SamReader('mouse@25.tfbam')
self.assertIsNotNone(reader)


Expand Down
2 changes: 1 addition & 1 deletion third_party/nucleus/io/python/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,12 @@ py_test(
deps = [
":sam_reader",
"//third_party/nucleus/io:clif_postproc",
"//third_party/nucleus/protos:index_py_pb2",
"//third_party/nucleus/protos:reads_py_pb2",
"//third_party/nucleus/protos:reference_py_pb2",
"//third_party/nucleus/testing:py_test_utils",
"//third_party/nucleus/util:ranges",
"@io_abseil_py//absl/testing:absltest",
"@io_abseil_py//absl/testing:parameterized",
],
)

Expand Down
30 changes: 12 additions & 18 deletions third_party/nucleus/io/python/sam_reader_wrap_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,21 @@
from __future__ import print_function

from absl.testing import absltest
from absl.testing import parameterized

from third_party.nucleus.io import clif_postproc
from third_party.nucleus.io.python import sam_reader
from third_party.nucleus.protos import index_pb2
from third_party.nucleus.protos import reads_pb2
from third_party.nucleus.protos import reference_pb2
from third_party.nucleus.testing import test_utils
from third_party.nucleus.util import ranges


class SamReaderTest(absltest.TestCase):
class SamReaderTest(parameterized.TestCase):

def setUp(self):
self.bam = test_utils.genomics_core_testdata('test.bam')
self.options = reads_pb2.SamReaderOptions()
self.indexed_options = reads_pb2.SamReaderOptions(
index_mode=index_pb2.INDEX_BASED_ON_FILENAME)

def test_bam_iterate(self):
reader = sam_reader.SamReader.from_file(self.bam, self.options)
Expand All @@ -59,7 +57,7 @@ def test_bam_iterate(self):
self.assertEqual(test_utils.iterable_len(iterable), 106)

def test_bam_query(self):
reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
reader = sam_reader.SamReader.from_file(self.bam, self.options)
expected = [(ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
(ranges.parse_literal('chr20:10,000,000-10,000,000'), 45)]
with reader:
Expand Down Expand Up @@ -131,7 +129,7 @@ def test_sam_contigs(self):

def test_context_manager(self):
"""Test that we can use context manager to do two queries in sequence."""
reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
reader = sam_reader.SamReader.from_file(self.bam, self.options)
region = ranges.parse_literal('chr20:10,000,000-10,000,100')
with reader:
with reader.query(region) as query_iterable1:
Expand All @@ -146,14 +144,8 @@ def test_from_file_raises_with_missing_bam(self):
'Not found: Could not open missing.bam'):
sam_reader.SamReader.from_file('missing.bam', self.options)

def test_from_file_raises_with_missing_index(self):
with self.assertRaisesRegexp(ValueError, 'Not found: No index found for'):
sam_reader.SamReader.from_file(
test_utils.genomics_core_testdata('unindexed.bam'),
self.indexed_options)

def test_ops_on_closed_reader_raise(self):
reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
reader = sam_reader.SamReader.from_file(self.bam, self.options)
with reader:
pass
# At this point the reader is closed.
Expand All @@ -162,14 +154,16 @@ def test_ops_on_closed_reader_raise(self):
with self.assertRaisesRegexp(ValueError, 'Cannot Query a closed'):
reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))

def test_query_on_unindexed_reader_raises(self):
with sam_reader.SamReader.from_file(self.bam, self.options) as reader:
@parameterized.parameters('test.sam', 'unindexed.bam')
def test_query_without_index_raises(self, unindexed_file_name):
path = test_utils.genomics_core_testdata(unindexed_file_name)
window = ranges.parse_literal('chr20:10,000,000-10,000,100')
with sam_reader.SamReader.from_file(path, self.options) as reader:
with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))
reader.query(window)

def test_query_raises_with_bad_range(self):
with sam_reader.SamReader.from_file(self.bam,
self.indexed_options) as reader:
with sam_reader.SamReader.from_file(self.bam, self.options) as reader:
with self.assertRaisesRegexp(ValueError, 'Unknown reference_name'):
reader.query(ranges.parse_literal('XXX:1-10'))
with self.assertRaisesRegexp(ValueError, 'unknown reference interval'):
Expand Down
27 changes: 9 additions & 18 deletions third_party/nucleus/io/python/vcf_reader_wrap_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from absl.testing import absltest

from third_party.nucleus.io.python import vcf_reader
from third_party.nucleus.protos import index_pb2
from third_party.nucleus.protos import reference_pb2
from third_party.nucleus.protos import variants_pb2
from third_party.nucleus.testing import test_utils
Expand Down Expand Up @@ -151,15 +150,13 @@
class WrapVcfReaderTests(absltest.TestCase):

def setUp(self):
self.unindexed_options = variants_pb2.VcfReaderOptions()
self.indexed_options = variants_pb2.VcfReaderOptions(
index_mode=index_pb2.INDEX_BASED_ON_FILENAME)
self.sites_vcf = test_utils.genomics_core_testdata('test_sites.vcf')
self.samples_vcf = test_utils.genomics_core_testdata('test_samples.vcf.gz')
self.options = variants_pb2.VcfReaderOptions()
self.sites_reader = vcf_reader.VcfReader.from_file(self.sites_vcf,
self.unindexed_options)
self.options)
self.samples_reader = vcf_reader.VcfReader.from_file(
self.samples_vcf, self.indexed_options)
self.samples_vcf, self.options)

def test_vcf_iterate(self):
iterable = self.sites_reader.iterate()
Expand Down Expand Up @@ -207,13 +204,7 @@ def test_vcf_query(self):
def test_from_file_raises_with_missing_source(self):
with self.assertRaisesRegexp(ValueError,
'Not found: Could not open missing.vcf'):
vcf_reader.VcfReader.from_file('missing.vcf', self.unindexed_options)

def test_from_file_raises_with_missing_index(self):
with self.assertRaisesRegexp(ValueError, 'Not found: No index found for'):
vcf_reader.VcfReader.from_file(
test_utils.genomics_core_testdata('test_sites.vcf'),
self.indexed_options)
vcf_reader.VcfReader.from_file('missing.vcf', self.options)

def test_ops_on_closed_reader_raise(self):
with self.samples_reader:
Expand All @@ -226,10 +217,11 @@ def test_ops_on_closed_reader_raise(self):
ranges.parse_literal('chr1:10,000,000-10,000,100'))

def test_query_on_unindexed_reader_raises(self):
with vcf_reader.VcfReader.from_file(self.samples_vcf,
self.unindexed_options) as reader:
window = ranges.parse_literal('chr1:10,000,000-10,000,100')
unindexed_file = test_utils.genomics_core_testdata('test_samples.vcf')
with vcf_reader.VcfReader.from_file(unindexed_file, self.options) as reader:
with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
reader.query(ranges.parse_literal('chr1:10,000,000-10,000,100'))
reader.query(window)

def test_query_raises_with_bad_range(self):
with self.assertRaisesRegexp(ValueError, 'Unknown reference_name'):
Expand All @@ -242,8 +234,7 @@ def test_query_raises_with_bad_range(self):
self.samples_reader.query(ranges.parse_literal('chr1:10-5'))

def test_context_manager(self):
with vcf_reader.VcfReader.from_file(self.sites_vcf,
self.unindexed_options) as f:
with vcf_reader.VcfReader.from_file(self.sites_vcf, self.options) as f:
self.assertEqual(expected_sites_contigs, list(f.header.contigs))

# Commented out because we in fact don't detect the malformed VCF yet. It is
Expand Down
4 changes: 2 additions & 2 deletions third_party/nucleus/io/python/vcf_writer_wrap_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def test_round_trip_vcf(self, test_datum_name):
in_file = test_utils.genomics_core_testdata(test_datum_name)
out_file = test_utils.test_tmpfile('output_' + test_datum_name)

v1_reader = vcf.VcfReader(in_file, use_index=False)
v1_reader = vcf.VcfReader(in_file)
v1_records = list(v1_reader.iterate())
self.assertTrue(v1_records, 'Reader failed to find records')

Expand All @@ -304,7 +304,7 @@ def test_round_trip_vcf(self, test_datum_name):
for record in v1_records:
writer.write(record)

v2_reader = vcf.VcfReader(out_file, use_index=False)
v2_reader = vcf.VcfReader(out_file)
v2_records = list(v2_reader.iterate())

self.assertEqual(v1_records, v2_records,
Expand Down
12 changes: 0 additions & 12 deletions third_party/nucleus/io/sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@
from third_party.nucleus.io import genomics_reader
from third_party.nucleus.io import genomics_writer
from third_party.nucleus.io.python import sam_reader
from third_party.nucleus.protos import index_pb2
from third_party.nucleus.protos import reads_pb2
from third_party.nucleus.util import ranges
from third_party.nucleus.util import utils
Expand All @@ -86,7 +85,6 @@ class NativeSamReader(genomics_reader.GenomicsReader):
"""

def __init__(self, input_path,
use_index=True,
read_requirements=None,
parse_aux_fields=False,
hts_block_size=None,
Expand All @@ -97,10 +95,6 @@ def __init__(self, input_path,
Args:
input_path: str. A path to a resource containing SAM/BAM records.
Currently supports SAM text format and BAM binary format.
use_index: optional bool, defaulting to True. If True, we will attempt to
load an index file for input_path to enable the query() API call. If
True an index file must exist. If False, we will not attempt to load an
index for input_path, disabling the query() call.
read_requirements: optional ReadRequirement proto. If not None, this proto
is used to control which reads are filtered out by the reader before
they are passed to the client.
Expand Down Expand Up @@ -133,18 +127,13 @@ def __init__(self, input_path,
self._reader = tfbam_reader.make_sam_reader(
input_path,
read_requirements=read_requirements,
use_index=use_index,
unused_block_size=hts_block_size,
downsample_fraction=downsample_fraction,
random_seed=random_seed)
except ImportError:
raise ImportError(
'tfbam_lib module not found, cannot read .tfbam files.')
else:
index_mode = index_pb2.INDEX_BASED_ON_FILENAME
if not use_index:
index_mode = index_pb2.DONT_USE_INDEX

aux_field_handling = reads_pb2.SamReaderOptions.SKIP_AUX_FIELDS
if parse_aux_fields:
aux_field_handling = reads_pb2.SamReaderOptions.PARSE_ALL_AUX_FIELDS
Expand All @@ -165,7 +154,6 @@ def __init__(self, input_path,
input_path.encode('utf8'),
reads_pb2.SamReaderOptions(
read_requirements=read_requirements,
index_mode=index_mode,
aux_field_handling=aux_field_handling,
hts_block_size=(hts_block_size or 0),
downsample_fraction=downsample_fraction,
Expand Down
12 changes: 6 additions & 6 deletions third_party/nucleus/io/sam_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
#include "htslib/sam.h"
#include "third_party/nucleus/io/hts_path.h"
#include "third_party/nucleus/protos/cigar.pb.h"
#include "third_party/nucleus/protos/index.pb.h"
#include "third_party/nucleus/protos/position.pb.h"
#include "third_party/nucleus/protos/range.pb.h"
#include "third_party/nucleus/protos/reads.pb.h"
Expand Down Expand Up @@ -76,6 +75,10 @@ constexpr char kSamReadGroupTag[] = "@RG";
constexpr char kSamProgramTag[] = "@PG";
constexpr char kSamCommentTag[] = "@CO";

bool FileTypeIsIndexable(htsFormat format) {
return format.format == bam;
}

void AddHeaderLineToHeader(const string& line, SamHeader& header) {
static constexpr char kVersionTag[] = "VN:";
static constexpr char kSortingOrderTag[] = "SO:";
Expand Down Expand Up @@ -574,13 +577,10 @@ StatusOr<std::unique_ptr<SamReader>> SamReader::FromFile(
return tf::errors::Unknown("Couldn't parse header for ", fp->fn);

hts_idx_t* idx = nullptr;
if (options.index_mode() ==
nucleus::genomics::v1::IndexHandlingMode::INDEX_BASED_ON_FILENAME) {
if (FileTypeIsIndexable(fp->format)) {
// redacted
// This call may return null, which we will look for at Query time.
idx = sam_index_load(fp, fp->fn);
if (idx == nullptr) {
return tf::errors::NotFound("No index found for ", fp->fn);
}
}

return std::unique_ptr<SamReader>(
Expand Down
9 changes: 5 additions & 4 deletions third_party/nucleus/io/sam_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ class SamReader : public Reader {
// reads_path must point to an existing SAM/BAM formatted file (text SAM or
// compressed or uncompressed BAM file).
//
// If options.index_mode indicates we should load an index, this constructor
// will attempt to load a BAI index from file reads_path + '.bai'.
// If the filetype is BAM, this constructor will attempt to load a BAI index
// from file reads_path + '.bai'; if the index is not found, attempts to Query
// will fail.
//
// Returns a StatusOr that is OK if the SamReader could be successfully
// created or an error code indicating the error that occurred.
Expand Down Expand Up @@ -114,8 +115,8 @@ class SamReader : public Reader {
// The specific parsing, filtering, etc behavior is determined by the options
// provided during construction.
//
// This function is only available if an index was loaded. If no index was
// loaded a non-OK status value will be returned.
// If no index was loaded by the constructor a non-OK status value will be
// returned.
//
// If range isn't a valid interval in this BAM file a non-OK status value will
// be returned.
Expand Down
3 changes: 0 additions & 3 deletions third_party/nucleus/io/sam_reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

#include <string>

#include "third_party/nucleus/protos/index.pb.h"
#include "third_party/nucleus/testing/protocol-buffer-matchers.h"
#include "third_party/nucleus/testing/test_utils.h"
#include "third_party/nucleus/util/utils.h"
Expand Down Expand Up @@ -150,8 +149,6 @@ TEST(SamReaderTest, TestHeaderlessSamIsNotOkay) {
class SamReaderQueryTest : public ::testing::Test {
protected:
void SetUp() override {
options_.set_index_mode(
nucleus::genomics::v1::IndexHandlingMode::INDEX_BASED_ON_FILENAME);
indexed_bam_ = GetTestData(kBamTestFilename);
RecreateReader();
}
Expand Down

0 comments on commit d10b746

Please sign in to comment.