Remove the IndexHandlingMode enum; defer errors about unindexed files…

… to Query-invocation time. Eliminate use_index idiom. This should improve ease-of-use and uniformity of reader interfaces. PiperOrigin-RevId: 202998434
google · Jul 2, 2018 · d10b746 · d10b746
1 parent 92a3566
commit d10b746
Show file tree

Hide file tree

Showing 23 changed files with 111 additions and 252 deletions.
diff --git a/deepvariant/labeler/labeled_examples_to_vcf_test.py b/deepvariant/labeler/labeled_examples_to_vcf_test.py
@@ -74,7 +74,7 @@ def test_sample_name_flag(self):
 
     labeled_examples_to_vcf.main(0)
 
-    with vcf.VcfReader(FLAGS.output_vcf, use_index=False) as vcf_reader:
+    with vcf.VcfReader(FLAGS.output_vcf) as vcf_reader:
       self.assertEqual(
           list(vcf_reader.header.sample_names), [FLAGS.sample_name])
 

diff --git a/deepvariant/postprocess_variants.py b/deepvariant/postprocess_variants.py
@@ -856,7 +856,7 @@ def main(argv=()):
           FLAGS.nonvariant_site_tfrecord_path,
           key=_get_contig_based_variant_sort_keyfn(contigs),
           proto=variants_pb2.Variant)
-      with vcf.VcfReader(FLAGS.outfile, use_index=False) as variant_reader:
+      with vcf.VcfReader(FLAGS.outfile) as variant_reader:
         lessthanfn = _get_contig_based_lessthan(contigs)
         gvcf_variants = (
             _transform_to_gvcf_record(variant)

diff --git a/third_party/nucleus/io/BUILD b/third_party/nucleus/io/BUILD
@@ -59,7 +59,6 @@ py_library(
         ":genomics_writer",
         "//third_party/nucleus/io/python:vcf_reader",
         "//third_party/nucleus/io/python:vcf_writer",
-        "//third_party/nucleus/protos:index_py_pb2",
         "//third_party/nucleus/protos:variants_py_pb2",
         "//third_party/nucleus/util:ranges",
         "//third_party/nucleus/util:variant_utils",
@@ -91,7 +90,6 @@ py_library(
         ":genomics_reader",
         ":genomics_writer",
         "//third_party/nucleus/io/python:sam_reader",
-        "//third_party/nucleus/protos:index_py_pb2",
         "//third_party/nucleus/protos:reads_py_pb2",
         "//third_party/nucleus/util:py_utils",
         "//third_party/nucleus/util:ranges",
@@ -383,7 +381,6 @@ cc_library(
         ":reader_base",
         "//third_party/nucleus/platform:types",
         "//third_party/nucleus/protos:cigar_cc_pb2",
-        "//third_party/nucleus/protos:index_cc_pb2",
         "//third_party/nucleus/protos:position_cc_pb2",
         "//third_party/nucleus/protos:range_cc_pb2",
         "//third_party/nucleus/protos:reads_cc_pb2",
@@ -405,7 +402,6 @@ cc_test(
     data = ["//third_party/nucleus/testdata"],
     deps = [
         ":sam_reader",
-        "//third_party/nucleus/protos:index_cc_pb2",
         "//third_party/nucleus/testing:cpp_test_utils",
         "//third_party/nucleus/testing:gunit_extras",
         "//third_party/nucleus/util:cpp_utils",
@@ -424,7 +420,6 @@ cc_library(
         ":reader_base",
         ":vcf_conversion",
         "//third_party/nucleus/platform:types",
-        "//third_party/nucleus/protos:index_cc_pb2",
         "//third_party/nucleus/protos:range_cc_pb2",
         "//third_party/nucleus/protos:reference_cc_pb2",
         "//third_party/nucleus/protos:variants_cc_pb2",

diff --git a/third_party/nucleus/io/genomics_io_noplugin_test.py b/third_party/nucleus/io/genomics_io_noplugin_test.py
@@ -44,7 +44,7 @@ class PluginAbsenceTest(absltest.TestCase):
   def test_tfbam_plugin_does_not_load(self):
     with self.assertRaisesRegexp(
         ImportError, 'tfbam_lib module not found, cannot read .tfbam files.'):
-      _ = sam.SamReader('mouse@25.tfbam', use_index=True)
+      _ = sam.SamReader('mouse@25.tfbam')
 
 
 if __name__ == '__main__':

diff --git a/third_party/nucleus/io/genomics_io_plugins_test.py b/third_party/nucleus/io/genomics_io_plugins_test.py
@@ -42,7 +42,7 @@ class PluginTest(absltest.TestCase):
   """Test that the plugin loads correctly."""
 
   def test_tfbam_plugin_loads(self):
-    reader = sam.SamReader('mouse@25.tfbam', use_index=True)
+    reader = sam.SamReader('mouse@25.tfbam')
     self.assertIsNotNone(reader)
 
 

diff --git a/third_party/nucleus/io/python/BUILD b/third_party/nucleus/io/python/BUILD
@@ -100,12 +100,12 @@ py_test(
     deps = [
         ":sam_reader",
         "//third_party/nucleus/io:clif_postproc",
-        "//third_party/nucleus/protos:index_py_pb2",
         "//third_party/nucleus/protos:reads_py_pb2",
         "//third_party/nucleus/protos:reference_py_pb2",
         "//third_party/nucleus/testing:py_test_utils",
         "//third_party/nucleus/util:ranges",
         "@io_abseil_py//absl/testing:absltest",
+        "@io_abseil_py//absl/testing:parameterized",
     ],
 )
 

diff --git a/third_party/nucleus/io/python/sam_reader_wrap_test.py b/third_party/nucleus/io/python/sam_reader_wrap_test.py
@@ -33,23 +33,21 @@
 from __future__ import print_function
 
 from absl.testing import absltest
+from absl.testing import parameterized
 
 from third_party.nucleus.io import clif_postproc
 from third_party.nucleus.io.python import sam_reader
-from third_party.nucleus.protos import index_pb2
 from third_party.nucleus.protos import reads_pb2
 from third_party.nucleus.protos import reference_pb2
 from third_party.nucleus.testing import test_utils
 from third_party.nucleus.util import ranges
 
 
-class SamReaderTest(absltest.TestCase):
+class SamReaderTest(parameterized.TestCase):
 
   def setUp(self):
     self.bam = test_utils.genomics_core_testdata('test.bam')
     self.options = reads_pb2.SamReaderOptions()
-    self.indexed_options = reads_pb2.SamReaderOptions(
-        index_mode=index_pb2.INDEX_BASED_ON_FILENAME)
 
   def test_bam_iterate(self):
     reader = sam_reader.SamReader.from_file(self.bam, self.options)
@@ -59,7 +57,7 @@ def test_bam_iterate(self):
       self.assertEqual(test_utils.iterable_len(iterable), 106)
 
   def test_bam_query(self):
-    reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
+    reader = sam_reader.SamReader.from_file(self.bam, self.options)
     expected = [(ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
                 (ranges.parse_literal('chr20:10,000,000-10,000,000'), 45)]
     with reader:
@@ -131,7 +129,7 @@ def test_sam_contigs(self):
 
   def test_context_manager(self):
     """Test that we can use context manager to do two queries in sequence."""
-    reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
+    reader = sam_reader.SamReader.from_file(self.bam, self.options)
     region = ranges.parse_literal('chr20:10,000,000-10,000,100')
     with reader:
       with reader.query(region) as query_iterable1:
@@ -146,14 +144,8 @@ def test_from_file_raises_with_missing_bam(self):
                                  'Not found: Could not open missing.bam'):
       sam_reader.SamReader.from_file('missing.bam', self.options)
 
-  def test_from_file_raises_with_missing_index(self):
-    with self.assertRaisesRegexp(ValueError, 'Not found: No index found for'):
-      sam_reader.SamReader.from_file(
-          test_utils.genomics_core_testdata('unindexed.bam'),
-          self.indexed_options)
-
   def test_ops_on_closed_reader_raise(self):
-    reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
+    reader = sam_reader.SamReader.from_file(self.bam, self.options)
     with reader:
       pass
     # At this point the reader is closed.
@@ -162,14 +154,16 @@ def test_ops_on_closed_reader_raise(self):
     with self.assertRaisesRegexp(ValueError, 'Cannot Query a closed'):
       reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))
 
-  def test_query_on_unindexed_reader_raises(self):
-    with sam_reader.SamReader.from_file(self.bam, self.options) as reader:
+  @parameterized.parameters('test.sam', 'unindexed.bam')
+  def test_query_without_index_raises(self, unindexed_file_name):
+    path = test_utils.genomics_core_testdata(unindexed_file_name)
+    window = ranges.parse_literal('chr20:10,000,000-10,000,100')
+    with sam_reader.SamReader.from_file(path, self.options) as reader:
       with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
-        reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))
+        reader.query(window)
 
   def test_query_raises_with_bad_range(self):
-    with sam_reader.SamReader.from_file(self.bam,
-                                        self.indexed_options) as reader:
+    with sam_reader.SamReader.from_file(self.bam, self.options) as reader:
       with self.assertRaisesRegexp(ValueError, 'Unknown reference_name'):
         reader.query(ranges.parse_literal('XXX:1-10'))
       with self.assertRaisesRegexp(ValueError, 'unknown reference interval'):

diff --git a/third_party/nucleus/io/python/vcf_reader_wrap_test.py b/third_party/nucleus/io/python/vcf_reader_wrap_test.py
@@ -35,7 +35,6 @@
 from absl.testing import absltest
 
 from third_party.nucleus.io.python import vcf_reader
-from third_party.nucleus.protos import index_pb2
 from third_party.nucleus.protos import reference_pb2
 from third_party.nucleus.protos import variants_pb2
 from third_party.nucleus.testing import test_utils
@@ -151,15 +150,13 @@
 class WrapVcfReaderTests(absltest.TestCase):
 
   def setUp(self):
-    self.unindexed_options = variants_pb2.VcfReaderOptions()
-    self.indexed_options = variants_pb2.VcfReaderOptions(
-        index_mode=index_pb2.INDEX_BASED_ON_FILENAME)
     self.sites_vcf = test_utils.genomics_core_testdata('test_sites.vcf')
     self.samples_vcf = test_utils.genomics_core_testdata('test_samples.vcf.gz')
+    self.options = variants_pb2.VcfReaderOptions()
     self.sites_reader = vcf_reader.VcfReader.from_file(self.sites_vcf,
-                                                       self.unindexed_options)
+                                                       self.options)
     self.samples_reader = vcf_reader.VcfReader.from_file(
-        self.samples_vcf, self.indexed_options)
+        self.samples_vcf, self.options)
 
   def test_vcf_iterate(self):
     iterable = self.sites_reader.iterate()
@@ -207,13 +204,7 @@ def test_vcf_query(self):
   def test_from_file_raises_with_missing_source(self):
     with self.assertRaisesRegexp(ValueError,
                                  'Not found: Could not open missing.vcf'):
-      vcf_reader.VcfReader.from_file('missing.vcf', self.unindexed_options)
-
-  def test_from_file_raises_with_missing_index(self):
-    with self.assertRaisesRegexp(ValueError, 'Not found: No index found for'):
-      vcf_reader.VcfReader.from_file(
-          test_utils.genomics_core_testdata('test_sites.vcf'),
-          self.indexed_options)
+      vcf_reader.VcfReader.from_file('missing.vcf', self.options)
 
   def test_ops_on_closed_reader_raise(self):
     with self.samples_reader:
@@ -226,10 +217,11 @@ def test_ops_on_closed_reader_raise(self):
           ranges.parse_literal('chr1:10,000,000-10,000,100'))
 
   def test_query_on_unindexed_reader_raises(self):
-    with vcf_reader.VcfReader.from_file(self.samples_vcf,
-                                        self.unindexed_options) as reader:
+    window = ranges.parse_literal('chr1:10,000,000-10,000,100')
+    unindexed_file = test_utils.genomics_core_testdata('test_samples.vcf')
+    with vcf_reader.VcfReader.from_file(unindexed_file, self.options) as reader:
       with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
-        reader.query(ranges.parse_literal('chr1:10,000,000-10,000,100'))
+        reader.query(window)
 
   def test_query_raises_with_bad_range(self):
     with self.assertRaisesRegexp(ValueError, 'Unknown reference_name'):
@@ -242,8 +234,7 @@ def test_query_raises_with_bad_range(self):
       self.samples_reader.query(ranges.parse_literal('chr1:10-5'))
 
   def test_context_manager(self):
-    with vcf_reader.VcfReader.from_file(self.sites_vcf,
-                                        self.unindexed_options) as f:
+    with vcf_reader.VcfReader.from_file(self.sites_vcf, self.options) as f:
       self.assertEqual(expected_sites_contigs, list(f.header.contigs))
 
   # Commented out because we in fact don't detect the malformed VCF yet. It is

diff --git a/third_party/nucleus/io/python/vcf_writer_wrap_test.py b/third_party/nucleus/io/python/vcf_writer_wrap_test.py
@@ -292,7 +292,7 @@ def test_round_trip_vcf(self, test_datum_name):
     in_file = test_utils.genomics_core_testdata(test_datum_name)
     out_file = test_utils.test_tmpfile('output_' + test_datum_name)
 
-    v1_reader = vcf.VcfReader(in_file, use_index=False)
+    v1_reader = vcf.VcfReader(in_file)
     v1_records = list(v1_reader.iterate())
     self.assertTrue(v1_records, 'Reader failed to find records')
 
@@ -304,7 +304,7 @@ def test_round_trip_vcf(self, test_datum_name):
       for record in v1_records:
         writer.write(record)
 
-    v2_reader = vcf.VcfReader(out_file, use_index=False)
+    v2_reader = vcf.VcfReader(out_file)
     v2_records = list(v2_reader.iterate())
 
     self.assertEqual(v1_records, v2_records,

diff --git a/third_party/nucleus/io/sam.py b/third_party/nucleus/io/sam.py
@@ -71,7 +71,6 @@
 from third_party.nucleus.io import genomics_reader
 from third_party.nucleus.io import genomics_writer
 from third_party.nucleus.io.python import sam_reader
-from third_party.nucleus.protos import index_pb2
 from third_party.nucleus.protos import reads_pb2
 from third_party.nucleus.util import ranges
 from third_party.nucleus.util import utils
@@ -86,7 +85,6 @@ class NativeSamReader(genomics_reader.GenomicsReader):
   """
 
   def __init__(self, input_path,
-               use_index=True,
                read_requirements=None,
                parse_aux_fields=False,
                hts_block_size=None,
@@ -97,10 +95,6 @@ def __init__(self, input_path,
     Args:
       input_path: str. A path to a resource containing SAM/BAM records.
         Currently supports SAM text format and BAM binary format.
-      use_index: optional bool, defaulting to True. If True, we will attempt to
-        load an index file for input_path to enable the query() API call. If
-        True an index file must exist. If False, we will not attempt to load an
-        index for input_path, disabling the query() call.
       read_requirements: optional ReadRequirement proto. If not None, this proto
         is used to control which reads are filtered out by the reader before
         they are passed to the client.
@@ -133,18 +127,13 @@ def __init__(self, input_path,
         self._reader = tfbam_reader.make_sam_reader(
             input_path,
             read_requirements=read_requirements,
-            use_index=use_index,
             unused_block_size=hts_block_size,
             downsample_fraction=downsample_fraction,
             random_seed=random_seed)
       except ImportError:
         raise ImportError(
             'tfbam_lib module not found, cannot read .tfbam files.')
     else:
-      index_mode = index_pb2.INDEX_BASED_ON_FILENAME
-      if not use_index:
-        index_mode = index_pb2.DONT_USE_INDEX
-
       aux_field_handling = reads_pb2.SamReaderOptions.SKIP_AUX_FIELDS
       if parse_aux_fields:
         aux_field_handling = reads_pb2.SamReaderOptions.PARSE_ALL_AUX_FIELDS
@@ -165,7 +154,6 @@ def __init__(self, input_path,
           input_path.encode('utf8'),
           reads_pb2.SamReaderOptions(
               read_requirements=read_requirements,
-              index_mode=index_mode,
               aux_field_handling=aux_field_handling,
               hts_block_size=(hts_block_size or 0),
               downsample_fraction=downsample_fraction,

diff --git a/third_party/nucleus/io/sam_reader.cc b/third_party/nucleus/io/sam_reader.cc
@@ -41,7 +41,6 @@
 #include "htslib/sam.h"
 #include "third_party/nucleus/io/hts_path.h"
 #include "third_party/nucleus/protos/cigar.pb.h"
-#include "third_party/nucleus/protos/index.pb.h"
 #include "third_party/nucleus/protos/position.pb.h"
 #include "third_party/nucleus/protos/range.pb.h"
 #include "third_party/nucleus/protos/reads.pb.h"
@@ -76,6 +75,10 @@ constexpr char kSamReadGroupTag[] = "@RG";
 constexpr char kSamProgramTag[] = "@PG";
 constexpr char kSamCommentTag[] = "@CO";
 
+bool FileTypeIsIndexable(htsFormat format) {
+  return format.format == bam;
+}
+
 void AddHeaderLineToHeader(const string& line, SamHeader& header) {
   static constexpr char kVersionTag[] = "VN:";
   static constexpr char kSortingOrderTag[] = "SO:";
@@ -574,13 +577,10 @@ StatusOr<std::unique_ptr<SamReader>> SamReader::FromFile(
     return tf::errors::Unknown("Couldn't parse header for ", fp->fn);
 
   hts_idx_t* idx = nullptr;
-  if (options.index_mode() ==
-      nucleus::genomics::v1::IndexHandlingMode::INDEX_BASED_ON_FILENAME) {
+  if (FileTypeIsIndexable(fp->format)) {
     // redacted
+    // This call may return null, which we will look for at Query time.
     idx = sam_index_load(fp, fp->fn);
-    if (idx == nullptr) {
-      return tf::errors::NotFound("No index found for ", fp->fn);
-    }
   }
 
   return std::unique_ptr<SamReader>(

diff --git a/third_party/nucleus/io/sam_reader.h b/third_party/nucleus/io/sam_reader.h
@@ -79,8 +79,9 @@ class SamReader : public Reader {
   // reads_path must point to an existing SAM/BAM formatted file (text SAM or
   // compressed or uncompressed BAM file).
   //
-  // If options.index_mode indicates we should load an index, this constructor
-  // will attempt to load a BAI index from file reads_path + '.bai'.
+  // If the filetype is BAM, this constructor will attempt to load a BAI index
+  // from file reads_path + '.bai'; if the index is not found, attempts to Query
+  // will fail.
   //
   // Returns a StatusOr that is OK if the SamReader could be successfully
   // created or an error code indicating the error that occurred.
@@ -114,8 +115,8 @@ class SamReader : public Reader {
   // The specific parsing, filtering, etc behavior is determined by the options
   // provided during construction.
   //
-  // This function is only available if an index was loaded. If no index was
-  // loaded a non-OK status value will be returned.
+  // If no index was loaded by the constructor a non-OK status value will be
+  // returned.
   //
   // If range isn't a valid interval in this BAM file a non-OK status value will
   // be returned.

diff --git a/third_party/nucleus/io/sam_reader_test.cc b/third_party/nucleus/io/sam_reader_test.cc
@@ -34,7 +34,6 @@
 
 #include <string>
 
-#include "third_party/nucleus/protos/index.pb.h"
 #include "third_party/nucleus/testing/protocol-buffer-matchers.h"
 #include "third_party/nucleus/testing/test_utils.h"
 #include "third_party/nucleus/util/utils.h"
@@ -150,8 +149,6 @@ TEST(SamReaderTest, TestHeaderlessSamIsNotOkay) {
 class SamReaderQueryTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    options_.set_index_mode(
-        nucleus::genomics::v1::IndexHandlingMode::INDEX_BASED_ON_FILENAME);
     indexed_bam_ = GetTestData(kBamTestFilename);
     RecreateReader();
   }