googlegenomics · bashir2 · Mar 20, 2018 · Mar 13, 2018 · Mar 16, 2018 · Mar 16, 2018
diff --git a/gcp_variant_transforms/libs/processed_variant.py b/gcp_variant_transforms/libs/processed_variant.py
@@ -51,6 +51,10 @@
 # matching was ambiguous or not.
 _ANNOTATION_ALT_AMBIGUOUS = 'ambiguous_allele'
 
+# The annotation field that VEP uses to record the index of the alternate
+# allele (i.e., ALT) that an annotation list is for.
+_ALLELE_NUM_ANNOTATION = 'ALLELE_NUM'
+
 
 # Counter names
 class _CounterEnum(enum.Enum):
@@ -59,6 +63,8 @@ class _CounterEnum(enum.Enum):
   ANNOTATION_ALT_MINIMAL_MATCH = 'annotation_alt_minimal_match_counter'
   ANNOTATION_ALT_MINIMAL_AMBIGUOUS = 'annotation_alt_minimal_ambiguous_counter'
   ANNOTATION_ALT_MISMATCH = 'annotation_alt_mismatch_counter'
+  ALLELE_NUM_MISSING = 'allele_num_missing'
+  ALLELE_NUM_INCORRECT = 'allele_num_incorrect'
 
 
 class ProcessedVariant(object):
@@ -190,6 +196,7 @@ def __init__(
       header_fields,  # type: vcf_header_parser.HeaderFields
       split_alternate_allele_info_fields=True,  # type: bool
       annotation_fields=None,  # type: List[str]
+      use_allele_num=False,  # type: bool
       minimal_match=False,  # type: bool
       counter_factory=None  # type: metrics_util.CounterFactoryInterface
   ):
@@ -203,6 +210,8 @@ def __init__(
       annotation_fields: If provided, this is the list of INFO field names that
         store variant annotations. The format of how annotations are stored and
         their names are extracted from header_fields.
+      use_allele_num: If set, then "ALLELE_NUM" annotation is used to determine
+        the index of the ALT that corresponds to an annotation set.
       minimal_match: If set, then the --minimal mode of VEP is simulated for
         annotation ALT matching.
     """
@@ -214,7 +223,8 @@ def __init__(
     self._variant_counter = cfactory.create_counter(
         _CounterEnum.VARIANT.value)
     self._annotation_processor = _AnnotationProcessor(
-        annotation_fields, self._header_fields, cfactory, minimal_match)
+        annotation_fields, self._header_fields, cfactory, use_allele_num,
+        minimal_match)
     self._minimal_match = minimal_match
 
   def create_processed_variant(self, variant):
@@ -337,6 +347,7 @@ def __init__(self,
                annotation_fields,  # type: List[str]
                header_fields,  # type: vcf_header_parser.HeaderFields
                counter_factory,  # type: metrics_util.CounterFactoryInterface
+               use_allele_num,  # type: bool
                minimal_match,  # type: bool
               ):
     """Creates an instance for adding annotations to `ProcessedVariant` objects.
@@ -366,6 +377,11 @@ def __init__(self,
         _CounterEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value)
     self._alt_mismatch_counter = counter_factory.create_counter(
         _CounterEnum.ANNOTATION_ALT_MISMATCH.value)
+    self._allele_num_missing_counter = counter_factory.create_counter(
+        _CounterEnum.ALLELE_NUM_MISSING.value)
+    self._allele_num_incorrect_counter = counter_factory.create_counter(
+        _CounterEnum.ALLELE_NUM_INCORRECT.value)
+    self._use_allele_num = use_allele_num
     self._minimal_match = minimal_match
 
   def add_annotation_data(self, proc_var, annotation_field_name, data):
@@ -398,12 +414,21 @@ def add_annotation_data(self, proc_var, annotation_field_name, data):
     alt_annotation_map = self._convert_annotation_strs_to_alt_map(
         annotation_field_name, data)
     for alt_bases, annotations_list in alt_annotation_map.iteritems():
-      alt, ambiguous = self._find_matching_alt(
-          proc_var, common_prefix, alt_bases, annotation_field_name)
-      if alt:
-        if self._minimal_match:
-          self._add_ambiguous_fields(annotations_list, ambiguous)
-        alt._info[annotation_field_name] = annotations_list
+      if self._use_allele_num:
+        # TODO(bashir2): This class needs a major refactoring which should be
+        # done as part of creating a class for holding annotation data. The
+        # choice of mapping annotation lists to their annotation ALT string
+        # needs to be revisited in case of using ALLELE_NUM.
+        for annotation_dict in annotations_list:
+          self._add_annotations_by_allele_num(
+              proc_var, annotation_dict, annotation_field_name)
+      else:
+        alt, ambiguous = self._find_matching_alt(
+            proc_var, common_prefix, alt_bases, annotation_field_name)
+        if alt:
+          if self._minimal_match:
+            self._add_ambiguous_fields(annotations_list, ambiguous)
+          alt._info[annotation_field_name] = annotations_list
 
   def _find_common_alt_ref_prefix(self, proc_var):
     # type: (ProcessedVariant) -> str
@@ -599,6 +624,27 @@ def _alt_matches_annotation_alt_minimal_mode(
       return True
     return False
 
+  def _add_annotations_by_allele_num(
+      self, proc_var, annotation_dict, annotation_field_name):
+    # type: (ProcessedVariant, Dict[str, str], str) -> None
+    if _ALLELE_NUM_ANNOTATION not in annotation_dict:
+      self._allele_num_missing_counter.inc()
+      return
+    index_str = annotation_dict[_ALLELE_NUM_ANNOTATION]
+    try:
+      alt_index = int(index_str) - 1
+      alt_list = proc_var._alternate_datas
+      if alt_index >= len(alt_list) or alt_index < 0:
+        raise ValueError
+      alt = alt_list[alt_index]
+      self._alt_match_counter.inc()
+      if annotation_field_name not in alt._info:
+        alt._info[annotation_field_name] = [annotation_dict]
+      else:
+        alt._info[annotation_field_name].append(annotation_dict)
+    except ValueError:
+      self._allele_num_incorrect_counter.inc()
+
 
 def _extract_annotation_list_with_alt(annotation_str):
   # type: (str) -> List[str]

diff --git a/gcp_variant_transforms/libs/processed_variant_test.py b/gcp_variant_transforms/libs/processed_variant_test.py
@@ -397,4 +397,66 @@ def test_create_processed_variant_annotation_alt_minimal(self):
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 1)
 
+  def test_create_processed_variant_annotation_alt_allele_num(self):
+    csq_info = parser._Info(
+        id=None, num='.', type=None,
+        desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
+        source=None, version=None)
+    header_fields = vcf_header_parser.HeaderFields(
+        infos={'CSQ': csq_info}, formats={})
+    variant = vcfio.Variant(
+        reference_name='19', start=11, end=12, reference_bases='C',
+        # The following represent a SNV and an insertion, resp.
+        alternate_bases=['T', 'CT'],
+        names=['rs1'], quality=2,
+        filters=['PASS'],
+        # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
+        # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
+        # But because there is ALLELE_NUM there should be no ambiguity.
+        # The last four annotations have incorrect ALLELE_NUMs.
+        info={'CSQ': vcfio.VariantInfo(
+            data=[
+                'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
+                'T|C5|I5|TEST', 'T|C6|I6|'], field_count='.')})
+    counter_factory = _CounterSpyFactory()
+    factory = processed_variant.ProcessedVariantFactory(
+        header_fields,
+        split_alternate_allele_info_fields=True,
+        annotation_fields=['CSQ'],
+        use_allele_num=True,
+        minimal_match=True,  # This should be ignored by the factory method.
+        counter_factory=counter_factory)
+    proc_var = factory.create_processed_variant(variant)
+    alt1 = processed_variant.AlternateBaseData('T')
+    alt1._info = {
+        'CSQ': [
+            {processed_variant._ANNOTATION_ALT: 'T',
+             'Consequence': 'C1', 'IMPACT': 'I1', 'ALLELE_NUM': '1'}]
+    }
+    alt2 = processed_variant.AlternateBaseData('CT')
+    alt2._info = {
+        'CSQ': [
+            {processed_variant._ANNOTATION_ALT: 'T',
+             'Consequence': 'C2', 'IMPACT': 'I2', 'ALLELE_NUM': '2'}]
+    }
+    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
+    self.assertFalse(proc_var.non_alt_info.has_key('CSQ'))
+    self.assertEqual(counter_factory.counter_map[
+        CEnum.VARIANT.value].get_value(), 1)
+    self.assertEqual(counter_factory.counter_map[
+        CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
+    self.assertEqual(
+        counter_factory.counter_map[
+            CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
+    self.assertEqual(
+        counter_factory.counter_map[
+            CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
+    self.assertEqual(
+        counter_factory.counter_map[
+            CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
+    self.assertEqual(
+        counter_factory.counter_map[
+            CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
+
+
 # TODO(bashir2): Add tests for create_alt_record_for_schema.
diff --git a/gcp_variant_transforms/options/variant_transform_options.py b/gcp_variant_transforms/options/variant_transform_options.py
@@ -151,6 +151,16 @@ def add_arguments(self, parser):
               'INFO fields will be broken into multiple columns in the output '
               'BigQuery table and stored as repeated fields with '
               'corresponding alternate alleles. [EXPERIMENTAL]'))
+    parser.add_argument(
+        '--use_allele_num',
+        type='bool', default=False, nargs='?', const=True,
+        help=('If true, uses the "ALLELE_NUM" annotation to determine the ALT'
+              'that matches each annotation set. Note this is the preferred way'
+              'of ALT matching and should be used if available. In particular, '
+              'if the annotation tool was VEP and it was used with --minimal, '
+              'this option is preferred over --minimal_vep_alt_matching '
+              'because it avoids ambiguity. When --use_allele_num is set '
+              '--minimal_vep_alt_matching is ignored.'))
     parser.add_argument(
         '--minimal_vep_alt_matching',
         type='bool', default=False, nargs='?', const=True,

diff --git a/gcp_variant_transforms/vcf_to_bq.py b/gcp_variant_transforms/vcf_to_bq.py
@@ -237,6 +237,7 @@ def run(argv=None):
       header_fields,
       known_args.split_alternate_allele_info_fields,
       known_args.annotation_fields,
+      known_args.use_allele_num,
       known_args.minimal_vep_alt_matching,
       counter_factory)