googlegenomics · arostamianfar · Nov 1, 2018 · Oct 30, 2018 · Nov 1, 2018
diff --git a/gcp_variant_transforms/libs/processed_variant.py b/gcp_variant_transforms/libs/processed_variant.py
@@ -54,6 +54,7 @@
 # Counter names
 class _CounterEnum(enum.Enum):
   VARIANT = 'variant_counter'
+  ALTERNATE_ALLELE_INFO_MISMATCH = 'alternate_allele_info_mismatch_counter'
   ANNOTATION_ALT_MATCH = 'annotation_alt_match_counter'
   ANNOTATION_ALT_MINIMAL_AMBIGUOUS = 'annotation_alt_minimal_ambiguous_counter'
   ANNOTATION_ALT_MISMATCH = 'annotation_alt_mismatch_counter'
@@ -196,6 +197,7 @@ def __init__(
       self,
       header_fields,  # type: vcf_header_io.VcfHeader
       split_alternate_allele_info_fields=True,  # type: bool
+      allow_alternate_allele_info_mismatch=False,  # type: bool
       annotation_fields=None,  # type: List[str]
       use_allele_num=False,  # type: bool
       minimal_match=False,  # type: bool
@@ -209,24 +211,36 @@ def __init__(
       header_fields: Header information used for parsing and splitting INFO
         fields of the variant.
       split_alternate_allele_info_fields: If True, splits fields with
-        field_count='A' (i.e., one value for each alternate) among alternates.
+        `field_count='A'` (i.e., one value for each alternate) among alternates.
+      allow_alternate_allele_info_mismatch: By default (when False), an error
+        will be raised for INFO fields with `field_count='A'` (i.e. one value
+        for each alternate base) that do not have the same cardinality as
+        alternate bases. If True, an error will not be raised and excess values
+        will be dropped or insufficient values will be set to null. Only
+        applicable if `split_alternate_allele_info_fields` is True.
       annotation_fields: If provided, this is the list of INFO field names that
         store variant annotations. The format of how annotations are stored and
         their names are extracted from header_fields.
-      use_allele_num: If set, then "ALLELE_NUM" annotation is used to determine
+      use_allele_num: If True, then "ALLELE_NUM" annotation is used to determine
         the index of the ALT that corresponds to an annotation set.
-      minimal_match: If set, then the --minimal mode of VEP is simulated for
+      minimal_match: If True, then the --minimal mode of VEP is simulated for
         annotation ALT matching.
-      infer_annotation_types: If set, then warnings will be provided if header
+      infer_annotation_types: If True, then warnings will be provided if header
         fields fail to contain Info type lines for annotation fields
+      counter_factory: If provided, it will be used to record counters (e.g. the
+        number of variants processed).
     """
     self._header_fields = header_fields
     self._split_alternate_allele_info_fields = (
         split_alternate_allele_info_fields)
+    self._allow_alternate_allele_info_mismatch = (
+        allow_alternate_allele_info_mismatch)
     self._annotation_field_set = set(annotation_fields or [])
     cfactory = counter_factory or metrics_util.NoOpCounterFactory()
     self._variant_counter = cfactory.create_counter(
         _CounterEnum.VARIANT.value)
+    self._alternate_allele_info_mismatche_counter = cfactory.create_counter(
+        _CounterEnum.ALTERNATE_ALLELE_INFO_MISMATCH.value)
     self._annotation_processor = _AnnotationProcessor(
         annotation_fields, self._header_fields, cfactory, use_allele_num,
         minimal_match, infer_annotation_types)
@@ -254,14 +268,21 @@ def create_processed_variant(self, variant):
 
   def _add_per_alt_info(self, proc_var, field_name, variant_info_data):
     # type: (ProcessedVariant, str, vcfio.VariantInfo) -> None
-    if len(variant_info_data) != len(proc_var._alternate_datas):
-      raise ValueError(
+    num_variant_infos = len(variant_info_data)
+    num_alternate_bases = len(proc_var._alternate_datas)
+    if num_variant_infos != num_alternate_bases:
+      error_message = (
           'Per alternate INFO field "{}" does not have same cardinality as '
-          ' number of alternates: {} vs {} in variant: "{}"'.format(
-              field_name, len(variant_info_data),
-              len(proc_var._alternate_datas), proc_var))
-    for alt_index, info in enumerate(variant_info_data):
-      proc_var._alternate_datas[alt_index]._info[field_name] = info
+          'number of alternates: {} vs {} in variant: "{}"'.format(
+              field_name, num_variant_infos, num_alternate_bases, proc_var))
+      self._alternate_allele_info_mismatche_counter.inc()
+      if self._allow_alternate_allele_info_mismatch:
+        logging.warning(error_message)
+      else:
+        raise ValueError(error_message)
+    for alt_index in range(min(num_variant_infos, num_alternate_bases)):
+      proc_var._alternate_datas[alt_index]._info[field_name] = (
+          variant_info_data[alt_index])
 
   def create_alt_bases_field_schema(self):
     # type: () -> bigquery.TableFieldSchema

diff --git a/gcp_variant_transforms/libs/processed_variant_test.py b/gcp_variant_transforms/libs/processed_variant_test.py
@@ -110,6 +110,57 @@ def test_create_processed_variant_move_alt_info(self):
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     self.assertFalse(proc_var.non_alt_info.has_key('A2'))
 
+  def test_create_processed_variant_move_alt_info_extra_values(self):
+    header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'})
+    variant = self._get_sample_variant()
+    # Add a value to `A2` (it only has two alternate bases, so this is invalid).
+    variant.info['A2'] = ['data1', 'data2', 'data3']
+
+    # Ensure error is raised by default.
+    factory = processed_variant.ProcessedVariantFactory(
+        header_fields,
+        split_alternate_allele_info_fields=True)
+    with self.assertRaises(ValueError):
+      _ = factory.create_processed_variant(variant)
+
+    # Try again with allow_alternate_allele_info_mismatch=True.
+    factory = processed_variant.ProcessedVariantFactory(
+        header_fields,
+        split_alternate_allele_info_fields=True,
+        allow_alternate_allele_info_mismatch=True)
+    proc_var = factory.create_processed_variant(variant)
+    alt1 = processed_variant.AlternateBaseData('A')
+    alt1._info = {'A2': 'data1'}
+    alt2 = processed_variant.AlternateBaseData('TT')
+    alt2._info = {'A2': 'data2'}
+    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
+    self.assertFalse(proc_var.non_alt_info.has_key('A2'))
+
+  def test_create_processed_variant_move_alt_info_insufficient_values(self):
+    header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'})
+    variant = self._get_sample_variant()
+    # Remove a value from `A2` (it has two alternate bases, so this is invalid).
+    variant.info['A2'] = ['data1']
+
+    # Ensure error is raised by default.
+    factory = processed_variant.ProcessedVariantFactory(
+        header_fields,
+        split_alternate_allele_info_fields=True)
+    with self.assertRaises(ValueError):
+      _ = factory.create_processed_variant(variant)
+
+    # Try again with allow_alternate_allele_info_mismatch=True.
+    factory = processed_variant.ProcessedVariantFactory(
+        header_fields,
+        split_alternate_allele_info_fields=True,
+        allow_alternate_allele_info_mismatch=True)
+    proc_var = factory.create_processed_variant(variant)
+    alt1 = processed_variant.AlternateBaseData('A')
+    alt1._info = {'A2': 'data1'}
+    alt2 = processed_variant.AlternateBaseData('TT')
+    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
+    self.assertFalse(proc_var.non_alt_info.has_key('A2'))
+
   def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
     """Provides a simple `Variant` and `VcfHeader` with info fields
 

diff --git a/gcp_variant_transforms/testing/data/vcf/README.md b/gcp_variant_transforms/testing/data/vcf/README.md
@@ -22,6 +22,12 @@ POS value for the first entry. It is used to test when `allow_malformed_records`
 is enabled, failed VCF record reads will not raise errors and the BigQuery table
 can still be generated.
 
+`invalid-4.2-AF-mismatch.vcf` is created on `valid-4.2.vcf` by changing the
+value of `AF` field in `20:17330` to have too many values (two instead of one)
+and the value in `20:1234567` to have too few values (one instead of two).
+By default, the job will fail as the cardinalities do not match, but it
+should be successful when `allow_malformed_records` is enabled.
+
 The folder `merge` is created to test the merge options. Three .vcf files are
 created. `merge1.vcf` contains two samples, while `merge2.vcf` and `merge3.vcf`
 contain one other sample, respectively. When MERGE_TO_CALLS is selected, the

diff --git a/gcp_variant_transforms/testing/data/vcf/invalid-4.2-AF-mismatch.vcf b/gcp_variant_transforms/testing/data/vcf/invalid-4.2-AF-mismatch.vcf
@@ -0,0 +1,42 @@
+##fileformat=VCFv4.2
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant (with unïcodé)">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of variant">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data (with \\ backslash)">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##FORMAT=<ID=GL,Number=G,Type=Integer,Description="Genotype Likelihood">
+##reference=file:/lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
+##contig=<ID=19,length=59128983,md5=1aacd71f30db8e561810913e0b72636d,species="Homo Sapiens">
+##contig=<ID=20,length=63025520,md5=0dec9660ec1efaaf33281c0d5ea2560f,species="Homo Sapiens">
+##contig=<ID=Y,length=63025520,md5=0dec9660ec1efaaf33281c0d5ea2560f,species="Homo Sapiens">
+##SAMPLE=<ID=Blood,Genomes=Germline,Mixture=1.,Description="Patient germline genome">
+##SAMPLE=<ID=TissueSample,Genomes=Germline;Tumor,Mixture=.3;.7,Description="Patient germline genome;Patient tumor genome">
+##PEDIGREE=<Derived=ID2,Original=ID1>
+##PEDIGREE=<Child=CHILD-GENOME-ID,Mother=MOTHER-GENOME-ID,Father=FATHER-GENOME-ID>
+##pedigreeDB=url
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
+19	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=Infinity;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
+20	17330	.	T	A	3	q10	NS=3;DP=11;AF=0.017,0.3	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3
+20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=-inf,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4
+20	1230237	.	T	.	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:7:56,60	0|0:48:4:51,51	0/0:61:2
+20	1234567	microsat1	GTC	G,GTCTC	50	PASS	NS=3;DP=9;AA=G;AF=0.1	GT:GQ:DP	0/1:35:4	0/2:17:2	1/1:40:3
+20	2234567	.	C	[13:123457[ACGC	50	PASS	SVTYPE=BÑD;NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/1:17:2	1/1:40:3
+20	2234568	.	C	.TC	50	PASS	SVTYPE=BND;NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/1:17:2	1/1:40:3
+20	2234569	.	C	CT.	50	PASS	SVTYPE=BND;NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/1:17:2	1/1:40:3
+20	3234569	.	C	<SYMBOLIC>	50	PASS	END=3235677;NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/1:17:2	1/1:40:3
+20	4234569	.	N	.[13:123457[	50	PASS	SVTYPE=BND;NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/1:17:2	./.:40:3
+20	5234569	.	N	[13:123457[.	50	PASS	SVTYPE=BND;NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/1:17:2	1/1:40:3
+Y	17330	.	T	A	3	q10	NS=3;DP=11;AF=nan	GT:GL	0:0,49	0:0,3	1:41,0
+HLA-A*01:01:01:01	1	.	N	T	50	PASS	END=1;NS=3;DP=9;AA=G	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
diff --git a/..._tests/presubmit_tests/small_tests/option_allow_malformed_records_alternate_mismatch.json b/..._tests/presubmit_tests/small_tests/option_allow_malformed_records_alternate_mismatch.json
@@ -0,0 +1,48 @@
+[
+  {
+    "test_name": "option-allow-malformed-records-alternate-mismatch",
+    "table_name": "option_allow_malformed_records_alternate_mismatch",
+    "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/invalid-4.2-AF-mismatch.vcf",
+    "allow_malformed_records": true,
+    "runner": "DirectRunner",
+    "zones": ["us-west1-b"],
+    "assertion_configs": [
+      {
+        "query": ["NUM_ROWS_QUERY"],
+        "expected_result": {"num_rows": 13}
+      },
+      {
+        "query": ["SUM_START_QUERY"],
+        "expected_result": {"sum_start": 23031929}
+      },
+      {
+        "query": ["SUM_END_QUERY"],
+        "expected_result": {"sum_end": 23033052}
+      },
+      {
+        "query": [
+          "SELECT alternate_bases.AF AS AF ",
+          "FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ",
+          "WHERE reference_name = '20' and start_position = 17329"
+        ],
+        "expected_result": {"AF": 0.017}
+      },
+      {
+        "query": [
+          "SELECT SUM(alternate_bases.AF) AS AF_sum ",
+          "FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ",
+          "WHERE reference_name = '20' and start_position = 1234566"
+        ],
+        "expected_result": {"AF_sum": 0.1}
+      },
+      {
+        "query": [
+          "SELECT COUNT(IFNULL(alternate_bases.AF, 0)) AS AF_count ",
+          "FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ",
+          "WHERE reference_name = '20' and start_position = 1234566"
+        ],
+        "expected_result": {"AF_count": 2}
+      }
+    ]
+  }
+]
diff --git a/gcp_variant_transforms/vcf_to_bq.py b/gcp_variant_transforms/vcf_to_bq.py
@@ -220,6 +220,7 @@ def run(argv=None):
   processed_variant_factory = processed_variant.ProcessedVariantFactory(
       header_fields,
       known_args.split_alternate_allele_info_fields,
+      known_args.allow_malformed_records,
       known_args.annotation_fields,
       known_args.use_allele_num,
       known_args.minimal_vep_alt_matching,