diff --git a/gcp_variant_transforms/libs/processed_variant.py b/gcp_variant_transforms/libs/processed_variant.py index b1701eaba..e9412a493 100644 --- a/gcp_variant_transforms/libs/processed_variant.py +++ b/gcp_variant_transforms/libs/processed_variant.py @@ -54,6 +54,7 @@ # Counter names class _CounterEnum(enum.Enum): VARIANT = 'variant_counter' + ALTERNATE_ALLELE_INFO_MISMATCH = 'alternate_allele_info_mismatch_counter' ANNOTATION_ALT_MATCH = 'annotation_alt_match_counter' ANNOTATION_ALT_MINIMAL_AMBIGUOUS = 'annotation_alt_minimal_ambiguous_counter' ANNOTATION_ALT_MISMATCH = 'annotation_alt_mismatch_counter' @@ -196,6 +197,7 @@ def __init__( self, header_fields, # type: vcf_header_io.VcfHeader split_alternate_allele_info_fields=True, # type: bool + allow_alternate_allele_info_mismatch=False, # type: bool annotation_fields=None, # type: List[str] use_allele_num=False, # type: bool minimal_match=False, # type: bool @@ -209,24 +211,36 @@ def __init__( header_fields: Header information used for parsing and splitting INFO fields of the variant. split_alternate_allele_info_fields: If True, splits fields with - field_count='A' (i.e., one value for each alternate) among alternates. + `field_count='A'` (i.e., one value for each alternate) among alternates. + allow_alternate_allele_info_mismatch: By default (when False), an error + will be raised for INFO fields with `field_count='A'` (i.e. one value + for each alternate base) that do not have the same cardinality as + alternate bases. If True, an error will not be raised and excess values + will be dropped or insufficient values will be set to null. Only + applicable if `split_alternate_allele_info_fields` is True. annotation_fields: If provided, this is the list of INFO field names that store variant annotations. The format of how annotations are stored and their names are extracted from header_fields. - use_allele_num: If set, then "ALLELE_NUM" annotation is used to determine + use_allele_num: If True, then "ALLELE_NUM" annotation is used to determine the index of the ALT that corresponds to an annotation set. - minimal_match: If set, then the --minimal mode of VEP is simulated for + minimal_match: If True, then the --minimal mode of VEP is simulated for annotation ALT matching. - infer_annotation_types: If set, then warnings will be provided if header + infer_annotation_types: If True, then warnings will be provided if header fields fail to contain Info type lines for annotation fields + counter_factory: If provided, it will be used to record counters (e.g. the + number of variants processed). """ self._header_fields = header_fields self._split_alternate_allele_info_fields = ( split_alternate_allele_info_fields) + self._allow_alternate_allele_info_mismatch = ( + allow_alternate_allele_info_mismatch) self._annotation_field_set = set(annotation_fields or []) cfactory = counter_factory or metrics_util.NoOpCounterFactory() self._variant_counter = cfactory.create_counter( _CounterEnum.VARIANT.value) + self._alternate_allele_info_mismatche_counter = cfactory.create_counter( + _CounterEnum.ALTERNATE_ALLELE_INFO_MISMATCH.value) self._annotation_processor = _AnnotationProcessor( annotation_fields, self._header_fields, cfactory, use_allele_num, minimal_match, infer_annotation_types) @@ -254,14 +268,21 @@ def create_processed_variant(self, variant): def _add_per_alt_info(self, proc_var, field_name, variant_info_data): # type: (ProcessedVariant, str, vcfio.VariantInfo) -> None - if len(variant_info_data) != len(proc_var._alternate_datas): - raise ValueError( + num_variant_infos = len(variant_info_data) + num_alternate_bases = len(proc_var._alternate_datas) + if num_variant_infos != num_alternate_bases: + error_message = ( 'Per alternate INFO field "{}" does not have same cardinality as ' - ' number of alternates: {} vs {} in variant: "{}"'.format( - field_name, len(variant_info_data), - len(proc_var._alternate_datas), proc_var)) - for alt_index, info in enumerate(variant_info_data): - proc_var._alternate_datas[alt_index]._info[field_name] = info + 'number of alternates: {} vs {} in variant: "{}"'.format( + field_name, num_variant_infos, num_alternate_bases, proc_var)) + self._alternate_allele_info_mismatche_counter.inc() + if self._allow_alternate_allele_info_mismatch: + logging.warning(error_message) + else: + raise ValueError(error_message) + for alt_index in range(min(num_variant_infos, num_alternate_bases)): + proc_var._alternate_datas[alt_index]._info[field_name] = ( + variant_info_data[alt_index]) def create_alt_bases_field_schema(self): # type: () -> bigquery.TableFieldSchema diff --git a/gcp_variant_transforms/libs/processed_variant_test.py b/gcp_variant_transforms/libs/processed_variant_test.py index 1893e0fd3..438aa07ea 100644 --- a/gcp_variant_transforms/libs/processed_variant_test.py +++ b/gcp_variant_transforms/libs/processed_variant_test.py @@ -110,6 +110,57 @@ def test_create_processed_variant_move_alt_info(self): self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) self.assertFalse(proc_var.non_alt_info.has_key('A2')) + def test_create_processed_variant_move_alt_info_extra_values(self): + header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'}) + variant = self._get_sample_variant() + # Add a value to `A2` (it only has two alternate bases, so this is invalid). + variant.info['A2'] = ['data1', 'data2', 'data3'] + + # Ensure error is raised by default. + factory = processed_variant.ProcessedVariantFactory( + header_fields, + split_alternate_allele_info_fields=True) + with self.assertRaises(ValueError): + _ = factory.create_processed_variant(variant) + + # Try again with allow_alternate_allele_info_mismatch=True. + factory = processed_variant.ProcessedVariantFactory( + header_fields, + split_alternate_allele_info_fields=True, + allow_alternate_allele_info_mismatch=True) + proc_var = factory.create_processed_variant(variant) + alt1 = processed_variant.AlternateBaseData('A') + alt1._info = {'A2': 'data1'} + alt2 = processed_variant.AlternateBaseData('TT') + alt2._info = {'A2': 'data2'} + self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) + self.assertFalse(proc_var.non_alt_info.has_key('A2')) + + def test_create_processed_variant_move_alt_info_insufficient_values(self): + header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'}) + variant = self._get_sample_variant() + # Remove a value from `A2` (it has two alternate bases, so this is invalid). + variant.info['A2'] = ['data1'] + + # Ensure error is raised by default. + factory = processed_variant.ProcessedVariantFactory( + header_fields, + split_alternate_allele_info_fields=True) + with self.assertRaises(ValueError): + _ = factory.create_processed_variant(variant) + + # Try again with allow_alternate_allele_info_mismatch=True. + factory = processed_variant.ProcessedVariantFactory( + header_fields, + split_alternate_allele_info_fields=True, + allow_alternate_allele_info_mismatch=True) + proc_var = factory.create_processed_variant(variant) + alt1 = processed_variant.AlternateBaseData('A') + alt1._info = {'A2': 'data1'} + alt2 = processed_variant.AlternateBaseData('TT') + self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) + self.assertFalse(proc_var.non_alt_info.has_key('A2')) + def _get_sample_variant_and_header_with_csq(self, additional_infos=None): """Provides a simple `Variant` and `VcfHeader` with info fields diff --git a/gcp_variant_transforms/testing/data/vcf/README.md b/gcp_variant_transforms/testing/data/vcf/README.md index 84f991953..090b731cf 100644 --- a/gcp_variant_transforms/testing/data/vcf/README.md +++ b/gcp_variant_transforms/testing/data/vcf/README.md @@ -22,6 +22,12 @@ POS value for the first entry. It is used to test when `allow_malformed_records` is enabled, failed VCF record reads will not raise errors and the BigQuery table can still be generated. +`invalid-4.2-AF-mismatch.vcf` is created on `valid-4.2.vcf` by changing the +value of `AF` field in `20:17330` to have too many values (two instead of one) +and the value in `20:1234567` to have too few values (one instead of two). +By default, the job will fail as the cardinalities do not match, but it +should be successful when `allow_malformed_records` is enabled. + The folder `merge` is created to test the merge options. Three .vcf files are created. `merge1.vcf` contains two samples, while `merge2.vcf` and `merge3.vcf` contain one other sample, respectively. When MERGE_TO_CALLS is selected, the diff --git a/gcp_variant_transforms/testing/data/vcf/invalid-4.2-AF-mismatch.vcf b/gcp_variant_transforms/testing/data/vcf/invalid-4.2-AF-mismatch.vcf new file mode 100644 index 000000000..503e99f59 --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/invalid-4.2-AF-mismatch.vcf @@ -0,0 +1,42 @@ +##fileformat=VCFv4.2 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##reference=file:/lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##contig= +##contig= +##contig= +##SAMPLE= +##SAMPLE= +##PEDIGREE= +##PEDIGREE= +##pedigreeDB=url +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +19 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=Infinity;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017,0.3 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=-inf,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +20 1234567 microsat1 GTC G,GTCTC 50 PASS NS=3;DP=9;AA=G;AF=0.1 GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 +20 2234567 . C [13:123457[ACGC 50 PASS SVTYPE=BÑD;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 +20 2234568 . C .TC 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 +20 2234569 . C CT. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 +20 3234569 . C 50 PASS END=3235677;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 +20 4234569 . N .[13:123457[ 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 ./.:40:3 +20 5234569 . N [13:123457[. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 +Y 17330 . T A 3 q10 NS=3;DP=11;AF=nan GT:GL 0:0,49 0:0,3 1:41,0 +HLA-A*01:01:01:01 1 . N T 50 PASS END=1;NS=3;DP=9;AA=G GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. diff --git a/gcp_variant_transforms/testing/integration/vcf_to_bq_tests/presubmit_tests/small_tests/option_allow_malformed_records_alternate_mismatch.json b/gcp_variant_transforms/testing/integration/vcf_to_bq_tests/presubmit_tests/small_tests/option_allow_malformed_records_alternate_mismatch.json new file mode 100644 index 000000000..0a1c3d604 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/vcf_to_bq_tests/presubmit_tests/small_tests/option_allow_malformed_records_alternate_mismatch.json @@ -0,0 +1,48 @@ +[ + { + "test_name": "option-allow-malformed-records-alternate-mismatch", + "table_name": "option_allow_malformed_records_alternate_mismatch", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/invalid-4.2-AF-mismatch.vcf", + "allow_malformed_records": true, + "runner": "DirectRunner", + "zones": ["us-west1-b"], + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 13} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 23031929} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 23033052} + }, + { + "query": [ + "SELECT alternate_bases.AF AS AF ", + "FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ", + "WHERE reference_name = '20' and start_position = 17329" + ], + "expected_result": {"AF": 0.017} + }, + { + "query": [ + "SELECT SUM(alternate_bases.AF) AS AF_sum ", + "FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ", + "WHERE reference_name = '20' and start_position = 1234566" + ], + "expected_result": {"AF_sum": 0.1} + }, + { + "query": [ + "SELECT COUNT(IFNULL(alternate_bases.AF, 0)) AS AF_count ", + "FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ", + "WHERE reference_name = '20' and start_position = 1234566" + ], + "expected_result": {"AF_count": 2} + } + ] + } +] diff --git a/gcp_variant_transforms/vcf_to_bq.py b/gcp_variant_transforms/vcf_to_bq.py index 6dbbf0eb1..3bb11bc64 100644 --- a/gcp_variant_transforms/vcf_to_bq.py +++ b/gcp_variant_transforms/vcf_to_bq.py @@ -220,6 +220,7 @@ def run(argv=None): processed_variant_factory = processed_variant.ProcessedVariantFactory( header_fields, known_args.split_alternate_allele_info_fields, + known_args.allow_malformed_records, known_args.annotation_fields, known_args.use_allele_num, known_args.minimal_vep_alt_matching,