diff --git a/gcp_variant_transforms/libs/processed_variant.py b/gcp_variant_transforms/libs/processed_variant.py index 7d7f7ce66..f552e1b56 100644 --- a/gcp_variant_transforms/libs/processed_variant.py +++ b/gcp_variant_transforms/libs/processed_variant.py @@ -28,7 +28,7 @@ import re from collections import defaultdict -from typing import Dict, List, Any # pylint: disable=unused-import +from typing import Dict, List, Any, Tuple, Optional # pylint: disable=unused-import import vcf @@ -41,10 +41,23 @@ _FIELD_COUNT_ALTERNATE_ALLELE = 'A' +# The representation of a deletion variant in VEP. +_COMPLETELY_DELETED_ALT = '-' + +# The field name in the BigQuery table that holds annotation ALT. +_ANNOTATION_ALT = 'allele' + +# The field name in the BigQuery table that indicates whether the annotation ALT +# matching was ambiguous or not. +_ANNOTATION_ALT_AMBIGUOUS = 'ambiguous_allele' + + # Counter names class _CounterEnum(enum.Enum): VARIANT = 'variant_counter' - ANNOTATION = 'annotation_counter' + ANNOTATION_ALT_MATCH = 'annotation_alt_match_counter' + ANNOTATION_ALT_MINIMAL_MATCH = 'annotation_alt_minimal_match_counter' + ANNOTATION_ALT_MINIMAL_AMBIGUOUS = 'annotation_alt_minimal_ambiguous_counter' ANNOTATION_ALT_MISMATCH = 'annotation_alt_mismatch_counter' @@ -177,6 +190,7 @@ def __init__( header_fields, # type: vcf_header_parser.HeaderFields split_alternate_allele_info_fields=True, # type: bool annotation_fields=None, # type: List[str] + minimal_match=False, # type: bool counter_factory=None # type: metrics_util.CounterFactoryInterface ): """Sets the internal state of the factory class. @@ -189,6 +203,8 @@ def __init__( annotation_fields: If provided, this is the list of INFO field names that store variant annotations. The format of how annotations are stored and their names are extracted from header_fields. + minimal_match: If set, then the --minimal mode of VEP is simulated for + annotation ALT matching. """ self._header_fields = header_fields self._split_alternate_allele_info_fields = ( @@ -197,13 +213,9 @@ def __init__( cfactory = counter_factory or metrics_util.NoOpCounterFactory() self._variant_counter = cfactory.create_counter( _CounterEnum.VARIANT.value) - self._annotation_counter = cfactory.create_counter( - _CounterEnum.ANNOTATION.value) - self._annotation_alt_mismatch_counter = cfactory.create_counter( - _CounterEnum.ANNOTATION_ALT_MISMATCH.value) self._annotation_processor = _AnnotationProcessor( - annotation_fields, self._header_fields, self._annotation_counter, - self._annotation_alt_mismatch_counter) + annotation_fields, self._header_fields, cfactory, minimal_match) + self._minimal_match = minimal_match def create_processed_variant(self, variant): # type: (vcfio.Variant) -> ProcessedVariant @@ -277,6 +289,17 @@ def create_alt_bases_field_schema(self): mode=bigquery_util.TableFieldConstants.MODE_REPEATED, description='List of {} annotations for this alternate.'.format( annot_field)) + annotation_record.fields.append(bigquery.TableFieldSchema( + name=_ANNOTATION_ALT, + type=bigquery_util.TableFieldConstants.TYPE_STRING, + mode=bigquery_util.TableFieldConstants.MODE_NULLABLE, + description='The ALT part of the annotation field.')) + if self._minimal_match: + annotation_record.fields.append(bigquery.TableFieldSchema( + name=_ANNOTATION_ALT_AMBIGUOUS, + type=bigquery_util.TableFieldConstants.TYPE_BOOLEAN, + mode=bigquery_util.TableFieldConstants.MODE_NULLABLE, + description='Whether the annotation ALT matching was ambiguous.')) for annotation_name in annotation_names: annotation_record.fields.append(bigquery.TableFieldSchema( name=bigquery_util.get_bigquery_sanitized_field(annotation_name), @@ -313,8 +336,8 @@ class _AnnotationProcessor(object): def __init__(self, annotation_fields, # type: List[str] header_fields, # type: vcf_header_parser.HeaderFields - annotation_alt_match_counter, # type: metrics_util.BaseCounter - annotation_alt_mismatch_counter # type: metrics_util.BaseCounter + counter_factory, # type: metrics_util.CounterFactoryInterface + minimal_match, # type: bool ): """Creates an instance for adding annotations to `ProcessedVariant` objects. @@ -335,8 +358,15 @@ def __init__(self, header_desc = header_fields.infos[field].desc self._annotation_names_map[field] = _extract_annotation_names( header_desc) - self._annotation_alt_match_counter = annotation_alt_match_counter - self._annotation_alt_mismatch_counter = annotation_alt_mismatch_counter + self._alt_match_counter = counter_factory.create_counter( + _CounterEnum.ANNOTATION_ALT_MATCH.value) + self._alt_minimal_match_counter = counter_factory.create_counter( + _CounterEnum.ANNOTATION_ALT_MINIMAL_MATCH.value) + self._alt_minimal_ambiguous_counter = counter_factory.create_counter( + _CounterEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value) + self._alt_mismatch_counter = counter_factory.create_counter( + _CounterEnum.ANNOTATION_ALT_MISMATCH.value) + self._minimal_match = minimal_match def add_annotation_data(self, proc_var, annotation_field_name, data): # type: (ProcessedVariant, str, List[str]) -> None @@ -368,9 +398,12 @@ def add_annotation_data(self, proc_var, annotation_field_name, data): alt_annotation_map = self._convert_annotation_strs_to_alt_map( annotation_field_name, data) for alt_bases, annotations_list in alt_annotation_map.iteritems(): - self._add_annotation_list( - proc_var, common_prefix, alt_bases, annotations_list, - annotation_field_name) + alt, ambiguous = self._find_matching_alt( + proc_var, common_prefix, alt_bases, annotation_field_name) + if alt: + if self._minimal_match: + self._add_ambiguous_fields(annotations_list, ambiguous) + alt._info[annotation_field_name] = annotations_list def _find_common_alt_ref_prefix(self, proc_var): # type: (ProcessedVariant) -> str @@ -420,15 +453,24 @@ def _create_map(self, annotations, annotation_names): raise ValueError('Expected {} annotations, got {}'.format( len(annotation_names), len(annotations) - 1)) annotation_dict = {} + annotation_dict[_ANNOTATION_ALT] = annotations[0] for index, name in enumerate(annotation_names): annotation_dict[name] = annotations[index + 1] return annotation_dict - def _add_annotation_list( - self, proc_var, common_prefix, alt_bases, annotations_list, - annotation_field_name): - # type: (ProcessedVariant, str, str, List[Dict[str, str]], str) -> None - """Adds all annotations to the given `proc_var`. + def _add_ambiguous_fields(self, annotations_list, ambiguous): + # type: (List[Dict[str, str]], bool) -> None + for annotation_map in annotations_list: + annotation_map[_ANNOTATION_ALT_AMBIGUOUS] = ambiguous + + def _find_matching_alt(self, + proc_var, # type: ProcessedVariant + common_prefix, # type: str + alt_bases, # type: str + annotation_field_name # type: str + ): + # type: (...) -> Tuple[Optional[AlternateBaseData], bool] + """Searches among ALTs of `proc_var` to find one that matches `alt_bases`. Args: proc_var: The object to which the annotations are being added. @@ -440,21 +482,47 @@ def _add_annotation_list( this list is a map of annotation names to values, see the example in `_convert_annotation_strs_to_alt_map` which creates these maps. annotation_field_name: The name of the annotation field, e.g., ANN, CSQ. + + Returns: + The `AlternateBaseData` object from proc_var that matches or None. It also + returns whether the matching was ambiguous or not. """ + found_alt = None + is_ambiguous = False # This assumes that number of alternate bases and annotation segments # are not too big. If this assumption is not true, we should replace the # following loop with a hash table search and avoid the quadratic time. for alt in proc_var._alternate_datas: if self._alt_matches_annotation_alt( common_prefix, alt.alternate_bases, alt_bases): - alt._info[annotation_field_name] = annotations_list - self._annotation_alt_match_counter.inc() + self._alt_match_counter.inc() + found_alt = alt break - else: - self._annotation_alt_mismatch_counter.inc() + if not found_alt and self._minimal_match: + for alt in proc_var._alternate_datas: + if self._alt_matches_annotation_alt_minimal_mode( + proc_var.reference_bases or '', alt.alternate_bases, alt_bases): + if found_alt: + is_ambiguous = True + self._alt_minimal_ambiguous_counter.inc() + logging.warning( + 'Annotation ALT %s of field %s matches both ALTs %s and %s ' + 'with reference bases %s at reference %s start %s', alt_bases, + annotation_field_name, found_alt.alternate_bases, + alt.alternate_bases, proc_var.reference_bases, + proc_var.reference_name, proc_var.start) + else: + self._alt_minimal_match_counter.inc() + found_alt = alt + # Note we do not `break` in this case because we want to know if this + # match was an ambiguous match or an exact one. + if not found_alt: + self._alt_mismatch_counter.inc() logging.warning( 'Could not find matching alternate bases for %s in ' - 'annotation filed %s', alt_bases, annotation_field_name) + 'annotation filed %s for variant at reference %s start %s', alt_bases, + annotation_field_name, proc_var.reference_name, proc_var.start) + return found_alt, is_ambiguous def _alt_matches_annotation_alt( self, common_prefix, alt_bases, annotation_alt): @@ -470,15 +538,17 @@ def _alt_matches_annotation_alt( A ID A .[13:123[ .[13 """ - # Check equality without the common prefix. Note according to VCF spec - # the length of this common prefix should be at most one but we have - # not checked/enforced that here. - if alt_bases[len(common_prefix):] == annotation_alt: - return True - # Handling deletion. - if (len(common_prefix) == len(alt_bases) - and annotation_alt == '-'): - return True + if not self._minimal_match: + # Check equality without the common prefix. + # Note according to VCF spec the length of this common prefix should be + # at most one but we have not checked/enforced that here. + # Also, this string matching should be skipped if in minimal_match mode. + if alt_bases[len(common_prefix):] == annotation_alt: + return True + # Handling deletion. + if (len(common_prefix) == len(alt_bases) + and annotation_alt == _COMPLETELY_DELETED_ALT): + return True # Handling symbolic ALTs. id_match = self._SYMBOLIC_ALT_RE.match(alt_bases) if id_match and id_match.group('ID') == annotation_alt: @@ -492,6 +562,43 @@ def _alt_matches_annotation_alt( return True return False + def _alt_matches_annotation_alt_minimal_mode( + self, referece_bases, alt_bases, annotation_alt): + # type: (str, str, str) -> bool + """Returns true if ALTs match in the --minimal mode of VEP. + + Note in the minimal mode, the matching can be non-deterministic, so this + should only be done if _alt_matches_annotation_alt which is deterministic + has not succeeded. For details of ALT matching in the --minimal mode of VEP, + see the "Complex VCF entries" sections of + https://useast.ensembl.org/info/docs/tools/vep/vep_formats.html + Basically, each ALT is independently checked with REF and the common prefix + and suffix is removed from ALT. The remaining part is the annotation ALT: + REF ALT annotation-ALT + A T T + AT TT,A T,- + C CT,T T -> Note this is ambiguous. + """ + if not alt_bases or not annotation_alt: + return False + # Finding common leading and trailing sub-strings of ALT and REF. + leading = 0 + trailing = 0 + min_len = min(len(alt_bases), len(referece_bases)) + while (leading < min_len and + alt_bases[leading] == referece_bases[leading]): + leading += 1 + while (trailing + leading < min_len and # TODO check this condition + alt_bases[len(alt_bases) - trailing - 1] == + referece_bases[len(referece_bases) - trailing - 1]): + trailing += 1 + if alt_bases[leading:len(alt_bases) - trailing] == annotation_alt: + return True + if (leading + trailing == len(alt_bases) and + annotation_alt == _COMPLETELY_DELETED_ALT): + return True + return False + def _extract_annotation_list_with_alt(annotation_str): # type: (str) -> List[str] diff --git a/gcp_variant_transforms/libs/processed_variant_test.py b/gcp_variant_transforms/libs/processed_variant_test.py index b8495a45e..9596012ce 100644 --- a/gcp_variant_transforms/libs/processed_variant_test.py +++ b/gcp_variant_transforms/libs/processed_variant_test.py @@ -22,6 +22,9 @@ from gcp_variant_transforms.beam_io import vcfio from gcp_variant_transforms.libs import metrics_util from gcp_variant_transforms.libs import processed_variant +# This is intentionally breaking the style guide because without this the lines +# referencing the counter names are too long and hard to read. +from gcp_variant_transforms.libs.processed_variant import _CounterEnum as CEnum from gcp_variant_transforms.libs import vcf_header_parser class _CounterSpy(metrics_util.CounterInterface): @@ -82,13 +85,12 @@ def test_create_processed_variant_no_change(self): processed_variant.AlternateBaseData(a) for a in ['A', 'TT']] self.assertEqual([proc_var_synthetic], [proc_var]) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.VARIANT.value].get_value(), 1) + CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION.value].get_value(), 0) + CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 0) self.assertEqual( counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION_ALT_MISMATCH.value - ].get_value(), 0) + CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0) def test_create_processed_variant_move_alt_info(self): variant = self._get_sample_variant() @@ -134,26 +136,28 @@ def test_create_processed_variant_move_alt_info_and_annotation(self): alt1._info = { 'A2': 'data1', 'CSQ': [ - {'Consequence': 'C1', 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}, - {'Consequence': 'C3', 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] + {processed_variant._ANNOTATION_ALT: 'A', 'Consequence': 'C1', + 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}, + {processed_variant._ANNOTATION_ALT: 'A', 'Consequence': 'C3', + 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] } alt2 = processed_variant.AlternateBaseData('TT') alt2._info = { 'A2': 'data2', 'CSQ': [ - {'Consequence': 'C2', 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] + {processed_variant._ANNOTATION_ALT: 'TT', 'Consequence': 'C2', + 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) self.assertFalse(proc_var.non_alt_info.has_key('A2')) self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.VARIANT.value].get_value(), 1) + CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION.value].get_value(), 2) + CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2) self.assertEqual( counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION_ALT_MISMATCH.value - ].get_value(), 0) + CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0) def test_create_processed_variant_mismatched_annotation_alt(self): # This is like `test_create_processed_variant_move_alt_info_and_annotation` @@ -175,26 +179,28 @@ def test_create_processed_variant_mismatched_annotation_alt(self): alt1._info = { 'A2': 'data1', 'CSQ': [ - {'Consequence': 'C1', 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}, - {'Consequence': 'C3', 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] + {processed_variant._ANNOTATION_ALT: 'A', 'Consequence': 'C1', + 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}, + {processed_variant._ANNOTATION_ALT: 'A', 'Consequence': 'C3', + 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] } alt2 = processed_variant.AlternateBaseData('TT') alt2._info = { 'A2': 'data2', 'CSQ': [ - {'Consequence': 'C2', 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] + {processed_variant._ANNOTATION_ALT: 'TT', 'Consequence': 'C2', + 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) self.assertFalse(proc_var.non_alt_info.has_key('A2')) self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.VARIANT.value].get_value(), 1) + CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION.value].get_value(), 2) + CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2) self.assertEqual( counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION_ALT_MISMATCH.value - ].get_value(), 1) + CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1) def test_create_processed_variant_symbolic_and_breakend_annotation_alt(self): # The returned variant is ignored as we create a custom one next. @@ -219,28 +225,30 @@ def test_create_processed_variant_symbolic_and_breakend_annotation_alt(self): alt1 = processed_variant.AlternateBaseData('') alt1._info = { 'CSQ': [ - {'Consequence': 'C1', 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}] + {processed_variant._ANNOTATION_ALT: 'SYMBOLIC', 'Consequence': 'C1', + 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}] } alt2 = processed_variant.AlternateBaseData('[13:123457[.') alt2._info = { 'CSQ': [ - {'Consequence': 'C2', 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] + {processed_variant._ANNOTATION_ALT: '[13', 'Consequence': 'C2', + 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } alt3 = processed_variant.AlternateBaseData('C[10:10357[.') alt3._info = { 'CSQ': [ - {'Consequence': 'C3', 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] + {processed_variant._ANNOTATION_ALT: 'C[10', 'Consequence': 'C3', + 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3]) self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.VARIANT.value].get_value(), 1) + CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION.value].get_value(), 3) + CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3) self.assertEqual( counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION_ALT_MISMATCH.value].\ - get_value(), 1) + CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1) def test_create_processed_variant_annotation_alt_prefix(self): # The returned variant is ignored as we create a custom one next. @@ -264,28 +272,30 @@ def test_create_processed_variant_annotation_alt_prefix(self): alt1 = processed_variant.AlternateBaseData('CT') alt1._info = { 'CSQ': [ - {'Consequence': 'C1', 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}] + {processed_variant._ANNOTATION_ALT: 'T', 'Consequence': 'C1', + 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}] } alt2 = processed_variant.AlternateBaseData('CC') alt2._info = { 'CSQ': [ - {'Consequence': 'C2', 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] + {processed_variant._ANNOTATION_ALT: 'C', 'Consequence': 'C2', + 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } alt3 = processed_variant.AlternateBaseData('CCC') alt3._info = { 'CSQ': [ - {'Consequence': 'C3', 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] + {processed_variant._ANNOTATION_ALT: 'CC', 'Consequence': 'C3', + 'IMPACT': 'I3', 'SYMBOL': 'S3', 'Gene': 'G3'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3]) self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.VARIANT.value].get_value(), 1) + CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION.value].get_value(), 3) + CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3) self.assertEqual( counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION_ALT_MISMATCH.value].\ - get_value(), 0) + CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0) def test_create_processed_variant_annotation_alt_prefix_but_ref(self): # The returned variant is ignored as we create a custom one next. @@ -309,22 +319,82 @@ def test_create_processed_variant_annotation_alt_prefix_but_ref(self): alt1 = processed_variant.AlternateBaseData('AA') alt1._info = { 'CSQ': [ - {'Consequence': 'C1', 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}] + {processed_variant._ANNOTATION_ALT: 'AA', 'Consequence': 'C1', + 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}] } alt2 = processed_variant.AlternateBaseData('AAA') alt2._info = { 'CSQ': [ - {'Consequence': 'C2', 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] + {processed_variant._ANNOTATION_ALT: 'AAA', 'Consequence': 'C2', + 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] } self.assertEqual(proc_var.alternate_data_list, [alt1, alt2]) self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.VARIANT.value].get_value(), 1) + CEnum.VARIANT.value].get_value(), 1) self.assertEqual(counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION.value].get_value(), 2) + CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2) self.assertEqual( counter_factory.counter_map[ - processed_variant._CounterEnum.ANNOTATION_ALT_MISMATCH.value].\ - get_value(), 0) + CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0) + self.assertEqual( + counter_factory.counter_map[ + CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0) + + def test_create_processed_variant_annotation_alt_minimal(self): + # The returned variant is ignored as we create a custom one next. + _, header_fields = self._get_sample_variant_and_header_with_csq() + variant = vcfio.Variant( + reference_name='19', start=11, end=12, reference_bases='CC', + # The following represent a SNV, an insertion, and a deletion, resp. + alternate_bases=['CT', 'CCT', 'C'], + names=['rs1'], quality=2, + filters=['PASS'], + # Note that in the minimal mode, 'T' is an ambiguous annotation ALT + # because it can map to either the 'CT' SNV or the 'CCT' insertion. + # It is not ambiguous in the non-minimal mode (it only maps to `CT`). + info={'CSQ': vcfio.VariantInfo( + data=[ + 'T|C1|I1|S1|G1', '-|C2|I2|S2|G2'], + field_count='.')}) + counter_factory = _CounterSpyFactory() + factory = processed_variant.ProcessedVariantFactory( + header_fields, + split_alternate_allele_info_fields=True, + annotation_fields=['CSQ'], + minimal_match=True, + counter_factory=counter_factory) + proc_var = factory.create_processed_variant(variant) + alt1 = processed_variant.AlternateBaseData('CT') + alt1._info = {} + alt2 = processed_variant.AlternateBaseData('CCT') + alt2._info = { + 'CSQ': [ + {processed_variant._ANNOTATION_ALT: 'T', + processed_variant._ANNOTATION_ALT_AMBIGUOUS: True, + 'Consequence': 'C1', 'IMPACT': 'I1', 'SYMBOL': 'S1', 'Gene': 'G1'}] + } + alt3 = processed_variant.AlternateBaseData('C') + alt3._info = { + 'CSQ': [ + {processed_variant._ANNOTATION_ALT: '-', + processed_variant._ANNOTATION_ALT_AMBIGUOUS: False, + 'Consequence': 'C2', 'IMPACT': 'I2', 'SYMBOL': 'S2', 'Gene': 'G2'}] + } + self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3]) + self.assertFalse(proc_var.non_alt_info.has_key('CSQ')) + self.assertEqual(counter_factory.counter_map[ + CEnum.VARIANT.value].get_value(), 1) + self.assertEqual(counter_factory.counter_map[ + CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 0) + self.assertEqual( + counter_factory.counter_map[ + CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0) + self.assertEqual( + counter_factory.counter_map[ + CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 2) + self.assertEqual( + counter_factory.counter_map[ + CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 1) # TODO(bashir2): Add tests for create_alt_record_for_schema. diff --git a/gcp_variant_transforms/options/variant_transform_options.py b/gcp_variant_transforms/options/variant_transform_options.py index 9114bba4b..194f36c82 100644 --- a/gcp_variant_transforms/options/variant_transform_options.py +++ b/gcp_variant_transforms/options/variant_transform_options.py @@ -85,6 +85,7 @@ def add_arguments(self, parser): 'variants. This is useful when there are header fields in ' 'variants not defined in the header sections.')) + class BigQueryWriteOptions(VariantTransformsOptions): """Options for writing Variant records to BigQuery.""" @@ -111,15 +112,6 @@ def add_arguments(self, parser): '--omit_empty_sample_calls', type='bool', default=False, nargs='?', const=True, help=("If true, samples that don't have a given call will be omitted.")) - parser.add_argument( - '--annotation_fields', - default=None, nargs='+', - help=('If set, it is a list of field names (separated by space) that ' - 'should be treated as annotation fields. The content of these ' - 'INFO fields will be broken into multiple columns in the output ' - 'BigQuery table and stored as repeated fields with ' - 'corresponding alternate alleles. [EXPERIMENTAL]')) - def validate(self, parsed_args, client=None): output_table_re_match = re.match( @@ -147,6 +139,31 @@ def validate(self, parsed_args, client=None): raise +class AnnotationOptions(VariantTransformsOptions): + """Options for how to treat annotation fields.""" + + def add_arguments(self, parser): + parser.add_argument( + '--annotation_fields', + default=None, nargs='+', + help=('If set, it is a list of field names (separated by space) that ' + 'should be treated as annotation fields. The content of these ' + 'INFO fields will be broken into multiple columns in the output ' + 'BigQuery table and stored as repeated fields with ' + 'corresponding alternate alleles. [EXPERIMENTAL]')) + parser.add_argument( + '--minimal_vep_alt_matching', + type='bool', default=False, nargs='?', const=True, + help=('If true, for ALT matching of annotation fields, the --minimal ' + 'mode of VEP is simulated. Note that this can lead to ambiguous ' + 'matches so by default this is False but if the VCF files are ' + 'generated with VEP in --minimal mode, then this option should ' + 'be turned on. The ambiguous cases are logged and counted.' + 'See the "Complext VCF Entries" of this doc for details:' + 'http://www.ensembl.org/info/docs/tools/vep/online/' + 'VEP_web_documentation.pdf')) + + class FilterOptions(VariantTransformsOptions): """Options for filtering Variant records.""" diff --git a/gcp_variant_transforms/vcf_to_bq.py b/gcp_variant_transforms/vcf_to_bq.py index 1476996ee..d84b8bbcf 100644 --- a/gcp_variant_transforms/vcf_to_bq.py +++ b/gcp_variant_transforms/vcf_to_bq.py @@ -63,6 +63,7 @@ _COMMAND_LINE_OPTIONS = [ variant_transform_options.VcfReadOptions, variant_transform_options.BigQueryWriteOptions, + variant_transform_options.AnnotationOptions, variant_transform_options.FilterOptions, variant_transform_options.MergeOptions, ] @@ -236,6 +237,7 @@ def run(argv=None): header_fields, known_args.split_alternate_allele_info_fields, known_args.annotation_fields, + known_args.minimal_vep_alt_matching, counter_factory) pipeline_options = PipelineOptions(pipeline_args)