Skip to content
60 changes: 53 additions & 7 deletions gcp_variant_transforms/libs/processed_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@
# matching was ambiguous or not.
_ANNOTATION_ALT_AMBIGUOUS = 'ambiguous_allele'

# The annotation field that VEP uses to record the index of the alternate
# allele (i.e., ALT) that an annotation list is for.
_ALLELE_NUM_ANNOTATION = 'ALLELE_NUM'


# Counter names
class _CounterEnum(enum.Enum):
Expand All @@ -59,6 +63,8 @@ class _CounterEnum(enum.Enum):
ANNOTATION_ALT_MINIMAL_MATCH = 'annotation_alt_minimal_match_counter'
ANNOTATION_ALT_MINIMAL_AMBIGUOUS = 'annotation_alt_minimal_ambiguous_counter'
ANNOTATION_ALT_MISMATCH = 'annotation_alt_mismatch_counter'
ALLELE_NUM_MISSING = 'allele_num_missing'
ALLELE_NUM_INCORRECT = 'allele_num_incorrect'


class ProcessedVariant(object):
Expand Down Expand Up @@ -190,6 +196,7 @@ def __init__(
header_fields, # type: vcf_header_parser.HeaderFields
split_alternate_allele_info_fields=True, # type: bool
annotation_fields=None, # type: List[str]
use_allele_num=False, # type: bool
minimal_match=False, # type: bool
counter_factory=None # type: metrics_util.CounterFactoryInterface
):
Expand All @@ -203,6 +210,8 @@ def __init__(
annotation_fields: If provided, this is the list of INFO field names that
store variant annotations. The format of how annotations are stored and
their names are extracted from header_fields.
use_allele_num: If set, then "ALLELE_NUM" annotation is used to determine
the index of the ALT that corresponds to an annotation set.
minimal_match: If set, then the --minimal mode of VEP is simulated for
annotation ALT matching.
"""
Expand All @@ -214,7 +223,8 @@ def __init__(
self._variant_counter = cfactory.create_counter(
_CounterEnum.VARIANT.value)
self._annotation_processor = _AnnotationProcessor(
annotation_fields, self._header_fields, cfactory, minimal_match)
annotation_fields, self._header_fields, cfactory, use_allele_num,
minimal_match)
self._minimal_match = minimal_match

def create_processed_variant(self, variant):
Expand Down Expand Up @@ -337,6 +347,7 @@ def __init__(self,
annotation_fields, # type: List[str]
header_fields, # type: vcf_header_parser.HeaderFields
counter_factory, # type: metrics_util.CounterFactoryInterface
use_allele_num, # type: bool
minimal_match, # type: bool
):
"""Creates an instance for adding annotations to `ProcessedVariant` objects.
Expand Down Expand Up @@ -366,6 +377,11 @@ def __init__(self,
_CounterEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value)
self._alt_mismatch_counter = counter_factory.create_counter(
_CounterEnum.ANNOTATION_ALT_MISMATCH.value)
self._allele_num_missing_counter = counter_factory.create_counter(
_CounterEnum.ALLELE_NUM_MISSING.value)
self._allele_num_incorrect_counter = counter_factory.create_counter(
_CounterEnum.ALLELE_NUM_INCORRECT.value)
self._use_allele_num = use_allele_num
self._minimal_match = minimal_match

def add_annotation_data(self, proc_var, annotation_field_name, data):
Expand Down Expand Up @@ -398,12 +414,21 @@ def add_annotation_data(self, proc_var, annotation_field_name, data):
alt_annotation_map = self._convert_annotation_strs_to_alt_map(
annotation_field_name, data)
for alt_bases, annotations_list in alt_annotation_map.iteritems():
alt, ambiguous = self._find_matching_alt(
proc_var, common_prefix, alt_bases, annotation_field_name)
if alt:
if self._minimal_match:
self._add_ambiguous_fields(annotations_list, ambiguous)
alt._info[annotation_field_name] = annotations_list
if self._use_allele_num:
# TODO(bashir2): This class needs a major refactoring which should be
# done as part of creating a class for holding annotation data. The
# choice of mapping annotation lists to their annotation ALT string
# needs to be revisited in case of using ALLELE_NUM.
for annotation_dict in annotations_list:
self._add_annotations_by_allele_num(
proc_var, annotation_dict, annotation_field_name)
else:
alt, ambiguous = self._find_matching_alt(
proc_var, common_prefix, alt_bases, annotation_field_name)
if alt:
if self._minimal_match:
self._add_ambiguous_fields(annotations_list, ambiguous)
alt._info[annotation_field_name] = annotations_list

def _find_common_alt_ref_prefix(self, proc_var):
# type: (ProcessedVariant) -> str
Expand Down Expand Up @@ -599,6 +624,27 @@ def _alt_matches_annotation_alt_minimal_mode(
return True
return False

def _add_annotations_by_allele_num(
self, proc_var, annotation_dict, annotation_field_name):
# type: (ProcessedVariant, Dict[str, str], str) -> None
if _ALLELE_NUM_ANNOTATION not in annotation_dict:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we always use this check so that we no longer need the use_allele_num flag? (i.e. the logic would be to first look for allele_num, if it doesn't exist, then use minimal or exact match depending on the --minimal flag). Or are you concerned about cases where allele_num is incorrect?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, when I was adding this feature I also preferred not to add an extra flag, but then I thought it is better to be explicit such that if there are problems with ALLELE_NUM we don't force them on the user. Of course we can try to use ALLELE_NUM and if it fails for whatever reason (e.g., bad indices) then fall back on matching (whether exact or minimal).

One problem with the latter approach I described is that in the ALLELE_NUM mode, annotation lists are processed one by one regardless of their alt_bases but in the non ALLELE_NUM mode, we map all annotations lists that have the same alt_bases into a single ALT (check the new unit-test I have added and note both lists have 'T' as their alt_bases but are mapped to two different ALTs). Of course, we can fix this problem too if we strongly prefer not to have the extra flag.

So with all these, I decided that being explicit and either use ALLELE_NUM always or ignore it always is a more clear approach. What do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Sounds good. Let's keep the flag then.

self._allele_num_missing_counter.inc()
return
index_str = annotation_dict[_ALLELE_NUM_ANNOTATION]
try:
alt_index = int(index_str) - 1
alt_list = proc_var._alternate_datas
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider using the public getter?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I explicitly have chosen not to use the public attributes when the intention is to mutate (i.e., almost everywhere in this module, you can find other examples too). If you remember, we even wanted to make the attributes to be read-only such that they can be read but not altered but then we decided that it is probably over-designing. So I prefer to treat the attributes as read-only getters.

if alt_index >= len(alt_list) or alt_index < 0:
raise ValueError
alt = alt_list[alt_index]
self._alt_match_counter.inc()
if annotation_field_name not in alt._info:
alt._info[annotation_field_name] = [annotation_dict]
else:
alt._info[annotation_field_name].append(annotation_dict)
except ValueError:
self._allele_num_incorrect_counter.inc()


def _extract_annotation_list_with_alt(annotation_str):
# type: (str) -> List[str]
Expand Down
62 changes: 62 additions & 0 deletions gcp_variant_transforms/libs/processed_variant_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,4 +397,66 @@ def test_create_processed_variant_annotation_alt_minimal(self):
counter_factory.counter_map[
CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 1)

def test_create_processed_variant_annotation_alt_allele_num(self):
csq_info = parser._Info(
id=None, num='.', type=None,
desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
source=None, version=None)
header_fields = vcf_header_parser.HeaderFields(
infos={'CSQ': csq_info}, formats={})
variant = vcfio.Variant(
reference_name='19', start=11, end=12, reference_bases='C',
# The following represent a SNV and an insertion, resp.
alternate_bases=['T', 'CT'],
names=['rs1'], quality=2,
filters=['PASS'],
# Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
# ALT because it can map to either the 'T' SNV or the 'CT' insertion.
# But because there is ALLELE_NUM there should be no ambiguity.
# The last four annotations have incorrect ALLELE_NUMs.
info={'CSQ': vcfio.VariantInfo(
data=[
'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
'T|C5|I5|TEST', 'T|C6|I6|'], field_count='.')})
counter_factory = _CounterSpyFactory()
factory = processed_variant.ProcessedVariantFactory(
header_fields,
split_alternate_allele_info_fields=True,
annotation_fields=['CSQ'],
use_allele_num=True,
minimal_match=True, # This should be ignored by the factory method.
counter_factory=counter_factory)
proc_var = factory.create_processed_variant(variant)
alt1 = processed_variant.AlternateBaseData('T')
alt1._info = {
'CSQ': [
{processed_variant._ANNOTATION_ALT: 'T',
'Consequence': 'C1', 'IMPACT': 'I1', 'ALLELE_NUM': '1'}]
}
alt2 = processed_variant.AlternateBaseData('CT')
alt2._info = {
'CSQ': [
{processed_variant._ANNOTATION_ALT: 'T',
'Consequence': 'C2', 'IMPACT': 'I2', 'ALLELE_NUM': '2'}]
}
self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
self.assertFalse(proc_var.non_alt_info.has_key('CSQ'))
self.assertEqual(counter_factory.counter_map[
CEnum.VARIANT.value].get_value(), 1)
self.assertEqual(counter_factory.counter_map[
CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
self.assertEqual(
counter_factory.counter_map[
CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
self.assertEqual(
counter_factory.counter_map[
CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
self.assertEqual(
counter_factory.counter_map[
CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
self.assertEqual(
counter_factory.counter_map[
CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)


# TODO(bashir2): Add tests for create_alt_record_for_schema.
10 changes: 10 additions & 0 deletions gcp_variant_transforms/options/variant_transform_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,16 @@ def add_arguments(self, parser):
'INFO fields will be broken into multiple columns in the output '
'BigQuery table and stored as repeated fields with '
'corresponding alternate alleles. [EXPERIMENTAL]'))
parser.add_argument(
'--use_allele_num',
type='bool', default=False, nargs='?', const=True,
help=('If true, uses the "ALLELE_NUM" annotation to determine the ALT'
'that matches each annotation set. Note this is the preferred way'
'of ALT matching and should be used if available. In particular, '
'if the annotation tool was VEP and it was used with --minimal, '
'this option is preferred over --minimal_vep_alt_matching '
'because it avoids ambiguity. When --use_allele_num is set '
'--minimal_vep_alt_matching is ignored.'))
parser.add_argument(
'--minimal_vep_alt_matching',
type='bool', default=False, nargs='?', const=True,
Expand Down
1 change: 1 addition & 0 deletions gcp_variant_transforms/vcf_to_bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ def run(argv=None):
header_fields,
known_args.split_alternate_allele_info_fields,
known_args.annotation_fields,
known_args.use_allele_num,
known_args.minimal_vep_alt_matching,
counter_factory)

Expand Down