-
Notifications
You must be signed in to change notification settings - Fork 59
Adds the option for processing ALLELE_NUM. #141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
dcbc206
a0a66c8
ee34cbc
b357283
e2f7b68
edb8202
b5ad461
1f5a03e
763e036
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -51,6 +51,10 @@ | |
| # matching was ambiguous or not. | ||
| _ANNOTATION_ALT_AMBIGUOUS = 'ambiguous_allele' | ||
|
|
||
| # The annotation field that VEP uses to record the index of the alternate | ||
| # allele (i.e., ALT) that an annotation list is for. | ||
| _ALLELE_NUM_ANNOTATION = 'ALLELE_NUM' | ||
|
|
||
|
|
||
| # Counter names | ||
| class _CounterEnum(enum.Enum): | ||
|
|
@@ -59,6 +63,8 @@ class _CounterEnum(enum.Enum): | |
| ANNOTATION_ALT_MINIMAL_MATCH = 'annotation_alt_minimal_match_counter' | ||
| ANNOTATION_ALT_MINIMAL_AMBIGUOUS = 'annotation_alt_minimal_ambiguous_counter' | ||
| ANNOTATION_ALT_MISMATCH = 'annotation_alt_mismatch_counter' | ||
| ALLELE_NUM_MISSING = 'allele_num_missing' | ||
| ALLELE_NUM_INCORRECT = 'allele_num_incorrect' | ||
|
|
||
|
|
||
| class ProcessedVariant(object): | ||
|
|
@@ -190,6 +196,7 @@ def __init__( | |
| header_fields, # type: vcf_header_parser.HeaderFields | ||
| split_alternate_allele_info_fields=True, # type: bool | ||
| annotation_fields=None, # type: List[str] | ||
| use_allele_num=False, # type: bool | ||
| minimal_match=False, # type: bool | ||
| counter_factory=None # type: metrics_util.CounterFactoryInterface | ||
| ): | ||
|
|
@@ -203,6 +210,8 @@ def __init__( | |
| annotation_fields: If provided, this is the list of INFO field names that | ||
| store variant annotations. The format of how annotations are stored and | ||
| their names are extracted from header_fields. | ||
| use_allele_num: If set, then "ALLELE_NUM" annotation is used to determine | ||
| the index of the ALT that corresponds to an annotation set. | ||
| minimal_match: If set, then the --minimal mode of VEP is simulated for | ||
| annotation ALT matching. | ||
| """ | ||
|
|
@@ -214,7 +223,8 @@ def __init__( | |
| self._variant_counter = cfactory.create_counter( | ||
| _CounterEnum.VARIANT.value) | ||
| self._annotation_processor = _AnnotationProcessor( | ||
| annotation_fields, self._header_fields, cfactory, minimal_match) | ||
| annotation_fields, self._header_fields, cfactory, use_allele_num, | ||
| minimal_match) | ||
| self._minimal_match = minimal_match | ||
|
|
||
| def create_processed_variant(self, variant): | ||
|
|
@@ -337,6 +347,7 @@ def __init__(self, | |
| annotation_fields, # type: List[str] | ||
| header_fields, # type: vcf_header_parser.HeaderFields | ||
| counter_factory, # type: metrics_util.CounterFactoryInterface | ||
| use_allele_num, # type: bool | ||
| minimal_match, # type: bool | ||
| ): | ||
| """Creates an instance for adding annotations to `ProcessedVariant` objects. | ||
|
|
@@ -366,6 +377,11 @@ def __init__(self, | |
| _CounterEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value) | ||
| self._alt_mismatch_counter = counter_factory.create_counter( | ||
| _CounterEnum.ANNOTATION_ALT_MISMATCH.value) | ||
| self._allele_num_missing_counter = counter_factory.create_counter( | ||
| _CounterEnum.ALLELE_NUM_MISSING.value) | ||
| self._allele_num_incorrect_counter = counter_factory.create_counter( | ||
| _CounterEnum.ALLELE_NUM_INCORRECT.value) | ||
| self._use_allele_num = use_allele_num | ||
| self._minimal_match = minimal_match | ||
|
|
||
| def add_annotation_data(self, proc_var, annotation_field_name, data): | ||
|
|
@@ -398,12 +414,21 @@ def add_annotation_data(self, proc_var, annotation_field_name, data): | |
| alt_annotation_map = self._convert_annotation_strs_to_alt_map( | ||
| annotation_field_name, data) | ||
| for alt_bases, annotations_list in alt_annotation_map.iteritems(): | ||
| alt, ambiguous = self._find_matching_alt( | ||
| proc_var, common_prefix, alt_bases, annotation_field_name) | ||
| if alt: | ||
| if self._minimal_match: | ||
| self._add_ambiguous_fields(annotations_list, ambiguous) | ||
| alt._info[annotation_field_name] = annotations_list | ||
| if self._use_allele_num: | ||
| # TODO(bashir2): This class needs a major refactoring which should be | ||
| # done as part of creating a class for holding annotation data. The | ||
| # choice of mapping annotation lists to their annotation ALT string | ||
| # needs to be revisited in case of using ALLELE_NUM. | ||
| for annotation_dict in annotations_list: | ||
| self._add_annotations_by_allele_num( | ||
| proc_var, annotation_dict, annotation_field_name) | ||
| else: | ||
| alt, ambiguous = self._find_matching_alt( | ||
| proc_var, common_prefix, alt_bases, annotation_field_name) | ||
| if alt: | ||
| if self._minimal_match: | ||
| self._add_ambiguous_fields(annotations_list, ambiguous) | ||
| alt._info[annotation_field_name] = annotations_list | ||
|
|
||
| def _find_common_alt_ref_prefix(self, proc_var): | ||
| # type: (ProcessedVariant) -> str | ||
|
|
@@ -599,6 +624,27 @@ def _alt_matches_annotation_alt_minimal_mode( | |
| return True | ||
| return False | ||
|
|
||
| def _add_annotations_by_allele_num( | ||
| self, proc_var, annotation_dict, annotation_field_name): | ||
| # type: (ProcessedVariant, Dict[str, str], str) -> None | ||
| if _ALLELE_NUM_ANNOTATION not in annotation_dict: | ||
| self._allele_num_missing_counter.inc() | ||
| return | ||
| index_str = annotation_dict[_ALLELE_NUM_ANNOTATION] | ||
| try: | ||
| alt_index = int(index_str) - 1 | ||
| alt_list = proc_var._alternate_datas | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. consider using the public getter?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I explicitly have chosen not to use the public attributes when the intention is to mutate (i.e., almost everywhere in this module, you can find other examples too). If you remember, we even wanted to make the attributes to be read-only such that they can be read but not altered but then we decided that it is probably over-designing. So I prefer to treat the attributes as read-only getters. |
||
| if alt_index >= len(alt_list) or alt_index < 0: | ||
| raise ValueError | ||
| alt = alt_list[alt_index] | ||
| self._alt_match_counter.inc() | ||
| if annotation_field_name not in alt._info: | ||
| alt._info[annotation_field_name] = [annotation_dict] | ||
| else: | ||
| alt._info[annotation_field_name].append(annotation_dict) | ||
| except ValueError: | ||
| self._allele_num_incorrect_counter.inc() | ||
|
|
||
|
|
||
| def _extract_annotation_list_with_alt(annotation_str): | ||
| # type: (str) -> List[str] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we always use this check so that we no longer need the use_allele_num flag? (i.e. the logic would be to first look for allele_num, if it doesn't exist, then use minimal or exact match depending on the --minimal flag). Or are you concerned about cases where allele_num is incorrect?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, when I was adding this feature I also preferred not to add an extra flag, but then I thought it is better to be explicit such that if there are problems with ALLELE_NUM we don't force them on the user. Of course we can try to use ALLELE_NUM and if it fails for whatever reason (e.g., bad indices) then fall back on matching (whether exact or minimal).
One problem with the latter approach I described is that in the ALLELE_NUM mode, annotation lists are processed one by one regardless of their alt_bases but in the non ALLELE_NUM mode, we map all annotations lists that have the same alt_bases into a single ALT (check the new unit-test I have added and note both lists have 'T' as their alt_bases but are mapped to two different ALTs). Of course, we can fix this problem too if we strongly prefer not to have the extra flag.
So with all these, I decided that being explicit and either use ALLELE_NUM always or ignore it always is a more clear approach. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. Sounds good. Let's keep the flag then.