Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 32 additions & 11 deletions gcp_variant_transforms/libs/processed_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
# Counter names
class _CounterEnum(enum.Enum):
VARIANT = 'variant_counter'
ALTERNATE_ALLELE_INFO_MISMATCH = 'alternate_allele_info_mismatch_counter'
ANNOTATION_ALT_MATCH = 'annotation_alt_match_counter'
ANNOTATION_ALT_MINIMAL_AMBIGUOUS = 'annotation_alt_minimal_ambiguous_counter'
ANNOTATION_ALT_MISMATCH = 'annotation_alt_mismatch_counter'
Expand Down Expand Up @@ -196,6 +197,7 @@ def __init__(
self,
header_fields, # type: vcf_header_io.VcfHeader
split_alternate_allele_info_fields=True, # type: bool
allow_alternate_allele_info_mismatch=False, # type: bool
annotation_fields=None, # type: List[str]
use_allele_num=False, # type: bool
minimal_match=False, # type: bool
Expand All @@ -209,24 +211,36 @@ def __init__(
header_fields: Header information used for parsing and splitting INFO
fields of the variant.
split_alternate_allele_info_fields: If True, splits fields with
field_count='A' (i.e., one value for each alternate) among alternates.
`field_count='A'` (i.e., one value for each alternate) among alternates.
allow_alternate_allele_info_mismatch: By default (when False), an error
will be raised for INFO fields with `field_count='A'` (i.e. one value
for each alternate base) that do not have the same cardinality as
alternate bases. If True, an error will not be raised and excess values
will be dropped or insufficient values will be set to null. Only
applicable if `split_alternate_allele_info_fields` is True.
annotation_fields: If provided, this is the list of INFO field names that
store variant annotations. The format of how annotations are stored and
their names are extracted from header_fields.
use_allele_num: If set, then "ALLELE_NUM" annotation is used to determine
use_allele_num: If True, then "ALLELE_NUM" annotation is used to determine
the index of the ALT that corresponds to an annotation set.
minimal_match: If set, then the --minimal mode of VEP is simulated for
minimal_match: If True, then the --minimal mode of VEP is simulated for
annotation ALT matching.
infer_annotation_types: If set, then warnings will be provided if header
infer_annotation_types: If True, then warnings will be provided if header
fields fail to contain Info type lines for annotation fields
counter_factory: If provided, it will be used to record counters (e.g. the
number of variants processed).
"""
self._header_fields = header_fields
self._split_alternate_allele_info_fields = (
split_alternate_allele_info_fields)
self._allow_alternate_allele_info_mismatch = (
allow_alternate_allele_info_mismatch)
self._annotation_field_set = set(annotation_fields or [])
cfactory = counter_factory or metrics_util.NoOpCounterFactory()
self._variant_counter = cfactory.create_counter(
_CounterEnum.VARIANT.value)
self._alternate_allele_info_mismatche_counter = cfactory.create_counter(
_CounterEnum.ALTERNATE_ALLELE_INFO_MISMATCH.value)
self._annotation_processor = _AnnotationProcessor(
annotation_fields, self._header_fields, cfactory, use_allele_num,
minimal_match, infer_annotation_types)
Expand Down Expand Up @@ -254,14 +268,21 @@ def create_processed_variant(self, variant):

def _add_per_alt_info(self, proc_var, field_name, variant_info_data):
# type: (ProcessedVariant, str, vcfio.VariantInfo) -> None
if len(variant_info_data) != len(proc_var._alternate_datas):
raise ValueError(
num_variant_infos = len(variant_info_data)
num_alternate_bases = len(proc_var._alternate_datas)
if num_variant_infos != num_alternate_bases:
error_message = (
'Per alternate INFO field "{}" does not have same cardinality as '
' number of alternates: {} vs {} in variant: "{}"'.format(
field_name, len(variant_info_data),
len(proc_var._alternate_datas), proc_var))
for alt_index, info in enumerate(variant_info_data):
proc_var._alternate_datas[alt_index]._info[field_name] = info
'number of alternates: {} vs {} in variant: "{}"'.format(
field_name, num_variant_infos, num_alternate_bases, proc_var))
self._alternate_allele_info_mismatche_counter.inc()
if self._allow_alternate_allele_info_mismatch:
logging.warning(error_message)
else:
raise ValueError(error_message)
for alt_index in range(min(num_variant_infos, num_alternate_bases)):
proc_var._alternate_datas[alt_index]._info[field_name] = (
variant_info_data[alt_index])

def create_alt_bases_field_schema(self):
# type: () -> bigquery.TableFieldSchema
Expand Down
51 changes: 51 additions & 0 deletions gcp_variant_transforms/libs/processed_variant_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,57 @@ def test_create_processed_variant_move_alt_info(self):
self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
self.assertFalse(proc_var.non_alt_info.has_key('A2'))

def test_create_processed_variant_move_alt_info_extra_values(self):
header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'})
variant = self._get_sample_variant()
# Add a value to `A2` (it only has two alternate bases, so this is invalid).
variant.info['A2'] = ['data1', 'data2', 'data3']

# Ensure error is raised by default.
factory = processed_variant.ProcessedVariantFactory(
header_fields,
split_alternate_allele_info_fields=True)
with self.assertRaises(ValueError):
_ = factory.create_processed_variant(variant)

# Try again with allow_alternate_allele_info_mismatch=True.
factory = processed_variant.ProcessedVariantFactory(
header_fields,
split_alternate_allele_info_fields=True,
allow_alternate_allele_info_mismatch=True)
proc_var = factory.create_processed_variant(variant)
alt1 = processed_variant.AlternateBaseData('A')
alt1._info = {'A2': 'data1'}
alt2 = processed_variant.AlternateBaseData('TT')
alt2._info = {'A2': 'data2'}
self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
self.assertFalse(proc_var.non_alt_info.has_key('A2'))

def test_create_processed_variant_move_alt_info_insufficient_values(self):
header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'})
variant = self._get_sample_variant()
# Remove a value from `A2` (it has two alternate bases, so this is invalid).
variant.info['A2'] = ['data1']

# Ensure error is raised by default.
factory = processed_variant.ProcessedVariantFactory(
header_fields,
split_alternate_allele_info_fields=True)
with self.assertRaises(ValueError):
_ = factory.create_processed_variant(variant)

# Try again with allow_alternate_allele_info_mismatch=True.
factory = processed_variant.ProcessedVariantFactory(
header_fields,
split_alternate_allele_info_fields=True,
allow_alternate_allele_info_mismatch=True)
proc_var = factory.create_processed_variant(variant)
alt1 = processed_variant.AlternateBaseData('A')
alt1._info = {'A2': 'data1'}
alt2 = processed_variant.AlternateBaseData('TT')
self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
self.assertFalse(proc_var.non_alt_info.has_key('A2'))

def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
"""Provides a simple `Variant` and `VcfHeader` with info fields

Expand Down
6 changes: 6 additions & 0 deletions gcp_variant_transforms/testing/data/vcf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ POS value for the first entry. It is used to test when `allow_malformed_records`
is enabled, failed VCF record reads will not raise errors and the BigQuery table
can still be generated.

`invalid-4.2-AF-mismatch.vcf` is created on `valid-4.2.vcf` by changing the
value of `AF` field in `20:17330` to have too many values (two instead of one)
and the value in `20:1234567` to have too few values (one instead of two).
By default, the job will fail as the cardinalities do not match, but it
should be successful when `allow_malformed_records` is enabled.

The folder `merge` is created to test the merge options. Three .vcf files are
created. `merge1.vcf` contains two samples, while `merge2.vcf` and `merge3.vcf`
contain one other sample, respectively. When MERGE_TO_CALLS is selected, the
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
##fileformat=VCFv4.2
##fileDate=20090805
##source=myImputationProgramV3.1
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant (with unïcodé)">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of variant">
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data (with \\ backslash)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
##FORMAT=<ID=GL,Number=G,Type=Integer,Description="Genotype Likelihood">
##reference=file:/lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
##contig=<ID=19,length=59128983,md5=1aacd71f30db8e561810913e0b72636d,species="Homo Sapiens">
##contig=<ID=20,length=63025520,md5=0dec9660ec1efaaf33281c0d5ea2560f,species="Homo Sapiens">
##contig=<ID=Y,length=63025520,md5=0dec9660ec1efaaf33281c0d5ea2560f,species="Homo Sapiens">
##SAMPLE=<ID=Blood,Genomes=Germline,Mixture=1.,Description="Patient germline genome">
##SAMPLE=<ID=TissueSample,Genomes=Germline;Tumor,Mixture=.3;.7,Description="Patient germline genome;Patient tumor genome">
##PEDIGREE=<Derived=ID2,Original=ID1>
##PEDIGREE=<Child=CHILD-GENOME-ID,Mother=MOTHER-GENOME-ID,Father=FATHER-GENOME-ID>
##pedigreeDB=url
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
19 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=Infinity;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017,0.3 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=-inf,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
20 1234567 microsat1 GTC G,GTCTC 50 PASS NS=3;DP=9;AA=G;AF=0.1 GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3
20 2234567 . C [13:123457[ACGC 50 PASS SVTYPE=BÑD;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3
20 2234568 . C .TC 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3
20 2234569 . C CT. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3
20 3234569 . C <SYMBOLIC> 50 PASS END=3235677;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3
20 4234569 . N .[13:123457[ 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 ./.:40:3
20 5234569 . N [13:123457[. 50 PASS SVTYPE=BND;NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3
Y 17330 . T A 3 q10 NS=3;DP=11;AF=nan GT:GL 0:0,49 0:0,3 1:41,0
HLA-A*01:01:01:01 1 . N T 50 PASS END=1;NS=3;DP=9;AA=G GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
[
{
"test_name": "option-allow-malformed-records-alternate-mismatch",
"table_name": "option_allow_malformed_records_alternate_mismatch",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/invalid-4.2-AF-mismatch.vcf",
"allow_malformed_records": true,
"runner": "DirectRunner",
"zones": ["us-west1-b"],
"assertion_configs": [
{
"query": ["NUM_ROWS_QUERY"],
"expected_result": {"num_rows": 13}
},
{
"query": ["SUM_START_QUERY"],
"expected_result": {"sum_start": 23031929}
},
{
"query": ["SUM_END_QUERY"],
"expected_result": {"sum_end": 23033052}
},
{
"query": [
"SELECT alternate_bases.AF AS AF ",
"FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ",
"WHERE reference_name = '20' and start_position = 17329"
],
"expected_result": {"AF": 0.017}
},
{
"query": [
"SELECT SUM(alternate_bases.AF) AS AF_sum ",
"FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ",
"WHERE reference_name = '20' and start_position = 1234566"
],
"expected_result": {"AF_sum": 0.1}
},
{
"query": [
"SELECT COUNT(IFNULL(alternate_bases.AF, 0)) AS AF_count ",
"FROM {TABLE_NAME} t, t.alternate_bases as alternate_bases ",
"WHERE reference_name = '20' and start_position = 1234566"
],
"expected_result": {"AF_count": 2}
}
]
}
]
1 change: 1 addition & 0 deletions gcp_variant_transforms/vcf_to_bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ def run(argv=None):
processed_variant_factory = processed_variant.ProcessedVariantFactory(
header_fields,
known_args.split_alternate_allele_info_fields,
known_args.allow_malformed_records,
known_args.annotation_fields,
known_args.use_allele_num,
known_args.minimal_vep_alt_matching,
Expand Down