From 38622d1d13b5c1a13274fe815edc7ff83074e0f2 Mon Sep 17 00:00:00 2001 From: yifangchen Date: Fri, 8 Mar 2019 09:36:18 -0500 Subject: [PATCH] Fix issue in BQ to VCF: - Parsing the genomic region filter should be case-sensitive. For instance, chrY:1-1000 should yield ref=chrY rather than ref=chry; otherwise the SELECT query will return the wrong results. This method is also reused by partitioning, where matching of the reference name should be case-insensitive, so the partitioning caller now lowercases the reference name itself. --- gcp_variant_transforms/libs/genomic_region_parser.py | 4 ++-- gcp_variant_transforms/libs/genomic_region_parser_test.py | 4 ++-- gcp_variant_transforms/libs/variant_partition.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gcp_variant_transforms/libs/genomic_region_parser.py b/gcp_variant_transforms/libs/genomic_region_parser.py index b9740cfbd..a08acf14a 100644 --- a/gcp_variant_transforms/libs/genomic_region_parser.py +++ b/gcp_variant_transforms/libs/genomic_region_parser.py @@ -43,7 +43,7 @@ def _parse_position(pos_str): matched = _REGION_LITERAL_REGEXP.match(genomic_region) if matched: ref_name, start, end = matched.groups() - ref_name = ref_name.strip().lower() + ref_name = ref_name.strip() start = _parse_position(start) end = _parse_position(end) if start < 0: @@ -54,7 +54,7 @@ def _parse_position(pos_str): 'vs {}'.format(end, start)) else: # This region includes a full chromosome - ref_name = genomic_region.strip().lower() + ref_name = genomic_region.strip() start = 0 end = _DEFAULT_END_POSITION return ref_name, start, end diff --git a/gcp_variant_transforms/libs/genomic_region_parser_test.py b/gcp_variant_transforms/libs/genomic_region_parser_test.py index 0bea9a7d1..128c97409 100644 --- a/gcp_variant_transforms/libs/genomic_region_parser_test.py +++ b/gcp_variant_transforms/libs/genomic_region_parser_test.py @@ -29,8 +29,8 @@ def test_parse_genomic_regions(self): ('chr1', 1000000, 2000000) ) self.assertEqual( - 
genomic_region_parser.parse_genomic_region('chr1:1000000-2000000'), - ('chr1', 1000000, 2000000) + genomic_region_parser.parse_genomic_region('chrY:1000000-2000000'), + ('chrY', 1000000, 2000000) ) self.assertEqual( genomic_region_parser.parse_genomic_region('chr'), diff --git a/gcp_variant_transforms/libs/variant_partition.py b/gcp_variant_transforms/libs/variant_partition.py index de354f87a..489aeface 100644 --- a/gcp_variant_transforms/libs/variant_partition.py +++ b/gcp_variant_transforms/libs/variant_partition.py @@ -189,6 +189,7 @@ def _is_residual_partition(regions): for r in regions: ref_name, start, end = genomic_region_parser.parse_genomic_region(r) + ref_name = ref_name.lower() self._ref_name_to_partitions_map[ref_name].add_region( start, end, partition_index)