Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions gcp_variant_transforms/libs/annotation/annotation_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,3 +410,13 @@ def extract_annotation_names(description):
'Expected at least one | in annotation description {}'.format(
description))
return annotation_names[1:]


def reconstruct_annotation_description(annotation_names):
  # type: (List[str]) -> str
  """Reconstructs the `Format:` part of an annotation description.

  The names are joined with `|` and prefixed with 'Format: '. For example,
  given ['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene'], it returns
  'Format: Allele|Consequence|IMPACT|SYMBOL|Gene'. An empty list yields
  'Format: '.

  Args:
    annotation_names: The annotation names, in the order they should appear.

  Returns:
    The string 'Format: <name_1>|<name_2>|...'.
  """
  return ' '.join(['Format:', '|'.join(annotation_names)])
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,11 @@ def test_reconstruct_annotation_str_missing_annotation_names(self):
annotation_maps = [{u'Consequence': u'upstream_gene_variant'}]
with self.assertRaises(ValueError):
list(str_builder.reconstruct_annotation_str('CSQ', annotation_maps))

def test_reconstruct_annotation_description(self):
  """Checks that annotation names are joined into a `Format:` string."""
  annotation_names = ['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene']
  expected_description = 'Format: Allele|Consequence|IMPACT|SYMBOL|Gene'
  self.assertEqual(
      expected_description,
      annotation_parser.reconstruct_annotation_description(annotation_names))
20 changes: 17 additions & 3 deletions gcp_variant_transforms/libs/bigquery_vcf_schema_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from gcp_variant_transforms.libs import bigquery_sanitizer
from gcp_variant_transforms.libs import vcf_field_conflict_resolver # pylint: disable=unused-import
from gcp_variant_transforms.libs import vcf_reserved_fields
from gcp_variant_transforms.libs.annotation import annotation_parser
from gcp_variant_transforms.libs.variant_merge import variant_merge_strategy # pylint: disable=unused-import

_Format = parser._Format
Expand Down Expand Up @@ -278,12 +279,19 @@ def _add_info_fields_from_alternate_bases(schema,
from the mode (REPEATED) in reserved field definition.

Any `Record` field within alternate bases is considered as an annotation
field, and the annotation fields are skipped.
field.
"""
for field in schema.fields:
if (field.name in _CONSTANT_ALTERNATE_BASES_FIELDS or
field.type == bigquery_util.TableFieldConstants.TYPE_RECORD):
if field.name in _CONSTANT_ALTERNATE_BASES_FIELDS:
continue
elif field.type == bigquery_util.TableFieldConstants.TYPE_RECORD:
infos.update({field.name: _Info(
id=field.name,
num=parser.field_counts[vcfio.MISSING_FIELD_VALUE],
type=bigquery_util._VcfHeaderTypeConstants.STRING,
desc=_remove_special_characters(_get_annotation_description(field)),
source=None,
version=None)})
elif (field.name in vcf_reserved_fields.INFO_FIELDS.keys() and
not allow_incompatible_schema):
reserved_definition = vcf_reserved_fields.INFO_FIELDS.get(field.name)
Expand Down Expand Up @@ -338,5 +346,11 @@ def _validate_reserved_field_mode(field_schema, reserved_definition):
.format(field_schema.name, schema_mode, reserved_mode))


def _get_annotation_description(field):
  """Builds the description of an annotation record field.

  The result is the field's own description followed by the `Format: ...`
  list of its sub field names, separated by a single space.

  Args:
    field: A schema field exposing `description` and `fields`, where every
      entry of `fields` has a `name`.

  Returns:
    '<description> Format: <sub_field_1>|<sub_field_2>|...', or only the
    `Format: ...` part when the field has no description.
  """
  annotation_format = annotation_parser.reconstruct_annotation_description(
      [sub_field.name for sub_field in field.fields])
  # `str.join` raises TypeError on None, so a missing description must not be
  # joined. An empty description is skipped as well, which avoids emitting a
  # description that starts with a stray space.
  if not field.description:
    return annotation_format
  return ' '.join([field.description, annotation_format])


def _remove_special_characters(description):
return description.replace('\n', ' ') if description else ''
20 changes: 20 additions & 0 deletions gcp_variant_transforms/libs/bigquery_vcf_schema_converter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,26 @@ def test_generate_header_fields_from_schema(self):
expected_header = vcf_header_io.VcfHeader(infos=infos, formats=formats)
self.assertEqual(header, expected_header)

def test_generate_header_fields_from_schema_with_annotation(self):
  """Checks that annotation records are exported as INFO header fields."""
  sample_schema = bigquery_schema_util.get_sample_table_schema(
      with_annotation_fields=True)
  header = bigquery_vcf_schema_converter.generate_header_fields_from_schema(
      sample_schema)

  # The `CSQ` annotation record is expected to carry its own description
  # followed by the `Format: ...` list of its sub field names.
  infos = OrderedDict([
      ('AF', Info('AF', field_counts['A'], 'Float', 'desc', None, None)),
      ('CSQ', Info('CSQ', field_counts['.'], 'String',
                   'desc Format: Consequence|IMPACT', None, None)),
      ('AA', Info('AA', 1, 'String', 'desc', None, None)),
      ('IFR', Info('IFR', field_counts['.'], 'Float', 'desc', None, None)),
      ('IS', Info('IS', 1, 'String', 'desc', None, None))])
  formats = OrderedDict([
      ('FB', parser._Format('FB', 0, 'Flag', 'desc')),
      ('GQ', parser._Format('GQ', 1, 'Integer',
                            'desc'))])
  expected_header = vcf_header_io.VcfHeader(infos=infos, formats=formats)
  self.assertEqual(header, expected_header)

def test_generate_header_fields_from_schema_date_type(self):
schema = bigquery.TableSchema()
schema.fields.append(bigquery.TableFieldSchema(
Expand Down