From b9e0f1bf6b649b661fd1cf2e6e7bc9b995d44774 Mon Sep 17 00:00:00 2001
From: yifangchen
Date: Wed, 17 Oct 2018 15:38:29 -0400
Subject: [PATCH] add annotation header

This reverts commit 943993955c5f77dc67715d716210c48f92cd504d.
---
 .../libs/annotation/annotation_parser.py      | 10 ++++++++++
 .../libs/annotation/annotation_parser_test.py |  8 ++++++++
 .../libs/bigquery_vcf_schema_converter.py     | 20 ++++++++++++++++---
 .../bigquery_vcf_schema_converter_test.py     | 20 +++++++++++++++++++
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/gcp_variant_transforms/libs/annotation/annotation_parser.py b/gcp_variant_transforms/libs/annotation/annotation_parser.py
index 3637c229f..f526539cb 100644
--- a/gcp_variant_transforms/libs/annotation/annotation_parser.py
+++ b/gcp_variant_transforms/libs/annotation/annotation_parser.py
@@ -410,3 +410,13 @@ def extract_annotation_names(description):
         'Expected at least one | in annotation description {}'.format(
             description))
   return annotation_names[1:]
+
+
+def reconstruct_annotation_description(annotation_names):
+  # type: (List[str]) -> str
+  """Reconstructs annotation description.
+
+  For example, given ['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene'], it
+  returns 'Format: Allele|Consequence|IMPACT|SYMBOL|Gene'.
+  """
+  return ' '.join(['Format:', '|'.join(annotation_names)])
diff --git a/gcp_variant_transforms/libs/annotation/annotation_parser_test.py b/gcp_variant_transforms/libs/annotation/annotation_parser_test.py
index 9e16f8542..2913feaf0 100644
--- a/gcp_variant_transforms/libs/annotation/annotation_parser_test.py
+++ b/gcp_variant_transforms/libs/annotation/annotation_parser_test.py
@@ -78,3 +78,11 @@ def test_reconstruct_annotation_str_missing_annotation_names(self):
     annotation_maps = [{u'Consequence': u'upstream_gene_variant'}]
     with self.assertRaises(ValueError):
       list(str_builder.reconstruct_annotation_str('CSQ', annotation_maps))
+
+  def test_reconstruct_annotation_description(self):
+    expected_description = 'Format: Allele|Consequence|IMPACT|SYMBOL|Gene'
+    self.assertEqual(
+        expected_description,
+        annotation_parser.reconstruct_annotation_description(
+            ['Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Gene'])
+    )
diff --git a/gcp_variant_transforms/libs/bigquery_vcf_schema_converter.py b/gcp_variant_transforms/libs/bigquery_vcf_schema_converter.py
index 38a97fd90..3ea714063 100644
--- a/gcp_variant_transforms/libs/bigquery_vcf_schema_converter.py
+++ b/gcp_variant_transforms/libs/bigquery_vcf_schema_converter.py
@@ -31,6 +31,7 @@
 from gcp_variant_transforms.libs import bigquery_sanitizer
 from gcp_variant_transforms.libs import vcf_field_conflict_resolver  # pylint: disable=unused-import
 from gcp_variant_transforms.libs import vcf_reserved_fields
+from gcp_variant_transforms.libs.annotation import annotation_parser
 from gcp_variant_transforms.libs.variant_merge import variant_merge_strategy  # pylint: disable=unused-import
 
 _Format = parser._Format
@@ -278,12 +279,19 @@ def _add_info_fields_from_alternate_bases(schema,
   from the mode (REPEATED) in reserved field definition.
 
   Any `Record` field within alternate bases is considered as an annotation
-  field, and the annotation fields are skipped.
+  field.
   """
   for field in schema.fields:
-    if (field.name in _CONSTANT_ALTERNATE_BASES_FIELDS or
-        field.type == bigquery_util.TableFieldConstants.TYPE_RECORD):
+    if field.name in _CONSTANT_ALTERNATE_BASES_FIELDS:
       continue
+    elif field.type == bigquery_util.TableFieldConstants.TYPE_RECORD:
+      infos.update({field.name: _Info(
+          id=field.name,
+          num=parser.field_counts[vcfio.MISSING_FIELD_VALUE],
+          type=bigquery_util._VcfHeaderTypeConstants.STRING,
+          desc=_remove_special_characters(_get_annotation_description(field)),
+          source=None,
+          version=None)})
     elif (field.name in vcf_reserved_fields.INFO_FIELDS.keys() and
           not allow_incompatible_schema):
       reserved_definition = vcf_reserved_fields.INFO_FIELDS.get(field.name)
@@ -338,5 +346,11 @@ def _validate_reserved_field_mode(field_schema, reserved_definition):
         .format(field_schema.name, schema_mode, reserved_mode))
 
 
+def _get_annotation_description(field):
+  return ' '.join([field.description,
+                   annotation_parser.reconstruct_annotation_description(
+                       [sub_field.name for sub_field in field.fields])])
+
+
 def _remove_special_characters(description):
   return description.replace('\n', ' ')
diff --git a/gcp_variant_transforms/libs/bigquery_vcf_schema_converter_test.py b/gcp_variant_transforms/libs/bigquery_vcf_schema_converter_test.py
index 8b6459ead..353502a36 100644
--- a/gcp_variant_transforms/libs/bigquery_vcf_schema_converter_test.py
+++ b/gcp_variant_transforms/libs/bigquery_vcf_schema_converter_test.py
@@ -447,6 +447,26 @@ def test_generate_header_fields_from_schema(self):
     expected_header = vcf_header_io.VcfHeader(infos=infos, formats=formats)
     self.assertEqual(header, expected_header)
 
+  def test_generate_header_fields_from_schema_with_annotation(self):
+    sample_schema = bigquery_schema_util.get_sample_table_schema(
+        with_annotation_fields=True)
+    header = bigquery_vcf_schema_converter.generate_header_fields_from_schema(
+        sample_schema)
+
+    infos = OrderedDict([
+        ('AF', Info('AF', field_counts['A'], 'Float', 'desc', None, None)),
+        ('CSQ', Info('CSQ', field_counts['.'], 'String',
+                     'desc Format: Consequence|IMPACT', None, None)),
+        ('AA', Info('AA', 1, 'String', 'desc', None, None)),
+        ('IFR', Info('IFR', field_counts['.'], 'Float', 'desc', None, None)),
+        ('IS', Info('IS', 1, 'String', 'desc', None, None))])
+    formats = OrderedDict([
+        ('FB', parser._Format('FB', 0, 'Flag', 'desc')),
+        ('GQ', parser._Format('GQ', 1, 'Integer',
+                              'desc'))])
+    expected_header = vcf_header_io.VcfHeader(infos=infos, formats=formats)
+    self.assertEqual(header, expected_header)
+
   def test_generate_header_fields_from_schema_date_type(self):
     schema = bigquery.TableSchema()
     schema.fields.append(bigquery.TableFieldSchema(