diff --git a/gcp_variant_transforms/testing/data/vcf/README.md b/gcp_variant_transforms/testing/data/vcf/README.md new file mode 100644 index 000000000..2cb6664e6 --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/README.md @@ -0,0 +1,30 @@ +This file summarizes the contents and purpose of each file/folder within the +current folder. + +`valid-4.0.vcf`, `valid-4.0.vcf.gz`, `valid-4.0.vcf.bz2` are used to test +Variant Call Format version 4.0 files in uncompressed, gzip, and bzip2 form, +respectively. For more details on the VCF format version specifications, +please refer to [VCF Specification](https://samtools.github.io/hts-specs/). + +`valid-4.1-large.vcf`, `valid-4.1-large.vcf.gz` are used to test version 4.1 +uncompressed and gzip VCF files, respectively. + +`valid-4.2.vcf`, `valid-4.2.vcf.gz` are used to test version 4.2 uncompressed +and gzip VCF files, respectively. + +`invalid-4.0-AF-field-removed.vcf` is created from `valid-4.0.vcf` by removing +the `AF` field definition from the meta-information. It is used to test that +the `AF` field can be parsed correctly given a `representative_header_file` +containing `AF`. + +`invalid-4.0-POS-empty.vcf` is created based on `valid-4.0.vcf` by removing the +POS value for the first entry. It is used to test that, when +`allow_malformed_records` is enabled, failed VCF record reads do not raise +errors and the BigQuery table can still be generated. + +The folder `merge` is created to test the merge options. Three .vcf files are +created. `merge1.vcf` contains two samples, while `merge2.vcf` and `merge3.vcf` +each contain one other sample. When `MOVE_TO_CALLS` is selected, the variant +call with `POS = 14370` is meant to be merged across all three files, while the +call with `POS = 1234567` is designed to be merged across `merge1.vcf` and +`merge2.vcf`.
diff --git a/gcp_variant_transforms/testing/data/vcf/invalid-4.0-AF-field-removed.vcf b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-AF-field-removed.vcf new file mode 100644 index 000000000..0f4ac50a5 --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-AF-field-removed.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +19 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/gcp_variant_transforms/testing/data/vcf/invalid-4.0-POS-empty.vcf b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-POS-empty.vcf new file mode 100644 index 000000000..db8b66d19 --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-POS-empty.vcf @@ -0,0 +1,23 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +19 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/gcp_variant_transforms/testing/data/vcf/merge/merge1.vcf b/gcp_variant_transforms/testing/data/vcf/merge/merge1.vcf new file mode 100644 index 000000000..84563af8b --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/merge/merge1.vcf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 +20 14370 rs6054257 G A 10 q10 NS=2;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 +20 17290 . T A 3 q10 NS=2;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 +19 1234567 microsat1 GTCT G,GTACT 50 PASS NS=2;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 diff --git a/gcp_variant_transforms/testing/data/vcf/merge/merge2.vcf b/gcp_variant_transforms/testing/data/vcf/merge/merge2.vcf new file mode 100644 index 000000000..e6da680ff --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/merge/merge2.vcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00003 +20 14370 rs6054257 G A 29 PASS NS=1;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1/1:43:5:.,. +20 17330 . 
T A 3 q10 NS=1;DP=11;AF=0.017 GT:GQ:DP:HQ 0/0:41:3 +19 1234567 microsat2 GTCT G,GTACT 50 PASS NS=1;DP=9;AA=G GT:GQ:DP 1/1:40:3 diff --git a/gcp_variant_transforms/testing/data/vcf/merge/merge3.vcf b/gcp_variant_transforms/testing/data/vcf/merge/merge3.vcf new file mode 100644 index 000000000..225bbdf4f --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/merge/merge3.vcf @@ -0,0 +1,19 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00004 +20 14370 rs6054257 G A 30 PASS NS=1;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 + diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_filter_to_calls.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_filter_to_calls.json new file mode 100644 index 000000000..73c1b9133 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_filter_to_calls.json @@ -0,0 +1,58 @@ +{ + "test_name": "merge-option-copy-filter-to-calls", + "table_name": "merge_option_copy_filter_to_calls", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "variant_merge_strategy": "MOVE_TO_CALLS", + "copy_filter_to_calls": true, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00001' ", + "AND 'q10' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + }, + { + "query": [ + "SELECT 
COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00002' ", + "AND 'q10' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00003' ", + "AND 'PASS' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00004' ", + "AND 'PASS' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_quality_to_calls.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_quality_to_calls.json new file mode 100644 index 000000000..53af0528a --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_quality_to_calls.json @@ -0,0 +1,54 @@ +{ + "test_name": "merge-option-copy-quality-to-calls", + "table_name": "merge_option_copy_quality_to_calls", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "variant_merge_strategy": "MOVE_TO_CALLS", + "copy_quality_to_calls": true, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00001'" + ], + "expected_result": {"quality": 10.0} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND 
call.name ='NA00002'" + ], + "expected_result": {"quality": 10.0} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00003'" + ], + "expected_result": {"quality": 29.0} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00004'" + ], + "expected_result": {"quality": 30.0} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json new file mode 100644 index 000000000..422335268 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json @@ -0,0 +1,54 @@ +{ + "test_name": "merge-option-info-keys-to-move-to-calls-regex", + "table_name": "merge_option_info_keys_to_move_to_calls_regex", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "variant_merge_strategy": "MOVE_TO_CALLS", + "info_keys_to_move_to_calls_regex": "^NS$", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT call.NS AS NS ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00001'" + ], + "expected_result": {"NS": 2} + }, + { + "query": [ + "SELECT call.NS AS NS ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00002'" + ], + "expected_result": {"NS": 2} + }, + { + "query": [ + "SELECT call.NS AS NS ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE 
start_position = 14369 AND call.name ='NA00003'" + ], + "expected_result": {"NS": 1} + }, + { + "query": [ + "SELECT call.NS AS NS ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00004'" + ], + "expected_result": {"NS": 1} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json new file mode 100644 index 000000000..feb9f1f15 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json @@ -0,0 +1,28 @@ +{ + "test_name": "merge-option-move-to-calls", + "table_name": "merge_option_move_to_calls", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "runner": "DataflowRunner", + "variant_merge_strategy": "MOVE_TO_CALLS", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows FROM {TABLE_NAME} ", + "WHERE start_position = 14369" + ], + "expected_result": {"num_rows": 1} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json new file mode 100644 index 000000000..fdd5f705b --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json @@ -0,0 +1,27 @@ +{ + "test_name": "merge-option-none", + "table_name": "merge_option_none", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 7} + }, + { + "query": ["SUM_START_QUERY"], + 
"expected_result": {"sum_start": 2546857} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 2546870} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows FROM {TABLE_NAME} ", + "WHERE start_position = 14369" + ], + "expected_result": {"num_rows": 3} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/option_allow_malformed_records.json b/gcp_variant_transforms/testing/integration/small_tests/option_allow_malformed_records.json new file mode 100644 index 000000000..70bd187a7 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/option_allow_malformed_records.json @@ -0,0 +1,21 @@ +{ + "test_name": "option-allow-malformed-records", + "table_name": "option_allow_malformed_records", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/invalid-4.0-POS-empty.vcf", + "allow_malformed_records": true, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 3592826} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 3592833} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/option_representative_header_file.json b/gcp_variant_transforms/testing/integration/small_tests/option_representative_header_file.json new file mode 100644 index 000000000..d8ee4a5ce --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/option_representative_header_file.json @@ -0,0 +1,21 @@ +{ + "test_name": "option-representative-header-file", + "table_name": "option_representative_header_file", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/invalid-4.0-AF-field-removed.vcf", + "representative_header_file": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.0.vcf", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": 
{"num_rows": 5} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 3607195} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 3607203} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/option_split_alternate_allele_info_fields.json b/gcp_variant_transforms/testing/integration/small_tests/option_split_alternate_allele_info_fields.json new file mode 100644 index 000000000..6fcc183ac --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/option_split_alternate_allele_info_fields.json @@ -0,0 +1,25 @@ +{ + "test_name": "valid-4-2-split-alternate-allele-info-fields", + "table_name": "valid_4_2_split_alternate_allele_info_fields", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2.vcf", + "split_alternate_allele_info_fields": false, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 13} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 23031929} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 23033052} + }, + { + "query": ["SELECT COUNT(AF) AS num_rows FROM {TABLE_NAME}"], + "expected_result": {"num_rows": 13} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json index 6c8867d13..be58d93e4 100644 --- a/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json +++ b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json @@ -2,7 +2,7 @@ "test_name": "valid-4-2-vep", "table_name": "valid_4_2_VEP", "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2_VEP.vcf", - "annotation_field": "CSQ", + "annotation_fields": "CSQ", "runner": "DataflowRunner", "assertion_configs": [ { diff --git 
a/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_reference_names.json b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_reference_names.json new file mode 100644 index 000000000..579604b6b --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_reference_names.json @@ -0,0 +1,21 @@ +{ + "test_name": "valid-4-2-reference-names", + "table_name": "valid_4_2_reference_names", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2.vcf", + "reference_names": "20 Y", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 11} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 23017560} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 23018681} + } + ] +}