From 5f54d7d296eecb65c24619e17051bb991237e6b4 Mon Sep 17 00:00:00 2001 From: yifangchen Date: Mon, 5 Mar 2018 16:54:23 -0500 Subject: [PATCH 1/4] Add integration tests for the following options: 1. reference_names: tested on valid-4.2.vcf. Used "20 Y" as the reference_names. 2. split_alternate_allele_info_fields: set this field to false and validate there is a column AF in BQ. 3. allow_malformed_records: created a new file "invalid-4.0-POS-empty.vcf" based on "valid-4.0.vcf" by removing the POS value for the first entry. 4. representative_header_file: created a new file "invalid-4.0-AF-field-removed.vcf" by removing the "AF" field definition from the header of "valid-4.0.vcf". 5. merge_option_none: created a new folder containing three new .vcf files for the merge tests. By default, there is no merge. Verified that the three files in the folder are read successfully and form one BQ table. 6. merge_option_move_to_calls: set variant_merge_strategy to MOVE_TO_CALLS. Verified that the three files are merged on the keys. 7. info_keys_to_move_to_calls_regex: set this field to be "NS" and verified that the call.NS value is copied from each VCF file. 8. copy_quality_to_calls: set to true. Verified that call.quality is copied from each VCF file. 9. copy_filter_to_calls: set to true. Verified that call.filter is copied from each VCF file. All the created .vcf files are uploaded to the gcp-variant-transforms-test. All integration tests pass. 
--- .../data/vcf/invalid-4.0-AF-field-removed.vcf | 22 +++++++ .../data/vcf/invalid-4.0-POS-empty.vcf | 23 ++++++++ .../testing/data/vcf/merge/merge1.vcf | 20 +++++++ .../testing/data/vcf/merge/merge2.vcf | 21 +++++++ .../testing/data/vcf/merge/merge3.vcf | 19 ++++++ .../merge_option_copy_filter_to_calls.json | 58 +++++++++++++++++++ .../merge_option_copy_quality_to_calls.json | 54 +++++++++++++++++ ...tion_info_keys_to_move_to_calls_regex.json | 54 +++++++++++++++++ .../merge_option_move_to_calls.json | 21 +++++++ .../small_tests/merge_option_none.json | 20 +++++++ .../option_allow_malformed_records.json | 21 +++++++ .../option_representative_header_file.json | 21 +++++++ ...on_split_alternate_allele_info_fields.json | 25 ++++++++ .../valid_4_2_reference_names.json | 21 +++++++ 14 files changed, 400 insertions(+) create mode 100644 gcp_variant_transforms/testing/data/vcf/invalid-4.0-AF-field-removed.vcf create mode 100644 gcp_variant_transforms/testing/data/vcf/invalid-4.0-POS-empty.vcf create mode 100644 gcp_variant_transforms/testing/data/vcf/merge/merge1.vcf create mode 100644 gcp_variant_transforms/testing/data/vcf/merge/merge2.vcf create mode 100644 gcp_variant_transforms/testing/data/vcf/merge/merge3.vcf create mode 100644 gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_filter_to_calls.json create mode 100644 gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_quality_to_calls.json create mode 100644 gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json create mode 100644 gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json create mode 100644 gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json create mode 100644 gcp_variant_transforms/testing/integration/small_tests/option_allow_malformed_records.json create mode 100644 
gcp_variant_transforms/testing/integration/small_tests/option_representative_header_file.json create mode 100644 gcp_variant_transforms/testing/integration/small_tests/option_split_alternate_allele_info_fields.json create mode 100644 gcp_variant_transforms/testing/integration/small_tests/valid_4_2_reference_names.json diff --git a/gcp_variant_transforms/testing/data/vcf/invalid-4.0-AF-field-removed.vcf b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-AF-field-removed.vcf new file mode 100644 index 000000000..0f4ac50a5 --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-AF-field-removed.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 
47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +19 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/gcp_variant_transforms/testing/data/vcf/invalid-4.0-POS-empty.vcf b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-POS-empty.vcf new file mode 100644 index 000000000..db8b66d19 --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/invalid-4.0-POS-empty.vcf @@ -0,0 +1,23 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +19 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/gcp_variant_transforms/testing/data/vcf/merge/merge1.vcf b/gcp_variant_transforms/testing/data/vcf/merge/merge1.vcf new file mode 100644 index 000000000..84563af8b --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/merge/merge1.vcf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 +20 14370 rs6054257 G A 10 q10 NS=2;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 +20 17290 . 
T A 3 q10 NS=2;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 +19 1234567 microsat1 GTCT G,GTACT 50 PASS NS=2;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 diff --git a/gcp_variant_transforms/testing/data/vcf/merge/merge2.vcf b/gcp_variant_transforms/testing/data/vcf/merge/merge2.vcf new file mode 100644 index 000000000..e6da680ff --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/merge/merge2.vcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00003 +20 14370 rs6054257 G A 29 PASS NS=1;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=1;DP=11;AF=0.017 GT:GQ:DP:HQ 0/0:41:3 +19 1234567 microsat2 GTCT G,GTACT 50 PASS NS=1;DP=9;AA=G GT:GQ:DP 1/1:40:3 diff --git a/gcp_variant_transforms/testing/data/vcf/merge/merge3.vcf b/gcp_variant_transforms/testing/data/vcf/merge/merge3.vcf new file mode 100644 index 000000000..225bbdf4f --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/merge/merge3.vcf @@ -0,0 +1,19 @@ +##fileformat=VCFv4.0 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=1000GenomesPilot-NCBI36 +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00004 +20 14370 rs6054257 G A 30 PASS NS=1;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 + diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_filter_to_calls.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_filter_to_calls.json new file mode 100644 index 000000000..73c1b9133 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_filter_to_calls.json @@ -0,0 +1,58 @@ +{ + 
"test_name": "merge-option-copy-filter-to-calls", + "table_name": "merge_option_copy_filter_to_calls", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "variant_merge_strategy": "MOVE_TO_CALLS", + "copy_filter_to_calls": true, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00001' ", + "AND 'q10' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00002' ", + "AND 'q10' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00003' ", + "AND 'PASS' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00004' ", + "AND 'PASS' IN UNNEST (call.filter)" + ], + "expected_result": {"num_rows": 1} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_quality_to_calls.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_quality_to_calls.json new file mode 100644 index 000000000..53af0528a --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_copy_quality_to_calls.json @@ -0,0 +1,54 @@ +{ + "test_name": "merge-option-copy-quality-to-calls", + "table_name": 
"merge_option_copy_quality_to_calls", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "variant_merge_strategy": "MOVE_TO_CALLS", + "copy_quality_to_calls": true, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00001'" + ], + "expected_result": {"quality": 10.0} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00002'" + ], + "expected_result": {"quality": 10.0} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00003'" + ], + "expected_result": {"quality": 29.0} + }, + { + "query": [ + "SELECT call.quality AS quality ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00004'" + ], + "expected_result": {"quality": 30.0} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json new file mode 100644 index 000000000..56feb6787 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json @@ -0,0 +1,54 @@ +{ + "test_name": "merge-option-info-keys-to-move-to-calls-regex", + "table_name": "merge_option_info_keys_to_move_to_calls_regex", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "variant_merge_strategy": 
"MOVE_TO_CALLS", + "info_keys_to_move_to_calls_regex": "NS", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT call.NS AS num_samples ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00001'" + ], + "expected_result": {"num_samples": 2} + }, + { + "query": [ + "SELECT call.NS AS num_samples ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00002'" + ], + "expected_result": {"num_samples": 2} + }, + { + "query": [ + "SELECT call.NS AS num_samples ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00003'" + ], + "expected_result": {"num_samples": 1} + }, + { + "query": [ + "SELECT call.NS AS num_samples ", + "FROM {TABLE_NAME} AS t, t.call as call ", + "WHERE start_position = 14369 AND call.name ='NA00004'" + ], + "expected_result": {"num_samples": 1} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json new file mode 100644 index 000000000..26c7e6106 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json @@ -0,0 +1,21 @@ +{ + "test_name": "merge-option-move-to-calls", + "table_name": "merge_option_move_to_calls", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "runner": "DataflowRunner", + "variant_merge_strategy": "MOVE_TO_CALLS", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 1283553} + }, + { + 
"query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 1283560} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json new file mode 100644 index 000000000..468b6e142 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json @@ -0,0 +1,20 @@ +{ + "test_name": "merge-option-none", + "table_name": "merge_option_none", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 7} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 2546857} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 2546870} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/option_allow_malformed_records.json b/gcp_variant_transforms/testing/integration/small_tests/option_allow_malformed_records.json new file mode 100644 index 000000000..70bd187a7 --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/option_allow_malformed_records.json @@ -0,0 +1,21 @@ +{ + "test_name": "option-allow-malformed-records", + "table_name": "option_allow_malformed_records", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/invalid-4.0-POS-empty.vcf", + "allow_malformed_records": true, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 4} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 3592826} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 3592833} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/option_representative_header_file.json b/gcp_variant_transforms/testing/integration/small_tests/option_representative_header_file.json new 
file mode 100644 index 000000000..d8ee4a5ce --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/option_representative_header_file.json @@ -0,0 +1,21 @@ +{ + "test_name": "option-representative-header-file", + "table_name": "option_representative_header_file", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/invalid-4.0-AF-field-removed.vcf", + "representative_header_file": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.0.vcf", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 5} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 3607195} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 3607203} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/option_split_alternate_allele_info_fields.json b/gcp_variant_transforms/testing/integration/small_tests/option_split_alternate_allele_info_fields.json new file mode 100644 index 000000000..6fcc183ac --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/option_split_alternate_allele_info_fields.json @@ -0,0 +1,25 @@ +{ + "test_name": "valid-4-2-split-alternate-allele-info-fields", + "table_name": "valid_4_2_split_alternate_allele_info_fields", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2.vcf", + "split_alternate_allele_info_fields": false, + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 13} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 23031929} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 23033052} + }, + { + "query": ["SELECT COUNT(AF) AS num_rows FROM {TABLE_NAME}"], + "expected_result": {"num_rows": 13} + } + ] +} diff --git a/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_reference_names.json 
b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_reference_names.json new file mode 100644 index 000000000..579604b6b --- /dev/null +++ b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_reference_names.json @@ -0,0 +1,21 @@ +{ + "test_name": "valid-4-2-reference-names", + "table_name": "valid_4_2_reference_names", + "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2.vcf", + "reference_names": "20 Y", + "runner": "DataflowRunner", + "assertion_configs": [ + { + "query": ["NUM_ROWS_QUERY"], + "expected_result": {"num_rows": 11} + }, + { + "query": ["SUM_START_QUERY"], + "expected_result": {"sum_start": 23017560} + }, + { + "query": ["SUM_END_QUERY"], + "expected_result": {"sum_end": 23018681} + } + ] +} From 704a0a0544e192afcd3830d19ed5b348bfcd74c3 Mon Sep 17 00:00:00 2001 From: yifangchen Date: Thu, 8 Mar 2018 14:02:55 -0500 Subject: [PATCH 2/4] Add readme and correct some typos --- .../testing/data/vcf/README.md | 27 +++++++++++++++++++ ...tion_info_keys_to_move_to_calls_regex.json | 18 ++++++------- .../small_tests/valid_4_2_VEP.json | 2 +- 3 files changed, 37 insertions(+), 10 deletions(-) create mode 100644 gcp_variant_transforms/testing/data/vcf/README.md diff --git a/gcp_variant_transforms/testing/data/vcf/README.md b/gcp_variant_transforms/testing/data/vcf/README.md new file mode 100644 index 000000000..61d566b07 --- /dev/null +++ b/gcp_variant_transforms/testing/data/vcf/README.md @@ -0,0 +1,27 @@ +This file summarizes the contents and the purpose for each files/folder within +current folder. + +`valid-4.0.vcf`, `valid-4.0.vcf.gz`, `valid-4.0.vcf.bz2` are used to test +version 4.0 uncompressed, gzip, bzip VCF file, respectively. + +`valid-4.1-large.vcf`, `valid-4.1-large.vcf.gz` are used to test version 4.1 +uncompressed, gzip VCF file, respectively. + +`valid-4.2.vcf`, `valid-4.2.vcf.gz` are used to test version 4.2 uncompressed, +gzip VCF file, respectively. 
+ +`invalid-4.0-AF-field-removed.vcf` is created by removing `AF` field definition +from the meta-information based on `valid-4.0.vcf`. It is used to test `AF` +field can be parsed correctly given a representative_header_file contains `AF`. + +`invalid-4.0-POS-empty.vcf` is created based on `valid-4.0.vcf` by removing the +POS value for the first entry. It is used to test when `allow_malformed_records` +is enabled, failed VCF record reads will not raise errors and the BQ can still +generate correctly. + +The folder `merge` is created to test the merge options. Three .vcf files are +created. `merge1.vcf` contains two samples, while `merge2.vcf` and `merge3.vcf` +contain one other sample, respectively. When MOVE_TO_CALLS is selected, the +variant call where `POS = 14370` is meant to merge across three files, while the +call where `POS = 1234567` is designed to be merged for `merge1.vcf` and +`merge2.vcf`. diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json index 56feb6787..422335268 100644 --- a/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_info_keys_to_move_to_calls_regex.json @@ -3,7 +3,7 @@ "table_name": "merge_option_info_keys_to_move_to_calls_regex", "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/merge/*.vcf", "variant_merge_strategy": "MOVE_TO_CALLS", - "info_keys_to_move_to_calls_regex": "NS", + "info_keys_to_move_to_calls_regex": "^NS$", "runner": "DataflowRunner", "assertion_configs": [ { @@ -20,35 +20,35 @@ }, { "query": [ - "SELECT call.NS AS num_samples ", + "SELECT call.NS AS NS ", "FROM {TABLE_NAME} AS t, t.call as call ", "WHERE start_position = 14369 AND call.name ='NA00001'" ], - "expected_result": {"num_samples": 2} + 
"expected_result": {"NS": 2} }, { "query": [ - "SELECT call.NS AS num_samples ", + "SELECT call.NS AS NS ", "FROM {TABLE_NAME} AS t, t.call as call ", "WHERE start_position = 14369 AND call.name ='NA00002'" ], - "expected_result": {"num_samples": 2} + "expected_result": {"NS": 2} }, { "query": [ - "SELECT call.NS AS num_samples ", + "SELECT call.NS AS NS ", "FROM {TABLE_NAME} AS t, t.call as call ", "WHERE start_position = 14369 AND call.name ='NA00003'" ], - "expected_result": {"num_samples": 1} + "expected_result": {"NS": 1} }, { "query": [ - "SELECT call.NS AS num_samples ", + "SELECT call.NS AS NS ", "FROM {TABLE_NAME} AS t, t.call as call ", "WHERE start_position = 14369 AND call.name ='NA00004'" ], - "expected_result": {"num_samples": 1} + "expected_result": {"NS": 1} } ] } diff --git a/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json index 6c8867d13..be58d93e4 100644 --- a/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json +++ b/gcp_variant_transforms/testing/integration/small_tests/valid_4_2_VEP.json @@ -2,7 +2,7 @@ "test_name": "valid-4-2-vep", "table_name": "valid_4_2_VEP", "input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2_VEP.vcf", - "annotation_field": "CSQ", + "annotation_fields": "CSQ", "runner": "DataflowRunner", "assertion_configs": [ { From ad7e1e928ff2f6a6267670eb41c3cee53eb67aa1 Mon Sep 17 00:00:00 2001 From: yifangchen Date: Mon, 12 Mar 2018 11:08:11 -0400 Subject: [PATCH 3/4] Update README --- gcp_variant_transforms/testing/data/vcf/README.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/gcp_variant_transforms/testing/data/vcf/README.md b/gcp_variant_transforms/testing/data/vcf/README.md index 61d566b07..2cb6664e6 100644 --- a/gcp_variant_transforms/testing/data/vcf/README.md +++ b/gcp_variant_transforms/testing/data/vcf/README.md @@ -2,7 +2,9 @@ This file 
summarizes the contents and the purpose for each files/folder within current folder. `valid-4.0.vcf`, `valid-4.0.vcf.gz`, `valid-4.0.vcf.bz2` are used to test -version 4.0 uncompressed, gzip, bzip VCF file, respectively. +Variant Call Format version 4.0 files in the form of uncompressed, gzip and +bzip, respectively. For more details on the VCF format version specifications, +please refer to [VCF Specification](https://samtools.github.io/hts-specs/). `valid-4.1-large.vcf`, `valid-4.1-large.vcf.gz` are used to test version 4.1 uncompressed, gzip VCF file, respectively. @@ -12,16 +14,17 @@ gzip VCF file, respectively. `invalid-4.0-AF-field-removed.vcf` is created by removing `AF` field definition from the meta-information based on `valid-4.0.vcf`. It is used to test `AF` -field can be parsed correctly given a representative_header_file contains `AF`. +field can be parsed correctly given a representative_header_file containing +`AF`. `invalid-4.0-POS-empty.vcf` is created based on `valid-4.0.vcf` by removing the POS value for the first entry. It is used to test when `allow_malformed_records` -is enabled, failed VCF record reads will not raise errors and the BQ can still -generate correctly. +is enabled, failed VCF record reads will not raise errors and the BigQuery table +can still be generated. The folder `merge` is created to test the merge options. Three .vcf files are created. `merge1.vcf` contains two samples, while `merge2.vcf` and `merge3.vcf` contain one other sample, respectively. When MOVE_TO_CALLS is selected, the -variant call where `POS = 14370` is meant to merge across three files, while the -call where `POS = 1234567` is designed to be merged for `merge1.vcf` and +variant call with `POS = 14370` is meant to merge across three files, while the +call with `POS = 1234567` is designed to be merged for `merge1.vcf` and `merge2.vcf`. 
From 078a5f8a38747e84fb527c23786452fa39860ffa Mon Sep 17 00:00:00 2001 From: yifangchen Date: Tue, 13 Mar 2018 10:11:48 -0400 Subject: [PATCH 4/4] Add test case for merge options --- .../small_tests/merge_option_move_to_calls.json | 7 +++++++ .../testing/integration/small_tests/merge_option_none.json | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json index 26c7e6106..feb9f1f15 100644 --- a/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_move_to_calls.json @@ -16,6 +16,13 @@ { "query": ["SUM_END_QUERY"], "expected_result": {"sum_end": 1283560} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows FROM {TABLE_NAME} ", + "WHERE start_position = 14369" + ], + "expected_result": {"num_rows": 1} } ] } diff --git a/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json b/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json index 468b6e142..fdd5f705b 100644 --- a/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json +++ b/gcp_variant_transforms/testing/integration/small_tests/merge_option_none.json @@ -15,6 +15,13 @@ { "query": ["SUM_END_QUERY"], "expected_result": {"sum_end": 2546870} + }, + { + "query": [ + "SELECT COUNT(0) AS num_rows FROM {TABLE_NAME} ", + "WHERE start_position = 14369" + ], + "expected_result": {"num_rows": 3} } ] }