Skip to content

Commit

Permalink
Merge 843c67e into 61721cf
Browse files Browse the repository at this point in the history
  • Loading branch information
mlin committed Dec 5, 2019
2 parents 61721cf + 843c67e commit 36ed691
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 10 deletions.
3 changes: 3 additions & 0 deletions src/cli_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,7 @@ static const char* config_presets_yml = R"eof(
genotyper_config:
required_dp: 0
revise_genotypes: true
allow_partial_data: true
more_PL: true
trim_uncalled_alleles: true
liftover_fields:
Expand Down Expand Up @@ -817,6 +818,7 @@ static const char* config_presets_yml = R"eof(
genotyper_config:
required_dp: 0
revise_genotypes: true
allow_partial_data: true
more_PL: true
trim_uncalled_alleles: true
liftover_fields:
Expand Down Expand Up @@ -863,6 +865,7 @@ static const char* config_presets_yml = R"eof(
genotyper_config:
required_dp: 0
revise_genotypes: true
allow_partial_data: true
more_PL: true
trim_uncalled_alleles: true
liftover_fields:
Expand Down
22 changes: 13 additions & 9 deletions src/genotyper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,12 @@ Status revise_genotypes(const genotyper_config& cfg, const unified_site& us,
for (const auto& sample : sample_mapping) {
assert(sample.first < vr.p->n_sample);
auto al1 = vr.gt.v[sample.first*2];
if (!bcf_gt_is_missing(al1) && vr.allele_mapping.at(bcf_gt_allele(al1)) == -1) {
if (!bcf_gt_is_missing(al1) && vr.allele_mapping.at(bcf_gt_allele(al1)) == -1 && !us.monoallelic) {
needs_revision = true;
break;
}
auto al2 = vr.gt.v[sample.first*2+1];
if (!bcf_gt_is_missing(al2) && vr.allele_mapping.at(bcf_gt_allele(al2)) == -1) {
if (!bcf_gt_is_missing(al2) && vr.allele_mapping.at(bcf_gt_allele(al2)) == -1 && !us.monoallelic) {
needs_revision = true;
break;
}
Expand Down Expand Up @@ -155,7 +155,9 @@ Status revise_genotypes(const genotyper_config& cfg, const unified_site& us,
for (unsigned gt = 0; gt < gt_log_prior.size(); gt++) {
auto als = diploid::gt_alleles(gt);
if (vr.allele_mapping.at(als.first) == -1 || vr.allele_mapping.at(als.second) == -1) {
gt_log_prior[gt] = lost_log_prior;
if (!us.monoallelic) {
gt_log_prior[gt] = lost_log_prior;
}
} else if (als.first > 0 && als.first == als.second) {
gt_log_prior[gt] = log(std::max(us.alleles[vr.allele_mapping[als.first]].frequency,
cfg.min_assumed_allele_frequency));
Expand Down Expand Up @@ -310,12 +312,14 @@ Status prepare_dataset_records(const genotyper_config& cfg, const unified_site&
// We exclude 'haploid' records from this treatment for now as observed
// examples (e.g. in Strelka2 gVCFs) don't seem to require it, but this
// may need to be configurable in the future.
for (const auto& rp : all_records) {
if (rp->is_ref && !rp->was_haploid) {
for (unsigned i = 0; i < 2*rp->p->n_sample; i++) {
if (bcf_gt_is_missing(rp->gt[i]) || bcf_gt_allele(rp->gt[i]) != 0) {
rnc = NoCallReason::InputNonCalled;
return Status::OK();
if (variant_records.empty() || !cfg.allow_partial_data) {
for (const auto& rp : all_records) {
if (rp->is_ref && !rp->was_haploid) {
for (unsigned i = 0; i < 2*rp->p->n_sample; i++) {
if (bcf_gt_is_missing(rp->gt[i]) || bcf_gt_allele(rp->gt[i]) != 0) {
rnc = NoCallReason::InputNonCalled;
return Status::OK();
}
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -801,7 +801,7 @@ Status retained_format_field::of_yaml(const YAML::Node& yaml, unique_ptr<retaine
V(n_ignore_non_variants.IsScalar(), "invalid ignore_non_variants value");
try {
ignore_non_variants = n_ignore_non_variants.as<bool>();
} catch (YAML::Exception e) {
} catch (YAML::Exception& e) {
V(false, "invalid ignore_non_variants value");
}
}
Expand Down
1 change: 1 addition & 0 deletions test/data/gvcf_test_cases/deepvariant2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,7 @@ config_preset: DeepVariant
genotyper_config:
required_dp: 0
revise_genotypes: false
allow_partial_data: true
more_PL: true
liftover_fields:
- orig_names: [MIN_DP, DP]
Expand Down
71 changes: 71 additions & 0 deletions test/data/gvcf_test_cases/dv_1000G_chr17_2517459.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
readme: |
1000G dv_1000G_chr17_2517459
A site exercising code path for a non-called reference band near a variant record (with allow_partial_data=true)
input:
header : |-
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=RefCall,Description="Genotyping model thinks this site is reference.">
##FILTER=<ID=LowQual,Description="Confidence in this variant being real is below calling threshold."
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all passing filters reads.">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block.">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=VAF,Number=A,Type=Float,Description="Variant allele fractions.">
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihoods, log10 encoded">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype likelihoods, Phred encoded">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##contig=<ID=chr17,length=83257441>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
body:
- NA12878.gvcf: |
NA12878
chr17 2517445 . A <*> 0 . END=2517455 GT:GQ:MIN_DP:PL 0/0:50:30:0,66,899
chr17 2517456 . G <*> 0 . END=2517456 GT:GQ:MIN_DP:PL ./.:0:29:62,0,632
chr17 2517457 . A <*> 0 . END=2517458 GT:GQ:MIN_DP:PL 0/0:50:31:0,93,929
chr17 2517459 . A G,<*> 38.9 PASS . GT:GQ:DP:AD:VAF:PL 0/1:38:28:14,12,0:0.428571,0:38,0,47,990,990,990
chr17 2517460 . A <*> 0 . END=2517461 GT:GQ:MIN_DP:PL 0/0:50:28:0,84,839
- NA19771.gvcf: |
NA19771
chr17 2517445 . A <*> 0 . END=2517455 GT:GQ:MIN_DP:PL 0/0:50:32:0,66,899
chr17 2517456 . GAAA G,<*> 47 PASS . GT:GQ:DP:AD:VAF:PL 0/1:47:32:18,12,0:0.375,0:46,0,65,990,990,990
chr17 2517460 . A <*> 0 . END=2517461 GT:GQ:MIN_DP:PL 0/0:50:20:0,60,599
config_preset: DeepVariant

truth_unified_sites:
- range: {ref: chr17, beg: 2517456, end: 2517459}
in_target: {ref: chr17, beg: 1, end: 1000000000}
alleles:
- dna: GAAA
- dna: G
quality: 46
frequency: 0.25
- dna: GAAG
normalized:
range: {beg: 2517459, end: 2517459}
dna: G
quality: 38
frequency: 0.25
quality: 46


truth_output_vcf:
- truth.vcf: |
##fileformat=VCFv4.2
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency estimate for each alternate allele">
##INFO=<ID=AQ,Number=A,Type=Integer,Description="Allele Quality score reflecting evidence from all samples (Phred scale)">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype likelihoods, Phred encoded">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=RNC,Number=G,Type=Character,Description="Reason for No Call in GT: . = n/a, M = Missing data, P = Partial data, D = insufficient Depth of coverage, - = unrepresentable overlapping deletion, L = Lost/unrepresentable allele (other than deletion), U = multiple Unphased variants present, O = multiple Overlapping variants present">
##contig=<ID=chr17,length=83257441>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA19771
chr17 2517456 chr17_2517456_GAAA_G;chr17_2517459_A_G GAAA G,GAAG 46 . AF=0.25,0.25;AQ=46,38 GT:DP:AD:GQ:PL:RNC 0/2:28:14,0,12:38:38,990,990,0,990,47:.. 0/1:32:18,12,0:47:46,0,65,990,990,990:..
110 changes: 110 additions & 0 deletions test/data/gvcf_test_cases/dv_1000G_chr17_8375536.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
readme: |
1000G dv_1000G_chr17_8375536
A site exercising code path for a non-called reference band near a variant record (with allow_partial_data=true)
input:
header : |-
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=RefCall,Description="Genotyping model thinks this site is reference.">
##FILTER=<ID=LowQual,Description="Confidence in this variant being real is below calling threshold."
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all passing filters reads.">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block.">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=VAF,Number=A,Type=Float,Description="Variant allele fractions.">
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihoods, log10 encoded">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype likelihoods, Phred encoded">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##contig=<ID=chr17,length=83257441>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
body:
- NA12878.gvcf: |
NA12878
chr17 8375505 . A <*> 0 . END=8375535 GT:GQ:MIN_DP:PL 0/0:50:34:0,117,1169
chr17 8375536 . ATT A,AT,<*> 64.6 PASS . GT:GQ:DP:AD:VAF:PL 1/2:26:36:2,11,23,0:0.305556,0.638889,0:64,44,55,44,0,26,990,990,990,990
chr17 8375539 . T <*> 0 . END=8375691 GT:GQ:MIN_DP:PL 0/0:48:25:0,75,749
- NA19771.gvcf: |
NA19771
chr17 8375505 . A <*> 0 . END=8375535 GT:GQ:MIN_DP:PL 0/0:50:26:0,69,929
chr17 8375536 . A G,<*> 56.7 PASS . GT:GQ:DP:AD:VAF:PL 1/1:43:26:1,25,0:0.961538,0:56,42,0,990,990,990
chr17 8375537 . T <*> 0 . END=8375592 GT:GQ:MIN_DP:PL 0/0:50:25:0,78,779
- NA19772.gvcf: |
NA19772
chr17 8375505 . A <*> 0 . END=8375536 GT:GQ:MIN_DP:PL 0/0:50:26:0,69,929
chr17 8375537 . T G,<*> 56.7 PASS . GT:GQ:DP:AD:VAF:PL 1/1:43:26:1,25,0:0.961538,0:56,42,0,990,990,990
chr17 8375538 . T C,<*> 56.7 PASS . GT:GQ:DP:AD:VAF:PL 1/1:43:26:1,25,0:0.961538,0:56,42,0,990,990,990
chr17 8375539 . T <*> 0 . END=8375592 GT:GQ:MIN_DP:PL 0/0:50:25:0,78,779
config_preset: DeepVariant

truth_unified_sites:
- range: {ref: chr17, beg: 8375536, end: 8375536}
alleles:
- dna: A
- dna: G
quality: 56
frequency: 0.333332986
lost_allele_frequency: 0.333333999
quality: 56
- range: {ref: chr17, beg: 8375536, end: 8375537}
alleles:
- dna: AT
- dna: A
quality: 44
frequency: 0.166666999
quality: 44
monoallelic: true
unification:
- range: {beg: 8375536, end: 8375538}
dna: AT
to: 1
- range: {ref: chr17, beg: 8375536, end: 8375538}
alleles:
- dna: ATT
- dna: A
quality: 26
frequency: 0.166666999
quality: 26
monoallelic: true
- range: {ref: chr17, beg: 8375537, end: 8375537}
alleles:
- dna: T
- dna: G
quality: 56
frequency: 0.333332986
lost_allele_frequency: 0.333333999
quality: 56
- range: {ref: chr17, beg: 8375538, end: 8375538}
alleles:
- dna: T
- dna: C
quality: 56
frequency: 0.333332986
lost_allele_frequency: 0.166666999
quality: 56


truth_output_vcf:
- truth.vcf: |
##fileformat=VCFv4.2
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency estimate for each alternate allele">
##INFO=<ID=AQ,Number=A,Type=Integer,Description="Allele Quality score reflecting evidence from all samples (Phred scale)">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype likelihoods, Phred encoded">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=RNC,Number=G,Type=Character,Description="Reason for No Call in GT: . = n/a, M = Missing data, P = Partial data, D = insufficient Depth of coverage, - = unrepresentable overlapping deletion, L = Lost/unrepresentable allele (other than deletion), U = multiple Unphased variants present, O = multiple Overlapping variants present">
##contig=<ID=chr17,length=83257441>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 NA19771 NA19772
chr17 8375536 chr17_8375536_A_G A G 56 . AF=0.333333;AQ=56 GT:DP:AD:GQ:PL:RNC ./.:36:2,0:26:0,0,0:-- 1/1:26:1,25:37:56,42,0:.. 0/0:26:26,0:50:0,69,929:..
chr17 8375536 chr17_8375536_AT_A AT A 44 MONOALLELIC AF=0.166667;AQ=44 GT:DP:AD:GQ:PL:RNC ./1:36:.,23:26:0,0,0:1. ./.:25:.,0:42:0,0,0:11 ./.:26:.,0:42:0,0,0:11
chr17 8375536 chr17_8375536_ATT_A ATT A 26 MONOALLELIC AF=0.166667;AQ=26 GT:DP:AD:GQ:PL:RNC ./1:36:.,11:26:0,0,0:1. ./.:25:.,0:42:0,0,0:11 ./.:26:.,0:42:0,0,0:11
chr17 8375537 chr17_8375537_T_G T G 56 . AF=0.333333;AQ=56 GT:DP:AD:GQ:PL:RNC ./.:36:2,0:26:0,0,0:-- 0/0:25:25,0:50:0,78,779:.. 1/1:26:1,25:37:56,42,0:..
chr17 8375538 chr17_8375538_T_C T C 56 . AF=0.333333;AQ=56 GT:DP:AD:GQ:PL:RNC ./.:36:2,0:26:0,0,0:-- 0/0:25:25,0:50:0,78,779:.. 1/1:26:1,25:37:56,42,0:..
12 changes: 12 additions & 0 deletions test/gvcf_test_cases.cc
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,18 @@ TEST_CASE("dv_1000G_chr21_5583275") {
GVCFTestCase("dv_1000G_chr21_5583275", v_formats, v_infos, true).perform_gvcf_test();
}

TEST_CASE("dv_1000G_chr17_2517459") {
vector<string> v_formats = {"DP", "GT", "GQ", "PL", "AD", "RNC"};
vector<string> v_infos = {"ANR","AF","AQ"};
GVCFTestCase("dv_1000G_chr17_2517459", v_formats, v_infos, false).perform_gvcf_test();
}

TEST_CASE("dv_1000G_chr17_8375536") {
vector<string> v_formats = {"DP", "GT", "GQ", "PL", "AD", "RNC"};
vector<string> v_infos = {"ANR","AF","AQ"};
GVCFTestCase("dv_1000G_chr17_8375536", v_formats, v_infos, false).perform_gvcf_test();
}

TEST_CASE("strelka2") {
vector<string> v_formats = {"DP", "GT", "GQ", "AD", "FT"};
vector<string> v_infos = {"AF","AQ"};
Expand Down

0 comments on commit 36ed691

Please sign in to comment.