Skip to content

Commit

Permalink
Add fixes to improve vcf_caller in calling mode
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 266165858
  • Loading branch information
sgoe1 authored and Copybara-Service committed Aug 29, 2019
1 parent 741296f commit c027549
Show file tree
Hide file tree
Showing 11 changed files with 94 additions and 16 deletions.
16 changes: 13 additions & 3 deletions deepvariant/postprocess_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@
flags.DEFINE_string(
'gvcf_outfile', None,
'Optional. Destination path where we will write the Genomic VCF output.')
flags.DEFINE_boolean(
'group_variants', True, 'If using vcf_caller and multi-allelic sites are '
'split across multiple lines in VCF, set to False so that variants are not '
'grouped when transforming CallVariantsOutput to Variants.')
flags.DEFINE_boolean(
'vcf_stats_report', False, 'Optional. Output a visual report (HTML) of '
'statistics about the output VCF, along with statistics JSON files, '
Expand Down Expand Up @@ -669,7 +673,7 @@ def _sort_grouped_variants(group):
def _transform_call_variants_output_to_variants(input_sorted_tfrecord_path,
qual_filter,
multi_allelic_qual_filter,
sample_name):
sample_name, group_variants):
"""Yields Variant protos in sorted order from CallVariantsOutput protos.
Variants present in the input TFRecord are converted to Variant protos, with
Expand All @@ -685,14 +689,19 @@ def _transform_call_variants_output_to_variants(input_sorted_tfrecord_path,
multi_allelic_qual_filter: double. The qual value below which to filter
multi-allelic variants.
sample_name: str. Sample name to write to VCF file.
group_variants: bool. If true, group variants that have same start and end
position.
Yields:
Variant protos in sorted order representing the CallVariantsOutput calls.
"""
group_fn = None
if group_variants:
group_fn = lambda x: variant_utils.variant_range(x.variant)
for _, group in itertools.groupby(
tfrecord.read_tfrecords(
input_sorted_tfrecord_path, proto=deepvariant_pb2.CallVariantsOutput),
lambda x: variant_utils.variant_range(x.variant)):
group_fn):
outputs = _sort_grouped_variants(group)
canonical_variant, predictions = merge_predictions(
outputs, multi_allelic_qual_filter)
Expand Down Expand Up @@ -969,7 +978,8 @@ def main(argv=()):
input_sorted_tfrecord_path=temp.name,
qual_filter=FLAGS.qual_filter,
multi_allelic_qual_filter=FLAGS.multi_allelic_qual_filter,
sample_name=sample_name)
sample_name=sample_name,
group_variants=FLAGS.group_variants)
variant_generator = haplotypes.maybe_resolve_conflicting_variants(
independent_variants)

Expand Down
32 changes: 25 additions & 7 deletions deepvariant/postprocess_variants_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,14 @@ def _simple_gv(ref_name, start, ref_base):
gls=[-2.99, 0, -0.99] + [postprocess_variants._GVCF_ALT_ALLELE_GL] * 3)


def _read_contents(path, decompress=False):
with tf.gfile.GFile(path, 'rb') as fin:
contents = fin.read()
if decompress:
contents = gzip.GzipFile(path, fileobj=io.BytesIO(contents)).read()
return contents


def _create_nonvariant(ref_name, start, end, ref_base):
"""Creates a non-variant Variant record for testing.
Expand Down Expand Up @@ -297,13 +305,6 @@ def test_call_end2end(self, compressed_inputs_and_outputs):

postprocess_variants.main(['postprocess_variants.py'])

def _read_contents(path, decompress=False):
with tf.gfile.GFile(path, 'rb') as fin:
contents = fin.read()
if decompress:
contents = gzip.GzipFile(path, fileobj=io.BytesIO(contents)).read()
return contents

self.assertEqual(
_read_contents(FLAGS.outfile, compressed_inputs_and_outputs),
_read_contents(testdata.GOLDEN_POSTPROCESS_OUTPUT))
Expand All @@ -315,6 +316,23 @@ def _read_contents(path, decompress=False):
self.assertTrue(tf.gfile.Exists(FLAGS.outfile + '.tbi'))
self.assertTrue(tf.gfile.Exists(FLAGS.gvcf_outfile + '.tbi'))

@flagsaver.FlagSaver
def test_group_variants(self):
FLAGS.infile = testdata.GOLDEN_VCF_CALLER_POSTPROCESS_INPUT
FLAGS.ref = testdata.CHR20_FASTA
FLAGS.outfile = create_outfile('calls.vcf')

FLAGS.group_variants = True
with self.assertRaisesRegexp(
ValueError, '`call_variants_outputs` did not pass sanity check.'):
postprocess_variants.main(['postprocess_variants.py'])

FLAGS.group_variants = False
postprocess_variants.main(['postprocess_variants.py'])
self.assertEqual(
_read_contents(FLAGS.outfile),
_read_contents(testdata.GOLDEN_VCF_CALLER_POSTPROCESS_OUTPUT))

@parameterized.parameters(False, True)
def test_build_index(self, use_csi):
vcf_file_gz = os.path.join(absltest.get_default_test_tmpdir(),
Expand Down
8 changes: 8 additions & 0 deletions deepvariant/testdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def deepvariant_testdata(filename):
WS_ALLELE_COUNT_LINEAR_MODEL = None
WS_ALLELE_COUNT_LINEAR_MODEL_PCKL = None
WS_VARIANT_READS_THRESHOLD_MODEL = None
GOLDEN_VCF_CALLER_POSTPROCESS_INPUT = None
GOLDEN_VCF_CALLER_POSTPROCESS_OUTPUT = None

N_GOLDEN_TRAINING_EXAMPLES = 49
N_GOLDEN_CALLING_EXAMPLES = 82
Expand Down Expand Up @@ -110,6 +112,8 @@ def init():
global WS_ALLELE_COUNT_LINEAR_MODEL
global WS_ALLELE_COUNT_LINEAR_MODEL_PCKL
global WS_VARIANT_READS_THRESHOLD_MODEL
global GOLDEN_VCF_CALLER_POSTPROCESS_INPUT
global GOLDEN_VCF_CALLER_POSTPROCESS_OUTPUT

CHR20_FASTA = deepvariant_testdata('ucsc.hg19.chr20.unittest.fasta.gz')
CHR20_BAM = deepvariant_testdata('NA12878_S1.chr20.10_10p1mb.bam')
Expand Down Expand Up @@ -143,6 +147,10 @@ def init():
'window_selector_allele_count_linear.pckl')
WS_VARIANT_READS_THRESHOLD_MODEL = deepvariant_testdata(
'window_selector_variant_read_threshold.pbtxt')
GOLDEN_VCF_CALLER_POSTPROCESS_INPUT = deepvariant_testdata(
'golden.vcf_caller_postprocess_single_site_input.tfrecord.gz')
GOLDEN_VCF_CALLER_POSTPROCESS_OUTPUT = deepvariant_testdata(
'golden.vcf_caller_postprocess_single_site_output.vcf')

# For CustomizedClassesVariantLabeler.
global CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=RefCall,Description="Genotyping model thinks this site is reference.">
##FILTER=<ID=LowQual,Description="Confidence in this variant being real is below calling threshold.">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position (for use with symbolic alleles)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block.">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=VAF,Number=A,Type=Float,Description="Variant allele fractions.">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods rounded to the closest integer">
##contig=<ID=chr20,length=63025520>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002
chr20 59777010 pbsv.INS.45658 C CCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACCAGGATCCCCTGTAAGTGTCACCTCCATCCTCACCAGGACCCTCCATGAGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCA 1 RefCall . GT:GQ:DP:AD:PL ./.:7:249:192,57:0,11,7
chr20 59777010 pbsv.INS.45657 C CCTCACTCAGGACCCTCCATGAGTGCCACCTCCATCCTCACCAGGATCCCCTGTAAGTGTCACCTCCATCCTCACCAGGACCCTCCATGAGTGTCACCTCCATCCTCAGGACCCTCCATGAGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACCAGGACCCTCCATGAGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCGCCATCCTCA 1 RefCall . GT:GQ:DP:AD:PL ./.:7:240:194,46:0,11,7
chr20 59777010 pbsv.INS.45656 C CCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGGTGTCACCTCCATCCTCA 1 RefCall . GT:GQ:DP:AD:PL ./.:7:240:207,33:0,11,7
chr20 59777553 pbsv.INS.45659 G GTGTCACCTCCATCCTCACTCAGGACCCTCCATGGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGAGTGTCACCTCCATCCTCAGGACCCTCCATGAGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACACTCCATGGTGTCACCTCCATCCTTACTCAGGACCCTCCATGGTGTCACCGCCATCCTCACTCAGGACCCTCCATGAGTGCCACCTCCATCCTCACCAGGATCCCCTGTAAGTGTCACCTCCATCCTCACTCAGGACCCTCCATGAGTGTCACCTCCATCCTCAGGACCCTCCATGGTGTCACCTCCATCCTCAGGACCCTCCATGAGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGAGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACCCTCCATGGTGTCACCTCCATCCTCACTCAGGACACTCCATGGTGTCACCTCCATCCTTACTCAGGACCCTCCATGGTGTCACCGCCATCCTCACTCAGGACCCTCCATGAGTGCCACCTCCATCCTCACCAGGATCCCCTGTAA 0.7 RefCall . GT:GQ:DP:AD:PL ./.:8:258:232,26:0,12,8
chr20 59804673 pbsv.DEL.45636 GATATATATATATATATATATATATATATATATATATATATATATAT G 4.5 PASS . GT:GQ:DP:AD:PL 1/1:2:184:99,85:0,1,0
chr20 59848584 pbsv.INS.45660 T TTGGGGAGTACCGTGTGCAGTTTGGGGGAGTATTGGGGGAGTACCATGTGCAGTTTGGGGGAGGACCATGTGCAGTTTAGGGGAGTACCGTGTGCAGTTTGGGGGAGTATTGGGGAAGTACCATGTGCAGTTTGGGGGAATACCATGTGCAACTTGGGGGAGTACCATGCACAGCTT 0.7 RefCall . GT:GQ:DP:AD:PL ./.:9:202:121,81:0,13,9
chr20 59858360 pbsv.INS.45661 G GGTGTGTGATGTACGTGCATTTGCACGCGTGTGCTGTGGC 0.6 RefCall . GT:GQ:DP:AD:PL ./.:9:242:136,106:0,13,9
chr20 59858389 pbsv.INS.45662 C CGTGTGTGATGTGTGTGCGTTTGCACGCGCGTGCTGTGGCGTGTGTGATGTGTGTGCGTTTGCACGCGCGTGCTGTGGCGTGTGTGATGTGTGTGCGTTTGCACGCGCGTGCTGTGGCGTGTGTGATGTGTGTGCGTTTGCACGCGCGTGCTGTGGCGTGTGTGAT 0.9 RefCall . GT:GQ:DP:AD:PL ./.:7:239:155,84:0,11,7
chr20 59865298 pbsv.DEL.45637 GTTTATCCCGATATCCAATGAGACTTGCTGGA G 1 RefCall . GT:GQ:DP:AD:PL ./.:7:209:169,40:0,11,7
chr20 59884412 pbsv.DEL.45638 CGAGTTTCCAGAACCATGCAGGTGTGCATAGAGGTGTTCAGCTGTCTTCCTT C 0.6 RefCall . GT:GQ:DP:AD:PL ./.:9:222:120,102:0,13,9
chr20 59904402 pbsv.INS.45691 T TCCCTCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCGTGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCTCATGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCTCATGGGTCTGGGACGTGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCAGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCTCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCGTGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCTCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCTAGTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCTCATGGGTCTGGGACGTGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTTGCCTCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCACTGTGCTGCAGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTTCCCCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCACTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCAGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTTCCCTCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCGTGCGTGGAGGGCACCCAGATCCTCATGGTTCCCCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCAGTGGATTGCAGCATGCGTGGAGGGCACCCAGATCCTCATGTTTCCCCCACGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCGTGCGTGGAGGGCACCCAGGTCCTCATGGTTCCCTCGTGGGTCTGGGACGTGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCAGTGGATTGCAGCGTGCGTGGAGGGCACCCAGGTCCTCATGGTC 0.7 RefCall . GT:GQ:DP:AD:PL ./.:8:241:216,25:0,12,8
chr20 59904405 pbsv.INS.45692 C CCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGATCCTCATGGTTCCCCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCACTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTTCCCCCACGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCGTGCGTGGAGGGCACCCAGGTCCTCATGGTCCCCCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGATCCTCATGGTCCCCCCGTGGGTCTGGGACGCGGGTCGAGGTGGCTTTAAGCCCAGTGTGCTGCGGTGGATTGCAGCATGCGTGGAGGGCACCCAGGTCCTCATGGTCCCC 0.9 RefCall . GT:GQ:DP:AD:PL ./.:7:241:196,45:0,11,7
chr20 59906638 pbsv.INS.45693 C CCCCAGGCCCACCTCCTGCCCCGGCCACCTGCCTCAGATCTACCTCCTGCCCCGGCCACCTGCCCCAGACCCACCTCCTGCCCCGGCCACCTGCCCCAGACCCACCTCCTGACCCGACCACCTCCCCCAGGCTCACCTCCTGCCCCGGCCACCTGCCCTAGAGCCACCTCCTGCCCCGGCCACCTGCCTCAGGCCCACCTCCTGCCCCGGCCACCTGCCCCAGATCTACCTCCTGACCCGACCACCTCCCCCAGGCTCACCTCCTGCCCTGGCCACCTGCCCCAGACCCACCTCCTGCCCCGGCCACCTGCCCCAGAGCCACCTCCTGACCCGACCACCTCCCCCAGGCTCACCTCCTGCCCCGACCACCTGCCCCAGGCCCACCTCCTGCCCCAGACCCACCTCCTGCCTCAGCCACCTGCCCCAGACTCACCTCCTGCCCCGGCCACCTGCCCCAGAGCCACCTCCTGCCCCGGCCACCTACCCCAGACCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCAGACCCACCTCCTGCCTCAGCCACCTGCCCCAGACTCACCTCCTGCCCCGGCCACCTGCCCCAGAGCCACCTCCTGCCCCGGCCACCTACCCCAGACCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCCGCCAACTGCCCCAGGTCTACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCGGTCACCTGTCCCAGCCCACCTCCTGCCCCGGCCACCTGCCCCAGGCCCACCTCCTGCCCCGGCCACCTGT 1.1 RefCall . GT:GQ:DP:AD:PL ./.:7:239:214,25:0,11,7
chr20 59912354 pbsv.INS.45694 A ATGATGATAATGATGGTGGTATGATGATGTGGATGGTAATGGTGATGATAACAGTGGATAATGATGGCAATGCAAGTGATGATATTGATGATGATGGTGATGACAGTGCTGATAATGATCATAATGGTGATGATGATGATGGTGATTATTGTAATAGTGATGACAGTGATGATGATAATGATGGTGATAAGAGTGGATAATGATAATAATGATGATGGTGATTTTAATAGTGATGATGATGTTAAGATGATTATGGTGATGATGATAGTGATGATGATGGTGATGATAACAGTGGATAC 0.6 RefCall . GT:GQ:DP:AD:PL ./.:9:222:124,98:0,12,9
chr20 59928659 pbsv.INS.45695 A ACTGCCTTCCTCCCCCTCCTCCCCATCTTTCTCCCCTTCCCCCTCCCCCCTCCTCTGTCTTCCCCCTACTCCTCCCCTTCTCCCTCCCCCTTCTCCCCCTCTTCCTCCCCTTCCCCCCTCCTCTTCCCCCCTCCTCCTCCTCCCCTTCCCCCTCCCATTACCCCTTCTCCTCCTCCTCTTCCTCCCCCTCCTCCTCTTCTTCCTCCTCTTCCCTCTCCTCCTCCTCCCCCTCCTCCCCCTCCCCCTGTTCCCCTTTTCCTCTGGAAGGCGATGAGAATACAGTATGATGATTCACCCTTCCCCTCCCCCTCCCCCTCCCTCCTCCTCCCCCTACTCTTCCTCCCCCTCCTCCTCCCTCTCCCCTCCCTCTCCTTCCCCTCCCTCCTCCTCCTCCCCCTCCCCTCCTCCTCCTCTTCCCCCTCCTCCTCTTCTTCCTCCCTTCCCCCTCCTCCTCCTCCCTCTCCTCCTCCTCCCCCTGTTCCCCTTTTCCTCTGGAAGGTGATGAGAGTACAGTATGATGATTCACCCTCCCCCCCCCTCCTCCCTCCCCCTCCCCTCCTCCTCCTCTTCCCCCTCCACCTCTT 0.6 RefCall . GT:GQ:DP:AD:PL ./.:9:96:61,35:0,13,9
chr20 59951039 pbsv.INS.45696 T TTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGATGATAGTGGGGTTTATTGATGGTAGTGTGATGGTCTTGGTGGTGCTGATAATGGTGTGGTTTGTTGATGATAGTGTGATGGTCTTGGTGGTGGTGGCGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTATGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGGTGGTGATGATGTAGTTTGTTGATGGTAGCGTGATGTTCTTGGTGCTGGTGGTGGTGATGATGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGGTGGTGGTGATAGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGATGCTGGTGATGGTATTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTCGTGGTGGTGGTGTGGTTTGTCGATGGTAGTGTGGTGGTCTTGGTGCTGGTGGTTGTCGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGGTGATGTGGTTTGTTGATGATAGTGTGATGGTCTTGGTGGTGGTAGTGATGGTGTGGTTTGC 0.5 RefCall . GT:GQ:DP:AD:PL ./.:10:243:159,84:0,14,11
chr20 59958678 pbsv.INS.45697 G GATACATATCTATATATGATATATATGAATATATGAATATATGATACGTATGAATATATATGATATATATGGATATATGATACGTATGAATATATATCATATATGTATATATGATATGATATATATATGAATATATGATATATATGAACATATATATGGATATTATGAATATATATGATATATATGAATATATATGGATATATATGAATATATATGATATATGAATATATATGGATATATATGAATATATATGATTGATATATATCATATATGTATGCATATATGTGATATATATCATATATGA 0.3 RefCall . GT:GQ:DP:AD:PL ./.:12:212:129,83:0,16,13
chr20 59958834 pbsv.DEL.45664 TATATATGAAGATATATATGC T 0.2 RefCall . GT:GQ:DP:AD:PL ./.:14:208:118,90:0,18,16
chr20 59965316 pbsv.INS.45698 C CGGTCATAGGGTGCCCATAGTGCCGTGTCTGGGAAACCCCGATTGAGATCGTGCAGTCATAGGGTGCCCATAGCACCATGTCTAGGAAACCCCGATTGGGTTCGTGCAGTCGTAGGGTGCCCATAGTGCCGTGTCTGGGAAACCCCAATTGAGATCGTAT 1 RefCall . GT:GQ:DP:AD:PL ./.:7:201:116,85:0,11,7
chr20 59965633 pbsv.INS.45699 G GCGGTCATAGGGTGCCCATAGTCCCGTGTCTGGGAAGCCCCAATTGAGATCGTA 0.7 RefCall . GT:GQ:DP:AD:PL ./.:8:210:150,60:0,12,8
chr20 59965721 pbsv.INS.45700 A AACCCCAATTGAGATCGTACGGTCATAGGGTGCCCATAGTGCCGTGTCTGGGAAACCCCGATTGAGATCGTGCGGTCATAGGGTGCCCATAGCACCGTGTCTGGGAG 1.1 RefCall . GT:GQ:DP:AD:PL ./.:7:206:174,32:0,11,6
chr20 59974166 pbsv.DEL.45665 AAAAACCCACTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCACTCTGTCGCCCAGGCCGGACTGCGGACTGCAGTGGCGCAATCTCGGCTCACTGCAAGCTCCGCTTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCCACCATGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCTTGTTAGCCAGGATGGTCTCGATCTCCTGACCTCATGATCCACCCGCCTCAGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCGGCC A 1.8 RefCall . GT:GQ:DP:AD:PL ./.:5:262:209,53:0,8,4
Binary file modified deepvariant/testdata/ucsc.hg19.chr20.unittest.fasta.gz
Binary file not shown.
Binary file modified deepvariant/testdata/ucsc.hg19.chr20.unittest.fasta.gz.gzi
Binary file not shown.
Binary file not shown.
Binary file not shown.
18 changes: 12 additions & 6 deletions deepvariant/variant_calling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ namespace learning {
namespace genomics {
namespace deepvariant {

using nucleus::genomics::v1::Range;
using nucleus::genomics::v1::Variant;
using nucleus::genomics::v1::VariantCall;
using tensorflow::string;
Expand Down Expand Up @@ -326,21 +327,26 @@ std::vector<DeepVariantCall> VariantCaller::CallsFromVcf(
const AlleleCounter& allele_counter,
nucleus::VcfReader* vcf_reader_ptr) const {
std::vector<AlleleCount> allele_counts = allele_counter.Counts();
nucleus::genomics::v1::Range range = allele_counter.Interval();
std::vector<nucleus::genomics::v1::Variant> variants_in_region;
Range range = allele_counter.Interval();
std::vector<Variant> variants_in_region;
// An error here is fatal and whole program will fail if unable to query vcf.
std::shared_ptr<nucleus::VariantIterable> variants =
vcf_reader_ptr->Query(range).ValueOrDie();
for (const auto& v : variants) {
variants_in_region.push_back(*v.ValueOrDie());
const Variant* variant = v.ValueOrDie();
// This ensures we only keep variants that start in this region. By default,
// vcf_reader->Query() returns all variants that overlap a region, which
// can incorrectly cause the same variant to be processed multiple times.
if (variant->start() >= range.start()) {
variants_in_region.push_back(*variant);
}
}
return CallsFromVariantsInRegion(allele_counts, variants_in_region);
}

std::vector<DeepVariantCall> VariantCaller::CallsFromVariantsInRegion(
const std::vector<AlleleCount>& allele_counts,
const std::vector<nucleus::genomics::v1::Variant>& variants_in_region)
const {
const std::vector<Variant>& variants_in_region) const {
std::vector<DeepVariantCall> calls;
// For each variant in the region, loop through AlleleCounts to find a match
// to the variant position. At each match, add the supporting reads.
Expand All @@ -354,7 +360,7 @@ std::vector<DeepVariantCall> VariantCaller::CallsFromVariantsInRegion(
}

optional<DeepVariantCall> VariantCaller::ComputeVariant(
const nucleus::genomics::v1::Variant& variant,
const Variant& variant,
const std::vector<AlleleCount>& allele_counts) const {
DeepVariantCall call;
*call.mutable_variant() = variant;
Expand Down

0 comments on commit c027549

Please sign in to comment.