Skip to content

Commit

Permalink
Merge pull request #233 from dib-lab/microtrios
Browse files Browse the repository at this point in the history
New microtrios for reduced (?) test/CI build times
  • Loading branch information
standage committed Mar 29, 2018
2 parents 3a001f1 + a52b7f2 commit f98a15e
Show file tree
Hide file tree
Showing 23 changed files with 102 additions and 48 deletions.
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
SHELL=bash


## #≠≠≠≠≠ build targets ≠≠≠≠≠#

## help: print this help message and exit
Expand All @@ -8,7 +9,7 @@ help: Makefile

## devenv: install software development pre-requisites
devenv:
pip install --upgrade pip setuptools pytest pytest-cov pycodestyle cython sphinx sphinx-argparse
pip install --upgrade pip setuptools pytest pytest-cov pytest-xdist pycodestyle cython sphinx sphinx-argparse

## style: check Python code style against PEP8
style:
Expand All @@ -22,6 +23,10 @@ ext: kevlar/alignment.c src/align.c inc/align.h
test: ext
py.test --cov=kevlar kevlar/tests/*.py -m 'not long and not toolong'

## test4: execute the automated test suite using 4 parallel worker processes
test4: ext
py.test -n=4 --cov=kevlar kevlar/tests/*.py -m 'not long and not toolong'

## testmore: execute the automated test suite, including longer-running tests
testmore: ext
py.test -v --cov=kevlar kevlar/tests/*.py -m 'not toolong'
Expand Down
1 change: 1 addition & 0 deletions kevlar/tests/data/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pico-trio-refr.fa.gz.*
fiveparts-refr.fa.gz.*
inf-mate-dist/*.genome.fa.gz.*
calls-*.vcf
4 changes: 4 additions & 0 deletions kevlar/tests/data/microtrios/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.fa
*.fasta
*.fastq
refr-*.fa.gz.*
27 changes: 27 additions & 0 deletions kevlar/tests/data/microtrios/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
#
# Provenance record for the "microtrios" test data: the commands used to
# simulate three small trios (labelled li, na and k). Paths are relative to
# the repository root. Every step uses a fixed seed so that the data can be
# regenerated.
# Uncompressed intermediates (*.fa, *.fasta, *.fastq) are git-ignored in this
# directory.
# NOTE(review): the refr-*.fa.gz files committed alongside this script are not
# produced by any command below; presumably the references were gzipped in a
# separate step -- confirm.

# Step 1: simulate one 20,000-length reference sequence per trio from the
# order-6 model file (presumably a Markov model trained on human sequence --
# confirm), using a distinct seed for each trio.
nuclmm simulate --order 6 --numseqs 1 --seqlen 20000 --seed 2468 --out kevlar/tests/data/microtrios/refr-li.fa notebook/human-sim-pico/human.order6.mm
nuclmm simulate --order 6 --numseqs 1 --seqlen 20000 --seed 1357 --out kevlar/tests/data/microtrios/refr-na.fa notebook/human-sim-pico/human.order6.mm
nuclmm simulate --order 6 --numseqs 1 --seqlen 20000 --seed 1235 --out kevlar/tests/data/microtrios/refr-k.fa notebook/human-sim-pico/human.order6.mm

# Step 2: simulate a trio from each reference with 2 inherited variants and
# 1 de novo variant, SNVs only (--weights snv=1.0). The variants are written
# to variants-<trio>.vcf; the per-individual genome files consumed in step 3
# (trio-<trio>-{father,mother,proband}.fasta) are presumably derived from
# --prefix -- confirm against `kevlar gentrio --help`.
kevlar gentrio --inherited 2 --de-novo 1 --vcf kevlar/tests/data/microtrios/variants-li.vcf --prefix kevlar/tests/data/microtrios/trio-li --weights snv=1.0 --seed 102938 kevlar/tests/data/microtrios/refr-li.fa
kevlar gentrio --inherited 2 --de-novo 1 --vcf kevlar/tests/data/microtrios/variants-na.vcf --prefix kevlar/tests/data/microtrios/trio-na --weights snv=1.0 --seed 475656 kevlar/tests/data/microtrios/refr-na.fa
kevlar gentrio --inherited 2 --de-novo 1 --vcf kevlar/tests/data/microtrios/variants-k.vcf --prefix kevlar/tests/data/microtrios/trio-k --weights snv=1.0 --seed 928374 kevlar/tests/data/microtrios/refr-k.fa

# Step 3: simulate paired-end reads for each of the 9 individuals with wgsim:
#   -e 0.005       base error rate
#   -r 0.0         mutation rate (no additional mutations introduced by wgsim)
#   -d 450 -s 50   outer distance between read ends and its standard deviation
#   -N 3000        number of read pairs
#   -1 100 -2 100  lengths of the first and second read
#   -S <seed>      random seed, distinct for each individual
# The trailing `-{1,2}.fastq` relies on bash brace expansion to supply the two
# output file names (first and second reads of each pair).
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 111111 kevlar/tests/data/microtrios/trio-li-father.fasta kevlar/tests/data/microtrios/trio-li-father-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 222222 kevlar/tests/data/microtrios/trio-li-mother.fasta kevlar/tests/data/microtrios/trio-li-mother-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 333333 kevlar/tests/data/microtrios/trio-li-proband.fasta kevlar/tests/data/microtrios/trio-li-proband-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 444444 kevlar/tests/data/microtrios/trio-na-father.fasta kevlar/tests/data/microtrios/trio-na-father-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 555555 kevlar/tests/data/microtrios/trio-na-mother.fasta kevlar/tests/data/microtrios/trio-na-mother-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 666666 kevlar/tests/data/microtrios/trio-na-proband.fasta kevlar/tests/data/microtrios/trio-na-proband-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 777777 kevlar/tests/data/microtrios/trio-k-father.fasta kevlar/tests/data/microtrios/trio-k-father-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 888888 kevlar/tests/data/microtrios/trio-k-mother.fasta kevlar/tests/data/microtrios/trio-k-mother-{1,2}.fastq
wgsim -e 0.005 -r 0.0 -d 450 -s 50 -N 3000 -1 100 -2 100 -S 999999 kevlar/tests/data/microtrios/trio-k-proband.fasta kevlar/tests/data/microtrios/trio-k-proband-{1,2}.fastq

# Step 4: for every trio/individual combination, interleave the two paired
# FASTQ files into a single stream and gzip it into trio-<trio>-<ind>.fq.gz,
# the file name the test suite loads.
for trio in li na k
do
for ind in father mother proband
do
interleave-reads.py kevlar/tests/data/microtrios/trio-${trio}-${ind}-{1,2}.fastq | gzip -c > kevlar/tests/data/microtrios/trio-${trio}-${ind}.fq.gz
done
done
Binary file added kevlar/tests/data/microtrios/novel-na.augfastq.gz
Binary file not shown.
Binary file added kevlar/tests/data/microtrios/refr-k.fa.gz
Binary file not shown.
Binary file added kevlar/tests/data/microtrios/refr-li.fa.gz
Binary file not shown.
Binary file added kevlar/tests/data/microtrios/refr-na.fa.gz
Binary file not shown.
Binary file added kevlar/tests/data/microtrios/trio-k-father.fq.gz
Binary file not shown.
Binary file added kevlar/tests/data/microtrios/trio-k-mother.fq.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
9 changes: 9 additions & 0 deletions kevlar/tests/data/microtrios/variants-k.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.2
##source=kevlar::gentrio
##INFO=<GT,Number=3,Type=String,Description="Genotypes of each individual in the trio (proband, mother, father)">
##INFO=<VW,Number=1,Type=String,Description="Genomic interval bounding all k-mers that contain the alternate allele">
##INFO=<RW,Number=1,Type=String,Description="Genomic interval bounding all k-mers that contain the reference allele">
#CHROM POS ID REF ALT QUAL FILTER INFO
seq1 1356 . C A . PASS GT=0/0,0/1,0/0;RW=TTAGTCTTAAGTCTTCTTTAGAGTCACCCTCCCTTTTTAAAGTTGTTCTCCTATACTTCCC;VW=TTAGTCTTAAGTCTTCTTTAGAGTCACCCTACCTTTTTAAAGTTGTTCTCCTATACTTCCC
seq1 7255 . T A . PASS GT=0/0,0/1,0/1;RW=CTTGTACTCCATCTGAGACCGGGAGGGAGATATCAGGATTATGACTTTGGGCTATGGATTG;VW=CTTGTACTCCATCTGAGACCGGGAGGGAGAAATCAGGATTATGACTTTGGGCTATGGATTG
seq1 16661 . A C . PASS GT=0/1,0/0,0/0;RW=AGCAATCTCCTTTGAAACTGACTCAAAATTAGCAAACAGTAAGCAGCCGCCACCCAGCCTC;VW=AGCAATCTCCTTTGAAACTGACTCAAAATTCGCAAACAGTAAGCAGCCGCCACCCAGCCTC
9 changes: 9 additions & 0 deletions kevlar/tests/data/microtrios/variants-li.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.2
##source=kevlar::gentrio
##INFO=<GT,Number=3,Type=String,Description="Genotypes of each individual in the trio (proband, mother, father)">
##INFO=<VW,Number=1,Type=String,Description="Genomic interval bounding all k-mers that contain the alternate allele">
##INFO=<RW,Number=1,Type=String,Description="Genomic interval bounding all k-mers that contain the reference allele">
#CHROM POS ID REF ALT QUAL FILTER INFO
seq1 9279 . A T . PASS GT=1/0,1/0,1/1;RW=ATTTTTCTTCTCTTTTTAGTCACTGCAACCAGGACATCACCCTGCAGCCATTTATTTCCAA;VW=ATTTTTCTTCTCTTTTTAGTCACTGCAACCTGGACATCACCCTGCAGCCATTTATTTCCAA
seq1 10367 . G C . PASS GT=1/0,0/1,0/0;RW=TGATCTGGCTCAGACCTGCCAGTGAAGGGGGGAGCTTCATTGGTATTTCAAGACCTGTGAA;VW=TGATCTGGCTCAGACCTGCCAGTGAAGGGGCGAGCTTCATTGGTATTTCAAGACCTGTGAA
seq1 14742 . T C . PASS GT=1/0,0/0,0/0;RW=AATTCATTTCTCCCAACTCACACCACCTGTTTATTCCAGGCTGCATTTACTATTTTAAAAT;VW=AATTCATTTCTCCCAACTCACACCACCTGTCTATTCCAGGCTGCATTTACTATTTTAAAAT
9 changes: 9 additions & 0 deletions kevlar/tests/data/microtrios/variants-na.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.2
##source=kevlar::gentrio
##INFO=<GT,Number=3,Type=String,Description="Genotypes of each individual in the trio (proband, mother, father)">
##INFO=<VW,Number=1,Type=String,Description="Genomic interval bounding all k-mers that contain the alternate allele">
##INFO=<RW,Number=1,Type=String,Description="Genomic interval bounding all k-mers that contain the reference allele">
#CHROM POS ID REF ALT QUAL FILTER INFO
seq1 5018 . A T . PASS GT=1/1,1/1,1/1;RW=GATCACCCACATACCACCCTGCACAGAGACAGGGTACCAACCTCCCTATCATCCTTCTCCA;VW=GATCACCCACATACCACCCTGCACAGAGACTGGGTACCAACCTCCCTATCATCCTTCTCCA
seq1 6691 . A T . PASS GT=1/0,0/0,0/0;RW=AACTGCCCGCCTTGGGGTCTATAGTTATCCAGGGAACAGAGACGCGGAGAGGAAAACCATA;VW=AACTGCCCGCCTTGGGGTCTATAGTTATCCTGGGAACAGAGACGCGGAGAGGAAAACCATA
seq1 17260 . T A . PASS GT=1/1,1/0,1/1;RW=AAATATGGCAAGGATCTACTAACACTGCTTTCATTTGATAGCTGGGAGTATAATCCGGATA;VW=AAATATGGCAAGGATCTACTAACACTGCTTACATTTGATAGCTGGGAGTATAATCCGGATA
6 changes: 0 additions & 6 deletions kevlar/tests/test_alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,7 @@ def test_pico_4(greedy, capsys):
'ACCCGCAAGCACACCGCTTTCAGTGTGTCACATGCACA'),
(5, 1175767, 'T', 'C'),
(6, 185751, 'TCAAACTCTGGCATTATACATAGGGTTCCCG', 'T'),
(7, 2265794, 'GCAGGGTACATAAGAGTCCATTGTGCCTGTATTATTTTGAGCAATGGCTAAAGTACCTTC'
'ACCCTTGCTC', 'G'),
(8, 636698, 'C', 'A'),
(9, 226610, 'TTCAACTCTACAGGGTCTGATGCTTACAGGAGTTCCCTTTTCCTACATTTGGTTCAAGATG'
'GCAACAAATACATTTTAGATTCACATAGCTCATCCTTCTAGGTTAACAGTAAACTTAAGAA'
'CTAAGACCAGAACCAGGAGGGTCAGGAAATTCTCCTGTGTGGTTGCTGGGACCACTGCAAA'
'GCAGTGGC', 'T'),
(10, 1527138, 'C', 'CTCCTGGTCTGCCACGGTTGACTTGCCTACATAT'),
])
def test_pico_calls(cc, pos, ref, alt):
Expand Down
58 changes: 25 additions & 33 deletions kevlar/tests/test_novel.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,25 +78,18 @@ def test_assumptions(kmer):


@pytest.mark.long
@pytest.mark.parametrize('case,ctrl,mem', [
('trio1/case1.fq', 'trio1/ctrl[1,2].fq', '500K'),
('trio1/case2.fq', 'trio1/ctrl[1,2].fq', '1M'),
('trio1/case3.fq', 'trio1/ctrl[1,2].fq', '1M'),
('trio1/case4.fq', 'trio1/ctrl[1,2].fq', '500K'),
('trio1/case5.fq', 'trio1/ctrl[3,4].fq', '1M'),
('trio1/case6.fq', 'trio1/ctrl[5,6].fq', '1M'),
('trio1/case7.fq', 'trio1/ctrl[5,6].fq', '1M'),
@pytest.mark.parametrize('case,ctrl', [
('microtrios/trio-li-proband.fq.gz', 'microtrios/trio-li-??ther.fq.gz'),
('microtrios/trio-na-proband.fq.gz', 'microtrios/trio-na-??ther.fq.gz'),
('microtrios/trio-k-proband.fq.gz', 'microtrios/trio-k-??ther.fq.gz'),
])
def test_novel_single_mutation(case, ctrl, mem, capsys):
from sys import stdout, stderr
def test_novel_single_mutation(case, ctrl, capsys):
casestr = data_file(case)
ctrls = kevlar.tests.data_glob(ctrl)
arglist = ['novel', '--case', casestr, '--ksize', '13', '--case-min', '8',
arglist = ['novel', '--case', casestr, '--ksize', '25', '--case-min', '7',
'--control', ctrls[0], '--control', ctrls[1],
'--ctrl-max', '0', '--memory', mem]
'--ctrl-max', '0', '--memory', '500K']
args = kevlar.cli.parser().parse_args(arglist)
args.out = None
args.err = stderr
kevlar.novel.main(args)
out, err = capsys.readouterr()

Expand All @@ -108,7 +101,7 @@ def test_novel_single_mutation(case, ctrl, mem, capsys):
case = int(abundmatch.group(1))
ctl1 = int(abundmatch.group(2))
ctl2 = int(abundmatch.group(3))
assert case >= 8, line
assert case >= 7, line
assert ctl1 == 0 and ctl2 == 0, line


Expand Down Expand Up @@ -221,17 +214,16 @@ def test_skip_until(capsys):


def test_novel_output_has_mates():
kid = data_file('minitrio/trio-proband.fq.gz')
mom = data_file('minitrio/trio-mother.fq.gz')
dad = data_file('minitrio/trio-father.fq.gz')
testnovel = data_file('minitrio/novel.augfastq.gz')
testmates = data_file('minitrio/novel-mates.fastq.gz')
kid = data_file('microtrios/trio-na-proband.fq.gz')
mom = data_file('microtrios/trio-na-mother.fq.gz')
dad = data_file('microtrios/trio-na-father.fq.gz')
testnovel = data_file('microtrios/novel-na.augfastq.gz')

with NamedTemporaryFile(suffix='.augfastq') as novelfile:
arglist = [
'novel', '--out', novelfile.name, '--case', kid, '--case-min', '5',
'--control', mom, '--control', dad, '--ctrl-max', '1',
'--memory', '5M'
'--memory', '500K'
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.novel.main(args)
Expand All @@ -247,8 +239,8 @@ def test_novel_output_has_mates():
test_ids = set([r.name for r in stream])
assert intread_ids == test_ids

stream = kevlar.parse_augmented_fastx(kevlar.open(testmates, 'r'))
test_mate_seqs = set([r.sequence for r in stream])
stream = kevlar.parse_augmented_fastx(kevlar.open(testnovel, 'r'))
test_mate_seqs = set([m for r in stream for m in r.mateseqs])
assert mate_seqs == test_mate_seqs


Expand All @@ -257,8 +249,8 @@ def test_novel_save_counts():
try:
for ind in ('father', 'mother', 'proband'):
outfile = '{:s}/{:s}.ct'.format(outdir, ind)
infile = data_file('minitrio/trio-{:s}.fq.gz'.format(ind))
arglist = ['count', '--ksize', '27', '--memory', '5M', outfile,
infile = data_file('microtrios/trio-na-{:s}.fq.gz'.format(ind))
arglist = ['count', '--ksize', '27', '--memory', '500K', outfile,
infile]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.count.main(args)
Expand All @@ -267,10 +259,10 @@ def test_novel_save_counts():
'novel', '--ksize', '27', '--out', outdir + '/novel.augfastq.gz',
'--save-case-counts', outdir + '/kid.ct', '--save-ctrl-counts',
outdir + '/mom.ct', outdir + '/dad.ct', '--case',
data_file('minitrio/trio-proband.fq.gz'),
'--control', data_file('minitrio/trio-mother.fq.gz'),
'--control', data_file('minitrio/trio-father.fq.gz'),
'--memory', '5M'
data_file('microtrios/trio-na-proband.fq.gz'),
'--control', data_file('microtrios/trio-na-mother.fq.gz'),
'--control', data_file('microtrios/trio-na-father.fq.gz'),
'--memory', '500K'
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.novel.main(args)
Expand All @@ -292,10 +284,10 @@ def test_novel_save_counts_mismatch(capsys):
'novel', '--ksize', '27', '--out', outdir + '/novel.augfastq.gz',
'--save-case-counts', outdir + '/kid.ct', '--save-ctrl-counts',
outdir + '/mom.ct', outdir + '/dad.ct', outdir + '/sibling.ct',
'--case', data_file('minitrio/trio-proband.fq.gz'),
'--control', data_file('minitrio/trio-mother.fq.gz'),
'--control', data_file('minitrio/trio-father.fq.gz'),
'--memory', '5M'
'--case', data_file('microtrios/trio-k-proband.fq.gz'),
'--control', data_file('microtrios/trio-k-mother.fq.gz'),
'--control', data_file('microtrios/trio-k-father.fq.gz'),
'--memory', '500K'
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.novel.main(args)
Expand Down
20 changes: 12 additions & 8 deletions kevlar/tests/test_simplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,29 +99,33 @@ def test_simplex_trio1(capsys):
assert out.strip() == testvcf


def test_simplex_minitrio():
def test_simplex_minitrio(capsys):
proband = data_file('minitrio/trio-proband.fq.gz')
mother = data_file('minitrio/trio-mother.fq.gz')
father = data_file('minitrio/trio-father.fq.gz')
refr = data_file('minitrio/refr.fa')

arglist = [
'simplex', '--novel-memory', '10M', '--case', proband,
'simplex', '--novel-memory', '5M', '--case', proband,
'--control', mother, '--control', father, '--threads', '2',
'--ctrl-max', '1', '--case-min', '5', '--ksize', '25',
refr
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.simplex.main(args)

out, err = capsys.readouterr()
print('DEBUG', out)
assert len(out.strip().split('\n')) == 1


def test_simplex_save_counts():
outdir = mkdtemp()
try:
for ind in ('father', 'mother', 'proband'):
outfile = '{:s}/{:s}.ct'.format(outdir, ind)
infile = data_file('minitrio/trio-{:s}.fq.gz'.format(ind))
arglist = ['count', '--ksize', '27', '--memory', '5M', outfile,
infile = data_file('microtrios/trio-k-{:s}.fq.gz'.format(ind))
arglist = ['count', '--ksize', '27', '--memory', '500K', outfile,
infile]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.count.main(args)
Expand All @@ -130,10 +134,10 @@ def test_simplex_save_counts():
'simplex', '--ksize', '27', '--out', outdir + '/calls.vcf',
'--save-case-counts', outdir + '/kid', '--save-ctrl-counts',
outdir + '/mom', outdir + '/dad', '--case',
data_file('minitrio/trio-proband.fq.gz'),
'--control', data_file('minitrio/trio-mother.fq.gz'),
'--control', data_file('minitrio/trio-father.fq.gz'),
'--novel-memory', '5M', data_file('minitrio/refr.fa')
data_file('microtrios/trio-k-proband.fq.gz'),
'--control', data_file('microtrios/trio-k-mother.fq.gz'),
'--control', data_file('microtrios/trio-k-father.fq.gz'),
'--novel-memory', '500K', data_file('microtrios/refr-k.fa.gz')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.simplex.main(args)
Expand Down

0 comments on commit f98a15e

Please sign in to comment.