Merge pull request #218 from dib-lab/fix/docs-and-defaults

Update tutorials, fix parameter defaults
kevlar-dev · Mar 15, 2018 · 2e2591d · 2e2591d
2 parents e299def + b63a802
commit 2e2591d
Show file tree

Hide file tree

Showing 10 changed files with 88 additions and 76 deletions.
diff --git a/docs/quick-start.rst b/docs/quick-start.rst
@@ -4,23 +4,25 @@ Quick start
 If you have not already done so, install kevlar using :doc:`the following instructions <install>`.
 
 This gives a crash course on running kevlar's simplex analysis workflow.
+The ``kevlar simplex`` command should be able to run on a laptop in about 5 minutes while consuming less than 200 Mb of RAM for this demo data set.
+
 A :doc:`more detailed tutorial is available <tutorial>`, and a complete listing of all available configuration options for each script can be found in :doc:`the CLI documentation <cli>`, or by executing ``kevlar <subcommand> -h`` in the terminal.
 
 ----------
 
-   .. code:: bash
-
-        curl -L https://osf.io/e8jb3/download?version=1 -o mother.fq.gz
-        curl -L https://osf.io/fuaty/download?version=1 -o father.fq.gz
-        curl -L https://osf.io/f5trh/download?version=1 -o proband.fq.gz
-        curl -L https://osf.io/58rwa/download?version=1 -o refr.fa.gz
-        bwa index refr.fa.gz
-
-        kevlar simplex \
-            --case proband.fq.gz --case-min 6 \
-            --control mother.fq.gz --control father.fq.gz --ctrl-max 0 \
-            --novel-memory 5M --novel-fpr 0.6 --threads 4 \
-            --filter-memory 1M --filter-fpr 0.005 \
-            --mask-files refr.fa.gz --mask-memory 5M \
-            --ksize 25 --out variant-calls.vcf
-            refr.fa.gz
+.. highlight:: none
+
+.. code::
+
+     curl -L https://s3-us-west-1.amazonaws.com/noble-trios/helium-mother-reads.fq.gz -o mother.fq.gz
+     curl -L https://s3-us-west-1.amazonaws.com/noble-trios/helium-father-reads.fq.gz -o father.fq.gz
+     curl -L https://s3-us-west-1.amazonaws.com/noble-trios/helium-proband-reads.fq.gz -o proband.fq.gz
+     curl -L https://s3-us-west-1.amazonaws.com/noble-trios/helium-refr.fa.gz -o refr.fa.gz
+
+     kevlar simplex \
+         --case proband.fq.gz --control mother.fq.gz --control father.fq.gz \
+         --novel-memory 50M --filter-memory 1M --filter-fpr 0.005 --mask-memory 5M  \
+         --mask-files refr.fa.gz \
+         --threads 4 --ksize 31 \
+         --out variant-calls.vcf \
+         refr.fa.gz
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -1,25 +1,26 @@
 Tutorial
 ========
 
-This tutorial will cover the same material introduced in the :doc:`kevlar quick start <quick-start>`, but with a bit more detail and commentary.
+This tutorial will cover material similar to that introduced in the :doc:`kevlar quick start <quick-start>`, but with a bit more detail and commentary.
 
 
 Test data
 ---------
 
-The data for this tutorial comes from `a simulated 2.5 Mb genome <https://osf.io/r9h73/>`__ (more details `here <https://osf.io/p5ngm/>`__).
-It can be downloaded anonymously from the `Open Science Framework <https://osf.io/>`__.
+The data for this tutorial comes from `a simulated 25 Mb genome <https://github.com/standage/noble>`__ and can be downloaded anonymously from the `Open Science Framework <https://osf.io/anr56/>`__.
 
-.. code:: bash
+.. highlight:: none
+
+.. code::
 
-    curl -L https://osf.io/e8jb3/download?version=1 -o mother.fq.gz
-    curl -L https://osf.io/fuaty/download?version=1 -o father.fq.gz
-    curl -L https://osf.io/f5trh/download?version=1 -o proband.fq.gz
-    curl -L https://osf.io/58rwa/download?version=1 -o refr.fa.gz
+    curl -L https://s3-us-west-1.amazonaws.com/noble-trios/neon-mother-reads.fq.gz -o mother.fq.gz
+    curl -L https://s3-us-west-1.amazonaws.com/noble-trios/neon-father-reads.fq.gz -o father.fq.gz
+    curl -L https://s3-us-west-1.amazonaws.com/noble-trios/neon-proband-reads.fq.gz -o proband.fq.gz
+    curl -L https://s3-us-west-1.amazonaws.com/noble-trios/neon-refr.fa.gz -o refr.fa.gz
 
 The reference genome must be indexed for BWA searches before proceeding.
 
-.. code:: bash
+.. code::
 
     bwa index refr.fa.gz
 
@@ -37,95 +38,104 @@ The required inputs are:
 
 This test data set doesn't include any simulated contamination, so we are simply using the reference genome sequence as the mask.
 
-The output of this command is a VCF file, in this case containing 8 variant calls and 2 no-calls for inversion variants that kevlar cannot yet classify.
-
-.. code:: bash
+.. code::
 
     kevlar simplex \
-        --case proband.fq.gz --case-min 6 \
-        --control mother.fq.gz --control father.fq.gz --ctrl-max 0 \
-        --novel-memory 5M --novel-fpr 0.6 --threads 4 \
-        --filter-memory 1M --filter-fpr 0.005 \
-        --mask-files refr.fa.gz --mask-memory 5M \
-        --ksize 25 --out variant-calls.vcf
-        refr.fa.gz
+         --case proband.fq.gz --control mother.fq.gz --control father.fq.gz \
+         --case-min 6 --ctrl-max 1 \
+         --novel-memory 500M --filter-memory 10M --filter-fpr 0.005 --mask-memory 50M  \
+         --mask-files refr.fa.gz \
+         --threads 4 --ksize 31 \
+         --out kevlar-variant-calls.vcf \
+         refr.fa.gz
 
 Here is a discussion of various parameter choices.
 
 - The ``--case-min`` and ``--ctrl-max`` parameters define the threshold model for selecting "interesting" (putatively novel) *k*-mers.
-  Here, any *k*-mer that appears at least 6 times in the proband and 0 times in both parents is marked as a putative signature of a *de novo* variant.
+  Here, any *k*-mer that appears at least 6 times in the proband and at most 1 time in either parent is marked as a putative signature of a *de novo* variant.
 - The ``--novel-memory`` parameter determines how much memory is allocated for computing initial *k*-mer counts in each sample.
   The accuracy of the counts depends on the number of distinct *k*-mers in the sample and the amount of memory used for counting *k*-mers.
-  For human data sets sampled to about 30x coverage, allocating 10-20 Gb of memory per sample will still result in high false discovery rates for the initial *k*-mer counts.
-  However, tests have shown that FDRs of up to 0.8 for the initial *k*-mer counts can be compensated for by subsequent filtering steps.
+  For human data sets sampled to about 30x coverage, allocating 10-20 Gb of memory per sample will result in high false positive rates for the initial *k*-mer counts.
+  However, tests have shown that FPRs of up to 0.8 for the initial *k*-mer counts can be compensated for by subsequent filtering steps.
 - The ``--filter-memory`` parameter indicates the amount of memory allocated for recomputing *k*-mer abundances of "interesting" reads.
   For human data sets sampled to about 30x coverage, 4 Gb of memory for this second filtering pass is usually more than enough for near-perfect accuracy.
   The ``--filter-fpr`` parameter indicates the level of desired accuracy (the program will terminate if the FPR is higher).
 
 
+Assessing accuracy
+------------------
+
+The output of the command above is a VCF file, and in this case should contain 10 variant calls.
+The ``kevlar-eval.sh`` script below uses ``bedtools intersect`` to do a quick and simple evaluation of kevlar's accuracy.
+
+.. code:: bash
+
+    curl -L https://s3-us-west-1.amazonaws.com/noble-trios/neon.vcf -o neon-refr.vcf
+    curl -L curl -L https://raw.githubusercontent.com/standage/noble/master/kevlar-eval.sh -o kevlar-eval.sh
+    bash kevlar-eval.sh neon-refr.vcf kevlar-variant-calls.vcf
+
+
 The banding workflow
 --------------------
 
-If memory is a limiting factor, it's possible to reduce the most memory intensive steps of the kevlar workflow (the initial *k*-mer counting) using "*k*-mer banding".
+If memory is a limiting factor, it's possible to reduce the most memory intensive steps of the kevlar workflow using "*k*-mer banding".
 To summarize, the *k*-mer counts are computed in N independent passes over the data, each pass only requiring 1/N of the memory required for a single pass.
 Banding is not yet supported in the ``kevlar simplex`` command, so the banding workflow is a bit more involved.
 
-.. code:: bash
+.. code::
 
     # Let's count k-mers and find novel k-mers in 6 passes for a 6x reduction in memory
     for $band in {1..6}
     do
         kevlar novel \
-            --case proband.fq.gz --case-min 6 \
-            --control mother.fq.gz --control father.fq.gz --ctrl-max 0 \
-            --memory 1M --max-fpr 0.6 --threads 4 --ksize 25 \
+            --case proband.fq.gz --control mother.fq.gz --control father.fq.gz \
+            --case-min 6 --ctrl-max 1 \
+            --memory 500M --threads 4 --ksize 31 \
             --num-bands 6 --band $band \
             --out proband-novel-${band}.augfastq.gz
     done
 
     # The "kevlar filter" command will combine the results from all 6 passes and recompute k-mer counts
     kevlar filter \
-        --mask refr.fa.gz --mask-memory 5M --mask-max-fpr 0.005 \
-        --abund-memory 1M --abund-max-fpr 0.005 \
-        --ksize 25 --out proband-filtered.augfastq.gz \
+        --mask refr.fa.gz --mask-memory 50M --mask-max-fpr 0.005 \
+        --abund-memory 10M --abund-max-fpr 0.005 \
+        --ksize 31 --out proband-filtered.augfastq.gz \
         proband-novel-{1..6}.augfastq.gz
 
     # The "kevlar partition" command separates the reads into sets corresponding to distinct variants.
-    kevlar partition --split partition proband-filtered.augfastq.gz
+    kevlar partition proband-partitioned.augfastq.gz proband-filtered.augfastq.gz
 
     # The "kevlar alac" command does assembly, alignment, and variant calling for each partition
-    numpart=$(wc -l partition.cc.log | cut -f 1 -d ' ')
-    for i in $(seq 1 $numpart)
-    do
-        kevlar alac --ksize 25 partition.cc${i}.augfastq.gz refr.fa.gz >> calls.vcf
-    done
+    kevlar alac --ksize 31 --seed-size 51 --delta 50 --out proband-calls.vcf proband-partitioned.augfastq.gz refr.fa.gz
 
 
 Pre-processing
 --------------
 
-Several types of pre-processing can lead to large improvements in kevlar's performance.
+Several types of pre-processing can yield large improvements in kevlar's performance.
 
 - **Error correction**:
   The amount of memory kevlar needs for accurate variant discovery depends on the number of distinct *k*-mers in each sample, the majority of which are associated with sequencing errors.
   Using an error correction tool such as `Lighter <https://github.com/mourisl/Lighter>`__ or `BFC <https://github.com/lh3/bfc>`__ will remove many erroneous *k*-mers, reduce kevlar's memory demands without adding too much processing time to the overall workflow.
   See `this blog post <https://standage.github.io/information-content-versus-data-volume-and-k-mer-counting-accuracy.html>`__ for more details.
+  However, in some cases this may lead to true variants being erroneously "corrected" and result in a loss of variant calling sensitivity.
 - **Discarding reads that match the reference genome perfectly**:
   Reducing the total volume of the data set doesn't reduce the number of distinct *k*-mers to the extent that error correction does, but it can reduce the number of reads that need to be processed by up to 80%, substantially speeding up subsequent processing steps.
 
-  .. code:: bash
+  .. code::
 
-      kevlar dump --out proband-dump.fq.gz refr.fa.gz proband.bam
+      kevlar dump --out proband-dump.fq.gz --refr refr.fa.gz proband.bam
 
   The time savings may not offset the time required from running ``kevlar dump`` on each sample.
   However, when testing and benchmarking kevlar on a new data set where multiple runs for parameter refinement are likely, the time savings will probably be worth the initial time investment.
 - **Pre-computing count tables**:
   In the examples above, ``kevlar simplex`` and ``kevlar novel`` compute *k*-mer counts directly from Fastq data.
-  However, both commands can also accept pre-computed count tables, which can be helpful when testing or benchmarking kevlar requires multiple runs over the same data.
-  The ``kevlar count`` command accepts Fastq files and saves *k*-mer count tables to disk for subsequent use.
+  However, both commands can also accept pre-computed count tables with the ``--case-counts`` and ``--control-counts`` options.
+  This can be helpful when testing or benchmarking kevlar requires multiple runs over the same data.
+  The ``kevlar count`` command takes Fastq files as input and saves *k*-mer count tables to disk for subsequent use.
 
-  .. code:: bash
+  .. code::
 
-      kevlar count --ksize 31 --memory 8G proband.counttable proband-r1.fq.gz proband-r2.fq.gz proband-ru.fq.gz
+      kevlar count --ksize 31 --memory 8G proband.counttable proband-paired-interleaved.fq.gz
       kevlar count --ksize 31 --memory 8G father.counttable father-r1.fq.gz father-r2.fq.gz
-      kevlar count --ksize 31 --memory 8G mother.counttable mother-reads-interleaved.fq.gz
+      kevlar count --ksize 31 --memory 8G mother.counttable mother-r1.fq.gz mother-r2.fq.gz mother-ru.fq.gz
diff --git a/kevlar/alac.py b/kevlar/alac.py
@@ -15,7 +15,7 @@
 from kevlar.call import call
 
 
-def alac(pstream, refrfile, ksize=31, bigpart=10000, delta=25, seedsize=31,
+def alac(pstream, refrfile, ksize=31, bigpart=10000, delta=50, seedsize=31,
          maxdiff=10000, match=1, mismatch=2, gapopen=5, gapextend=0,
          greedy=False, fallback=True, min_ikmers=None, logstream=sys.stderr):
     assembler = assemble_greedy if greedy else assemble_fml_asm

diff --git a/kevlar/cli/alac.py b/kevlar/cli/alac.py
@@ -35,9 +35,9 @@ def subparser(subparsers):
                             'this behavior')
 
     local_args = subparser.add_argument_group('Target extraction')
-    local_args.add_argument('-z', '--seed-size', type=int, default=31,
-                            metavar='Z', help='seed size; default is 31')
-    local_args.add_argument('-d', '--delta', type=int, default=25, metavar='D',
+    local_args.add_argument('-z', '--seed-size', type=int, default=51,
+                            metavar='Z', help='seed size; default is 51')
+    local_args.add_argument('-d', '--delta', type=int, default=50, metavar='D',
                             help='retrieve the genomic interval from the '
                             'reference by extending beyond the span of all '
                             'k-mer starting positions by D bp')

diff --git a/kevlar/cli/localize.py b/kevlar/cli/localize.py
@@ -24,12 +24,12 @@ def subparser(subparsers):
                            'positions should not exceed X bp; default is '
                            '10000 (10 kb)')
     subparser.add_argument('-d', '--delta', type=int, metavar='D',
-                           default=25, help='retrieve the genomic interval '
+                           default=50, help='retrieve the genomic interval '
                            'from the reference by extending beyond the span '
                            'of all k-mer starting positions by D bp')
     subparser.add_argument('-o', '--out', metavar='FILE', default='-',
                            help='output file; default is terminal (stdout)')
     subparser.add_argument('-z', '--seed-size', type=int, metavar='Z',
-                           default=31, help='seed size; default is 31')
+                           default=51, help='seed size; default is 51')
     subparser.add_argument('contigs', help='assembled reads in Fasta format')
     subparser.add_argument('refr', help='BWA indexed reference genome')
diff --git a/kevlar/cli/novel.py b/kevlar/cli/novel.py
@@ -91,9 +91,9 @@ def subparser(subparsers):
         'default is X=1'
     )
     samp_args.add_argument(
-        '-y', '--case-min', metavar='Y', type=int, default=5,
+        '-y', '--case-min', metavar='Y', type=int, default=6,
         help='k-mers with abund < Y in any case sample are uninteresting; '
-        'default is Y=5'
+        'default is Y=6'
     )
     samp_args.add_argument(
         '-M', '--memory', default='1e6', type=khmer_args.memory_setting,

diff --git a/kevlar/cli/simplex.py b/kevlar/cli/simplex.py
@@ -50,9 +50,9 @@ def subparser(subparsers):
         'default is X=1'
     )
     novel_args.add_argument(
-        '-y', '--case-min', metavar='Y', type=int, default=5, help='k-mers '
+        '-y', '--case-min', metavar='Y', type=int, default=6, help='k-mers '
         'with abund < Y in the case sample are uninteresting or unreliable; '
-        'default is Y=5'
+        'default is Y=6'
     )
     novel_args.add_argument(
         '--novel-memory', default='1e6', type=khmer_args.memory_setting,
@@ -128,8 +128,8 @@ def subparser(subparsers):
         'contig is aligned to a cutout of the reference genome, and the '
         'variant is called from the alignment.'
     )
-    alac_args.add_argument('-z', '--seed-size', type=int, default=31,
-                           metavar='Z', help='seed size; default is 31')
+    alac_args.add_argument('-z', '--seed-size', type=int, default=51,
+                           metavar='Z', help='seed size; default is 51')
     alac_args.add_argument(
         '-d', '--delta', type=int, default=50, metavar='D', help='extend the '
         'genomic cutout by D bp; default is 50'

diff --git a/kevlar/localize.py b/kevlar/localize.py
@@ -113,7 +113,7 @@ def get_exact_matches(contigstream, bwaindexfile, seedsize=31):
         yield seqid, pos
 
 
-def localize(contigstream, refrfile, seedsize=31, delta=25, maxdiff=10000,
+def localize(contigstream, refrfile, seedsize=31, delta=50, maxdiff=10000,
              refrseqs=None, logstream=sys.stderr):
     """Wrap the `kevlar localize` task as a generator.
 

diff --git a/kevlar/tests/test_novel.py b/kevlar/tests/test_novel.py
@@ -39,7 +39,7 @@ def test_cli():
         'cntl2.fq', '-k', '17',
     ])
     assert args.ksize == 17
-    assert args.case_min == 5
+    assert args.case_min == 6
     assert args.ctrl_max == 1
     assert args.num_bands is None
     assert args.band is None
@@ -49,7 +49,7 @@ def test_cli():
         '--control', 'cntl1.fq', '--control', 'cntl2.fq',
     ])
     assert args.ksize == 31
-    assert args.case_min == 5
+    assert args.case_min == 6
     assert args.ctrl_max == 1
     assert args.num_bands == 8
     assert args.band == 1

diff --git a/kevlar/tests/test_simplex.py b/kevlar/tests/test_simplex.py
@@ -83,7 +83,7 @@ def test_simplex_trio1(capsys):
         controls[1], '--case-min', '6', '--ctrl-max', '0', '--novel-memory',
         '1M', '--novel-fpr', '0.2', '--filter-memory', '50K', '--mask-files',
         refr, '--mask-memory', '1M', '--filter-fpr', '0.005', '--ksize', '21',
-        refr
+        '--seed-size', '31', '--delta', '25', refr
     ]
     args = kevlar.cli.parser().parse_args(arglist)
     kevlar.simplex.main(args)