Merge pull request #82 from dib-lab/refactor/cli

Separate subparser stuff into CLI module
kevlar-dev · May 31, 2017 · 3aa32e6 · 3aa32e6
2 parents 3375f8a + 9fad6f2
commit 3aa32e6
Show file tree

Hide file tree

Showing 21 changed files with 547 additions and 449 deletions.
diff --git a/.cloc.exclude b/.cloc.exclude
@@ -0,0 +1 @@
+kevlar/_version.py
diff --git a/Makefile b/Makefile
@@ -14,5 +14,9 @@ doc:
 	cd docs && make html
 
 loc:
-	cloc --exclude-list-file=<(echo kevlar/_version.py) kevlar/*.py kevlar/cli/*.py
+	@- echo "\n\n===== Core kevlar ====="
+	cloc --exclude-list-file=.cloc.exclude kevlar/*.py
+	@- echo "\n\n===== kevlar CLI ====="
+	cloc kevlar/cli/*.py
+	@- echo "\n\n===== kevlar tests ====="
 	cloc kevlar/tests/test_*.py
diff --git a/kevlar/assemble.py b/kevlar/assemble.py
@@ -9,7 +9,6 @@
 
 from __future__ import print_function
 from collections import defaultdict, namedtuple
-import argparse
 import itertools
 import sys
 try:
@@ -22,24 +21,6 @@
 from kevlar.seqio import load_reads_and_kmers
 
 
-def subparser(subparsers):
-    subparser = subparsers.add_parser('assemble')
-    subparser.add_argument('-d', '--debug', action='store_true',
-                           help='print debugging output')
-    subparser.add_argument('-o', '--out', metavar='FILE',
-                           help='output file; default is terminal (stdout)')
-    subparser.add_argument('--gml', metavar='FILE',
-                           help='write graph to .gml file')
-    subparser.add_argument('-n', '--min-abund', type=int, metavar='N',
-                           default=2, help='discard interesting k-mers that '
-                           'occur fewer than N times')
-    subparser.add_argument('-x', '--max-abund', type=int, metavar='X',
-                           default=500, help='discard interesting k-mers that '
-                           'occur more than X times')
-    subparser.add_argument('augfastq', help='annotated reads in augmented '
-                           'Fastq format')
-
-
 def merge_pair(pair):
     """
     Assemble a pair of overlapping reads.

diff --git a/kevlar/cli/__init__.py b/kevlar/cli/__init__.py
@@ -9,9 +9,17 @@
 # -----------------------------------------------------------------------------
 
 import argparse
-import kevlar
 import sys
-
+import kevlar
+from . import dump
+from . import count
+from . import novel
+from . import collect
+from . import filter
+from . import reaugment
+from . import assemble
+from . import mutate
+from . import partition
 
 mains = {
     'dump': kevlar.dump.main,
@@ -25,6 +33,18 @@
     'partition': kevlar.partition.main,
 }
 
+# Maps each subcommand name to the function that registers that subcommand's
+# argparse parser. `parser()` calls every value once, passing it the
+# subparsers object.
+# NOTE(review): registration follows dict iteration order, which is only
+# guaranteed to match the order written here on Python 3.7+ -- confirm
+# whether the order of subcommands matters on the supported interpreters.
+subparser_funcs = {
+    'dump': dump.subparser,
+    'count': count.subparser,
+    'novel': novel.subparser,
+    'collect': collect.subparser,
+    'filter': filter.subparser,
+    'reaugment': reaugment.subparser,
+    'assemble': assemble.subparser,
+    'mutate': mutate.subparser,
+    'partition': partition.subparser,
+}
+
 
 def parser():
     bubbletext = """
@@ -51,14 +71,7 @@ def parser():
                         'diagnostic messages, warnings, and errors')
     subparsers = parser.add_subparsers(dest='cmd', metavar='cmd',
                                        help='"' + subcommandstr + '"')
-    kevlar.dump.subparser(subparsers)
-    kevlar.count.subparser(subparsers)
-    kevlar.novel.subparser(subparsers)
-    kevlar.collect.subparser(subparsers)
-    kevlar.filter.subparser(subparsers)
-    kevlar.reaugment.subparser(subparsers)
-    kevlar.assemble.subparser(subparsers)
-    kevlar.mutate.subparser(subparsers)
-    kevlar.partition.subparser(subparsers)
+    for func in subparser_funcs.values():
+        func(subparsers)
 
     return parser
diff --git a/kevlar/cli/assemble.py b/kevlar/cli/assemble.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2017 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+
+def subparser(subparsers):
+    """Define the `kevlar assemble` command-line interface.
+
+    Adds an `assemble` subcommand to `subparsers` and declares its options
+    and its one positional argument. Nothing is returned.
+
+    :param subparsers: object whose `add_parser` method creates the
+                       subcommand parser (the caller passes the result of
+                       `add_subparsers`)
+    """
+    # The local name shadows this function's own name; that is harmless
+    # because the function never refers to itself.
+    subparser = subparsers.add_parser('assemble')
+    subparser.add_argument('-d', '--debug', action='store_true',
+                           help='print debugging output')
+    # No default is given, so `out` is None when the option is omitted.
+    subparser.add_argument('-o', '--out', metavar='FILE',
+                           help='output file; default is terminal (stdout)')
+    subparser.add_argument('--gml', metavar='FILE',
+                           help='write graph to .gml file')
+    # Abundance bounds for interesting k-mers; both values are parsed as int.
+    subparser.add_argument('-n', '--min-abund', type=int, metavar='N',
+                           default=2, help='discard interesting k-mers that '
+                           'occur fewer than N times')
+    subparser.add_argument('-x', '--max-abund', type=int, metavar='X',
+                           default=500, help='discard interesting k-mers that '
+                           'occur more than X times')
+    # The only positional (and therefore required) argument.
+    subparser.add_argument('augfastq', help='annotated reads in augmented '
+                           'Fastq format')
diff --git a/kevlar/cli/collect.py b/kevlar/cli/collect.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2017 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+import argparse
+import khmer
+from khmer import khmer_args
+
+
+def subparser(subparsers):
+    """Define the `kevlar collect` command-line interface.
+
+    Adds a `collect` subcommand to `subparsers` and declares its options and
+    its positional `novel_output` argument. Nothing is returned.
+
+    :param subparsers: object whose `add_parser` method creates the
+                       subcommand parser (the caller passes the result of
+                       `add_subparsers`)
+    """
+    # The local name shadows this function's own name; that is harmless
+    # because the function never refers to itself.
+    subparser = subparsers.add_parser('collect')
+    subparser.add_argument('-d', '--debug', action='store_true',
+                           help='print debugging output')
+    # The default is a string, so argparse runs it through the `type`
+    # converter exactly as it would a value given on the command line.
+    subparser.add_argument('-M', '--memory', default='1e6', metavar='MEM',
+                           type=khmer_args.memory_setting,
+                           help='memory to allocate for recalculating '
+                           'abundances of novel k-mers; default is 1M')
+    # NOTE(review): the help text refers to `--case_min` of "kevlar novel";
+    # that option is not visible here -- confirm its exact spelling.
+    subparser.add_argument('--minabund', type=int, default=5, metavar='Y',
+                           help='minimum case abundance required to call a '
+                           'k-mer novel; used to filter out k-mers with '
+                           'inflated abundances; should equal the value of '
+                           '--case_min used in "kevlar novel"; default is 5')
+    subparser.add_argument('--max-fpr', type=float, metavar='FPR',
+                           default=0.001, help='terminate if the expected '
+                           'false positive rate is higher than the specified '
+                           'FPR; default is 0.001')
+    subparser.add_argument('--refr', metavar='FILE', type=str, default=None,
+                           help='reference genome in Fasta/Fastq format; any '
+                           'k-mers designated as "interesting" by "kevlar '
+                           'novel" are ignored if they are present in the '
+                           'reference genome')
+    # String default again, converted by `type` in the same way as --memory.
+    subparser.add_argument('--refr-memory', metavar='MEM', default='1e6',
+                           type=khmer_args.memory_setting,
+                           help='memory to allocate for storing the reference '
+                           'genome; default is 1M')
+    subparser.add_argument('-k', '--ksize', type=int, default=31, metavar='K',
+                           help='k-mer size; default is 31')
+    # nargs='+' gathers one or more values into a list; None when omitted.
+    subparser.add_argument('--ignore', metavar='KMER', nargs='+',
+                           help='ignore the specified k-mer(s)')
+    # FileType('w') makes argparse open the named file for writing while
+    # parsing. No default is set, so `out` is None when the option is
+    # omitted; presumably the consumer falls back to stdout -- TODO confirm.
+    subparser.add_argument('-o', '--out', type=argparse.FileType('w'),
+                           metavar='OUT',
+                           help='output file; default is terminal (stdout)')
+    subparser.add_argument('--collapse', action='store_true', help='collapse '
+                           'linear paths contained in other linear paths')
+    # Required positional: at least one value, collected into a list.
+    subparser.add_argument('novel_output', nargs='+', help='one or more output'
+                           ' files from the "kevlar novel" command')
diff --git a/kevlar/cli/count.py b/kevlar/cli/count.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2017 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+import argparse
+import textwrap
+import khmer
+from khmer import khmer_args
+
+
+def subparser(subparsers):
+    """Define the `kevlar count` command-line interface.
+
+    Adds a `count` subcommand to `subparsers` and declares its options in
+    four argument groups: case/control configuration, memory allocation,
+    k-mer banding, and miscellaneous settings. Nothing is returned.
+
+    :param subparsers: object whose `add_parser` method creates the
+                       subcommand parser (the caller passes the result of
+                       `add_subparsers`)
+    """
+
+    desc = "Compute k-mer abundances for the provided samples"
+    # The leading backslash suppresses the first newline; the common
+    # indentation is removed by textwrap.dedent below.
+    epilog = """\
+    Example::
+
+        kevlar count --ksize 31 --memory 4G --mem-frac 0.25 \\
+            --case proband.counttable proband-reads.fq.gz \\
+            --control father.counttable father-r1.fq.gz father-r2.fq.gz \\
+            --control mother.counttable mother-reads.fq.gz
+
+    Example::
+
+        kevlar count --ksize 25 --memory 500M \\
+            --case case_x.ct case_x.fq \\
+            --case case_y.ct case_y.fq \\
+            --control control1.ct control1a.fq control1b.fq \\
+            --control control2.ct control2a.fq control2b.fq"""
+    epilog = textwrap.dedent(epilog)
+
+    # add_help=False drops argparse's automatic -h/--help so that it can be
+    # re-added by hand in the 'Miscellaneous settings' group further down.
+    # RawDescriptionHelpFormatter keeps the description and epilog text as
+    # written instead of re-wrapping it.
+    # The local name shadows this function's own name; that is harmless
+    # because the function never refers to itself.
+    subparser = subparsers.add_parser(
+        'count', description=desc, epilog=epilog, add_help=False,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    samp_args = subparser.add_argument_group('Case/control config')
+    # nargs='+' together with action='append' yields one list per occurrence
+    # of the flag, so the parsed value is a list of lists.
+    # NOTE(review): the epilog examples pass an output name (for example
+    # proband.counttable) as the first value of each --case/--control, which
+    # this help text does not mention -- confirm the intended usage.
+    samp_args.add_argument(
+        '--case', metavar='F', nargs='+', required=True, action='append',
+        help='one or more FASTA/FASTQ files containing reads from a case '
+        'sample; can be declared multiple times corresponding to multiple '
+        'case samples; see examples below'
+    )
+    # Unlike --case, --control is not required; it is None when omitted.
+    samp_args.add_argument(
+        '--control', metavar='F', nargs='+', action='append',
+        help='one or more FASTA/FASTQ files containing reads from a control '
+        'sample; can be declared multiple times corresponding to multiple '
+        'control samples; see examples below'
+    )
+    samp_args.add_argument(
+        '-x', '--ctrl-max', metavar='X', type=int, default=1,
+        help='k-mers with abund > X in any control sample are uninteresting; '
+        'default is X=1'
+    )
+
+    # NOTE(review): "specifed" in the text below is a typo for "specified";
+    # it appears in user-facing help output.
+    mem_desc = """\
+    Specify how much memory to allocate for the sketch data structures
+    used to store k-mer counts. The first control sample will be
+    allocated the full amount of specifed `--memory`, and all subsequent
+    samples will be allocated a fraction thereof.
+    """
+    mem_desc = textwrap.dedent(mem_desc)
+    memory_args = subparser.add_argument_group('Memory allocation', mem_desc)
+    # The default is a string, so argparse runs it through the `type`
+    # converter exactly as it would a value given on the command line.
+    memory_args.add_argument(
+        '-M', '--memory', default='1e6', metavar='MEM',
+        type=khmer_args.memory_setting, help='total memory to allocate for '
+        'the initial control sample; default is 1M'
+    )
+    memory_args.add_argument(
+        '-f', '--mem-frac', type=float, default=0.1, metavar='F',
+        help='fraction of the total memory to allocate to subsequent samples; '
+        'default is 0.1'
+    )
+    memory_args.add_argument(
+        '--max-fpr', type=float, default=0.2, metavar='FPR',
+        help='terminate if the expected false positive rate for any sample is '
+        'higher than the specified FPR; default is 0.2'
+    )
+
+    band_desc = """\
+    If memory is a limiting factor, it is possible to get a linear
+    decrease in memory consumption by running kevlar in "banded" mode.
+    Splitting the hashed k-mer space into N bands and only considering k-mers
+    from one band at a time reduces the memory consumption to approximately 1/N
+    of the total memory required. This implements a scatter/gather approach in
+    which `kevlar count` and/or `kevlar novel` is run N times, after which the
+    results are combined using `kevlar filter`.
+    """
+    band_desc = textwrap.dedent(band_desc)
+    band_args = subparser.add_argument_group('K-mer banding', band_desc)
+    # Both banding options default to None; nothing here checks that they
+    # are supplied together or that 1 <= I <= N.
+    band_args.add_argument(
+        '--num-bands', type=int, metavar='N', default=None,
+        help='number of bands into which to divide the hashed k-mer space'
+    )
+    band_args.add_argument(
+        '--band', type=int, metavar='I', default=None,
+        help='a number between 1 and N (inclusive) indicating the band to be '
+        'processed'
+    )
+
+    misc_args = subparser.add_argument_group('Miscellaneous settings')
+    # Manual replacement for the help option disabled by add_help=False.
+    misc_args.add_argument('-h', '--help', action='help',
+                           help='show this help message and exit')
+    misc_args.add_argument('-k', '--ksize', type=int, default=31, metavar='K',
+                           help='k-mer size; default is 31')
diff --git a/kevlar/cli/dump.py b/kevlar/cli/dump.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2016 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+import khmer
+from khmer import khmer_args
+
+
+def subparser(subparsers):
+    """Define the `kevlar dump` command-line interface.
+
+    Adds a `dump` subcommand to `subparsers` and declares its options and
+    its two positional arguments, `refr` and `reads`. Nothing is returned.
+
+    :param subparsers: object whose `add_parser` method creates the
+                       subcommand parser (the caller passes the result of
+                       `add_subparsers`)
+    """
+    # The local name shadows this function's own name; that is harmless
+    # because the function never refers to itself.
+    subparser = subparsers.add_parser('dump')
+    subparser.add_argument('--seqid', metavar='SEQ',
+                           help='dump reads not mapped to SEQ')
+    subparser.add_argument('--genomemask', metavar='FILE', help='dump reads '
+                           'with median k-mer abundance >= 1 in the specified '
+                           'genome; if both --seqid and --genomemask are '
+                           'declared, reads passing either filter will be '
+                           'kept')
+    # The default here is a float, not a string, so argparse uses it as-is
+    # and does not pass it through the `type` converter; only values given
+    # on the command line are converted.
+    subparser.add_argument('--maskmemory', metavar='SIZE', default=2e9,
+                           type=khmer_args.memory_setting,
+                           help='memory to be occupied by genome mask; default'
+                           ' is 2G')
+    subparser.add_argument('--mask-k', metavar='K', default=31, type=int,
+                           help='k size for genome mask')
+    # No default is given, so `out` is None when the option is omitted.
+    subparser.add_argument('--out', metavar='FILE', help='output file; default'
+                           ' is terminal (stdout)')
+    # Required positionals, consumed in this order.
+    subparser.add_argument('refr', help='reference sequence in Fasta format')
+    subparser.add_argument('reads', help='read alignments in BAM format')