/
commands.py
1869 lines (1620 loc) · 87.9 KB
/
commands.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Command-line interface and corresponding API for CNVkit."""
# NB: argparse CLI definitions and API functions are interwoven:
# "_cmd_*" handles I/O and arguments processing for the command
# "do_*" runs the command's functionality as an API
import argparse
import logging
import os
import sys
# Filter spurious Cython warnings re: numpy
# Via: https://github.com/numpy/numpy/pull/432
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# If running headless, use a suitable GUI-less plotting backend
if not os.environ.get('DISPLAY'):
import matplotlib
matplotlib.use("Agg", force=True)
from matplotlib import pyplot
from matplotlib.backends.backend_pdf import PdfPages
pyplot.ioff()
import pandas as pd
from skgenome import tabio, GenomicArray as _GA
from skgenome.rangelabel import to_label
from . import (access, antitarget, autobin, batch, bintest, call, core,
coverage, diagram, export, fix, heatmap, import_rna, importers,
metrics, parallel, reference, reports, scatter, segmentation,
segmetrics, target)
from .cmdutil import (load_het_snps, read_cna, verify_sample_sex,
write_tsv, write_text, write_dataframe)
from ._version import __version__
__all__ = []
def public(fn):
__all__.append(fn.__name__)
return fn
AP = argparse.ArgumentParser(
description="CNVkit, a command-line toolkit for copy number analysis.",
epilog="See the online manual for details: https://cnvkit.readthedocs.io")
AP_subparsers = AP.add_subparsers(
help="Sub-commands (use with -h for more info)")
# _____________________________________________________________________________
# Core pipeline
# batch -----------------------------------------------------------------------
def _cmd_batch(args):
"""Run the complete CNVkit pipeline on one or more BAM files."""
logging.info("CNVkit %s", __version__)
# Validate/restrict options, beyond what argparse mutual exclusion can do
bad_args_msg = ""
if args.reference:
bad_flags = [flag
for is_used, flag in (
(args.normal is not None, '-n/--normal'),
(args.fasta, '-f/--fasta'),
(args.targets, '-t/--targets'),
(args.antitargets, '-a/--antitargets'),
(args.access, '-g/--access'),
(args.annotate, '--annotate'),
(args.short_names, '--short-names'),
(args.target_avg_size, '--target-avg-size'),
(args.antitarget_avg_size, '--antitarget-avg-size'),
(args.antitarget_min_size, '--antitarget-min-size'),
) if is_used]
if bad_flags:
bad_args_msg = ("If -r/--reference is given, options to construct "
"a new reference (%s) should not be used."
% ", ".join(bad_flags))
elif args.normal is None:
bad_args_msg = ("Option -n/--normal must be given to build a new "
"reference if -r/--reference is not used.")
elif args.method in ('hybrid', 'amplicon') and not args.targets:
bad_args_msg = ("For the '%r' sequencing method, option -t/--targets "
"(at least) must be given to build a new reference if "
"-r/--reference is not used." % args.method)
if bad_args_msg:
sys.exit(bad_args_msg + "\n(See: cnvkit.py batch -h)")
# Ensure sample IDs are unique to avoid overwriting outputs
seen_sids = {}
for fname in (args.bam_files or []) + (args.normal or []):
sid = core.fbase(fname)
if sid in seen_sids:
sys.exit("Duplicate sample ID %r (from %s and %s)"
% (sid, fname, seen_sids[sid]))
seen_sids[sid] = fname
if args.processes < 1:
import multiprocessing
args.processes = multiprocessing.cpu_count()
if not args.reference:
# Build a copy number reference; update (anti)targets upon request
args.reference, args.targets, args.antitargets = batch.batch_make_reference(
args.normal, args.targets, args.antitargets, args.male_reference,
args.fasta, args.annotate, args.short_names, args.target_avg_size,
args.access, args.antitarget_avg_size, args.antitarget_min_size,
args.output_reference, args.output_dir, args.processes,
args.count_reads, args.method, args.cluster)
elif args.targets is None and args.antitargets is None:
# Extract (anti)target BEDs from the given, existing CN reference
ref_arr = read_cna(args.reference)
targets, antitargets = reference.reference2regions(ref_arr)
ref_pfx = os.path.join(args.output_dir, core.fbase(args.reference))
args.targets = ref_pfx + '.target-tmp.bed'
args.antitargets = ref_pfx + '.antitarget-tmp.bed'
tabio.write(targets, args.targets, 'bed4')
tabio.write(antitargets, args.antitargets, 'bed4')
if args.bam_files:
if args.processes == 1:
procs_per_bam = 1
logging.info("Running %d samples in serial", len(args.bam_files))
else:
procs_per_bam = max(1, args.processes // len(args.bam_files))
logging.info("Running %d samples in %d processes "
"(that's %d processes per bam)",
len(args.bam_files), args.processes, procs_per_bam)
with parallel.pick_pool(args.processes) as pool:
for bam in args.bam_files:
pool.submit(batch.batch_run_sample,
bam, args.targets, args.antitargets, args.reference,
args.output_dir, args.male_reference, args.scatter,
args.diagram, args.rscript_path, args.count_reads,
args.drop_low_coverage, args.method, procs_per_bam,
args.cluster)
else:
logging.info("No tumor/test samples (but %d normal/control samples) "
"specified on the command line.",
len(args.normal))
P_batch = AP_subparsers.add_parser('batch', help=_cmd_batch.__doc__)
P_batch.add_argument('bam_files', nargs='*',
help="Mapped sequence reads (.bam)")
P_batch.add_argument('-m', '--method',
choices=('hybrid', 'amplicon', 'wgs'), default='hybrid',
help="""Sequencing protocol: hybridization capture ('hybrid'), targeted
amplicon sequencing ('amplicon'), or whole genome sequencing
('wgs'). Determines whether and how to use antitarget bins.
[Default: %(default)s]""")
P_batch.add_argument('-y', '--male-reference', '--haploid-x-reference',
action='store_true',
help="""Use or assume a male reference (i.e. female samples will have +1
log-CNR of chrX; otherwise male samples would have -1 chrX).""")
P_batch.add_argument('-c', '--count-reads', action='store_true',
help="""Get read depths by counting read midpoints within each bin.
(An alternative algorithm).""")
P_batch.add_argument("--drop-low-coverage", action='store_true',
help="""Drop very-low-coverage bins before segmentation to avoid
false-positive deletions in poor-quality tumor samples.""")
P_batch.add_argument('-p', '--processes',
nargs='?', type=int, const=0, default=1,
help="""Number of subprocesses used to running each of the BAM files in
parallel. Without an argument, use the maximum number of
available CPUs. [Default: process each BAM in serial]""")
P_batch.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
[Default: %(default)s]""")
# Reference-building options
P_batch_newref = P_batch.add_argument_group(
"To construct a new copy number reference")
P_batch_newref.add_argument('-n', '--normal', nargs='*', metavar="FILES",
help="""Normal samples (.bam) used to construct the pooled, paired, or
flat reference. If this option is used but no filenames are
given, a "flat" reference will be built. Otherwise, all
filenames following this option will be used.""")
P_batch_newref.add_argument('-f', '--fasta', metavar="FILENAME",
help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_batch_newref.add_argument('-t', '--targets', metavar="FILENAME",
help="Target intervals (.bed or .list)")
P_batch_newref.add_argument('-a', '--antitargets', metavar="FILENAME",
help="Antitarget intervals (.bed or .list)")
# For pre-processing targets
P_batch_newref.add_argument('--annotate', metavar="FILENAME",
help="""Use gene models from this file to assign names to the target
regions. Format: UCSC refFlat.txt or ensFlat.txt file
(preferred), or BED, interval list, GFF, or similar.""")
P_batch_newref.add_argument('--short-names', action='store_true',
help="Reduce multi-accession bait labels to be short and consistent.")
P_batch_newref.add_argument('--target-avg-size', type=int,
help="Average size of split target bins (results are approximate).")
# For antitargets:
P_batch_newref.add_argument('-g', '--access', metavar="FILENAME",
help="""Regions of accessible sequence on chromosomes (.bed), as
output by the 'access' command.""")
P_batch_newref.add_argument('--antitarget-avg-size', type=int,
help="Average size of antitarget bins (results are approximate).")
P_batch_newref.add_argument('--antitarget-min-size', type=int,
help="Minimum size of antitarget bins (smaller regions are dropped).")
P_batch_newref.add_argument('--output-reference', metavar="FILENAME",
help="""Output filename/path for the new reference file being created.
(If given, ignores the -o/--output-dir option and will write the
file to the given path. Otherwise, \"reference.cnn\" will be
created in the current directory or specified output directory.)
""")
P_batch_newref.add_argument('--cluster',
action='store_true',
help="""Calculate and use cluster-specific summary stats in the
reference pool to normalize samples.""")
P_batch_oldref = P_batch.add_argument_group("To reuse an existing reference")
P_batch_oldref.add_argument('-r', '--reference', #required=True,
help="Copy number reference file (.cnn).")
# Reporting options
P_batch_report = P_batch.add_argument_group("Output options")
P_batch_report.add_argument('-d', '--output-dir',
metavar="DIRECTORY", default='.',
help="Output directory.")
P_batch_report.add_argument('--scatter', action='store_true',
help="Create a whole-genome copy ratio profile as a PDF scatter plot.")
P_batch_report.add_argument('--diagram', action='store_true',
help="Create an ideogram of copy ratios on chromosomes as a PDF.")
P_batch.set_defaults(func=_cmd_batch)
# target ----------------------------------------------------------------------
do_target = public(target.do_target)
def _cmd_target(args):
"""Transform bait intervals into targets more suitable for CNVkit."""
regions = tabio.read_auto(args.interval)
regions = target.do_target(regions, args.annotate, args.short_names,
args.split, args.avg_size)
tabio.write(regions, args.output, "bed4")
P_target = AP_subparsers.add_parser('target', help=_cmd_target.__doc__)
P_target.add_argument('interval',
help="""BED or interval file listing the targeted regions.""")
P_target.add_argument('--annotate',
help="""Use gene models from this file to assign names to the target
regions. Format: UCSC refFlat.txt or ensFlat.txt file
(preferred), or BED, interval list, GFF, or similar.""")
P_target.add_argument('--short-names', action='store_true',
help="Reduce multi-accession bait labels to be short and consistent.")
P_target.add_argument('--split', action='store_true',
help="Split large tiled intervals into smaller, consecutive targets.")
# Exons: [114--188==203==292--21750], mean=353 -> outlier=359, extreme=515
# NV2: [65--181==190==239--12630], mean=264 -> outlier=277, extreme=364
# Default avg_size chosen s.t. minimum bin size after split is ~= median
P_target.add_argument('-a', '--avg-size', type=int, default=200 / .75,
help="""Average size of split target bins (results are approximate).
[Default: %(default)s]""")
P_target.add_argument('-o', '--output', metavar="FILENAME",
help="""Output file name.""")
P_target.set_defaults(func=_cmd_target)
# access ----------------------------------------------------------------------
do_access = public(access.do_access)
def _cmd_access(args):
"""List the locations of accessible sequence regions in a FASTA file."""
access_arr = access.do_access(args.fa_fname, args.exclude,
args.min_gap_size)
tabio.write(access_arr, args.output, "bed3")
P_access = AP_subparsers.add_parser('access', help=_cmd_access.__doc__)
P_access.add_argument("fa_fname",
help="Genome FASTA file name")
P_access.add_argument("-s", "--min-gap-size", type=int, default=5000,
help="""Minimum gap size between accessible sequence
regions. Regions separated by less than this distance will
be joined together. [Default: %(default)s]""")
P_access.add_argument("-x", "--exclude", action="append", default=[],
help="""Additional regions to exclude, in BED format. Can be
used multiple times.""")
P_access.add_argument("-o", "--output", metavar="FILENAME",
type=argparse.FileType('w'), default=sys.stdout,
help="Output file name")
P_access.set_defaults(func=_cmd_access)
# antitarget ------------------------------------------------------------------
do_antitarget = public(antitarget.do_antitarget)
def _cmd_antitarget(args):
"""Derive off-target ("antitarget") bins from target regions."""
targets = tabio.read_auto(args.targets)
access = tabio.read_auto(args.access) if args.access else None
out_arr = antitarget.do_antitarget(targets, access, args.avg_size,
args.min_size)
if not args.output:
base, ext = args.interval.rsplit('.', 1)
args.output = base + '.antitarget.' + ext
tabio.write(out_arr, args.output, "bed4")
P_anti = AP_subparsers.add_parser('antitarget', help=_cmd_antitarget.__doc__)
P_anti.add_argument('targets',
help="""BED or interval file listing the targeted regions.""")
P_anti.add_argument('-g', '--access', metavar="FILENAME",
help="""Regions of accessible sequence on chromosomes (.bed), as
output by genome2access.py.""")
P_anti.add_argument('-a', '--avg-size', type=int, default=150000,
help="""Average size of antitarget bins (results are approximate).
[Default: %(default)s]""")
P_anti.add_argument('-m', '--min-size', type=int,
help="""Minimum size of antitarget bins (smaller regions are dropped).
[Default: 1/16 avg size, calculated]""")
P_anti.add_argument('-o', '--output', metavar="FILENAME",
help="""Output file name.""")
P_anti.set_defaults(func=_cmd_antitarget)
# autobin ---------------------------------------------------------------------
do_autobin = public(autobin.do_autobin)
def _cmd_autobin(args):
"""Quickly calculate reasonable bin sizes from BAM read counts."""
if args.method in ('hybrid', 'amplicon') and not args.targets:
raise RuntimeError("Sequencing method %r requires targets (-t)",
args.method)
if args.method == 'wgs':
if not args.access:
raise RuntimeError("Sequencing method 'wgs' requires accessible "
"regions (-g)")
if args.targets:
logging.warning("Targets will be ignored: %s", args.targets)
if args.method == 'amplicon' and args.access:
logging.warning("Sequencing-accessible regions will be ignored: %s",
args.access)
def read_regions(bed_fname):
if bed_fname:
regions = tabio.read_auto(bed_fname)
if len(regions):
return regions
else:
logging.warning("No regions to estimate depth from %s",
regions.meta.get('filename', ''))
tgt_arr = read_regions(args.targets)
access_arr = read_regions(args.access)
bam_fname = autobin.midsize_file(args.bams)
fields = autobin.do_autobin(bam_fname, args.method, tgt_arr, access_arr,
args.bp_per_bin, args.target_min_size,
args.target_max_size, args.antitarget_min_size,
args.antitarget_max_size)
(_tgt_depth, tgt_bin_size), (_anti_depth, anti_bin_size) = fields
# Create & write BED files
target_out_arr = target.do_target(access_arr if args.method == 'wgs'
else tgt_arr,
args.annotate, args.short_names,
do_split=True, avg_size=tgt_bin_size)
tgt_name_base = tgt_arr.sample_id if tgt_arr else core.fbase(bam_fname)
target_bed = tgt_name_base + '.target.bed'
tabio.write(target_out_arr, target_bed, "bed4")
if args.method == "hybrid" and anti_bin_size:
# Build antitarget BED from the given targets
anti_arr = antitarget.do_antitarget(target_out_arr,
access=access_arr,
avg_bin_size=anti_bin_size,
min_bin_size=args.antitarget_min_size)
else:
# No antitargets for wgs, amplicon
anti_arr = _GA([])
antitarget_bed = tgt_name_base + '.antitarget.bed'
tabio.write(anti_arr, antitarget_bed, "bed4")
# Print depths & bin sizes as a table on stdout
labels = ("Target", "Antitarget")
width = max(map(len, labels)) + 1
print(" " * width, "Depth", "Bin size", sep='\t')
for label, (depth, binsize) in zip(labels, fields):
if depth is not None:
print((label + ":").ljust(width),
format(depth, ".3f"),
binsize,
sep='\t')
P_autobin = AP_subparsers.add_parser('autobin', help=_cmd_autobin.__doc__)
P_autobin.add_argument('bams', nargs='+',
help="""Sample BAM file(s) to test for target coverage""")
P_autobin.add_argument('-m', '--method',
choices=('hybrid', 'amplicon', 'wgs'), default='hybrid',
help="""Sequencing protocol: hybridization capture ('hybrid'), targeted
amplicon sequencing ('amplicon'), or whole genome sequencing
('wgs'). Determines whether and how to use antitarget bins.
[Default: %(default)s]""")
P_autobin.add_argument('-g', '--access', metavar="FILENAME",
help="""Sequencing-accessible genomic regions, or exons to use as
possible targets (e.g. output of refFlat2bed.py)""")
P_autobin.add_argument('-t', '--targets',
help="""Potentially targeted genomic regions, e.g. all possible exons
for the reference genome. Format: BED, interval list, etc.""")
P_autobin.add_argument('-b', '--bp-per-bin',
type=float, default=100000.,
help="""Desired average number of sequencing read bases mapped to each
bin. [Default: %(default)s]""")
P_autobin.add_argument('--target-max-size', metavar="BASES",
type=int, default=20000,
help="Maximum size of target bins. [Default: %(default)s]")
P_autobin.add_argument('--target-min-size', metavar="BASES",
type=int, default=20,
help="Minimum size of target bins. [Default: %(default)s]")
P_autobin.add_argument('--antitarget-max-size', metavar="BASES",
type=int, default=500000,
help="Maximum size of antitarget bins. [Default: %(default)s]")
P_autobin.add_argument('--antitarget-min-size', metavar="BASES",
type=int, default=500,
help="Minimum size of antitarget bins. [Default: %(default)s]")
P_autobin.add_argument('--annotate', metavar="FILENAME",
help="""Use gene models from this file to assign names to the target
regions. Format: UCSC refFlat.txt or ensFlat.txt file
(preferred), or BED, interval list, GFF, or similar.""")
P_autobin.add_argument('--short-names', action='store_true',
help="Reduce multi-accession bait labels to be short and consistent.")
# Option: --dry-run to not write BED files?
# P_autobin.add_argument('-o', '--output', help="Output filename.")
P_autobin.set_defaults(func=_cmd_autobin)
# coverage --------------------------------------------------------------------
do_coverage = public(coverage.do_coverage)
def _cmd_coverage(args):
"""Calculate coverage in the given regions from BAM read depths."""
pset = coverage.do_coverage(args.interval, args.bam_file, args.count,
args.min_mapq, args.processes)
if not args.output:
# Create an informative but unique name for the coverage output file
bambase = core.fbase(args.bam_file)
bedbase = core.fbase(args.interval)
tgtbase = ('antitargetcoverage'
if 'anti' in bedbase.lower()
else 'targetcoverage')
args.output = '%s.%s.cnn' % (bambase, tgtbase)
if os.path.exists(args.output):
args.output = '%s.%s.cnn' % (bambase, bedbase)
core.ensure_path(args.output)
tabio.write(pset, args.output)
P_coverage = AP_subparsers.add_parser('coverage', help=_cmd_coverage.__doc__)
P_coverage.add_argument('bam_file', help="Mapped sequence reads (.bam)")
P_coverage.add_argument('interval', help="Intervals (.bed or .list)")
P_coverage.add_argument('-c', '--count', action='store_true',
help="""Get read depths by counting read midpoints within each bin.
(An alternative algorithm).""")
P_coverage.add_argument('-q', '--min-mapq', type=int, default=0,
help="""Minimum mapping quality score (phred scale 0-60) to count a read
for coverage depth. [Default: %(default)s]""")
P_coverage.add_argument('-o', '--output', metavar="FILENAME",
help="""Output file name.""")
P_coverage.add_argument('-p', '--processes',
nargs='?', type=int, const=0, default=1,
help="""Number of subprocesses to calculate coverage in parallel.
Without an argument, use the maximum number of available CPUs.
[Default: use 1 process]""")
P_coverage.set_defaults(func=_cmd_coverage)
# reference -------------------------------------------------------------------
do_reference = public(reference.do_reference)
do_reference_flat = public(reference.do_reference_flat)
def _cmd_reference(args):
"""Compile a coverage reference from the given files (normal samples)."""
usage_err_msg = ("Give .cnn samples OR targets and (optionally) antitargets.")
if args.targets:
# Flat refence
assert not args.references, usage_err_msg
ref_probes = reference.do_reference_flat(args.targets, args.antitargets,
args.fasta,
args.male_reference)
elif args.references:
# Pooled reference
assert not args.targets and not args.antitargets, usage_err_msg
filenames = []
for path in args.references:
if os.path.isdir(path):
filenames.extend(os.path.join(path, f) for f in os.listdir(path)
if f.endswith('targetcoverage.cnn'))
else:
filenames.append(path)
targets = [f for f in filenames if 'antitarget' not in f]
antitargets = [f for f in filenames if 'antitarget' in f]
logging.info("Number of target and antitarget files: %d, %d",
len(targets), len(antitargets))
female_samples = ((args.sample_sex.lower() not in ['y', 'm', 'male'])
if args.sample_sex else None)
ref_probes = reference.do_reference(targets, antitargets, args.fasta,
args.male_reference, female_samples,
args.do_gc, args.do_edge,
args.do_rmask, args.cluster,
args.min_cluster_size)
else:
raise ValueError(usage_err_msg)
ref_fname = args.output or "cnv_reference.cnn"
core.ensure_path(ref_fname)
tabio.write(ref_probes, ref_fname)
P_reference = AP_subparsers.add_parser('reference', help=_cmd_reference.__doc__)
P_reference.add_argument('references', nargs='*',
help="""Normal-sample target or antitarget .cnn files, or the
directory that contains them.""")
P_reference.add_argument('-f', '--fasta',
help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_reference.add_argument('-o', '--output', metavar="FILENAME",
help="Output file name.")
P_reference.add_argument('-c', '--cluster',
action='store_true',
help="""Calculate and store summary stats for clustered subsets of the
normal samples with similar coverage profiles.""")
P_reference.add_argument('--min-cluster-size',
metavar="NUM",
type=int,
default=4,
help="""Minimum cluster size to keep in reference profiles.""")
P_reference.add_argument('-x', '--sample-sex', '-g', '--gender',
dest='sample_sex',
choices=('m', 'y', 'male', 'Male', 'f', 'x', 'female', 'Female'),
help="""Specify the chromosomal sex of all given samples as male or
female. (Default: guess each sample from coverage of X and Y
chromosomes).""")
P_reference.add_argument('-y', '--male-reference', '--haploid-x-reference',
action='store_true',
help="""Create a male reference: shift female samples' chrX
log-coverage by -1, so the reference chrX average is -1.
Otherwise, shift male samples' chrX by +1, so the reference chrX
average is 0.""")
P_reference_flat = P_reference.add_argument_group(
"To construct a generic, \"flat\" copy number reference with neutral "
"expected coverage")
P_reference_flat.add_argument('-t', '--targets',
help="Target intervals (.bed or .list)")
P_reference_flat.add_argument('-a', '--antitargets',
help="Antitarget intervals (.bed or .list)")
P_reference_bias = P_reference.add_argument_group(
"To disable specific automatic bias corrections")
P_reference_bias.add_argument('--no-gc', dest='do_gc', action='store_false',
help="Skip GC correction.")
P_reference_bias.add_argument('--no-edge', dest='do_edge', action='store_false',
help="Skip edge-effect correction.")
P_reference_bias.add_argument('--no-rmask', dest='do_rmask', action='store_false',
help="Skip RepeatMasker correction.")
P_reference.set_defaults(func=_cmd_reference)
# fix -------------------------------------------------------------------------
do_fix = public(fix.do_fix)
def _cmd_fix(args):
"""Combine target and antitarget coverages and correct for biases.
Adjust raw coverage data according to the given reference, correct potential
biases and re-center.
"""
# Verify that target and antitarget are from the same sample
tgt_raw = read_cna(args.target, sample_id=args.sample_id)
anti_raw = read_cna(args.antitarget, sample_id=args.sample_id)
if len(anti_raw) and tgt_raw.sample_id != anti_raw.sample_id:
raise ValueError("Sample IDs do not match:"
"'%s' (target) vs. '%s' (antitarget)"
% (tgt_raw.sample_id, anti_raw.sample_id))
target_table = fix.do_fix(tgt_raw, anti_raw, read_cna(args.reference),
args.do_gc, args.do_edge, args.do_rmask,
args.cluster)
tabio.write(target_table, args.output or tgt_raw.sample_id + '.cnr')
P_fix = AP_subparsers.add_parser('fix', help=_cmd_fix.__doc__)
P_fix.add_argument('target',
help="Target coverage file (.targetcoverage.cnn).")
P_fix.add_argument('antitarget',
help="Antitarget coverage file (.antitargetcoverage.cnn).")
P_fix.add_argument('reference',
help="Reference coverage (.cnn).")
P_fix.add_argument('-c', '--cluster',
action='store_true',
help="""Compare and use cluster-specific values present in the
reference profile. (Requires that the reference profile
was built with the --cluster option.)""")
P_fix.add_argument('-i', '--sample-id',
help="Sample ID for target/antitarget files. Otherwise inferred from file names.")
# P_fix.add_argument('--do-gc', action='store_true', default=True,
# help="Do GC correction.")
# P_fix.add_argument('--do-edge', action='store_true',
# help="Do edge-effect correction.")
# P_fix.add_argument('--do-size', action='store_true',
# help="Do interval-size correction.")
P_fix.add_argument('--no-gc', dest='do_gc', action='store_false',
help="Skip GC correction.")
P_fix.add_argument('--no-edge', dest='do_edge', action='store_false',
help="Skip edge-effect correction.")
P_fix.add_argument('--no-rmask', dest='do_rmask', action='store_false',
help="Skip RepeatMasker correction.")
P_fix.add_argument('-o', '--output', metavar="FILENAME",
help="Output file name.")
P_fix.set_defaults(func=_cmd_fix)
# segment ---------------------------------------------------------------------
do_segmentation = public(segmentation.do_segmentation)
def _cmd_segment(args):
"""Infer copy number segments from the given coverage table."""
cnarr = read_cna(args.filename)
variants = load_het_snps(args.vcf, args.sample_id, args.normal_id,
args.min_variant_depth, args.zygosity_freq)
results = segmentation.do_segmentation(cnarr, args.method, args.threshold,
variants=variants,
skip_low=args.drop_low_coverage,
skip_outliers=args.drop_outliers,
save_dataframe=bool(args.dataframe),
rscript_path=args.rscript_path,
processes=args.processes)
if args.dataframe:
segments, dframe = results
with open(args.dataframe, 'w') as handle:
handle.write(dframe)
logging.info("Wrote %s", args.dataframe)
else:
segments = results
tabio.write(segments, args.output or segments.sample_id + '.cns')
P_segment = AP_subparsers.add_parser('segment', help=_cmd_segment.__doc__)
P_segment.add_argument('filename',
help="Bin-level log2 ratios (.cnr file), as produced by 'fix'.")
P_segment.add_argument('-o', '--output', metavar="FILENAME",
help="Output table file name (CNR-like table of segments, .cns).")
P_segment.add_argument('-d', '--dataframe',
help="""File name to save the raw R dataframe emitted by CBS or
Fused Lasso. (Useful for debugging.)""")
P_segment.add_argument('-m', '--method', default='cbs',
choices=('cbs', 'flasso', 'haar', 'none',
'hmm', 'hmm-tumor', 'hmm-germline'),
help="""Segmentation method (CBS, fused lasso, haar wavelet, HMM), or
'none' for chromosome arm-level averages as segments.
[Default: %(default)s]""")
P_segment.add_argument('-t', '--threshold', type=float,
help="""Significance threshold (p-value or FDR, depending on method) to
accept breakpoints during segmentation.
For HMM methods, this is the smoothing window size.""")
P_segment.add_argument("--drop-low-coverage", action='store_true',
help="""Drop very-low-coverage bins before segmentation to avoid
false-positive deletions in poor-quality tumor samples.""")
P_segment.add_argument("--drop-outliers", metavar="FACTOR",
type=float, default=10,
help="""Drop outlier bins more than this many multiples of the 95th
quantile away from the average within a rolling window.
Set to 0 for no outlier filtering.
[Default: %(default)g]""")
P_segment.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
[Default: %(default)s]""")
P_segment.add_argument('-p', '--processes',
nargs='?', type=int, const=0, default=1,
help="""Number of subprocesses to segment in parallel.
Give 0 or a negative value to use the maximum number
of available CPUs. [Default: use 1 process]""")
P_segment_vcf = P_segment.add_argument_group(
"To additionally segment SNP b-allele frequencies")
P_segment_vcf.add_argument('-v', '--vcf', metavar="FILENAME",
help="""VCF file name containing variants for segmentation by allele
frequencies.""")
P_segment_vcf.add_argument('-i', '--sample-id',
help="""Specify the name of the sample in the VCF (-v/--vcf) to use for
b-allele frequency extraction and as the default plot title.""")
P_segment_vcf.add_argument('-n', '--normal-id',
help="""Corresponding normal sample ID in the input VCF (-v/--vcf).
This sample is used to select only germline SNVs to plot
b-allele frequencies.""")
P_segment_vcf.add_argument('--min-variant-depth', type=int, default=20,
help="""Minimum read depth for a SNV to be displayed in the b-allele
frequency plot. [Default: %(default)s]""")
P_segment_vcf.add_argument('-z', '--zygosity-freq',
metavar='ALT_FREQ', nargs='?', type=float, const=0.25,
help="""Ignore VCF's genotypes (GT field) and instead infer zygosity
from allele frequencies. [Default if used without a number:
%(const)s]""")
P_segment.set_defaults(func=_cmd_segment)
# call ------------------------------------------------------------------------
do_call = public(call.do_call)
def _cmd_call(args):
"""Call copy number variants from segmented log2 ratios."""
if args.purity and not 0.0 < args.purity <= 1.0:
raise RuntimeError("Purity must be between 0 and 1.")
cnarr = read_cna(args.filename)
if args.center_at:
logging.info("Shifting log2 values by %f", -args.center_at)
cnarr['log2'] -= args.center_at
elif args.center:
cnarr.center_all(args.center, skip_low=args.drop_low_coverage,
verbose=True)
varr = load_het_snps(args.vcf, args.sample_id, args.normal_id,
args.min_variant_depth, args.zygosity_freq)
is_sample_female = (verify_sample_sex(cnarr, args.sample_sex,
args.male_reference)
if args.purity and args.purity < 1.0
else None)
cnarr = call.do_call(cnarr, varr, args.method, args.ploidy, args.purity,
args.male_reference, is_sample_female, args.filters,
args.thresholds)
tabio.write(cnarr, args.output or cnarr.sample_id + '.call.cns')
def csvstring(text):
return tuple(map(float, text.split(",")))
P_call = AP_subparsers.add_parser('call', help=_cmd_call.__doc__)
P_call.add_argument('filename',
help="Copy ratios (.cnr or .cns).")
P_call.add_argument("--center", nargs='?', const='median',
choices=('mean', 'median', 'mode', 'biweight'),
help="""Re-center the log2 ratio values using this estimator of the
center or average value. ('median' if no argument given.)""")
P_call.add_argument("--center-at", type=float,
help="""Subtract a constant number from all log2 values. For "manual"
re-centering, in case the --center option gives unsatisfactory
results.)""")
P_call.add_argument('--filter', action='append', default=[], dest='filters',
choices=('ampdel', 'cn', 'ci', 'sem', # 'bic'
),
help="""Merge segments flagged by the specified filter(s) with the
adjacent segment(s).""")
P_call.add_argument('-m', '--method',
choices=('threshold', 'clonal', 'none'), default='threshold',
help="""Calling method. [Default: %(default)s]""")
P_call.add_argument('-t', '--thresholds',
type=csvstring, default="-1.1,-0.25,0.2,0.7",
help="""Hard thresholds for calling each integer copy number, separated
by commas. Use the '=' sign on the command line, e.g.: -t=-1,0,1
[Default: %(default)s]""")
P_call.add_argument("--ploidy", type=int, default=2,
help="Ploidy of the sample cells. [Default: %(default)d]")
P_call.add_argument("--purity", type=float,
help="Estimated tumor cell fraction, a.k.a. purity or cellularity.")
P_call.add_argument("--drop-low-coverage", action='store_true',
help="""Drop very-low-coverage bins before segmentation to avoid
false-positive deletions in poor-quality tumor samples.""")
P_call.add_argument('-x', '--sample-sex', '-g', '--gender', dest='sample_sex',
choices=('m', 'y', 'male', 'Male', 'f', 'x', 'female', 'Female'),
help="""Specify the sample's chromosomal sex as male or female.
(Otherwise guessed from X and Y coverage).""")
P_call.add_argument('-y', '--male-reference', '--haploid-x-reference',
action='store_true',
help="""Was a male reference used? If so, expect half ploidy on
chrX and chrY; otherwise, only chrY has half ploidy. In CNVkit,
if a male reference was used, the "neutral" copy number (ploidy)
of chrX is 1; chrY is haploid for either reference sex.""")
P_call.add_argument('-o', '--output', metavar="FILENAME",
help="Output table file name (CNR-like table of segments, .cns).")
P_call_vcf = P_call.add_argument_group(
"To additionally process SNP b-allele frequencies for allelic copy number")
P_call_vcf.add_argument('-v', '--vcf', metavar="FILENAME",
help="""VCF file name containing variants for calculation of b-allele
frequencies.""")
P_call_vcf.add_argument('-i', '--sample-id',
help="""Name of the sample in the VCF (-v/--vcf) to use for b-allele
frequency extraction.""")
P_call_vcf.add_argument('-n', '--normal-id',
help="""Corresponding normal sample ID in the input VCF (-v/--vcf).
This sample is used to select only germline SNVs to calculate
b-allele frequencies.""")
P_call_vcf.add_argument('--min-variant-depth', type=int, default=20,
help="""Minimum read depth for a SNV to be used in the b-allele
frequency calculation. [Default: %(default)s]""")
P_call_vcf.add_argument('-z', '--zygosity-freq',
metavar='ALT_FREQ', nargs='?', type=float, const=0.25,
help="""Ignore VCF's genotypes (GT field) and instead infer zygosity
from allele frequencies. [Default if used without a number:
%(const)s]""")
P_call.set_defaults(func=_cmd_call)
# _____________________________________________________________________________
# Plots and graphics
# diagram ---------------------------------------------------------------------
def _cmd_diagram(args):
"""Draw copy number (log2 coverages, CBS calls) on chromosomes as a diagram.
If both the raw probes and segments are given, show them side-by-side on
each chromosome (segments on the left side, probes on the right side).
"""
if not args.filename and not args.segment:
raise ValueError("Must specify a filename as an argument or with "
"the '-s' option, or both. You did neither.")
cnarr = read_cna(args.filename) if args.filename else None
segarr = read_cna(args.segment) if args.segment else None
if args.adjust_xy:
is_sample_female = verify_sample_sex(cnarr or segarr, args.sample_sex,
args.male_reference)
if cnarr:
cnarr = cnarr.shift_xx(args.male_reference, is_sample_female)
if segarr:
segarr = segarr.shift_xx(args.male_reference, is_sample_female)
outfname = diagram.create_diagram(cnarr, segarr, args.threshold,
args.min_probes, args.output, args.title)
logging.info("Wrote %s", outfname)
P_diagram = AP_subparsers.add_parser('diagram', help=_cmd_diagram.__doc__)
P_diagram.add_argument('filename', nargs='?',
help="""Processed coverage data file (*.cnr), the output of the
'fix' sub-command.""")
P_diagram.add_argument('-s', '--segment',
help="Segmentation calls (.cns), the output of the 'segment' command.")
P_diagram.add_argument('-t', '--threshold', type=float, default=0.5,
help="""Copy number change threshold to label genes.
[Default: %(default)s]""")
P_diagram.add_argument('-m', '--min-probes', type=int, default=3,
help="""Minimum number of covered probes to label a gene.
[Default: %(default)d]""")
P_diagram.add_argument('-y', '--male-reference', '--haploid-x-reference',
action='store_true',
help="""Assume inputs were normalized to a male reference
(i.e. female samples will have +1 log-CNR of chrX;
otherwise male samples would have -1 chrX).""")
P_diagram.add_argument('-x', '--sample-sex', '-g', '--gender',
dest='sample_sex',
choices=('m', 'y', 'male', 'Male', 'f', 'x', 'female', 'Female'),
help="""Specify the sample's chromosomal sex as male or female.
(Otherwise guessed from X and Y coverage).""")
P_diagram.add_argument('--no-shift-xy', dest='adjust_xy', action='store_false',
help="Don't adjust the X and Y chromosomes according to sample sex.")
P_diagram.add_argument('-o', '--output', metavar="FILENAME",
help="Output PDF file name.")
P_diagram_aes = P_diagram.add_argument_group("Plot aesthetics")
P_diagram_aes.add_argument('--title',
help="Plot title. [Default: sample ID, from filename or -i]")
P_diagram.set_defaults(func=_cmd_diagram)
# scatter ---------------------------------------------------------------------
do_scatter = public(scatter.do_scatter)
def _cmd_scatter(args):
"""Plot probe log2 coverages and segmentation calls together."""
cnarr = read_cna(args.filename, sample_id=args.sample_id
) if args.filename else None
segarr = read_cna(args.segment, sample_id=args.sample_id
) if args.segment else None
varr = load_het_snps(args.vcf, args.sample_id, args.normal_id,
args.min_variant_depth, args.zygosity_freq)
scatter_opts = {k: v for k, v in (
("do_trend", args.trend),
("by_bin", args.by_bin),
("window_width", args.width),
("y_min", args.y_min),
("y_max", args.y_max),
("antitarget_marker", args.antitarget_marker),
("segment_color", args.segment_color),
) if v is not None}
if args.range_list:
with PdfPages(args.output) as pdf_out:
for region in tabio.read_auto(args.range_list).coords():
try:
if args.title is not None:
scatter_opts["title"] = "%s %s" % (args.title,
region.chromosome)
scatter.do_scatter(cnarr, segarr, varr, show_range=region,
**scatter_opts)
except ValueError as exc:
# Probably no bins in the selected region
logging.warning("Not plotting region %r: %s",
to_label(region), exc)
pdf_out.savefig()
pyplot.close()
else:
if args.title is not None:
scatter_opts["title"] = args.title
scatter.do_scatter(cnarr, segarr, varr, args.chromosome, args.gene,
**scatter_opts)
if args.output:
oformat = os.path.splitext(args.output)[-1].replace(".", "")
pyplot.savefig(args.output, format=oformat, bbox_inches="tight")
logging.info("Wrote %s", args.output)
else:
pyplot.show()
P_scatter = AP_subparsers.add_parser('scatter', help=_cmd_scatter.__doc__)
P_scatter.add_argument('filename', nargs="?",
help="""Processed bin-level copy ratios (*.cnr), the output
of the 'fix' sub-command.""")
P_scatter.add_argument('-s', '--segment', metavar="FILENAME",
help="Segmentation calls (.cns), the output of the 'segment' command.")
P_scatter.add_argument('-c', '--chromosome', metavar="RANGE",
help="""Chromosome or chromosomal range, e.g. 'chr1' or
'chr1:2333000-2444000', to display. If a range is given,
all targeted genes in this range will be shown, unless
-g/--gene is also given.""")
P_scatter.add_argument('-g', '--gene',
help="Name of gene or genes (comma-separated) to display.")
P_scatter.add_argument('-l', '--range-list',
help="""File listing the chromosomal ranges to display, as BED, interval
list or 'chr:start-end' text. Creates focal plots similar to
-c/--chromosome for each listed region, combined into a
multi-page PDF. The output filename must also be
specified (-o/--output).""")
P_scatter.add_argument('-w', '--width', type=float, default=1e6,
help="""Width of margin to show around the selected gene(s) (-g/--gene)
or small chromosomal region (-c/--chromosome).
[Default: %(default)d]""")
P_scatter.add_argument('-o', '--output', metavar="FILENAME",
help="Output PDF file name.")
P_scatter_aes = P_scatter.add_argument_group("Plot aesthetics")
P_scatter_aes.add_argument('-a', '--antitarget-marker',
metavar='CHARACTER', dest='antitarget_marker', default=None,
help="""Plot antitargets using this symbol when plotting in a selected
chromosomal region (-g/--gene or -c/--chromosome).
[Default: same as targets]""")
P_scatter_aes.add_argument("--by-bin", action="store_true",
help="""Plot data x-coordinates by bin indices instead of genomic
coordinates. All bins will be shown with equal width, no blank
regions will be shown, and x-axis values indicate bin number
(within chromosome) instead of genomic position.""")
P_scatter_aes.add_argument('--segment-color', default=scatter.SEG_COLOR,
help="""Plot segment lines in this color. Value can be any string
accepted by matplotlib, e.g. 'red' or '#CC0000'.""")
P_scatter_aes.add_argument('--title',
help="Plot title. [Default: sample ID, from filename or -i]")
P_scatter_aes.add_argument('-t', '--trend', action='store_true',
help="Draw a smoothed local trendline on the scatter plot.")
P_scatter_aes.add_argument('--y-max', type=float, help="y-axis upper limit.")
P_scatter_aes.add_argument('--y-min', type=float, help="y-axis lower limit.")
P_scatter_vcf = P_scatter.add_argument_group(
"To plot SNP b-allele frequencies")
P_scatter_vcf.add_argument('-v', '--vcf', metavar="FILENAME",
help="""VCF file name containing variants to plot for SNV b-allele
frequencies.""")
P_scatter_vcf.add_argument('-i', '--sample-id',
help="""Name of the sample in the VCF to use for b-allele frequency
extraction and as the default plot title.""")
P_scatter_vcf.add_argument('-n', '--normal-id',
help="""Corresponding normal sample ID in the input VCF. This sample is
used to select only germline SNVs to plot.""")
P_scatter_vcf.add_argument('-m', '--min-variant-depth', type=int, default=20,
help="""Minimum read depth for a SNV to be used in the b-allele
frequency calculation. [Default: %(default)s]""")
P_scatter_vcf.add_argument('-z', '--zygosity-freq',
metavar='ALT_FREQ', nargs='?', type=float, const=0.25,
help="""Ignore VCF's genotypes (GT field) and instead infer zygosity
from allele frequencies. [Default if used without a number: