-
Notifications
You must be signed in to change notification settings - Fork 294
/
abundance-dist.py
executable file
·137 lines (111 loc) · 4.86 KB
/
abundance-dist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#! /usr/bin/env python
#
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2010-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project@idyll.org
#
# pylint: disable=missing-docstring
"""
Produce the k-mer abundance distribution for the given file.
% python scripts/abundance-dist.py [ -z -s ] <htname> <data> <histout>
Use '-h' for parameter help.
"""
from __future__ import print_function
import sys
import csv
import khmer
import argparse
import os
from khmer.kfile import check_input_files
from khmer.khmer_args import info
def get_parser():
parser = argparse.ArgumentParser(
description="Calculate abundance distribution of the k-mers in "
"the sequence file using a pre-made k-mer countgraph.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('input_count_graph_filename', help='The name of the'
' input k-mer countgraph file.')
parser.add_argument('input_sequence_filename', help='The name of the input'
' FAST[AQ] sequence file.')
parser.add_argument('output_histogram_filename', help='The columns are: '
'(1) k-mer abundance, (2) k-mer count, (3) cumulative '
'count, (4) fraction of total distinct k-mers.')
parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
action='store_false',
help='Do not output zero-count bins')
parser.add_argument('-s', '--squash', dest='squash_output', default=False,
action='store_true',
help='Overwrite existing output_histogram_filename')
parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
action='store_false',
help='Do not count k-mers past 255')
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Continue even if specified input files '
'do not exist or are empty.')
return parser
def main():
info('abundance-dist.py', ['counting'])
args = get_parser().parse_args()
infiles = [args.input_count_graph_filename,
args.input_sequence_filename]
for infile in infiles:
check_input_files(infile, False)
print('Counting graph from', args.input_count_graph_filename,
file=sys.stderr)
countgraph = khmer.load_countgraph(
args.input_count_graph_filename)
if not countgraph.get_use_bigcount() and args.bigcount:
print("WARNING: The loaded graph has bigcount DISABLED while bigcount"
" reporting is ENABLED--counts higher than 255 will not be "
"reported.",
file=sys.stderr)
countgraph.set_use_bigcount(args.bigcount)
kmer_size = countgraph.ksize()
hashsizes = countgraph.hashsizes()
tracking = khmer._Nodegraph( # pylint: disable=protected-access
kmer_size, hashsizes)
print('K:', kmer_size, file=sys.stderr)
print('outputting to', args.output_histogram_filename, file=sys.stderr)
if args.output_histogram_filename in ('-', '/dev/stdout'):
pass
elif os.path.exists(args.output_histogram_filename):
if not args.squash_output:
print('ERROR: %s exists; not squashing.' %
args.output_histogram_filename,
file=sys.stderr)
sys.exit(1)
print('** squashing existing file %s' %
args.output_histogram_filename, file=sys.stderr)
print('preparing hist...', file=sys.stderr)
abundances = countgraph.abundance_distribution(
args.input_sequence_filename, tracking)
total = sum(abundances)
if 0 == total:
print("ERROR: abundance distribution is uniformly zero; "
"nothing to report.", file=sys.stderr)
print("\tPlease verify that the input files are valid.",
file=sys.stderr)
sys.exit(1)
if args.output_histogram_filename in ('-', '/dev/stdout'):
countgraph_fp = sys.stdout
else:
countgraph_fp = open(args.output_histogram_filename, 'w')
countgraph_fp_csv = csv.writer(countgraph_fp)
# write headers:
countgraph_fp_csv.writerow(['abundance', 'count', 'cumulative',
'cumulative_fraction'])
sofar = 0
for _, i in enumerate(abundances):
if i == 0 and not args.output_zero:
continue
sofar += i
frac = sofar / float(total)
countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])
if sofar == total:
break
if __name__ == '__main__':
main()
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79: