-
Notifications
You must be signed in to change notification settings - Fork 296
/
abundance-dist.py
executable file
·128 lines (105 loc) · 4.54 KB
/
abundance-dist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#! /usr/bin/env python2
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2010-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project@idyll.org
#
# pylint: disable=missing-docstring,invalid-name
"""
Produce the k-mer abundance distribution for the given file.
% python scripts/abundance-dist.py [ -z -s ] <htname> <data> <histout>
Use '-h' for parameter help.
"""
from __future__ import print_function
import sys
import csv
import khmer
import argparse
import os
from khmer.kfile import check_file_status
from khmer.khmer_args import info
from khmer.utils import write_record
def get_parser():
parser = argparse.ArgumentParser(
description="Calculate abundance distribution of the k-mers in "
"the sequence file using a pre-made k-mer counting table.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('input_counting_table_filename', help='The name of the'
' input k-mer counting table file.')
parser.add_argument('input_sequence_filename', help='The name of the input'
' FAST[AQ] sequence file.')
parser.add_argument('output_histogram_filename', help='The columns are: '
'(1) k-mer abundance, (2) k-mer count, (3) cumulative '
'count, (4) fraction of total distinct k-mers.')
parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
action='store_false',
help='Do not output 0-count bins')
parser.add_argument('-s', '--squash', dest='squash_output', default=False,
action='store_true',
help='Overwrite output file if it exists')
parser.add_argument('--csv', default=False, action='store_true',
help='Use the CSV format for the histogram. '
'Includes column headers.')
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
def main():
info('abundance-dist.py', ['counting'])
args = get_parser().parse_args()
infiles = [args.input_counting_table_filename,
args.input_sequence_filename]
for infile in infiles:
check_file_status(infile, args.force)
print ('hashtable from', args.input_counting_table_filename,
file=sys.stderr)
counting_hash = khmer.load_counting_hash(
args.input_counting_table_filename)
kmer_size = counting_hash.ksize()
hashsizes = counting_hash.hashsizes()
tracking = khmer._Hashbits( # pylint: disable=protected-access
kmer_size, hashsizes)
print ('K:', kmer_size, file=sys.stderr)
print ('HT sizes:', hashsizes, file=sys.stderr)
print ('outputting to', args.output_histogram_filename, file=sys.stderr)
if os.path.exists(args.output_histogram_filename):
if not args.squash_output:
print('ERROR: %s exists; not squashing.' %
args.output_histogram_filename,
file=sys.stderr)
sys.exit(1)
print('** squashing existing file %s' %
args.output_histogram_filename, file=sys.stderr)
print('preparing hist...', file=sys.stderr)
abundances = counting_hash.abundance_distribution(
args.input_sequence_filename, tracking)
total = sum(abundances)
if 0 == total:
print("ERROR: abundance distribution is uniformly zero; "
"nothing to report.", file=sys.stderr)
print("\tPlease verify that the input files are valid.",
file=sys.stderr)
sys.exit(1)
hash_fp = open(args.output_histogram_filename, 'w')
if args.csv:
hash_fp_csv = csv.writer(hash_fp)
# write headers:
hash_fp_csv.writerow(['abundance', 'count', 'cumulative',
'cumulative_fraction'])
sofar = 0
for _, i in enumerate(abundances):
if i == 0 and not args.output_zero:
continue
sofar += i
frac = sofar / float(total)
if args.csv:
hash_fp_csv.writerow([_, i, sofar, round(frac, 3)])
else:
print(_, i, sofar, round(frac, 3), file=hash_fp)
if sofar == total:
break
if __name__ == '__main__':
main()
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79: