-
Notifications
You must be signed in to change notification settings - Fork 295
/
filter-abund-single.py
executable file
·135 lines (109 loc) · 4.46 KB
/
filter-abund-single.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#! /usr/bin/env python2
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2014. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project@idyll.org
#
# pylint: disable=missing-docstring,invalid-name
"""
Trim sequences at k-mers of the given abundance for the given file,
without loading a prebuilt counting table. Output sequences will be
placed in 'infile.abundfilt'.
% python scripts/filter-abund-single.py <data>
Use '-h' for parameter help.
"""
import os
import sys
import khmer
import threading
import textwrap
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer.khmer_args import (build_counting_args, report_on_config,
add_threading_args, info)
from khmer.file import (check_file_status, check_space,
check_space_for_hashtable)
#
DEFAULT_CUTOFF = 2
def get_parser():
epilog = """
Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt.
This script is constant memory.
To trim reads based on k-mer abundance across multiple files, use
:program:`load-into-counting.py` and :program:`filter-abund.py`.
Example::
filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
"""
parser = build_counting_args(
descr="Trims sequences at a minimum k-mer abundance "
"(in memory version).", epilog=textwrap.dedent(epilog))
add_threading_args(parser)
parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
help="Trim at k-mers below this abundance.")
parser.add_argument('--savetable', metavar="filename", default='',
help="If present, the name of the file to save the "
"k-mer counting table to")
parser.add_argument('datafile', metavar='input_sequence_filename',
help="FAST[AQ] sequence file to trim")
parser.add_argument('--report-total-kmers', '-t', action='store_true',
help="Prints the total number of k-mers to stderr")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
def main():
info('filter-abund-single.py', ['counting', 'SeqAn'])
args = get_parser().parse_args()
check_file_status(args.datafile, args.force)
check_space([args.datafile], args.force)
if args.savetable:
check_space_for_hashtable(
args.n_tables * args.min_tablesize, args.force)
report_on_config(args)
print >>sys.stderr, 'making k-mer counting table'
htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
args.n_tables)
# first, load reads into hash table
rparser = khmer.ReadParser(args.datafile)
threads = []
print >>sys.stderr, 'consuming input, round 1 --', args.datafile
for _ in xrange(args.threads):
cur_thread = \
threading.Thread(
target=htable.consume_fasta_with_reads_parser,
args=(rparser, )
)
threads.append(cur_thread)
cur_thread.start()
for _ in threads:
_.join()
if args.report_total_kmers:
print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
htable.n_unique_kmers())
fp_rate = khmer.calc_expected_collisions(htable)
print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
# now, trim.
# the filtering function.
def process_fn(record):
name = record['name']
seq = record['sequence']
if 'N' in seq:
return None, None
trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)
if trim_at >= args.ksize:
return name, trim_seq
return None, None
# the filtering loop
print >>sys.stderr, 'filtering', args.datafile
outfile = os.path.basename(args.datafile) + '.abundfilt'
outfp = open(outfile, 'w')
tsp = ThreadedSequenceProcessor(process_fn)
tsp.start(verbose_loader(args.datafile), outfp)
print >>sys.stderr, 'output in', outfile
if args.savetable:
print >>sys.stderr, 'Saving k-mer counting table filename', \
args.savetable
print >>sys.stderr, '...saving to', args.savetable
htable.save(args.savetable)
print >>sys.stderr, 'wrote to: ', outfile
if __name__ == '__main__':
main()