-
Notifications
You must be signed in to change notification settings - Fork 295
/
make-initial-stoptags.py
executable file
·126 lines (102 loc) · 4.62 KB
/
make-initial-stoptags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#! /usr/bin/env python2
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project@idyll.org
#
# pylint: disable=invalid-name,missing-docstring
"""
Find an initial set of highly connected k-mers, to save on repartitioning time.
% python scripts/make-initial-stoptags.py <base>
"""
import sys
import textwrap
import khmer
from khmer.khmer_args import (build_counting_args, info)
from khmer.kfile import check_file_status, check_space
DEFAULT_SUBSET_SIZE = int(1e4)
DEFAULT_COUNTING_HT_SIZE = 3e6 # number of bytes
DEFAULT_COUNTING_HT_N = 4 # number of counting hash tables
# Lump removal parameters. Probably shouldn't be changed, but who knows?
#
# explanation:
#
# We will walk EXCURSION_DISTANCE out from each tag; if we find more than
# EXCURSION_KMER_THRESHOLD kmers within that range, this will be a "big"
# excursion and we will track all k-mers visited. If we find that any
# k-mer has been visited more than EXCURSION_KMER_COUNT_THRESHOLD times,
# we will mark it as BAD and make it a stop tag for traversal.
# don't change these!
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 200
EXCURSION_KMER_COUNT_THRESHOLD = 5
def get_parser():
epilog = """
Loads a k-mer presence table/tagset pair created by load-graph.py, and does
a small set of traversals from graph waypoints; on these traversals, looks
for k-mers that are repeatedly traversed in high-density regions of the
graph, i.e. are highly connected. Outputs those k-mers as an initial set of
stoptags, which can be fed into partition-graph.py, find-knots.py, and
filter-stoptags.py.
The k-mer counting table size options parameters are for a k-mer counting
table to keep track of repeatedly-traversed k-mers. The subset size option
specifies the number of waypoints from which to traverse; for highly
connected data sets, the default (1000) is probably ok.
"""
parser = build_counting_args(
descr="Find an initial set of highly connected k-mers.",
epilog=textwrap.dedent(epilog))
parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
dest='subset_size', type=float,
help='Set subset size (default 1e4 is prob ok)')
parser.add_argument('--stoptags', '-S', metavar='filename', default='',
help="Use stoptags in this file during partitioning")
parser.add_argument('graphbase', help='basename for input and output '
'filenames')
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
def main():
info('make-initial-stoptags.py', ['graph'])
args = get_parser().parse_args()
graphbase = args.graphbase
# @RamRS: This might need some more work
infiles = [graphbase + '.pt', graphbase + '.tagset']
if args.stoptags:
infiles.append(args.stoptags)
for _ in infiles:
check_file_status(_, args.force)
check_space(infiles, args.force)
print >>sys.stderr, 'loading htable %s.pt' % graphbase
htable = khmer.load_hashbits(graphbase + '.pt')
# do we want to load stop tags, and do they exist?
if args.stoptags:
print >>sys.stderr, 'loading stoptags from', args.stoptags
htable.load_stop_tags(args.stoptags)
print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
htable.load_tagset(graphbase + '.tagset')
ksize = htable.ksize()
counting = khmer.new_counting_hash(ksize, args.min_tablesize,
args.n_tables)
# divide up into SUBSET_SIZE fragments
divvy = htable.divide_tags_into_subsets(args.subset_size)
# pick off the first one
if len(divvy) == 1:
start, end = 0, 0
else:
start, end = divvy[:2]
# partition!
print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
subset = htable.do_subset_partition(start, end)
# now, repartition...
print >>sys.stderr, 'repartitioning to find HCKs.'
htable.repartition_largest_partition(subset, counting,
EXCURSION_DISTANCE,
EXCURSION_KMER_THRESHOLD,
EXCURSION_KMER_COUNT_THRESHOLD)
print >>sys.stderr, 'saving stop tags'
htable.save_stop_tags(graphbase + '.stoptags')
print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
if __name__ == '__main__':
main()