-
Notifications
You must be signed in to change notification settings - Fork 294
/
find-knots.py
executable file
·153 lines (123 loc) · 5.57 KB
/
find-knots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#! /usr/bin/env python
#
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project@idyll.org
#
# pylint: disable=invalid-name,missing-docstring
"""
Find highly-connected k-mers.
k-mers are output into a .stoptags file, for later use in partitioning.
% python scripts/find-knots.py <base>
"""
from __future__ import print_function
import argparse
import glob
import os
import textwrap
import khmer
import sys
from khmer.kfile import check_input_files, check_space
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args,
report_on_config, sanitize_epilog)
# Counting hash (countgraph) parameters used when building the temporary
# countgraph that tracks k-mer visit counts during traversal.
DEFAULT_COUNTING_HT_SIZE = 3e6  # number of bytes per table
DEFAULT_COUNTING_HT_N = 4  # number of counting hash tables
# Lump removal parameters. Probably shouldn't be changed, but who knows?
#
# explanation:
#
# We will walk EXCURSION_DISTANCE out from each tag; if we find more than
# EXCURSION_KMER_THRESHOLD kmers within that range, this will be a "big"
# excursion and we will track all k-mers visited. If we find that any
# k-mer has been visited more than EXCURSION_KMER_COUNT_THRESHOLD times,
# we will mark it as BAD and make it a stop tag for traversal.
# don't change these!
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 200
EXCURSION_KMER_COUNT_THRESHOLD = 2
# EXCURSION_KMER_COUNT_THRESHOLD=5 # -- works ok for non-diginormed data
def get_parser():
    """Build and return the argparse parser for this script.

    The parser is created via khmer's build_counting_args() so it carries
    the standard countgraph sizing options in addition to the positional
    graphbase argument and --force flag added here.
    """
    epilog = """
    Load an k-mer nodegraph/tagset pair created by
    :program:`load-into-graph.py`, and a set of pmap files created by
    :program:`partition-graph.py`. Go through each pmap file,
    select the largest partition in each, and do the same kind of traversal as
    in :program:`make-initial-stoptags.py` from each of the waypoints in that
    partition; this should identify all of the HCKs in that partition. These
    HCKs are output to <graphbase>.stoptags after each pmap file.
    Parameter choice is reasonably important. See the pipeline in
    :doc:`partitioning-big-data` for an example run.
    This script is not very scalable and may blow up memory and die horribly.
    You should be able to use the intermediate stoptags to restart the
    process, and if you eliminate the already-processed pmap files, you can
    continue where you left off.
    """
    knot_parser = build_counting_args(
        descr="Find all highly connected k-mers.",
        epilog=textwrap.dedent(epilog))
    knot_parser.add_argument(
        'graphbase',
        help='Basename for the input and output files.')
    knot_parser.add_argument(
        '-f', '--force',
        default=False, action='store_true',
        help='Continue past warnings')
    return knot_parser
def main():
    """Find highly-connected k-mers (knots) and write them to a stoptags file.

    Loads <graphbase> (nodegraph) and <graphbase>.tagset, optionally resumes
    from an existing <graphbase>.stoptags, then processes every
    <graphbase>.subset.*.pmap file: repartition the largest partition while
    counting k-mer visits, merge the subset, repartition again, and save the
    accumulated stop tags after each file. Processed pmap files are renamed
    with a '.processed' suffix so an interrupted run can be resumed.
    """
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()
    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = khmer.load_nodegraph(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    # Fail with a clear message rather than an IndexError below when the
    # glob matches nothing (e.g. partition-graph.py was never run).
    if not pmap_files:
        print('ERROR: no pmap files found matching %s.subset.*.pmap' %
              args.graphbase, file=sys.stderr)
        sys.exit(1)

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in',
          graphbase + '.stoptags', file=sys.stderr)
    if initial_stoptags:
        print(
            '(these output stoptags will include the already-loaded set)',
            file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph used to count k-mer visits during traversal
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge each subset, harvesting stop tags as we go
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = graph.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' %
              subset_file, file=sys.stderr)
        size = graph.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        # index is 0-based; report 1-based progress ("(1 of N)" for the
        # first file, not "(0 of N)").
        print('(%d of %d)\n' % (index + 1, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()