#! /usr/bin/env python
"""
Do gather matches on contigs => taxonomy, and save to JSON
"""
import sys
import argparse
import os.path
import json

import screed

import sourmash
from sourmash.lca.command_index import load_taxonomy_assignments
from sourmash.lca import LCA_Database

from .lineage_db import LineageDB
from .version import version
from .utils import (gather_at_rank, get_ident, ContigGatherInfo)


def main(args):
    "Main entry point for scripting. Use cmdline for command line entry."
    genomebase = os.path.basename(args.genome)
    match_rank = args.match_rank

    # load taxonomy CSV
    tax_assign, _ = load_taxonomy_assignments(args.lineages_csv,
                                              start_column=2)
    print(f'loaded {len(tax_assign)} tax assignments.')

    # load the genome signature
    genome_sig = sourmash.load_one_signature(args.genome_sig)

    # load the matches from prefetch as a picklist
    picklist = sourmash.picklist.SignaturePicklist('prefetch')
    try:
        picklist.load(args.matches_csv, picklist.column_name)
    except ValueError:
        with open(args.matches_csv, 'rt') as fp:
            contents = fp.read()
        if not len(contents):   # empty is ok.
            picklist = None
        else:
            raise
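    # (the 'prefetch' picklist type is keyed to sourmash's prefetch CSV
    # output; picklist.column_name supplies the matching column name.)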

    # load all of the matches in the database, as found by prefetch;
    # select on them; and then aggregate into MultiIndex.
    # CTB note: currently, this loads all the signatures into memory.
    # Alternatively we could do something with LazyLoadedIndex maybe?
    siglist = []
    for filename in args.databases:
        db = sourmash.load_file_as_index(filename)
        db = db.select(picklist=picklist)
        siglist += list(db.signatures())

    print(f"loaded {len(siglist)} matches from '{args.matches_csv}'")

    # Hack for examining members of our search database: remove exact matches.
    new_siglist = []
    for ss in siglist:
        if genome_sig.similarity(ss) == 1.0:
            print(f'removing an identical match: {ss.name}')
        else:
            new_siglist.append(ss)
    siglist = new_siglist

    # if, after removing exact match(es), there is nothing left, quit.
    # (but write an empty JSON file so that snakemake workflows don't
    # complain.)
    if not siglist:
        print('no non-identical matches for this genome, exiting.')
        contigs_tax = {}
        with open(args.json_out, 'wt') as fp:
            fp.write(json.dumps(contigs_tax))
        return 0

    # remove duplicate signatures in matches
    # workaround for issue of duplicate sigs in SBT, see sourmash/#1171
    new_siglist = []
    seen_md5 = set()
    for ss in siglist:
        ss_md5 = ss.md5sum()
        if ss_md5 not in seen_md5:
            new_siglist.append(ss)
            seen_md5.add(ss_md5)
        else:
            print(f'removing a duplicate match: {ss.name}')
    siglist = new_siglist

    # construct a template minhash object that we can use to create new,
    # empty minhashes for each contig.
    empty_mh = siglist[0].minhash.copy_and_clear()
    ksize = empty_mh.ksize
    scaled = empty_mh.scaled
    moltype = empty_mh.moltype
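    # (taking ksize/scaled/moltype from the first match keeps the per-contig
    # sketches directly comparable with the database signatures.)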

    # create empty LCA database to populate...
    lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype)
    lin_db = LineageDB()

    # ...with specific matches.
    for ss in siglist:
        ident = get_ident(ss)
        lineage = tax_assign[ident]

        lca_db.insert(ss, ident=ident)
        lin_db.insert(ident, lineage)

    print(f'loaded {len(siglist)} signatures & created LCA Database')

    print('')
    print(f'reading contigs from {genomebase}')

    screed_iter = screed.open(args.genome)
    contigs_tax = {}
    for n, record in enumerate(screed_iter):
        # look at each contig individually
        mh = empty_mh.copy_and_clear()
        mh.add_sequence(record.sequence, force=True)

        # collect all of the gather results at match_rank, together w/counts;
        # here, results is a list of (lineage, count) tuples.
        results = list(gather_at_rank(mh, lca_db, lin_db, match_rank))

        # store together with size of sequence.
        info = ContigGatherInfo(len(record.sequence), len(mh), results)
        contigs_tax[record.name] = info

    print(f"Processed {len(contigs_tax)} contigs.")
    # save!
    with open(args.json_out, 'wt') as fp:
        fp.write(json.dumps(contigs_tax))

    return 0


def cmdline(sys_args):
    "Command line entry point w/argparse action."
    p = argparse.ArgumentParser()
    p.add_argument('--genome', help='genome file', required=True)
    p.add_argument('--genome-sig', help='genome sig', required=True)
    p.add_argument('--matches-csv', help='all relevant matches', required=True)
    p.add_argument('--databases', help='sourmash databases', required=True,
                   nargs='+')
    p.add_argument('--lineages-csv', help='lineage spreadsheet', required=True)
    p.add_argument('--force', help='continue past survivable errors',
                   action='store_true')
    p.add_argument('--json-out',
                   help='JSON-format output file of all tax results',
                   required=True)
    p.add_argument('--match-rank', required=True)
    args = p.parse_args(sys_args)
    return main(args)


# execute this, when run with `python -m`.
if __name__ == '__main__':
    returncode = cmdline(sys.argv[1:])
    sys.exit(returncode)
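
# A downstream consumer might reload the results like so (a sketch; each
# ContigGatherInfo value comes back from JSON as a plain list, in the order
# the fields were passed above):
#
#   with open('genome.contigs-tax.json') as fp:
#       contigs_tax = json.load(fp)
#   for contig_name, (length, num_hashes, results) in contigs_tax.items():
#       ...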