# NOTE(review): the six lines that originally appeared here ("Switch
# branches/tags", "Nothing to show", "Find file Copy path", "Fetching
# contributors...", "Cannot retrieve contributors at this time",
# "executable file 236 lines (183 sloc) 7.93 KB") were GitHub web-page
# scrape residue, not program text; preserved as a comment so the file
# remains valid Python.
#! /usr/bin/env python
"""
Classify a directory of sourmash signatures for individual genomes by
taxonomic rank, using a kraken-style least-common-ancestor analysis.
"""
import os
import argparse
import collections
import sys
import csv
from pickle import load, dump

import sourmash_lib.signature
import lca_json                 # from the 2017-sourmash-lca project -- TODO confirm origin

# default LCA database(s) to search.
LCA_DBs = ['db/genbank.lca.json']

# scaled value passed to LCA_Database.get_database() in main().
# NOTE(review): this definition was missing from the damaged source;
# 10000 is the conventional value for these databases -- confirm.
SCALED = 10000

THRESHOLD = 5                   # how many counts of a taxid at minimum

# ranks reported, from most to least general.
want_taxonomy = ['superkingdom', 'phylum', 'order', 'class', 'family',
                 'genus', 'species']

# taxids considered too generic to be useful:
# Bacteria (2), cellular organisms (131567), Archaea (2157)
DEPRECATED_LCA_to_remove = set({ 2, 131567, 2157 })
def traverse_find_sigs(dirnames):
    """Find all the filenames ending with .sig under given directories.

    Walks each directory recursively and lazily yields the full path of
    every '.sig' file found.
    """
    for dirname in dirnames:
        for root, dirs, files in os.walk(dirname):
            for name in files:
                if name.endswith('.sig'):
                    fullname = os.path.join(root, name)
                    yield fullname
def load_all_signatures(dirname, ksize):
    """Load all signatures under given dirname with given ksize; return
    dictionary d[name] -> signature.

    Because this can be slow for many hundreds of signatures, cache the
    dict on disk using pickle (in '<dirname>.pickle').  On later runs,
    only files not already recorded in the cache are loaded, and the
    cache is rewritten if anything new was found.
    """
    sigd = {}
    filename_d = {}

    # cache file lives next to the directory: '<dirname>.pickle'
    pickle_cache = dirname.rstrip('/') + '.pickle'
    if os.path.exists(pickle_cache):
        print('loading from cache:', pickle_cache)
        with open(pickle_cache, 'rb') as fp:
            filename_d, sigd = load(fp)

    loaded_new = False

    n = 0
    for filename in traverse_find_sigs([dirname]):
        if filename not in filename_d:
            loaded_new = True
            # NOTE(review): the keyword below was truncated in the damaged
            # source; old sourmash_lib used 'select_ksize' -- confirm.
            sig = sourmash_lib.signature.load_one_signature(filename,
                                                            select_ksize=ksize)
            filename_d[filename] = 1
            # key the result dict by the signature's name, not filename
            # (the damaged source read 'sigd[] = sig').
            sigd[sig.name()] = sig
            n += 1

    if loaded_new:
        print('saving to cache:', pickle_cache)
        with open(pickle_cache, 'wb') as fp:
            dump((filename_d, sigd), fp)

    return sigd
def main():
    """Classify each genome signature in a directory against one or more
    LCA databases and write a per-signature taxonomy CSV plus a summary
    to stdout.
    """
    p = argparse.ArgumentParser()
    # NOTE(review): the positional 'dir' argument was missing from the
    # damaged source but args.dir is used below -- restored.
    p.add_argument('dir', help='directory of sourmash signatures to classify')
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument('--lca', nargs='+', default=LCA_DBs)
    p.add_argument('-o', '--output-csv')
    p.add_argument('--threshold', type=int, default=THRESHOLD,
                   help="minimum number of times a taxid must be present to count")
    args = p.parse_args()

    # choose the output CSV filename: explicit -o wins, otherwise derive
    # it from the directory name.
    if args.output_csv:
        output_filename = args.output_csv
    else:
        output_filename = os.path.basename(args.dir) + '.taxonomy.csv'

    outfp = open(output_filename, 'wt')
    outw = csv.writer(outfp)
    outw.writerow(['name', 'taxid', 'status', 'rank_info', 'lineage'])

    # load the LCA databases from the JSON file(s)
    lca_db_list = []
    for lca_filename in args.lca:
        print('loading LCA database from {}'.format(lca_filename))
        lca_db = lca_json.LCA_Database(lca_filename)
        taxfoo, hashval_to_lca, _ = lca_db.get_database(args.ksize, SCALED)
        lca_db_list.append((taxfoo, hashval_to_lca))
    # NOTE(review): 'taxfoo' from the *last* database is used for all
    # taxonomy lookups below, i.e. all databases are assumed to share
    # one taxonomy -- confirm.

    print('loading all signatures in directory:', args.dir)
    sigdict = load_all_signatures(args.dir, args.ksize)
    print('...loaded {} signatures at k={}'.format(len(sigdict), args.ksize))

    # track number of classifications at various rank levels
    classified_at = collections.defaultdict(int)
    # track number of disagreements at various rank levels
    disagree_at = collections.defaultdict(int)

    # for each minhash signature in the directory,
    n_in_lca = 0
    for name, sig in sigdict.items():
        # for each k-mer in each minhash signature, collect assigned taxids
        # across all databases (& count).
        taxid_set = collections.defaultdict(int)

        for hashval in sig.minhash.get_mins():
            # if a k-mer is present in multiple DBs, pull the
            # least-common-ancestor taxonomic node across all of the
            # DBs.
            this_hashval_taxids = set()
            for (_, hashval_to_lca) in lca_db_list:
                hashval_lca = hashval_to_lca.get(hashval)
                # skip misses and root (taxid 1) assignments
                if hashval_lca is not None and hashval_lca != 1:
                    this_hashval_taxids.add(hashval_lca)

            if this_hashval_taxids:
                this_hashval_lca = taxfoo.find_lca(this_hashval_taxids)
                if this_hashval_lca is not None:
                    taxid_set[this_hashval_lca] += 1

        # filter on given threshold - only taxids that show up in this
        # signature at least args.threshold times.
        abundant_taxids = set([k for (k, cnt) in taxid_set.items()
                               if cnt >= args.threshold])

        # remove root (taxid == 1) if it's in there:
        if 1 in abundant_taxids:
            abundant_taxids.remove(1)

        # default to nothing found. boo.
        status = 'nomatch'
        status_rank = ''
        taxid = 0

        # ok - out of the loop, got our LCAs, ...are there any left?
        if abundant_taxids:
            # increment number that are classifiable at *some* rank.
            n_in_lca += 1

            # NOTE(review): the method name below was truncated in the
            # damaged source and has been reconstructed -- confirm
            # against the taxonomy helper's API.
            try:
                disagree_below_rank, taxid_at_rank, disagree_taxids = \
                    taxfoo.get_lineage_first_disagreement(abundant_taxids,
                                                          want_taxonomy)
            except ValueError:
                # @CTB this is probably bad.
                assert 0

            # we found a disagreement - report the rank *at* the
            # disagreement, the lineage *above* the disagreement.
            if disagree_below_rank:
                # global record of disagreements
                disagree_at[disagree_below_rank] += 1

                list_at_rank = [taxfoo.get_taxid_name(r)
                                for r in disagree_taxids]
                list_at_rank = ", ".join(list_at_rank)

                print('{} has multiple LCA below {}: \'{}\''.format(
                    name, disagree_below_rank, list_at_rank))

                # set output
                status = 'disagree'
                status_rank = disagree_below_rank
                taxid = taxid_at_rank
            else:
                # found unambiguous! yay.
                status = 'found'
                taxid = taxfoo.get_lowest_lineage(abundant_taxids,
                                                  want_taxonomy)
                status_rank = taxfoo.get_taxid_rank(taxid)
                classified_at[status_rank] += 1

        # write the lineage string for the chosen taxid (if any).
        if taxid != 0:
            lineage_found = taxfoo.get_lineage(taxid, want_taxonomy)
            lineage_found = ";".join(lineage_found)
        else:
            lineage_found = ""

        outw.writerow([name, taxid, status, status_rank, lineage_found])

    # flush the CSV before printing the summary (the damaged source never
    # closed the file).
    outfp.close()

    print('classified sourmash signatures in directory: \'{}\''.format(args.dir))
    print('LCA databases used: {}'.format(', '.join(args.lca)))
    print('total signatures found: {}'.format(len(sigdict)))
    print('no classification information: {}'.format(len(sigdict) - n_in_lca))
    print('could classify {}'.format(n_in_lca))
    print('of those, {} disagree at some rank.'.format(sum(disagree_at.values())))

    print('number classified unambiguously, by lowest classification rank:')
    for rank in want_taxonomy:
        if classified_at.get(rank):
            print('\t{}: {}'.format(rank, classified_at.get(rank, 0)))

    print('disagreements by rank:')
    for rank in want_taxonomy:
        if disagree_at.get(rank):
            print('\t{}: {}'.format(rank, disagree_at.get(rank, 0)))

    print('classification output as CSV, here: {}'.format(output_filename))


if __name__ == '__main__':
    main()