# Display gather results w/taxonomy

In [1]:
from charcoal import utils
import sourmash
from sourmash.lca import taxlist, LineagePair
import collections

import plotly.graph_objects as go

import sourmash_sankey
import importlib

import os.path

import screed

from sourmash.lca.command_index import load_taxonomy_assignments
from sourmash.lca import LCA_Database

from charcoal.lineage_db import LineageDB
from charcoal.utils import (gather_at_rank, get_ident, ContigGatherInfo)

In [24]:
def gather_at_rank(mh, lca_db, match_rank):
    """Run gather against `lca_db` and aggregate shared-hash counts by lineage.

    This is a generator: nothing is executed until it is iterated.

    Parameters:
        mh: the query minhash. A copy is made and matches are removed from
            the copy as gather proceeds.
        lca_db: database object providing `gather()` plus the
            `ident_to_idx`, `idx_to_lid` and `lid_to_lineage` mappings.
        match_rank: rank passed to `pop_to_rank` for each matched lineage.

    Yields:
        (lineage, count) tuples, in `Counter.most_common()` order.

    Raises:
        KeyError: if the basename of a match's filename is not a key in
            `lca_db.ident_to_idx`.
    """
    import copy

    minhash = copy.copy(mh)
    query_sig = sourmash.SourmashSignature(minhash)

    # do the gather:
    counts = collections.Counter()
    while True:
        results = lca_db.gather(query_sig, threshold_bp=0)
        if not results:
            break

        # only the first result of each round is used
        _, match_sig, _ = results[0]

        # debugging trace of which signature was matched
        print('XXX', match_sig._name, match_sig.filename)

        # retrieve lineage & pop to match_rank; the identifier is taken to be
        # the basename of the filename recorded on the matching signature.
        match_ident = os.path.basename(match_sig.filename)
        try:
            match_idx = lca_db.ident_to_idx[match_ident]
        except KeyError:
            # a bare KeyError('') says nothing about what was looked up
            raise KeyError(
                f"no identifier {match_ident!r} in LCA database "
                f"(match filename: {match_sig.filename!r})"
            ) from None
        match_lid = lca_db.idx_to_lid[match_idx]
        match_lineage = lca_db.lid_to_lineage[match_lid]
        # NOTE(review): pop_to_rank is not among the names imported in the
        # visible cells - confirm it is defined/imported elsewhere.
        match_lineage = pop_to_rank(match_lineage, match_rank)

        # count at match_rank
        common = match_sig.minhash.count_common(query_sig.minhash)
        counts[match_lineage] += common

        # finish out gather algorithm: drop the matched hashes from the
        # query and search again with what remains.
        minhash.remove_many(match_sig.minhash.hashes)
        query_sig = sourmash.SourmashSignature(minhash)

    yield from counts.most_common()
        
def do(genome_filename, genome_sig_filename, lca_db):
    """Run gather at genus rank on every contig of a genome.

    Parameters:
        genome_filename: sequence file of contigs, opened with `screed.open`.
        genome_sig_filename: signature file for the genome; its minhash is
            used only as a template for the per-contig minhashes.
        lca_db: database handed through to `gather_at_rank`.

    Returns:
        dict mapping each record name to a `ContigGatherInfo` built from
        (sequence length, number of hashes in the contig minhash, list of
        (lineage, count) gather results).
    """
    genomebase = os.path.basename(genome_filename)
    match_rank = 'genus'

    # load the genome signature
    genome_sig = sourmash.load_one_signature(genome_sig_filename)

    # construct an empty template minhash that each contig's minhash is
    # cloned from.
    empty_mh = genome_sig.minhash.copy_and_clear()

    print('')
    print(f'reading contigs from {genomebase}')

    contigs_tax = {}
    # the context manager closes the sequence file even if gather raises
    with screed.open(genome_filename) as screed_iter:
        for record in screed_iter:
            # look at each contig individually
            mh = empty_mh.copy_and_clear()
            mh.add_sequence(record.sequence, force=True)

            # collect all the gather results at genus level, together
            # w/counts; here, results is a list of (lineage, count) tuples.
            results = list(gather_at_rank(mh, lca_db, match_rank))

            # store together with size of sequence.
            info = ContigGatherInfo(len(record.sequence), len(mh), results)
            contigs_tax[record.name] = info

    print(f"Processed {len(contigs_tax)} contigs.")
    return contigs_tax


In [25]:
# re-import sourmash_sankey so that edits to the module are picked up without
# restarting the kernel, then rebind GenomeSankeyFlow from the reloaded module.
importlib.reload(sourmash_sankey)
from sourmash_sankey import GenomeSankeyFlow

def make_sankey_fig(title, genome_lineage, contigs_info):
    """Build a sankey figure summarizing per-lineage hash counts.

    `contigs_info` maps contig names to objects exposing `gather_tax`
    (an iterable of (lineage, hashcount) pairs) and `num_hashes`.
    Hashes not covered by any gather match are first tallied under the
    flow object's unassigned lineage and then folded into
    `genome_lineage` before the links are made.

    Returns whatever `GenomeSankeyFlow.make_plotly_fig(title)` returns.
    """
    flow = GenomeSankeyFlow()

    hash_totals = collections.Counter()
    for info in contigs_info.values():
        # gather counts each hash only once, so the per-lineage counts for
        # a contig do not overlap and can simply be summed.
        assigned = 0
        for lineage, n_hashes in info.gather_tax:
            hash_totals[lineage] += n_hashes
            assigned += n_hashes

        # whatever is left over was not matched => unassigned lineage
        hash_totals[flow.unassigned_lin] += info.num_hashes - assigned

    # the genome's own lineage gets the main color
    main_lineage = tuple(genome_lineage)
    flow.colors[main_lineage] = "lightseagreen"

    # anything that disagrees at phylum level is drawn in palevioletred
    for lineage in hash_totals:
        if utils.is_lineage_match(lineage, main_lineage, 'phylum'):
            continue
        flow.colors[lineage] = 'palevioletred'

    # fold the unassigned hashes into the genome lineage, then drop the
    # unassigned entry so it is not linked separately.
    hash_totals[main_lineage] += hash_totals[flow.unassigned_lin]
    del hash_totals[flow.unassigned_lin]

    flow.make_links(main_lineage, hash_totals)
    return flow.make_plotly_fig(title)

# expected lineage of the genome, parsed from a comma-separated string
genome_lin = utils.make_lineage(
    'd__Archaea,p__Thermoplasmatota,c__Poseidoniia,o__Poseidoniales,f__Thalassoarchaeaceae,g__MGIIb-O3,s__MGIIb-O3 sp002722735'
)

lca_db = sourmash.load_file_as_index('xxx.lca.json')

# list the identifiers the database knows about
print([*lca_db.ident_to_name.keys()])

# per-contig gather results, then the figure as the cell's final expression
contigs_info = do(
    genome_filename='tobg_np-110.fa',
    genome_sig_filename='tobg_np-110.k31.sig',
    lca_db=lca_db,
)
make_sankey_fig(
    title='tobg_np-110',
    genome_lineage=genome_lin,
    contigs_info=contigs_info,
)

['GCA_002722735.1_genomic.fna.gz', 'GCA_003193925.1_genomic.fna.gz', 'GCA_002495735.1_genomic.fna.gz', 'GCA_002503285.1_genomic.fna.gz', 'GCA_002457145.1_genomic.fna.gz', 'GCA_002507125.1_genomic.fna.gz', 'GCA_002505775.1_genomic.fna.gz', 'GCA_002502095.1_genomic.fna.gz', 'GCA_001628485.1_genomic.fna.gz', 'GCA_002716085.1_genomic.fna.gz', 'GCA_002494975.1_genomic.fna.gz', 'GCA_002499325.1_genomic.fna.gz', 'GCA_001629205.1_genomic.fna.gz', 'GCA_001629255.1_genomic.fna.gz', 'GCF_000493815.1_genomic.fna.gz', 'GCA_002457605.1_genomic.fna.gz', 'GCA_002730775.1_genomic.fna.gz', 'GCA_002713185.1_genomic.fna.gz', 'GCA_002719615.1_genomic.fna.gz', 'GCA_002457595.1_genomic.fna.gz', 'GCA_002712285.1_genomic.fna.gz', 'GCA_002503055.1_genomic.fna.gz', 'GCA_002501605.1_genomic.fna.gz', 'GCA_002498525.1_genomic.fna.gz', 'GCA_002495675.1_genomic.fna.gz', 'GCA_002172185.1_genomic.fna.gz', 'GCA_002506825.1_genomic.fna.gz', 'GCA_002923215.1_genomic.fna.gz', 'GCA_002726275.1_genomic.fna.gz', 'GCA_00250490

KeyError: ''

In [5]:
# load every signature in the matches file into a list for inspection
x = list(sourmash.load_signatures('tobg_np-110.sig.matches'))


In [7]:
# filename recorded on the first loaded signature
x[0].filename

'/home/ctbrown/gtdbtk/release89/fastani/database/GCA_002722735.1_genomic.fna.gz'

In [8]:
ls xxx.lca.json


xxx.lca.json


In [11]:
# NOTE(review): `db` is not assigned in any visible cell - presumably it was
# loaded from 'xxx.lca.json' in a cell that is not shown; confirm.
db

LCA_Database('xxx.lca.json')

In [12]:
# k-mer size reported by the database
db.ksize

31

In [13]:
# scaled value reported by the database
db.scaled

10000

In [14]:
# molecule type reported by the database
db.moltype

'DNA'