In [1]:
from Bio import SearchIO,SeqIO
import ete3
import os,functools,json

## ete3 NCBI taxonomy handle; the cells below call ncbi.get_lineage() on it to resolve taxid lineages
ncbi=ete3.ncbi_taxonomy.NCBITaxa()

In [2]:
acc_to_gi={} ## accession to gi dict
acc_to_taxid={} ## accession to taxid dict

## each line of acc2gi.txt is read as three tab-separated fields: accession, gi, taxid
with open('/Users/evogytis/Documents/skeeters/data/acc2gi.txt','r') as acc_file: ## context manager closes the handle once the table is read
    for line in acc_file:
        if not line.strip(): ## tolerate blank lines (e.g. a trailing empty line) instead of failing the 3-way unpack
            continue
        acc,gi,taxid=line.strip('\n').split('\t')
        acc_to_gi[acc]=gi
        acc_to_taxid[acc]=int(taxid) ## taxids are stored as ints, which is what ncbi.get_lineage() is given later

In [3]:
## Assign every contig of every sample to a treemap branch.
## For each sample folder under local_path the blast outputs (nt and aa) and the contig fasta are loaded,
## each contig's acceptable hits are reduced to the lineage they all share, and the contig is counted on
## the most recent rank of that lineage that is designated for the treemap (per-sample 'contig_count').
with open('/Users/evogytis/Documents/skeeters/treemap/displayed_taxa.json','r') as taxa_file: ## handle is closed as soon as the json is parsed
    J=json.load(taxa_file) ## load designated treemap branches
branches={b['taxid']:b for b in J} ## flat list of branches indexed by taxid
branches['no_hit']={'taxonomy':'no blast hit','taxid':'no_hit'} ## extra branch that collects contigs without a blast hit

local_path='/Users/evogytis/Dropbox/Jupyter_notebooks/Biohub/California_mosquitoes/s3_bucket/'

mosquito_taxid=7157 ## lineages passing through this taxid are discarded as mosquito
virus_taxid=10239 ## taxid used to separate viral lineages from everything else
cutoff=0.1 ## within 10% of top hit

sampleCount=1
for subdir, dirs, files in os.walk(local_path): ## iterate through samples
    sample=os.path.basename(subdir) ## sample name is folder name

    search={} ## blast results of this sample, keyed by 'nt' or 'aa'
    seqs=None ## contigs of this sample; reset here so a folder without a fasta can never reuse the previous sample's contigs
    print('\n%d %s'%(sampleCount,sample))
    for fname in files: ## iterate through files in subfolders
        fpath=os.path.join(subdir, fname)
        if 'blast' in fname: ## only interested in blast outputs
            if 'gsnap' in fname:
                blast_type='nt'
            elif 'rapsearch' in fname:
                blast_type='aa'
            else:
                continue ## unrecognised blast output - skip it rather than store it under a stale (or undefined) blast_type

            blast_search=SearchIO.parse(fpath,format='blast-tab') ## read blast output
            search[blast_type]=SearchIO.to_dict(blast_search) ## turn to dict

        elif 'fasta' in fname: ## fasta contains contigs
            contigs=SeqIO.parse(fpath,format='fasta') ## parse fasta contigs
            seqs=SeqIO.to_dict(contigs) ## convert to dict
            print('total number of contigs in sample: %s'%(len(seqs)))

    ## once done with sample, if there's nt and nr blast results, contigs are loaded and they're not controls - proceed
    ## (seqs is compared against None, not tested for truth, so a sample with an empty fasta is still processed)
    if len(search)==2 and seqs is not None and 'water' not in sample.lower():
        for searchType in search: ## iterate over nt or nr blast results
            no_hits=len(seqs)-len(search[searchType]) ## no hits is (total number of contigs - blast hits)

            if searchType=='aa':
                no_hit_attrs=branches['no_hit'].setdefault('attrs',{}) ## create attributes of the no-hit branch on first use
                no_hit_attrs[sample]={'contig_count':no_hits} ## assign sample contig_count
                print('number of contigs without blastx hit: %d'%(no_hits))

            for contig in search[searchType]: ## iterate through contigs blasted
                blast_hit={} ## keep a record of hits
                bitscores={} ## keep a record of bitscores
                percent_id={} ## keep a record of percent identities
                for hit in search[searchType][contig]: ## iterate over hits to contig
                    hit_id=hit.id

                    ## want to keep hits where best HSP is <1e-2 and longest match is >200 nt
                    if min(h.evalue for h in hit.hsps)<1e-2 and max(len(f) for f in hit.fragments)>200:
                        blast_hit[hit_id]=hit
                        bitscores[hit_id]=min(k.bitscore for k in hit.hsps) ## keep track of worst HSP bitscore
                        percent_id[hit_id]=min(k.ident_pct for k in hit.hsps) ## keep track of worst HSP percent identity

                if not blast_hit: ## no hit passed the filter, nothing to place on the treemap for this contig
                    continue

                best_bitscore=max(bitscores.values()) ## computed once per contig instead of once per hit
                filteredHits=[k for k in blast_hit if best_bitscore*(1.0-cutoff)<=bitscores[k]] ## filter hits down to those whose bitscore within 10% of the best bitscore
                hitLineages=[ncbi.get_lineage(acc_to_taxid[k]) for k in filteredHits] ## get lineage of every hit
                hitLineages=[lin for lin in hitLineages if mosquito_taxid not in lin] ## filter out lineages with mosquito in its path

                if searchType=='aa': ## if nr
                    hitLineages=[lin for lin in hitLineages if virus_taxid in lin] ## keep viruses (nr)
                elif searchType=='nt': ## if nt
                    hitLineages=[lin for lin in hitLineages if virus_taxid not in lin] ## ignore viruses (nt)

                if len(hitLineages)>0: ## hits exist
                    ranksPresent=functools.reduce(set.intersection,map(set,hitLineages)) ## reduce lineages seen down to those that are shared by all hits

                    lineages=[ncbi.get_lineage(lin) for lin in ranksPresent] ## get full lineage of each taxonomic rank that's shared across all hits
                    longest=max(map(len,lineages)) ## computed once instead of once per lineage
                    mrca_path=[lin for lin in lineages if len(lin)==longest][-1] ## longest lineage path is the most recent common ancestor

                    for rank in mrca_path[::-1]: ## invert ancestor path, so ranks iterated from most recent to oldest
                        if rank in branches: ## rank is present amongst those designated for treemap
                            branch=branches[rank] ## fetch branch from treemap

                            ## create branch attrs / sample entry on first sight, then add to contig_count
                            sample_attrs=branch.setdefault('attrs',{}).setdefault(sample,{})
                            sample_attrs['contig_count']=sample_attrs.get('contig_count',0)+1
                            break ## no need to iterate further
    sampleCount+=1


1 

2 CMS002_026d_Rb_S149_L004
total number of contigs in sample: 596




number of contigs without blastx hit: 4

3 CMS002_045f_Rb_S189_L004
total number of contigs in sample: 77
number of contigs without blastx hit: 0

4 CMS002_050a_Rb_S5_L004
total number of contigs in sample: 173
number of contigs without blastx hit: 2

5 CMS002_016a_Rb_S121_L004
total number of contigs in sample: 0
number of contigs without blastx hit: 0

6 CMS002_0Water2_Rb_S139_L004
total number of contigs in sample: 82

7 CMS002_045d_Rb_S186_L004
total number of contigs in sample: 139
number of contigs without blastx hit: 2

8 CMS002_049a_Rb_S4_L004
total number of contigs in sample: 1175
number of contigs without blastx hit: 82

9 CMS002_026a_Rb_S146_L004
total number of contigs in sample: 123
number of contigs without blastx hit: 1

10 CMS002_0Water4_Rb_S163_L004
total number of contigs in sample: 121

11 CMS001_042_Ra_S23
total number of contigs in sample: 6697
number of contigs without blastx hit: 323

12 CMS002_029c_Rb_S161_L004
total number of contigs in sample: 5188
number of 

total number of contigs in sample: 12284
number of contigs without blastx hit: 607

83 CMS001_053_Ra_S8
total number of contigs in sample: 2587
number of contigs without blastx hit: 90

84 CMS002_020d_Rb_S134_L004
total number of contigs in sample: 10292
number of contigs without blastx hit: 832

85 CMS001_030_Ra_S7
total number of contigs in sample: 1004
number of contigs without blastx hit: 49

86 CMS001_016_Ra_S6
total number of contigs in sample: 21018
number of contigs without blastx hit: 2437

87 CMS002_047d_Rb_S196_L004
total number of contigs in sample: 7764
number of contigs without blastx hit: 369

88 CMS001_043_Ra_S24
total number of contigs in sample: 5072
number of contigs without blastx hit: 278

89 CMS001_water3_Qiagen_S26
total number of contigs in sample: 6169

90 CMS001_020_Ra_S15
total number of contigs in sample: 6417
number of contigs without blastx hit: 291

91 CMS002_047i_Rb_S2_L004
total number of contigs in sample: 5774
number of contigs without blastx hit: 246

total number of contigs in sample: 6400
number of contigs without blastx hit: 250

161 CMS001_water4_Zymo_S27
total number of contigs in sample: 449


In [4]:
## Nest the flat list of treemap branches into a tree and write it out as json.
## Every branch first gets a total 'contig_count'; each taxon is then attached to its closest
## ancestor that is itself a treemap branch (root taxa hang directly off taxa_tree).
taxa_tree={'children':[branches['no_hit']]} ## tree structure (+ add no-hit branch from the beginning)

J=sorted(J,key=lambda k: len(ncbi.get_lineage(k['taxid']))) ## shortest lineages first, i.e. ancestors are visited before their descendants

for b in branches: ## iterate through flat list of branches
    if 'attrs' in branches[b]: ## branch has been annotated before
        attrs=branches[b]['attrs']
        ## compute sum of sample contig counts; only keys containing 'CMS' are treated as samples,
        ## which also leaves out the 'contig_count' total itself if this cell is run again
        attrs['contig_count']=sum(attrs[c]['contig_count'] for c in attrs if 'CMS' in c)
    else:
        branches[b]['attrs']={'contig_count':0} ## nothing landed on the branch

for taxon in J: ## iterate over every taxon in treemap
    lineage=ncbi.get_lineage(taxon['taxid']) ## get its lineage

    if len(lineage)==1 and taxon not in taxa_tree['children']: ## if root
        taxa_tree['children'].append(taxon) ## add root to tree

    for lin in lineage[::-1][1:]: ## iterate through lineage, starting from most recent, ignore first entry (self)

        if lin in branches: ## rank present amongst branches
            parent=branches[lin] ## grab parent

            if 'children' not in parent: ## if parent doesn't have children yet - add the attribute
                parent['children']=[]

            if taxon not in parent['children']: ## branch wasn't assigned to its parent yet
                parent['children'].append(taxon) ## add child to parent

            break ## closest designated ancestor found, older ranks are not considered

with open('/Users/evogytis/Documents/skeeters/treemap/skeeters.json','w') as tree_file: ## context manager flushes and closes the file once the dump is complete
    json.dump(taxa_tree,tree_file,indent=1) ## write json out to repo

In [None]:
## then serve the treemap locally from the folder where treemap.html is located, using one of:
## python3 -m http.server 4000 (Python 3)
## python -m SimpleHTTPServer 4000 (Python 2)