In [1]:
from Bio import SearchIO,SeqIO
import ete3
import os,re,requests,functools,json
from datetime import datetime as dt

ncbi=ete3.ncbi_taxonomy.NCBITaxa() ## taxonomy handle shared by the cells below (lineage and taxid-to-name lookups)
# ete3.NCBITaxa.update_taxonomy_database(ncbi) ## left disabled; uncomment to refresh the taxonomy database

In [2]:
## Download a translation of accessions to GIs and taxids
## takes ~2 hours
##
## For every blast output found under local_path this collects the unique hit accessions,
## asks NCBI esummary for their GI and taxid in chunks, and writes a tab-separated
## acc2gi_<nt|aa>.txt (accession, GI, taxid) next to the blast file.
## Files that already hold at least two lines are treated as done and left untouched.

local_path='/Users/evogytis/Dropbox/Jupyter_notebooks/Biohub/California_mosquitoes/s3_bucket/'

chunkSize=240 ## how many accessions are requested from NCBI per call
esummary_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=%s&id=%s'
entrez_db={'nt':'nuccore','aa':'protein'} ## Entrez database that resolves each blast type
gi_pattern=re.compile('<Item Name="Gi" Type="Integer">([0-9]+)</Item>') ## fetch GI numbers
taxid_pattern=re.compile('<Item Name="TaxId" Type="Integer">([0-9]+)</Item>') ## fetch taxids

for subdir, dirs, files in os.walk(local_path):
    sample=os.path.basename(subdir)
    print(sample)
    for fname in files:
        if 'blast' not in fname:
            continue

        if 'gsnap' in fname:
            blast_type='nt'
        elif 'rapsearch' in fname:
            blast_type='aa'
        else:
            continue ## unknown aligner - skip rather than reuse the blast_type of the previous file

        fpath=os.path.join(subdir, fname)

        done=[]
        for blast_entry in SearchIO.parse(fpath,format='blast-tab'): ## iterate over queries in file
            done+=blast_entry.hit_keys ## add all hits

        done_set=set(done) ## keep unique
        print(len(done),len(done_set))
        done=list(done_set)

        acc2gi_path=os.path.join(subdir,'acc2gi_%s.txt'%(blast_type))

        done_previously=[]
        if os.path.exists(acc2gi_path): ## on a first run there is nothing to read yet
            with open(acc2gi_path,'r') as previous:
                done_previously=list(previous) ## log as being known previously

        if len(done_previously)>=2:
            continue ## translation already present for this file

        rows=[]
        for chunk in range(0,len(done),chunkSize): ## split accessions into chunks
            chunk_accs=done[chunk:chunk+chunkSize] ## slicing already clips the final, shorter chunk
            request=esummary_url%(entrez_db[blast_type],','.join(chunk_accs))
            download=requests.post(request)

            if download.status_code!=200:
                raise Exception('Request failed, %s'%(request))

            GIs=gi_pattern.findall(download.text)
            TaxIDs=taxid_pattern.findall(download.text)

            ## rows are paired up by position, so a missing record would shift every
            ## following accession onto the wrong GI/taxid - refuse to write that
            if not (len(chunk_accs)==len(GIs)==len(TaxIDs)):
                raise Exception('Requested %d accessions but received %d GIs and %d taxids, %s'%(len(chunk_accs),len(GIs),len(TaxIDs),request))

            rows.extend(map('\t'.join,zip(chunk_accs,GIs,TaxIDs))) ## format

        ## only write once every chunk succeeded, so an interrupted run cannot leave
        ## a partial file that the next run mistakes for a complete one
        with open(acc2gi_path,'w') as out:
            out.write(''.join('%s\n'%(row) for row in rows)) ## output


CMS002_026d_Rb_S149_L004




2805 1493
1344 325
CMS002_045f_Rb_S189_L004
367 229
193 137
CMS002_050a_Rb_S5_L004
732 234
68 20
CMS002_016a_Rb_S121_L004
0 0
0 0
CMS002_0Water2_Rb_S139_L004
384 237
214 140
CMS002_045d_Rb_S186_L004
609 261
135 78
CMS002_049a_Rb_S4_L004
4182 863
424 257
CMS002_026a_Rb_S146_L004
557 287
251 109
CMS002_0Water4_Rb_S163_L004
568 329
310 195
CMS001_042_Ra_S23
25392 2674
112 81
CMS002_029c_Rb_S161_L004
21137 3542
179 131
CMS001_050_Ra_S23
28010 5846
1005 345
CMS001_048_Ra_S5
10305 1949
373 129
CMS002_010a_Rb_S119_L004
3521 1202
685 40
CMS001_058_Ra_S9
28557 3601
279 199
CMS001_009_Ra_S13
91676 32945
2633 999
CMS002_029b_Rb_S160_L004
20965 3273
142 98
CMS001_water5_RNA_A_S12
0 0
0 0
CMS001_014_Ra_S5
42942 10976
131 104
CMS001_022_Ra_S6
12671 2118
664 353
CMS002_028b_Rb_S155_L004
25041 3748
193 145
CMS002_017a_Rb_S122_L004
700 154
95 40
CMS001_039_Ra_S9
16742 1633
44 39
CMS001_015_Ra_S13
10015 1369
140 84
CMS002_027a_Rb_S152_L004
1555 659
347 32
CMS002_034a_Rb_S168_L004
13753 1818
110 75
CMS00

In [3]:
## takes ~15 minutes
##
## For every sample folder: load nt (gsnap) and aa (rapsearch) blast results, the
## accession-to-taxid translations and the contigs, then assign each contig with usable
## hits to the deepest taxon shared by all of its near-best hits.
## Result: rankCounts[sample][taxid] = number of contigs, plus rankCounts[sample]['no_hit'].

local_path='/Users/evogytis/Dropbox/Jupyter_notebooks/Biohub/California_mosquitoes/s3_bucket/'

cutoff=0.1 ## within 10% of top hit

rankCounts={}
for subdir, dirs, files in os.walk(local_path): ## iterate through samples
    sample=os.path.basename(subdir) ## sample name is folder name

    acc_to_gi={} ## accession to gi dict
    acc_to_taxid={} ## accession to taxid dict
    search={}
    contigs=None
    seqs=None ## reset per sample so contigs of a previously visited folder are never reused
    print('\n%s'%(sample))
    for fname in files: ## iterate through files in subfolders
        fpath=os.path.join(subdir, fname)
        if 'blast' in fname: ## only interested in blast outputs
            if 'gsnap' in fname:
                blast_type='nt'
            elif 'rapsearch' in fname:
                blast_type='aa'
            else:
                continue ## unknown aligner - skip rather than reuse the blast_type of the previous file

            blast_search=SearchIO.parse(fpath,format='blast-tab') ## read blast output
            search[blast_type]=SearchIO.to_dict(blast_search) ## turn to dict

            rankCounts[sample]={} ## data for this sample will be stored in a dict

            with open(subdir+'/acc2gi_%s.txt'%(blast_type),'r') as translations: ## load translations of accessions to GIs and taxids
                for line in translations:
                    acc,gi,taxid=line.strip('\n').split('\t')
                    acc_to_gi[acc]=gi
                    acc_to_taxid[acc]=int(taxid)

        elif 'fasta' in fname: ## fasta contains contigs
            contigs=SeqIO.parse(fpath,format='fasta') ## parse fasta contigs
            seqs=SeqIO.to_dict(contigs) ## convert to dict
            print('total number of contigs in sample: %s'%(len(seqs)))

    ## once done with sample, if there's nt and nr blast results, contigs are loaded
    ## (an empty fasta still counts as loaded) and they're not controls - proceed
    if len(search)==2 and seqs is not None and 'water' not in sample.lower():

        for searchType in search: ## iterate over nt or nr blast results
            if searchType=='aa':
                no_hits=len(seqs)-len(search[searchType]) ## no hits is (total number of contigs - blast hits)
                rankCounts[sample]['no_hit']=float(no_hits) ## keep track of how many times a contig didn't hit anything in nr
                print('number of contigs without blast hit: %d'%(no_hits))

            for contig in search[searchType]: ## iterate through contigs blasted
                bitscores={} ## worst HSP bitscore of every hit that passes the filters
                for hit in search[searchType][contig]: ## iterate over hits to contig
                    ## want to keep hits where best HSP is <1e-2 and longest match is >200 nt
                    if min([h.evalue for h in hit.hsps])<1e-2 and max([len(f) for f in hit.fragments])>200:
                        bitscores[hit.id]=min([k.bitscore for k in hit.hsps]) ## keep track of worst HSP bitscore

                if not bitscores:
                    continue ## no hit survived the filters - nothing to assign

                threshold=max(bitscores.values())*(1.0-cutoff) ## computed once per contig rather than once per hit
                filteredHits=[k for k in bitscores if threshold<=bitscores[k]] ## filter hits down to those whose bitscore within 10% of the best bitscore
                hitLineages=[ncbi.get_lineage(acc_to_taxid[k]) for k in filteredHits] ## get lineage of every hit
                hitLineages=[lin for lin in hitLineages if 7157 not in lin] ## filter out mosquitos

                if searchType=='aa': ## if nr
                    hitLineages=[lin for lin in hitLineages if 10239 in lin] ## keep viruses (nr)
                elif searchType=='nt': ## if nt
                    hitLineages=[lin for lin in hitLineages if 10239 not in lin] ## ignore viruses (nt)

                if hitLineages: ## hits exist
                    ranksPresent=functools.reduce(set.intersection,map(set,hitLineages)) ## reduce lineages seen down to those that are shared by all hits
                    rankIndices={r: max([lin.index(r) for lin in hitLineages if r in lin]) for r in ranksPresent} ## position of every shared taxid within the lineages
                    deepest=max(rankIndices.values())
                    highestRank=[k for k in rankIndices if rankIndices[k]==deepest][-1] ## shared taxid furthest from the root

                    rankCounts[sample][highestRank]=rankCounts[sample].get(highestRank,0)+1 ## start or increment count
        print('contigs included in treemap: %d'%(sum(rankCounts[sample].values())))




CMS002_026d_Rb_S149_L004
total number of contigs in sample: 596
number of contigs without blast hit: 4
contigs included in treemap: 466

CMS002_045f_Rb_S189_L004
total number of contigs in sample: 77
number of contigs without blast hit: 0
contigs included in treemap: 53

CMS002_050a_Rb_S5_L004
total number of contigs in sample: 173
number of contigs without blast hit: 2
contigs included in treemap: 18

CMS002_016a_Rb_S121_L004
total number of contigs in sample: 0
number of contigs without blast hit: 0
contigs included in treemap: 0

CMS002_0Water2_Rb_S139_L004
total number of contigs in sample: 82

CMS002_045d_Rb_S186_L004
total number of contigs in sample: 139
number of contigs without blast hit: 2
contigs included in treemap: 44

CMS002_049a_Rb_S4_L004
total number of contigs in sample: 1175
number of contigs without blast hit: 82
contigs included in treemap: 176

CMS002_026a_Rb_S146_L004
total number of contigs in sample: 123
number of contigs without blast hit: 1
contigs include

total number of contigs in sample: 11896
number of contigs without blast hit: 677
contigs included in treemap: 1210

CMS002_025a_Rb_S140_L004
total number of contigs in sample: 18
number of contigs without blast hit: 0
contigs included in treemap: 10

CMS001_059_Ra_S10
total number of contigs in sample: 1311
number of contigs without blast hit: 63
contigs included in treemap: 411

CMS002_044a_Rb_S178_L004
total number of contigs in sample: 272
number of contigs without blast hit: 9
contigs included in treemap: 44

CMS001_008_Ra_S3
total number of contigs in sample: 13396
number of contigs without blast hit: 1292
contigs included in treemap: 1354

CMS002_047b_Rb_S194_L004
total number of contigs in sample: 2522
number of contigs without blast hit: 84
contigs included in treemap: 148

CMS001_013_Ra_S5
total number of contigs in sample: 7071
number of contigs without blast hit: 313
contigs included in treemap: 372

CMS002_025d_Rb_S143_L004
total number of contigs in sample: 0
number of co

number of contigs without blast hit: 34
contigs included in treemap: 372

CMS001_033_Ra_S8
total number of contigs in sample: 6935
number of contigs without blast hit: 346
contigs included in treemap: 379

CMS002_057a_Rb_S10_L004
total number of contigs in sample: 623
number of contigs without blast hit: 24
contigs included in treemap: 87

CMS002_046b_Rb_S192_L004
total number of contigs in sample: 1119
number of contigs without blast hit: 40
contigs included in treemap: 275

CMS002_013a_Rb_S120_L004
total number of contigs in sample: 1716
number of contigs without blast hit: 85
contigs included in treemap: 545

CMS001_017_Ra_S6
total number of contigs in sample: 7223
number of contigs without blast hit: 361
contigs included in treemap: 380

CMS001_035_Ra_S20
total number of contigs in sample: 6126
number of contigs without blast hit: 178
contigs included in treemap: 584

CMS001_038_Ra_S22
total number of contigs in sample: 4575
number of contigs without blast hit: 197
contigs included

In [4]:
## Turn rankCounts (sample -> taxid -> contig count) into a nested taxonomy tree and dump it as JSON.
## Every node is a dict with 'taxid', 'taxonomy' (name), 'attrs' (per-sample '<sample>_count'
## values and their total under 'count'), a 'colour' and, where applicable, 'children'.

sanitise=re.compile(r'''[(),\[\];'"]''') ## used to remove crud from rank names that might interfere later (raw string: same character class, no invalid escapes)

domain_colours=((2,'indianred'),(2759,'skyblue'),(2157,'steelblue'),(10239,'slategrey')) ## Bacteria, Eukaryota, Archaea, Viruses - first match wins
default_colour='lightgrey' ## lineages outside the four domains above would otherwise inherit the colour of the previous rank

branch_hash={} ## will contain branches of taxonomy tree

links={} ## will link parents to children
no_hit={'taxonomy':'no blast hit','taxid':'no_hit','attrs':{}} ## the no-hit branch

for sample in rankCounts: ## iterate through samples
    ranks=rankCounts[sample] ## get the rank counts in sample
    lineages={r: ncbi.get_lineage(r) for r in ranks if r!='no_hit'} ## looked up once, reused for names and links
    taxidT=ncbi.get_taxid_translator(sum(lineages.values(),[])) ## get taxid translation

    if 'no_hit' in ranks:
        no_hit['attrs']['%s_count'%(sample)]=ranks['no_hit'] ## annotate no-hit branch with sample's no-hit count

    for rank in ranks: ## iterate over taxonomic ranks seen in sample
        if rank not in branch_hash: ## taxon not seen in previous loops
            branch_hash[rank]={'attrs':{}} ## create new branch
            if rank!='no_hit':
                branch_hash[rank]['taxonomy']=sanitise.sub('',taxidT[rank]) ## sanitise name
                branch_hash[rank]['taxid']=rank

        branch_hash[rank]['attrs']['%s_count'%(sample)]=ranks[rank] ## add contig count to branch

        lineage=lineages.get(rank) ## 'no_hit' is not a taxid and has no lineage
        if not lineage:
            continue

        colour=next((c for taxid,c in domain_colours if taxid in lineage),default_colour)

        tipward=lineage[::-1] ## starting from most recent, heading deeper
        for cur,par in zip(tipward,tipward[1:]): ## iterate over lineage - store parent-child links
            for node in (cur,par):
                if node not in branch_hash: ## taxid not seen before - create it
                    branch_hash[node]={'taxid':node,'taxonomy':sanitise.sub('',taxidT[node]),'attrs':{'count':0.0}} ## ancestors get the same name sanitising as counted ranks
            branch_hash[cur]['colour']=colour
            branch_hash[par]['colour']=colour
            if cur==131567:
                branch_hash[cur]['colour']='lightgrey'
            elif par==131567:
                branch_hash[par]['colour']='lightgrey'

            links[cur]=par ## remember link

json_tree={'children':[no_hit]} ## root of the tree has one child to begin with - the no-hit branch

no_hit['attrs']['count']=sum([no_hit['attrs'][w] for w in no_hit['attrs'] if '_count' in w]) ## sum no hits
for k in branch_hash: ## iterate through branches
    branch_hash[k]['attrs']['count']=sum([branch_hash[k]['attrs'][w] for w in branch_hash[k]['attrs'] if '_count' in w]) ## compute total counts from individual sample scores

for cur in links: ## iterate through parent-child links
    par=links[cur]
    child=branch_hash[cur] ## get child
    parent=branch_hash[par] ## get parent
    siblings=parent.setdefault('children',[]) ## children are stored in a list

    if child not in siblings: ## child is not yet present
        siblings.append(child) ## add child to parent
    if par==1 and parent not in json_tree['children']: ## at root of the tree
        json_tree['children'].append(parent) ## add root to tree

# print(json_tree)
with open('/Users/evogytis/Dropbox/Jupyter_notebooks/Biohub/California_mosquitoes/test_skeeters.json','w') as json_out:
    json.dump(json_tree,json_out,indent=1)

In [5]:
## can check for contig numbers of individual samples
with open('/Users/evogytis/Dropbox/Jupyter_notebooks/Biohub/California_mosquitoes/test_skeeters.json','r') as handle: ## close the file once parsed
    j=json.load(handle)

# sample='CMS002_028a_Rb_S154_L004_count'
## sample contains 5396 contigs (5378 >200 nt)

sample='CMS002_029e_Rb_S164_L004_count'
## 1908 contigs >200 nt

accumulate=0.0 ## running total that walkJ adds to; a `global` statement is only needed inside the function
def walkJ(j,sample):
    """Add the `sample` attribute of every node in tree `j` to the global `accumulate`.

    j      -- node dict; may carry an 'attrs' dict and a 'children' list of further nodes
    sample -- key looked up in each node's 'attrs' (e.g. '<sample name>_count')

    Nodes are visited parent first, then children left to right. Nothing is returned;
    the total is left in the module-level `accumulate`, which is not reset here.
    """
    global accumulate
    pending=[j] ## explicit stack instead of recursion
    while pending:
        node=pending.pop()
        if 'attrs' in node and sample in node['attrs']:
            accumulate+=node['attrs'][sample]
        if 'children' in node:
            pending.extend(reversed(node['children'])) ## reversed so the leftmost child is popped first

walkJ(j,sample) ## adds every value stored under `sample` in the tree's 'attrs' to the global `accumulate`
print(accumulate) ## total for `sample` across all nodes of the tree

108.0
