## protein sequence identity across genome assemblies

* find all assemblies for species we need
* download protein fasta
* make blast db for all seqs
* blast a tempalte target protein e.g. pknh1 
* find best hits for each species?
* store each protein seq per species and get identities etc.

alternatives could be to use roary for clustering?

In [113]:
import os, glob, subprocess, gzip
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import Phylo, AlignIO
from pylab import plt
import pandas as pd
import numpy as np
from epitopepredict import sequtils, utilities
pd.set_option('display.width', 180)
pd.set_option('max_colwidth', 120)
from tools import *

In [188]:
species = ['africanum','h37rv','h37ra','beijing','caprae','microti','ulcerans','marinum','canettii','BCG','bovis','pinnipedii','cdc']
def get_species(x):
    for s in species:
        if x.lower().find(s) != -1:
            return s
    return 'mtb'

In [230]:
df=pd.read_csv('Genome_data.csv')
df2=pd.read_csv('mtb_assemblies.csv')
df2['species'] = df2['#Organism/Name'].apply(get_species)
#df2.to_csv('mtb_assemblies.csv',index=False)
#print (df2.columns)

#omit = ['PRJNA343736','PRJEB2138']
#df2=df2[~df2.BioProject.isin(omit)]
#print (df2[:5])

#include some extra mtb strains
include = ['']
df2=df2[(df2.species!='mtb')]
print (len(df2))
print (df2.groupby('species').agg({'BioSample':np.size}).sort_values(by='BioSample',ascending=False))

209
            BioSample
species              
bovis             107
africanum          31
marinum            25
canettii            9
h37rv               9
cdc                 7
ulcerans            7
caprae              3
h37ra               3
microti             3
pinnipedii          3
beijing             2


In [None]:
asm=df.merge(df2,left_on='BIOSAMPLE',right_on='BioSample')
print (asm)

## create protein fasta of all target assemblies

In [190]:
path='../myco_assemblies/'
new=open('myco_proteins.faa','w')
for i,r in list(df2.iterrows()):    
    try:
        filename = glob.glob(path+'{n}*'.format(n=r.Assembly))[0]    
    except:
        continue
    #print (r.Assembly, r.Strain, r.BioProject)
    with gzip.open(filename, "rt") as handle:
        seqs = list(SeqIO.parse(handle, "fasta"))
        #print (len(seqs))
        for s in seqs:
            s.description += '_'+r.Strain
        new=open('myco_proteins.faa','a')
        SeqIO.write(seqs,new,'fasta')

## make blast db 

In [191]:
cmd='makeblastdb -in myco_proteins.faa -dbtype prot -out myco_proteins'
temp=subprocess.check_output(cmd,shell=True)

In [234]:
def find_orthologs(query, label):
    """Find hits in a target db and save unique sequences"""
    
    bl = sequtils.blast_sequences('myco_proteins',[myseq],cpus=8,maxseqs=1000)
    #print (bl)
    bl=bl[(bl.pident>90) & (bl.qcovs>90)]
    #get unique
    bl=bl.drop_duplicates('sseq')
    print (len(bl))

    bl['strain'] = bl.stitle.apply(lambda x: x.split('_')[1])
    #print (bl)
    bl['species'] = bl.stitle.apply(get_species)
    cols=['sseqid','strain','species','pident','stitle']
    #print (bl[cols][:10])
    print ('found %s unique hits' %len(bl))
    print (bl.groupby('species').agg({'strain':np.size}))
    
    #save hits    
    seqs = [SeqRecord(Seq(r.sseq), id=r.sseqid+'_'+r.species, description=r.species) for i,r in bl.iterrows()]
    SeqIO.write(seqs,'%s_hits.faa' %label,'fasta')
    #align results
    #alncmd="/usr/bin/mafft --auto --clustalout --reorder pknh1_hits.faa > pknh1_hits.aln"
    alncmd="muscle -in {l}_hits.faa -out {l}_hits.aln".format(l=label)
    tmp=subprocess.check_output(alncmd, shell=True)
    
    aln = AlignIO.read('%s_hits.aln' %label,'fasta')
    dm,tree=get_tree(aln)
    Phylo.write([tree], '%s.newick' %label, 'newick')
    return bl


## blast a query

In [237]:
myseqs=list(SeqIO.parse('pknh_orthologs.fa','fasta'))
#myseqs=list(SeqIO.parse('tbd2.faa','fasta'))
myseq = myseqs[0]
print (myseq)

bl = find_orthologs(myseq, label="pknh1")


ID: pknh1_mtb
Name: pknh1_mtb
Description: pknh1_mtb
Number of features: 0
Seq('MSDAQDSRVGSMFGPYHLKRLLGRGGMGEVYEAEHTVKEWTVAVKLMTAEFSKD...NKE', SingleLetterAlphabet())
27
found 27 unique hits
           strain
species          
africanum       5
bovis          16
canettii        4
cdc             1
h37rv           1


In [203]:
#r=Phylo.parse('trees.xml', 'phyloxml')
#tree=r.next()
#f,tr=draw_tree(tree,root='AOZ42422.1_mtb', title='pknh1')


1