# Generate Bidirectional best hits for two species of *Yersinia pestis*

In [2]:
!ls *.faa

GCA_000009065.1_ASM906v1_protein.faa GCF_000754785.1_YAU_protein.faa      Y_pestis_Yp4003.faa
GCF_000009065.1_ASM906v1_protein.faa Y_pestis_CO92.faa


In [3]:
%%bash
ORG1=Y_pestis_Yp4003
ORG2=Y_pestis_CO92
for i in `seq 1 4`
do
    exonerate --query $ORG1.faa --target $ORG2.faa --querychunkid $i --querychunktotal 4 --bestn 1 --ryo "%qi\t%ti\t%ps\n" --showvulgar no --showalignment no > $ORG1.to.$ORG2.$i.tsv &
done
for i in `seq 1 4`
do
    exonerate --target $ORG1.faa --query $ORG2.faa --querychunkid $i --querychunktotal 4 --bestn 1 --ryo "%qi\t%ti\t%ps\n" --showvulgar no --showalignment no > $ORG2.to.$ORG1.$i.tsv &
done

##  Concatenate output chunks into one output file for each mapping

In [1]:
%%bash
ORG1=Y_pestis_Yp4003
ORG2=Y_pestis_CO92
LOCUS=YP
echo "$ORG1	$ORG2	Similarity_$ORG1.to.$ORG2" > $ORG1.to.$ORG2.tab
echo "$ORG2	$ORG1	Similarity_$ORG2.to.$ORG1" > $ORG2.to.$ORG1.tab
for i in `seq 1 4`
do
    grep $LOCUS $ORG1.to.$ORG2.$i.tsv >> $ORG1.to.$ORG2.tab
    grep $LOCUS $ORG2.to.$ORG1.$i.tsv >> $ORG2.to.$ORG1.tab
done

grep: Y_pestis_Yp4003.to.Y_pestis_CO92.1.tsv: No such file or directory
grep: Y_pestis_CO92.to.Y_pestis_Yp4003.1.tsv: No such file or directory
grep: Y_pestis_Yp4003.to.Y_pestis_CO92.2.tsv: No such file or directory
grep: Y_pestis_CO92.to.Y_pestis_Yp4003.2.tsv: No such file or directory
grep: Y_pestis_Yp4003.to.Y_pestis_CO92.3.tsv: No such file or directory
grep: Y_pestis_CO92.to.Y_pestis_Yp4003.3.tsv: No such file or directory
grep: Y_pestis_Yp4003.to.Y_pestis_CO92.4.tsv: No such file or directory
grep: Y_pestis_CO92.to.Y_pestis_Yp4003.4.tsv: No such file or directory


## Read the mappings in to a pandas dataframe

In [6]:
import pandas as pd
org1 = 'Y_pestis_Yp4003'
org2 = 'Y_pestis_CO92'
org1_to_org2 = pd.read_table('{}.to.{}.tab'.format(org1, org2))
org2_to_org1 = pd.read_table('{}.to.{}.tab'.format(org2,org1))
display(org1_to_org2.sort_values(by='Similarity_{}.to.{}'.format(org1,org2),ascending=False).head(10))
display(org2_to_org1.sort_values(by='Similarity_{}.to.{}'.format(org2,org1),ascending=False).head(10))

Unnamed: 0,Y_pestis_Yp4003,Y_pestis_CO92,Similarity_Y_pestis_Yp4003.to.Y_pestis_CO92
0,Yp4003_00001,gnl|YPES214092|GKDD-457-MONOMER,100.0
2672,Yp4003_02910,gnl|YPES214092|GKDD-364-MONOMER,100.0
2659,Yp4003_02896,gnl|YPES214092|GKDD-350-MONOMER,100.0
2660,Yp4003_02897,gnl|YPES214092|GKDD-351-MONOMER,100.0
2661,Yp4003_02898,gnl|YPES214092|GKDD-352-MONOMER,100.0
2662,Yp4003_02899,gnl|YPES214092|GKDD-353-MONOMER,100.0
2663,Yp4003_02901,gnl|YPES214092|GKDD-355-MONOMER,100.0
2664,Yp4003_02902,gnl|YPES214092|GKDD-356-MONOMER,100.0
2665,Yp4003_02903,gnl|YPES214092|GKDD-357-MONOMER,100.0
2666,Yp4003_02904,gnl|YPES214092|GKDD-358-MONOMER,100.0


Unnamed: 0,Y_pestis_CO92,Y_pestis_Yp4003,Similarity_Y_pestis_CO92.to.Y_pestis_Yp4003
0,gnl|YPES214092|GKDD-100-MONOMER,Yp4003_03538,100.0
1921,gnl|YPES214092|GKDD-798-MONOMER,Yp4003_00111,100.0
2571,gnl|YPES214092|GKDD-3242-MONOMER,Yp4003_03578,100.0
2572,gnl|YPES214092|GKDD-3243-MONOMER,Yp4003_03579,100.0
2573,gnl|YPES214092|GKDD-325-MONOMER,Yp4003_01470,100.0
2574,gnl|YPES214092|GKDD-3252-MONOMER,Yp4003_03588,100.0
2575,gnl|YPES214092|GKDD-3253-MONOMER,Yp4003_03589,100.0
2578,gnl|YPES214092|GKDD-3263-MONOMER,Yp4003_00876,100.0
2579,gnl|YPES214092|GKDD-3266-MONOMER,Yp4003_00879,100.0
2580,gnl|YPES214092|GKDD-327-MONOMER,Yp4003_01468,100.0


## Take the intersection of mappings to get bidirection best hits.

In [7]:
from IPython.display import display, HTML
org1 = 'Y_pestis_Yp4003'
org2 = 'Y_pestis_CO92'
bbh  = org1_to_org2.merge(org2_to_org1, 
                 on=[org1, org2], 
                 how='inner',
                 #suffixes=('_albicans2auris','_auris2albicans'),
                 sort=True).sort_values(by='Similarity_{}.to.{}'.format(org1,org2),ascending=False)
display(HTML(bbh.to_html()))
bbh.to_csv('{}_and_{}_BBH.tab'.format(org1,org2), index=False, sep='\t')


Unnamed: 0,Y_pestis_Yp4003,Y_pestis_CO92,Similarity_Y_pestis_Yp4003.to.Y_pestis_CO92,Similarity_Y_pestis_CO92.to.Y_pestis_Yp4003
0,Yp4003_00001,gnl|YPES214092|GKDD-457-MONOMER,100.0,100.0
2555,Yp4003_02960,gnl|YPES214092|GKDD-1064-MONOMER,100.0,100.0
2543,Yp4003_02942,gnl|YPES214092|GKDD-1276-MONOMER,100.0,100.0
2544,Yp4003_02945,gnl|YPES214092|GKDD-1275-MONOMER,100.0,100.0
2545,Yp4003_02946,gnl|YPES214092|GKDD-1274-MONOMER,100.0,100.0
2546,Yp4003_02947,gnl|YPES214092|GKDD-1273-MONOMER,100.0,100.0
2547,Yp4003_02949,gnl|YPES214092|GKDD-1074-MONOMER,100.0,100.0
2548,Yp4003_02951,gnl|YPES214092|GKDD-1073-MONOMER,100.0,100.0
2549,Yp4003_02952,gnl|YPES214092|GKDD-1072-MONOMER,100.0,100.0
2550,Yp4003_02953,gnl|YPES214092|GKDD-1071-MONOMER,100.0,100.0
