# dN dS pairwise

References:

* yn00 algorithm: https://academic.oup.com/mbe/article/17/1/32/975527
* yn00 within the PAML suite: https://academic.oup.com/mbe/article/24/8/1586/1103731
* Example in mouse vs. human genomes: https://www.nature.com/articles/nature01262
* yn00 discussion on the PAML Google forum: https://groups.google.com/forum/#!searchin/pamlsoftware/omega/pamlsoftware/RQhzqhhZEMM/eWQC3j-4o8kJ
* one genome (waterhouse signs): https://www.g3journal.org/content/ggg/9/3/625.full.pdf



In [None]:
import os, subprocess, sys,glob
from Bio import SeqIO,AlignIO, Phylo
from Bio.Phylo.PAML import codeml, yn00
import pandas as pd
from Bio.Align.Applications import MuscleCommandline

## Get Single Copy Orthologs

In [None]:
# Tab-separated gene-count table; the cells below use its 'Orthogroup' column
# and one count column per species.
Orthologs_table = pd.read_csv(
    "Orthogroups/Orthogroups.GeneCount.tsv",
    sep="\t",
)

In [None]:
# For each species pair keep the orthogroups in which both species have exactly
# one gene, and only the orthogroup name plus the two count columns.
Gbi_Lko_1to1 = Orthologs_table.loc[
    Orthologs_table['Gryllus_bimaculatus'].eq(1) & Orthologs_table['Laupala_kohalensis'].eq(1),
    ['Orthogroup', 'Gryllus_bimaculatus', 'Laupala_kohalensis'],
]
Gbi_Ame_1to1 = Orthologs_table.loc[
    Orthologs_table['Gryllus_bimaculatus'].eq(1) & Orthologs_table['Apis_mellifera'].eq(1),
    ['Orthogroup', 'Gryllus_bimaculatus', 'Apis_mellifera'],
]
Lko_Ame_1to1 = Orthologs_table.loc[
    Orthologs_table['Laupala_kohalensis'].eq(1) & Orthologs_table['Apis_mellifera'].eq(1),
    ['Orthogroup', 'Laupala_kohalensis', 'Apis_mellifera'],
]


In [None]:
# Number of single-copy orthologs per species pair, shown as (rows, columns).
print(
    "Gbi_Lko_1to1=", Gbi_Lko_1to1.shape, ";",
    "Gbi_Ame_1to1=", Gbi_Ame_1to1.shape, ";",
    "Lko_Ame_1to1=", Lko_Ame_1to1.shape,
)


Gbi_Lko_1to1= (5728, 3) ; Gbi_Ame_1to1= (5298, 3) ; Lko_Ame_1to1= (5009, 3)


## MUSCLE alignments

For each orthogroup (OG), get the two orthologous sequences and align them.

In [None]:
OGspath="Orthogroup_Sequences/"

def run_MUSCLE_pariwise_batch(OGlist, sp1, sp2):
    """Align the two orthologous sequences of each orthogroup with MUSCLE.

    For every orthogroup name in ``OGlist`` the file ``<OGspath>/<name>.fa``
    is read, the records whose id starts with ``sp1`` or ``sp2`` are copied to
    the scratch file ``.cacheseqstoalign``, and MUSCLE is run on that file.
    Each alignment is written to ``Muscle_out/OGs_<name>_<sp1>_<sp2>.aln``.

    Args:
        OGlist: iterable of orthogroup names (without the ``.fa`` extension).
        sp1: sequence-id prefix of the first species (e.g. "Gbi").
        sp2: sequence-id prefix of the second species (e.g. "Lko").

    Returns:
        None. The alignments are written to disk; a warning is printed for
        every orthogroup on which MUSCLE exits with a non-zero status.
    """
    # Make sure the output directory exists before MUSCLE is asked to write into it.
    os.makedirs("Muscle_out", exist_ok=True)

    for OGs in OGlist:
        print(OGs)

        # Copy the two sequences of interest into the scratch file. Both
        # handles are closed by the ``with`` block before MUSCLE reads the file.
        with open(os.path.join(OGspath, OGs+".fa")) as in_file, \
                open(".cacheseqstoalign", "w+") as out_file:
            for fasta in SeqIO.parse(in_file, 'fasta'):
                if fasta.id.startswith((sp1, sp2)):
                    print(fasta.id)
                    SeqIO.write(fasta, out_file, "fasta")

        # Prepare the MUSCLE command; the leading "~" is expanded by the shell.
        muscle_cline = MuscleCommandline("~/data_disk/Software/muscle3.8.31_i86linux64",input=".cacheseqstoalign", out="Muscle_out/OGs_%s_%s_%s.aln"%(OGs, sp1, sp2))
        print(muscle_cline)

        # Run MUSCLE and report failures instead of ignoring them; the batch
        # carries on with the next orthogroup either way.
        completed = subprocess.run(str(muscle_cline), shell=True)
        if completed.returncode != 0:
            print("Warning: MUSCLE exited with status %d for %s" % (completed.returncode, OGs))

#os.remove(".cacheseqstoalign")    

In [None]:
# Run MUSCLE on the 1-to-1 orthologs of each species pair, in this order:
# Gbi vs Lko, Gbi vs Ame, Lko vs Ame.
for one_to_one_table, first_prefix, second_prefix in (
    (Gbi_Lko_1to1, "Gbi", "Lko"),
    (Gbi_Ame_1to1, "Gbi", "Ame"),
    (Lko_Ame_1to1, "Lko", "Ame"),
):
    run_MUSCLE_pariwise_batch(one_to_one_table['Orthogroup'], first_prefix, second_prefix)

## PAL2NAL and dN/dS with yn00

In [None]:
def _write_matching_cds(records, geneid, handle):
    """Write the first record matching ``geneid`` to ``handle`` as FASTA.

    A record matches when ``geneid`` is a substring of its id or, failing
    that, of its description. Only the first match in ``records`` is written.

    Returns:
        True when a record was written, False when nothing matched.
    """
    for record in records:
        if geneid in record.id:
            SeqIO.write(record, handle, "fasta")
            return True
        if geneid in record.description:
            # For Ame the gene identifier (LOC) is in the description, so the
            # record is written under the gene id. The id is restored
            # afterwards because the parsed records are reused across lookups.
            original_id = record.id
            record.id = geneid
            SeqIO.write(record, handle, "fasta")
            record.id = original_id
            return True
    return False


def Pal2Nal_dNdS_Ynn00_batch(sp1,sp2):
    """Estimate omega, dN and dS with yn00 for every alignment of a species pair.

    Every file in ``Muscle_out`` whose name contains ".aln", ``sp1`` and
    ``sp2`` is processed as follows:

    1. the protein alignment is copied to ``.tempalign_File`` with each
       sequence renamed to its species prefix;
    2. the matching nucleotide sequences are looked up in the per-species
       FASTA files under ``mRNA_Seqs`` and written to ``.ntsfile``;
    3. pal2nal converts both into a codon alignment in ``pal2nal_outputs/``;
    4. yn00 is run on that codon alignment, writing to ``ynn00_output/``.

    Args:
        sp1: species prefix of the first species ("Ame", "Gbi" or "Lko").
        sp2: species prefix of the second species ("Ame", "Gbi" or "Lko").
            Alignment sequence ids are expected to look like
            ``<prefix>_<geneid>``.

    Returns:
        dict mapping each alignment file name to a tuple
        ``(omega, dN, dS)``, or ``(None, None, None)`` when pal2nal or yn00
        failed for that alignment.
    """
    alignments_dir="Muscle_out"
    nts_fastas_directory=os.path.join(os.getcwd(),"mRNA_Seqs")
    nts_path=".ntsfile"
    tempalign_path=".tempalign_File"
    failed=(None,None,None)
    Omega=dict()

    dictionary_spp_file= {  "Ame" : "Apis_mellifera_longest_CDS.fa",
                            "Gbi" : "Longest_Prot_per_gene_mrna_V2.fa",
                            "Lko" : "Lko_Longest_CDS_per_gene.fa"}

    # Nucleotide records per species, parsed once on first use instead of
    # re-reading the whole FASTA file for every alignment record.
    cds_records=dict()

    # Both tools write into these directories; create them if they are missing.
    for out_dir in ("pal2nal_outputs", "ynn00_output"):
        os.makedirs(out_dir, exist_ok=True)

    try:
        for alignment_file in os.listdir(alignments_dir):
            # Select the alignments containing the 2 selected species.
            if not (".aln" in alignment_file and sp1 in alignment_file and sp2 in alignment_file):
                continue
            print("Current alignment: ",alignment_file)
            OG_Name=os.path.basename(alignment_file)

            # .ntsfile receives the nucleotide sequences; .tempalign_File
            # receives the alignment with species prefixes as sequence names.
            with open(nts_path, "w+") as nts_File, open(tempalign_path, "w+") as tempalign_File:
                for alnrecord in AlignIO.read(os.path.join(alignments_dir,alignment_file), "fasta"):
                    id_parts=alnrecord.id.split("_",1)
                    spp=id_parts[0]
                    geneid=id_parts[1]

                    tempalign_File.write(">"+str(spp)+"\n"+str(alnrecord.seq)+"\n")

                    spp_fa_filname=dictionary_spp_file[spp]
                    if spp not in cds_records:
                        cds_records[spp]=list(SeqIO.parse(os.path.join(nts_fastas_directory,spp_fa_filname), "fasta"))

                    # Make sure a nucleotide sequence was found for each
                    # protein of the alignment; pal2nal needs both.
                    if not _write_matching_cds(cds_records[spp], geneid, nts_File):
                        print("Error: Nucelotide sequence %s NOT found for %s" %(geneid, spp))

            # Both scratch files are closed and complete here: run pal2nal.
            pal2nal_output="pal2nal_outputs/"+OG_Name+"pal2nal"
            paltonal_command="Software/pal2nal.v14/pal2nal.pl -nogap  -output paml %s %s > %s" \
                    % (tempalign_path, nts_path, pal2nal_output)
            subprocess.run(paltonal_command,shell=True)

            # pal2nal worked only if its output file exists and is not empty.
            if not (os.path.exists(pal2nal_output) and os.path.getsize(pal2nal_output) > 0):
                print("\n Pal2Nal failed, skipping OG: \n",OG_Name)
                Omega[OG_Name]=failed
                continue

            yn = yn00.Yn00(alignment = pal2nal_output,
                           out_file = "ynn00_output/"+OG_Name+"_Ynn00Result.txt",
                           working_dir = os.getcwd())
            yn.set_options(verbose=True)
            try:
                Yn00_results = yn.run()
                pair_result = Yn00_results[sp1][sp2]["YN00"]
                Omega[OG_Name]=pair_result["omega"],pair_result["dN"],pair_result["dS"]
            except Exception as err:
                # Best effort per orthogroup: record the failure and go on.
                # KeyboardInterrupt/SystemExit are deliberately not caught so
                # a long batch can still be interrupted.
                print("\n Ynn failed, skipping OG: \n",OG_Name)
                print(" Reason:", repr(err))
                Omega[OG_Name]=failed
    finally:
        # Remove the scratch files; they do not exist when no alignment matched.
        for scratch_path in (nts_path, tempalign_path):
            if os.path.exists(scratch_path):
                os.remove(scratch_path)

    return(Omega)

In [None]:
# yn00 results for every species pair. All three tables are needed: each one
# is written to its own file in the export cell below, so Omega_Gbi_Ame must
# be computed here as well (leaving it out raises a NameError on a fresh run).
Omega_Gbi_Ame= Pal2Nal_dNdS_Ynn00_batch("Gbi", "Ame")
Omega_Lko_Ame= Pal2Nal_dNdS_Ynn00_batch("Lko", "Ame")
Omega_Gbi_Lko= Pal2Nal_dNdS_Ynn00_batch("Gbi", "Lko")

Save the omega (dN/dS), dN and dS values of each species pair in tab-separated files (named `*.csv`).

In [None]:
import csv


def _write_omega_table(path, omega_by_og):
    """Write one table of yn00 results as tab-separated text.

    Args:
        path: output file name; the file is created or overwritten.
        omega_by_og: dict mapping a key (alignment name) to an iterable of
            values, here (omega, dN, dS). Values are written with ``str()``,
            so a failed orthogroup appears as "None" in every column.

    The header line is kept exactly as before (including the spaces around
    the tabs) so that existing readers of these files keep working.
    """
    with open(path,'w+') as f:
        f.write("key \t Omega \t dN \t dS \n")
        for key, values in omega_by_og.items():
            f.write(key+"\t"+'\t'.join([str(x) for x in values])+"\n")


_write_omega_table('Omega_Gbi_Lko_v2.csv', Omega_Gbi_Lko)
_write_omega_table('Omega_Gbi_Ame_v2.csv', Omega_Gbi_Ame)
_write_omega_table('Omega_Lko_Ame_v2.csv', Omega_Lko_Ame)
