In [None]:
import glob
import os
import shutil
import sys

import numpy
import pandas as pd

The goal of this notebook is to collate the desired sequences for each clade into a directory named after that clade. It defines functions for collecting exons, transcripts, and gene sequences. Welcome to the roundup! &#x1F920;

In [None]:
# Locate the gene table; its 'Clade' column is what later cells use to group genes.
base = '/global/scratch/users/chandlersutherland/e14/NLRCladeFinder/Atha_NLRome/'
gene_table = "{}{}".format(base, "Atha_NLRome_GeneTable.txt")
# The table is tab-delimited.
gene = pd.read_csv(gene_table, delimiter='\t')
gene

In [None]:
# Map every clade name to the list of genes that the gene table assigns to it.
clades = numpy.unique(gene['Clade'])
dictionary = {}
for clade in clades:
    print(clade)
    # rows of the gene table belonging to this clade
    in_this_clade = gene['Clade'] == clade
    genes = list(gene.loc[in_this_clade, 'Gene'])
    dictionary[clade] = genes

In [None]:
#Define CDS roundup, which takes the items of this dictionary and writes a fasta file with all exon sequences to the clade folder
def CDS_roundup(clade, gene_list,
                nlrome_fasta="/global/scratch/users/chandlersutherland/e14/hvNLR/ArabidopsisRENSEQ/CDS/NLRome.CDS.fasta",
                col0_fasta="/global/scratch/users/chandlersutherland/e14/popgen/Col0_Exons.fa",
                output_root="/global/scratch/users/chandlersutherland/e14/popgen/clades/"):
    """Collect the exon records of one clade's genes into <clade>/<clade>.CDS.fa.

    Every line of ``nlrome_fasta`` that contains a cleaned gene name is copied
    to the output together with the line that follows it. ``col0_fasta`` is
    scanned the same way for genes whose name starts with 'ATHALIANA', using
    the name with the 'ATHALIANA_' prefix removed.

    Parameters
    ----------
    clade : str
        Clade name; used as the output folder name and the output file prefix.
    gene_list : list of str
        Gene names as they appear in the gene table.
    nlrome_fasta, col0_fasta : str, optional
        Source FASTA files. Each matching line is assumed to be a header that
        is followed by one sequence line.
    output_root : str, optional
        Directory under which the clade folder is created.

    Returns
    -------
    None. A summary of found/missing genes is printed.
    """
    #count is the number of records copied; found collects the gene names that matched
    count=0
    found=[]

    #initialize the output directory (no error if it already exists)
    output_dir=os.path.join(output_root, clade)
    os.makedirs(output_dir, exist_ok=True)
    output_path=os.path.join(output_dir, clade+'.CDS.fa')

    #convert the gene-table names to the naming used in the FASTA files
    clean_list=[name.replace('_1', '.1').replace('_2', '.2').replace('_3', '.3') for name in gene_list]

    #(text to search for, name to report) pairs for each source file
    nlrome_targets=[(name, name) for name in clean_list]
    #special treatment for Col-0 because of the full gene names
    col0_targets=[(name.replace('ATHALIANA_', ''), name) for name in clean_list if name.startswith('ATHALIANA')]

    #the with-block guarantees the output file is closed even if a source file cannot be read
    with open(output_path, "w") as g:
        for fasta_path, targets in ((nlrome_fasta, nlrome_targets), (col0_fasta, col0_targets)):
            with open(fasta_path, 'r') as text_file:
                lines=text_file.readlines()
            #enumerate gives the real position of each line; looking the line up by value
            #would always return the first of several identical headers
            for position, line in enumerate(lines):
                for needle, name in targets:
                    if needle in line:
                        count=count+1
                        found.append(name)
                        g.write(line)
                        #a match on the very last line has no following sequence line
                        if position+1 < len(lines):
                            g.write(lines[position+1])

    #sanity check. The found genes should be the same as the input genes.
    print('found '+str(count)+' transcripts')
    missing=sorted(set(clean_list)-set(found))
    if not missing:
        print('found all genes supplied')
    else:
        print('error, some genes missing')
        print(str(missing)+' not found')

    print('CDS file written to '+output_path)

In [None]:
# Write the exon FASTA for every clade in the gene-table dictionary.
for k in dictionary:
    v = dictionary[k]
    CDS_roundup(k, v)

In [None]:
#Define Gene roundup, which takes the items of this dictionary and writes a fasta file with all Gene sequences to the clade folder
def Gene_roundup(clade, gene_list,
                 nlrome_fasta="/global/scratch/users/chandlersutherland/e14/hvNLR/ArabidopsisRENSEQ/GeneSequences/NLRome_Gene_Sequences.respectify.fa",
                 col0_fasta="/global/scratch/users/chandlersutherland/e14/popgen/Col0_GeneSequences.fa",
                 output_root="/global/scratch/users/chandlersutherland/e14/popgen/clades/"):
    """Collect the gene-sequence records of one clade into <clade>/<clade>.GeneSequence_test.fa.

    Every line of ``nlrome_fasta`` that contains a cleaned gene name is copied
    to the output together with the line that follows it. ``col0_fasta`` is
    scanned the same way for genes whose cleaned name starts with 'ATHALIANA',
    using the name with 'ATHALIANA_', '_1' and '.1' removed.

    Parameters
    ----------
    clade : str
        Clade name; used as the output folder name and the output file prefix.
    gene_list : list of str
        Gene names as they appear in the gene table.
    nlrome_fasta, col0_fasta : str, optional
        Source FASTA files. Each matching line is assumed to be a header that
        is followed by one sequence line.
    output_root : str, optional
        Directory under which the clade folder is created.

    Returns
    -------
    None. A summary of found/missing genes is printed.
    """
    #count is the number of records copied; found collects the gene names that matched
    count=0
    found=[]

    #initialize the output directory (no error if it already exists)
    output_dir=os.path.join(output_root, clade)
    os.makedirs(output_dir, exist_ok=True)
    output_path=os.path.join(output_dir, clade+'.GeneSequence_test.fa')

    #convert the gene-table names to the naming used in the gene sequence FASTA
    clean_list=[name.replace('_T', '_G').replace('-R1', '').replace('_1', '.1').replace('_2', '.2').replace('_3', '.3')
                for name in gene_list]

    #(text to search for, name to report) pairs for each source file
    nlrome_targets=[(name, name) for name in clean_list]
    #special treatment for Col-0 because of the full gene names
    col0_targets=[(name.replace('ATHALIANA_', '').replace('_1', '').replace('.1', ''), name)
                  for name in clean_list if name.startswith('ATHALIANA')]

    #the with-block guarantees the output file is closed even if a source file cannot be read
    with open(output_path, "w") as g:
        for fasta_path, targets in ((nlrome_fasta, nlrome_targets), (col0_fasta, col0_targets)):
            with open(fasta_path, 'r') as text_file:
                lines=text_file.readlines()
            #enumerate gives the real position of each line; looking the line up by value
            #would always return the first of several identical headers
            for position, line in enumerate(lines):
                for needle, name in targets:
                    if needle in line:
                        count=count+1
                        found.append(name)
                        g.write(line)
                        #a match on the very last line has no following sequence line
                        if position+1 < len(lines):
                            g.write(lines[position+1])

    #sanity check. The found genes should be the same as the input genes.
    print('found '+str(count)+' genes')
    missing=sorted(set(clean_list)-set(found))
    if not missing:
        print('found all genes supplied')
    else:
        print('error, some genes missing')
        print(str(missing)+' not found')

    #report the path that was actually written
    print('Gene file written to '+output_path)

In [None]:
# Write the gene-sequence FASTA for every clade in the gene-table dictionary.
for k in dictionary:
    v = dictionary[k]
    Gene_roundup(k, v)

In [None]:
# Sanity check: the number of distinct clades should equal the number of clade directories.
# This is not the cell's last expression, so the notebook does not display the value.
len(numpy.unique(gene['Clade']))
# Inspect the members of one clade, in sorted order.
genes = sorted(gene.loc[gene['Clade'] == 'Int10637_304_324_R_203', 'Gene'])
genes

Change of input strategy: take the gene names directly from each clade's alignment file and use them to build the FASTA, so that the names in the two files are exactly the same.

In [None]:
#generate a dictionary with Clade names as the keys and the gene names found in that clade's alignment file as the values
dictionary2=dict()
#after the cast to str, missing File entries become the string 'nan'
gene=gene.astype({'File':'str'})
files=numpy.unique(gene['File'])

#drop the placeholder left by rows that have no alignment file
cleanedfiles = [x for x in files if str(x) != 'nan']

for file in cleanedfiles:
    genes=[]
    with open("/global/scratch/users/chandlersutherland/e14/NLRCladeFinder/Atha_NLRome/"+file,'r') as text_file:
        for line in text_file:
            if line.startswith('>'):
                #strip() removes the trailing space/newline whether or not the header ends in a space,
                #so a newline can never be left inside a gene name
                genes.append(line.replace('>', '').strip())

    #the clade name is the alignment file name without its .afa extension
    clade=file.split('/')[-1].replace('.afa', '')
    dictionary2[clade]=genes

dictionary2

In [None]:
#get the Col0 genes so I can pull sequences from phytozome
all_genes=list(dictionary2.values())
flat_list = [item for sublist in all_genes for item in sublist]
#the loop variable is deliberately not called `gene`: that name holds the gene table DataFrame
for gene_name in flat_list:
    if gene_name.startswith("ATHALIANA"):
        col0=gene_name.replace("ATHALIANA_", '')
        #remove only a literal '_1' suffix; str.strip("_1") strips any run of '_' and '1'
        #characters and would truncate gene IDs that themselves end in 1
        if col0.endswith("_1"):
            col0=col0[:-2]
        print(col0)

In [None]:
#Gene roundup sandbox to get the same name as in the alignment file
def Gene_roundup_test(clade, gene_list,
                      nlrome_fasta="/global/scratch/users/chandlersutherland/e14/hvNLR/ArabidopsisRENSEQ/GeneSequences/NLRome_Gene_Sequences.respectify.fa",
                      col0_fasta="/global/scratch/users/chandlersutherland/e14/popgen/Col0_GeneSequences.fa",
                      output_root="/global/scratch/users/chandlersutherland/e14/popgen/clades/"):
    """Write gene sequences for one clade, keeping the gene names exactly as supplied.

    Each gene name is converted to the naming used in ``nlrome_fasta`` for
    searching only; for every line containing the converted name, a header
    made of the ORIGINAL name is written followed by the line after the match.
    ``col0_fasta`` is scanned the same way for genes whose name starts with
    'ATHALIANA', using the name with 'ATHALIANA_', '_1' and '.1' removed.
    Output goes to <clade>/<clade>.GeneSequence_test.fa.

    Parameters
    ----------
    clade : str
        Clade name; used as the output folder name and the output file prefix.
    gene_list : list of str
        Gene names to look up; they are used verbatim as output headers.
    nlrome_fasta, col0_fasta : str, optional
        Source FASTA files. Each matching line is assumed to be a header that
        is followed by one sequence line.
    output_root : str, optional
        Directory under which the clade folder is created.

    Returns
    -------
    None. A summary of found/missing genes is printed.
    """
    #count is the number of records written; found collects the gene names that matched
    count=0
    found=[]

    #initialize the output directory (no error if it already exists)
    output_dir=os.path.join(output_root, clade)
    os.makedirs(output_dir, exist_ok=True)
    output_path=os.path.join(output_dir, clade+'.GeneSequence_test.fa')

    #(text to search for, original name) pairs, computed once instead of once per line
    nlrome_targets=[(name.replace('_T', '_G').replace('-R1', '').replace('_1', '.1').replace('_2', '.2').replace('_3', '.3'), name)
                    for name in gene_list]
    #special treatment for Col-0 because of the full gene names
    col0_targets=[(name.replace('ATHALIANA_', '').replace('_1', '').replace('.1', ''), name)
                  for name in gene_list if name.startswith('ATHALIANA')]

    #the with-block guarantees the output file is closed even if a source file cannot be read
    with open(output_path, "w") as g:
        for fasta_path, targets in ((nlrome_fasta, nlrome_targets), (col0_fasta, col0_targets)):
            with open(fasta_path, 'r') as text_file:
                lines=text_file.readlines()
            #enumerate gives the real position of each line; looking the line up by value
            #would always return the first of several identical headers
            for position, line in enumerate(lines):
                for needle, name in targets:
                    if needle in line:
                        count=count+1
                        found.append(name)
                        g.write('>'+name+'\n')
                        #a match on the very last line has no following sequence line
                        if position+1 < len(lines):
                            g.write(lines[position+1])

    #sanity check. The found genes should be the same as the input genes.
    print('found '+str(count)+' genes')
    missing=sorted(set(gene_list)-set(found))
    if not missing:
        print('found all genes supplied')
    else:
        print('error, some genes missing')
        print(str(missing)+' not found')

    #report the path that was actually written
    print('Gene file written to '+output_path)

In [None]:
# Trial run of the sandbox function on a single clade.
# NOTE(review): `clade` is not assigned in this cell; it is whatever value an earlier cell
# left in the kernel namespace, so the clade processed here depends on execution order.
Gene_roundup_test(clade, dictionary2[clade])

Strategy change, part 2: concatenate each gene's CDS exons into a single sequence, to try not to break pal2nal.

In [None]:
#input: a clade name, and the list of genes found in the alignment in Daniil's format
#output: a fasta file written, with Daniil gene IDs. Exons are concatenated to form the transcript sequence.
#update: for col0, use tair10 transcripts
def transcript_roundup(clade, gene_list,
                       col0_cds_fasta="/global/scratch/users/chandlersutherland/e14/popgen/Col0_CDS_tair10.fa",
                       nlrome_cds_fasta="/global/scratch/users/chandlersutherland/e14/hvNLR/ArabidopsisRENSEQ/CDS/NLRome.CDS.fasta",
                       output_root="/global/scratch/users/chandlersutherland/e14/popgen/clades/"):
    """Write one concatenated transcript per gene to <clade>/<clade>.transcript.fa.

    For each gene, every line of the relevant source FASTA that contains the
    converted gene name is treated as an exon header; the lines following those
    headers are joined, in file order, into a single sequence written under the
    gene's original name. Genes starting with 'ATHALIANA' are looked up in
    ``col0_cds_fasta`` (primary transcript, '.1'); all others in
    ``nlrome_cds_fasta``.

    A gene with no matching exon is reported as missing and no record is
    written for it, so the output never contains empty sequences.

    Parameters
    ----------
    clade : str
        Clade name; used as the output folder name and the output file prefix.
    gene_list : list of str
        Gene names as they appear in the alignment file.
    col0_cds_fasta, nlrome_cds_fasta : str, optional
        Source FASTA files. Each exon is assumed to be a header line followed
        by one sequence line.
    output_root : str, optional
        Directory under which the clade folder is created.

    Returns
    -------
    None. The per-gene exon counts and any missing genes are printed.
    """
    #exons records [gene, number of exons joined]; found lists genes that had at least one exon
    exons=[]
    found=[]

    #initialize the output directory (no error if it already exists)
    output_dir=os.path.join(output_root, clade)
    os.makedirs(output_dir, exist_ok=True)
    output_path=os.path.join(output_dir, clade+'.transcript.fa')

    #each source FASTA is read at most once, and only if some gene actually needs it
    cache={}
    def fasta_lines(path):
        if path not in cache:
            with open(path, 'r') as text_file:
                cache[path]=text_file.readlines()
        return cache[path]

    with open(output_path, "w") as g:
        for gene in gene_list:
            if gene.startswith('ATHALIANA'):
                #special treatment for Col-0 because of the full gene names; using just the primary transcript
                clean=gene.replace('ATHALIANA_', '').replace('_1', '')+'.1'
                lines=fasta_lines(col0_cds_fasta)
            else:
                #everything else, search the NLRome CDS file
                clean=gene.replace('_1', '.1').replace('_2', '.2').replace('_3', '.3')
                lines=fasta_lines(nlrome_cds_fasta)

            #stop one short of the end: a header on the last line has no sequence after it
            transcript=[lines[i+1].strip('\n') for i in range(len(lines)-1) if clean in lines[i]]
            count=len(transcript)
            exons.append([gene, count])

            #only genes that yielded sequence count as found, otherwise the check below could never fail
            if count:
                found.append(gene)
                g.write('>'+gene+'\n')
                g.write(''.join(transcript)+'\n')

    #check that output transcripts matches number of input
    missing=sorted(set(gene_list)-set(found))
    if not missing:
        print('found all genes supplied; '+str(len(set(found)))+' genes\n\n')
    else:
        print('error, some genes missing')
        print(str(missing)+' not found')
    print('Exons added per gene: ')
    print(exons)
    print('\n\n')

    print('Gene file written to '+output_path)

In [None]:
# Trial run of transcript_roundup on one clade before processing all of them.
clade = 'Int10637_304_324_R_203'
gene_list = dictionary2[clade]
transcript_roundup(clade=clade, gene_list=gene_list)

In [None]:
# Write the concatenated-transcript FASTA for every clade taken from the alignment files.
for k in dictionary2:
    v = dictionary2[k]
    transcript_roundup(k, v)

It would be helpful to copy the best alignment file into the created clade folder.

In [None]:
# Copy each clade's alignment file into that clade's folder.
for file in cleanedfiles:
    # Remove the literal '.afa' extension. str.strip('.afa') would instead strip any
    # leading/trailing '.', 'a' or 'f' characters and could mangle the clade name.
    clade=file.split('/')[-1].replace('.afa', '')
    input_file = "/global/scratch/users/chandlersutherland/e14/NLRCladeFinder/Atha_NLRome/" + file
    output_dir = "/global/scratch/users/chandlersutherland/e14/popgen/clades/"+clade
    # Stay best-effort: report a missing source file and carry on with the rest.
    if not os.path.isfile(input_file):
        print('warning: '+input_file+' not found, skipping')
        continue
    # Make sure the destination is a directory so the copy lands inside it.
    os.makedirs(output_dir, exist_ok=True)
    # shutil.copy finishes before returning and involves no shell, unlike os.popen('cp ...').
    shutil.copy(input_file, output_dir)

In [None]:
# Display the list of alignment file paths that the copy loops iterate over.
cleanedfiles

Similarly for the tree output 

In [None]:
# Copy each clade's RAxML tree file into that clade's folder.
for file in cleanedfiles:
    # The tree lives in a subfolder named like the alignment file's parent directory.
    subfolder=file.split('/')[-2]
    # Remove the literal '.afa' extension. str.strip('.afa') would instead strip any
    # leading/trailing '.', 'a' or 'f' characters and could mangle the clade name.
    clade=file.split('/')[-1].replace('.afa', '')
    input_file = "/global/scratch/users/chandlersutherland/e14/NLRCladeFinder/Atha_NLRome/RAxML_tree_pbNB-ARC/" + subfolder+'/RAxML_bipartitionsBranchLabels.'+clade+'.Raxml.out'
    output_dir = "/global/scratch/users/chandlersutherland/e14/popgen/clades/"+clade
    # Stay best-effort: report a missing source file and carry on with the rest.
    if not os.path.isfile(input_file):
        print('warning: '+input_file+' not found, skipping')
        continue
    # Make sure the destination is a directory so the copy lands inside it.
    os.makedirs(output_dir, exist_ok=True)
    # shutil.copy finishes before returning and involves no shell, unlike os.popen('cp ...').
    shutil.copy(input_file, output_dir)

Create a single fasta file with all the NLR proteins in the NLRome

In [None]:
# Every gene name in the gene table, regardless of clade.
all_genes = gene['Gene']

In [None]:
#does not have to be clade aware, just making a single fasta
protein_fasta_path='/global/scratch/users/chandlersutherland/e14/popgen/NLRome_proteins.fasta'

#read the proteome once instead of once per gene
with open("/global/scratch/users/chandlersutherland/e14/hvnlr_clades/hvNLR/Athaliana_NLR_Phylogeny/Proteomes/NLRome.aa.fa",'r') as text_file:
    lines = text_file.readlines()

#genes for which at least one protein record was found; defined here so the cell runs on a fresh kernel
found=[]

# create output fasta file; the with-block closes it even if a lookup fails
with open(protein_fasta_path, "w") as g:
    #the loop variable is deliberately not called `gene`: that name holds the gene table DataFrame
    for gene_name in all_genes:
        #convert gene name
        if gene_name.startswith('ATHALIANA'):
            #using just the primary transcript
            clean=gene_name.replace('ATHALIANA_', '').replace('_1', '')+'.1'
        else:
            clean=gene_name.replace('_1', '.1').replace('_2', '.2').replace('_3', '.3')

        #search with the clean gene name; each match is taken to be a header followed by one sequence line.
        #stop one short of the end: a header on the last line has no sequence after it
        protein=[lines[i+1].strip('\n') for i in range(len(lines)-1) if clean in lines[i]]

        #only write genes that yielded sequence, so the check below can detect the missing ones
        if protein:
            found.append(gene_name)
            g.write('>'+gene_name+'\n')
            g.write(''.join(protein)+'\n')

#check that the output proteins match the genes that were asked for (all_genes, not a single clade's list)
s=set(found)
q=set(all_genes)
if s == q:
    print('found all genes supplied; '+str(len(s))+' genes\n\n')
else:
    print('error, some genes missing')
    temp3 = sorted(q - s)
    print(str(temp3)+' not found')

print('\n\n')

print('Gene file written to '+protein_fasta_path)