In [None]:
import Bio
from Bio import SeqIO
from Bio import Entrez
from Bio import Phylo
from Bio import AlignIO
import tkinter
from tkinter import *
from tkinter import filedialog
import os
import pandas as pd

In [None]:
# Select the Excel file that lists the accession numbers for the nucleotide
# sequences (available in the supplementary materials of the paper).
# The dialog is just a shortcut for typing out the complete file path.

root = tkinter.Tk()
try:
    # askopenfilename returns the chosen path (empty if the dialog is
    # cancelled) without opening the file, so no file handle is left open.
    selected_path = filedialog.askopenfilename(initialdir=os.getcwd())
finally:
    # Always tear down the Tk root window, even if the dialog itself fails.
    root.destroy()

# Stop here with a clear message instead of failing later on an undefined
# (or stale) `filepath`.
if not selected_path:
    raise FileNotFoundError("No spreadsheet was selected; re-run this cell and choose a file.")

filepath = os.path.abspath(selected_path)
filepath

In [None]:
# Creates a new folder called "fasta_files" in the same directory where your
# spreadsheet file is located.

# os.path works with the path separator of the current operating system,
# whereas splitting on a hard-coded '\\' only works for Windows paths.
folder = os.path.dirname(filepath)

# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(os.path.join(folder, 'fasta_files'), exist_ok=True)

In [None]:
# Loads the spreadsheet that you selected into a DataFrame (an in-memory copy).

df_accession = pd.read_excel(filepath)

# The download cell looks rows up by these exact column names. Checking for
# them here gives an immediate, readable error instead of every download
# being skipped later on.
required_columns = ['Accession Number*', 'Breed', 'ID']
missing_columns = [name for name in required_columns if name not in df_accession.columns]
if missing_columns:
    raise KeyError(f"Spreadsheet is missing required column(s): {missing_columns}")

df_accession

In [None]:
# Creates a pop-up window that asks for the user's email address.
# NCBI GenBank wants to know who you are in case there is an issue.

root = tkinter.Tk(className='Email Input')
root.geometry("500x75")
email_var = tkinter.StringVar()


# Callback for the Submit button
def submit():
    """Accept the typed email address and close the pop-up window.

    Surrounding whitespace is stripped and the cleaned value is stored back
    in ``email_var`` so later cells read the cleaned address. If the field is
    empty, a message is printed and the window stays open so an address can
    still be entered.
    """
    email = email_var.get().strip()
    if not email:
        print("Please enter an email address before pressing Submit.")
        return
    email_var.set(email)
    print("The user's email address is: " + email)
    root.destroy()


# Creates the label, input box and button on the pop-up window
email_label = tkinter.Label(root, text = "User's Email Address", font=('calibre',10, 'bold'))
email_entry = tkinter.Entry(root,textvariable = email_var, font=('calibre',10,'normal'),width=50)
sub_btn=tkinter.Button(root,text = 'Submit', command = submit)
email_label.grid(row=0,column=0)
email_entry.grid(row=0,column=1)
sub_btn.grid(row=2,column=1)

# Runs the pop-up window's event loop; this cell blocks until the window is closed
root.mainloop()

In [None]:
# This cell fetches records from the GenBank database and saves them as
# individual FASTA files whose deflines match the nomenclature used in the
# Solarolo dogs study.
# Expect this cell to have a longer runtime due to repeated interaction with
# the database.

fasta_list = list()
fasta_string = str()  # empty separator, used when the FASTA strings are joined
failed_accessions = list()  # accession codes that could not be downloaded or saved

# Output folder for the individual FASTA files (created if it does not exist yet)
fasta_dir = os.path.join(folder, 'fasta_files')
os.makedirs(fasta_dir, exist_ok=True)

# An email is needed to identify yourself to the GenBank database. It is the
# same for every request, so it is set once before the loop.
Entrez.email = email_var.get()

# Iterate over the available accession codes in the spreadsheet object
for accession in df_accession[df_accession.columns[1]].dropna():
    try:
        # 'efetch' retrieves the record in GenBank ("gb") format from the
        # nucleotide database
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Close the connection even when the response cannot be parsed
            handle.close()

        # Look up the first spreadsheet row for this accession once, then name
        # the defline after the breed, or after the ID when no breed is given
        # ('-' or an empty cell); based on figure 1 in the Solarolo dogs study.
        row = df_accession.loc[df_accession['Accession Number*'] == accession].iloc[0]
        breed = row['Breed']
        if pd.isna(breed) or breed == '-':
            # str() because the FASTA writer needs a text description and
            # spreadsheet cells may hold numbers
            record.description = str(row['ID'])
        else:
            record.description = str(breed)

        # Create an individual FASTA file for this accession number with the
        # updated defline
        SeqIO.write(record, os.path.join(fasta_dir, f'{record.description}.fasta'), "fasta")

        # Convert the record object to FASTA text and add it to the local list
        fasta_list.append(SeqIO.FastaIO.as_fasta(record))

    # A failure for one accession code (e.g. an invalid code) must not stop
    # the run: report it and jump to the next accession code
    except Exception as error:
        failed_accessions.append(accession)
        print(f"Skipped accession {accession!r}: {error}")

if failed_accessions:
    print(f"{len(failed_accessions)} accession(s) could not be processed: {failed_accessions}")

In [None]:
# This cell writes the FASTA strings collected in 'fasta_list' to a single
# .txt file next to the spreadsheet.

# os.path.join puts the path separator between the folder and the file name;
# without it the file would be created in the parent directory, with the
# folder name glued onto the front of the file name.
with open(os.path.join(folder, "canid_rDNA.txt"), "w") as text_file:
    # Each entry is already a complete FASTA record, so no separator is needed
    text_file.write("".join(fasta_list))

In [None]:
# This cell reads the individual FASTA files that you just created and writes
# them into a single combined FASTA file.

fasta_dir = os.path.join(folder, 'fasta_files')

# Only pick up FASTA files (anything else in the folder would make the parser
# fail) and sort the names so the combined file has a reproducible order.
fasta_names = sorted(
    name for name in os.listdir(fasta_dir) if name.lower().endswith('.fasta')
)

SeqIO.write(

    # each individual file holds exactly one record, which SeqIO.read returns
    [SeqIO.read(os.path.join(fasta_dir, name), format='fasta') for name in fasta_names],

    # path of the new combined FASTA file
    os.path.join(folder, "canid_rDNA.fasta"),
    format="fasta")