# In [None]:
# Last updated July 3, 2025 by Emily Trudeau.

# This code renames sequences from fasta files downloaded from ncbi/blast.  
# Sequences downloaded from ncbi/blast have this format:
# eg: >KAI9176136.1 hypothetical protein H9P43_006501 [Blastocladiella emersonii ATCC 22665]
# This code renames sequences to look like this:
# eg: >Blastocladiella-emersonii-ATCC-22665_KAI9176136-1_Blastocladiomycota
# i.e. Species-name-and-or-strain-information_Accession-version_Other_Information
# Specifically, it separates species name, accession, and other information types by '_'; replaces
# spaces with '-', removes '.' and any other unwanted characters that might cause trouble for downstream analyses.

# You will need to download Biopython if you don't have it on your computer:
# https://biopython.org/wiki/Download

# Import Biopython packages
import Bio
from Bio import SeqIO

# !! UPDATE THESE BEFORE RUNNING CODE -- 'path' should not include the file name, and 'filename' excludes '.fasta'.
# The input is read from path + filename + '.fasta' and the output is written to
# path + filename + '-renamed.fasta', so 'path' must end with its trailing separator.
# 'group' is appended as the last '_'-separated field of every renamed sequence ID.
path = 'fasta-files/' # e.g. /Users/emilytrudeau/Documents/Test-sequences/
filename = 'E-luteolus-1174482_GH28-blast-Ascomycota_100seq' # e.g. E-luteolus-1174482_GH28-blast-Ascomycota_100seq
group = 'Ascomycota' # information that applies to all sequences in your file

#------------------------------------------------------------------------------------------------------------------

# Maps each full description line to its sequence; the first occurrence of a
# description wins and later ones are reported as duplicates.
fasta_dict = {}
patent_list = []  # not referenced in this part of the script; kept in case other cells use it
dupe_counter = 0

# Reading in the original fasta file, prints and removes any duplicates
input_fasta = '{}{}.fasta'.format(path, filename)
for seq_record in SeqIO.parse(input_fasta, 'fasta'):
    description = seq_record.description
    if description in fasta_dict:
        # Same description seen before: report whether the sequences also agree,
        # then drop this copy.
        print('Duplicate:', seq_record.id, '/', 'Sequences match?', seq_record.seq == fasta_dict[description])
        dupe_counter += 1
    else:
        fasta_dict[description] = seq_record.seq

print("---------------------------------------------")
print('Number of duplicates removed:', dupe_counter)
print("---------------------------------------------")
print("Number of sequences in dictionary:", len(fasta_dict))
print("---------------------------------------------")
# The original formatted an undefined name here (NameError); 'group' is the
# label that describes every sequence in this file.
print("Renaming {} sequences...".format(group))
print("---------------------------------------------")

# Index-aligned buffers: old_key_buffer[i] is the original description that was
# renamed to new_key_buffer[i]. Only successfully renamed records are stored.
old_key_buffer = []
new_key_buffer = []

# Characters deleted from the finished ID because they might cause trouble for
# downstream analyses: ( ) , ' / : ; .
unwanted_chars = str.maketrans('', '', "(),'/:;.")

# Renaming sequences
for old_key in fasta_dict:
    if '[' not in old_key:  # if no species name in sequence description
        # No species means the new ID cannot be built. Report the record and
        # leave it out of the output instead of crashing on the missing '['.
        print('Delete this sequence from original fasta file:', old_key)
        continue

    # The accession and its version come from the first space-delimited word
    # only, so a '.' later in the description can never be mistaken for the
    # version separator.
    seq_id = old_key.split(' ', 1)[0]
    id_parts = seq_id.split('.')
    accession = id_parts[0].replace("_", "")  # removing underscores
    accession_version = id_parts[1] if len(id_parts) > 1 else ''
    # An ID without a version keeps just the accession (no dangling '-').
    accession_tag = accession + '-' + accession_version if accession_version else accession

    # Species is the text inside the first '[' ... ']' pair.
    # NOTE(review): a description with an earlier bracket (e.g. '[NiFe] ...' or
    # '[[Candida] auris]') would yield the wrong text here -- confirm inputs.
    species = old_key.split('[')[1].split(']')[0]
    species = species.replace(" ", "-").replace("_", "-")  # spaces/underscores -> hyphens

    # Building the new sequence id, then removing unwanted characters in one pass
    new_key = ('>' + species + '_' + accession_tag + '_' + group).translate(unwanted_chars)

    old_key_buffer.append(old_key)
    new_key_buffer.append(new_key)

# Building the dictionary with new names (records skipped above are not carried over;
# if two records map to the same new name, the later sequence replaces the earlier one)
fasta_dict = {
    new_key: fasta_dict[old_key]
    for old_key, new_key in zip(old_key_buffer, new_key_buffer)
}

# Keys of fasta_dict, in dictionary order
keys_list = list(fasta_dict)

# Values of fasta_dict converted with str(), in the same order as keys_list
values_list = [str(sequence) for sequence in fasta_dict.values()]

# Check that we haven't lost any sequences
print('Check:', len(values_list), 'sequences in output.')

# Writing our renamed sequences to a fasta file: one key line followed by one
# sequence line per record, saved next to the input as '<filename>-renamed.fasta'
renamed_fasta = '{}{}-renamed.fasta'.format(path, filename)
with open(renamed_fasta, 'w') as output:
    for header, sequence in zip(keys_list, values_list):
        output.write(header + '\n' + sequence + '\n')

print("---------------------------------------------")
print("File renamed and ouput to path.")