## Append gene names to FASTA record headers

In [1]:
from Bio import SeqIO
import pandas as pd

In [2]:
# Source FASTA and the destination for the re-labelled copy.
input = "../annotation/annotation/concatenated_fasta.fasta"
output = "../annotation/annotation/concatenated_fasta_appened.fasta"

# Reference tables. At this point these names hold file paths only.
card = "../annotation/annotation/amr_databases/aro_index_v3.8.9.tsv"
amrfinder = "../annotation/annotation/amr_databases/ReferenceGeneCatalog_v3.12.txt"

# Collect the name of every record in the source FASTA, in file order.
gene_names = []
with open(input, "r") as fasta_file:
    gene_names.extend(
        record.name for record in SeqIO.parse(fasta_file, "fasta")
    )

In [3]:
# Replace the two path strings with the tab-separated tables they point to.
card, amrfinder = (
    pd.read_csv(card, sep="\t"),
    pd.read_csv(amrfinder, sep="\t"),
)

def concatenate_gene_names(element):
    """Return the FASTA header *element* with a gene name appended.

    The accession is looked up first in the module-level ``amrfinder`` table,
    matching either ``genbank_protein_accession`` or
    ``refseq_protein_accession``; the first ``gene_family`` hit is used.
    When AMRFinder has no hit, the module-level ``card`` table is searched by
    ``Protein Accession`` and the first ``CARD Short Name`` is used.

    Args:
        element: Record identifier taken from the FASTA file.

    Returns:
        ``"<accession>_<gene name>"`` when a name is found. The header
        ``sp|P24734.3|AMPR_PSEAE`` is reduced to ``P24734.3`` before the CARD
        lookup, so its result starts with the bare accession. When neither
        table yields a usable (string) name, a message is printed and the
        original *element* is returned unchanged, so one unmatched record
        does not abort the whole run.
    """
    original = element

    in_amrfinder = (
        (amrfinder['genbank_protein_accession'] == element)
        | (amrfinder['refseq_protein_accession'] == element)
    )
    families = amrfinder.loc[in_amrfinder, 'gene_family']

    if not families.empty:
        gene_name = families.iloc[0]
    else:
        # This one header is rewritten to its bare accession before the CARD
        # lookup -- presumably the form CARD stores; verify against the table.
        if element == "sp|P24734.3|AMPR_PSEAE":
            element = 'P24734.3'
        short_names = card.loc[
            card['Protein Accession'] == element, 'CARD Short Name'
        ]
        if short_names.empty:
            # Accession is in neither table: keep the header as it was.
            print('Index error', element)
            return original
        gene_name = short_names.iloc[0]

    # A missing value in the table (e.g. NaN) cannot be appended to a header.
    if not isinstance(gene_name, str):
        print(f"No usable gene name for {element}: {gene_name!r}")
        return original

    return element + "_" + gene_name


# Map every collected record name to its gene-name-extended header.
updated_headers = {
    name: concatenate_gene_names(name)
    for name in gene_names
}

In [6]:
# Re-read the source FASTA and swap each record's ID for its extended header.
# NOTE(review): record.description is left exactly as parsed, so it still
# begins with the old ID; Biopython's FASTA writer will likely emit
# ">new_id old_id ..." -- confirm that this header layout is intended.
updated_records = []
with open(input, "r") as source_handle:
    for record in SeqIO.parse(source_handle, "fasta"):
        record.id = updated_headers[record.id]
        updated_records.append(record)

# Write all re-labelled records to the destination FASTA in one call.
with open(output, "w") as destination_handle:
    SeqIO.write(updated_records, destination_handle, "fasta")