In [None]:
!pip3 install biopython

In [None]:
# Import libraries
import pandas as pd

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [None]:
# Define file paths
class FilePaths:
    """Simple container for the file paths used by this notebook.

    Args:
        input_path: Path of the CSV holding the raw translation data.
        cleaned_up_input_path: Path the translation data is written to once
            the per-sequence keys have been added.
        output_path: Path of the FASTA file to write.
    """

    # The second parameter was previously misspelled ("clearned_up_input_path"),
    # so constructing the object with the `cleaned_up_input_path=` keyword
    # raised a TypeError.
    def __init__(self, input_path, cleaned_up_input_path, output_path):
        self.input_path = input_path
        self.cleaned_up_input_path = cleaned_up_input_path
        self.output_path = output_path

# All sequence files live in the same directory; name it once so that moving
# the data only requires editing this single line.
SEQUENCES_DIR = "/home/azureuser/cloudfiles/code/Users/jc62/projects/esm2/data/sequences"

# Paths of the raw input CSV, the keyed copy of that CSV, and the FASTA output.
file_names = FilePaths(
    input_path=SEQUENCES_DIR + "/big_merge_translate_unique_translations_95_percent.csv",
    cleaned_up_input_path=SEQUENCES_DIR + "/translations_1_6189.csv",
    output_path=SEQUENCES_DIR + "/translations_complete.fasta",
)

In [None]:
# Load the processed wastewater translation table from the input CSV
translation_data = pd.read_csv(filepath_or_buffer=file_names.input_path)

In [None]:
## Build a per-row key that will label each sequence in the FASTA output
# Stringify every column that contributes to the key
protein_id_text = translation_data['Protein ID'].astype(str)
product_text = translation_data['Product'].astype(str)
organism_text = translation_data['Organism'].astype(str)
length_text = translation_data['length'].astype(str)
new_index_text = translation_data['new_index'].astype(str)

# Assemble: <protein id>_<product>_<organism>_length_<length>_new_index_<new_index>
raw_key = (
    protein_id_text + '_' + product_text + '_' + organism_text
    + '_length_' + length_text
    + '_new_index_' + new_index_text
)

# Spaces become underscores first; afterwards anything that is not a word
# character, hyphen or dot is dropped.
translation_data['key'] = (
    raw_key
    .str.replace(" ", "_")
    .str.replace(r'[^\w\-.]', '', regex=True)
)

# Keep a copy of the keyed table on disk for the record
translation_data.to_csv(file_names.cleaned_up_input_path, index=False)

In [None]:
## Turn each row of the translation table into a Biopython SeqRecord
# Pair up, per row: (new_index as text, product, key, translated sequence)
fasta_fields = zip(
    translation_data['new_index'].astype(str),
    translation_data["Product"],
    translation_data['key'],
    translation_data["Translation"],
)
sequences = list(fasta_fields)
translation_data['key_sequences'] = sequences

# Build one SeqRecord per tuple: new_index -> id, product -> name,
# key -> description, translation -> sequence
records = []
for record_id, record_name, record_description, residues in translation_data['key_sequences']:
    records.append(
        SeqRecord(
            Seq(residues),
            id=record_id,
            name=record_name,
            description=record_description,
        )
    )
translation_data['record'] = records

In [None]:
# Serialise every record to the output file in FASTA format
with open(file_names.output_path, "w") as fasta_handle:
    SeqIO.write(records, fasta_handle, "fasta")

In [None]:
# Example: read the output FASTA file back in as plain text rows.
# The configured output path is used instead of repeating the literal, so the
# file inspected here is always the file that was written.
# header=None makes read_csv treat the first line as data, not column names.
fasta = pd.read_csv(file_names.output_path, header=None)
print(fasta.head())