In [4]:
from Bio import SeqIO
from pathlib import Path
import os

In [None]:
# Sort out unknown proteins.
# A record counts as "unknown" when its description contains any of these keywords.
keywords = ["hypothetical", "putative", "postulated"]

# Input FASTA and the two output FASTA paths for the known/unknown split.
input_file = Path("../data/proteins/MZ501046.fasta")
output_known = Path("../data/proteins_known.fasta")
output_unknown = Path("../data/proteins_unknown.fasta")

# Create the directory that will contain each output file. Calling mkdir() on
# the file path itself would create a *directory* named "proteins_known.fasta",
# after which opening that path for writing fails.
output_known.parent.mkdir(parents=True, exist_ok=True)
output_unknown.parent.mkdir(parents=True, exist_ok=True)

In [6]:
# Create a CSV with id and sequence columns from a folder of separate FASTA files (or a single FASTA file)
import pandas as pd


def _extract_protein_id(description):
    """Return the value of the ``[protein_id=...]`` token in a FASTA description, or None."""
    prefix = "[protein_id="
    for token in description.split():
        if token.startswith(prefix):
            # Drop the prefix and the closing bracket of the "[key=value]" token.
            return token[len(prefix):].rstrip("]")
    return None


def fasta_to_df(path):
    """Parse a single FASTA file, or every file in a folder, into a pandas DataFrame.

    Only records whose description carries a ``[protein_id=...]`` token with a
    non-empty value are kept; all other records are skipped.

    Args:
        path: Path to one FASTA file, or to a directory whose regular files are
            all parsed as FASTA.

    Returns:
        pandas.DataFrame with the columns ``id`` (the protein_id value) and
        ``sequence`` (the record sequence as a string). It is empty when no
        record qualifies.

    Raises:
        FileNotFoundError: If ``path`` is neither an existing file nor an
            existing directory.
    """
    data = []

    # Determine if path is a file or folder
    if os.path.isfile(path):
        files = [path]
    elif os.path.isdir(path):
        # Sub-directories are skipped because they cannot be parsed as FASTA;
        # sorting makes the row order independent of the filesystem listing order.
        candidates = (os.path.join(path, name) for name in os.listdir(path))
        files = sorted(entry for entry in candidates if os.path.isfile(entry))
    else:
        # Without this branch `files` would be unbound and the loop below would
        # fail with an uninformative UnboundLocalError.
        raise FileNotFoundError(f"No such file or directory: {path!r}")

    for file in files:
        for record in SeqIO.parse(file, "fasta"):
            prot_id = _extract_protein_id(record.description)
            if prot_id:
                data.append((prot_id, str(record.seq)))

    return pd.DataFrame(data, columns=["id", "sequence"])


# Change input based on available FASTAs (folder or single file)
# folder_path = "../data/proteins"
# Note: despite its name, folder_path currently points at a single FASTA file.
folder_path = "../data/proteins_clustered/result_rep_seq.fasta"
output_csv = "../data/output.csv"
# Parse the input into an (id, sequence) table and write it as CSV without the
# DataFrame index column, then report how many rows were saved.
df = fasta_to_df(folder_path)
df.to_csv(output_csv, index=False)
print(f"Saved {len(df)} protein entries to {output_csv}")

Saved 6369 protein entries to ../data/output.csv


In [7]:
# Split sequences into known and unknown, and save to separate files.
# A record is "unknown" when its description contains any entry of `keywords`.
# NOTE(review): the outputs are written to the current working directory, not to
# the `output_known` / `output_unknown` paths defined earlier -- confirm which
# location is intended.
with (
    open("proteins_known.fasta", "w") as known_file,
    open("proteins_unknown.fasta", "w") as unknown_file,
):
    k, uk = 0, 0
    record = SeqIO.parse(input_file, "fasta")
    for entry in record:
        # Compare case-insensitively so that e.g. "Hypothetical protein" is
        # classified as unknown just like "hypothetical protein".
        description = entry.description.lower()
        if any(key.lower() in description for key in keywords):
            SeqIO.write(entry, unknown_file, "fasta")
            uk += 1
        else:
            SeqIO.write(entry, known_file, "fasta")
            k += 1

# Report the counters so the result of the split is visible in the notebook.
print(f"Wrote {k} known and {uk} unknown protein entries")

FileNotFoundError: [Errno 2] No such file or directory: '../data/proteins_deduplicated/result_rep_seq.fasta'

In [None]:
# Format