In [1]:
import os
from collections import defaultdict


def combine_fasta(path):
    """
    Rewrite a FASTA file in place so each record's sequence is on one line.

    The file is read completely before it is reopened for writing, so a
    parse error leaves the original file untouched.

    Each line is stripped of leading/trailing whitespace and blank lines are
    ignored. A line starting with ">" begins a new record and is kept
    verbatim as the header; every other line is appended to the sequence of
    the most recent header. Records are written back in their original
    order as "<header>\\n<sequence>\\n\\n". Records that share the same
    header text are kept as separate records rather than overwriting one
    another.

    Args:
        path: Path of the FASTA file to rewrite.

    Raises:
        ValueError: If sequence data appears before the first header line.
        OSError: If the file cannot be read or written.
    """
    # Ordered list of (header, [sequence chunks]). A list is used instead of
    # a dict keyed by header so that repeated headers do not clobber the
    # sequence collected for an earlier record.
    records = []
    with open(path, "r") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # Skip blank lines
            if line.startswith(">"):
                records.append((line, []))
            elif not records:
                # No header seen yet: refuse to continue instead of writing
                # a bogus header line into the output.
                raise ValueError(
                    f"{path}: sequence data on line {line_no} "
                    "appears before any FASTA header"
                )
            else:
                records[-1][1].append(line)

    # Overwrite the original file only after parsing succeeded.
    with open(path, "w") as f:
        for header, chunks in records:
            f.write(f"{header}\n{''.join(chunks)}\n\n")


# Root directory whose immediate subfolders (21 in the original dataset)
# each hold the FASTA file(s) to be reformatted.
parent_folder = "/mnt/d/ebola/data/RNAup"  # Change this to your actual path

# File-name suffixes treated as FASTA; extend this tuple if your files use
# a different naming convention.
fasta_suffixes = (".fa", ".fasta")

# Visit every subfolder and rewrite each FASTA file found directly inside it.
for subdir in os.listdir(parent_folder):
    subdir_path = os.path.join(parent_folder, subdir)
    if not os.path.isdir(subdir_path):
        continue  # Plain files at the top level are ignored
    for filename in os.listdir(subdir_path):
        if not filename.endswith(fasta_suffixes):
            continue  # Not a FASTA file
        fasta_path = os.path.join(subdir_path, filename)
        print(f"Processing {fasta_path}...")
        combine_fasta(fasta_path)
        print(f"Overwritten {fasta_path} with combined sequence.")

Processing /mnt/d/ebola/data/RNAup/AKT3/AKT3_ENST00000673466_1.fa...
Overwritten /mnt/d/ebola/data/RNAup/AKT3/AKT3_ENST00000673466_1.fa with combined sequence.
Processing /mnt/d/ebola/data/RNAup/CD9/CD9_ENST00000009180_10.fa...
Overwritten /mnt/d/ebola/data/RNAup/CD9/CD9_ENST00000009180_10.fa with combined sequence.
Processing /mnt/d/ebola/data/RNAup/CLIP1/CLIP1_ENST00000361654_8.fa...
Overwritten /mnt/d/ebola/data/RNAup/CLIP1/CLIP1_ENST00000361654_8.fa with combined sequence.
Processing /mnt/d/ebola/data/RNAup/DCP1A/DCP1A_ENST00000610213_6.fa...
Overwritten /mnt/d/ebola/data/RNAup/DCP1A/DCP1A_ENST00000610213_6.fa with combined sequence.
Processing /mnt/d/ebola/data/RNAup/EHD4/EHD4_ENST00000220325_9.fa...
Overwritten /mnt/d/ebola/data/RNAup/EHD4/EHD4_ENST00000220325_9.fa with combined sequence.
Processing /mnt/d/ebola/data/RNAup/EIF2AK2/EIF2AK2_ENST00000233057_9.fa...
Overwritten /mnt/d/ebola/data/RNAup/EIF2AK2/EIF2AK2_ENST00000233057_9.fa with combined sequence.
Processing /mnt/d/ebol