In [1]:
# Install Biopython (if not installed)
!pip install biopython

from collections import defaultdict
from Bio import SeqIO

def select_top3_longest_by_species(input_faa, output_faa):
    """
    Keep the three longest protein sequences per species from a multi-FASTA file.

    Records are grouped by the species named in the ``[organism=...]`` tag of
    the FASTA header. Records without that tag are grouped together under
    "Unknown". If the tag is present but its closing "]" is missing, the rest
    of the header after "[organism=" is used as the species name. Within each
    group the records are ranked by sequence length (longest first; ties keep
    their input order) and at most three are retained. Groups with fewer than
    three records are kept in full.

    This reduces redundancy by keeping a few representative sequences per
    species, which is useful for downstream comparative analyses.

    Parameters:
    - input_faa (str): Path to the input FASTA file containing protein sequences.
    - output_faa (str): Path for the output FASTA file containing the top 3 sequences per species.

    Outputs:
    - A FASTA file with up to 3 longest sequences per species saved to 'output_faa'.
    - A printout summary of the number of sequences retained.
    - When running in Google Colab, a browser download of the output file is
      triggered; in any other environment this step is skipped.

    Example:
    >>> select_top3_longest_by_species("cleaned_MYO_animals.faa", "cleaned_MYO_animals_top3.faa")
    """
    organism_tag = "[organism="
    species_seqs = defaultdict(list)

    with open(input_faa) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            _, found, remainder = record.description.partition(organism_tag)
            if found:
                # partition() yields the whole remainder when "]" is absent,
                # so an unterminated tag does not lose its last character.
                species = remainder.partition("]")[0]
            else:
                species = "Unknown"
            species_seqs[species].append(record)

    selected_seqs = []
    for seqs in species_seqs.values():
        ranked = sorted(seqs, key=lambda rec: len(rec.seq), reverse=True)
        # Slicing already copes with groups holding fewer than three records.
        selected_seqs.extend(ranked[:3])

    with open(output_faa, "w") as out_handle:
        SeqIO.write(selected_seqs, out_handle, "fasta")

    print(f"Selected {len(selected_seqs)} sequences.")
    print(f"Filtered sequences saved to '{output_faa}'")

    # Trigger download (for Colab). The import lives here so the function does
    # not depend on a notebook cell having imported `files` beforehand, and so
    # it still works when google.colab is not installed.
    try:
        from google.colab import files
    except ImportError:
        return
    files.download(output_faa)



In [2]:
# Keep only the three longest myosin sequences per animal species
# from google.colab import files
# uploaded = files.upload()
# select_top3_longest_by_species("cleaned_MYO_animals.faa", "cleaned_MYO_animals_top3.faa")