In [194]:
# Install Biopython
!pip install biopython

# Import required libraries
import os

from Bio import SeqIO
from google.colab import files

# Define a function that removes duplicate sequences from a combined .faa file
def remove_duplicate_sequences(faa_file):
    """
    Remove duplicate protein sequences, per organism, from a multi-FASTA file.

    Records are grouped by the ``[organism=...]`` tag found in their
    description line; records without that tag are grouped under "Unknown".
    Within each group only the first record of every distinct amino acid
    sequence is kept. Duplicates are detected by exact, case-sensitive string
    comparison of the sequences, so the same sequence occurring under two
    different organisms is kept once for each organism.

    The retained records are written grouped by organism, in the order each
    organism first appears in the input.

    Parameters:
    - faa_file (str): Path to the input FASTA (.faa) file containing protein
      sequences.

    Outputs:
    - A cleaned FASTA file whose name is "cleaned_" + the input file name,
      written to the same directory as the input file.
    - A printed summary: the original number of sequences, the number of
      duplicates removed, the output path and the number of unique sequences
      remaining.
    - Automatically triggers file download (for Colab workflows).

    Example:
    >>> remove_duplicate_sequences("combined_proteins.faa")
    """
    # Read everything up front: the total count is needed for the report.
    records = list(SeqIO.parse(faa_file, "fasta"))

    # Group sequences by organism (dicts keep first-appearance order).
    species_dict = {}
    for record in records:
        organism = _organism_from_description(record.description)
        species_dict.setdefault(organism, []).append(record)

    # Remove duplicates within each species, keeping the first occurrence.
    unique_sequences = []
    duplicate_sequences = []
    for recs in species_dict.values():
        seen = set()
        for rec in recs:
            seq_str = str(rec.seq)
            if seq_str in seen:
                duplicate_sequences.append(rec)
            else:
                seen.add(seq_str)
                unique_sequences.append(rec)

    # Save cleaned file. Prefix only the file name, not the whole path, so an
    # input such as "data/x.faa" is written to "data/cleaned_x.faa" instead of
    # the non-existent "cleaned_data/x.faa".
    directory, filename = os.path.split(faa_file)
    cleaned_file = os.path.join(directory, "cleaned_" + filename)
    SeqIO.write(unique_sequences, cleaned_file, "fasta")

    # Report
    print(f"Original file had {len(records)} sequences.")
    print(f"Removed {len(duplicate_sequences)} duplicate sequences.")
    print(f"Cleaned file saved as: {cleaned_file}")
    print(f"Cleaned file has {len(unique_sequences)} unique sequences.")

    # Trigger download
    files.download(cleaned_file)


def _organism_from_description(description):
    """Return the organism tagged in a FASTA description, or 'Unknown' if untagged."""
    if '[organism=' not in description:
        return 'Unknown'
    # Take the text between the tag opener and the next closing bracket.
    return description.split('[organism=')[1].split(']')[0].strip()





