In [None]:
! pip install biopython
from Bio import SeqIO

def filter_check_faa_sequences(input_faa, output_faa, ambiguous_threshold=0.05, min_length=50, filter_rare=True):
    """
    Filter protein sequences in a .faa FASTA file and write the survivors.

    Each record is upper-cased for checking and dropped at the first rule it
    fails, in this order:
    - Sequence length below ``min_length`` (or an empty sequence)
    - Fraction of ambiguous amino acids (X, B, Z) above ``ambiguous_threshold``
    - Optional: presence of any rare amino acid (U, O)

    Side effects: writes the kept records to ``output_faa`` in FASTA format
    and prints a one-line summary to stdout.

    Args:
        input_faa (str): Path to input .faa file
        output_faa (str): Path to output filtered .faa file
        ambiguous_threshold (float): Max allowed proportion of ambiguous residues (default: 0.05)
        min_length (int): Minimum sequence length (default: 50)
        filter_rare (bool): If True, drop sequences containing U or O (default: True)

    Returns:
        tuple: ``(kept, removed)`` where ``kept`` is the list of records that
        passed every check and ``removed`` is a list of ``(record_id, reason)``
        tuples for the records that were dropped.
    """
    ambiguous_residues = set('XBZ')
    rare_residues = set('UO')

    kept = []
    removed = []
    total = 0

    # Stream the input: only the kept records need to stay in memory, the
    # total is tracked with a counter instead of materializing every record.
    for record in SeqIO.parse(input_faa, "fasta"):
        total += 1
        seq = str(record.seq).upper()
        length = len(seq)

        # Length check
        if length < min_length:
            removed.append((record.id, 'Too short'))
            continue

        # An empty sequence can only reach this point when min_length <= 0;
        # reject it here so the fraction below never divides by zero.
        if length == 0:
            removed.append((record.id, 'Empty sequence'))
            continue

        # Ambiguous residue check
        num_ambiguous = sum(seq.count(res) for res in ambiguous_residues)
        frac_ambiguous = num_ambiguous / length
        if frac_ambiguous > ambiguous_threshold:
            removed.append((record.id, f"Ambiguous residues > {ambiguous_threshold*100:.1f}%"))
            continue

        # Rare residue check (previously advertised but never applied)
        if filter_rare and not rare_residues.isdisjoint(seq):
            removed.append((record.id, 'Contains rare residues (U/O)'))
            continue

        kept.append(record)

    SeqIO.write(kept, output_faa, "fasta")
    print(f"Filtering complete: {len(removed)} sequences removed out of {total} total.")

    return kept, removed

