In [14]:
def find_species_in_fasta(species_list_file, fasta_file):
    """
    Report which species from a list appear in the headers of a FASTA file.

    Each header line (one starting with '>') is split on ';'. Headers with at
    least seven fields contribute the text "<field 5> <field 6>" (0-based
    indices), and a listed species is marked as present when its name occurs
    as a case-sensitive substring of that text.

    Args:
        species_list_file (str): Path to a text file with one species name per
            line. Surrounding whitespace is stripped and blank lines are
            ignored.
        fasta_file (str): Path to the FASTA file whose headers carry a
            ';'-separated taxonomy.

    Returns:
        dict: Maps each listed species to 1 if it was found in at least one
            header and 0 otherwise, in the order the species were listed.
        str: An "Error: ..." message (instead of a dict) when either input
            file does not exist. Callers should check for this case.
    """

    try:
        with open(species_list_file, 'r') as f:
            # Drop blank lines: an empty name is a substring of every header,
            # so it would otherwise appear as a bogus '' entry marked as found.
            species_list = [name for name in (line.strip() for line in f) if name]
    except FileNotFoundError:
        return f"Error: Species list file '{species_list_file}' not found."

    species_counts = {species: 0 for species in species_list}
    # Species still waiting for their first match; found ones are not re-tested.
    unmatched = set(species_counts)

    try:
        with open(fasta_file, 'r') as f:
            for line in f:
                if not unmatched:
                    # Everything is already found; the rest of the file
                    # cannot change the result.
                    break
                if not line.startswith('>'):
                    continue  # sequence line, not a header
                taxonomy = line.strip().split(';')
                if len(taxonomy) < 7:
                    continue  # not enough taxonomy information
                full_species = f"{taxonomy[5]} {taxonomy[6]}"

                found = {species for species in unmatched if species in full_species}
                for species in found:
                    species_counts[species] = 1
                unmatched -= found

    except FileNotFoundError:
        return f"Error: FASTA file '{fasta_file}' not found."

    return species_counts

# Example usage.
# Both inputs live under one PathoLens checkout: edit PATHOLENS_ROOT (and only
# that) to point the notebook at your own copy of the project.
PATHOLENS_ROOT = "/Users/claudia.restrepo-ortiz/Documents/MARBEC/JC/Pathogens/PathoLens"
species_list_file = f"{PATHOLENS_ROOT}/data/input/CRUSTACEAN/Crustacean_sp_pathogens_list.txt"  # species list, one name per line
fasta_file = f"{PATHOLENS_ROOT}/data/output/CRUSTACEAN/CRUSTACEAN_Pathogen_DB_curated.fasta"  # FASTA file to search

In [15]:
# Run the presence check for the configured species list and FASTA file.
# As the last expression in the cell, the returned value is shown as the cell output.
find_species_in_fasta(species_list_file, fasta_file)



{'Acinetobacter baumannii': 1,
 'Aeromonas enteropelogenes': 1,
 'Aeromonas hydrophila': 1,
 'Aeromonas veronii': 1,
 'Aliivibrio fischeri': 1,
 'Aliivibrio logei': 1,
 'Bacillus cereus': 1,
 'Bacillus circulans': 1,
 'Bacillus endophyticus': 1,
 'Bacillus licheniformis': 1,
 'Bacillus pumilus': 1,
 'Bacillus safensis': 1,
 'Bacillus subtilis': 1,
 'Bacillus tequilensis': 1,
 'Brevibacillus brevis': 1,
 'Brevibacterium antarcticum': 1,
 'Brevibacterium sanguinis': 1,
 'Citrobacter freundii': 1,
 'Enterococcus casseliflavus': 1,
 'Enterococcus faecalis': 1,
 'Exiguobacterium oxidotolerans': 1,
 'Francisella tularensis': 1,
 'Leucothrix mucor': 1,
 'Listonella anguillarum': 1,
 'Microbacterium kitamiense': 1,
 'Microbacterium oxydans': 1,
 'Micrococcus luteus': 1,
 'Mycoplasma monodon': 0,
 'Paenibacillus lentimorbus': 1,
 'Pantoea agglomerans': 1,
 'Pasteuria ramosa': 0,
 'Photobacterium damselae': 1,
 'Proteus mirabilis': 1,
 'Pseudoclavibacter helvolus': 1,
 'Pseudomonas marincola': 1

In [16]:
def extract_species_from_fasta(fasta_file):
    """
    Collect the distinct species labels found in a FASTA file's headers.

    A header is any line starting with '>'. It is stripped and split on ';';
    when that yields at least seven fields, the field at index 6 is taken as
    the species label. Headers with fewer fields are skipped.

    Args:
        fasta_file (str): Path to the FASTA file.

    Returns:
        set: The unique species labels, or an "Error: ..." message string if
            the file does not exist.
    """
    try:
        with open(fasta_file, 'r') as handle:
            # One list of ';'-separated fields per header line.
            header_fields = (
                row.strip().split(';') for row in handle if row.startswith('>')
            )
            # Built eagerly, so the file is fully read before it is closed.
            return {fields[6] for fields in header_fields if len(fields) >= 7}
    except FileNotFoundError:
        return f"Error: FASTA file '{fasta_file}' not found."

# Example usage (reuses the `fasta_file` path defined for find_species_in_fasta):
# fasta_file = "sequences.fasta"  # Replace with your FASTA file

species_found = extract_species_from_fasta(fasta_file)
if isinstance(species_found, str):
    # A missing file is reported as a message string; show it as-is rather
    # than printing its character count as if it were a species count.
    print(species_found)
else:
    print(len(species_found))
    # Set iteration order for strings can change between kernel restarts
    # (hash randomization), so sort to keep the cell output reproducible.
    print(sorted(species_found))

73
{'Acinetobacter baumannii', 'Vibrio campbellii', 'Listonella anguillarum', 'Proteus mirabilis', 'Microbacterium oxydans', 'Exiguobacterium oxidotolerans', 'Spiroplasma penaei', 'Aeromonas sp. CCRC 13881', 'Bacillus subtilis', 'Pseudomonas sp. GM4FR', 'Aeromonas hydrophila', 'Psychrobacter glacincola', 'Vibrio vulnificus', 'Pseudoclavibacter helvolus', 'bacterium BM0323', 'Spiroplasma eriocheiris', 'Stenotrophomonas maltophilia', 'Pantoea agglomerans', 'Staphylococcus epidermidis', 'Pseudomonas monteilii', 'Vibrio hepatarius', 'Microbacterium kitamiense', 'Vibrio ordalii', 'Bacillus circulans', 'Francisella tularensis', 'Streptococcus phocae', 'Brevibacillus brevis', 'Pseudomonas putida', 'Microbacterium aurantiacum', 'Bacillus pumilus', 'Vibrio lentus', 'Vibrio fortis', 'Vibrio rotiferianus', 'Paenibacillus lentimorbus', 'Enterococcus faecalis', 'Vibrio chagasii', 'Bacillus cereus', 'Micrococcus luteus', 'Bacillus endophyticus', 'Bacillus safensis', 'Vibrio harveyi', 'Wolbachia pipi