# Match Taxonomy to Key Names
This notebook processes fungal taxonomy data to identify and classify species according to the WHO fungal priority pathogens list.

- Loads taxonomy data for either training or test datasets.
- Defines critical, high, and medium priority fungal groups based on WHO guidelines.
- Matches species in the dataset to these priority groups.
- Saves lists of matched sample names and their classification groups for downstream analysis.
- Provides a framework for tracking the presence of priority fungal pathogens in environmental samples.

In [8]:
import os
import json
import pandas as pd

In [9]:
# Resolve the project root three directory levels above the current working
# directory, then point at its "data" sub-folder.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
data_folder = os.path.join(project_root, "data")

# Data
- training or test data

In [None]:
# Selects which taxonomy table is loaded further down.
# Accepted values: "training" or "test".
data_set = "training"

# WHO fungal priority pathogens list
- ref: https://iris.who.int/bitstream/handle/10665/363682/9789240060241-eng.pdf?sequence=1
- create json and save

In [None]:
# Umbrella entries of the priority list mapped to the genus/species labels
# they cover (high tier first, then medium tier).
high_priority_names_map = {
    # Species name corrected from "Falciformispora enegalensis" so it agrees
    # with the spelling used by the classification groups in this notebook.
    "Eumycetoma causative agents": ["Acremonium spp.", "Falciformispora senegalensis", "Curvularia lunata"],
    "Mucorales": ["Rhizopus spp.", "Mucor spp.", "Lichtheimia spp."],
    "Fusarium spp.": ["Fusarium spp."],
}
medium_priority_names_map = {"Scedosporium spp.": ["Scedosporium spp."]}

In [None]:
# Genus-level labels ("<Genus> spp.") mapped to the concrete species names
# listed for each genus.
groups = {
    "Acremonium spp.": [
        'Acremonium acutatum',
        'Acremonium brachypenium',
        'Acremonium charticola',
        'Acremonium furcatum',
        'Acremonium hennebertii',
        'Acremonium hyalinulum',
        'Acremonium pinkertoniae',
        'Acremonium polychromum',
        'Acremonium sp.',
        'Acremonium verruculosum',
        'Acremonium vitellinum',
    ],
    "Fusarium spp.": [
        'Fusarium beomiforme',
        'Fusarium chlamydosporum',
        'Fusarium longipes',
        'Fusarium nematophilum',
        'Fusarium neocosmosporiellum',
        'Fusarium oxysporum',
        'Fusarium redolens',
        'Fusarium solani',
        'Fusarium sp.',
    ],
    "Rhizopus spp.": [
        'Rhizopus arrhizus',
        'Rhizopus microsporus',
    ],
    "Mucor spp.": ['Mucor circinelloides'],
    "Lichtheimia spp.": ['Lichtheimia brasiliensis'],
    "Scedosporium spp.": [
        'Scedosporium aurantiacum',
        'Scedosporium boydii',
        'Scedosporium prolificans',
        'Scedosporium sp.',
    ],
}

Groups that will be used for classification

In [None]:
# Each dict maps a classification-group label to the exact species strings
# that are assigned to that label. Groups that consist of a single name map
# to a one-element list holding that same name, so they are generated with a
# comprehension; multi-species genus groups are spelled out explicitly.
# Insertion order is kept identical to the hand-written form.

critical_priority_classification_groups = {
    name: [name]
    for name in (
        'Cryptococcus neoformans',
        'Candida auris',
        'Aspergillus fumigatus',
        'Candida albicans',
    )
}

high_priority_classification_groups = {
    **{
        name: [name]
        for name in (
            'Nakaseomyces glabrata',
            'Histoplasma spp.',
            'Candida glabrata',
            'Candida tropicalis',
            'Candida parapsilosis',
        )
    },
    'Acremonium spp.': [
        'Acremonium acutatum',
        'Acremonium brachypenium',
        'Acremonium charticola',
        'Acremonium furcatum',
        'Acremonium hennebertii',
        'Acremonium hyalinulum',
        'Acremonium pinkertoniae',
        'Acremonium polychromum',
        'Acremonium sp.',
        'Acremonium verruculosum',
        'Acremonium vitellinum',
    ],
    'Falciformispora senegalensis': ['Falciformispora senegalensis'],
    'Curvularia lunata': ['Curvularia lunata'],
    'Rhizopus spp.': [
        'Rhizopus arrhizus',
        'Rhizopus microsporus',
    ],
    "Mucor spp.": ['Mucor circinelloides'],
    'Fusarium spp.': [
        'Fusarium beomiforme',
        'Fusarium chlamydosporum',
        'Fusarium longipes',
        'Fusarium nematophilum',
        'Fusarium neocosmosporiellum',
        'Fusarium oxysporum',
        'Fusarium redolens',
        'Fusarium solani',
        'Fusarium sp.',
    ],
    "Lichtheimia spp.": ['Lichtheimia brasiliensis'],
}

# NOTE(review): 'Pichia kudriavzeveii' is kept verbatim; the spelling
# 'Pichia kudriavzevii' is also in common use — confirm which form the
# taxonomy table contains before relying on this entry.
medium_priority_classification_groups = {
    'Scedosporium spp.': [
        'Scedosporium aurantiacum',
        'Scedosporium boydii',
        'Scedosporium prolificans',
        'Scedosporium sp.',
    ],
    **{
        name: [name]
        for name in (
            'Cryptococcus gattii',
            'Lomentospora prolificans',
            'Talaromyces marneffei',
            'Coccidioides spp.',
            'Pneumocystis jirovecii',
            'Pichia kudriavzeveii',
            'Candida krusei',
            'Paracoccidioides spp.',
        )
    },
}
    

Name by priorities

In [None]:
# Flat species-name lists per priority tier, assembled from per-genus
# segments so each genus is easy to locate. Element order is unchanged.
high_priority_group = (
    [
        'Nakaseomyces glabrata',
        'Histoplasma spp.',
        'Candida glabrata',
        'Candida tropicalis',
        'Candida parapsilosis',
    ]
    + [
        'Acremonium acutatum',
        'Acremonium brachypenium',
        'Acremonium charticola',
        'Acremonium furcatum',
        'Acremonium hennebertii',
        'Acremonium hyalinulum',
        'Acremonium pinkertoniae',
        'Acremonium polychromum',
        'Acremonium sp.',
        'Acremonium verruculosum',
        'Acremonium vitellinum',
    ]
    + [
        'Falciformispora senegalensis',
        'Curvularia lunata',
    ]
    + [
        'Rhizopus arrhizus',
        'Rhizopus microsporus',
        'Mucor circinelloides',
    ]
    + [
        'Fusarium beomiforme',
        'Fusarium chlamydosporum',
        'Fusarium longipes',
        'Fusarium nematophilum',
        'Fusarium neocosmosporiellum',
        'Fusarium oxysporum',
        'Fusarium redolens',
        'Fusarium solani',
        'Fusarium sp.',
    ]
    + ['Lichtheimia brasiliensis']
)

medium_priority_group = (
    [
        'Scedosporium aurantiacum',
        'Scedosporium boydii',
        'Scedosporium prolificans',
        'Scedosporium sp.',
    ]
    + [
        'Cryptococcus gattii',
        'Lomentospora prolificans',
        'Talaromyces marneffei',
        'Coccidioides spp.',
        'Pneumocystis jirovecii',
        'Pichia kudriavzeveii',
        'Candida krusei',
        'Paracoccidioides spp.',
    ]
)

In [None]:
# Bundle the three tier dictionaries under their tier names
# (critical, then high, then medium).
priority_list = dict(
    critical_priority=critical_priority_classification_groups,
    high_priority=high_priority_classification_groups,
    medium_priority=medium_priority_classification_groups,
)

In [None]:
# Serialise the tiered priority mapping and write it as JSON into the
# current working directory.
with open("WHO_fungal_priority_pathogens_list.json", "w") as json_out:
    json_out.write(json.dumps(priority_list))

# Taxonomy

In [None]:
# Reject unsupported selections up front, then read the tab-separated
# taxonomy table that belongs to the chosen data set.
if data_set not in ("training", "test"):
    raise ValueError("data_set must be either 'training' or 'test'")

taxonomy_file = (
    "DroughtITS.taxonomy.fix.txt"
    if data_set == "training"
    else "SakhonNakhonApril2025.taxonomy.fix.txt"
)
taxonomy_df = pd.read_csv(os.path.join(data_folder, taxonomy_file), sep="\t")

In [None]:
# Give the column that pandas labelled 'Unnamed: 0' the label 'Name'.
# If no such column exists the frame is returned unchanged.
taxonomy_df = taxonomy_df.rename({'Unnamed: 0': 'Name'}, axis="columns")

## Identify names that are in the priority list

### Matching species

Critical Group
- Note: only test data has critical group species

In [None]:
# Flatten every species list of the critical tier into one list of names.
critical_priority_group = [
    species
    for members in critical_priority_classification_groups.values()
    for species in members
]

In [None]:
# Critical tier: keep only rows whose Species value is exactly one of the
# listed names, renumber the index from 0, and show the resulting shape.
critical_priority_samples_df = (
    taxonomy_df
    .loc[taxonomy_df["Species"].isin(critical_priority_group)]
    .reset_index(drop=True)
)
critical_priority_samples_df.shape

In [None]:
# Invert the critical-tier dictionary (group label -> species list) into a
# species -> group label lookup, then label every matched row with it.
species_to_group = {
    member: label
    for label, members in critical_priority_classification_groups.items()
    for member in members
}
critical_priority_samples_df['classification_group'] = (
    critical_priority_samples_df['Species'].map(species_to_group)
)

In [None]:
# Write the matched critical-tier rows to a CSV in the data folder,
# without the positional index column.
critical_priority_samples_df.to_csv(
    f"{data_folder}/critical_priority_sample_names_{data_set}_data.csv",
    index=False,
)

In [None]:
# Save the distinct classification-group labels found among the matched
# critical-tier rows, one label per line.
critical_priority_samples_groups = list(critical_priority_samples_df.classification_group.unique())
# Nothing earlier in the notebook creates this sub-folder; make sure it
# exists so open() does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(f"{data_folder}/critical_priority", exist_ok=True)
with open(f"{data_folder}/critical_priority/critical_priority_samples_groups_{data_set}_data", 'w') as file:
    file.write('\n'.join(critical_priority_samples_groups))


High Priority Group

In [None]:
# Flatten every species list of the high tier into one list of names.
high_priority_group = [
    species
    for members in high_priority_classification_groups.values()
    for species in members
]

In [None]:
# High tier: keep only rows whose Species value is exactly one of the
# listed names, renumber the index from 0, and show the resulting shape.
high_priority_samples_df = (
    taxonomy_df
    .loc[taxonomy_df["Species"].isin(high_priority_group)]
    .reset_index(drop=True)
)
high_priority_samples_df.shape

In [None]:
# Invert the high-tier dictionary (group label -> species list) into a
# species -> group label lookup, then label every matched row with it.
species_to_group = {
    member: label
    for label, members in high_priority_classification_groups.items()
    for member in members
}
high_priority_samples_df['classification_group'] = (
    high_priority_samples_df['Species'].map(species_to_group)
)

In [None]:
# Write the matched high-tier rows to a CSV in the data folder,
# without the positional index column.
high_priority_samples_df.to_csv(
    f"{data_folder}/high_priority_sample_names_{data_set}_data.csv",
    index=False,
)

In [None]:
# Save the distinct classification-group labels found among the matched
# high-tier rows, one label per line.
high_priority_samples_groups = list(high_priority_samples_df.classification_group.unique())
# Nothing earlier in the notebook creates this sub-folder; make sure it
# exists so open() does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(f"{data_folder}/high_priority", exist_ok=True)
with open(f"{data_folder}/high_priority/high_priority_samples_groups_{data_set}_data", 'w') as file:
    file.write('\n'.join(high_priority_samples_groups))


Medium Priority Group

In [None]:
# Flatten every species list of the medium tier into one list of names.
medium_priority_group = [
    species
    for members in medium_priority_classification_groups.values()
    for species in members
]

In [None]:
# Medium tier: keep only rows whose Species value is exactly one of the
# listed names, renumber the index from 0, and show the resulting shape.
medium_priority_samples_df = (
    taxonomy_df
    .loc[taxonomy_df["Species"].isin(medium_priority_group)]
    .reset_index(drop=True)
)
medium_priority_samples_df.shape

In [None]:
# Invert the medium-tier dictionary (group label -> species list) into a
# species -> group label lookup, then label every matched row with it.
species_to_group = {
    member: label
    for label, members in medium_priority_classification_groups.items()
    for member in members
}
medium_priority_samples_df['classification_group'] = (
    medium_priority_samples_df['Species'].map(species_to_group)
)

In [None]:
# Write the matched medium-tier rows to a CSV in the data folder,
# without the positional index column.
medium_priority_samples_df.to_csv(
    f"{data_folder}/medium_priority_sample_names_{data_set}_data.csv",
    index=False,
)

In [None]:
# Save the distinct classification-group labels found among the matched
# medium-tier rows, one label per line.
medium_priority_samples_groups = list(medium_priority_samples_df.classification_group.unique())
# Nothing earlier in the notebook creates this sub-folder; make sure it
# exists so open() does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(f"{data_folder}/medium_priority", exist_ok=True)
with open(f"{data_folder}/medium_priority/medium_priority_samples_groups_{data_set}_data", 'w') as file:
    file.write('\n'.join(medium_priority_samples_groups))
