# Genus/Species to Class Converter
This notebook converts the genus/species listing from BioTIME-DB to taxonomic classes. For each name it queries the NCBI Taxonomy (via Biopython), Catalogue of Life (COL), GBIF, and WoRMS databases, in that order, and keeps the first class found.

In [15]:
# Import necessary packages and set NCBI API information.
!pip install biopython tqdm

import pandas as pd
import requests
import numpy as np
from Bio import Entrez
import time
import math
from tqdm import tqdm
import os
import glob

! wget https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv -O myfile.csv
df_all = pd.read_csv("myfile.csv")

Entrez.email = "emduggan@mit.edu"
Entrez.api_key = "2e5155aba559345711a3af676cb6c6703608"

--2025-05-08 01:56:10--  https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1185895 (1.1M) [text/plain]
Saving to: ‘myfile.csv’


2025-05-08 01:56:10 (17.2 MB/s) - ‘myfile.csv’ saved [1185895/1185895]



In [11]:
# Uses an email and API key to query the NCBI database
def ncbi_query(term):
    """Look up the taxonomic class of ``term`` in the NCBI Taxonomy database.

    Runs an Entrez ``esearch`` for the term, fetches the record of the first
    hit with ``efetch`` and scans its ``LineageEx`` for the entry whose rank
    is ``"class"``.

    Args:
        term (str): Scientific name (species or genus) to search for.

    Returns:
        str or None: Scientific name of the class. None when the search has
        no hit, the record has no class-rank ancestor, or the lookup raised
        (the error is printed, not propagated).
    """
    try:
        search = Entrez.esearch(db="taxonomy", term=term, retmode="xml")
        try:
            result = Entrez.read(search)
        finally:
            # Always release the handle, even if parsing fails.
            search.close()

        if not result["IdList"]:
            return None

        # Only the first (best) hit is inspected.
        taxid = result["IdList"][0]
        fetch = Entrez.efetch(db="taxonomy", id=taxid, retmode="xml")
        try:
            records = Entrez.read(fetch)
        finally:
            fetch.close()

        if not records:
            return None

        lineage = records[0].get("LineageEx", [])
        class_entry = next((r for r in lineage if r.get("Rank") == "class"), None)
        return class_entry["ScientificName"] if class_entry else None
    except Exception as e:
        # Best-effort lookup: report and let the caller fall back to another DB.
        print(f"❌ NCBI lookup error for {term}: {e}")
        return None

# Queries the Catalogue of Life (COL) database
def col_query(term, timeout=30):
    """Look up the taxonomic class of ``term`` via the Catalogue of Life API.

    Sends the term to the ``nameusage/search`` endpoint and reads the
    ``classification`` list of the first result, returning the name of the
    entry whose rank is ``"class"``.

    Args:
        term (str): Scientific name (species or genus) to search for.
        timeout (float): Seconds to wait for the server before giving up, so
            one stalled connection cannot hang a long batch run.

    Returns:
        str or None: The class name. None when there is no result, the first
        result has no class-rank entry, or the request failed (the error is
        printed, not propagated).
    """
    url = "https://api.catalogueoflife.org/nameusage/search"
    try:
        # `params` lets requests URL-encode the term (spaces, '&', '#', ...).
        response = requests.get(url, params={"q": term}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if data.get("total", 0) > 0 and data.get("result"):
            # Only the first (best) hit is inspected.
            lineage = data["result"][0].get("classification") or []
            class_entry = next((r for r in lineage if r.get("rank") == "class"), None)
            return class_entry["name"] if class_entry else None
    except Exception as e:
        # Best-effort lookup: report and let the caller fall back to another DB.
        print(f"❌ COL lookup error for {term}: {e}")
        return None
    return None

# Queries the WoRMS database
def worms_query(term, timeout=30):
    """Look up the taxonomic class of ``term`` via the WoRMS REST API.

    Calls ``AphiaRecordsByName`` with exact matching (``like=false``) and
    without restricting to marine taxa (``marine_only=false``), then returns
    the ``class`` field of the first record.

    Args:
        term (str): Scientific name (species or genus) to search for.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str or None: The class name. None when nothing matched, the first
        record has no class, or the request failed (the error is printed,
        not propagated).
    """
    from urllib.parse import quote  # stdlib; only needed for the path segment

    try:
        # The name is part of the URL path, so it must be percent-encoded
        # (including '/'), otherwise unusual names produce a malformed URL.
        url = (
            "https://www.marinespecies.org/rest/AphiaRecordsByName/"
            + quote(term, safe="")
        )
        response = requests.get(
            url,
            params={"like": "false", "marine_only": "false"},
            timeout=timeout,
        )
        response.raise_for_status()
        # A name with no match is answered with an empty body (HTTP 204);
        # calling .json() on it would raise and be misreported as an error.
        if response.status_code == 204 or not response.content:
            return None
        data = response.json()
        if data:
            return data[0].get('class')
    except Exception as e:
        # Best-effort lookup: report and let the caller fall back.
        print(f"❌ WoRMS lookup error for {term}: {e}")
        return None
    return None

# Queries the GBIF Database
def gbif_query(term, timeout=30):
    """Look up the taxonomic class of ``term`` via the GBIF species-match API.

    Sends the name to ``/v1/species/match`` and returns the ``class`` field
    of the matched taxon, if the response carries one.

    Args:
        term (str): Scientific name (species or genus) to search for.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str or None: The class name. None when the response has no class or
        the request failed (the error is printed, not propagated).
    """
    url = "https://api.gbif.org/v1/species/match"
    try:
        # `params` lets requests URL-encode the term (spaces, '&', '#', ...).
        response = requests.get(url, params={"name": term}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if data.get("class"):
            return data["class"]
    except Exception as e:
        # Best-effort lookup: report and let the caller fall back to another DB.
        print(f"❌ GBIF lookup error for {term}: {e}")
        return None
    return None

In [12]:
# Searches for the species in the specified DB, first by species and then by genus
def search_term(scientific_name, db_type):
    """
    Search for the taxonomic class of a species in a specified database.

    The full name is tried first; if that yields nothing and the name has
    more than one word, the first word (the genus) is tried as a fallback.

    Args:
        scientific_name (str): The full species name to search.
        db_type (str): One of 'NCBI', 'COL', 'WORMS', 'GBIF' (case-insensitive).

    Returns:
        str or None: The class name if found, else None. A blank name
        returns None without querying any database.

    Raises:
        ValueError: If db_type is not one of the supported databases.
    """
    # One lookup table instead of two parallel if/elif chains.
    query_by_db = {
        'NCBI': ncbi_query,
        'COL': col_query,
        'GBIF': gbif_query,
        'WORMS': worms_query,
    }
    query = query_by_db.get(db_type.upper())
    if query is None:
        raise ValueError("db_type must be one of 'NCBI', 'COL', 'WORMS', or 'GBIF'.")

    # A blank name has no genus to fall back to and is not worth a remote call.
    name_parts = scientific_name.split()
    if not name_parts:
        return None

    # 🔍 First try full species name
    result = query(scientific_name)
    if result:
        return result

    # 🔄 Fallback: try genus only
    genus = name_parts[0]
    if genus != scientific_name:
        print(f"🔄 Fallback to genus: {genus}")
        return query(genus)

    return None

In [16]:
# Split the full species table into fixed-size CSV batches
batch_size = 10000
num_batches = math.ceil(len(df_all) / batch_size)

# Walk the table in steps of batch_size and write each slice to its own numbered file
for batch_number, start_row in enumerate(range(0, len(df_all), batch_size), start=1):
    batch_file = f"species_batch_{batch_number:03d}.csv"
    stop_row = start_row + batch_size
    batch_df = df_all.iloc[start_row:stop_row]
    batch_df.to_csv(batch_file, index=False)
    print(f"Saved {batch_file}")

Saved species_batch_001.csv
Saved species_batch_002.csv
Saved species_batch_003.csv
Saved species_batch_004.csv
Saved species_batch_005.csv


In [19]:
# 1: Specify which batch file we are currently working with, load, and cache
batch_filename = "species_batch_001.csv"

# Cache files (named after the batch, so each batch keeps its own cache)
taxid_cache_file = batch_filename.replace(".csv", "_taxid_cache.csv")
class_cache_file = batch_filename.replace(".csv", "_class_cache.csv")

# Load batch CSV
df_batch = pd.read_csv(batch_filename)
species_names = df_batch["GENUS_SPECIES"].dropna().unique()
print(f"✅ Loaded {len(species_names)} species from {batch_filename}")

# 🔄 Load class cache if it exists
# os.path.exists is the documented check; pd.io.common.file_exists is a
# pandas-internal helper that is not part of the public API.
if os.path.exists(class_cache_file):
    cached_df = pd.read_csv(class_cache_file)
    tax_class_dict = dict(zip(cached_df["GENUS_SPECIES"], cached_df["taxonomic_class"]))
    print(f"🔄 Loaded {len(tax_class_dict)} classes from cache.")
else:
    tax_class_dict = {}

# Only the keys are checked: a species cached with an empty class still
# counts as processed and is not looked up again.
to_process = [s for s in species_names if s not in tax_class_dict]
print(f"🔎 {len(to_process)} species left to process.")

✅ Loaded 10000 species from species_batch_001.csv
🔄 Loaded 95 classes from cache.
🔎 9905 species left to process.


In [21]:
# Search Loop of all species in this CSV batch through databases

CACHE_COLUMNS = ["GENUS_SPECIES", "taxonomic_class"]
DB_ORDER = ("NCBI", "COL", "GBIF", "WORMS")  # first database with a class wins


def _record_class(species_name, class_name):
    """Store one result in tax_class_dict and append it to the on-disk cache."""
    tax_class_dict[species_name] = class_name
    # Append a single row instead of rewriting the whole cache every time
    # (the full rewrite made the run quadratic in the number of species).
    pd.DataFrame([[species_name, class_name]], columns=CACHE_COLUMNS).to_csv(
        class_cache_file,
        mode="a",
        index=False,
        header=not os.path.exists(class_cache_file),  # header only for a new file
    )


# Skip anything already resolved, so this cell can simply be re-run after an
# interruption without repeating lookups that are already in tax_class_dict.
pending = [s for s in to_process if s not in tax_class_dict]

# Loop through species
for species_name in tqdm(pending, desc="Fetching Classes (all DBs)"):
    # 🚫 Skip morphospecies
    if "morphospecies" in species_name.lower():
        _record_class(species_name, None)
        print(f"🚫 {species_name}: Skipped (morphospecies)")
        continue

    # ✅ NCBI first, then 🟡 fallbacks in DB_ORDER until one returns a class
    class_name = None
    for db_type in DB_ORDER:
        class_name = search_term(species_name, db_type=db_type)
        if class_name:
            break

    # 💾 Save after each species (for safety)
    _record_class(species_name, class_name)
    print(f"✅ {species_name}: {class_name}")

    time.sleep(0.1)  # short pause between species to go easy on the remote APIs

# Save results
# ✅ Rewrite the cache once from the in-memory results: this guarantees the
# file exists (even if nothing was pending) and contains no duplicate rows.
cached_df = pd.DataFrame(list(tax_class_dict.items()), columns=CACHE_COLUMNS)
cached_df.to_csv(class_cache_file, index=False)

# ✅ Merge with the batch data
final_df = df_batch.merge(cached_df, on="GENUS_SPECIES", how="left")

# ✅ Save the final batch result
output_filename = batch_filename.replace(".csv", "_with_class.csv")
final_df.to_csv(output_filename, index=False)
print(f"✅ Final output saved as {output_filename}")

Fetching Classes (all DBs):   0%|          | 1/9905 [00:00<1:11:15,  2.32it/s]

✅ Arnica cordifolia: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 2/9905 [00:00<1:05:11,  2.53it/s]

✅ Achillea millefolium: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 3/9905 [00:01<1:13:46,  2.24it/s]

✅ Cirsium undulatum: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 4/9905 [00:01<1:10:38,  2.34it/s]

✅ Elymus trachycaulus: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 5/9905 [00:02<1:13:22,  2.25it/s]

✅ Symphyotrichum ascendens: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 6/9905 [00:02<1:11:17,  2.31it/s]

✅ Ionactis alpina: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 7/9905 [00:02<1:08:53,  2.39it/s]

✅ Erigeron filifolius: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 8/9905 [00:03<1:07:07,  2.46it/s]

✅ Astragalus miser: Magnoliopsida
🔄 Fallback to genus: Erigeron


Fetching Classes (all DBs):   0%|          | 9/9905 [00:03<1:10:59,  2.32it/s]

✅ Erigeron concinnus: Magnoliopsida
🚫 Morphospecies moss: Skipped (morphospecies)
🚫 Morphospecies lichen: Skipped (morphospecies)


Fetching Classes (all DBs):   0%|          | 12/9905 [00:04<43:41,  3.77it/s] 

✅ Antennaria rosea: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 13/9905 [00:04<48:34,  3.39it/s]

✅ Calochortus nuttallii: Magnoliopsida


Fetching Classes (all DBs):   0%|          | 14/9905 [00:05<51:48,  3.18it/s]

✅ Guinardia delicatula: Coscinodiscophyceae


Fetching Classes (all DBs):   0%|          | 15/9905 [00:05<54:56,  3.00it/s]

✅ Guinardia flaccida: Coscinodiscophyceae


Fetching Classes (all DBs):   0%|          | 16/9905 [00:05<1:00:15,  2.74it/s]

✅ Guinardia striata: Coscinodiscophyceae


Fetching Classes (all DBs):   0%|          | 17/9905 [00:09<3:21:19,  1.22s/it]

✅ Mesoporos perforatus: Thecostraca


Fetching Classes (all DBs):   0%|          | 18/9905 [00:09<2:41:58,  1.02it/s]

✅ Meuniera membranacea: Bacillariophyceae
🔄 Fallback to genus: Navicula


Fetching Classes (all DBs):   0%|          | 19/9905 [00:11<3:04:45,  1.12s/it]

✅ Navicula distans: Bacillariophyceae


Fetching Classes (all DBs):   0%|          | 20/9905 [00:11<2:28:37,  1.11it/s]

✅ Nitzschia closterium: Bacillariophyceae


Fetching Classes (all DBs):   0%|          | 21/9905 [00:11<1:33:49,  1.76it/s]

✅ Nitzschia sigmoidea: Bacillariophyceae





KeyboardInterrupt: 

In [None]:
# COMBINE ALL BATCHED CSV RESULTS - only run at the end, once every batch has been processed
# ✅ Match all batch output files
batch_files = sorted(glob.glob("species_batch_*_with_class.csv"))

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail early with a message that says what is actually missing.
if not batch_files:
    raise FileNotFoundError(
        "No files matching 'species_batch_*_with_class.csv' were found; "
        "process at least one batch before combining."
    )
print(f"🔎 Combining {len(batch_files)} batch files")

# ✅ Combine them into one DataFrame
combined_df = pd.concat([pd.read_csv(f) for f in batch_files], ignore_index=True)

# ✅ Save the final combined result
combined_df.to_csv("all_species_with_class.csv", index=False)
print("✅ Combined file saved as all_species_with_class.csv")