# Genus/Species to Class Converter
This notebook converts the genus/species listing from the BioTIME-DB to taxonomic class, querying the NCBI (via Biopython), COL, WoRMS, and GBIF databases, in that order.

# Import necessary packages and get CSV file from GitHub

In [61]:
# Import necessary packages and set NCBI API information.
!pip install biopython tqdm

import glob
import math
import os
import shutil
import time
from getpass import getpass
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from Bio import Entrez
from tqdm import tqdm

! wget https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv -O myfile.csv
df_all = pd.read_csv("myfile.csv")

Entrez.email = "emduggan@mit.edu"
Entrez.api_key = "2e5155aba559345711a3af676cb6c6703608"

--2025-05-08 02:40:20--  https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1185895 (1.1M) [text/plain]
Saving to: ‘myfile.csv’


2025-05-08 02:40:21 (15.8 MB/s) - ‘myfile.csv’ saved [1185895/1185895]



# Creating local clone of GitHub repository for file storage

In [133]:
# if needed to clear existing clone
!rm -rf cs3-biodiversity

# get access code and clone repo
os.environ["GITHUB_TOKEN"] = getpass("🔐 Enter your GitHub token: ")
token = os.environ["GITHUB_TOKEN"]
repo_url = f"https://emd-aquila:{token}@github.com/emd-aquila/cs3-biodiversity.git"

!git clone {repo_url}

🔐 Enter your GitHub token: ··········
Cloning into 'cs3-biodiversity'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 41 (delta 19), reused 6 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (41/41), 449.20 KiB | 3.77 MiB/s, done.
Resolving deltas: 100% (19/19), done.


# Defining functions to query database APIs and search for species

In [92]:
# Uses an email and API key to query the NCBI database
def ncbi_query(term):
    """
    Look up the taxonomic class of a name in the NCBI taxonomy database.

    Uses the module-level ``Entrez`` configuration. The first hit returned by
    ``esearch`` is fetched, and its ``LineageEx`` is scanned for the entry
    whose rank is ``class``.

    Args:
        term (str): Scientific name (species or genus) to search for.

    Returns:
        str or None: The class name, or None when there is no hit, no
        class-rank ancestor, or the lookup failed (failures are printed).
    """
    try:
        search = Entrez.esearch(db="taxonomy", term=term, retmode="xml")
        try:
            result = Entrez.read(search)
        finally:
            # Release the underlying connection even if parsing fails.
            search.close()

        id_list = result["IdList"]
        if not id_list:
            return None

        fetch = Entrez.efetch(db="taxonomy", id=id_list[0], retmode="xml")
        try:
            records = Entrez.read(fetch)
        finally:
            fetch.close()

        if not records:
            return None

        lineage = records[0].get("LineageEx", [])
        class_entry = next((r for r in lineage if r.get("Rank") == "class"), None)
        return class_entry["ScientificName"] if class_entry else None
    except Exception as e:
        # Best-effort lookup: report and let the caller try another database.
        print(f"❌ NCBI lookup error for {term}: {e}")
        return None

# Queries the Catalogue of Life (COL) database
def col_query(term, timeout=30):
    """
    Look up the taxonomic class of a name via the Catalogue of Life API.

    The first search result's ``classification`` list is scanned for the
    entry whose rank is ``class``.

    Args:
        term (str): Scientific name (species or genus) to search for.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str or None: The class name, or None when nothing matched, no
        class-rank entry exists, or the lookup failed (failures are printed).
    """
    url = "https://api.catalogueoflife.org/nameusage/search"
    try:
        # Passing the term via `params` lets requests escape spaces, '&', '#', ...
        response = requests.get(url, params={"q": term}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if data['total'] > 0:
            result = data['result'][0]
            lineage = result.get('classification') or []
            class_entry = next((r for r in lineage if r.get('rank') == 'class'), None)
            return class_entry['name'] if class_entry else None
    except Exception as e:
        # Best-effort lookup: report and let the caller try another database.
        print(f"❌ COL lookup error for {term}: {e}")
        return None
    return None

# Queries the WoRMS database
def worms_query(term, timeout=30):
    """
    Look up the taxonomic class of a name via the WoRMS REST API.

    Calls ``AphiaRecordsByName`` with exact matching (``like=false``) and
    without restricting to marine taxa (``marine_only=false``), then reads the
    ``class`` field of the first record.

    Args:
        term (str): Scientific name (species or genus) to search for.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str or None: The class name, or None when nothing matched or the
        lookup failed (failures are printed).
    """
    try:
        # The name is part of the URL path, so it must be percent-encoded.
        url = f"http://www.marinespecies.org/rest/AphiaRecordsByName/{quote(term, safe='')}"
        response = requests.get(
            url,
            params={"like": "false", "marine_only": "false"},
            timeout=timeout,
        )
        response.raise_for_status()
        # An empty body (HTTP 204) means "no match"; it is not an error and
        # must not be passed to .json(), which would raise on it.
        if response.status_code == 204 or not response.content:
            return None
        data = response.json()
        if data:
            return data[0].get('class')
    except Exception as e:
        # Best-effort lookup: report and let the caller try another database.
        print(f"❌ WoRMS lookup error for {term}: {e}")
        return None
    return None

# Queries the GBIF Database
def gbif_query(term, timeout=30):
    """
    Look up the taxonomic class of a name via the GBIF species-match API.

    Args:
        term (str): Scientific name (species or genus) to match.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str or None: The value of the ``class`` field of the match, or None
        when it is absent/empty or the lookup failed (failures are printed).
    """
    url = "https://api.gbif.org/v1/species/match"
    try:
        # Passing the name via `params` lets requests escape spaces, '&', '#', ...
        response = requests.get(url, params={"name": term}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if data.get("class"):
            return data["class"]
    except Exception as e:
        # Best-effort lookup: report and let the caller try another database.
        print(f"❌ GBIF lookup error for {term}: {e}")
        return None
    return None

In [84]:
# Searches for the species in the specified DB, first by species and then by genus
def search_term(scientific_name, db_type):
    """
    Search for the taxonomic class of a species in a specified database.

    The full name is tried first; if that yields nothing and the name has
    more than one word, the first word (the genus) is tried as a fallback.

    Args:
        scientific_name (str): The full species name to search.
        db_type (str): One of 'NCBI', 'COL', 'WORMS', 'GBIF' (case-insensitive).

    Returns:
        str or None: The class name if found, else None. An empty or
        whitespace-only name returns None without querying anything.

    Raises:
        ValueError: If ``db_type`` is not one of the supported databases.
    """
    # One lookup table instead of two parallel if/elif chains.
    query_functions = {
        'NCBI': ncbi_query,
        'COL': col_query,
        'GBIF': gbif_query,
        'WORMS': worms_query,
    }
    query = query_functions.get(db_type.upper())
    if query is None:
        raise ValueError("db_type must be one of 'NCBI', 'COL', 'WORMS', or 'GBIF'.")

    name_parts = scientific_name.split()
    if not name_parts:
        # Nothing to search for; avoids an IndexError when extracting the genus.
        return None

    # 🔍 First try full species name
    result = query(scientific_name)
    if result:
        return result

    # 🔄 Fallback: try genus only
    genus = name_parts[0]
    if genus != scientific_name:
        print(f"🔄 Fallback to genus: {genus}")
        return query(genus)

    return None

In [145]:
def setup_ssh_and_push(output_filename,
                                 repo_name="cs3-biodiversity",
                                 subfolder="biotimes-with-class-label",
                                 github_username="emd-aquila",
                                 github_email="emduggan@mit.edu",
                                 github_name="Eli Duggan"):
    """
    Set up SSH (if needed) and push a batch output file to a subfolder in your GitHub repo using SSH.

    Args:
        output_filename (str): The CSV filename to push (already saved locally).
        repo_name (str): Local folder name of the GitHub repo.
        subfolder (str): Subfolder inside the repo to store the file.
        github_username (str): Your GitHub username.
        github_email (str): Your GitHub email.
        github_name (str): Your full name for git config.
    """

    # ✅ Add GitHub to known_hosts to prevent verification errors
    print("🔑 Adding GitHub.com to known_hosts (if not already added)...")
    !mkdir -p ~/.ssh
    !ssh-keyscan github.com >> ~/.ssh/known_hosts

    # ✅ Check if repo folder exists, clone if needed
    if not os.path.exists(repo_name):
        print(f"📥 Repo folder '{repo_name}' not found. Cloning with SSH...")
        !git clone git@github.com:{github_username}/{repo_name}.git
    else:
        print(f"✅ Repo folder '{repo_name}' already exists.")

    # ✅ Create the subfolder inside the repo if it doesn't exist
    subfolder_path = f"{repo_name}/{subfolder}"
    os.makedirs(subfolder_path, exist_ok=True)

    # ✅ Copy the batch result into the subfolder
    shutil.copy(output_filename, f"{subfolder_path}/{output_filename}")
    # print(f"✅ Copied {output_filename} into {subfolder_path}/")

    # ✅ Push to GitHub
    %cd {repo_name}

    # Set Git config (only needed once)
    !git config user.email "{github_email}"
    !git config user.name "{github_name}"

    # Set remote to SSH (just in case)
    !git remote set-url origin git@github.com:{github_username}/{repo_name}.git

    # Add, commit, push
    !git add {subfolder}/{output_filename}
    !git commit -m "Add batch output {output_filename} to {subfolder}/" || echo "No changes to commit."
    !git push origin main

    # ✅ Return to root directory
    %cd ..


In [152]:
def push_combined_csv_to_github(combined_filename="all_species_with_class.csv",
                                repo_name="cs3-biodiversity",
                                subfolder="biotimes-with-class-label",
                                github_username="emd-aquila",
                                github_email="emduggan@mit.edu",
                                github_name="Eli Duggan"):
    """
    Push the combined CSV file to a subfolder in your GitHub repo using SSH.
    """
    # ✅ Build the full subfolder path
    subfolder_path = f"{repo_name}/{subfolder}"

    # ✅ Make sure the subfolder exists inside the repo
    os.makedirs(subfolder_path, exist_ok=True)

    # ✅ Move the combined CSV into the subfolder
    dest_path = f"{subfolder_path}/{combined_filename}"
    if not os.path.exists(dest_path):
        shutil.move(combined_filename, dest_path)
        print(f"✅ Moved {combined_filename} into {subfolder_path}/")
    else:
        print(f"✅ {combined_filename} already exists in {subfolder_path}/")

    # ✅ Push to GitHub
    %cd {repo_name}

    !git config user.email "{github_email}"
    !git config user.name "{github_name}"
    !git remote set-url origin git@github.com:{github_username}/{repo_name}.git

    !git add {subfolder}/{combined_filename}
    !git commit -m "Add combined species class CSV to {subfolder}/" || echo "No changes to commit."
    !git push origin main

    %cd ..


# Batching Data and Loading the Cache

## Batching data and saving


In [148]:
# Number of species rows written to each batch file
batch_size = 15
num_batches = math.ceil(len(df_all) / batch_size)

# Walk the frame in strides of `batch_size` and write each slice to its own
# CSV, numbered from 001.
for i, start in enumerate(range(0, len(df_all), batch_size)):
    batch_file = "species_batch_{:03d}.csv".format(i + 1)
    batch_df = df_all.iloc[start:start + batch_size]
    batch_df.to_csv(batch_file, index=False)

## Loading batch files and caching files

In [149]:
# Batch to process in this run
batch_filename = "species_batch_001.csv"

# Cache file names derived from the batch file name
# NOTE(review): taxid_cache_file is not used by the cells shown in this notebook.
taxid_cache_file = batch_filename.replace(".csv", "_taxid_cache.csv")
class_cache_file = batch_filename.replace(".csv", "_class_cache.csv")

# Load batch CSV
df_batch = pd.read_csv(batch_filename)
species_names = df_batch["GENUS_SPECIES"].dropna().unique()
print(f"✅ Loaded {len(species_names)} species from {batch_filename}")

# 🔄 Load class cache if it exists. os.path.exists is used here, as in the
# clear-cache cell, rather than the internal pandas helper.
if os.path.exists(class_cache_file):
    cached_df = pd.read_csv(class_cache_file)
    tax_class_dict = dict(zip(cached_df["GENUS_SPECIES"], cached_df["taxonomic_class"]))
    print(f"🔄 Loaded {len(tax_class_dict)} classes from cache.")
else:
    tax_class_dict = {}

# Only species without a cache entry still need to be looked up
to_process = [s for s in species_names if s not in tax_class_dict]
print(f"🔎 {len(to_process)} species left to process.")

✅ Loaded 15 species from species_batch_001.csv
🔎 15 species left to process.


## Clear cache (if needed)

In [147]:
class_cache_file = batch_filename.replace(".csv", "_class_cache.csv")

# Remove the class cache so the batch is looked up again from scratch
cache_present = os.path.exists(class_cache_file)
if not cache_present:
    print(f"⚠️ No cache file found for {class_cache_file}")
else:
    os.remove(class_cache_file)
    print(f"🗑️ Deleted {class_cache_file}")

🗑️ Deleted species_batch_001_class_cache.csv


# Batch Species Search Loop

In [150]:
# Search loop: look up every remaining species of this CSV batch in the databases

def _write_class_cache(class_by_species, path):
    """Write the species -> class mapping to `path` as a two-column CSV."""
    # Explicit columns keep the header even when the mapping is empty.
    pd.DataFrame(
        list(class_by_species.items()),
        columns=["GENUS_SPECIES", "taxonomic_class"],
    ).to_csv(path, index=False)


# Databases are tried in this order until one returns a class
lookup_order = ("NCBI", "COL", "WORMS", "GBIF")

skipped_morphospecies = 0
skipped_species_list = []

for species_name in tqdm(to_process, desc="Fetching Classes (all DBs)"):
    is_morphospecies = "morphospecies" in species_name.lower()

    if is_morphospecies:
        # 🚫 Skip morphospecies: record them with no class and no lookup
        class_name = None
        skipped_morphospecies += 1
        skipped_species_list.append(species_name)
    else:
        class_name = None
        for db_type in lookup_order:
            class_name = search_term(species_name, db_type=db_type)
            if class_name:
                break

    tax_class_dict[species_name] = class_name

    # 💾 Save cache after each species (skipped ones too), so an interrupted
    # run can resume where it stopped
    _write_class_cache(tax_class_dict, class_cache_file)

    if not is_morphospecies:
        # Short pause between species that actually hit the remote APIs
        time.sleep(0.1)


# End summary of skipped species
if skipped_species_list:
    skipped_filename = batch_filename.replace(".csv", "_skipped_morphospecies.csv")
    pd.DataFrame({"GENUS_SPECIES": skipped_species_list}).to_csv(skipped_filename, index=False)
    print(f"🗂️ Skipped species saved to {skipped_filename}")
else:
    print("✅ No morphospecies were skipped in this batch.")


# Saving our results
# Write the cache once more so the file exists even when nothing was left to
# process; otherwise the read below raises FileNotFoundError.
_write_class_cache(tax_class_dict, class_cache_file)
cached_df = pd.read_csv(class_cache_file)

# ✅ Save the final batch result
output_filename = batch_filename.replace(".csv", "_with_class.csv")
final_df = df_batch.merge(cached_df, on="GENUS_SPECIES", how="left")
final_df.to_csv(output_filename, index=False)
print(f"✅ Final output saved as {output_filename}")

# ✅ Push to GitHub with one simple call
setup_ssh_and_push(output_filename)

Fetching Classes (all DBs):  27%|██▋       | 4/15 [00:01<00:04,  2.45it/s]

🔄 Fallback to genus: Populus


Fetching Classes (all DBs): 100%|██████████| 15/15 [00:06<00:00,  2.15it/s]

✅ No morphospecies were skipped in this batch.
✅ Final output saved as species_batch_001_with_class.csv
🔑 Adding GitHub.com to known_hosts (if not already added)...





# github.com:22 SSH-2.0-50e45de6
# github.com:22 SSH-2.0-50e45de6
# github.com:22 SSH-2.0-50e45de6
# github.com:22 SSH-2.0-50e45de6
# github.com:22 SSH-2.0-50e45de6
✅ Repo folder 'cs3-biodiversity' already exists.
/content/cs3-biodiversity/cs3-biodiversity
[main abeecae] Add batch output species_batch_001_with_class.csv to biotimes-with-class-label/
 1 file changed, 5 insertions(+)
Enumerating objects: 7, done.
Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 516 bytes | 516.00 KiB/s, done.
Total 4 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To github.com:emd-aquila/cs3-biodiversity.git
   dfaca0f..abeecae  main -> main
/content/cs3-biodiversity


# Final Collation of Results

In [153]:
%cd cs3-biodiversity
!git pull origin main
%cd ..

batch_files = sorted(glob.glob("cs3-biodiversity/biotimes-with-class-label/species_batch_*_with_class.csv"))

combined_df = pd.concat([pd.read_csv(f) for f in batch_files], ignore_index=True)
combined_df.to_csv("all_species_with_class.csv", index=False)
print("✅ Combined file saved as all_species_with_class.csv")

push_combined_csv_to_github("all_species_with_class.csv")

/content/cs3-biodiversity/cs3-biodiversity
From github.com:emd-aquila/cs3-biodiversity
 * branch            main       -> FETCH_HEAD
Already up to date.
/content/cs3-biodiversity
✅ Combined file saved as all_species_with_class.csv
✅ Moved all_species_with_class.csv into cs3-biodiversity/biotimes-with-class-label/
/content/cs3-biodiversity/cs3-biodiversity
[main 72bef5f] Add combined species class CSV to biotimes-with-class-label/
 1 file changed, 16 insertions(+)
 create mode 100644 biotimes-with-class-label/all_species_with_class.csv
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 362 bytes | 362.00 KiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To github.com:emd-aquila/cs3-biodiversity.git
   a08521f..72bef5f  main -> main
/content/cs3-biodiversity
