# Genus/Species to Class Converter
This notebook converts the genus/species listing from the BioTIME database to taxonomic class. Each name is looked up in the NCBI (via Biopython), Catalogue of Life (COL), WoRMS, and GBIF databases, in that order; the first database that returns a class wins.

# Pre-Loop Package and Key Setup

In [2]:
# Import necessary packages and set NCBI API information.
!pip install biopython tqdm

import pandas as pd
import requests
import numpy as np
from Bio import Entrez
import time
import math
from tqdm import tqdm
import os
import glob
import shutil
from getpass import getpass

! wget https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv -O myfile.csv
df_all = pd.read_csv("myfile.csv")

Entrez.email = "emduggan@mit.edu"
Entrez.api_key = "2e5155aba559345711a3af676cb6c6703608"

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
--2025-05-09 11:07:07--  https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1185895 (1.1M) [text/plain]
Saving to: ‘myfile.csv’


2025-05-09 11:07:07 (17.8 MB/s) - ‘myfile.csv’ saved [1185895/1185895]



## Access the private SSH key to upload to GitHub

In [3]:
# Upload a private SSH key through the Colab file picker and install it under
# ~/.ssh so that later git commands can authenticate to GitHub over SSH.
from google.colab import files
uploaded = files.upload()  # Select the private key file `id_ed25519` (on the local machine it lives under Users/<user>/.ssh)
# Move the uploaded key from the working directory into ~/.ssh
!mkdir -p ~/.ssh
!mv id_ed25519 ~/.ssh/
# Append GitHub's host keys to known_hosts (appends again on every run)
!ssh-keyscan github.com >> ~/.ssh/known_hosts
# Restrict the key file to owner read/write
!chmod 600 ~/.ssh/id_ed25519
# Load the key into a throwaway ssh-agent and test the connection to GitHub
!ssh-agent bash -c 'ssh-add ~/.ssh/id_ed25519; ssh -T git@github.com'

Saving id_ed25519 to id_ed25519
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
Identity added: /root/.ssh/id_ed25519 (emduggan@mit.edu)
Hi emd-aquila! You've successfully authenticated, but GitHub does not provide shell access.


## Create local clone of GitHub repository for file storage

In [4]:
# if needed to clear existing clone
# WARNING: this deletes the local clone, including any files in it that were not pushed.
!rm -rf cs3-biodiversity

# get access code and clone repo
# The token is typed with hidden input and kept in the GITHUB_TOKEN environment variable.
os.environ["GITHUB_TOKEN"] = getpass("🔐 Enter your GitHub token: ")
token = os.environ["GITHUB_TOKEN"]
# NOTE(review): the token is embedded in the clone URL, so it is presumably stored
# in the clone's remote configuration until the remote is switched to SSH — confirm.
repo_url = f"https://emd-aquila:{token}@github.com/emd-aquila/cs3-biodiversity.git"

!git clone {repo_url}

🔐 Enter your GitHub token: ··········
Cloning into 'cs3-biodiversity'...
remote: Enumerating objects: 148, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 148 (delta 77), reused 56 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (148/148), 929.58 KiB | 10.33 MiB/s, done.
Resolving deltas: 100% (77/77), done.


# Defining functions to query database APIs and search for species

In [5]:
# Uses an email and API key to query the NCBI database
def ncbi_query(term):
    """
    Look up the taxonomic class of a name in the NCBI Taxonomy database.

    Runs an Entrez ``esearch`` for ``term``, fetches the first matching taxon
    and returns the ``ScientificName`` of the lineage entry whose rank is
    "class".

    Args:
        term (str): Scientific name (species or genus) to search for.

    Returns:
        str or None: The class name, or None when nothing matches, the lineage
        contains no class rank, or the lookup fails for any reason.
    """
    try:
        search = Entrez.esearch(db="taxonomy", term=term, retmode="xml")
        try:
            result = Entrez.read(search)
        finally:
            # Release the HTTP connection even if parsing fails.
            search.close()

        id_list = result["IdList"]
        if not id_list:
            return None

        fetch = Entrez.efetch(db="taxonomy", id=id_list[0], retmode="xml")
        try:
            records = Entrez.read(fetch)
        finally:
            fetch.close()

        if not records:
            return None
        lineage = records[0].get("LineageEx", [])
        class_entry = next((r for r in lineage if r.get("Rank") == "class"), None)
        return class_entry["ScientificName"] if class_entry else None
    except Exception:
        # Deliberate best-effort: a failed lookup must not abort a long batch run;
        # the caller falls back to the next database.
        return None

# Queries the Catalogue of Life (COL) database
def col_query(term, timeout=30):
    """
    Look up the taxonomic class of a name through the Catalogue of Life API.

    Uses the first result of the ``nameusage/search`` endpoint and returns the
    name of the entry with rank "class" in its classification.

    Args:
        term (str): Scientific name (species or genus) to search for.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the batch loop.

    Returns:
        str or None: The class name, or None when nothing matches, the result
        has no class rank, or the request fails for any reason.
    """
    url = "https://api.catalogueoflife.org/nameusage/search"
    try:
        # Passing the term through `params` lets requests URL-encode it, so names
        # containing spaces, '&', '#', etc. cannot corrupt the query string.
        response = requests.get(url, params={"q": term}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        results = data.get("result") or []
        if data.get("total", 0) > 0 and results:
            lineage = results[0].get("classification") or []
            class_entry = next((r for r in lineage if r.get("rank") == "class"), None)
            return class_entry["name"] if class_entry else None
    except Exception:
        # Deliberate best-effort: the caller falls back to the next database.
        return None
    return None

# Queries the WoRMS database
def worms_query(term, timeout=30):
    """
    Look up the taxonomic class of a name through the WoRMS REST API.

    Calls ``AphiaRecordsByName`` with exact matching (``like=false``) and
    without restricting to marine taxa (``marine_only=false``), and returns the
    ``class`` field of the first record.

    Args:
        term (str): Scientific name (species or genus) to search for.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the batch loop.

    Returns:
        str or None: The class name, or None when nothing matches, the record
        has no class, or the request fails for any reason.
    """
    from urllib.parse import quote

    try:
        # The name is part of the URL path, so it must be percent-encoded:
        # characters such as '/', '?' or '#' would otherwise change the path.
        url = f"https://www.marinespecies.org/rest/AphiaRecordsByName/{quote(term, safe='')}"
        response = requests.get(
            url,
            params={"like": "false", "marine_only": "false"},
            timeout=timeout,
        )
        # A 204 response has no body to parse; treat it as "no match".
        if response.status_code == 204:
            return None
        response.raise_for_status()
        data = response.json()
        if data:
            return data[0].get('class')
    except Exception:
        # Deliberate best-effort: the caller falls back to the next database.
        return None
    return None

# Queries the GBIF Database
def gbif_query(term, timeout=30):
    """
    Look up the taxonomic class of a name through the GBIF species-match API.

    Args:
        term (str): Scientific name (species or genus) to match.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the batch loop.

    Returns:
        str or None: The ``class`` field of the match, or None when the
        response has no class or the request fails for any reason.
    """
    url = "https://api.gbif.org/v1/species/match"
    try:
        # Passing the name through `params` lets requests URL-encode it, so names
        # containing spaces, '&', '#', etc. cannot corrupt the query string.
        response = requests.get(url, params={"name": term}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if data.get("class"):
            return data["class"]
    except Exception:
        # Deliberate best-effort: a failed lookup simply counts as "not found".
        return None
    return None

In [6]:
# Searches for the species in the specified DB, first by species and then by genus
def search_term(scientific_name, db_type):
    """
    Search for the taxonomic class of a species in a specified database.

    The full name is tried first; if that gives no class and the name has more
    than one word, the first word (the genus) is tried on its own.

    Args:
        scientific_name (str): The full species name to search.
        db_type (str): One of 'NCBI', 'COL', 'WORMS', 'GBIF' (case-insensitive).

    Returns:
        str or None: The class name if found, else None. A blank name returns
        None without querying anything.

    Raises:
        ValueError: If db_type is not one of the supported databases.
    """
    # One lookup table instead of two parallel if/elif chains.
    query_functions = {
        'NCBI': ncbi_query,
        'COL': col_query,
        'GBIF': gbif_query,
        'WORMS': worms_query,
    }
    query = query_functions.get(db_type.upper())
    if query is None:
        raise ValueError("db_type must be one of 'NCBI', 'COL', 'WORMS', or 'GBIF'.")

    # A blank name has no genus to fall back to (split()[0] would raise
    # IndexError) and is not worth a network request.
    name_parts = scientific_name.split()
    if not name_parts:
        return None

    # 🔍 First try full species name
    result = query(scientific_name)
    if result:
        return result

    # 🔄 Fallback: try genus only (skipped when the name already is a single word)
    genus = name_parts[0]
    if genus != scientific_name:
        return query(genus)
    return None

In [7]:
def setup_ssh_and_push(output_filename,
                                 repo_name="cs3-biodiversity",
                                 subfolder="biotimes-with-class-label",
                                 github_username="emd-aquila",
                                 github_email="emduggan@mit.edu",
                                 github_name="Eli Duggan"):
    """
    Copy a locally saved file into a subfolder of the repo clone and push it to GitHub over SSH.

    Steps, in order: append GitHub's host keys to ~/.ssh/known_hosts, clone the
    repo over SSH if the local folder is missing, copy the file into
    ``<repo_name>/<subfolder>/``, then (inside the repo folder) set the git
    identity, point ``origin`` at the SSH URL, pull ``main``, add, commit and
    push ``main``.

    Relies on IPython ``!`` / ``%cd`` magics, so it only works inside a notebook.
    It changes the working directory into the repo and back with ``%cd ..``.

    Args:
        output_filename (str): The CSV filename to push (already saved locally).
            It is reused as the destination name inside the subfolder.
        repo_name (str): Local folder name of the GitHub repo.
        subfolder (str): Subfolder inside the repo to store the file.
        github_username (str): Your GitHub username.
        github_email (str): Your GitHub email.
        github_name (str): Your full name for git config.

    Returns:
        None
    """

    # ✅ Add GitHub to known_hosts to prevent verification errors
    # NOTE: `>>` appends on every call, regardless of what the message below says.
    print("🔑 Adding GitHub.com to known_hosts (if not already added)...")
    !mkdir -p ~/.ssh
    !ssh-keyscan github.com >> ~/.ssh/known_hosts

    # ✅ Check if repo folder exists, clone if needed
    if not os.path.exists(repo_name):
        print(f"📥 Repo folder '{repo_name}' not found. Cloning with SSH...")
        !git clone git@github.com:{github_username}/{repo_name}.git
    else:
        print(f"✅ Repo folder '{repo_name}' already exists.")

    # ✅ Create the subfolder inside the repo if it doesn't exist
    subfolder_path = f"{repo_name}/{subfolder}"
    os.makedirs(subfolder_path, exist_ok=True)

    # ✅ Copy the batch result into the subfolder (the local original is kept)
    shutil.copy(output_filename, f"{subfolder_path}/{output_filename}")
    # print(f"✅ Copied {output_filename} into {subfolder_path}/")

    # ✅ Push to GitHub: the git commands below run from inside the repo folder
    %cd {repo_name}

    # Set Git config (only needed once)
    !git config user.email "{github_email}"
    !git config user.name "{github_name}"

    # Set remote to SSH (just in case)
    !git remote set-url origin git@github.com:{github_username}/{repo_name}.git
    # Merge (not rebase) any remote changes before committing
    !git pull origin main --no-edit --rebase=false

    # Add, commit, push; the `|| echo` keeps going when there is nothing new to commit
    !git add {subfolder}/{output_filename}
    !git commit -m "Add batch output {output_filename} to {subfolder}/" || echo "No changes to commit."
    !git push origin main

    # ✅ Return to root directory
    # NOTE(review): assumes repo_name is a single folder directly under the starting directory — confirm
    %cd ..


In [8]:
def push_combined_csv_to_github(combined_filename="all_species_with_class.csv",
                                repo_name="cs3-biodiversity",
                                subfolder="biotimes-with-class-label",
                                github_username="emd-aquila",
                                github_email="emduggan@mit.edu",
                                github_name="Eli Duggan"):
    """
    Push the combined CSV file to a subfolder in your GitHub repo using SSH.

    The file is moved (not copied) into ``<repo_name>/<subfolder>/`` unless a
    file of that name is already there, then added, committed and pushed to
    ``main``. Relies on IPython ``!`` / ``%cd`` magics, so it only works inside
    a notebook; it changes the working directory into the repo and back with
    ``%cd ..``.

    Args:
        combined_filename (str): Name of the combined CSV in the working directory.
        repo_name (str): Local folder name of the GitHub repo (must already exist locally).
        subfolder (str): Subfolder inside the repo to store the file.
        github_username (str): GitHub username used to build the SSH remote URL.
        github_email (str): Email for git config.
        github_name (str): Full name for git config.

    Returns:
        None
    """
    # ✅ Build the full subfolder path
    subfolder_path = f"{repo_name}/{subfolder}"

    # ✅ Make sure the subfolder exists inside the repo
    os.makedirs(subfolder_path, exist_ok=True)

    # ✅ Move the combined CSV into the subfolder
    # NOTE(review): when the destination already exists the local file is NOT moved,
    # so a newer local combined CSV would not replace the one in the repo — confirm this is intended.
    dest_path = f"{subfolder_path}/{combined_filename}"
    if not os.path.exists(dest_path):
        shutil.move(combined_filename, dest_path)
        print(f"✅ Moved {combined_filename} into {subfolder_path}/")
    else:
        print(f"✅ {combined_filename} already exists in {subfolder_path}/")

    # ✅ Push to GitHub: the git commands below run from inside the repo folder
    %cd {repo_name}

    !git config user.email "{github_email}"
    !git config user.name "{github_name}"
    !git remote set-url origin git@github.com:{github_username}/{repo_name}.git
    # Merge (not rebase) any remote changes before committing
    !git pull origin main --no-edit --rebase=false

    # The `|| echo` keeps going when there is nothing new to commit
    !git add {subfolder}/{combined_filename}
    !git commit -m "Add combined species class CSV to {subfolder}/" || echo "No changes to commit."
    !git push origin main

    # Return to the directory the notebook started in
    %cd ..


# Batching Data and Loading the Cache

## Batching data and saving


In [9]:
# Split the full species table into fixed-size chunks and write each chunk to
# its own CSV (species_batch_001.csv, species_batch_002.csv, ...).
batch_size = 8000
num_batches = math.ceil(len(df_all) / batch_size)

for batch_number in range(1, num_batches + 1):
    first_row = (batch_number - 1) * batch_size
    last_row = first_row + batch_size  # exclusive; iloc clips it on the final batch
    batch_df = df_all.iloc[first_row:last_row]
    batch_file = f"species_batch_{batch_number:03d}.csv"
    batch_df.to_csv(batch_file, index=False)

## Set Batch of Interest

In [14]:
# Select the batch file that the cells below operate on.
# Batches marked DONE have already been processed; to work on another batch,
# comment out the active assignment and uncomment the one you need.
# batch_filename = "species_batch_001.csv" #DONE
# batch_filename = "species_batch_002.csv" #DONE
# batch_filename = "species_batch_003.csv" #DONE
# batch_filename = "species_batch_004.csv" #DONE
# batch_filename = "species_batch_005.csv" #DONE
batch_filename = "species_batch_006.csv" #TODO

## Clear cache (if needed)

In [15]:
# Optional reset: remove this batch's class cache so that every species in the
# batch is looked up again on the next run.
class_cache_file = batch_filename.replace(".csv", "_class_cache.csv")

if not os.path.exists(class_cache_file):
    print(f"⚠️ No cache file found for {class_cache_file}")
else:
    os.remove(class_cache_file)
    print(f"🗑️ Deleted {class_cache_file}")

⚠️ No cache file found for species_batch_006_class_cache.csv


## Loading batch files and caching files (run before loop)

In [16]:
# Cache files (named after the selected batch file)
# NOTE(review): taxid_cache_file is not used by any cell visible here — kept in case a later cell reads it.
taxid_cache_file = batch_filename.replace(".csv", "_taxid_cache.csv")
class_cache_file = batch_filename.replace(".csv", "_class_cache.csv")

# Load batch CSV; missing names are dropped and duplicates collapsed
df_batch = pd.read_csv(batch_filename)
species_names = df_batch["GENUS_SPECIES"].dropna().unique()
print(f"✅ Loaded {len(species_names)} species from {batch_filename}")

# 🔄 Load class cache if it exists.
# os.path.exists is used (as in the cache-clearing cell) instead of the
# internal pandas helper pd.io.common.file_exists.
if os.path.exists(class_cache_file):
    cached_df = pd.read_csv(class_cache_file)
    tax_class_dict = dict(zip(cached_df["GENUS_SPECIES"], cached_df["taxonomic_class"]))
    print(f"🔄 Loaded {len(tax_class_dict)} classes from cache.")
else:
    tax_class_dict = {}

# Only species without a cache entry are looked up; species cached as
# "not found" also have an entry and are therefore not retried.
to_process = [s for s in species_names if s not in tax_class_dict]
print(f"🔎 {len(to_process)} species left to process.")

✅ Loaded 1852 species from species_batch_006.csv
🔎 1852 species left to process.


# Batch Species Search Loop

In [17]:
"""Search Loop of all species in this CSV batch through databases"""

skipped_species_list = []
unclassified_species_list = []
classified_species_list = []


def _save_class_cache():
    """Write the whole species -> class mapping to the cache CSV.

    The column names are passed explicitly so that an empty mapping still
    produces a CSV with a header that can be read back.
    """
    pd.DataFrame(
        [{"GENUS_SPECIES": k, "taxonomic_class": v} for k, v in tax_class_dict.items()],
        columns=["GENUS_SPECIES", "taxonomic_class"],
    ).to_csv(class_cache_file, index=False)


for species_name in tqdm(to_process, desc="Fetching Classes (all DBs)"):
    # Morphospecies are not real taxa: record them without querying anything.
    if "morphospecies" in species_name.lower():
        tax_class_dict[species_name] = None
        unclassified_species_list.append({"GENUS_SPECIES": species_name, "Reason": "Morphospecies"})
        continue

    # ✅ First NCBI, then 🟡 fall back to COL, WoRMS and GBIF; stop at the first hit.
    class_name = None
    for db_type in ("NCBI", "COL", "WORMS", "GBIF"):
        class_name = search_term(species_name, db_type=db_type)
        if class_name:
            break

    tax_class_dict[species_name] = class_name

    # Save the unclassified species to a different list!
    if class_name:
        classified_species_list.append({"GENUS_SPECIES": species_name, "taxonomic_class": class_name})
    else:
        unclassified_species_list.append({"GENUS_SPECIES": species_name, "Reason": "Not found in any DB"})

    # 💾 Save cache after each species so an interrupted run can resume
    _save_class_cache()

    # Small pause between species to stay polite to the remote APIs
    time.sleep(0.1)

# Final flush: guarantees the cache file exists even when nothing was queried
# (empty or all-morphospecies run) and that trailing morphospecies are cached.
_save_class_cache()


# --- Adding previously cached species ---
# Load the full cache
full_cache_df = pd.read_csv(class_cache_file)

# Build a set of species that have already been included (from the current batch)
already_included_species = {
    entry["GENUS_SPECIES"]
    for entry in classified_species_list + unclassified_species_list
}

# Loop over all cached species
for species_name, class_name in zip(full_cache_df["GENUS_SPECIES"], full_cache_df["taxonomic_class"]):
    if species_name in already_included_species:
        continue  # Already added during this batch

    if pd.isna(class_name) or class_name in ('', 'nan'):
        # Keep the same reason a morphospecies got when it was first processed.
        if "morphospecies" in str(species_name).lower():
            reason = "Morphospecies"
        else:
            reason = "Not found in any DB"
        unclassified_species_list.append({"GENUS_SPECIES": species_name, "Reason": reason})
    else:
        classified_species_list.append({"GENUS_SPECIES": species_name, "taxonomic_class": class_name})


# --- Create CSVs of the unclassified and classified species and save to GitHub ---
# ✅ Save classified species CSV
if classified_species_list:
    classified_filename = batch_filename.replace(".csv", "_classified_species.csv")
    pd.DataFrame(classified_species_list).to_csv(classified_filename, index=False)
    print(f"✅ Classified species saved to {classified_filename}")
    setup_ssh_and_push(classified_filename)

# ✅ Save unclassified species CSV
if unclassified_species_list:
    unclassified_filename = batch_filename.replace(".csv", "_unclassified_species.csv")
    pd.DataFrame(unclassified_species_list).to_csv(unclassified_filename, index=False)
    print(f"⚠️ Unclassified species (including morphospecies) saved to {unclassified_filename}")
    setup_ssh_and_push(unclassified_filename)

Fetching Classes (all DBs): 100%|██████████| 1852/1852 [12:27<00:00,  2.48it/s]


✅ Classified species saved to species_batch_006_classified_species.csv
🔑 Adding GitHub.com to known_hosts (if not already added)...
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
✅ Repo folder 'cs3-biodiversity' already exists.
/content/cs3-biodiversity
From github.com:emd-aquila/cs3-biodiversity
 * branch            main       -> FETCH_HEAD
Already up to date.
[main f3e47ea] Add batch output species_batch_006_classified_species.csv to biotimes-with-class-label/
 1 file changed, 1852 insertions(+)
 create mode 100644 biotimes-with-class-label/species_batch_006_classified_species.csv
Enumerating objects: 6, done.
Counting objects: 100% (6/6), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 16.32 KiB | 5.44 MiB/s, done.
Total 4 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2

# Combining batch CSV results and pushing to GitHub


In [25]:
# ALTERNATIVE WAY OF COMBINING ALL FILES

repo_name = "cs3-biodiversity"
subfolder = "biotimes-with-class-label"

# ✅ 1️⃣ Make sure repo is cloned and updated
if not os.path.exists(repo_name):
    print(f"📥 Cloning repo {repo_name}...")
    !git clone git@github.com:emd-aquila/{repo_name}.git
else:
    print(f"✅ Repo {repo_name} already exists. Pulling latest changes...")
    os.chdir(repo_name)
    !git pull origin main
    os.chdir("..")


"""Classified Species Files"""
classified_files = [
    f for f in glob.glob(f"{repo_name}/{subfolder}/*_classified_species.csv")
    if "all_classified_species" not in os.path.basename(f)
]
if classified_files:
    print(f"Combining {len(classified_files)} classified files:")
    for f in classified_files:
        print(f)

    combined_classified = pd.concat([pd.read_csv(f) for f in classified_files], ignore_index=True)

    # ✅ Sort alphabetically by species name
    combined_classified = combined_classified.sort_values(by="GENUS_SPECIES", ascending=True)
    combined_classified.to_csv("all_classified_species.csv", index=False)
    setup_ssh_and_push("all_classified_species.csv")
else:
    print("⚠️ No classified species files found in the repo.")



"""Unclassified Species Files"""
unclassified_files = [
    f for f in glob.glob(f"{repo_name}/{subfolder}/*_unclassified_species.csv")
    if "all_unclassified_species" not in os.path.basename(f)
]
if unclassified_files:
    print(f"Combining {len(unclassified_files)} unclassified files:")
    for f in unclassified_files:
        print(f)

    combined_unclassified = pd.concat([pd.read_csv(f) for f in unclassified_files], ignore_index=True)

    # ✅ Sort alphabetically by species name
    combined_unclassified = combined_unclassified.sort_values(by="GENUS_SPECIES", ascending=True)
    combined_unclassified.to_csv("all_unclassified_species.csv", index=False)
    setup_ssh_and_push("all_unclassified_species.csv")
else:
    print("⚠️ No unclassified species files found in the repo.")


✅ Repo cs3-biodiversity already exists. Pulling latest changes...
From github.com:emd-aquila/cs3-biodiversity
 * branch            main       -> FETCH_HEAD
Already up to date.
Combining 6 classified files:
cs3-biodiversity/biotimes-with-class-label/species_batch_006_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_004_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_005_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_001_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_003_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_002_classified_species.csv
🔑 Adding GitHub.com to known_hosts (if not already added)...
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
# github.com:22 SSH-2.0-0264bb16
✅ Repo folder 'cs3-biodiversity' already exists.
/content/cs3-bi

In [20]:
# Report the per-batch classified CSVs currently held in `classified_files`
# (the list is built by the combining cell, so run that cell first).
print(f"Found {len(classified_files)} classified CSV files:")
for csv_path in classified_files:
    print(csv_path)

Found 7 classified CSV files:
cs3-biodiversity/biotimes-with-class-label/all_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_006_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_004_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_005_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_001_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_003_classified_species.csv
cs3-biodiversity/biotimes-with-class-label/species_batch_002_classified_species.csv
