# Genus/Species to Class Converter
This notebook maps each genus/species name in the BioTIME database listing to its taxonomic class by querying the NCBI Taxonomy database through Biopython's `Entrez` module.

In [21]:
!pip install biopython tqdm

import pandas as pd
from Bio import Entrez
from tqdm import tqdm
import time
import os



In [24]:
Entrez.email = "emduggan@mit.edu"
Entrez.api_key = "2e5155aba559345711a3af676cb6c6703608"


# Accessing CSV from Github
! wget https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv -O myfile.csv
df = pd.read_csv("myfile.csv")
df.head()

species_names = df["GENUS_SPECIES"].dropna().unique()

--2025-05-06 21:33:19--  https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1185895 (1.1M) [text/plain]
Saving to: ‘myfile.csv’


2025-05-06 21:33:19 (29.4 MB/s) - ‘myfile.csv’ saved [1185895/1185895]



In [25]:
# Resume support: start from an empty mapping and fill it with whatever a
# previous run persisted to the cache CSV, if that file exists.
cache_filename = "classification_cache.csv"
tax_class_dict = {}
if os.path.exists(cache_filename):
    cached_df = pd.read_csv(cache_filename)
    tax_class_dict.update(
        zip(cached_df["GENUS_SPECIES"], cached_df["taxonomic_class"])
    )

# Only names without a cache entry still have to be looked up.
to_query = [
    candidate
    for candidate in species_names
    if candidate not in tax_class_dict
]

In [27]:
# Step 3: NCBI Entrez taxonomy lookup with caching
# Checkpoint interval: the lookup loop rewrites the cache CSV after every
# `batch_size` processed names (and once more after the last name).
batch_size = 500

def _entrez_read_and_close(handle):
    """Parse an Entrez response handle and always close it afterwards."""
    try:
        return Entrez.read(handle)
    finally:
        # Entrez.read does not close handles it was given; release the
        # underlying connection even when parsing fails.
        handle.close()


def get_tax_class(name):
    """Return the taxonomic class of ``name`` according to NCBI Taxonomy.

    Searches the ``taxonomy`` database for ``name``, fetches the record of the
    first hit and scans its ``LineageEx`` entries for the one whose ``Rank``
    is ``"class"``.

    Args:
        name: Search term for ``Entrez.esearch`` (a genus/species string).

    Returns:
        The ``ScientificName`` of the class-ranked lineage entry, or ``None``
        when ``name`` is blank or not a string, the search has no hits, the
        fetched record is empty or has no class-ranked ancestor, or any
        exception is raised during the lookup (the error is printed, not
        re-raised).
    """
    # A blank or non-string term cannot match anything; skip the network call.
    if not isinstance(name, str) or not name.strip():
        return None
    try:
        result = _entrez_read_and_close(
            Entrez.esearch(db="taxonomy", term=name, retmode="xml")
        )
        if not result["IdList"]:
            return None
        # Only the first (best-ranked) hit is considered.
        taxid = result["IdList"][0]
        record = _entrez_read_and_close(
            Entrez.efetch(db="taxonomy", id=taxid, retmode="xml")
        )
        if not record:
            return None
        lineage = record[0].get("LineageEx", [])
        class_name = next((r["ScientificName"] for r in lineage if r["Rank"] == "class"), None)
        return class_name
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller continue.
        print(f"Error for {name}: {e}")
        return None

def _save_cache():
    """Write every classification gathered so far to ``cache_filename``."""
    pd.DataFrame.from_dict(tax_class_dict, orient="index", columns=["taxonomic_class"])\
      .reset_index().rename(columns={"index": "GENUS_SPECIES"})\
      .to_csv(cache_filename, index=False)


# NOTE: get_tax_class returns None both for "no class found" and for failed
# requests, so failed lookups are cached as well and are not retried on a
# later run unless their rows are removed from the cache file.
unsaved = 0  # lookups completed since the last cache write
try:
    for i, name in enumerate(tqdm(to_query)):
        tax_class_dict[name] = get_tax_class(name)
        unsaved += 1
        time.sleep(0.1)  # 10 requests/sec with API key

        # Periodic checkpoint so a long run can be resumed.
        if (i + 1) % batch_size == 0:
            _save_cache()
            unsaved = 0
finally:
    # Runs after the last name and also when the cell is interrupted
    # (e.g. KeyboardInterrupt), so results gathered since the previous
    # checkpoint are not lost.
    if unsaved:
        _save_cache()

  2%|▏         | 636/41852 [06:20<6:51:04,  1.67it/s]


KeyboardInterrupt: 

In [None]:
# Step 4: Merge and save final result
# Re-read the cache from disk so the merge uses exactly what was persisted.
cached_df = pd.read_csv(cache_filename)
# Left join keeps every row of the original listing; names without a cached
# class end up with a missing value in `taxonomic_class`.
final_df = df.merge(cached_df, on="GENUS_SPECIES", how="left")
final_df.to_csv("species_with_class.csv", index=False)

# Optional: download the file (only possible when running inside Google Colab)
try:
    from google.colab import files
except ImportError:
    # Outside Colab there is nothing to download; the CSV is already on disk.
    print("google.colab is not available; species_with_class.csv was saved locally.")
else:
    files.download("species_with_class.csv")