# Genus/Species to Class Converter
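This notebook maps each genus/species name listed in the BioTIME database (BioTIME-DB) to its taxonomic class by querying the NCBI Taxonomy database through Biopython's `Bio.Entrez` module. The species list is processed in batches, and lookups are cached to CSV so an interrupted run can be resumed.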

In [None]:
!pip install biopython tqdm

import glob
import math
import os
import time

import pandas as pd
from Bio import Entrez
from google.colab import files
from tqdm import tqdm



In [None]:
Entrez.email = "emduggan@mit.edu"
Entrez.api_key = "2e5155aba559345711a3af676cb6c6703608"

# Accessing CSV from Github
! wget https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv -O myfile.csv
df = pd.read_csv("myfile.csv")

--2025-05-07 16:20:20--  https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1185895 (1.1M) [text/plain]
Saving to: ‘myfile.csv’


2025-05-07 16:20:20 (17.2 MB/s) - ‘myfile.csv’ saved [1185895/1185895]



In [27]:
# Split the full genus/species table into fixed-size CSV chunks so that each
# chunk can be classified (and resumed) independently.
batch_size = 10000
num_batches = math.ceil(len(df) / batch_size)

# Batch files are numbered from 1 and zero-padded to three digits.
for batch_number in range(1, num_batches + 1):
    first_row = (batch_number - 1) * batch_size
    batch_file = f"species_batch_{batch_number:03d}.csv"
    df.iloc[first_row:first_row + batch_size].to_csv(batch_file, index=False)
    print(f"Saved {batch_file}")

Saved species_batch_001.csv
Saved species_batch_002.csv
Saved species_batch_003.csv
Saved species_batch_004.csv
Saved species_batch_005.csv


In [29]:
# Choose the batch to work on. Everything below (the species list and both
# cache file names) is derived from this one file name.
batch_filename = "species_batch_001.csv"

# Load the batch and collect its distinct, non-missing species names.
df = pd.read_csv(batch_filename)
species_names = df["GENUS_SPECIES"].dropna().unique()

# Cache files sit next to the batch file and share its stem.
batch_stem = batch_filename.replace('.csv', '')
taxid_cache_file = f"{batch_stem}_taxid_cache.csv"
class_cache_file = f"{batch_stem}_class_cache.csv"

In [None]:
# Resumable pipeline for the selected batch file:
#   species name --(esearch)--> NCBI TaxID --(efetch)--> taxonomic class
# Both lookups are cached to CSV after every request batch, so an interrupted
# run continues where it stopped. Failed requests are NOT written to the
# caches, which means they are retried on the next run instead of being stored
# permanently as "no result".

# Number of names / TaxIDs handled between two cache writes (and per efetch
# call). Deliberately not called `batch_size`: that name holds the 10,000-row
# file batch size used by the batching cell.
ENTREZ_BATCH_SIZE = 100


def _save_mapping(mapping, value_column, path):
    """Write a {GENUS_SPECIES: value} dict to `path` as a two-column CSV.

    The column names are passed explicitly so that an empty mapping still
    produces a file with a header row that can be read back.
    """
    pd.DataFrame(
        list(mapping.items()), columns=["GENUS_SPECIES", value_column]
    ).to_csv(path, index=False)


def _clean_taxid(raw):
    """Return a cached TaxID as a plain digit string, or None if it is missing."""
    if pd.isna(raw):
        return None
    text = str(raw).strip()
    if text.lower() in ("", "nan", "none"):
        return None
    # Caches that were re-saved by the earlier float-based loader contain
    # values such as "9606.0"; keep only the integer part.
    return text.split(".", 1)[0]


# Load existing TaxID cache if available
if os.path.exists(taxid_cache_file):
    # Read the IDs as text. Letting pandas infer the dtype turns the column
    # into float64 as soon as one cell is blank, after which str() yields
    # "9606.0" / "nan" instead of "9606" / a missing value.
    taxid_df = pd.read_csv(taxid_cache_file, dtype={"taxid": str})
    species_to_taxid = {
        name: _clean_taxid(raw)
        for name, raw in zip(taxid_df["GENUS_SPECIES"], taxid_df["taxid"])
    }
    print(f"Loaded {len(species_to_taxid)} TaxIDs from cache.")
else:
    species_to_taxid = {}
    print("No existing TaxID cache found.")

# Load existing class cache if available
if os.path.exists(class_cache_file):
    class_df = pd.read_csv(class_cache_file)
    tax_class_dict = dict(zip(class_df["GENUS_SPECIES"], class_df["taxonomic_class"]))
    print(f"Loaded {len(tax_class_dict)} classes from cache.")
else:
    tax_class_dict = {}
    print("No existing class cache found.")

# Find species not yet processed (TaxIDs)
to_query = [name for name in species_names if name not in species_to_taxid]
print(f"🔎 {len(to_query)} species left to fetch TaxIDs.")


# STEP 1: Fetch TaxIDs (in batches, with cache)
for start in tqdm(range(0, len(to_query), ENTREZ_BATCH_SIZE), desc="Fetching TaxIDs"):
    for name in to_query[start:start + ENTREZ_BATCH_SIZE]:
        try:
            handle = Entrez.esearch(db="taxonomy", term=name, retmode="xml")
            try:
                result = Entrez.read(handle)
            finally:
                handle.close()
            id_list = result["IdList"]
            # None records "NCBI answered, but has no entry for this name".
            species_to_taxid[name] = str(id_list[0]) if id_list else None
        except Exception as e:
            # Best effort: report and move on. The name stays out of the
            # cache so the lookup is attempted again on the next run.
            print(f"Error fetching TaxID for {name}: {e}")
        time.sleep(0.1)  # respect NCBI rate limit

    # Save TaxID cache after each batch
    _save_mapping(species_to_taxid, "taxid", taxid_cache_file)
    print(f"💾 Saved TaxID cache ({len(species_to_taxid)} total so far)")

# Prepare TaxID -> species mapping. Several names can resolve to the same
# TaxID, so every TaxID maps to a list of names rather than a single one.
taxid_to_names = {}
for name, taxid in species_to_taxid.items():
    if taxid is not None:
        taxid_to_names.setdefault(taxid, []).append(name)
taxids = list(taxid_to_names)

# Find TaxIDs that still need classification: those with at least one species
# name that has no entry in the class mapping yet.
taxids_to_fetch = [
    tid for tid in taxids
    if any(name not in tax_class_dict for name in taxid_to_names[tid])
]
print(f"🔎 {len(taxids_to_fetch)} species left to fetch classes.")


# STEP 2: Fetch Classes (in batches, with cache)
for start in tqdm(range(0, len(taxids_to_fetch), ENTREZ_BATCH_SIZE), desc="Fetching Classes"):
    batch = taxids_to_fetch[start:start + ENTREZ_BATCH_SIZE]
    try:
        handle = Entrez.efetch(db="taxonomy", id=",".join(batch), retmode="xml")
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()
        for record in records:
            lineage = record.get("LineageEx", [])
            # First lineage entry ranked "class"; None when there is none.
            class_name = next(
                (str(r["ScientificName"]) for r in lineage if r["Rank"] == "class"),
                None,
            )
            for species_name in taxid_to_names.get(str(record["TaxId"]), []):
                tax_class_dict[species_name] = class_name
    except Exception as e:
        # Best effort: nothing is recorded for this batch, so its TaxIDs are
        # fetched again on the next run.
        print(f"Error fetching class for batch {start}: {e}")
    time.sleep(0.1)  # respect NCBI rate limit

    # Save class cache after each batch
    _save_mapping(tax_class_dict, "taxonomic_class", class_cache_file)
    print(f"💾 Saved class cache ({len(tax_class_dict)} total so far)")

# Final merge + save result for this batch
final_df = df.copy()
final_df["taxonomic_class"] = final_df["GENUS_SPECIES"].map(tax_class_dict)
output_filename = batch_filename.replace(".csv", "_with_class.csv")
final_df.to_csv(output_filename, index=False)
print(f"Final output saved as {output_filename}")

# Download result in Colab
files.download(output_filename)

Fetching TaxIDs:   4%|▍         | 16/419 [04:49<2:01:22, 18.07s/it]


KeyboardInterrupt: 

In [None]:
# Manual checkpoint: persist the class mapping currently held in memory and
# write a merged copy of the current batch. Useful after interrupting the
# fetch cell, which only writes its merged output when it runs to completion.

# Save to cache. The target is `class_cache_file` from the batch-selection
# cell; the name `cache_filename` used here before is not defined anywhere in
# the notebook, so the cell failed with NameError.
pd.DataFrame.from_dict(tax_class_dict, orient="index", columns=["taxonomic_class"])\
    .reset_index().rename(columns={"index": "GENUS_SPECIES"})\
    .to_csv(class_cache_file, index=False)

# Merge final results into a new frame so `df` itself is left untouched and
# the cell can be re-run safely.
batch_with_class = df.assign(taxonomic_class=df["GENUS_SPECIES"].map(tax_class_dict))
output_path = "species_with_class_batched.csv"
batch_with_class.to_csv(output_path, index=False)
output_path

In [None]:
# Combine all results
# Match all per-batch output files, sorted by name so the batch order is kept.
batch_files = sorted(glob.glob("species_batch_*_with_class.csv"))

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail early with a message that says what is missing.
if not batch_files:
    raise FileNotFoundError(
        "No species_batch_*_with_class.csv files found in the working directory; "
        "run the classification cell for at least one batch first."
    )

# Combine into one DataFrame
combined_df = pd.concat([pd.read_csv(f) for f in batch_files], ignore_index=True)

# Save the full result
combined_df.to_csv("all_species_with_class.csv", index=False)