# Genus/Species to Class Converter
This notebook converts the genus/species names listed in the BioTIME database into their taxonomic classes using Biopython (the `Bio.Entrez` module).

In [73]:
!pip install biopython tqdm

import math
import os
import time

import pandas as pd
from Bio import Entrez
from tqdm import tqdm

# API details
Entrez.email = "emduggan@mit.edu"
Entrez.api_key = "2e5155aba559345711a3af676cb6c6703608"
! wget https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv -O myfile.csv
df_all = pd.read_csv("myfile.csv")

--2025-05-07 19:18:32--  https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1185895 (1.1M) [text/plain]
Saving to: ‘myfile.csv’


2025-05-07 19:18:32 (18.1 MB/s) - ‘myfile.csv’ saved [1185895/1185895]



In [74]:
# Split the full table into fixed-size row chunks and write one CSV per chunk
batch_size = 5000
batch_starts = range(0, len(df_all), batch_size)  # first row offset of every chunk
num_batches = len(batch_starts)

# File names are numbered from 001; the final chunk may hold fewer rows
for i, start in enumerate(batch_starts):
    batch_file = f"species_batch_{i+1:03d}.csv"
    batch_df = df_all.iloc[start : start + batch_size]
    batch_df.to_csv(batch_file, index=False)
    print(f"Saved {batch_file}")

Saved species_batch_001.csv
Saved species_batch_002.csv
Saved species_batch_003.csv
Saved species_batch_004.csv
Saved species_batch_005.csv
Saved species_batch_006.csv
Saved species_batch_007.csv
Saved species_batch_008.csv
Saved species_batch_009.csv


In [78]:
# Choose the batch file to process in this run
batch_filename = "species_batch_001.csv"

# Derive the per-batch cache paths by swapping the ".csv" ending for a cache suffix
taxid_cache_file, class_cache_file = (
    batch_filename.replace(".csv", cache_suffix)
    for cache_suffix in ("_taxid_cache.csv", "_class_cache.csv")
)

# Read the batch and keep the distinct, non-missing species names
df = pd.read_csv(batch_filename)
species_column = df["GENUS_SPECIES"]
species_names = species_column.dropna().unique()
print(f"Loaded {len(species_names)} species from {batch_filename}")

Loaded 5000 species from species_batch_001.csv


In [77]:
# Reload cached lookups for this batch when their cache files are present;
# otherwise start from empty dicts. (Presumably the caches are written by a later
# lookup step that is not shown in this cell.)
# os.path.exists is used because pd.io.common.file_exists is an undocumented
# pandas internal rather than public API.

# Species name -> TaxID, with TaxIDs converted to strings
if os.path.exists(taxid_cache_file):
    taxid_df = pd.read_csv(taxid_cache_file)
    species_to_taxid = dict(zip(taxid_df["GENUS_SPECIES"], taxid_df["taxid"].astype(str)))
    print(f"🔄 Loaded {len(species_to_taxid)} TaxIDs from cache.")
else:
    species_to_taxid = {}

# Species name -> taxonomic class
if os.path.exists(class_cache_file):
    class_df = pd.read_csv(class_cache_file)
    tax_class_dict = dict(zip(class_df["GENUS_SPECIES"], class_df["taxonomic_class"]))
    print(f"🔄 Loaded {len(tax_class_dict)} classes from cache.")
else:
    tax_class_dict = {}

# Show the loaded mapping (empty when no class cache exists yet)
print(tax_class_dict)

{}
