# Genus/Species to Class Converter
This notebook maps each genus/species name listed in the BioTIME database to its taxonomic class by querying the NCBI Taxonomy database through Biopython's `Entrez` module.

In [42]:
!pip install biopython tqdm

import pandas as pd
from Bio import Entrez
from tqdm import tqdm
import time
import os
import math
from google.colab import files
import glob



In [45]:
Entrez.email = "emduggan@mit.edu"
Entrez.api_key = "2e5155aba559345711a3af676cb6c6703608"

# Accessing CSV from Github
! wget https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv -O myfile.csv
df_all = pd.read_csv("myfile.csv")

--2025-05-07 17:24:47--  https://raw.githubusercontent.com/emd-aquila/cs3-biodiversity/main/data/unique_genus_species.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1185895 (1.1M) [text/plain]
Saving to: ‘myfile.csv’


2025-05-07 17:24:47 (18.1 MB/s) - ‘myfile.csv’ saved [1185895/1185895]



In [47]:
# Split the full species table into fixed-size chunks and write each chunk to
# its own CSV (species_batch_001.csv, species_batch_002.csv, ...).
batch_size = 10000
num_batches = math.ceil(len(df_all) / batch_size)

# Batch numbers are 1-based so they line up with the file names.
for batch_number in range(1, num_batches + 1):
    first_row = (batch_number - 1) * batch_size
    last_row = first_row + batch_size  # exclusive; iloc clips past the end
    batch_file = f"species_batch_{batch_number:03d}.csv"
    df_all.iloc[first_row:last_row].to_csv(batch_file, index=False)
    print(f"Saved {batch_file}")

Saved species_batch_001.csv
Saved species_batch_002.csv
Saved species_batch_003.csv
Saved species_batch_004.csv
Saved species_batch_005.csv


In [48]:
# Choose the batch file to process in this run. Edit this name to move on to
# the next batch.
batch_filename = "species_batch_001.csv"

# Load the batch and collect the distinct, non-missing species names.
df_batch = pd.read_csv(batch_filename)
species_names = df_batch["GENUS_SPECIES"].dropna().unique()

# Each batch gets its own pair of cache files, named after the batch file
# with the ".csv" suffix removed.
batch_stem = batch_filename.replace('.csv', '')
taxid_cache_file = f"{batch_stem}_taxid_cache.csv"
class_cache_file = f"{batch_stem}_class_cache.csv"

In [31]:
# Resolve every species name of the current batch to an NCBI TaxID (STEP 1)
# and then to its taxonomic class (STEP 2). Both steps checkpoint their
# progress to CSV after every chunk of requests, so an interrupted run can be
# resumed by simply re-running this cell.


def _clean_cached_value(value, strip_float_suffix=False):
    """Return a cached CSV value as a clean string, or None if it is missing.

    Cache files written by earlier versions of this cell may hold TaxIDs such
    as "4024.0" (the column was parsed as float and cast back to str). With
    ``strip_float_suffix=True`` that trailing ".0" is removed so the ID matches
    what NCBI returns.
    """
    if not isinstance(value, str):
        return None  # NaN / NA read from an empty CSV field
    value = value.strip()
    if strip_float_suffix and value.endswith(".0"):
        value = value[:-2]
    return value or None


def _load_cache(path, value_column, strip_float_suffix=False):
    """Load a ``GENUS_SPECIES -> value_column`` cache CSV into a dict.

    Returns an empty dict when ``path`` does not exist. Everything is read as
    text so integer IDs are never converted to floats; missing values become
    None.
    """
    if not os.path.exists(path):
        return {}
    cache_df = pd.read_csv(path, dtype=str)
    return {
        name: _clean_cached_value(value, strip_float_suffix)
        for name, value in zip(cache_df["GENUS_SPECIES"], cache_df[value_column])
        if isinstance(name, str)
    }


def _save_cache(mapping, value_column, path):
    """Write a ``GENUS_SPECIES -> value_column`` dict to ``path`` as CSV.

    The column names are passed explicitly so that the header is written even
    when ``mapping`` is empty and the file can always be read back.
    """
    pd.DataFrame(
        [{"GENUS_SPECIES": k, value_column: v} for k, v in mapping.items()],
        columns=["GENUS_SPECIES", value_column],
    ).to_csv(path, index=False)


# Load existing TaxID cache if available
species_to_taxid = _load_cache(taxid_cache_file, "taxid", strip_float_suffix=True)
if os.path.exists(taxid_cache_file):
    print(f"Loaded {len(species_to_taxid)} TaxIDs from cache.")
else:
    print("No existing TaxID cache found.")

# Load existing class cache if available
tax_class_dict = _load_cache(class_cache_file, "taxonomic_class")
if os.path.exists(class_cache_file):
    print(f"Loaded {len(tax_class_dict)} classes from cache.")
else:
    print("No existing class cache found.")

# Find species not yet processed (TaxIDs)
to_query = [name for name in species_names if name not in species_to_taxid]
print(f"{len(to_query)} species left to fetch TaxIDs.")


# STEP 1: Fetch TaxIDs (in batches, with cache)
batch_size = 100  # number of lookups between two cache checkpoints
for i in tqdm(range(0, len(to_query), batch_size), desc="Fetching TaxIDs"):
    for name in to_query[i:i + batch_size]:
        try:
            search = Entrez.esearch(db="taxonomy", term=name, retmode="xml")
            try:
                result = Entrez.read(search)
            finally:
                search.close()
            id_list = result["IdList"]
            # None records "NCBI has no match" so the name is not queried again.
            species_to_taxid[name] = str(id_list[0]) if id_list else None
        except Exception as e:
            # Do not cache failures: the name stays out of the cache and is
            # retried the next time this cell runs.
            print(f"Error fetching TaxID for {name}: {e}")
        time.sleep(0.1)  # respect NCBI rate limit

    # Save TaxID cache after each batch
    _save_cache(species_to_taxid, "taxid", taxid_cache_file)
    print(f"Saved TaxID cache ({len(species_to_taxid)} total so far)")

# Prepare TaxID -> species mapping. Several names can resolve to the same
# TaxID, so keep every name per TaxID instead of letting one overwrite another.
taxid_to_names = {}
for name, taxid in species_to_taxid.items():
    if taxid is not None:
        taxid_to_names.setdefault(taxid, []).append(name)
# One-name-per-TaxID view, kept for code that still expects the old mapping.
taxid_to_species = {taxid: names[-1] for taxid, names in taxid_to_names.items()}
taxids = list(taxid_to_names)

# Find TaxIDs that still need classification: at least one of their species
# names is missing from the class cache.
already_classified = set(tax_class_dict)
taxids_to_fetch = [
    tid for tid in taxids
    if any(name not in already_classified for name in taxid_to_names[tid])
]
print(f"🔎 {len(taxids_to_fetch)} species left to fetch classes.")


# STEP 2: Fetch Classes (in batches, with cache)
for i in tqdm(range(0, len(taxids_to_fetch), batch_size), desc="Fetching Classes"):
    batch = taxids_to_fetch[i:i + batch_size]
    try:
        fetch = Entrez.efetch(db="taxonomy", id=",".join(batch), retmode="xml")
        try:
            records = Entrez.read(fetch)
        finally:
            fetch.close()
        for record in records:
            lineage = record.get("LineageEx", [])
            # First lineage entry ranked "class"; None if the lineage has none.
            class_name = next((r["ScientificName"] for r in lineage if r["Rank"] == "class"), None)
            for species_name in taxid_to_names.get(str(record["TaxId"]), []):
                tax_class_dict[species_name] = class_name
    except Exception as e:
        # Leave the batch unclassified so it is retried on the next run.
        print(f"Error fetching class for batch {i}: {e}")
    time.sleep(0.1)  # respect NCBI rate limit

    # Save class cache after each batch
    _save_cache(tax_class_dict, "taxonomic_class", class_cache_file)
    print(f"💾 Saved class cache ({len(tax_class_dict)} total so far)")

# Final merge + save result for this batch. The cache is written once more so
# the file exists even when there was nothing left to fetch.
_save_cache(tax_class_dict, "taxonomic_class", class_cache_file)
class_df = pd.read_csv(class_cache_file)
final_df = df_batch.copy()
final_df["taxonomic_class"] = final_df["GENUS_SPECIES"].map(tax_class_dict)
output_filename = batch_filename.replace(".csv", "_with_class.csv")
final_df.to_csv(output_filename, index=False)
print(f"Final output saved as {output_filename}")

# Download result in Colab
files.download(output_filename)

Loaded 1800 TaxIDs from cache.
No existing class cache found.
8200 species left to fetch TaxIDs.


Fetching TaxIDs:   1%|          | 1/82 [00:17<23:08, 17.15s/it]

Saved TaxID cache (1900 total so far)


Fetching TaxIDs:   2%|▏         | 2/82 [00:34<23:22, 17.54s/it]

Saved TaxID cache (2000 total so far)


Fetching TaxIDs:   4%|▎         | 3/82 [00:51<22:42, 17.24s/it]

Saved TaxID cache (2100 total so far)


Fetching TaxIDs:   5%|▍         | 4/82 [01:11<23:43, 18.25s/it]

Saved TaxID cache (2200 total so far)


Fetching TaxIDs:   6%|▌         | 5/82 [01:28<22:47, 17.77s/it]

Saved TaxID cache (2300 total so far)


Fetching TaxIDs:   7%|▋         | 6/82 [01:46<22:29, 17.76s/it]

Saved TaxID cache (2400 total so far)


Fetching TaxIDs:   9%|▊         | 7/82 [02:03<21:50, 17.47s/it]

Saved TaxID cache (2500 total so far)


Fetching TaxIDs:  10%|▉         | 8/82 [02:19<21:06, 17.11s/it]

Saved TaxID cache (2600 total so far)


Fetching TaxIDs:  11%|█         | 9/82 [02:37<21:01, 17.28s/it]

Saved TaxID cache (2700 total so far)


Fetching TaxIDs:  12%|█▏        | 10/82 [02:53<20:29, 17.07s/it]

Saved TaxID cache (2800 total so far)


Fetching TaxIDs:  13%|█▎        | 11/82 [03:10<20:11, 17.06s/it]

Saved TaxID cache (2900 total so far)


Fetching TaxIDs:  15%|█▍        | 12/82 [03:28<20:17, 17.39s/it]

Saved TaxID cache (3000 total so far)


Fetching TaxIDs:  16%|█▌        | 13/82 [03:47<20:15, 17.61s/it]

Saved TaxID cache (3100 total so far)


Fetching TaxIDs:  17%|█▋        | 14/82 [04:04<19:52, 17.54s/it]

Saved TaxID cache (3200 total so far)


Fetching TaxIDs:  18%|█▊        | 15/82 [04:21<19:18, 17.29s/it]

Saved TaxID cache (3300 total so far)


Fetching TaxIDs:  20%|█▉        | 16/82 [04:38<18:57, 17.23s/it]

Saved TaxID cache (3400 total so far)


Fetching TaxIDs:  21%|██        | 17/82 [04:56<19:08, 17.66s/it]

Saved TaxID cache (3500 total so far)


Fetching TaxIDs:  22%|██▏       | 18/82 [05:14<18:53, 17.72s/it]

Saved TaxID cache (3600 total so far)


Fetching TaxIDs:  23%|██▎       | 19/82 [05:31<18:16, 17.40s/it]

Saved TaxID cache (3700 total so far)


Fetching TaxIDs:  24%|██▍       | 20/82 [05:49<18:14, 17.66s/it]

Saved TaxID cache (3800 total so far)


Fetching TaxIDs:  26%|██▌       | 21/82 [06:07<18:03, 17.77s/it]

Saved TaxID cache (3900 total so far)


Fetching TaxIDs:  27%|██▋       | 22/82 [06:24<17:36, 17.62s/it]

Saved TaxID cache (4000 total so far)


Fetching TaxIDs:  28%|██▊       | 23/82 [06:43<17:36, 17.91s/it]

Saved TaxID cache (4100 total so far)


Fetching TaxIDs:  29%|██▉       | 24/82 [07:00<17:03, 17.64s/it]

Saved TaxID cache (4200 total so far)


Fetching TaxIDs:  30%|███       | 25/82 [07:17<16:33, 17.42s/it]

Saved TaxID cache (4300 total so far)


Fetching TaxIDs:  32%|███▏      | 26/82 [07:36<16:35, 17.77s/it]

Saved TaxID cache (4400 total so far)


Fetching TaxIDs:  33%|███▎      | 27/82 [07:53<16:03, 17.52s/it]

Saved TaxID cache (4500 total so far)


Fetching TaxIDs:  34%|███▍      | 28/82 [08:11<15:59, 17.77s/it]

Saved TaxID cache (4600 total so far)


Fetching TaxIDs:  35%|███▌      | 29/82 [08:28<15:38, 17.71s/it]

Saved TaxID cache (4700 total so far)


Fetching TaxIDs:  37%|███▋      | 30/82 [08:45<15:03, 17.38s/it]

Saved TaxID cache (4800 total so far)


Fetching TaxIDs:  38%|███▊      | 31/82 [09:03<14:55, 17.57s/it]

Saved TaxID cache (4900 total so far)


Fetching TaxIDs:  39%|███▉      | 32/82 [09:22<14:55, 17.90s/it]

Saved TaxID cache (5000 total so far)


Fetching TaxIDs:  40%|████      | 33/82 [09:38<14:19, 17.53s/it]

Saved TaxID cache (5100 total so far)


Fetching TaxIDs:  41%|████▏     | 34/82 [09:55<13:49, 17.28s/it]

Saved TaxID cache (5200 total so far)


Fetching TaxIDs:  43%|████▎     | 35/82 [10:12<13:22, 17.07s/it]

Saved TaxID cache (5300 total so far)


Fetching TaxIDs:  44%|████▍     | 36/82 [10:28<12:59, 16.96s/it]

Saved TaxID cache (5400 total so far)


Fetching TaxIDs:  45%|████▌     | 37/82 [10:46<12:49, 17.10s/it]

Saved TaxID cache (5500 total so far)


Fetching TaxIDs:  46%|████▋     | 38/82 [11:03<12:27, 17.00s/it]

Saved TaxID cache (5600 total so far)


Fetching TaxIDs:  48%|████▊     | 39/82 [11:20<12:12, 17.03s/it]

Saved TaxID cache (5700 total so far)


Fetching TaxIDs:  49%|████▉     | 40/82 [11:36<11:51, 16.93s/it]

Saved TaxID cache (5800 total so far)


Fetching TaxIDs:  50%|█████     | 41/82 [11:53<11:30, 16.84s/it]

Saved TaxID cache (5900 total so far)


Fetching TaxIDs:  51%|█████     | 42/82 [12:11<11:22, 17.05s/it]

Saved TaxID cache (6000 total so far)


Fetching TaxIDs:  52%|█████▏    | 43/82 [12:28<11:08, 17.14s/it]

Saved TaxID cache (6100 total so far)


Fetching TaxIDs:  54%|█████▎    | 44/82 [12:45<10:50, 17.12s/it]

Saved TaxID cache (6200 total so far)


Fetching TaxIDs:  55%|█████▍    | 45/82 [13:02<10:27, 16.97s/it]

Saved TaxID cache (6300 total so far)


Fetching TaxIDs:  56%|█████▌    | 46/82 [13:18<10:09, 16.93s/it]

Saved TaxID cache (6400 total so far)


Fetching TaxIDs:  57%|█████▋    | 47/82 [13:35<09:53, 16.97s/it]

Saved TaxID cache (6500 total so far)


Fetching TaxIDs:  59%|█████▊    | 48/82 [13:53<09:44, 17.19s/it]

Saved TaxID cache (6600 total so far)


Fetching TaxIDs:  60%|█████▉    | 49/82 [14:10<09:26, 17.18s/it]

Saved TaxID cache (6700 total so far)


Fetching TaxIDs:  61%|██████    | 50/82 [14:29<09:26, 17.71s/it]

Saved TaxID cache (6800 total so far)


Fetching TaxIDs:  62%|██████▏   | 51/82 [14:47<09:09, 17.74s/it]

Saved TaxID cache (6900 total so far)


Fetching TaxIDs:  63%|██████▎   | 52/82 [15:04<08:46, 17.55s/it]

Saved TaxID cache (7000 total so far)


Fetching TaxIDs:  65%|██████▍   | 53/82 [15:25<08:57, 18.53s/it]

Saved TaxID cache (7100 total so far)


Fetching TaxIDs:  66%|██████▌   | 54/82 [15:43<08:31, 18.27s/it]

Saved TaxID cache (7200 total so far)


Fetching TaxIDs:  67%|██████▋   | 55/82 [16:00<08:04, 17.96s/it]

Saved TaxID cache (7300 total so far)


Fetching TaxIDs:  68%|██████▊   | 56/82 [16:17<07:37, 17.59s/it]

Saved TaxID cache (7400 total so far)


Fetching TaxIDs:  70%|██████▉   | 57/82 [16:33<07:12, 17.31s/it]

Saved TaxID cache (7500 total so far)


Fetching TaxIDs:  71%|███████   | 58/82 [16:52<07:03, 17.66s/it]

Saved TaxID cache (7600 total so far)


Fetching TaxIDs:  72%|███████▏  | 59/82 [17:12<07:01, 18.31s/it]

Saved TaxID cache (7700 total so far)


Fetching TaxIDs:  73%|███████▎  | 60/82 [17:31<06:50, 18.67s/it]

Saved TaxID cache (7800 total so far)


Fetching TaxIDs:  74%|███████▍  | 61/82 [17:50<06:30, 18.60s/it]

Saved TaxID cache (7900 total so far)


Fetching TaxIDs:  76%|███████▌  | 62/82 [18:07<06:05, 18.28s/it]

Saved TaxID cache (8000 total so far)


Fetching TaxIDs:  77%|███████▋  | 63/82 [18:26<05:50, 18.44s/it]

Saved TaxID cache (8100 total so far)


Fetching TaxIDs:  78%|███████▊  | 64/82 [18:44<05:32, 18.45s/it]

Saved TaxID cache (8200 total so far)


Fetching TaxIDs:  79%|███████▉  | 65/82 [19:02<05:07, 18.10s/it]

Saved TaxID cache (8300 total so far)


Fetching TaxIDs:  80%|████████  | 66/82 [19:20<04:52, 18.27s/it]

Saved TaxID cache (8400 total so far)


Fetching TaxIDs:  82%|████████▏ | 67/82 [19:39<04:34, 18.29s/it]

Saved TaxID cache (8500 total so far)


Fetching TaxIDs:  83%|████████▎ | 68/82 [19:56<04:11, 17.99s/it]

Saved TaxID cache (8600 total so far)


Fetching TaxIDs:  84%|████████▍ | 69/82 [20:14<03:52, 17.88s/it]

Saved TaxID cache (8700 total so far)


Fetching TaxIDs:  85%|████████▌ | 70/82 [20:30<03:30, 17.55s/it]

Saved TaxID cache (8800 total so far)


Fetching TaxIDs:  87%|████████▋ | 71/82 [20:48<03:12, 17.50s/it]

Saved TaxID cache (8900 total so far)


Fetching TaxIDs:  88%|████████▊ | 72/82 [21:05<02:53, 17.33s/it]

Saved TaxID cache (9000 total so far)


Fetching TaxIDs:  89%|████████▉ | 73/82 [21:24<02:40, 17.83s/it]

Saved TaxID cache (9100 total so far)


Fetching TaxIDs:  90%|█████████ | 74/82 [21:42<02:24, 18.05s/it]

Saved TaxID cache (9200 total so far)


Fetching TaxIDs:  91%|█████████▏| 75/82 [22:02<02:09, 18.53s/it]

Saved TaxID cache (9300 total so far)


Fetching TaxIDs:  93%|█████████▎| 76/82 [22:20<01:49, 18.31s/it]

Saved TaxID cache (9400 total so far)


Fetching TaxIDs:  94%|█████████▍| 77/82 [22:37<01:29, 17.98s/it]

Saved TaxID cache (9500 total so far)


Fetching TaxIDs:  95%|█████████▌| 78/82 [22:54<01:10, 17.73s/it]

Saved TaxID cache (9600 total so far)


Fetching TaxIDs:  96%|█████████▋| 79/82 [23:12<00:53, 17.77s/it]

Saved TaxID cache (9700 total so far)


Fetching TaxIDs:  98%|█████████▊| 80/82 [23:29<00:35, 17.72s/it]

Saved TaxID cache (9800 total so far)


Fetching TaxIDs:  99%|█████████▉| 81/82 [23:46<00:17, 17.34s/it]

Saved TaxID cache (9900 total so far)


Fetching TaxIDs: 100%|██████████| 82/82 [24:03<00:00, 17.60s/it]


Saved TaxID cache (10000 total so far)
🔎 7292 species left to fetch classes.


Fetching Classes:   1%|▏         | 1/73 [00:00<00:51,  1.39it/s]

💾 Saved class cache (2 total so far)


Fetching Classes:   3%|▎         | 2/73 [00:01<00:41,  1.69it/s]

💾 Saved class cache (7 total so far)


Fetching Classes:   4%|▍         | 3/73 [00:02<00:48,  1.46it/s]

💾 Saved class cache (9 total so far)


Fetching Classes:   5%|▌         | 4/73 [00:02<00:46,  1.49it/s]

💾 Saved class cache (10 total so far)


Fetching Classes:   7%|▋         | 5/73 [00:03<00:50,  1.34it/s]

💾 Saved class cache (10 total so far)


Fetching Classes:   8%|▊         | 6/73 [00:04<00:53,  1.25it/s]

💾 Saved class cache (10 total so far)


Fetching Classes:  10%|▉         | 7/73 [00:05<00:52,  1.26it/s]

💾 Saved class cache (10 total so far)


Fetching Classes:  11%|█         | 8/73 [00:06<00:54,  1.20it/s]

💾 Saved class cache (10 total so far)


Fetching Classes:  12%|█▏        | 9/73 [00:06<00:49,  1.29it/s]

💾 Saved class cache (15 total so far)


Fetching Classes:  14%|█▎        | 10/73 [00:07<00:46,  1.36it/s]

💾 Saved class cache (22 total so far)


Fetching Classes:  15%|█▌        | 11/73 [00:07<00:40,  1.54it/s]

💾 Saved class cache (28 total so far)


Fetching Classes:  16%|█▋        | 12/73 [00:08<00:41,  1.48it/s]

💾 Saved class cache (41 total so far)


Fetching Classes:  18%|█▊        | 13/73 [00:09<00:41,  1.46it/s]

💾 Saved class cache (56 total so far)


Fetching Classes:  19%|█▉        | 14/73 [00:10<00:42,  1.40it/s]

💾 Saved class cache (77 total so far)


Fetching Classes:  21%|██        | 15/73 [00:10<00:43,  1.34it/s]

💾 Saved class cache (175 total so far)


Fetching Classes:  22%|██▏       | 16/73 [00:12<00:55,  1.02it/s]

💾 Saved class cache (275 total so far)


Fetching Classes:  23%|██▎       | 17/73 [00:13<00:50,  1.10it/s]

💾 Saved class cache (374 total so far)


Fetching Classes:  25%|██▍       | 18/73 [00:13<00:44,  1.23it/s]

💾 Saved class cache (473 total so far)


Fetching Classes:  26%|██▌       | 19/73 [00:14<00:42,  1.27it/s]

💾 Saved class cache (570 total so far)


Fetching Classes:  27%|██▋       | 20/73 [00:15<00:37,  1.40it/s]

💾 Saved class cache (664 total so far)


Fetching Classes:  29%|██▉       | 21/73 [00:15<00:37,  1.38it/s]

💾 Saved class cache (754 total so far)


Fetching Classes:  30%|███       | 22/73 [00:16<00:37,  1.36it/s]

💾 Saved class cache (851 total so far)


Fetching Classes:  32%|███▏      | 23/73 [00:17<00:34,  1.44it/s]

💾 Saved class cache (949 total so far)


Fetching Classes:  33%|███▎      | 24/73 [00:17<00:35,  1.39it/s]

💾 Saved class cache (1045 total so far)


Fetching Classes:  34%|███▍      | 25/73 [00:18<00:32,  1.50it/s]

💾 Saved class cache (1142 total so far)


Fetching Classes:  36%|███▌      | 26/73 [00:19<00:32,  1.45it/s]

💾 Saved class cache (1239 total so far)


Fetching Classes:  37%|███▋      | 27/73 [00:19<00:31,  1.46it/s]

💾 Saved class cache (1339 total so far)


Fetching Classes:  38%|███▊      | 28/73 [00:20<00:31,  1.41it/s]

💾 Saved class cache (1439 total so far)


Fetching Classes:  40%|███▉      | 29/73 [00:21<00:35,  1.24it/s]

💾 Saved class cache (1539 total so far)


Fetching Classes:  41%|████      | 30/73 [00:22<00:34,  1.26it/s]

💾 Saved class cache (1639 total so far)


Fetching Classes:  42%|████▏     | 31/73 [00:23<00:31,  1.35it/s]

💾 Saved class cache (1739 total so far)


Fetching Classes:  44%|████▍     | 32/73 [00:23<00:29,  1.39it/s]

💾 Saved class cache (1839 total so far)


Fetching Classes:  45%|████▌     | 33/73 [00:24<00:30,  1.32it/s]

💾 Saved class cache (1939 total so far)


Fetching Classes:  47%|████▋     | 34/73 [00:25<00:32,  1.20it/s]

💾 Saved class cache (2037 total so far)


Fetching Classes:  48%|████▊     | 35/73 [00:26<00:28,  1.36it/s]

💾 Saved class cache (2136 total so far)


Fetching Classes:  49%|████▉     | 36/73 [00:26<00:26,  1.42it/s]

💾 Saved class cache (2236 total so far)


Fetching Classes:  51%|█████     | 37/73 [00:27<00:24,  1.47it/s]

💾 Saved class cache (2335 total so far)


Fetching Classes:  52%|█████▏    | 38/73 [00:28<00:23,  1.50it/s]

💾 Saved class cache (2435 total so far)


Fetching Classes:  53%|█████▎    | 39/73 [00:28<00:22,  1.53it/s]

💾 Saved class cache (2535 total so far)


Fetching Classes:  55%|█████▍    | 40/73 [00:29<00:22,  1.50it/s]

💾 Saved class cache (2634 total so far)


Fetching Classes:  56%|█████▌    | 41/73 [00:30<00:22,  1.43it/s]

💾 Saved class cache (2734 total so far)


Fetching Classes:  58%|█████▊    | 42/73 [00:30<00:22,  1.37it/s]

💾 Saved class cache (2833 total so far)


Fetching Classes:  59%|█████▉    | 43/73 [00:31<00:23,  1.29it/s]

💾 Saved class cache (2933 total so far)


Fetching Classes:  60%|██████    | 44/73 [00:32<00:21,  1.38it/s]

💾 Saved class cache (3031 total so far)


Fetching Classes:  62%|██████▏   | 45/73 [00:33<00:19,  1.42it/s]

💾 Saved class cache (3131 total so far)


Fetching Classes:  63%|██████▎   | 46/73 [00:33<00:18,  1.44it/s]

💾 Saved class cache (3231 total so far)


Fetching Classes:  64%|██████▍   | 47/73 [00:34<00:16,  1.56it/s]

💾 Saved class cache (3330 total so far)


Fetching Classes:  66%|██████▌   | 48/73 [00:34<00:16,  1.54it/s]

💾 Saved class cache (3430 total so far)


Fetching Classes:  67%|██████▋   | 49/73 [00:35<00:17,  1.38it/s]

💾 Saved class cache (3530 total so far)


Fetching Classes:  68%|██████▊   | 50/73 [00:36<00:17,  1.29it/s]

💾 Saved class cache (3630 total so far)


Fetching Classes:  70%|██████▉   | 51/73 [00:37<00:16,  1.33it/s]

💾 Saved class cache (3729 total so far)


Fetching Classes:  71%|███████   | 52/73 [00:38<00:14,  1.42it/s]

💾 Saved class cache (3828 total so far)


Fetching Classes:  73%|███████▎  | 53/73 [00:38<00:13,  1.54it/s]

💾 Saved class cache (3927 total so far)


Fetching Classes:  74%|███████▍  | 54/73 [00:39<00:11,  1.60it/s]

💾 Saved class cache (4027 total so far)


Fetching Classes:  75%|███████▌  | 55/73 [00:39<00:11,  1.55it/s]

💾 Saved class cache (4126 total so far)


Fetching Classes:  77%|███████▋  | 56/73 [00:40<00:11,  1.48it/s]

💾 Saved class cache (4226 total so far)


Fetching Classes:  78%|███████▊  | 57/73 [00:41<00:10,  1.55it/s]

💾 Saved class cache (4326 total so far)


Fetching Classes:  79%|███████▉  | 58/73 [00:41<00:09,  1.57it/s]

💾 Saved class cache (4426 total so far)


Fetching Classes:  81%|████████  | 59/73 [00:42<00:09,  1.51it/s]

💾 Saved class cache (4525 total so far)


Fetching Classes:  82%|████████▏ | 60/73 [00:43<00:09,  1.43it/s]

💾 Saved class cache (4625 total so far)


Fetching Classes:  84%|████████▎ | 61/73 [00:43<00:08,  1.48it/s]

💾 Saved class cache (4725 total so far)


Fetching Classes:  85%|████████▍ | 62/73 [00:44<00:08,  1.27it/s]

💾 Saved class cache (4825 total so far)


Fetching Classes:  86%|████████▋ | 63/73 [00:45<00:07,  1.33it/s]

💾 Saved class cache (4925 total so far)


Fetching Classes:  88%|████████▊ | 64/73 [00:46<00:06,  1.44it/s]

💾 Saved class cache (5024 total so far)


Fetching Classes:  89%|████████▉ | 65/73 [00:47<00:06,  1.24it/s]

💾 Saved class cache (5124 total so far)


Fetching Classes:  90%|█████████ | 66/73 [00:47<00:05,  1.34it/s]

💾 Saved class cache (5224 total so far)


Fetching Classes:  92%|█████████▏| 67/73 [00:48<00:04,  1.43it/s]

💾 Saved class cache (5324 total so far)


Fetching Classes:  93%|█████████▎| 68/73 [00:49<00:03,  1.47it/s]

💾 Saved class cache (5424 total so far)


Fetching Classes:  95%|█████████▍| 69/73 [00:49<00:02,  1.40it/s]

💾 Saved class cache (5522 total so far)


Fetching Classes:  96%|█████████▌| 70/73 [00:50<00:02,  1.48it/s]

💾 Saved class cache (5622 total so far)


Fetching Classes:  97%|█████████▋| 71/73 [00:51<00:01,  1.42it/s]

💾 Saved class cache (5720 total so far)


Fetching Classes:  99%|█████████▊| 72/73 [00:52<00:00,  1.35it/s]

💾 Saved class cache (5820 total so far)


Fetching Classes: 100%|██████████| 73/73 [00:53<00:00,  1.37it/s]

💾 Saved class cache (5911 total so far)
Final output saved as species_batch_001_with_class.csv





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [56]:
# Rebuild the species -> class table from the on-disk class cache.
class_df = pd.read_csv(class_cache_file)

# Merge using a full DataFrame join (safer than map).
# NOTE(review): this cell used to merge a frame called `df`, which no cell in
# the notebook defines (NameError on a fresh kernel). The output file name is
# derived from batch_filename, so the current batch frame is used -- confirm.
final_df = df_batch.merge(
    class_df,
    how="left",
    on="GENUS_SPECIES"
)

output_filename = batch_filename.replace(".csv", "_with_class.csv")
final_df.to_csv(output_filename, index=False)
print(f"Final output saved as {output_filename}")

✅ Final output saved as species_batch_001_with_class.csv


In [50]:
# Persist the in-memory species -> class mapping to the class cache CSV.
class_cache_df = (
    pd.DataFrame.from_dict(tax_class_dict, orient="index", columns=["taxonomic_class"])
    .reset_index()
    .rename(columns={"index": "GENUS_SPECIES"})
)
class_cache_df.to_csv(class_cache_file, index=False)

# Merge final results
# NOTE(review): this cell used to write into a frame called `df`, which no
# cell in the notebook defines (NameError on a fresh kernel). The full species
# table `df_all` is assumed here -- confirm. `assign` builds a new frame, so
# re-running the cell does not modify `df_all`.
species_with_class = df_all.assign(
    taxonomic_class=df_all["GENUS_SPECIES"].map(tax_class_dict)
)
output_path = "species_with_class_batched.csv"
species_with_class.to_csv(output_path, index=False)
output_path

'species_with_class_batched.csv'

In [44]:
# Sanity check: reload the first batch's result file and show its first rows.
df_check = pd.read_csv("species_batch_001_with_class.csv")
print(df_check.head(n=5))

   Unnamed: 0    GENUS_SPECIES taxonomic_class
0           1      Acer rubrum             NaN
1           2   Acer saccharum             NaN
2           3    Acer spicatum             NaN
3           4  Corylus cornuta             NaN
4           5  Populus pinnata             NaN


In [43]:
# Combine all results
# Match all per-batch output files, in batch order (zero-padded names sort
# correctly as plain strings).
batch_files = sorted(glob.glob("species_batch_*_with_class.csv"))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when no batch has been processed yet.
if not batch_files:
    raise FileNotFoundError(
        "No species_batch_*_with_class.csv files found; "
        "run the per-batch classification cell first."
    )

# Combine into one DataFrame
combined_df = pd.concat([pd.read_csv(f) for f in batch_files], ignore_index=True)

# Save the full result
combined_df.to_csv("all_species_with_class.csv", index=False)

In [None]:
#TESTING CODE