In [1]:
# imports
import json
import time
from urllib.parse import quote

import pandas as pd
import requests

In [2]:
# Load the tidy iNaturalist observations table.
df_observations = pd.read_csv("../files/tidy/inat_observations.csv")

# Each distinct taxon name only needs to be requested once.
# Missing names are dropped before deduplicating: a NaN mixed in with the
# strings would make the sort raise TypeError (str vs float) and cannot be
# looked up by name anyway.
request_list = sorted(df_observations["taxon.name"].dropna().unique().tolist())
total = len(request_list)
print(f"There are {total} unique taxon names")

# Save the request list: one name per row, without index or header.
taxa_df = pd.DataFrame(request_list)
taxa_df.to_csv('../files/tidy/inat_request_list.csv', index = False, header = False)

There are 257 unique taxon names


In [3]:
# Root of the WoRMS REST service; endpoint paths are concatenated onto it.
base_url = "https://www.marinespecies.org/rest"

# Endpoint paths, each written with its leading and trailing slash.
aphia_record_endpoint, classification_endpoint = (
    "/AphiaRecordsByName/",
    "/AphiaClassificationByAphiaID/",
)

# Accumulator for the API records; turned into a pandas DataFrame later on.
worms = list()

In [4]:
def get_aphia_records(scientific_name, timeout=30):
    """Fetch the records the WoRMS ``AphiaRecordsByName`` endpoint returns for a name.

    Parameters
    ----------
    scientific_name : str
        Name to look up. It is percent-encoded before being appended to the
        endpoint path.
    timeout : float, optional
        Seconds to wait for the server (default 30). Without a timeout a
        stalled connection would block the calling loop indefinitely.

    Returns
    -------
    object or None
        The decoded JSON body on success (callers in this notebook iterate it
        as a list of record dicts). ``None`` when the name is unusable, when
        the response body is empty, or when the request or the JSON decoding
        fails; failures are printed rather than raised.
    """
    if not isinstance(scientific_name, str) or not scientific_name.strip():
        print(f"Error for {scientific_name}: not a usable scientific name")
        return None

    # safe="" also encodes "/", so the name always stays one path segment.
    url = base_url + aphia_record_endpoint + quote(scientific_name, safe="")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # An empty body (e.g. HTTP 204 when nothing matches) is not an error;
        # calling .json() on it would only produce a misleading decode message.
        if response.status_code == 204 or not response.content:
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection, timeout and HTTP status failures.
        # ValueError: body that is not valid JSON.
        print(f"Error for {scientific_name}: {e}")
        return None

In [5]:
# start_time = time.time()
# for i in request_list:
#     print(f"Going through {i}")
#     result = get_aphia_records(i)
#     if result:
#         accepted = [record for record in result if record.get('status') == 'accepted' and record.get('kingdom') == 'Animalia']
#         worms.extend(accepted)
# end_time = time.time()

In [6]:
def get_target_rank(records, target_name):
    """Return the rank of the first usable record named exactly like the target.

    The target is lower-cased and its underscores are turned into spaces
    before it is compared with each record's lower-cased ``scientificname``.
    Only records whose ``status`` is "accepted" or "alternative
    representation" are considered. Returns ``None`` when nothing qualifies.
    """
    wanted = str(target_name).lower().replace("_", " ")
    usable_statuses = {"accepted", "alternative representation"}

    matching_ranks = (
        entry.get("rank")
        for entry in records
        if str(entry.get("scientificname") or "").lower() == wanted
        and entry.get("status") in usable_statuses
    )
    # Only the first match counts, even when its rank is missing.
    return next(matching_ranks, None)

In [7]:
# Query the API once per taxon name and keep the records whose name, status
# and rank match the requested taxon.

# Start from an empty list on every run, so re-executing this cell neither
# appends duplicate records nor fails when `worms` has meanwhile been rebound
# to a DataFrame.
worms = []
# Names for which nothing was kept; otherwise they would vanish silently.
unmatched_names = []
accepted_statuses = {"accepted", "alternative representation"}

start_time = time.time()
for taxon_name in request_list:
    query_name = taxon_name.replace("_", " ")
    print(f"Going through {taxon_name}")
    result = get_aphia_records(query_name)
    # Uncomment to inspect the raw API response:
    # print(json.dumps(result, indent=2))
    target_rank = get_target_rank(result, taxon_name) if result else None
    if not target_rank:
        unmatched_names.append(taxon_name)
        continue

    # Normalise the name the same way get_target_rank does (underscores to
    # spaces, lower case) so both checks agree on what counts as a match.
    normalized_name = query_name.lower()
    filtered_records = [
        record for record in result
        if record.get("status") in accepted_statuses
        and record.get("rank") == target_rank
        and str(record.get("scientificname") or "").lower() == normalized_name
    ]
    worms.extend(filtered_records)
end_time = time.time()

print(f"{len(unmatched_names)} of {len(request_list)} taxon names yielded no record")

Going through Abudefduf_troschelii
Going through Acanthurus_coeruleus
Going through Acanthurus_nigricans
Going through Acanthurus_xanthopterus
Going through Acropora_palmata
Going through Actiniaria
Going through Aetobatus_laticeps
Going through Agaricia_tenuifolia
Going through Ammotheidae
Going through Amphiodia_occidentalis
Going through Anemonia_viridis
Going through Aniculus_elegans
Going through Anisotremus_davidsonii
Going through Anisotremus_taeniatus
Going through Anisotremus_virginicus
Going through Antennariidae
Going through Anthopleura_elegantissima
Going through Anthopleura_sola
Going through Anthopleura_xanthogrammica
Going through Anthozoa
Going through Antillogorgia_americana
Going through Aplysia
Going through Aplysia_californica
Going through Aplysia_dactylomela
Going through Aplysia_punctata
Going through Aplysia_vaccaria
Going through Aplysina_gerardogreeni
Going through Apostichopus_californicus
Going through Arachnactidae
Going through Armina_californica
Going th

In [8]:
# benchmark
duration = end_time - start_time
# `minutes`/`seconds` instead of `min`/`s`: assigning to `min` would shadow the
# built-in function for every cell run afterwards in the same kernel.
minutes, seconds = divmod(int(duration), 60)
print(f"This lasted {minutes}:{seconds:02d} minutes")
# Lasted 4:09 minutes for 256 taxa without filtering
# Duration depends on internet connection

# After adding the status = accepted, it lasted 3:33 minutes
# Still have to check if there were any changes

# After adding status = accepted and kingdom = Animalia, it lasted 3:15 minutes

This lasted 3:26 minutes


In [9]:
# Turn the collected record dicts into a table; `worms` is a DataFrame from here on.
worms = pd.DataFrame(data=worms)

In [10]:
# Write the table to disk without the row index.
worms.to_csv(path_or_buf='../files/tidy/worms_output_test.csv', index=False)