In [2]:
# imports
import time
from urllib.parse import quote

import pandas as pd
import requests

In [3]:
# Load the tidy observations table produced earlier in the pipeline.
df_observations = pd.read_csv("../files/tidy/observations.csv")

# Each taxon only has to be looked up once, so keep the distinct names only.
taxon_names = df_observations["taxon.name"]
request_list = taxon_names.unique().tolist()

In [4]:
# WoRMS REST API: common root plus the endpoint paths appended to it.
# (The "/AphiaIDByName/" endpoint was tried earlier and is no longer used.)
base_url = "https://www.marinespecies.org/rest"
classification_endpoint = "/AphiaClassificationByAphiaID/"
aphia_record_endpoint = "/AphiaRecordsByName/"

# Accumulates one dict per result; turned into a pandas DataFrame later on.
worms_output = list()

In [5]:
# Fetch every WoRMS record matching each observed taxon, plus the
# classification of each record's valid AphiaID.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell


def fetch_worms_json(url):
    """GET a WoRMS REST URL and return the decoded JSON payload.

    Returns None when the service replies 204 (nothing found) or with an
    empty body, because calling ``.json()`` on such a response raises.

    Raises:
        requests.RequestException: on connection errors, timeouts or
            4xx/5xx status codes.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    if response.status_code == 204 or not response.content:
        return None
    return response.json()


# Several records often share one valid AphiaID (synonyms pointing at the same
# accepted taxon), so remember each classification instead of re-downloading it.
classification_cache = {}

for taxon in request_list:
    try:
        print(f"Processing taxon: {taxon}")

        # Percent-encode the name so characters such as "/", "?" or "#"
        # cannot alter the structure of the request URL.
        records = fetch_worms_json(base_url + aphia_record_endpoint + quote(taxon, safe=""))

        if not records:
            print(f"  No records found for {taxon}.")
            worms_output.append({"observation_taxon": taxon, "Error": "No records found"})
            continue

        print(f"  {len(records)} record(s) found for {taxon}.")
        for record in records:
            valid_aphiaID = record.get("valid_AphiaID", None)

            if not valid_aphiaID:
                # Keep the record anyway so it can be inspected later.
                print("    No valid AphiaID for this record.")
                worms_output.append({"observation_taxon": taxon, "record": record, "Error": "No valid AphiaID"})
                continue

            print(f"    Valid AphiaID found: {valid_aphiaID}. Retrieving classification...")
            if valid_aphiaID not in classification_cache:
                classification_cache[valid_aphiaID] = fetch_worms_json(
                    base_url + classification_endpoint + str(valid_aphiaID)
                )
            taxa_breakdown = classification_cache[valid_aphiaID]

            # Store the record together with its classification (if any).
            taxon_dict = {"observation_taxon": taxon, "record": record}
            if taxa_breakdown:
                taxon_dict.update(taxa_breakdown)
                print(f"    Classification retrieved successfully for {taxon}.")
            else:
                taxon_dict["Error"] = "No classification found"
                print(f"    No classification found for AphiaID {valid_aphiaID}.")
            worms_output.append(taxon_dict)

    except Exception as e:
        # Best effort: log the failure for this taxon and carry on with the
        # rest, so one bad request does not abort the whole run.
        print(f"  Error processing {taxon}: {e}")
        worms_output.append({"observation_taxon": taxon, "Error": str(e)})

Processing taxon: Haliotis_tuberculata
  7 record(s) found for Haliotis_tuberculata.
    Valid AphiaID found: 140059. Retrieving classification...
    Classification retrieved successfully for Haliotis_tuberculata.
    Valid AphiaID found: 146456. Retrieving classification...
    Classification retrieved successfully for Haliotis_tuberculata.
    Valid AphiaID found: 146456. Retrieving classification...
    Classification retrieved successfully for Haliotis_tuberculata.
    Valid AphiaID found: 596236. Retrieving classification...
    Classification retrieved successfully for Haliotis_tuberculata.
    Valid AphiaID found: 140059. Retrieving classification...
    Classification retrieved successfully for Haliotis_tuberculata.
    Valid AphiaID found: 180881. Retrieving classification...
    Classification retrieved successfully for Haliotis_tuberculata.
    Valid AphiaID found: 180881. Retrieving classification...
    Classification retrieved successfully for Haliotis_tuberculata.
Proce

In [None]:
# Bring me the AphiaID and taxonomic breakdown for each taxon (one row per taxon).
# First tries are taking around three minutes, so time the run for further
# comparisons and improvements.
# NOTE: results are appended to the shared `worms_output` list; re-create that
# list first if the per-record loop has already filled it.
start_time = time.time()
for taxon in request_list:
    try:
        # params = {"is_extant": "true", "status": "accepted", "kingdom": "Animalia"}
        aphia_record_response = requests.get(
            # Percent-encode the name so special characters cannot break the URL.
            base_url + aphia_record_endpoint + quote(taxon, safe=""),
            timeout=30,  # seconds; avoids hanging forever on a stalled connection
        )
        aphia_record_response.raise_for_status()
        # WoRMS replies 204 with an empty body when nothing matches; .json()
        # would raise on that, so treat it as "no records".
        if aphia_record_response.status_code == 204:
            aphia_record = []
        else:
            aphia_record = aphia_record_response.json()

        # AphiaRecordsByName returns a list of records. Pick the AphiaID to
        # classify: prefer a record whose status is "accepted", otherwise fall
        # back to the first record that carries a valid AphiaID.
        candidates = [rec for rec in aphia_record if rec.get("valid_AphiaID")]
        accepted = [rec for rec in candidates if rec.get("status") == "accepted"]
        chosen = (accepted or candidates or [None])[0]

        if chosen is not None:
            aphia_id = chosen["valid_AphiaID"]
            print(f"  Record found: {aphia_record}...")
            classification_response = requests.get(
                base_url + classification_endpoint + str(aphia_id), timeout=30
            )
            classification_response.raise_for_status()
            taxa_breakdown = classification_response.json()
            taxon_dict = {"observation_taxon": taxon}
            taxon_dict.update(taxa_breakdown)
            worms_output.append(taxon_dict)
            print(f"  Classification retrieved for {taxon}.")
        else:
            worms_output.append({"observation_taxon": taxon, "Error": "No AphiaID found"})
    except Exception as e:
        # Best effort: record the failure and continue with the next taxon.
        worms_output.append({"observation_taxon": taxon, "Error": str(e)})
end_time = time.time()


In [None]:
# Benchmark: report how long the fetch loop took as minutes:seconds.
duration = end_time - start_time
# divmod splits the whole seconds in one step; the names avoid shadowing the
# built-in min().
minutes, seconds = divmod(int(duration), 60)
# Zero-pad the seconds so that e.g. 3 min 5 s prints as "3:05", not "3:5".
print(f"This lasted {minutes}:{seconds:02d} minutes")

In [74]:
# Store the collected result dicts in a pandas DataFrame (one row per dict).
worms_df2 = pd.DataFrame(data=worms_output)

# There are missing values that need fixing before exporting.
# result_df.to_csv("../files/tidy/worms_output_csv", index=False)


In [75]:
# Troubleshooting: isolate the rows that came back without a WoRMS record.
missing_record_mask = worms_df2["record"].isna()
fix_df = worms_df2[missing_record_mask]
fix_df.columns.tolist()  # list the columns so the unneeded ones can be dropped


['observation_taxon',
 'record',
 'AphiaID',
 'rank',
 'scientificname',
 'child',
 'Error']

In [76]:
# Only the taxon name and the error message are needed to see what is going on.
unneeded_columns = ["record", "AphiaID", "rank", "scientificname", "child"]
fix_df1 = fix_df.drop(columns=unneeded_columns)
# fix_df.to_csv('../files/tidy/log_issues.csv', index=False)




# Issues and Ideas

After manually reviewing the entries, I decided to run observations.csv through the WoRMS Match Taxa tool.
The main finding is that, as the API documentation states, ambiguous names return -999.
- perhaps using status = accepted as a filter may reduce errors
    - added also kingdom = animalia and is_extant = true
    - tried it with the parameters but didn't work
- Some names have several exact matches of which only one is accepted, e.g. Terebellidae: a family of annelids (accepted) and another entry for gastropods (unaccepted).
- How to handle multiple exact values? For example, Echinacea has https://marinespecies.org/aphia.php?p=taxdetails&id=1076372 for plants and https://marinespecies.org/aphia.php?p=taxdetails&id=149855 for echinoids

Species that returned no issues with Match Taxa:
- Pinctada mazatlanica
- Echinaster (Othilia) spinulosus
    - Echinaster as a genus is not accepted in WoRMS.
- Mithrodia bradleyi


 Ambiguous:  
- Nuttallina
- Aplysia
- Holothuria
- Pennaria 
- Ctenophora. It returns four matches, whereas the other names return just two, one of which is accepted. Of these four, two are unaccepted, and the other accepted entry is a genus in the kingdom Chromista.
- Tubastraea coccinea. There is an uncertain entry.

Use new endpoint AphiaRecordsByName and filter values. 

# Notes
Fetching genus-level records will also look for direct children, including species and subspecies. This took 34 min and 41.5 s to run for 189 taxa.