In [None]:
# Searches an ERDDAP Server based on a time range and then filters those 
# datasets by a list of standard names to find the appropriate variable names.

import requests
import httpx
import pandas as pd
from erddapy import ERDDAP

# Base URL of the ERDDAP server that every query in this notebook targets.
server = "https://cioosatlantic.ca/erddap"
# Shared erddapy client for that server; later cells reuse it for searches,
# metadata lookups and data downloads.
e = ERDDAP(server=server)

# Extracts a value from the ERDDAP metadata pandas DataFrame. The NC_GLOBAL
# variable and the "attribute" row type are assumed as defaults; for
# variable-specific values you'll need to specify the variable name.
def erddap_meta(metadata, attribute_name, row_type="attribute", var_name="NC_GLOBAL"):
    """Look up one attribute in an ERDDAP metadata DataFrame.

    The frame is filtered on its "Variable Name" and "Attribute Name" columns
    and the first matching row supplies the result.

    Example (global attribute): ``erddap_meta(metadata, "uuid")["value"]``

    Args:
        metadata: DataFrame with "Variable Name", "Attribute Name", "Value"
            and "Data Type" columns.
        attribute_name: Attribute to look for (e.g. "standard_name").
        row_type: Only echoed in the "not found" message; it is not used to
            filter rows.
        var_name: Variable whose attribute is wanted; defaults to the global
            attributes ("NC_GLOBAL").

    Returns:
        dict with keys "value" and "type". Both are None when no row matches,
        in which case a message is printed instead of raising.
    """
    found = {"value": None, "type": None}

    # Build the row filter once and reuse it for both columns.
    same_variable = metadata["Variable Name"] == var_name
    same_attribute = metadata["Attribute Name"] == attribute_name
    matching_rows = metadata[same_variable & same_attribute]

    try:
        # Indexing position 0 of an empty selection raises IndexError.
        found["value"] = matching_rows["Value"].values[0]
        found["type"] = matching_rows["Data Type"].values[0]
    except IndexError:
        print(
            f"IndexError (Not found?) extracting ERDDAP Metadata: attribute: {attribute_name}, row_type: {row_type}, var_name: {var_name}"
        )

    return found


In [None]:
# Define the list of standard names to query ERDDAP with, because the actual
# variable names vary from dataset to dataset.
#
# Querying by standard name gives us a universal way to interrogate a dataset;
# the resulting variable names can then be used to look up long names and
# other attributes.

standard_names = [
    "wind_speed",
    "wind_speed_of_gust",
    "wind_from_direction",
    "air_temperature",
    "air_pressure",
    "relative_humidity",
    "sea_surface_temperature",
    "sea_surface_wave_significant_height",
    "sea_surface_wave_maximum_height",
    "sea_surface_wave_maximum_period",
    "sea_surface_wave_from_direction",
    "sea_surface_wave_mean_period",
    "sea_surface_wave_zero_upcrossing_period"
]

# Date range for Hurricane FIONA (2022)
min_time = "2022-09-20"
max_time = "2022-09-30"

# Get the dataset list based on the date range (other search attributes, such
# as a bounding box, could be added here if desired).
search_url = e.get_search_url(response="csv", min_time=min_time, max_time=max_time)
# Load the CSV search results into a DataFrame.
search = pd.read_csv(search_url)

# Dataset IDs taken from the "Dataset ID" column of the search results.
dataset_list = search["Dataset ID"].values

In [None]:
# Show the dataset IDs returned by the search.
dataset_list

In [None]:
# For every dataset returned by the search, ask ERDDAP which of its variables
# carry one of the standard names listed above, and keep the datasets that
# have at least one match together with their metadata.

final_dataset_list = {}

for dataset_id in dataset_list:
    dataset_vars = e.get_var_by_attr(
        dataset_id,
        standard_name=lambda std_name: std_name in standard_names,
    )

    # Nothing of interest in this dataset: report it and move on.
    if not dataset_vars:
        print(dataset_id, "Doesn't have any matching variables.")
        continue

    # Build the "info" URL for this dataset and load its metadata table.
    metadata_url = e.get_download_url(
        dataset_id=f"{dataset_id}/index",
        response="csv",
        protocol="info",
    )
    metadata = pd.read_csv(metadata_url)

    # Store the variables of interest (always prefixed with time/latitude/
    # longitude) alongside the dataset's metadata.
    final_dataset_list[dataset_id] = dict(
        vars=["time", "latitude", "longitude"] + dataset_vars,
        meta=metadata,
    )



In [None]:
# Iterate through the selected datasets. For each one, build a mapping from
# variable name to "standard_name|units|long_name", then download the data for
# the requested time range and relabel the DataFrame columns with that mapping.
for dataset_id, dataset_info in final_dataset_list.items():
    print(dataset_id)

    # Both lookups are the same for every variable, so do them once per dataset.
    metadata = dataset_info["meta"]
    dataset_variables = dataset_info["vars"]

    # A dictionary to hold the variable name mappings
    replace_cols = {}

    for var in dataset_variables:
        standard_name = erddap_meta(metadata=metadata, attribute_name="standard_name", var_name=var)["value"]
        units = erddap_meta(metadata=metadata, attribute_name="units", var_name=var)["value"]
        long_name = erddap_meta(metadata=metadata, attribute_name="long_name", var_name=var)["value"]

        # Time columns usually carry a "seconds since ..." unit; label those as
        # UTC instead. erddap_meta returns None when a variable has no units
        # attribute, so only inspect the value when it is actually a string.
        if isinstance(units, str) and "seconds since" in units:
            units = "UTC"

        replace_cols[var] = f"{standard_name}|{units}|{long_name}"
        print(var, " => ", standard_name)

    print(replace_cols)

    # Once variable names have been mapped, point the shared ERDDAP client at
    # this dataset and restrict the request to the time range of interest.
    e.protocol = "tabledap"
    e.dataset_id = dataset_id
    e.variables = dataset_variables
    e.constraints = {
        "time>=": min_time,
        "time<=": max_time
    }

    try:
        df = e.to_pandas()

        # !!! Uncomment this block to move time to the dataframe index and remove the original column !!!
        #
        # df["time (UTC)"] = pd.to_datetime(df["time (UTC)"])
        # df.set_index(df['time (UTC)'], inplace=True)
        # df.drop("time (UTC)", axis="columns", inplace=True)
        # del replace_cols['time']

        # Remap columns to incorporate standard name, long name and units.
        # The new labels are assigned by position, so this relies on the
        # returned columns being in the same order as the requested variables.
        df.columns = [f"{col} ({label})" for col, label in replace_cols.items()]
        print(df.head(3))

    except (requests.HTTPError, httpx.HTTPError) as ex:
        print("HTTPError", ex)
        print(f" - No data found for time range: {min_time} - {max_time}")

    print("\n")
