Goal: break down the list of ~6k data sources by platform type (buoy, glider, ship) and by measured-variable keyword (chl|turb|sedi|par|phot|secchi|ssc)

Order of operations:
- load all_relevant_stations.pkl
- sort by chl|turb|sedi|par|phot|secchi|ssc
- sort those by buoy/glider/ship

In [15]:
from erddapy import ERDDAP
import pandas as pd
import numpy as np
import urllib.request
import json 
import re

# Lift pandas' display truncation so full rows and columns are shown.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)


In [16]:
# Regex of variable-name fragments that mark a dataset as relevant.
# It is matched case-insensitively against the "Variable Name" column of a
# dataset's metadata table further down in the notebook.
searchterms = r"chl|turb|sedi|par|phot|secchi|ssc"

# load a list of all the relevant ERDDAPs and their urls.
# NOTE(review): unpickling can execute arbitrary code - only load a pickle that
# this notebook itself produced.
FF = pd.read_pickle("all_relevant_stations.pkl")

# The variable-name search needs a per-dataset metadata table, which does not
# exist yet at this point of a fresh run, so it is not performed in this cell.

# show one example row
FF.loc[1000]


sites                             exploratorium-pco2-buoy
url                     http://erddap.cencoos.org/erddap/
time_coverage_start                  2013-05-08T22:47:00Z
time_coverage_end                    2024-07-01T16:17:00Z
geospatial_lat_min                                37.8019
geospatial_lat_max                                37.8019
geospatial_lon_min                              -122.3978
geospatial_lon_max                              -122.3978
geospatial_lat_units                        degrees_north
geospatial_lon_units                         degrees_east
check_sum                                             1.0
Name: 1000, dtype: object

In [18]:
# Build the metadata ("info") URL for one example station and load it.
jj = 1000  # index label of the example row in FF
e = ERDDAP(server=FF.loc[jj, "url"], protocol="tabledap", response="csv")
e.dataset_id = FF.loc[jj, "sites"]
# e.constraints = {"time>=": min_time, "time<=": max_time}
info_url = e.get_info_url()

# dataframe holding all the metadata rows for this station
ds_info = pd.read_csv(info_url)
ds_info

Unnamed: 0,Row Type,Variable Name,Attribute Name,Data Type,Value
0,attribute,NC_GLOBAL,cdm_data_type,String,TimeSeries
1,attribute,NC_GLOBAL,cdm_timeseries_variables,String,"station,longitude,latitude"
2,attribute,NC_GLOBAL,contributor_email,String,"dana.greeley@noaa.gov,cencoos_communications@m..."
3,attribute,NC_GLOBAL,contributor_name,String,"NOAA Pacific Marine Environmental Lab (PMEL),C..."
4,attribute,NC_GLOBAL,contributor_role,String,"collaborator,funder,contributor,processor"
5,attribute,NC_GLOBAL,contributor_role_vocabulary,String,NERC
6,attribute,NC_GLOBAL,contributor_url,String,"https://www.pmel.noaa.gov/co2/,http://cencoos...."
7,attribute,NC_GLOBAL,Conventions,String,"IOOS-1.2, CF-1.6, ACDD-1.3, NCCSV-1.2"
8,attribute,NC_GLOBAL,creator_country,String,USA
9,attribute,NC_GLOBAL,creator_email,String,stung@exploratorium.edu


In [None]:
# # try searching for all possibly matching stations for this time (there will be a ton)
# min_time = "2018-10-01T00:00:00Z"
# max_time = "2024-01-01T00:00:00Z"

# kw = {
#     "min_time": min_time,
#     "max_time": max_time,
# }

# FF["time_coverage_start"] = np.nan
# FF["time_coverage_end"] = np.nan
# FF["geospatial_lat_min"] = np.nan
# FF["geospatial_lat_max"] = np.nan
# FF["geospatial_lon_min"] = np.nan
# FF["geospatial_lon_max"] = np.nan
# FF["geospatial_lat_units"] = ''
# FF["geospatial_lon_units"] = ''
# FF["check_sum"] = 1





In [None]:
# keep 'latitude' 'longitude' 'time' variables (min/max of each?)
#
# Screen every station listed in FF: a station is kept only when its ERDDAP
# metadata can be read and lists at least one variable matching `searchterms`.
# For kept stations the coverage attributes below are copied into FF.
searchterms = r"chl|turb|sedi|par|phot|secchi|ssc"

# Global attributes copied from a dataset's metadata into the FF column of the
# same name.
coverage_attributes = [
    "time_coverage_start",
    "time_coverage_end",
    "geospatial_lat_min",
    "geospatial_lat_max",
    "geospatial_lon_min",
    "geospatial_lon_max",
    "geospatial_lat_units",
    "geospatial_lon_units",
]


def _read_station_info(server, dataset_id):
    """Download the ERDDAP metadata table of one dataset.

    Parameters
    ----------
    server : str
        Base URL of the ERDDAP server.
    dataset_id : str
        Dataset identifier on that server.

    Returns
    -------
    pandas.DataFrame or None
        The metadata table, or None when the URL cannot be read/parsed or the
        table lacks the columns used below (some of these urls are bogus).
    """
    e = ERDDAP(server=server, protocol="tabledap", response="csv")
    e.dataset_id = dataset_id
    # No time constraints are set: the metadata URL is built from the server
    # and the dataset id alone, so one download per station is enough.
    info_url = e.get_info_url()
    try:
        info = pd.read_csv(info_url)
    except Exception:
        # Deliberately not a bare `except:` so that an interrupt (Ctrl-C /
        # "stop kernel") still stops this long-running loop.
        return None
    if not {"Variable Name", "Attribute Name", "Value"}.issubset(info.columns):
        return None
    return info


def _single_attribute_value(info, attribute_name):
    """Return the 'Value' of the single row named `attribute_name`, else None.

    None is returned when the attribute is missing or listed more than once,
    so the corresponding FF entry is simply left as it was.
    """
    matches = info.loc[info["Attribute Name"] == attribute_name, "Value"]
    if len(matches) != 1:
        return None
    return matches.item()


# Rows start out as "kept"; rejected rows get NaN here and are dropped at the end.
if "check_sum" not in FF.columns:
    FF["check_sum"] = 1.0

# Iterate over the index labels rather than range(len(FF)): FF is reloaded from
# a pickle written after rows were dropped, so its labels may have gaps.
for count, jj in enumerate(FF.index):
    if count % 100 == 0:
        print(count)  # give a printout every 100 for my sanity

    info = _read_station_info(FF.loc[jj, "url"], FF.loc[jj, "sites"])
    if info is None:
        FF.loc[jj, "check_sum"] = np.nan
        continue

    # search the Variable Names for relevant measurements
    has_relevant_variable = (
        info["Variable Name"]
        .str.contains(searchterms, flags=re.IGNORECASE, na=False)
        .any()
    )
    if not has_relevant_variable:  # ignore this site
        FF.loc[jj, "check_sum"] = np.nan
        continue

    # some metadata is missing - leave blank if any is empty
    for attribute_name in coverage_attributes:
        value = _single_attribute_value(info, attribute_name)
        if value is not None:
            FF.loc[jj, attribute_name] = value

FF = FF.dropna(subset=["check_sum"])
FF.to_pickle("all_relevant_stations.pkl")


In [None]:
# Strip the bookkeeping and unit columns, write the remaining table to CSV,
# then report how many stations were kept and display them.
FF_save = FF.drop(
    columns=["check_sum", "geospatial_lat_units", "geospatial_lon_units"]
)
FF_save.to_csv("erddap_station_search_results.csv", index=False)
print(len(FF_save))
FF_save

In [None]:
# # keep 'latitude' 'longitude' 'time' variables (min/max of each?)
# searchterms = r"chl|turb|sedi|par|phot|secchi|ssc"

# for jj in range(len(FF)):

#     # make the info URL for this site for this time range
#     e = ERDDAP(server=FF['url'][jj], protocol="tabledap", response="csv")
#     e.dataset_id = FF['sites'][jj]
#     e.constraints = {"time>=": min_time, "time<=": max_time}
#     info_url = e.get_info_url()
# #     print(info_url)

#     # make a dataframe for all the metadata for this station
#     df = pd.read_csv(info_url)

#     # search the Variable Names for relevant measurements and minimum amount of metadata (lat/lon)
#     idx = df["Variable Name"].str.contains(searchterms, flags = re.IGNORECASE)
#     idx2 = df["Attribute Name"].str.contains(r"geospatial_lat_min")
#     if (any(idx==True)==True) & (any(idx2==True)==True): # if there IS a relevant variable
#         # save the entire time range for the station and its location
#         e.constraints = {}
#         info_url = e.get_info_url() # just get metadata instead of downloading data(e.get_download_url)
#         # make a dataframe for all the metadata for this station
#         df = pd.read_csv(info_url)
        
#         FF.loc[jj,"min_time"] = df.loc[df['Attribute Name']=='time_coverage_start', 'Value'].item()
#         FF.loc[jj,"max_time"] = df.loc[df['Attribute Name']=='time_coverage_end', 'Value'].item()
#         FF.loc[jj,"min_lat"] = df.loc[df['Attribute Name']=='geospatial_lat_min', 'Value'].item()
#         FF.loc[jj,"max_lat"] = df.loc[df['Attribute Name']=='geospatial_lat_max', 'Value'].item()
#         FF.loc[jj,"min_lon"] = df.loc[df['Attribute Name']=='geospatial_lon_min', 'Value'].item()
#         FF.loc[jj,"max_lon"] = df.loc[df['Attribute Name']=='geospatial_lon_max', 'Value'].item()
#         FF.loc[jj,"lat_unit"] = df.loc[df['Attribute Name']=='geospatial_lat_units', 'Value'].item()
#         FF.loc[jj,"lon_unit"] = df.loc[df['Attribute Name']=='geospatial_lon_units', 'Value'].item()
#     else: # otherwise ignore this site
#         print(str(jj),' has no relevant data')
    
# FF