In [13]:
import pandas as pd
import folium
import numpy as np

import requests
from requests.auth import HTTPBasicAuth
from io import StringIO
from dotenv import dotenv_values

# load config from .env file
config = dotenv_values()

In [8]:
def erddap_password_protected(query):
    """Download from a password protected erddap"""
    response = requests.get(
        query,                     
        auth = HTTPBasicAuth(config['user'], config['password']),
        stream=True
        )
    if response.status_code == 200:
        return StringIO(response.text) 
    else:
        print(response.text)



# Review BIO Stations

In [6]:
# Get list of all the stations and lat/long from the erddap dataset:
url = f"{config['server']}/tabledap/{config['dataset']}.csv?station%2Cid%2Clatitude%2Clongitude&time%3E=1995-07-06T14%3A59%3A10Z&time%3C=2020-12-22T13%3A11%3A30Z&distinct()"
df = pd.read_csv(erddap_password_protected(url),skiprows=[1])

# Get average lat/lon per station and how many files are associated
df_mean_station = df.groupby(['station'])[['latitude','longitude']].mean()
df_station_count = df.groupby(['station'])[['id']].count()

df_stations = df_mean_station.join(df_station_count).rename(columns={'id':'profile_count'})

ValueError: Invalid file path or buffer object type: <class 'NoneType'>

In [12]:
# Retrieve reference stations from 
df_ref = pd.read_csv('../reference_stations.csv').set_index('station').drop(columns=['group','comments','depth_m']).add_prefix('ref_')

In [13]:
# Combine reference stations to the mean location
df = df_stations.join(df_ref)
# df.to_csv('DFO-BIO_station_mean_location.csv')
df

Unnamed: 0_level_0,latitude,longitude,profile_count,ref_longitude,ref_latitude
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AR7W01,53.676095,-55.550619,21,-55.55000,53.68000
AR7W02,53.795911,-55.437044,18,-55.43667,53.79667
AR7W03,53.987005,-55.250330,20,-55.25000,53.98833
AR7W04,54.220464,-55.025759,22,-55.02333,54.22000
AR7W05,54.490000,-54.754295,22,-54.75667,54.49167
...,...,...,...,...,...
YL06,43.398540,-68.663020,10,-68.66400,43.39900
YL07,43.328800,-69.106780,10,-69.10600,43.32800
YL08,43.258300,-69.557660,10,-69.55700,43.25800
YL09,43.186500,-70.009260,10,-70.01000,43.18600


In [157]:
# Show on a map
m = folium.Map(location=[df['latitude'].mean(),df['longitude'].mean()],zoom_start=4)
for station, row in df.iterrows():
    if -90< row['ref_latitude']<90 :
        folium.Marker(location=[row['ref_latitude'],row['ref_longitude']],popup=station).add_to(m)
    else:
        folium.Marker(location=[row['latitude'],row['longitude']],popup=f"{station}: {int(row['profile_count'])} files related",icon=folium.Icon(color="red", icon="info-sign"),).add_to(m)

m

In [200]:
# get list of stations for which we don't have a reference lat/lon
df.query(' not -90<ref_latitude<90')

Unnamed: 0_level_0,latitude,longitude,profile_count,ref_longitude,ref_latitude
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LC09,46.8713,-58.6354,2,,
LC14,45.7321,-56.8519,2,,
LC15,45.3467,-56.9531,2,,
LC20,45.3605,-56.6737,2,,
LC23,45.9754,-57.647,2,,
LC24,46.6359,-57.9221,2,,
LC25,46.6302,-57.6328,2,,
Prince 5,44.932011,-66.849721,440,,


# Review ERDDAP
    

In [209]:
variables = [
    ("PSLTZZ01",-10),
    ("PSLTZZ02",-10),
    ("TEMPP682",-10),
    ("CNDCST01",-1),
    ("CNDCST02",-1),
    ("DOXYZZ01",-10),
    ("DOXYZZ02",-10),
    ("CDOMZZ01",-1),
    ("CDOMZZ02",-1),
    ("CPHLPR01",-10),
    ("CPHLPR02",-10),
    ("IRRDUV01",-10),
]

df_bad_values = pd.DataFrame()
for var, lim in variables:
    condition = f"{var}<{lim}"
    try:
        df_temp = pd.read_csv(erddap_password_protected(f"{config['server']}/tabledap/{config['dataset']}.csv?id&{condition}&distinct()"),index_col='id')
    except:
        print(f'failed condition {condition}')
        continue
    df_temp[condition] = True
    df_bad_values = df_bad_values.join(df_temp, how='outer')

# Output result to a csv
df_bad_values.replace({pd.NA:False}).to_csv('bad_values.csv')
print(f"{len(df_bad_values)} files seems to accept the conditions.")