In [1]:
# Import libraries
import os, shutil, sys, time, re, requests, csv, datetime, pytz
import yaml
import pandas as pd
import numpy as np
import netCDF4 as nc
import xarray as xr
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the OOI M2M tool
# NOTE(review): this is an absolute, user-specific path; presumably m2m.py lives in
# this directory -- TODO confirm and make the location configurable before sharing.
sys.path.append("/home/andrew/Documents/OOI-CGSN/ooinet/ooinet/")
from m2m import M2M

In [3]:
# Import user info for connecting to OOINet via M2M.
# The credentials file is opened in a context manager so the handle is closed
# as soon as the YAML has been parsed (the bare open() call leaked it).
with open("../../../../QAQC_Sandbox/user_info.yaml") as user_info_file:
    userinfo = yaml.load(user_info_file, Loader=yaml.FullLoader)
username = userinfo["apiname"]
token = userinfo["apikey"]

In [4]:
OOINet = M2M(username, token)

In [5]:
OOINet.URLS

{'data': 'https://ooinet.oceanobservatories.org/api/m2m/12576/sensor/inv',
 'anno': 'https://ooinet.oceanobservatories.org/api/m2m/12580/anno/find',
 'vocab': 'https://ooinet.oceanobservatories.org/api/m2m/12586/vocab/inv',
 'asset': 'https://ooinet.oceanobservatories.org/api/m2m/12587',
 'deploy': 'https://ooinet.oceanobservatories.org/api/m2m/12587/events/deployment/inv',
 'preload': 'https://ooinet.oceanobservatories.org/api/m2m/12575/parameter',
 'cal': 'https://ooinet.oceanobservatories.org/api/m2m/12587/asset/cal',
 'fileServer': 'https://opendap.oceanobservatories.org/thredds/fileServer/',
 'dodsC': 'https://opendap.oceanobservatories.org/thredds/dodsC/'}

In [6]:
# Reset the M2M location to ooinet-dev1-west.intra.oceanobservatories.org
# Each URL has its host prefix ("ooinet" or "opendap") extended with the dev1
# suffix. URLs that already carry the suffix are skipped, so re-running this
# cell does not append the suffix a second time.
DEV1_SUFFIX = "-dev1-west.intra"
for key, url in OOINet.URLS.items():
    if DEV1_SUFFIX in url:
        continue  # already pointing at dev1
    host_prefix = "opendap" if "opendap" in url else "ooinet"
    # Replace only the first occurrence so the rest of the URL is left untouched
    OOINet.URLS[key] = url.replace(host_prefix, host_prefix + DEV1_SUFFIX, 1)

In [7]:
OOINet.URLS

{'data': 'https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12576/sensor/inv',
 'anno': 'https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12580/anno/find',
 'vocab': 'https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12586/vocab/inv',
 'asset': 'https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12587',
 'deploy': 'https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12587/events/deployment/inv',
 'preload': 'https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12575/parameter',
 'cal': 'https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12587/asset/cal',
 'fileServer': 'https://opendap-dev1-west.intra.oceanobservatories.org/thredds/fileServer/',
 'dodsC': 'https://opendap-dev1-west.intra.oceanobservatories.org/thredds/dodsC/'}

In [8]:
OOINet.search_datasets()

https://ooinet-dev1-west.intra.oceanobservatories.org/api/m2m/12576/sensor/inv


Unnamed: 0,array,node,instrument,refdes,url,deployments
0,RS03INT2,MJ03D,06-BOTPTA303,RS03INT2-MJ03D-06-BOTPTA303,https://ooinet-dev1-west.intra.oceanobservator...,[1]
1,RS03ECAL,MJ03E,06-BOTPTA302,RS03ECAL-MJ03E-06-BOTPTA302,https://ooinet-dev1-west.intra.oceanobservator...,[1]
2,RS03CCAL,MJ03F,05-BOTPTA301,RS03CCAL-MJ03F-05-BOTPTA301,https://ooinet-dev1-west.intra.oceanobservator...,[1]
3,RS03AXPD,DP03A,06-DOSTAD304,RS03AXPD-DP03A-06-DOSTAD304,https://ooinet-dev1-west.intra.oceanobservator...,"[1, 2, 3, 4, 5, 6, 7]"
4,RS03AXPD,DP03A,03-FLNTUA302,RS03AXPD-DP03A-03-FLNTUA302,https://ooinet-dev1-west.intra.oceanobservator...,"[1, 2, 3, 4, 5, 6, 7]"
5,RS03AXPD,DP03A,03-FLCDRA302,RS03AXPD-DP03A-03-FLCDRA302,https://ooinet-dev1-west.intra.oceanobservator...,"[1, 2, 3, 4, 5, 6, 7]"
6,RS03AXPD,DP03A,02-VEL3DA303,RS03AXPD-DP03A-02-VEL3DA303,https://ooinet-dev1-west.intra.oceanobservator...,"[1, 2, 3, 4, 5, 6, 7]"
7,RS03AXPD,DP03A,01-CTDPFL304,RS03AXPD-DP03A-01-CTDPFL304,https://ooinet-dev1-west.intra.oceanobservator...,"[1, 2, 3, 4, 5, 6, 7]"
8,RS01SBPS,PC01A,4A-DOSTAD103,RS01SBPS-PC01A-4A-DOSTAD103,https://ooinet-dev1-west.intra.oceanobservator...,"[1, 2, 3, 4, 5, 6, 7, 8]"
9,RS01SBPS,PC01A,4A-CTDPFA103,RS01SBPS-PC01A-4A-CTDPFA103,https://ooinet-dev1-west.intra.oceanobservator...,"[1, 2, 3, 4, 5, 6, 7, 8]"


In [64]:
refdes = "GI03FLMA-RIS01-04-PHSENF000"

In [65]:
OOINet.get_datastreams(refdes)

Unnamed: 0,refdes,method,stream
0,GI03FLMA-RIS01-04-PHSENF000,recovered_inst,phsen_abcdef_instrument
1,GI03FLMA-RIS01-04-PHSENF000,recovered_inst,phsen_abcdef_metadata


In [66]:
method = "recovered_inst"
stream = "phsen_abcdef_instrument"

In [12]:
# NOTE(review): hard-coded THREDDS catalog URL for a CTDBP request; the next cell
# reassigns thredds_url from OOINet.get_thredds_url(...), so this value is not used
# when the notebook is run top to bottom.
thredds_url = "https://opendap-dev1-west.intra.oceanobservatories.org/thredds/catalog/ooi/areed@whoi.edu/20211207T183204327Z-CP03ISSM-RID27-03-CTDBPC000-recovered_inst-ctdbp_cdef_instrument_recovered/catalog.html"

In [67]:
thredds_url = OOINet.get_thredds_url(refdes, method, stream)
thredds_url

Waiting for GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument to process.
Waiting: 100%|████████████████████████████████| 400/400 [01:03<00:00,  6.34it/s]


'https://opendap-dev1-west.intra.oceanobservatories.org/thredds/catalog/ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/catalog.html'

In [68]:
catalog = OOINet.get_thredds_catalog(thredds_url)
catalog

['catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0001_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20140913T000000-20150818T075959.nc',
 'catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0002_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20150819T020000-20160717T055959.nc',
 'catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0003_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20160713T000000-20170812T095959.nc',
 'catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0004_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20170809T000000-20180615T055959.nc',
 'catalo

In [69]:
catalog = sorted(OOINet.parse_catalog(catalog, exclude=["ENG", "blank"]))
catalog

['catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0001_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20140913T000000-20150818T075959.nc',
 'catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0002_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20150819T020000-20160717T055959.nc',
 'catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0003_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20160713T000000-20170812T095959.nc',
 'catalog.html?dataset=ooi/areed@whoi.edu/20211207T224853521Z-GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument/deployment0004_GI03FLMA-RIS01-04-PHSENF000-recovered_inst-phsen_abcdef_instrument_20170809T000000-20180615T055959.nc',
 'catalo

In [70]:
OOINet.REFDES = refdes

In [71]:
ds = OOINet.load_netCDF_datasets(catalog)
ds

Checking and removing bad files: 
[########################################] | 100% Completed | 17.6s

Loading netCDF_files for GI03FLMA-RIS01-04-PHSENF000:
[########################################] | 100% Completed | 17.6s


Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,|S64,numpy.ndarray
"Array Chunk Bytes 1.84 MiB 307.19 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type |S64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,|S64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type datetime64[ns] numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type int32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type datetime64[ns] numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 117.71 kiB 19.20 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,117.71 kiB,19.20 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.58 MiB,1.72 MiB
Shape,"(30133, 92)","(4915, 92)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 10.58 MiB 1.72 MiB Shape (30133, 92) (4915, 92) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",92  30133,

Unnamed: 0,Array,Chunk
Bytes,10.58 MiB,1.72 MiB
Shape,"(30133, 92)","(4915, 92)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133, 16)","(4915, 16)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.84 MiB 307.19 kiB Shape (30133, 16) (4915, 16) Count 34 Tasks 7 Chunks Type float32 numpy.ndarray",16  30133,

Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133, 16)","(4915, 16)"
Count,34 Tasks,7 Chunks
Type,float32,numpy.ndarray


### Load netCDF files from local directory

In [None]:
save_dir = f"/home/andrew/Documents/OOI-CGSN/QAQC_Sandbox/QARTOD/Testing/data/testing/{refdes}/"

In [None]:
netCDF_files = [save_dir+f for f in os.listdir(save_dir)]
netCDF_files

In [None]:
from dask.diagnostics import ProgressBar

In [None]:
OOINet.REFDES = refdes

In [None]:
# -------------------------------
# Load previously downloaded netCDF files from the local directory.
# First, pass the file list through the M2M helper that (per the original
# comment) checks for malformed files and removes the bad ones.
# NOTE(review): _check_files and _preprocess are private M2M helpers -- confirm
# they are meant to be called from outside the class.
netCDF_files = OOINet._check_files(netCDF_files)

# Load the datasets into a concatenated xarray DataSet
with ProgressBar():
    print("\n"+f"Loading netCDF_files for {OOINet.REFDES}:")
    ds = xr.open_mfdataset(netCDF_files, preprocess=OOINet._preprocess, parallel=True)

# Add in the English name of the dataset
# NOTE: refdes is overwritten here with the first four "-"-separated fields of
# the dataset's "id" attribute.
refdes = "-".join(ds.attrs["id"].split("-")[:4])
vocab = OOINet.get_vocab(refdes)
ds.attrs["Location_name"] = " ".join((vocab["tocL1"].iloc[0],
                                      vocab["tocL2"].iloc[0],
                                      vocab["tocL3"].iloc[0]))



## QARTOD Comparison

In [None]:
# First, cut down the dataset size to be more manageable

In [72]:
# Keep only the pH parameter and its OOINet-executed QARTOD results
dataVars = ["phsen_abcdef_ph_seawater", "phsen_abcdef_ph_seawater_qartod_executed"]
data = ds[dataVars]
# Display the subset; the previous last line referenced `phsen`, which is never
# defined in this notebook and raises NameError on a fresh kernel.
data

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,|S64,numpy.ndarray
"Array Chunk Bytes 1.84 MiB 307.19 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type |S64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,|S64,numpy.ndarray


### QARTOD values
Next, load the QARTOD tables from github and parse them into dictionaries.

Changes: None

In [73]:
inst = "phsen"
param = "phsen_abcdef_ph_seawater"

In [74]:
import io
import json
def loadQARTOD(refDes,param,sensorType,timeout=30):
    """Download the QARTOD gross range and climatology lookups for a parameter.

    Both tables are fetched as CSV from the oceanobservatories/qc-lookup
    repository on GitHub (raw.githubusercontent.com).

    Parameters
    ----------
    refDes: str
        Reference designator made of four "-"-separated fields
        (site-node-sensor1-sensor2), e.g. "GI03FLMA-RIS01-04-PHSENF000"
    param: str
        Name of the parameter to look up. Any name containing "ph_seawater"
        is looked up as "seawater_ph" in the climatology table file name.
    sensorType: str
        Sensor-type directory in the qc-lookup repo (e.g. "phsen")
    timeout: float, optional
        Seconds passed to ``requests.get`` as the request timeout

    Returns
    -------
    (grossRange_dict, clim_dict): tuple of dict
        grossRange_dict is the parsed ``qcConfig`` entry of the matching
        gross-range row; clim_dict is the climatology table as nested dicts
        keyed first by column ("1".."12" after renaming) and then by the
        value of the first CSV column. Either dict is empty when its table
        could not be downloaded; grossRange_dict is also empty when no row
        matches refDes/param.
    """

    (site,node,sensor1,sensor2) = refDes.split('-')
    sensor = sensor1 + '-' + sensor2

    ### Build the climatology and gross range URLs
    githubBaseURL = 'https://raw.githubusercontent.com/oceanobservatories/qc-lookup/master/qartod/'
    if 'ph_seawater' in param:
        ClimParam = 'seawater_ph'
    else:
        ClimParam = param
    clim_URL = githubBaseURL + sensorType + '/climatology_tables/' + refDes + '-' + ClimParam + '.csv'
    grossRange_URL = githubBaseURL + sensorType + '/' + sensorType + '_qartod_gross_range_test_values.csv'

    ### Gross range values
    grossRange_dict = {}
    download = requests.get(grossRange_URL, timeout=timeout)
    if download.status_code == 200:
        df_grossRange = pd.read_csv(io.StringIO(download.content.decode('utf-8')))
        paramString = "{'inp': '" + param + "'}"
        qcConfig = df_grossRange.qcConfig[(df_grossRange.subsite == site)
                                          & (df_grossRange.node == node)
                                          & (df_grossRange.sensor == sensor)
                                          & (df_grossRange.parameters == paramString)]
        if qcConfig.empty:
            # No row for this refDes/param: report it and fall back to an empty
            # dict (same fallback as a failed download) instead of IndexError.
            print('no gross range entry found for ' + refDes + ' ' + param)
        else:
            # The table stores a Python-style dict string; swap the quotes so
            # it parses as JSON.
            qcConfig_json = qcConfig.values[0].replace("'", "\"")
            grossRange_dict = json.loads(qcConfig_json)
    else:
        print('error retriving gross range data')

    ### Climatology values
    clim_dict = {}
    download = requests.get(clim_URL, timeout=timeout)
    if download.status_code == 200:
        df_clim = pd.read_csv(io.StringIO(download.content.decode('utf-8')))
        # Rename the unnamed first column to "depth" and the month columns
        # "[1, 1]".."[12, 12]" to "1".."12"
        climRename = {'Unnamed: 0': 'depth'}
        climRename.update({f'[{month}, {month}]': str(month) for month in range(1, 13)})
        df_clim = df_clim.rename(columns=climRename)
        clim_dict = df_clim.set_index('depth').to_dict()
    else:
        print('error retriving climatology data')

    return(grossRange_dict,clim_dict)

In [75]:
grossRange_dict, clim_dict = loadQARTOD(refdes, param, inst)
grossRange_dict, clim_dict

({'qartod': {'gross_range_test': {'suspect_span': [7.95, 8.23],
    'fail_span': [6.9, 9]}}},
 {'1': {'[0, 0]': '[7.9425, 8.1947]'},
  '2': {'[0, 0]': '[7.9111, 8.2299]'},
  '3': {'[0, 0]': '[8.0444, 8.113]'},
  '4': {'[0, 0]': '[8.0479, 8.1345]'},
  '5': {'[0, 0]': '[7.96, 8.2481]'},
  '6': {'[0, 0]': '[7.8299, 8.3966]'},
  '7': {'[0, 0]': '[7.9555, 8.2769]'},
  '8': {'[0, 0]': '[7.8985, 8.3269]'},
  '9': {'[0, 0]': '[8.0198, 8.1885]'},
  '10': {'[0, 0]': '[8.0272, 8.159]'},
  '11': {'[0, 0]': '[8.0177, 8.146]'},
  '12': {'[0, 0]': '[8.0367, 8.1092]'}})

### Add Climatology Values
Next, add the climatology min and max values to the dataset as new data variables, based on the month of the data.

Changes:
* Renamed "climatologyMin/climatologyMax" to "{parameter name}\_climatologyMin/climatologyMax" in order to allow multiple parameter climatologies to be stored in a given dataset
* Preallocated the climatology arrays with nans instead of zeros to skip the later step of backfilling nans.
* Utilize dask to get the months (as integers) in the time variable of the dataset. This avoids loading the data into memory.
* Utilize direct assignment of the climatologyMin/Max values for each month on the dataset variable arrays. This again keeps the dataset out-of-memory.

In [76]:
import dask.array as da
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import ast

def add_climatology_values(ds, param, clim_dict):
    """Attach monthly climatology bounds for one parameter to a dataset.

    Two variables, "{param}_climatologyMin" and "{param}_climatologyMax",
    are added to ``ds``. They start out all-NaN and are then filled, month
    by month, with the lower and upper bound listed in ``clim_dict``.

    Parameters
    ----------
    ds: xarray.Dataset
        Dataset with a "time" coordinate that holds ``param``; it is
        modified in place and also returned
    param: str
        Name of the data variable in ``ds`` the climatology applies to
    clim_dict: dict
        Climatology lookup as loaded from the qartod gitHub repo, keyed by
        month number (as a string) and then by pressure bracket

    Returns
    -------
    ds: xarray.Dataset
        The same dataset with the two climatology variables added

    Note: only the "[0, 0]" pressure bracket is read; a pressure function
    is still needed to match the original functionality.
    """

    # Names of the two variables that will hold the bounds
    lower_name = f"{param}_climatologyMin"
    upper_name = f"{param}_climatologyMax"

    # Pre-allocate both as float arrays full of NaN, shaped like the parameter
    for new_name in (lower_name, upper_name):
        ds[new_name] = ds[param].astype(float) * np.nan

    # Find which calendar months occur in the time coordinate
    months_present = np.unique(da.from_array(ds.time.dt.month)).compute()

    # Fill in the bounds for every month that has measurements
    for month in months_present:
        bounds = ast.literal_eval(clim_dict[str(month)][str([0, 0])])
        in_month = ds.time.dt.month == month
        ds[lower_name][in_month] = bounds[0]
        ds[upper_name][in_month] = bounds[1]

    return ds

In [78]:
data = add_climatology_values(data, param, clim_dict)

In [87]:
grossRange_dict

{'qartod': {'gross_range_test': {'suspect_span': [7.95, 8.23],
   'fail_span': [6.9, 9]}}}

### Add QARTOD flags
Next, want to calculate the QARTOD flags for the gross range and climatology values and add them to the dataset. 
Changes:
* Renamed the "gr_flag/clim_flag" to "{parameter name}\_gr_flag/\_clim_flag" in order to allow multiple parameters to be tested in a single dataset.
* Utilize direct assignment of the QARTOD flags to avoid loading data into memory.

In [79]:
def create_QARTOD_flags(ds, param, grossRange):
    """Add locally computed gross range and climatology flags for a parameter.

    Two integer variables are added to ``ds``: "{param}_gr_flag" and
    "{param}_clim_flag". Flag values written by this function:

    * 1 - default (no condition below applied)
    * 2 - climatology only: climatology min or max is null
    * 3 - value outside the suspect span / outside the climatology bounds
    * 4 - gross range only: value outside the fail span
    * 9 - the parameter value itself is null

    Parameters
    ----------
    ds: xarray.Dataset
        Dataset holding ``param`` plus "{param}_climatologyMin" and
        "{param}_climatologyMax"; modified in place and returned
    param: str
        Name of the data variable to flag
    grossRange: dict
        Gross range configuration; read as
        ``grossRange["qartod"]["gross_range_test"]["suspect_span" | "fail_span"]``

    Returns
    -------
    ds: xarray.Dataset
        The same dataset with the two flag variables added

    Raises
    ------
    KeyError
        If ``grossRange`` lacks the keys listed above or the climatology
        variables are missing from ``ds``
    """

    values = ds[param]
    # Locations where the parameter was not evaluated
    not_eval = values.isnull()

    # Add the gross range flags for a param.
    # The all-ones template is built from the boolean null-mask rather than by
    # casting the data itself to int64: the data may contain NaN, and casting
    # NaN to an integer is undefined.
    gr_flag = f"{param}_gr_flag"
    ds[gr_flag] = not_eval.astype("int64") * 0 + 1
    gr_suspect = grossRange["qartod"]["gross_range_test"]["suspect_span"]
    gr_fail = grossRange["qartod"]["gross_range_test"]["fail_span"]
    ds[gr_flag][(values < gr_suspect[0]) | (values > gr_suspect[1])] = 3
    # Fail is applied after suspect so that it takes precedence
    ds[gr_flag][(values < gr_fail[0]) | (values > gr_fail[1])] = 4

    # Climatology flags
    clim_flag = f"{param}_clim_flag"
    clim_min = ds[f"{param}_climatologyMin"]
    clim_max = ds[f"{param}_climatologyMax"]
    ds[clim_flag] = not_eval.astype("int64") * 0 + 1
    ds[clim_flag][clim_min.isnull() | clim_max.isnull()] = 2
    ds[clim_flag][(values < clim_min) | (values > clim_max)] = 3

    # Not-evaluated locations override every other flag
    ds[gr_flag][not_eval] = 9
    ds[clim_flag][not_eval] = 9

    return ds

In [80]:
data = create_QARTOD_flags(data, param, grossRange_dict)
data

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,|S64,numpy.ndarray
"Array Chunk Bytes 1.84 MiB 307.19 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type |S64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,1.84 MiB,307.19 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,|S64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 137 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 137 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 77 Tasks 7 Chunks Type int64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 77 Tasks 7 Chunks Type int64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray


### Compare test values
Now, want to compare the values calculated locally with the values returned by OOINet in the "qartod_executed" variables.

Changes:
* Don't iterate through each data point
* Change the data type of the {parameter name}\_qartod_executed data array to string to be interpretable
* With the type changed to string, can use the xarray built-in string methods (.str) to parse each value in the "qartod_executed" array
* Changed the name of "qartod_gr/qartod_clim" to "{parameter name}\_qartod_gr/\_qartod_clim" to allow multiple parameters to be stored in the same dataset
* Run the test comparison and store as "{parameter name}\_gr_comparison/\_clim_comparison" as a boolean array. This will allow us to quickly count the comparison (using sum) and mask the parameter being tested.

In [81]:
def run_comparison(ds, param):
    """Compare the OOINet-executed QARTOD flags against the local ones.

    The "{param}_qartod_executed" variable is converted to strings and the
    single character belonging to each test is pulled out, using the order
    listed in that variable's "tests_executed" attribute.

    Parameters
    ----------
    ds: xarray.Dataset
        Dataset holding "{param}_qartod_executed", "{param}_gr_flag" and
        "{param}_clim_flag"; modified in place and returned
    param: str
        Name of the parameter whose flags are compared

    Returns
    -------
    ds: xarray.Dataset
        The same dataset with "{param}_qartod_gr", "{param}_qartod_clim",
        "{param}_gr_comparison" and "{param}_clim_comparison" added. The
        comparison variables are True where the OOINet flag and the local
        flag differ.
    """

    # Work out the order in which the tests were executed
    executed_name = f"{param}_qartod_executed"
    raw_order = ds[executed_name].attrs["tests_executed"]
    executed_tests = raw_order.strip("'").replace(" ", "").split(",")

    # Character position of each test within the flag string
    positions = {
        "clim": executed_tests.index("climatology_test"),
        "gr": executed_tests.index("gross_range_test"),
    }

    # Convert the OOINet-run QARTOD flags to interpretable strings
    ds[executed_name] = ds[executed_name].astype(str)
    flag_strings = ds[executed_name]

    # Split the combined flag string into one integer flag per test
    for test in ("gr", "clim"):
        ds[f"{param}_qartod_{test}"] = flag_strings.str.get(positions[test]).astype("int")

    # Mark every location where OOI QARTOD and local QARTOD disagree
    for test in ("gr", "clim"):
        ds[f"{param}_{test}_comparison"] = ds[f"{param}_qartod_{test}"] != ds[f"{param}_{test}_flag"]

    return ds

In [82]:
data = run_comparison(data, param)
data

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 34 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,34 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.36 MiB,1.20 MiB
Shape,"(30133,)","(4915,)"
Count,41 Tasks,7 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 7.36 MiB 1.20 MiB Shape (30133,) (4915,) Count 41 Tasks 7 Chunks Type numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,7.36 MiB,1.20 MiB
Shape,"(30133,)","(4915,)"
Count,41 Tasks,7 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 137 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 137 Tasks 7 Chunks Type float64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,137 Tasks,7 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 77 Tasks 7 Chunks Type int64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 77 Tasks 7 Chunks Type int64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,77 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,70 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 70 Tasks 7 Chunks Type int64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,70 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,70 Tasks,7 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 235.41 kiB 38.40 kiB Shape (30133,) (4915,) Count 70 Tasks 7 Chunks Type int64 numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,235.41 kiB,38.40 kiB
Shape,"(30133,)","(4915,)"
Count,70 Tasks,7 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,29.43 kiB,4.80 kiB
Shape,"(30133,)","(4915,)"
Count,154 Tasks,7 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 29.43 kiB 4.80 kiB Shape (30133,) (4915,) Count 154 Tasks 7 Chunks Type bool numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,29.43 kiB,4.80 kiB
Shape,"(30133,)","(4915,)"
Count,154 Tasks,7 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,29.43 kiB,4.80 kiB
Shape,"(30133,)","(4915,)"
Count,154 Tasks,7 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 29.43 kiB 4.80 kiB Shape (30133,) (4915,) Count 154 Tasks 7 Chunks Type bool numpy.ndarray",30133  1,

Unnamed: 0,Array,Chunk
Bytes,29.43 kiB,4.80 kiB
Shape,"(30133,)","(4915,)"
Count,154 Tasks,7 Chunks
Type,bool,numpy.ndarray


### Execute the comparison
So far, all the work we've done hasn't actually run any processing. Everything has been done as a set of dask instructions to execute when we call compute().

Below, I first just count the number of missed flags by summing the comparison results, since each "missed" flag is stored as a boolean ```True```, which ```.sum()``` counts as a 1. 

In [83]:
from dask.diagnostics import ProgressBar

In [84]:
# Count the mismatches: each comparison variable is boolean, so its sum is the
# number of locations where the local and OOINet flags differ.
with ProgressBar():
    for var in data.variables:
        # Skip everything except the "*_comparison" variables
        if "comparison" not in var:
            continue
        result = data[var].sum().compute()
        print(f"Missed flags for {var}: {result.values}")

[########################################] | 100% Completed |  1.7s
Missed flags for phsen_abcdef_ph_seawater_gr_comparison: 30133
[########################################] | 100% Completed |  0.2s
Missed flags for phsen_abcdef_ph_seawater_clim_comparison: 30133


### To Do
Need to add in pressure bracket handling so that I can do profilers (although I don't have any profilers for CGSN up on Dev1). 

Also need to add in a function to print out the time-stamp of when qartod flags are mis-flagged.

In [None]:
def pressureBracket(pressure,clim_dict):
    """Find the climatology pressure bracket that contains a pressure.

    The candidate brackets are the keys of ``clim_dict['1']``, each expected
    to look like "[low, high]" with integer bounds. A bracket matches when
    ``low <= pressure < high``.

    Parameters
    ----------
    pressure: number
        Pressure value to place in a bracket
    clim_dict: dict
        Climatology lookup; only the keys under ``clim_dict['1']`` are read

    Returns
    -------
    list or str
        The first matching bracket as ``[low, high]`` (ints), or the string
        'notFound' when no bracket contains the pressure
    """
    # Parse every bracket key into a pair of integers, reporting bad keys
    parsed = []
    for key in clim_dict['1']:
        match = re.search(r'\[(.+),(.+)\]', key)
        if not match:
            print('bracket parsing error for ' + key)
            continue
        lower, upper = match.groups()
        parsed.append([int(lower), int(upper)])

    # Return the first bracket whose half-open range holds the pressure
    for candidate in parsed:
        if candidate[0] <= pressure < candidate[1]:
            return candidate

    return 'notFound'