### Import modules used in this notebook

In [2]:
# Import libraries
import os
import re
import gc
import io
import ast
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings("ignore")
import sys

In [3]:
# Import OOINet library
# OOINet is used here from a local clone of the repository, so the clone's folder has to
# be on sys.path before the imports below can succeed (this step was missing from the
# install instructions the author followed for the local dev repos).
# Set the OOINET_PATH environment variable to point at your own clone; the default is
# the original author's location.
OOINET_PATH = os.environ.get("OOINET_PATH", "c:\\Users\\cooleyky\\Documents\\GitHub\\OOINet")
if OOINET_PATH not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(OOINET_PATH)
from ooinet import M2M
from ooinet.Instrument.common import process_file

In [4]:
# Import functions from ooi-data-explorations library
# The package lives in the "python" sub-folder of a local clone of ooi-data-explorations,
# and that folder has to be on sys.path before the imports below can succeed (the initial
# install did not take care of this for the author).
# Set the OOI_DATA_EXPLORATIONS_PATH environment variable to point at your own clone; the
# default is the original author's location.
OOI_DATA_EXPLORATIONS_PATH = os.environ.get(
    "OOI_DATA_EXPLORATIONS_PATH",
    "c:\\Users\\cooleyky\\Documents\\GitHub\\ooi-data-explorations\\python",
)
if OOI_DATA_EXPLORATIONS_PATH not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(OOI_DATA_EXPLORATIONS_PATH)
from ooi_data_explorations.uncabled.process_dosta import dosta_datalogger
from ooi_data_explorations.combine_data import combine_datasets
from ooi_data_explorations import common as ooi_common

In [5]:
# Import dask tools and ProgressBar
import dask
from dask.diagnostics import ProgressBar

### Define data parameters and routines

In [6]:
# Setup parameters needed to request data
refdes = "CP01CNSM-MFD37-03-CTDBPD000"              # reference designator: Coastal Pioneer Array (NES) - Central Surface Mooring, bottom-pumped CTD
method = "recovered_inst"                           # non-decimated data from recovered instrument
stream = "ctdbp_cdef_instrument_recovered"          # name of data stream

# Site, node, and sensor are the three parts of the reference designator:
# "<site>-<node>-<sensor>". The sensor part contains a hyphen of its own, so split on the
# first two hyphens only. Deriving the parts from refdes keeps the two request styles used
# below (refdes-based and site/node/sensor-based) pointing at the same instrument.
site, node, sensor = refdes.split("-", 2)

In [7]:
# Generic preprocessing routine: open a dataset, then clean it with process_file
@dask.delayed
def preprocess(ds):
    """Open a single dataset and pass it through process_file.

    The function is wrapped in dask.delayed, so calling it only queues the work;
    the dataset is opened and processed when the task is computed.

    Parameters
    ----------
    ds : anything xr.open_dataset accepts
        In this notebook, one of the OPeNDAP URLs held in sensor_files.

    Returns
    -------
    Whatever process_file returns for the opened dataset.
    """
    opened = xr.open_dataset(ds)
    return process_file(opened)

### QARTOD in Production: Request data from the THREDDS catalog

##### Using mostly OOINet module

In [7]:
# Build the THREDDS URL for this reference designator / method / stream, asking for the
# gold copy location (goldCopy=True)
thredds_url = M2M.get_thredds_url(refdes, method, stream, goldCopy=True)

# Fetch the catalog found at that URL; it is filtered down to usable files in the next cell
thredds_catalog = M2M.get_thredds_catalog(thredds_url)

In [8]:
# Clean the THREDDS catalog: split it into the files for the requested stream and the
# ancillary files.
# Author's understanding (TODO confirm against M2M.clean_catalog): entries that do not
# match the stream are removed from sensor_files, and files that are only used in
# processing data from the selected stream are the ones returned as ancillary_files.
sensor_files, ancillary_files = M2M.clean_catalog(thredds_catalog, stream) 
# Display the catalog entries kept for the stream
sensor_files

['catalog.html?dataset=ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered/deployment0001_CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered_20131121T181601-20140217T130601.nc',
 'catalog.html?dataset=ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered/deployment0003_CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered_20150507T173501-20151023T193501.nc',
 'catalog.html?dataset=ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered/deployment0005_CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered_20160513T140200-20160808T180200.nc',
 'catalog.html?dataset=ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered/deployment0006_CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered_20161013T184501-20170122T060001.nc',
 'catalog.html?dataset=o

In [13]:
# Now build the url to access the data: swap the catalog prefix of every entry for the
# gold copy OPeNDAP (dodsC) base URL.
# The pattern is a raw string with "." and "?" escaped, so it matches the literal text
# "catalog.html?dataset=" and no longer relies on the invalid "\?" string escape.
sensor_files = [re.sub(r"catalog\.html\?dataset=", M2M.URLS["goldCopy_dodsC"], file) for file in sensor_files]
sensor_files

['https://thredds.dataexplorer.oceanobservatories.org/thredds/dodsC/ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered/deployment0001_CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered_20131121T181601-20140217T130601.nc',
 'https://thredds.dataexplorer.oceanobservatories.org/thredds/dodsC/ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered/deployment0003_CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered_20150507T173501-20151023T193501.nc',
 'https://thredds.dataexplorer.oceanobservatories.org/thredds/dodsC/ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered/deployment0005_CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered_20160513T140200-20160808T180200.nc',
 'https://thredds.dataexplorer.oceanobservatories.org/thredds/dodsC/ooigoldcopy/public/CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctd

In [None]:
# Queue one preprocess() task per OPeNDAP URL. preprocess is wrapped in dask.delayed,
# so this only builds the task list; nothing is opened until dask.compute() runs.
zs = [preprocess(file) for file in sensor_files]

In [None]:
# Load all the datasets: run every queued task (showing a progress bar), chunk each
# resulting dataset, and concatenate them along the time dimension into `data`
with ProgressBar():
    data = xr.concat([ds.chunk() for ds in dask.compute(*zs)], dim="time")

##### Using ooi_data_explorations modules

In [14]:
# Load data with the ooi_common module (ooi_data_explorations) instead of OOINet.
# NOTE: this reassigns `data`, replacing the dataset built by the OOINet cells above.
data = ooi_common.load_gc_thredds(site,node,sensor,method,stream,use_dask=True)    # Request the gold copy data through THREDDs catalog
# Possible drawback (unconfirmed): this method may not return the ancillary files, which
# could be what Andrew was working around with the OOINet-based approach.
# Author's note: load_gc_thredds() also calls process_file() within gc_collect(), so the
# preprocessing should match the preprocess() routine defined above. preprocess() uses the
# process_file imported from ooinet.Instrument.common, while this route goes through
# ooi_data_explorations, so "the same" is worth confirming.

Downloading 15 data file(s) from the OOI Gold Copy THREDSS catalog
Downloading and Processing Data Files: 100%|██████████| 15/15 [00:18<00:00,  1.21s/it]


In [15]:
# Keep the production (gold copy) result under its own name, because `data` is
# reassigned again in the development section below
ds_prod = data.copy()

### QARTOD in Development: Request data from dev1 server

In [20]:
# Build a dev1 version of every available API URL by substituting the
# dev1-west.intra host names into the production URLs
Dev01_urls = {}
for name, prod_url in M2M.URLS.items():
    # OPeNDAP URLs and ooinet URLs take different dev1 host prefixes
    if "opendap" in prod_url:
        target, substitute = "opendap", "opendap-dev1-west.intra"
    else:
        target, substitute = "ooinet", "ooinet-dev1-west.intra"
    Dev01_urls[name] = re.sub(target, substitute, prod_url)
   

In [None]:
# NOTE: the following is just the same request as the ooinet gold copy section above. It
# still goes through M2M.URLS; it is not yet clear how the dev1 urls built in the previous
# cell should be used.
# None of this can be used for dev1 as written, because the development environment does
# not have a gold copy.

# Use the gold copy THREDDs datasets (refdes, method and stream are reused from above)
thredds_url = M2M.get_thredds_url(refdes, method, stream, goldCopy=True)

# Get the THREDDs catalog
thredds_catalog = M2M.get_thredds_catalog(thredds_url)

# Clean the THREDDs catalog
sensor_files, ancillary_files = M2M.clean_catalog(thredds_catalog, stream)

# Now build the url to access the data. The pattern is a raw string with "." and "?"
# escaped, so it matches the literal "catalog.html?dataset=" without relying on the
# invalid "\?" string escape.
sensor_files = [re.sub(r"catalog\.html\?dataset=", M2M.URLS["goldCopy_dodsC"], file) for file in sensor_files]
zs = [preprocess(file) for file in sensor_files]

# Load all the datasets: compute the delayed tasks and concatenate along time
with ProgressBar():
    data = xr.concat([ds.chunk() for ds in dask.compute(*zs)], dim="time")

In [None]:
# Adapting ooi_data_explorations.common.process_file() for Dev01 datasets
def process_dev01_file(catalog_file, base_url, session, use_dask=False):
    """
    Download one NetCDF file listed in a THREDDS catalog and rework it into a
    dataset indexed by time.

    The steps were previously pasted at cell level, where the bare ``return``
    made the cell a syntax error and ``SESSION``, ``use_dask`` and
    ``catalog_file`` were never defined. They are now the function's arguments.

    :param catalog_file: catalog entry containing 'catalog.html?dataset='
    :param base_url: URL prefix (a string) substituted for 'catalog.html?dataset='.
        For the dev1 server this is expected to be one of the values stored in
        Dev01_urls; which key is the right one is not established in this
        notebook -- TODO confirm
    :param session: object with a requests-style ``get(url, timeout=...)`` method
        whose response exposes ``ok`` and ``content``
    :param use_dask: if True, open the file lazily with chunks=10000; otherwise
        load it fully into memory
    :return: the reworked xarray dataset, or None if the download failed
    """
    url = re.sub(r'catalog\.html\?dataset=', base_url, catalog_file)
    r = session.get(url, timeout=(3.05, 120))  # presumably (connect, read) seconds -- confirm
    if not r.ok:
        failed_file = catalog_file.rpartition('/')
        # NOTE: the import cell sets warnings.filterwarnings("ignore"), which hides this
        # message as well; callers should check for a None result
        warnings.warn('Failed to download %s' % failed_file[-1])
        return None

    # CF decoding is switched off here; the time variables are converted by hand below
    if use_dask:
        ds = xr.open_dataset(io.BytesIO(r.content), decode_cf=False, chunks=10000)
    else:
        ds = xr.load_dataset(io.BytesIO(r.content), decode_cf=False)

    # Disabled in this adaptation (kept for reference):
    # addresses error in how the *_qartod_executed variables are set
    # qartod_pattern = re.compile(r'^.+_qartod_executed$')
    # for v in ds.variables:
    #     if qartod_pattern.match(v):
    #         # the shape of the QARTOD executed variables should compare to the provenance variable
    #         if ds[v].shape != ds['provenance'].shape:
    #             ds = ds.drop_vars(v)

    # convert the dimensions from obs to time and get rid of obs and other variables we don't need
    ds = ds.swap_dims({'obs': 'time'})
    ds = ds.reset_coords()
    keys = ['obs', 'id', 'provenance', 'driver_timestamp', 'ingestion_timestamp',
            'port_timestamp', 'preferred_timestamp']
    ds = ds.drop_vars([key for key in keys if key in ds.variables])

    # since the CF decoding of the time is failing, explicitly reset all instances where the units are
    # seconds since 1900-01-01 to the correct CF units and convert the values to datetime64[ns] types
    time_pattern = re.compile(r'^seconds since 1900-01-01.*$')
    ntp_date = np.datetime64('1900-01-01')
    for v in list(ds.variables):  # snapshot the names, because variables are replaced inside the loop
        units = ds[v].attrs.get('units')
        # some units use non-standard characters and are not stored as str; skip those
        if isinstance(units, str) and time_pattern.match(units):
            ds[v].attrs.pop('_FillValue', None)  # no fill values for time! (tolerate a missing attribute)
            ds[v].attrs['units'] = 'seconds since 1900-01-01T00:00:00.000Z'
            np_time = ntp_date + (ds[v] * 1e9).astype('timedelta64[ns]')
            ds[v] = np_time

    # sort by time
    ds = ds.sortby('time')

    # clear-up some global attributes we will no longer be using
    keys = ['DODS.strlen', 'DODS.dimName', 'DODS_EXTRA.Unlimited_Dimension', '_NCProperties', 'feature_Type']
    for key in keys:
        if key in ds.attrs:
            del ds.attrs[key]

    # drop the unlimited-dimension encoding when it is set (the key may be absent altogether)
    if ds.encoding.get('unlimited_dims'):
        del ds.encoding['unlimited_dims']

    # resetting cdm_data_type from Point to Station and the featureType from point to timeSeries
    ds.attrs['cdm_data_type'] = 'Station'
    ds.attrs['featureType'] = 'timeSeries'

    # update some global attributes
    ds.attrs['acknowledgement'] = 'National Science Foundation'
    ds.attrs['comment'] = 'Data collected from the OOI Dev01 M2M API and reworked for use in locally stored NetCDF files.'

    return ds



In [None]:
# Make a copy of the data with a unique name
# NOTE(review): `data` holds whatever the most recently run loading cell assigned. The
# only loading cell in this section requests goldCopy=True through M2M.URLS, i.e. the same
# request as the production section, so confirm this really is dev1 data before relying on it.
ds_dev = data.copy()


### Save datasets to interim data folder for further processing

In [None]:
# TODO: write the datasets to disk, e.g. with a write_nc() helper or xarray's Dataset.to_netcdf()
# TODO: name the output file datasets_for_testing.nc
# Production and development datasets will probably be tested separately, so consider
# writing a separate file for each of those types