# Run QARTOD Test on Locally Saved Data

In this notebook we will load locally saved data from the interim data folder, extract QARTOD test parameters from spreadsheets on the OOI GitHub, run the QARTOD climatology and gross range tests on the imported data, and save the test results to the processed data folder.

More information about the QARTOD tests and the ioos_qc module can be found on the [Integrated Ocean Observing System website](https://ioos.noaa.gov/project/qartod/) and in the [Python module documentation](https://ioos.github.io/ioos_qc/), respectively.

### Import modules for data manipulation

In [1]:
# Import libraries
import os
import requests
import re
import gc
import io
import ast
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings("ignore")
import sys

# Import dask tools and ProgressBar
import dask
from dask.diagnostics import ProgressBar

### Load locally saved data

In [2]:
# Select the dataset: reference designator parts, delivery method, and data stream

site = "CP01CNSM"                                   # Coastal Pioneer Array (NES) - Central Surface Mooring
node = "MFD37"                                      # mooring node (presumably the seafloor multi-function node - TODO confirm)
sensor = "03-CTDBPD000"                             # CTD Bottom-pumped
method = "recovered_inst"                           # non-decimated data from recovered instrument
stream = "ctdbp_cdef_instrument_recovered"          # name of data stream

# The reference designator is "<site>-<node>-<sensor>"
refdes = f"{site}-{node}-{sensor}"

# Dataset saved from OOINet/"production" ('prod') or from dev1.
# NOTE: this name shadows the Python builtin `type`; it is kept because later cells read it.
type = 'prod'

In [3]:
# Build filename and path to interim data

def build_data_path(refdes,method,stream,type,folder='interim'):
    """
    Build the relative path to a locally saved netCDF dataset.

    The file name is "<type>-<refdes>-<method>-<stream>.nc" and it is placed
    in "../data/<folder>/".

    Parameters
    ----------
    refdes: str
        Reference designator built from OOI site, node, and sensor
    method: str
        'recovered_inst', 'recovered_host', or 'telemetered'(?)
    stream: str
        Name of the data stream
    type: str
        'prod' or 'dev' (callers may also pass a longer prefix)
    folder: str
        'interim' (default), 'processed', 'raw', or 'external'

    Returns
    -------
    str
        Path to the dataset, relative to the current working directory
        (the notebook folder when the notebook is run from there)
    """
    # File name encodes the dataset type and its source
    nc_name = '-'.join([type, refdes, method, stream]) + '.nc'

    # Data folder as seen from the notebook folder, then sub-folder and file
    return os.path.join(os.path.relpath('../data'), folder, nc_name)

In [4]:
# Build the path to the interim dataset. Use the `type` chosen in the
# configuration cell instead of a hard-coded 'prod', so that changing the
# configuration also changes which file is loaded.
ds_path = build_data_path(refdes, method, stream, type)
ds_path

'..\\data\\interim\\prod-CP01CNSM-MFD37-03-CTDBPD000-recovered_inst-ctdbp_cdef_instrument_recovered.nc'

In [5]:
# Load data from .nc files
# Open the netCDF file at ds_path as an xarray Dataset and display its summary

ds = xr.open_dataset(ds_path)
ds

### Identify Test Parameters

Next, identify which parameters in the dataset have QARTOD tests applied to them. Sometimes the variable name in the dataset is different from the key that OOINet uses to build the datasets. In that case, the OOINet key is stored in the variable's "alternate_parameter_name" attribute.

In [7]:
# Map each QARTOD-tested dataset variable name to the parameter name used by OOINet
test_parameters = {}
for var in ds.variables:
    # Only the "*_qartod_results" variables mark a parameter that has QARTOD applied
    if "qartod_results" not in var:
        continue

    # The tested parameter is everything before the "_qartod" suffix
    param = var.split("_qartod")[0]

    # Use the alternate (OOINet) name when the variable carries one,
    # otherwise the dataset variable name is also the OOINet name
    ooinet_name = ds[param].attrs.get("alternate_parameter_name", param)

    test_parameters[param] = ooinet_name

# Show the resulting mapping
test_parameters

{'sea_water_electrical_conductivity': 'ctdbp_seawater_conductivity',
 'sea_water_temperature': 'ctdbp_seawater_temperature',
 'sea_water_practical_salinity': 'practical_salinity',
 'sea_water_pressure': 'ctdbp_seawater_pressure'}

### Collect QARTOD test lookup value tables from GitHub
We can grab the QARTOD tables with the test values straight from GitHub, which ensures we are using the same input and threshold values as OOINet. However, the QARTOD tables utilize the ```ooinet_parameter_name``` instead of the dataset variable name. Thus, when loading the tables we need to make sure we are requesting the correct parameter name.

In [8]:
# Root of the raw QARTOD lookup tables in the OOI qc-lookup repository
GITHUB_BASE_URL = "https://raw.githubusercontent.com/oceanobservatories/qc-lookup/master/qartod"

def load_gross_range_qartod_test_values(refdes, stream, ooinet_param):
    """
    Load the gross range QARTOD test values for one parameter from gitHub

    Parameters
    ----------
    refdes: str
        The reference designator ("<subsite>-<node>-<sensor>") of the sensor
    stream: str
        The name of the data stream
    ooinet_param: str
        The OOINet parameter name, matched against the "inp" entry of the
        table's "parameters" column

    Returns
    -------
    pandas.DataFrame
        The rows of the gross range table that match the reference
        designator, stream, and parameter (empty if nothing matches). The
        "parameters" and "qcConfig" columns are parsed into Python objects.

    Raises
    ------
    requests.exceptions.HTTPError
        If the table could not be downloaded (response status is not 200)
    """
    subsite, node, sensor = refdes.split("-", 2)
    # Characters 3-7 of the sensor string hold the instrument class, e.g. "ctdbp"
    sensor_type = sensor[3:8].lower()

    # gitHub url to the gross range table
    GROSS_RANGE_URL = f"{GITHUB_BASE_URL}/{sensor_type}/{sensor_type}_qartod_gross_range_test_values.csv"

    # Download the table; the timeout keeps the notebook from hanging forever
    download = requests.get(GROSS_RANGE_URL, timeout=60)
    if download.status_code != 200:
        # Fail with a clear message instead of an UnboundLocalError on `df` below
        raise requests.exceptions.HTTPError(
            f"Could not download gross range table (HTTP {download.status_code}): {GROSS_RANGE_URL}",
            response=download)

    df = pd.read_csv(io.StringIO(download.content.decode('utf-8')))
    # These columns are stored as string representations of Python dicts
    df["parameters"] = df["parameters"].apply(ast.literal_eval)
    df["qcConfig"] = df["qcConfig"].apply(ast.literal_eval)

    # Keep only the rows for the desired parameter and the desired stream
    mask = (df["parameters"].apply(lambda x: x.get("inp") == ooinet_param) &
            (df["subsite"] == subsite) &
            (df["node"] == node) &
            (df["sensor"] == sensor) &
            (df["stream"] == stream))

    return df[mask]


def load_climatology_qartod_test_values(refdes, param):
    """
    Load the OOI climatology qartod test values table from gitHub

    Parameters
    ----------
    refdes: str
        The reference designator for the given sensor
    param: str
        The OOINet name of the parameter; together with refdes it forms the
        name of the climatology table ("<refdes>-<param>.csv")

    Returns
    -------
    pandas.DataFrame or None
        The climatology table with every cell parsed into a Python object,
        indexed by the first csv column. None if the table could not be
        downloaded (response status is not 200), e.g. because no
        climatology table exists for this parameter.
    """
    # Only the sensor part of the reference designator is needed here
    _, _, sensor = refdes.split("-", 2)
    sensor_type = sensor[3:8].lower()

    # gitHub url to the climatology tables
    CLIMATOLOGY_URL = f"{GITHUB_BASE_URL}/{sensor_type}/climatology_tables/{refdes}-{param}.csv"

    # Download the table; the timeout keeps the notebook from hanging forever
    download = requests.get(CLIMATOLOGY_URL, timeout=60)
    if download.status_code != 200:
        return None

    df = pd.read_csv(io.StringIO(download.content.decode('utf-8')), index_col=0)
    # Parse every cell from its string form. Column-wise Series.map is used
    # because DataFrame.applymap is deprecated in recent pandas versions.
    df = df.apply(lambda column: column.map(ast.literal_eval))
    return df

In [9]:
# Example: load the gross range QARTOD table for a specific parameter
gross_range_qartod_test_values = load_gross_range_qartod_test_values(refdes, stream, test_parameters["sea_water_temperature"])
gross_range_qartod_test_values

Unnamed: 0,subsite,node,sensor,stream,parameters,qcConfig,source,notes
224,CP01CNSM,MFD37,03-CTDBPD000,ctdbp_cdef_instrument_recovered,{'inp': 'ctdbp_seawater_temperature'},{'qartod': {'gross_range_test': {'suspect_span...,Sensor min/max derived from vendor documentati...,


In [10]:
# Example: load the climatology QARTOD table for a specific parameter
climatology_qartod_test_values = load_climatology_qartod_test_values(refdes, test_parameters["sea_water_temperature"])
climatology_qartod_test_values

Unnamed: 0,"[1, 1]","[2, 2]","[3, 3]","[4, 4]","[5, 5]","[6, 6]","[7, 7]","[8, 8]","[9, 9]","[10, 10]","[11, 11]","[12, 12]"
"[0, 0]","[12.5076, 14.8184]","[12.0924, 13.7305]","[11.5657, 13.3547]","[11.4895, 13.4294]","[11.4483, 13.9555]","[11.58, 14.1914]","[11.1502, 14.7142]","[11.7184, 14.3547]","[11.3198, 15.4957]","[12.7114, 15.2397]","[13.2261, 15.5345]","[12.3546, 16.1987]"


### Run QARTOD tests locally
Next, we run the gross range and climatology tests locally so that the local results can be compared with the QARTOD results that OOINet stored in the dataset. This is done using the ```ioos_qc``` QARTOD package in conjunction with the ```qartod_test_values``` tables.

#### Gross Range Test

In [11]:
# Import the ioos_qc QARTOD package tests
from ioos_qc.qartod import gross_range_test, climatology_test, ClimatologyConfig

In [12]:
# For every parameter that OOINet applied QARTOD to, run the gross range test
# locally and collect the flags in a dictionary keyed by dataset variable name
gross_range_results = {}
for param, ooinet_name in test_parameters.items():
    # The lookup table on gitHub is keyed by the OOINet parameter name
    gross_range_qartod_test_values = load_gross_range_qartod_test_values(refdes, stream, ooinet_name)

    # Pull the fail and suspect spans out of the first matching qcConfig entry
    qcConfig = gross_range_qartod_test_values["qcConfig"].iloc[0]
    gross_range_config = qcConfig.get("qartod").get("gross_range_test")
    fail_span = gross_range_config.get("fail_span")
    suspect_span = gross_range_config.get("suspect_span")

    # Run the gross_range_test on the raw values of the dataset variable
    param_results = gross_range_test(
        inp=ds[param].values,
        fail_span=fail_span,
        suspect_span=suspect_span,
    )

    # Store the flags for this parameter
    gross_range_results[param] = param_results
    

In [13]:
# Check that gross_range_results contains the tests results
gross_range_results

{'sea_water_electrical_conductivity': masked_array(data=[1, 1, 1, ..., 1, 1, 1],
              mask=False,
        fill_value=999999,
             dtype=uint8),
 'sea_water_temperature': masked_array(data=[1, 1, 1, ..., 1, 1, 1],
              mask=False,
        fill_value=999999,
             dtype=uint8),
 'sea_water_practical_salinity': masked_array(data=[1, 1, 1, ..., 1, 1, 1],
              mask=False,
        fill_value=999999,
             dtype=uint8),
 'sea_water_pressure': masked_array(data=[1, 1, 1, ..., 1, 1, 1],
              mask=False,
        fill_value=999999,
             dtype=uint8)}

#### Climatology Test

In [14]:
# Run through all of the parameters which had the QARTOD tests applied by OOINet and
# run the climatology test locally, saving the results in a dictionary
climatology_results = {}

for param in test_parameters:
    # Get the ooinet name
    ooinet_name = test_parameters.get(param)

    # Load the climatology_qartod_test_values from gitHub
    climatology_qartod_test_values = load_climatology_qartod_test_values(refdes, ooinet_name)

    # The loader returns None when no climatology table could be downloaded
    if climatology_qartod_test_values is None:
        climatology_results.update({
            param: "Not implemented."
        })
        continue

    # Look up the fail span of THIS parameter from its gross range table.
    # Previously the `fail_span` left over from the last iteration of the
    # gross range cell was used, so every parameter was tested against the
    # fail limits of whichever parameter happened to be processed last.
    clim_gross_range_values = load_gross_range_qartod_test_values(refdes, stream, ooinet_name)
    clim_qc_config = clim_gross_range_values["qcConfig"].values[0]
    clim_fail_span = clim_qc_config.get("qartod").get("gross_range_test").get("fail_span")

    # Initialize a climatology config object
    c = ClimatologyConfig()

    # Iterate through the pressure ranges (the index of the table)
    for p_range in climatology_qartod_test_values.index:
        # Get the pressure range
        pmin, pmax = ast.literal_eval(p_range)

        # Convert the row for this pressure range into {time span: value span}
        p_values = climatology_qartod_test_values.loc[p_range].to_dict()

        # An upper bound of 0 (e.g. the range [0, 0]) means "all depths":
        # widen the upper bound to 5000
        if pmax == 0:
            pmax = 5000

        for tspan in p_values.keys():
            # Get the time span
            tstart, tend = ast.literal_eval(tspan)

            # Get the values associated with the time span
            vmin, vmax = p_values.get(tspan)

            # Add the test to the climatology config object
            c.add(tspan=[tstart, tend],
                  vspan=[vmin, vmax],
                  fspan=[clim_fail_span[0], clim_fail_span[1]],
                  zspan=[pmin, pmax],
                  period="month")

    # Run the climatology test
    param_results = climatology_test(c,
                                     inp=ds[param],
                                     tinp=ds["time"],
                                     zinp=ds["sea_water_pressure"])

    # Save the results
    climatology_results.update({
        param: param_results
    })

In [15]:
climatology_results

{'sea_water_electrical_conductivity': 'Not implemented.',
 'sea_water_temperature': masked_array(data=[1, 1, 1, ..., 1, 1, 1],
              mask=False,
        fill_value=999999,
             dtype=uint8),
 'sea_water_practical_salinity': masked_array(data=[1, 1, 1, ..., 3, 3, 3],
              mask=False,
        fill_value=999999,
             dtype=uint8),
 'sea_water_pressure': 'Not implemented.'}

### Save test results to processed data folder

In [17]:
# Turn the gross range results into a table: one column per parameter,
# with the dataset's time vector attached and used as the index
gr_df = (
    pd.DataFrame.from_dict(gross_range_results)
    .assign(time=ds.time.values)
    .set_index("time")
)
gr_df

Unnamed: 0_level_0,sea_water_electrical_conductivity,sea_water_temperature,sea_water_practical_salinity,sea_water_pressure
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-11-21 18:16:01,1,1,1,1
2013-11-21 18:16:11,1,1,1,1
2013-11-21 18:16:21,1,1,1,1
2013-11-21 18:16:31,1,1,1,1
2013-11-21 18:16:41,1,1,1,1
...,...,...,...,...
2022-11-11 12:15:01,1,1,1,1
2022-11-11 12:30:01,1,1,1,1
2022-11-11 12:45:01,1,1,1,1
2022-11-11 13:00:01,1,1,1,1


In [18]:
# convert df to xarray
# The "time" index of gr_df becomes the coordinate of the resulting Dataset
gr_ds = gr_df.to_xarray()
gr_ds 

In [19]:
# Build the path (including file name) for the gross range results in the processed data folder
gr_results_path = build_data_path(
    refdes,
    method,
    stream,
    type + '-gr-result',
    folder='processed',
)

# Write a netCDF file with the results to the processed data folder
gr_ds.to_netcdf(gr_results_path)

In [20]:
# Turn the climatology results into a table: one column per parameter,
# with the dataset's time vector attached and used as the index
climatology_df = (
    pd.DataFrame.from_dict(climatology_results)
    .assign(time=ds.time.values)
    .set_index("time")
)

# Convert the table to an xarray Dataset and display it
climatology_ds = climatology_df.to_xarray()
climatology_ds

In [21]:
# Build the path (including file name) for the climatology results in the processed data folder
clim_results_path = build_data_path(
    refdes,
    method,
    stream,
    type + '-clim-result',
    folder='processed',
)

# Write a netCDF file with the results to the processed data folder
climatology_ds.to_netcdf(clim_results_path)