# Compare local QARTOD Gross Range Test Results to Expected Flags
Next, we want to calculate the statistics of the different QARTOD flags for the different tests that are applied to the different parameters in the dataset. The example ```qartod_results_summary``` below simply counts the total number of different flags (e.g., 1, 3, 4) and their relative percentages for each test (gross range, climatology, etc.) for each parameter that the tests are applied to.

### Import modules used in this notebook

In [1]:
# Import libraries
import os
import re
import gc
import io
import ast
import glob
import requests
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings("ignore")
import sys

In [2]:
# Import function to build relative path to data files
import qartod_testing.data_processing as dp
import ooi_data_explorations.common as common

### Define reference designator for chosen variable

In [3]:
# Instrument selection: reference designator, delivery method, and data stream
refdes = "CP01CNSM-MFD37-03-CTDBPD000"
method = "recovered_inst"
stream = "ctdbp_cdef_instrument_recovered"

# Break the reference designator into site, node, and sensor. Limiting the
# split to two cuts keeps the remaining hyphen inside the sensor portion.
site, node, sensor = refdes.split('-', maxsplit=2)


### Load local QARTOD test flags from processed dataset

In [4]:
# Folder where the locally computed QARTOD test results were saved
folder_path = os.path.join(
    os.path.abspath('../data/interim'),
    method,
    stream,
    refdes,
)

# Gross range netCDF files in this folder, in alphanumeric order
# (which is also deployment order)
# TODO: filter out files with "blank" in the name in a more generalized way
local_files = sorted(glob.glob(folder_path+'/gross_range*.nc'))

In [5]:
# Open the first file in the sorted list of local gross range test results
file = local_files[0]
local_test_result = xr.open_dataset(file)

### Extract and parse expected QC results

In [6]:
# Folder holding the expected results (note: data/external, not data/interim)
folder_path = os.path.join(
    os.path.abspath('../data/external'),
    method,
    stream,
    refdes,
)

# Every netCDF file with expected results, in alphanumeric order
expected_files = sorted(glob.glob(folder_path+'/*.nc'))

In [7]:
# Open the first (alphanumerically earliest) dataset with expected test results
ds_expected = xr.open_dataset(expected_files[0])

In [8]:
# Map each QARTOD-tested dataset variable to its alternate (OOINet) parameter name
test_parameters = {}
for var in ds_expected.variables:
    # Only variables carrying QARTOD results identify a tested parameter
    if "qartod_results" not in var:
        continue

    # The parameter name is everything before the "_qartod" suffix
    param = var.split("_qartod")[0]

    # Fall back to the parameter's own name when no alternate name is recorded
    ooinet_name = ds_expected[param].attrs.get("alternate_parameter_name", param)

    test_parameters[param] = ooinet_name

# Display the resulting mapping
test_parameters

{'sea_water_electrical_conductivity': 'ctdbp_seawater_conductivity',
 'sea_water_temperature': 'ctdbp_seawater_temperature',
 'sea_water_practical_salinity': 'practical_salinity',
 'sea_water_pressure': 'ctdbp_seawater_pressure'}

In [9]:
# Parse the variables with expected QARTOD flags into more easily useable dataset
def parse_qartod_executed(ds, parameters):
    """
    Split each parameter's combined QARTOD flag string into one variable per test.

    For every parameter, the ``{parameter}_qartod_executed`` variable holds one
    flag character per executed test, and its ``tests_executed`` attribute
    lists the test names (comma separated) in the same order.

    Parameters
    ----------
    ds: xarray.DataSet
        The dataset downloaded from OOI with the QARTOD flags applied.
    parameters: str or iterable of str
        The name(s) of the parameters in the dataset to parse the QARTOD
        flags for. A single name may be given as a bare string.

    Returns
    -------
    ds: xarray.DataSet
        The same dataset object, with the QARTOD tests for the given
        parameters split out into new separate data variables using the
        naming convention: {parameter}_qartod_{test_name}
        Parameters without a ``{parameter}_qartod_executed`` variable are
        skipped.
    """
    # A bare string is a single parameter name. list("name") would split it
    # into characters, so wrap it instead; any other iterable is copied.
    if isinstance(parameters, str):
        parameters = [parameters]
    else:
        parameters = list(parameters)

    # Iterate through each parameter
    for param in parameters:
        # Generate the qartod executed name
        qartod_name = f"{param}_qartod_executed"

        if qartod_name not in ds.variables:
            continue

        # Make sure the flags are strings so they can be indexed per character
        ds[qartod_name] = ds[qartod_name].astype(str)

        # Test names in the order their flags appear in the flag string
        test_order = ds[qartod_name].attrs["tests_executed"].split(",")

        # Create a separate variable per test. enumerate gives the true
        # position even if a test name appears more than once.
        for test_index, test in enumerate(test_order):
            test_name = f"{param}_qartod_{test.strip()}"
            ds[test_name] = ds[qartod_name].str.get(test_index)

    return ds

In [10]:
# List of the tested parameter names (the keys of test_parameters)
parameters = list(test_parameters)

In [11]:
# Display the expected dataset before the combined QARTOD flags are parsed
ds_expected

In [12]:
# Parse all of the variables with QARTOD tests applied into separate tests.
# parse_qartod_executed adds the new variables to the dataset it is given and
# returns that same object, so results_expected and ds_expected refer to the
# same dataset after this cell.
results_expected = parse_qartod_executed(ds_expected, parameters)
results_expected

### Comparing local results of QARTOD tests to expected results 

In [13]:
# using Andrew's example:

def run_comparison(ds, param, test_results):
    """
    Compare the gross range flags delivered with the dataset against the
    flags calculated locally for one parameter.

    Parameters
    ----------
    ds:
        Container indexed by variable name that holds the expected flags
        under ``{param}_qartod_gross_range_test``.
    param: str
        Name of the parameter to compare.
    test_results:
        Container indexed by parameter name that holds the locally
        calculated flags; they are cast to strings before comparing.

    Returns
    -------
    numpy.ndarray or None
        Indices (along the first axis) where the two sets of flags differ,
        or None when they agree everywhere.
    """
    # Local flags are cast to strings for the comparison
    local_flags = test_results[param].astype(str)
    expected_flags = ds[f"{param}_qartod_gross_range_test"]

    # Positions where the expected and local flags disagree
    mismatch_idx = np.where(expected_flags != local_flags)[0]

    return mismatch_idx if len(mismatch_idx) > 0 else None

In [15]:
# Identify differences between the expected and locally computed gross range flags
test_comparison = dict()

for param in parameters:
    print(f"Checking for mismatched QARTOD flags in {param}")
    flag_mismatch = run_comparison(results_expected, param, local_test_result)

    if flag_mismatch is None:
        print("No mismatched values found")
        test_comparison[f"{param}"] = {
            "local test mismatch time": "None",
            "expected flags": "None",
            "local test flags": "None"
        }
    else:
        # run_comparison compares the gross range flags, so the expected flags
        # used for filtering and reporting must come from the same test
        # (this previously read the climatology test variable).
        expected_flags = results_expected[f"{param}_qartod_gross_range_test"]

        # Keep only the mismatches whose expected flag is numeric; non-numeric
        # expected flags are not treated as real mismatches.
        flag_mismatch = flag_mismatch[np.char.isnumeric(expected_flags[flag_mismatch].values)]

        if len(flag_mismatch) == 0:
            print("No mismatched values found")
            # NOTE(review): this branch records NaN while the branch above
            # records the string "None" -- confirm whether the distinction
            # is intentional before unifying the placeholders.
            test_comparison[f"{param}"] = {
                "local test mismatch time": np.nan,
                "expected flags": np.nan,
                "local test flags": np.nan
            }
        else:
            test_comparison[f"{param}"] = {
                "local test mismatch time": results_expected['time'][flag_mismatch].values,
                "expected flags": expected_flags[flag_mismatch].values,
                "local test flags": local_test_result[param][flag_mismatch].values
            }

Checking for mismatched QARTOD flags in sea_water_electrical_conductivity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_temperature
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_practical_salinity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_pressure
No mismatched values found


In [16]:
# Display the per-parameter mismatch summary
test_comparison

{'sea_water_electrical_conductivity': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_temperature': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_practical_salinity': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_pressure': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'}}

### Prepare CSV with statistics about QARTOD results

In [17]:
def qartod_results_summary(ds, params, test, test_comparison=None):
    """
    Calculate the statistics for parameter qartod flags.

    For each parameter, counts how many values of the
    ``{param}_qartod_{test}_test`` variable carry each QARTOD flag and
    what percentage of the total that is.

    Parameters
    ----------
    ds: xarray.DataSet
        An xarray dataset which contains the parsed QARTOD test
        variables, with flags stored as strings
    params: str or list[str]
        The variables/parameters in the given dataset that have been
        tested with QARTOD. A single name may be given as a bare string.
    test: str
        The QARTOD test name without the ``_test`` suffix
        (e.g. "gross_range").
    test_comparison: dict, optional
        Extra per-parameter entries, keyed by parameter name, that are
        merged into that parameter's results (e.g. the mismatch
        information from comparing local and expected flags).

    Returns
    -------
    results: dict
        A dictionary which contains the number of each QARTOD flag and
        the percent of the total flags for each parameter that has the
        requested test in the dataset. Parameters without the test
        variable are omitted.

        results = {'parameter':
                        {'total': int,
                         'good': (int, %),
                         'suspect': (int, %),
                         'fail': (int, %),
                         ...entries from test_comparison[parameter]...}
                   }
    """
    # Accept a single parameter name as well as a list/tuple of names
    if isinstance(params, (list, tuple)):
        params = list(params)
    else:
        params = [params]

    if test_comparison is None:
        test_comparison = {}

    # Result label -> QARTOD flag character (1 = good, 3 = suspect, 4 = fail)
    flag_labels = (("good", "1"), ("suspect", "3"), ("fail", "4"))

    results = {}
    for param in params:
        # First, check that the test was applied
        test_name = f"{param}_qartod_{test}_test"
        if test_name not in ds.variables:
            continue

        flags = ds[test_name]

        # Count the total number of values
        n = flags.count().compute().values

        test_results = {"total": int(n)}
        for label, flag in flag_labels:
            count = len(np.where(flags == flag)[0])
            # With no values at all the percentage is undefined
            percent = np.round(count / n * 100, 2) if n else np.nan
            test_results[label] = (count, percent)

        # Attach any comparison information supplied for this parameter
        if param in test_comparison:
            test_results.update(test_comparison[param])

        # Save the test results for each parameter
        results[param] = test_results

    return results

In [18]:
# Gross range flag counts and percentages for the first expected file,
# combined with the local-vs-expected mismatch details from test_comparison
qartod_results = qartod_results_summary(results_expected, parameters, "gross_range", test_comparison)
qartod_results

{'sea_water_electrical_conductivity': {'total': 73272,
  'good': (73272, 100.0),
  'suspect': (0, 0.0),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_temperature': {'total': 73272,
  'good': (73272, 100.0),
  'suspect': (0, 0.0),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_practical_salinity': {'total': 73272,
  'good': (71754, 97.93),
  'suspect': (1518, 2.07),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_pressure': {'total': 73272,
  'good': (73272, 100.0),
  'suspect': (0, 0.0),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'}}

In [19]:
# Open every expected data file and merge them into one full dataset
full_ds_expected = common.merge_frames(
    [xr.open_dataset(path) for path in expected_files]
)
full_ds_expected

In [22]:
# Summary of flags from merged dataset.
# An empty comparison dict is passed, so only the flag counts and percentages
# are reported (no local-vs-expected mismatch entries).
full_results_expected = parse_qartod_executed(full_ds_expected, parameters)
full_qartod_results = qartod_results_summary(full_results_expected, parameters, "gross_range", {})
full_qartod_results

{'sea_water_electrical_conductivity': {'total': 263101,
  'good': (258278, 98.17),
  'suspect': (4823, 1.83),
  'fail': (0, 0.0)},
 'sea_water_temperature': {'total': 263101,
  'good': (257817, 97.99),
  'suspect': (5284, 2.01),
  'fail': (0, 0.0)},
 'sea_water_practical_salinity': {'total': 263101,
  'good': (257601, 97.91),
  'suspect': (5500, 2.09),
  'fail': (0, 0.0)},
 'sea_water_pressure': {'total': 263101,
  'good': (262446, 99.75),
  'suspect': (655, 0.25),
  'fail': (0, 0.0)}}

In [15]:
# Tabulate the summary with one row per parameter (orient='index' turns the
# outer dictionary keys into the row index).
# NOTE(review): this uses qartod_results (the single-file summary), not
# full_qartod_results (the merged dataset) -- confirm which one is meant to be
# exported. The cell's execution count is also out of order; re-run the
# notebook from a fresh kernel before relying on the displayed output.
qartod_flag_stats = pd.DataFrame.from_dict(qartod_results, orient='index')
qartod_flag_stats

Unnamed: 0,total,good,suspect,fail,local test mismatch time,expected flags,local test flags
sea_water_electrical_conductivity,3380605,"(3380600, 100.0)","(0, 0.0)","(0, 0.0)","[2014-02-17T07:41:51.000000000, 2016-04-02T00:...","[B, B, B, B, B]","[1, 1, 1, 1, 1]"
sea_water_temperature,3380605,"(3380602, 100.0)","(0, 0.0)","(0, 0.0)","[2014-02-17T08:30:51.000000000, 2018-03-29T19:...","[B, B, B]","[1, 1, 1]"
sea_water_practical_salinity,3380605,"(3368511, 99.64)","(12091, 0.36)","(0, 0.0)","[2014-02-17T07:26:00.000000000, 2018-03-29T01:...","[B, B, B]","[1, 1, 1]"
sea_water_pressure,3380605,"(3330378, 98.51)","(50222, 1.49)","(0, 0.0)","[2014-02-17T12:50:29.000000000, 2018-03-29T17:...","[B, B, B, B, B]","[1, 1, 1, 1, 1]"


In [16]:
# Build the output path for the statistics CSV and write the table to it.
# NOTE(review): dp.build_data_path presumably returns a path under the
# 'processed' data folder for this refdes/method/stream -- confirm against
# qartod_testing.data_processing.
csv_path = dp.build_data_path(refdes, method, stream, 'qartod-gr-stats', folder='processed', suffix='.csv')
qartod_flag_stats.to_csv(csv_path)