# Compare local QARTOD Climatology Test Results to Expected Flags
Next, we want to calculate statistics of the QARTOD flags for each test that is applied to each parameter in the dataset. The example `qartod_results_summary` below simply counts the total number of each flag (e.g., 1, 3, 4) and its relative percentage for each test (gross range, climatology, etc.) for each parameter that the tests are applied to.

### Import modules used in this notebook

In [1]:
# Import libraries
import os
import re
import requests
import gc
import io
import ast
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings("ignore")
import sys

In [2]:
# Import function to build relative path to data files
from qartod_testing.data_processing import build_data_path

### Define reference designator for chosen variable

In [3]:
# Identify the instrument and data product to evaluate
refdes = "CP01CNSM-RID27-03-CTDBPC000"        # full reference designator
stream = "ctdbp_cdef_instrument_recovered"   # name of data stream
method = "recovered_inst"                     # non-decimated data from recovered instrument

# Deconstruct the reference designator into site, node, and sensor. Only the
# first two hyphens are treated as separators, so the sensor part keeps the
# hyphen it contains.
site, node, sensor = refdes.split("-", maxsplit=2)

### Load local QARTOD test flags from processed dataset

In [4]:
# Build the path to the file in the "processed" data folder that holds the
# locally computed climatology test results for this instrument
local_test_path = build_data_path(refdes, method, stream, 'prod-clim-result', folder="processed")

# Open the local test results from the netCDF (.nc) file as an xarray dataset;
# they are compared against the expected flags further down
local_test_results = xr.open_dataset(local_test_path)

In [5]:
local_test_results

### Extract and parse expected QC results

In [6]:
# Open the dataset with the expected QC results from the interim data folder
expected_ds_path = build_data_path(refdes, method, stream, 'prod', folder='interim')
ds_expected = xr.open_dataset(expected_ds_path)

# Map every QARTOD-tested variable in the dataset to its alternate parameter
# name, falling back to the variable's own name when no alternate is recorded
test_parameters = {}
for var in ds_expected.variables:
    if "qartod_results" not in var:
        continue

    # The tested parameter is whatever precedes the "_qartod" suffix
    param = var.split("_qartod")[0]
    ooinet_name = ds_expected[param].attrs.get("alternate_parameter_name", param)
    test_parameters[param] = ooinet_name

# Show the resulting mapping
test_parameters

{'sea_water_electrical_conductivity': 'ctdbp_seawater_conductivity',
 'sea_water_temperature': 'ctdbp_seawater_temperature',
 'sea_water_practical_salinity': 'practical_salinity',
 'sea_water_pressure': 'ctdbp_seawater_pressure'}

In [7]:
def parse_qartod_executed(ds, parameters):
    """
    Split the combined QARTOD results of each parameter into one variable per test.

    For every parameter, the variable ``{parameter}_qartod_executed`` is cast
    to strings and the comma-separated ``tests_executed`` attribute gives the
    order of the tests. The character at position ``i`` of each string is
    stored as the result of the ``i``-th test.

    Parameters
    ----------
    ds: xarray.DataSet
        The dataset downloaded from OOI with the QARTOD flags applied. It is
        modified in place: the ``*_qartod_executed`` variables are converted
        to strings and the per-test variables are added to it.
    parameters: str or iterable of str
        The name(s) of the parameters in the dataset whose QARTOD flags
        should be parsed. A single name may be passed as a plain string.

    Returns
    -------
    ds: xarray.DataSet
        The same dataset object, with the QARTOD tests for the given
        parameters split out into new separate data variables using the
        naming convention: {parameter}_qartod_{test_name}

    Raises
    ------
    KeyError
        If a ``*_qartod_executed`` variable has no ``tests_executed``
        attribute.
    """
    # Force the parameters into a list. A bare string has to be wrapped rather
    # than passed to list(), which would break it up into single characters.
    if isinstance(parameters, str):
        parameters = [parameters]
    elif not isinstance(parameters, list):
        parameters = list(parameters)

    # Iterate through each parameter
    for param in parameters:
        # Generate the qartod executed name
        qartod_name = f"{param}_qartod_executed"

        # Parameters without executed-test results are skipped silently
        if qartod_name not in ds.variables:
            continue

        # Cast the results to strings so that each test's flag can be picked
        # out by its character position
        ds[qartod_name] = ds[qartod_name].astype(str)

        # Get the test order
        test_order = ds[qartod_name].attrs["tests_executed"].split(",")

        # Create a separate variable with the results of each test. The
        # position comes from enumerate so that a repeated test name cannot
        # be resolved to the wrong character.
        for test_index, test in enumerate(test_order):
            test_name = f"{param}_qartod_{test.strip()}"
            ds[test_name] = ds[qartod_name].str.get(test_index)

    return ds

In [8]:
# Collect the names of the tested parameters (the keys of the mapping) in a list
parameters = list(test_parameters)

In [9]:
# Parse all of the variables with QARTOD tests applied into separate tests.
# parse_qartod_executed adds the new per-test variables to the dataset it is
# given and returns that same object, so results_expected and ds_expected
# refer to the same dataset.
results_expected = parse_qartod_executed(ds_expected, parameters)
results_expected

### Comparing local results of QARTOD tests to expected results 

In [10]:
# using Andrew's example:

def run_comparison(ds, param, test_results):
    """
    Compare the climatology flags stored in the dataset with locally computed ones.

    Parameters
    ----------
    ds:
        Dataset holding the expected flags in the variable
        ``{param}_qartod_climatology_test``.
    param: str
        Name of the parameter to compare.
    test_results:
        Container of locally calculated flags, indexed by parameter name.

    Returns
    -------
    The positional indices at which the two sets of flags differ, or None
    when the dataset has no climatology test for the parameter or when
    every flag matches.
    """
    expected_name = f"{param}_qartod_climatology_test"

    if expected_name in ds.variables:
        # The expected flags are compared as strings, so cast the local ones
        local_flags = test_results[param].astype(str)

        # Positions where the expected and local flags disagree
        mismatch_idx = np.where(ds[expected_name] != local_flags)[0]
        if len(mismatch_idx) != 0:
            return mismatch_idx

    return None

In [109]:
# Identify differences between the expected flags and the local test flags.
# For every parameter with mismatches, test_comparison stores the times, the
# expected flags, and the local flags at the mismatched positions.
test_comparison = {}

for param in parameters:
    print(f"Checking for mismatched QARTOD flags in {param}")
    flag_mismatch = run_comparison(results_expected, param, local_test_results)

    if flag_mismatch is not None:
        # Keep only the mismatches whose expected flag is a numeric character.
        # NOTE(review): presumably this drops placeholder characters for
        # samples where the test was not evaluated -- confirm.
        expected_flags = results_expected[f"{param}_qartod_climatology_test"]
        flag_mismatch = flag_mismatch[np.char.isnumeric(expected_flags[flag_mismatch])]

    if flag_mismatch is None or len(flag_mismatch) == 0:
        print("No mismatched values found")
        continue

    test_comparison[f"{param}_mismatched_flags"] = {
        "time": results_expected["time"][flag_mismatch].values,
        "expected flags": expected_flags[flag_mismatch].values,
        "local test flags": local_test_results[param][flag_mismatch].values,
    }

Checking for mismatched QARTOD flags in sea_water_electrical_conductivity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_temperature
Checking for mismatched QARTOD flags in sea_water_practical_salinity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_pressure
No mismatched values found


In [110]:
test_comparison

{'sea_water_temperature_mismatched_flags': {'time': array(['2015-12-04T10:31:50.000000000', '2015-12-04T10:33:10.000000000',
         '2015-12-04T10:33:20.000000000', ...,
         '2022-08-13T14:46:04.000000000', '2022-08-13T15:47:33.000000000',
         '2022-08-13T15:47:43.000000000'], dtype='datetime64[ns]'),
  'expected flags': array(['3', '3', '3', ..., '1', '1', '1'], dtype='<U1'),
  'local test flags': array([1, 1, 1, ..., 3, 3, 3], dtype=uint8)}}

Next, we'll manually create datasets to hold the results of the comparison for each parameter

In [114]:
# Gather the temperature mismatches into a dataset indexed by sample time
_temperature_flags = test_comparison["sea_water_temperature_mismatched_flags"]
temperature_mismatch = xr.Dataset(
    data_vars={
        "expected_flags": (["time"], _temperature_flags["expected flags"]),
        "local_test_flags": (["time"], _temperature_flags["local test flags"]),
    },
    coords={"time": (["time"], _temperature_flags["time"])},
)

In [115]:
temperature_mismatch

In [122]:
import matplotlib.pyplot as plt

# Draw both flag series against time. A Dataset has no default plot, so each
# variable is plotted on its own. The expected flags are stored as
# single-character strings and are cast to integers so that both series share
# one numeric axis. Both plot() and show() must actually be called; merely
# referencing them draws nothing.
temperature_mismatch["expected_flags"].astype(int).plot(label="expected flags")
temperature_mismatch["local_test_flags"].plot(label="local test flags")
plt.legend()
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

### Prepare CSV with statistics about QARTOD results

In [59]:
def qartod_results_summary(ds, params, test, test_comparison=None):
    """
    Calculate the statistics for parameter qartod flags.

    For each parameter, the variable ``{param}_qartod_{test}_test`` is
    looked up in the dataset and the number and percentage of good ("1"),
    suspect ("3") and fail ("4") flags are computed. Parameters without
    that variable are left out of the result.

    Parameters
    ----------
    ds: xarray.DataSet
        An xarray dataset which contains the per-test QARTOD flags stored
        as strings.
    params: str or iterable of str
        The variables/parameters in the given dataset that have been
        tested with QARTOD. A single name may be passed as a plain string.
    test: str
        The name of the QARTOD test to summarize, without the "_test"
        suffix (e.g. "climatology").
    test_comparison: dict, optional
        Mismatch records keyed by ``{param}_mismatched_flags``. Each record
        is a dictionary of label -> sequence; for every label the length of
        the sequence and its percentage of the total are added to the
        parameter's results.

    Returns
    -------
    results: dict
        A dictionary which contains the number of each QARTOD flag and the
        percent of the total flags for the test applied to each parameter
        in the given dataset.

        results = {'parameter':
                        {'total': int,
                         'good': (int, %),
                         'suspect': (int, %),
                         'fail': (int, %),
                         '<mismatch label>': (int, %)},
                   }
    """
    # Accept a single parameter name as well as any iterable of names
    if isinstance(params, str):
        params = [params]
    if test_comparison is None:
        test_comparison = {}

    # Initialize the result dictionary and iterate through the parameters
    results = {}
    for param in params:

        # First, check that the test was applied
        test_name = f"{param}_qartod_{test}_test"
        if test_name not in ds.variables:
            continue

        flags = ds[test_name]

        # Count the total number of values
        n = flags.count().compute().values

        # Count each flag value and express it as a percent of the total.
        # The flags are stored as strings, hence the string comparisons.
        test_results = {"total": int(n)}
        for label, flag in (("good", "1"), ("suspect", "3"), ("fail", "4")):
            count = len(np.where(flags == flag)[0])
            test_results[label] = (count, np.round(count / n * 100, 2))

        # Add the number of mismatched flags recorded for this parameter
        mismatches = test_comparison.get(f"{param}_mismatched_flags")
        if mismatches is not None:
            for label, values in mismatches.items():
                n_mismatch = len(values)
                test_results[label] = (n_mismatch, np.round(n_mismatch / n * 100, 2))

        # Save the test results for each parameter
        results[param] = test_results

    return results

In [60]:
# Summarize the climatology test flags of every tested parameter, adding the
# mismatch counts for the parameters that have an entry in test_comparison
qartod_results = qartod_results_summary(results_expected, parameters, "climatology", test_comparison)
qartod_results

{'sea_water_temperature': {'total': 3380605,
  'good': (3239079, 95.81),
  'suspect': (141523, 4.19),
  'fail': (0, 0.0),
  'time': (31051, 0.92),
  'expected flags': (31051, 0.92),
  'local test flags': (31051, 0.92)},
 'sea_water_practical_salinity': {'total': 3380605,
  'good': (3221767, 95.3),
  'suspect': (158835, 4.7),
  'fail': (0, 0.0)}}

In [116]:
# Tabulate the summary: orient='index' turns each parameter (outer key) into a
# row and each statistic (inner key) into a column
qartod_flag_stats = pd.DataFrame.from_dict(qartod_results, orient='index')
qartod_flag_stats

Unnamed: 0,total,good,suspect,fail,time,expected flags,local test flags
sea_water_temperature,3380605,"(3239079, 95.81)","(141523, 4.19)","(0, 0.0)","(31051, 0.92)","(31051, 0.92)","(31051, 0.92)"
sea_water_practical_salinity,3380605,"(3221767, 95.3)","(158835, 4.7)","(0, 0.0)",,,


In [117]:
# Save the data frame with the flag statistics as a CSV file in the
# "processed" data folder
csv_path = build_data_path(refdes, method, stream, 'qartod-clim-stats', folder='processed', suffix='.csv')
qartod_flag_stats.to_csv(csv_path)

In [119]:
# Save the dataset of temperature comparison results to a netCDF (.nc) file in
# the "processed" data folder.
# NOTE(review): mode='a' opens the file in append mode instead of replacing
# it -- confirm that keeping the contents of an earlier run is intended.
nc_path = build_data_path(refdes, method, stream, 'qartod-clim-comparison', folder='processed', suffix='.nc')
temperature_mismatch.to_netcdf(nc_path, mode='a')