# Descriptive Statistics
Next, we want to calculate the statistics of the different QARTOD flags for the different tests that are applied to the different parameters in the dataset. The example ```qartod_results_summary``` below simply counts the total number of each flag (e.g., 1, 3, 4) and its relative percentage for each test (gross range, climatology, etc.) for each parameter that the tests are applied to.

### Import modules used in this notebook

In [1]:
# Import libraries
import os
import re
import requests
import gc
import io
import ast
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings("ignore")
import sys

In [2]:
# Import function to build relative path to data files
from qartod_testing.data_processing import build_data_path

In [3]:
# Choose the reference designator, delivery method, and data stream to analyze
refdes, method, stream = (
    "CP01CNSM-MFD37-03-CTDBPD000",        # reference designator
    "recovered_inst",                     # non-decimated data from recovered instrument
    "ctdbp_cdef_instrument_recovered",    # name of data stream
)


In [None]:
# # Build path to data files with local test results and load to workspace - Might not use this
# gr_local_test_path = build_data_path(refdes,method,stream,'prod-gr-result',folder='processed')
# clim_local_test_path = build_data_path(refdes, method, stream, 'prod-clim-result', folder="processed")

# # Load local test data from .nc files
# gr_results_local = xr.open_dataset(gr_local_test_path)
# clim_results_local = xr.open_dataset(clim_local_test_path)

In [4]:
# Open the expected-results dataset stored in the interim data folder
expected_ds_path = build_data_path(refdes, method, stream, 'prod', folder='interim')
ds_expected = xr.open_dataset(expected_ds_path)

# Map every variable that has QARTOD results to its alternate parameter name,
# falling back to the variable's own name when no alternate name is recorded
test_parameters = {}
for var in ds_expected.variables:
    # Only the "<parameter>_qartod_results" variables identify tested parameters
    if "qartod_results" not in var:
        continue

    # The parameter name is everything before the "_qartod" suffix
    param = var.split("_qartod")[0]
    ooinet_name = ds_expected[param].attrs.get("alternate_parameter_name", param)
    test_parameters[param] = ooinet_name

# Display the resulting mapping
test_parameters

{'sea_water_electrical_conductivity': 'ctdbp_seawater_conductivity',
 'sea_water_temperature': 'ctdbp_seawater_temperature',
 'sea_water_practical_salinity': 'practical_salinity',
 'sea_water_pressure': 'ctdbp_seawater_pressure'}

In [5]:
def parse_qartod_executed(ds, parameters):
    """
    Parses the qartod tests for the given parameters into separate variables.

    For each parameter, the variable ``{parameter}_qartod_executed`` is cast
    to string and the character at position ``i`` of each value is copied
    into a new variable for the ``i``-th test named in that variable's
    comma-separated ``tests_executed`` attribute.

    Parameters
    ----------
    ds: xarray.DataSet
        The dataset downloaded from OOI with the QARTOD flags applied.
        New variables are added to this dataset in place, and each
        ``{parameter}_qartod_executed`` variable is replaced by its
        string-typed version.
    parameters: str or list[str]
        The name(s) of the parameters in the dataset to parse the QARTOD
        flags for. Parameters without a ``{parameter}_qartod_executed``
        variable are skipped.

    Returns
    -------
    ds: xarray.DataSet
        The same dataset with the QARTOD tests for the given parameters
        split out into new separate data variables using the naming
        convention: {parameter}_qartod_{test_name}
    """
    # Force the params into a list. A bare string must be wrapped, not passed
    # to list(), which would split it into single characters.
    if isinstance(parameters, str):
        parameters = [parameters]
    elif not isinstance(parameters, list):
        parameters = list(parameters)

    # Iterate through each parameter
    for param in parameters:
        # Generate the qartod executed name
        qartod_name = f"{param}_qartod_executed"

        # Nothing to split if the executed-flags variable is absent
        if qartod_name not in ds.variables:
            continue

        # Get the test order before the variable is replaced by its cast copy
        test_order = ds[qartod_name].attrs["tests_executed"].split(",")

        # Cast to str so the individual flags can be picked out by position
        ds[qartod_name] = ds[qartod_name].astype(str)

        # Create one variable per test; enumerate() supplies the character
        # position directly (list.index() would repeat the first position
        # for a duplicated test name)
        for test_index, test in enumerate(test_order):
            test_name = f"{param}_qartod_{test.strip()}"
            ds[test_name] = ds[qartod_name].str.get(test_index)

    return ds

In [6]:
# Collect the dataset names of the tested parameters (the dictionary keys) in a list
parameters = list(test_parameters)

In [7]:
# Split the combined QARTOD flags of every tested variable into one variable per test
results_expected = parse_qartod_executed(ds=ds_expected, parameters=parameters)

In [8]:
def qartod_results_summary(ds, params, tests):
    """
    Calculate the statistics for parameter qartod flags.

    This function takes in a list of the parameters and
    the associated QARTOD tests to calculate the number
    of each flag and the percent of the flag.

    Parameters
    ----------
    ds: xarray.DataSet
        An xarray dataset which contains the per-test flag
        variables named ``{parameter}_qartod_{test}_test``.
        Flags are compared as the strings "1", "3" and "4".
    params: str or list[str]
        The variables/parameters in the given dataset
        that have been tested with QARTOD
    tests: str or list[str]
        The QARTOD test names (without the trailing
        ``_test``) to summarize for the given parameters.

    Returns
    -------
    results: dict
        A dictionary which contains the number of each
        QARTOD flag and the percent of the total flags
        for each test applied to each parameter in the
        given dataset. Tests with no matching variable
        are left out. Flag values other than "1", "3"
        and "4" are included in the total but are not
        reported separately.

        results = {parameter:
                      {test:
                          {"total": int,
                           "good": (int, float),
                           "suspect": (int, float),
                           "fail": (int, float)}
                       },
                   }

        The float is the percentage of "total", rounded
        to two decimals (NaN when "total" is zero).
    """
    # Accept a single name as well as a list/tuple of names
    if not isinstance(params, (list, tuple)):
        params = [params]

    if not isinstance(tests, (list, tuple)):
        tests = [tests]

    # QARTOD flag value -> key used in the summary
    flag_labels = {"1": "good", "3": "suspect", "4": "fail"}

    # Initialize the result dictionary and iterate
    # through the parameters for each test
    results = {}
    for param in params:

        # Now iterate through each test
        test_results = {}
        for test in tests:

            # First, check that the test was applied
            test_name = f"{param}_qartod_{test}_test"
            if test_name not in ds.variables:
                continue

            flags = ds[test_name]

            # Count the total number of values
            total = int(flags.count())

            summary = {"total": total}
            for flag, label in flag_labels.items():
                # Number of values carrying this flag
                n_flag = int((flags == flag).sum())

                # Plain Python float so the value prints/serializes cleanly;
                # an empty variable keeps the NaN percentage
                if total:
                    percent = float(np.round(n_flag / total * 100, 2))
                else:
                    percent = float("nan")

                summary[label] = (n_flag, percent)

            test_results[test] = summary

        # Save the test results for each parameter
        results[param] = test_results

    return results

In [9]:
# Summarize the gross range and climatology flags for every tested parameter
qartod_results = qartod_results_summary(
    results_expected,
    parameters,
    ["gross_range", "climatology"],
)
qartod_results

{'sea_water_electrical_conductivity': {'gross_range': {'total': 263600,
   'good': (258753, 98.16),
   'suspect': (4846, 1.84),
   'fail': (0, 0.0)}},
 'sea_water_temperature': {'gross_range': {'total': 263600,
   'good': (258314, 97.99),
   'suspect': (5286, 2.01),
   'fail': (0, 0.0)},
  'climatology': {'total': 263600,
   'good': (231212, 87.71),
   'suspect': (32388, 12.29),
   'fail': (0, 0.0)}},
 'sea_water_practical_salinity': {'gross_range': {'total': 263600,
   'good': (258046, 97.89),
   'suspect': (5554, 2.11),
   'fail': (0, 0.0)},
  'climatology': {'total': 263600,
   'good': (242335, 91.93),
   'suspect': (21265, 8.07),
   'fail': (0, 0.0)}},
 'sea_water_pressure': {'gross_range': {'total': 263600,
   'good': (262941, 99.75),
   'suspect': (656, 0.25),
   'fail': (0, 0.0)}}}

In [10]:
# Tabulate the nested summary: one row per parameter, one column per test
qartod_flag_stats = pd.DataFrame.from_dict(data=qartod_results, orient="index")

In [11]:
# Build the output path for the 'qartod-stats' CSV and write the table to it
csv_path = build_data_path(
    refdes,
    method,
    stream,
    'qartod-stats',
    folder='processed',
    suffix='.csv',
)
qartod_flag_stats.to_csv(path_or_buf=csv_path)