# Compare local QARTOD Gross Range Test Results to Expected Flags
Next, we want to calculate statistics for the QARTOD flags produced by each test applied to each parameter in the dataset. The example `qartod_results_summary` below simply counts the total number of each flag value (e.g., 1, 3, 4) and its relative percentage for each test (gross range, climatology, etc.) and for each parameter that the tests are applied to.

### Import modules used in this notebook

In [1]:
# Import libraries
import os
import re
import gc
import io
import ast
import glob
import requests
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings("ignore")
import sys

In [2]:
# Import function to build relative path to data files
import qartod_testing.data_processing as dp
import ooi_data_explorations.common as common

### Define reference designator for chosen variable

In [3]:
# Identify the instrument whose QARTOD results are evaluated in this notebook
stream = "ctdbp_cdef_instrument_recovered"  # data stream
method = "recovered_inst"                   # data delivery method
refdes = "CP01CNSM-MFD37-03-CTDBPD000"      # reference designator

### Grab file names for locally evaluated QARTOD test flags

In [4]:
# Build the path to the folder where the locally evaluated QARTOD test results were saved
folder_path = os.path.join(os.path.abspath('../data/interim'), method, stream, refdes)

# Collect the gross range netCDF files in that folder. The pattern is joined with
# os.path.join so the path separator is correct on every platform, and the list is
# sorted alphanumerically (which the original notes is also deployment order).
local_files = sorted(glob.glob(os.path.join(folder_path, 'gross_range*.nc')))

### Extract and parse expected QC results

In [6]:
# Build the path to the external data folder that holds the expected QARTOD results
folder_path = os.path.join(os.path.abspath('../data/external'), method, stream, refdes)

# Collect the expected-results netCDF files, sorted in alphanumeric order so they can be
# paired by position with the sorted local test files
expected_files = sorted(glob.glob(os.path.join(folder_path, '*.nc')))

In [None]:
# Open the first expected-results file (expected_files is sorted alphanumerically)
ds_expected = xr.open_dataset(expected_files[0])

In [8]:
# Build a dictionary that maps each dataset variable name to its alternate parameter name
# (see the output below for the mapping returned for this dataset)
test_parameters = dp.get_test_parameters(ds_expected)
# Display the mapping
test_parameters

{'sea_water_electrical_conductivity': 'ctdbp_seawater_conductivity',
 'sea_water_temperature': 'ctdbp_seawater_temperature',
 'sea_water_practical_salinity': 'practical_salinity',
 'sea_water_pressure': 'ctdbp_seawater_pressure'}

In [9]:
# Collect the dataset variable names (the dictionary keys) into a list
parameters = list(test_parameters)

In [10]:
# Parse the QARTOD results for every listed parameter into separate per-test variables
# (later cells read them as "<parameter>_qartod_<test name>_test")
results_expected = dp.parse_qartod_executed(ds_expected, parameters)
# Display the parsed dataset
results_expected

### Comparing local results of QARTOD tests to expected results 

In [14]:
# Identify differences between the locally computed gross range flags and the expected flags.

# Open the local test results that correspond to ds_expected. This name was previously
# never defined in the notebook, so the cell failed on a fresh kernel.
# NOTE(review): assumes the sorted local and expected file lists pair up by position,
# the same pairing used by local_files[m] / expected_files[m] further down - confirm.
local_test_result = xr.open_dataset(local_files[0])

# One entry per parameter: times of mismatched flags plus the expected and local flag values
test_comparison = {}

for param in parameters:
    print("Checking for mismatched QARTOD flags in "f"{param}")
    flag_mismatch = dp.run_comparison(results_expected, param, local_test_result)

    if flag_mismatch is None:
        print("No mismatched values found")
        test_comparison[param] = {
            "local test mismatch time": "None",
            "expected flags": "None",
            "local test flags": "None"
        }
        continue

    # Expected flags for the gross range test (this notebook evaluates gross range,
    # not climatology, so the gross range variable must be read here)
    expected_flags = results_expected[f"{param}_qartod_gross_range_test"]

    # Keep only the mismatches whose expected flag is numeric
    flag_mismatch = flag_mismatch[np.char.isnumeric(expected_flags[flag_mismatch])]

    if len(flag_mismatch) == 0:
        print("No mismatched values found")
        # NOTE(review): this branch records NaN while the branch above records the
        # string "None"; both are kept as-is so saved results do not change.
        test_comparison[param] = {
            "local test mismatch time": np.nan,
            "expected flags": np.nan,
            "local test flags": np.nan
        }
    else:
        test_comparison[param] = {
            "local test mismatch time": results_expected['time'][flag_mismatch].values,
            "expected flags": expected_flags[flag_mismatch].values,
            "local test flags": local_test_result[param][flag_mismatch].values
        }

Checking for mismatched QARTOD flags in sea_water_electrical_conductivity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_temperature
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_practical_salinity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_pressure
No mismatched values found


In [15]:
# Display the per-parameter comparison results
test_comparison

{'sea_water_electrical_conductivity': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_temperature': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_practical_salinity': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_pressure': {'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'}}

### Prepare CSV with statistics about QARTOD results

In [17]:
# Summarize the gross range flags for the single expected-results file, passing along the
# comparison results gathered above
qartod_results = dp.qartod_results_summary(
    results_expected,
    parameters,
    "gross_range",
    test_comparison,
)
qartod_results

{'sea_water_electrical_conductivity': {'total': 73272,
  'good': (73272, 100.0),
  'suspect': (0, 0.0),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_temperature': {'total': 73272,
  'good': (73272, 100.0),
  'suspect': (0, 0.0),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_practical_salinity': {'total': 73272,
  'good': (71754, 97.93),
  'suspect': (1518, 2.07),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'},
 'sea_water_pressure': {'total': 73272,
  'good': (73272, 100.0),
  'suspect': (0, 0.0),
  'fail': (0, 0.0),
  'local test mismatch time': 'None',
  'expected flags': 'None',
  'local test flags': 'None'}}

In [19]:
# Load every expected-results file and merge them into one dataset spanning all files
full_ds_expected = common.merge_frames(
    [xr.open_dataset(expected_path) for expected_path in expected_files]
)
full_ds_expected

In [22]:
# Summary of flags from the merged dataset.
# parse_qartod_executed lives in the data_processing module (imported as dp); calling it
# without the module prefix raises NameError on a fresh kernel.
full_results_expected = dp.parse_qartod_executed(full_ds_expected, parameters)

# No local comparison is available for the merged record, so an empty dict is passed
full_qartod_results = dp.qartod_results_summary(full_results_expected, parameters, "gross_range", {})
full_qartod_results

{'sea_water_electrical_conductivity': {'total': 263101,
  'good': (258278, 98.17),
  'suspect': (4823, 1.83),
  'fail': (0, 0.0)},
 'sea_water_temperature': {'total': 263101,
  'good': (257817, 97.99),
  'suspect': (5284, 2.01),
  'fail': (0, 0.0)},
 'sea_water_practical_salinity': {'total': 263101,
  'good': (257601, 97.91),
  'suspect': (5500, 2.09),
  'fail': (0, 0.0)},
 'sea_water_pressure': {'total': 263101,
  'good': (262446, 99.75),
  'suspect': (655, 0.25),
  'fail': (0, 0.0)}}

In [15]:
# Convert the summary dictionary to a table with one row per parameter
# NOTE(review): the saved output under this cell reports totals and mismatches that differ
# from the qartod_results shown earlier, so it appears to come from a different run -
# re-run the notebook top to bottom to refresh it.
qartod_flag_stats = pd.DataFrame.from_dict(qartod_results, orient='index')
qartod_flag_stats

Unnamed: 0,total,good,suspect,fail,local test mismatch time,expected flags,local test flags
sea_water_electrical_conductivity,3380605,"(3380600, 100.0)","(0, 0.0)","(0, 0.0)","[2014-02-17T07:41:51.000000000, 2016-04-02T00:...","[B, B, B, B, B]","[1, 1, 1, 1, 1]"
sea_water_temperature,3380605,"(3380602, 100.0)","(0, 0.0)","(0, 0.0)","[2014-02-17T08:30:51.000000000, 2018-03-29T19:...","[B, B, B]","[1, 1, 1]"
sea_water_practical_salinity,3380605,"(3368511, 99.64)","(12091, 0.36)","(0, 0.0)","[2014-02-17T07:26:00.000000000, 2018-03-29T01:...","[B, B, B]","[1, 1, 1]"
sea_water_pressure,3380605,"(3330378, 98.51)","(50222, 1.49)","(0, 0.0)","[2014-02-17T12:50:29.000000000, 2018-03-29T17:...","[B, B, B, B, B]","[1, 1, 1, 1, 1]"


In [16]:
# Build the output path for the gross range statistics and write the table there as CSV
csv_path = dp.build_data_path(
    refdes,
    method,
    stream,
    'qartod-gr-stats',
    folder='processed',
    suffix='.csv',
)
qartod_flag_stats.to_csv(csv_path)

### Create data frames for statistics and mismatched flags by deployment

In [7]:
# Start with empty containers: one for mismatched flags, one for flag statistics.
# Both are filled per deployment by the cells that follow.
mismatch, statistics = {}, {}

In [60]:
# Select one deployment's pair of files by position in the sorted file lists.
# TODO: this cell and the ones after it look like the body of a per-deployment loop that
# is currently stepped through by hand (change m and re-run); wrap them in
# `for m in range(len(expected_files)):` once the workflow is settled.
m = 5
local_file = local_files[m]
expected_file = expected_files[m]

# Get the deployment number from the current file name. Only the base name is searched
# so a directory in the absolute path cannot match the pattern first; all local files
# share one folder, so the deployment number can only come from the file name.
deployment_match = re.search('00[0-2][0-9]', os.path.basename(local_file))
if deployment_match is None:
    raise ValueError(f"Could not find a deployment number in file name: {local_file}")
deployment = deployment_match.group(0)[-2:]

# Open the local test and expected test datasets
local_ds = xr.open_dataset(local_file)
expected_ds = xr.open_dataset(expected_file)

In [61]:
# Find the variables in the expected dataset that have QARTOD tests executed
test_parameters = dp.get_test_parameters(expected_ds)
parameters = list(test_parameters)

# Split the expected QARTOD flags into one variable per test. expected_ds is rebound to
# the parsed result, so reopen the file before running this cell a second time.
expected_ds = dp.parse_qartod_executed(expected_ds, parameters)

In [36]:
# Record, for the current deployment, where local and expected gross range flags disagree
mismatch[f"{deployment}"] = {}
for param in parameters:
    # Placeholder entry for this parameter; filled in or replaced below
    mismatch[f"{deployment}"][f"{param}"] = {}

    # Compare the local test flags with the expected test flags
    print("Checking for mismatched QARTOD flags in "f"{param}")
    flag_mismatch = dp.run_comparison(expected_ds, param, local_ds)

    if flag_mismatch is not None:
        # Store the times, both sets of flags and the source file of the mismatches
        mismatch[f"{deployment}"][f"{param}"].update(
            datetimes=expected_ds['time'][flag_mismatch].values,
            expected_flags=expected_ds[f"{param}_qartod_gross_range_test"][flag_mismatch].values,
            local_flags=local_ds[param][flag_mismatch].values,
            file_name=f"{expected_file}",
        )
        continue

    # No comparison result: mark the parameter with NaN
    print("No mismatched values found")
    mismatch[f"{deployment}"][f"{param}"] = np.nan
       

Checking for mismatched QARTOD flags in sea_water_electrical_conductivity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_temperature
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_practical_salinity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_pressure
No mismatched values found


In [37]:
# Convert the nested dictionary to a table: one row per deployment, one column per parameter
# NOTE: this rebinds `mismatch` from a dict to a DataFrame, so run it only after all
# deployments have been added.
mismatch = pd.DataFrame.from_dict(mismatch, orient='index')
# mismatch2 = mismatch.set_index([deployment], [parameters])
mismatch

Unnamed: 0,sea_water_electrical_conductivity,sea_water_temperature,sea_water_practical_salinity,sea_water_pressure
1,,,,
3,,,,
5,,,,
6,,,,
7,,,,
8,,,,


In [38]:
# Summarize the expected gross range flags for the current deployment
print("Evaluating statistics on QARTOD flags for deployment "f"{deployment}")
summary_results = dp.qartod_results_summary(expected_ds, parameters, "gross_range")

# Store the summary under the deployment number
statistics[f"{deployment}"] = summary_results

Evaluating statistics on QARTOD flags for deployment 08


In [39]:
# Once the last expected file has been processed, add a summary for the full data record
if expected_file == expected_files[-1]:
    # Open every expected file, merge them into one dataset and parse the QARTOD flags
    expected_all_time = dp.parse_qartod_executed(
        common.merge_frames([xr.open_dataset(expected_path) for expected_path in expected_files]),
        parameters,
    )

    # Summarize the flags of the merged dataset under the "all time" key
    summary_results = dp.qartod_results_summary(expected_all_time, parameters, "gross_range")
    statistics["all time"] = summary_results

In [40]:
# Turn the statistics dictionary into a table (one row per entry) and display it.
# `statistics` is rebound from a dict to a DataFrame here, so run this cell only after all
# deployments have been added.
statistics = pd.DataFrame.from_dict(statistics, orient="index")
statistics

Unnamed: 0,sea_water_electrical_conductivity,sea_water_temperature,sea_water_practical_salinity,sea_water_pressure
1,"{'total': 73272, 'good': (73272, 100.0), 'susp...","{'total': 73272, 'good': (73272, 100.0), 'susp...","{'total': 73272, 'good': (71754, 97.93), 'susp...","{'total': 73272, 'good': (73272, 100.0), 'susp..."
3,"{'total': 16233, 'good': (16233, 100.0), 'susp...","{'total': 16233, 'good': (16233, 100.0), 'susp...","{'total': 16233, 'good': (16232, 99.99), 'susp...","{'total': 16233, 'good': (16232, 99.99), 'susp..."
5,"{'total': 7753, 'good': (7753, 100.0), 'suspec...","{'total': 7753, 'good': (7753, 100.0), 'suspec...","{'total': 7753, 'good': (7753, 100.0), 'suspec...","{'total': 7753, 'good': (7753, 100.0), 'suspec..."
6,"{'total': 7983, 'good': (7298, 91.42), 'suspec...","{'total': 7983, 'good': (7038, 88.16), 'suspec...","{'total': 7983, 'good': (7983, 100.0), 'suspec...","{'total': 7983, 'good': (7983, 100.0), 'suspec..."
7,"{'total': 7743, 'good': (7696, 99.39), 'suspec...","{'total': 7743, 'good': (7743, 100.0), 'suspec...","{'total': 7743, 'good': (7739, 99.95), 'suspec...","{'total': 7743, 'good': (7743, 100.0), 'suspec..."
8,"{'total': 14518, 'good': (14416, 99.3), 'suspe...","{'total': 14518, 'good': (14379, 99.04), 'susp...","{'total': 14518, 'good': (14384, 99.08), 'susp...","{'total': 14518, 'good': (14517, 99.99), 'susp..."


In [46]:
# Alternative layout for the statistics table, meant to be easier to read at a glance
# (or at least to edit in Excel)
statistics2 = dict()

In [62]:
# Add the expanded summary for the current deployment
summary_results = dp.qartod_summary_expanded(expected_ds, parameters, "gross_range")
statistics2[f"{deployment}"] = summary_results

# After the last expected file, also add a summary for the full data record
if expected_file == expected_files[-1]:
    # Open every expected file, merge them into one dataset and parse the QARTOD flags
    expected_all_time = dp.parse_qartod_executed(
        common.merge_frames([xr.open_dataset(expected_path) for expected_path in expected_files]),
        parameters,
    )

    # Summarize the flags of the merged dataset under the "all time" key
    summary_results = dp.qartod_summary_expanded(expected_all_time, parameters, "gross_range", {})
    statistics2["all time"] = summary_results

In [63]:
# Turn the expanded statistics dictionary into a table (one row per entry) and display it.
# `statistics2` is rebound from a dict to a DataFrame here.
statistics2 = pd.DataFrame.from_dict(statistics2, orient="index")
statistics2

Unnamed: 0,sea_water_electrical_conductivity total,sea_water_electrical_conductivity good,sea_water_electrical_conductivity suspect,sea_water_electrical_conductivity fail,sea_water_temperature total,sea_water_temperature good,sea_water_temperature suspect,sea_water_temperature fail,sea_water_practical_salinity total,sea_water_practical_salinity good,sea_water_practical_salinity suspect,sea_water_practical_salinity fail,sea_water_pressure total,sea_water_pressure good,sea_water_pressure suspect,sea_water_pressure fail
1,73272,"(73272, 100.0)","(0, 0.0)","(0, 0.0)",73272,"(73272, 100.0)","(0, 0.0)","(0, 0.0)",73272,"(71754, 97.93)","(1518, 2.07)","(0, 0.0)",73272,"(73272, 100.0)","(0, 0.0)","(0, 0.0)"
3,16233,"(16233, 100.0)","(0, 0.0)","(0, 0.0)",16233,"(16233, 100.0)","(0, 0.0)","(0, 0.0)",16233,"(16232, 99.99)","(1, 0.01)","(0, 0.0)",16233,"(16232, 99.99)","(1, 0.01)","(0, 0.0)"
5,7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)",7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)",7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)",7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)"
6,7983,"(7298, 91.42)","(685, 8.58)","(0, 0.0)",7983,"(7038, 88.16)","(945, 11.84)","(0, 0.0)",7983,"(7983, 100.0)","(0, 0.0)","(0, 0.0)",7983,"(7983, 100.0)","(0, 0.0)","(0, 0.0)"
7,7743,"(7696, 99.39)","(47, 0.61)","(0, 0.0)",7743,"(7743, 100.0)","(0, 0.0)","(0, 0.0)",7743,"(7739, 99.95)","(4, 0.05)","(0, 0.0)",7743,"(7743, 100.0)","(0, 0.0)","(0, 0.0)"
8,14518,"(14416, 99.3)","(102, 0.7)","(0, 0.0)",14518,"(14379, 99.04)","(139, 0.96)","(0, 0.0)",14518,"(14384, 99.08)","(134, 0.92)","(0, 0.0)",14518,"(14517, 99.99)","(1, 0.01)","(0, 0.0)"


In [64]:
# Save the mismatch and statistics tables as CSV files in the processed data folder
folder_path = os.path.join(os.path.abspath('../data/processed'), method, stream, refdes)
os.makedirs(folder_path, exist_ok=True)  # create the folder tree if it does not exist yet

# File paths are joined with os.path.join so the separator is correct on every platform;
# missing values are written out explicitly as "NaN".
mismatch.to_csv(os.path.join(folder_path, f"gross_range-{refdes}-mismatched_flags.csv"), na_rep='NaN')
statistics.to_csv(os.path.join(folder_path, f"gross_range-{refdes}-flag_statistics1.csv"), na_rep='NaN')
statistics2.to_csv(os.path.join(folder_path, f"gross_range-{refdes}-flag_statistics2.csv"), na_rep='NaN')