# Compare local QARTOD Climatology Test Results to Expected Flags
Next, we want to calculate the statistics of the different QARTOD flags for the different tests that are applied to the different parameters in the dataset. The example ```qartod_results_summary``` below simply counts the total number of different flags (e.g 1, 3, 4) and their relative percentages for each test (gross range, climatology, etc) for each parameter that the tests area applied to. 

### Import modules used in this notebook

In [1]:
# Import libraries
import os
import re
# import gc
import pandas as pd
import numpy as np
import xarray as xr
import warnings
warnings.filterwarnings("ignore")
import glob

In [2]:
# Import OOI-related modules
from qartod_testing.qc_flag_statistics import get_test_parameters, qartod_summary_expanded, parse_qartod_executed
from qartod_testing.local_qc_test import get_mismatched_flags
from ooi_data_explorations.common import merge_frames

### Define reference designator for chosen variable

In [3]:
# Set reference designator, data stream, and method 
method = "recovered_inst"                       
stream = 'ctdmo_ghqr_instrument_recovered'
refdes = "GI03FLMB-RIM01-02-CTDMOG064"             

### Load local QARTOD test flags from processed dataset

In [4]:
# build path to folder where data was saved
folder_path = os.path.join(os.path.abspath('../data/interim'), method, stream, refdes)

# retrieve list of netCDF files in this directory
local_files = glob.glob(folder_path+'/climatology*.nc')
# files = [file for file in # I started trying to remove files with blank in the name with more generalized way and ran out of time
local_files.sort()

### Extract and parse expected QC results

In [4]:
# Load expected results data from external data folder
# build path to folder where data was saved
folder_path = os.path.join(os.path.abspath('../data/external'), method, stream, refdes)

# retrieve list of netCDF files in this directory
expected_files = glob.glob(folder_path+'/*.nc')
expected_files.sort()

### Comparing local results of QARTOD tests to expected results 

In [21]:
# Identify differences in the results
test_comparison = dict()

for index, param in enumerate(parameters):
    print("Checking for mismatched QARTOD flags in "f"{param}")
    flag_mismatch = dp.run_comparison(results_expected, param, local_test_results)

    if flag_mismatch is None:
        print("No mismatched values found")
        pass
    else:  
        flag_mismatch = flag_mismatch[np.char.isnumeric(results_expected[f"{param}_qartod_climatology_test"][flag_mismatch])] 

        if len(flag_mismatch) == 0:
            print("No mismatched values found")
            pass
        else:
           test_comparison.update({f"{param}_mismatched_flags":{
                    "time": results_expected['time'][flag_mismatch].values,
                    "expected flags": results_expected[f"{param}_qartod_climatology_test"][flag_mismatch].values,
                    "local test flags": local_test_results[param][flag_mismatch].values
                }
            })

Checking for mismatched QARTOD flags in sea_water_electrical_conductivity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_temperature
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_practical_salinity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_pressure
No mismatched values found


In [22]:
test_comparison

{}

Next, we'll manually create datasets to hold the results of the comparison for each parameter

In [15]:
temperature_mismatch = xr.Dataset(data_vars=dict(expected_flags=(["time"], test_comparison['sea_water_temperature_mismatched_flags']['expected flags']),
                                                 local_test_flags=(["time"], test_comparison['sea_water_temperature_mismatched_flags']['local test flags'])
                                                 ),
                                    coords=dict(time=(["time"], test_comparison['sea_water_temperature_mismatched_flags']['time']))
                                )

KeyError: 'sea_water_temperature_mismatched_flags'

In [115]:
temperature_mismatch

In [122]:
import matplotlib.pyplot as plt
temperature_mismatch.plot
plt.show

ModuleNotFoundError: No module named 'matplotlib'

### Prepare CSV with statistics about QARTOD results

In [6]:
# Initialize empty dictionaries for comparison and statistics
mismatch = {}
statistics = {}

In [46]:
# for m in enumerate(expected_files):
m = 13
local_file = local_files[m]
expected_file = expected_files[m]

# get deployment from current file, then open local test and expected test datasets
deployment = re.findall('00[0-2][0-9]', local_file)[0][-2:]
local_ds = xr.open_dataset(local_file)
expected_ds = xr.open_dataset(expected_file)

In [47]:
# Get parameters that have QARTOD executed from expected test dataset
test_parameters = get_test_parameters(expected_ds)
parameters = list(test_parameters.keys())

# Separate QARTOD test flags in expected test dataset by QARTOD test name
expected_ds = parse_qartod_executed(expected_ds, parameters)

In [48]:
# update dictionary with mismatched flags for current deployment
mismatch_results = get_mismatched_flags(expected_ds, local_ds, parameters, deployment, 'gross_range')
mismatch.update({ f"{m}" : mismatch_results })

Checking for mismatched QARTOD flags in sea_water_electrical_conductivity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_temperature
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_practical_salinity
No mismatched values found
Checking for mismatched QARTOD flags in sea_water_pressure
No mismatched values found


In [49]:
mismatch = pd.DataFrame.from_dict(mismatch, orient='index')
mismatch = mismatch.set_index("deployment")
mismatch

Unnamed: 0_level_0,sea_water_electrical_conductivity,sea_water_temperature,sea_water_practical_salinity,sea_water_pressure
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,,
3,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,
10,,,,
11,,,,
12,,,,


In [93]:
# Update summary statistics dictionary for each deployment, then for all deployments
print("Evaluating statistics on QARTOD flags for deployment "f"{deployment}")
summary_results = qartod_summary_expanded(expected_ds, parameters, deployment, "gross_range")
statistics.update({f"{m}" : summary_results })

# Add entry to summary statistics for full data record after last file
if expected_file == expected_files[-1]:
    # Open all expected data files and create merged full dataset
    expected_all_time = [xr.open_dataset(file) for file in expected_files]
    expected_all_time = merge_frames(expected_all_time)
    deployment = "all"

    # Summary of flags from merged dataset
    print("Evaluating statistics on QARTOD flags for all deployments")
    expected_all_time = parse_qartod_executed(expected_all_time, parameters)
    summary_results = qartod_summary_expanded(expected_all_time, parameters, deployment, "gross_range")
    statistics.update({ "all" : summary_results })

Evaluating statistics on QARTOD flags for deployment 16
Evaluating statistics on QARTOD flags for all deployments


In [95]:
# Create data frame from dictionary and check contents
statistics = pd.DataFrame.from_dict(statistics, orient='index')
statistics = statistics.set_index('deployment')
statistics

Unnamed: 0_level_0,sea_water_electrical_conductivity total,conductivity good,conductivity suspect,conductivity fail,sea_water_temperature total,temperature good,temperature suspect,temperature fail,sea_water_practical_salinity total,salinity good,salinity suspect,salinity fail,sea_water_pressure total,pressure good,pressure suspect,pressure fail
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
01,73272,"(73272, 100.0)","(0, 0.0)","(0, 0.0)",73272,"(73272, 100.0)","(0, 0.0)","(0, 0.0)",73272,"(71754, 97.93)","(1518, 2.07)","(0, 0.0)",73272,"(73272, 100.0)","(0, 0.0)","(0, 0.0)"
03,16233,"(16233, 100.0)","(0, 0.0)","(0, 0.0)",16233,"(16233, 100.0)","(0, 0.0)","(0, 0.0)",16233,"(16232, 99.99)","(1, 0.01)","(0, 0.0)",16233,"(16232, 99.99)","(1, 0.01)","(0, 0.0)"
05,7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)",7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)",7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)",7753,"(7753, 100.0)","(0, 0.0)","(0, 0.0)"
06,7983,"(7298, 91.42)","(685, 8.58)","(0, 0.0)",7983,"(7038, 88.16)","(945, 11.84)","(0, 0.0)",7983,"(7983, 100.0)","(0, 0.0)","(0, 0.0)",7983,"(7983, 100.0)","(0, 0.0)","(0, 0.0)"
07,7743,"(7696, 99.39)","(47, 0.61)","(0, 0.0)",7743,"(7743, 100.0)","(0, 0.0)","(0, 0.0)",7743,"(7739, 99.95)","(4, 0.05)","(0, 0.0)",7743,"(7743, 100.0)","(0, 0.0)","(0, 0.0)"
08,14518,"(14416, 99.3)","(102, 0.7)","(0, 0.0)",14518,"(14379, 99.04)","(139, 0.96)","(0, 0.0)",14518,"(14384, 99.08)","(134, 0.92)","(0, 0.0)",14518,"(14517, 99.99)","(1, 0.01)","(0, 0.0)"
09,20988,"(20099, 95.76)","(889, 4.24)","(0, 0.0)",20988,"(20219, 96.34)","(769, 3.66)","(0, 0.0)",20988,"(20069, 95.62)","(919, 4.38)","(0, 0.0)",20988,"(20988, 100.0)","(0, 0.0)","(0, 0.0)"
10,1466,"(1466, 100.0)","(0, 0.0)","(0, 0.0)",1466,"(1466, 100.0)","(0, 0.0)","(0, 0.0)",1466,"(1466, 100.0)","(0, 0.0)","(0, 0.0)",1466,"(1466, 100.0)","(0, 0.0)","(0, 0.0)"
11,16618,"(16615, 99.98)","(3, 0.02)","(0, 0.0)",16618,"(16618, 100.0)","(0, 0.0)","(0, 0.0)",16618,"(16542, 99.54)","(76, 0.46)","(0, 0.0)",16618,"(16225, 97.64)","(393, 2.36)","(0, 0.0)"
12,28266,"(26480, 93.68)","(1786, 6.32)","(0, 0.0)",28266,"(26208, 92.72)","(2058, 7.28)","(0, 0.0)",28266,"(26784, 94.76)","(1482, 5.24)","(0, 0.0)",28266,"(28265, 100.0)","(1, 0.0)","(0, 0.0)"


In [51]:
# Save data frames as CSVs
folder_path = os.path.join(os.path.abspath('../data/processed'), method, stream, refdes)
os.makedirs(folder_path, exist_ok=True)
mismatch.to_csv(folder_path+f"/gross_range-{refdes}-mismatched_flags.csv", na_rep='NaN', mode='a')
# statistics.to_csv(folder_path+f"/gross_range-{refdes}-flag_statistics.csv", na_rep='NaN', mode='a')

### Loop through files to create statistics and validation CSVs

In [5]:
# Initialize empty dictionaries for comparison and statistics
mismatch = {}
statistics = {}

In [6]:
for m, _ in enumerate(expected_files):
    # local_file = local_files[m]
    expected_file = expected_files[m]

    # get deployment from current file, then open local test and expected test datasets
    deployment = re.findall('deployment00[0-2][0-9]', expected_file)[0][-2:]
    # local_ds = xr.open_dataset(local_file)
    expected_ds = xr.open_dataset(expected_file)
    
    # Get parameters that have QARTOD executed from expected test dataset
    test_parameters = get_test_parameters(expected_ds)
    parameters = list(test_parameters.keys())

    # Separate QARTOD test flags in expected test dataset by QARTOD test name
    expected_ds = dp.parse_qartod_executed(expected_ds, parameters)
    
    # # update dictionary with mismatched flags for current deployment
    # mismatch_results = get_mismatched_flags(expected_ds, local_ds, parameters, deployment, 'climatology', expected_file)
    # mismatch.update({ f"{m}" : mismatch_results })
    
    # Update summary statistics dictionary for each deployment, then for all deployments
    print("Evaluating statistics on QARTOD flags for deployment "f"{deployment}")
    summary_results = qartod_summary_expanded(expected_ds, parameters, deployment, "climatology")
    statistics.update({f"{m}" : summary_results })

    # Add entry to summary statistics for full data record after last file
    if expected_file == expected_files[-1]:
        # Open all expected data files and create merged full dataset
        expected_all_time = [xr.open_dataset(file) for file in expected_files]
        expected_all_time = merge_frames(expected_all_time)
        deployment = "all"

        # Summary of flags from merged dataset
        print("Evaluating statistics on QARTOD flags for all deployments")
        expected_all_time = parse_qartod_executed(expected_all_time, parameters)
        summary_results = qartod_summary_expanded(expected_all_time, parameters, deployment, "climatology")
        statistics.update({ "all" : summary_results })   


Evaluating statistics on QARTOD flags for deployment 01
Evaluating statistics on QARTOD flags for deployment 02
Evaluating statistics on QARTOD flags for deployment 03
Evaluating statistics on QARTOD flags for deployment 04
Evaluating statistics on QARTOD flags for deployment 05
Evaluating statistics on QARTOD flags for deployment 06
Evaluating statistics on QARTOD flags for deployment 07
Evaluating statistics on QARTOD flags for deployment 08
Evaluating statistics on QARTOD flags for all deployments


In [8]:
# convert dictionary to dataframe for flag comparison results and check contents
mismatch = pd.DataFrame.from_dict(mismatch, orient='index')
mismatch = mismatch.set_index("deployment")
mismatch

Unnamed: 0_level_0,sea_water_electrical_conductivity,sea_water_temperature,sea_water_practical_salinity,sea_water_pressure
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,,
4,,{'total': '6297 (2%)'},,
5,,,,
6,,,,
7,,{'total': '119 (1%)'},,
8,,{'total': '1021 (0%)'},,
9,,,,
10,,{'total': '4910 (2%)'},,
11,,{'total': '246 (0%)'},,
12,,{'total': '1708 (0%)'},,


In [7]:
# Create data frame from dictionary for statistics and check contents
statistics = pd.DataFrame.from_dict(statistics, orient='index')
statistics = statistics.set_index('deployment')
statistics

Unnamed: 0_level_0,sea_water_electrical_conductivity total,conductivity good,conductivity good %,conductivity suspect,conductivity suspect %,conductivity fail,conductivity fail %,sea_water_pressure total,pressure good,pressure good %,...,temperature suspect %,temperature fail,temperature fail %,sea_water_practical_salinity total,salinity good,salinity good %,salinity suspect,salinity suspect %,salinity fail,salinity fail %
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01,,,,,,,,,,,...,0.27,0,0.0,32440,32439,100.0,1,0.0,0,0.0
02,,,,,,,,,,,...,1.36,0,0.0,31835,31584,99.21,251,0.79,0,0.0
03,,,,,,,,,,,...,0.38,0,0.0,37801,37800,100.0,1,0.0,0,0.0
04,,,,,,,,,,,...,0.72,0,0.0,29585,29585,100.0,0,0.0,0,0.0
05,,,,,,,,,,,...,0.49,0,0.0,40790,40748,99.9,42,0.1,0,0.0
06,,,,,,,,,,,...,0.58,0,0.0,36778,36695,99.77,83,0.23,0,0.0
07,,,,,,,,,,,...,2.2,0,0.0,34843,34843,100.0,0,0.0,0,0.0
08,,,,,,,,,,,...,6.83,0,0.0,31248,31242,99.98,6,0.02,0,0.0
all,,,,,,,,,,,...,1.52,0,0.0,273746,273363,99.86,383,0.14,0,0.0


In [8]:
# Save data frames as CSVs
folder_path = os.path.join(os.path.abspath('../data/processed'), method, stream, refdes)
os.makedirs(folder_path, exist_ok=True)
# mismatch.to_csv(folder_path+f"/climatology-{refdes}-mismatched_flags.csv", na_rep='NaN', mode='a')
statistics.to_csv(folder_path+f"/climatology-{refdes}-flag_statistics.csv", na_rep='NaN', mode='a')