# 07 - Generate statistics CSVs
This separate notebook calculates all of the QARTOD flag statistics in one pass so that a representative set of CSVs can be completed for each instrument class.

### Statistics for QARTOD tests in production

In [1]:
# Import libraries available from main conda channels or conda-forge
import xarray as xr
import pandas as pd
import numpy as np
import glob
import os
import re
import warnings
warnings.filterwarnings("ignore")

# Import dask tools and ProgressBar
import dask
from dask.diagnostics import ProgressBar

# Import qartod_testing project functions
from qartod_testing.qc_flag_statistics import ooinet_gold_copy_request, get_test_parameters, parse_qartod_executed, qartod_summary_expanded

# Import OOI library functions
from ooi_data_explorations.common import merge_frames
from ooinet.M2M import get_deployments, get_annotations

In [2]:
# Identify the data set to request: reference designator, delivery method, and stream name
refdes, method, stream = (
    "CP03ISSM-MFD35-05-PCO2WB000",
    "recovered_inst",
    "pco2w_abc_instrument",
)

# If the site, node, and sensor are needed they can be split out of the reference designator:
# [site, node, sensor] = refdes.split('-', 2)

In [3]:
# Routine imported from the qartod_testing.qc_flag_statistics module of this project; downloads the gold copy THREDDS datasets
# NOTE(review): 'files' is expected to hold the list of catalog URLs for the downloaded datasets -- confirm against the function
files = ooinet_gold_copy_request(refdes, method, stream)

Downloading and Processing Data Files: 100%|██████████| 13/13 [00:17<00:00,  1.34s/it]


In [4]:
# Locate the netCDF files for this data set in the external data folder,
# listed in alphanumeric order
folder_path = os.path.join(os.path.abspath('../data/external'), method, stream, refdes)
expected_files = sorted(glob.glob(folder_path+'/*.nc'))

In [5]:
# Display the sorted list of local netCDF files that were found
expected_files
# get_deployments(refdes)

['/home/jovyan/code/qartod_testing/data/external/recovered_inst/pco2w_abc_instrument/CP03ISSM-MFD35-05-PCO2WB000/deployment0001_CP03ISSM-MFD35-05-PCO2WB000-recovered_inst-pco2w_abc_instrument_20141214T190509-20150508T230509.nc',
 '/home/jovyan/code/qartod_testing/data/external/recovered_inst/pco2w_abc_instrument/CP03ISSM-MFD35-05-PCO2WB000/deployment0002_CP03ISSM-MFD35-05-PCO2WB000-recovered_inst-pco2w_abc_instrument_20150509T010509-20151021T180509.nc',
 '/home/jovyan/code/qartod_testing/data/external/recovered_inst/pco2w_abc_instrument/CP03ISSM-MFD35-05-PCO2WB000/deployment0003_CP03ISSM-MFD35-05-PCO2WB000-recovered_inst-pco2w_abc_instrument_20151021T190009-20160514T130009.nc',
 '/home/jovyan/code/qartod_testing/data/external/recovered_inst/pco2w_abc_instrument/CP03ISSM-MFD35-05-PCO2WB000/deployment0005_CP03ISSM-MFD35-05-PCO2WB000-recovered_inst-pco2w_abc_instrument_20161011T150009-20170615T150009.nc',
 '/home/jovyan/code/qartod_testing/data/external/recovered_inst/pco2w_abc_instrument

In [6]:
# Display the annotations returned for this reference designator
get_annotations(refdes)

Unnamed: 0,@class,id,subsite,node,sensor,method,stream,beginDT,endDT,annotation,exclusionFlag,source,qcFlag,parameters
0,.AnnotationRecord,334,CP03ISSM,MFD35,,,,1513760404000,1522256160000,Deployment 7: Multi-Function Node failed.,False,leila@marine.rutgers.edu,9,[]
1,.AnnotationRecord,317,CP03ISSM,MFD35,05-PCO2WB000,,,1465113606000,1476533640000,No recovered instrument data available because...,False,leila@marine.rutgers.edu,9,[]
2,.AnnotationRecord,301,CP03ISSM,MFD35,05-PCO2WB000,recovered_host,pco2w_abc_dcl_instrument_recovered,1429545909000,1547078400000,"* UPDATED, 2019-01-25: After careful review, t...",False,friedrich.knuth@rutgers.edu,0,[931]
3,.AnnotationRecord,332,CP03ISSM,MFD35,,,,1486101900000,1490839209000,All seafloor Multi-Function Node (MFD35 and MF...,False,leila@marine.rutgers.edu,0,[]
4,.AnnotationRecord,2866,CP03ISSM,MFD35,05-PCO2WB000,,,1616583900000,1617217320000,Deploylment 12: The instrument stopped due to ...,False,swhite@whoi.edu,9,[]
5,.AnnotationRecord,1511,CP03ISSM,MFD35,,telemetered,,1528801530000,1540583520000,Data telemetry disabled due to chronic issues ...,False,leila@marine.rutgers.edu,0,[]
6,.AnnotationRecord,286,CP03ISSM,MFD35,05-PCO2WB000,telemetered,pco2w_abc_dcl_instrument,1418583120000,1547078400000,"* UPDATED, 2019-01-25: After careful review, t...",False,friedrich.knuth@rutgers.edu,0,[931]
7,.AnnotationRecord,1877,CP03ISSM,MFD35,,,,1577404800000,1604236560000,Deployment 11: *UPDATED 2020-11-30: The MFN po...,False,cdobson@whoi.edu,9,[]
8,.AnnotationRecord,1889,CP03ISSM,,,,,1578441600000,1604236560000,Deployment 11: UPDATED 2020-04-27: As of 2020-...,False,cdobson@whoi.edu,9,[]
9,.AnnotationRecord,315,CP03ISSM,MFD35,,,,1418583180000,1431129600000,Deployment 1: No recovered host data available...,False,cdobson@whoi.edu,9,[]


#### Gross range test statistics

In [7]:
def collect_statistics(file_paths, test_name):
    """
    Calls other functions to calculate statistics from a set of files and a name of a QARTOD test. The statistics are organized in a DataFrame.

    Files are grouped by the two-digit deployment number found in their path. Each group is
    opened (and merged when the deployment has more than one file), summarized, and closed
    again. Finally all files are merged to summarize the full data record.

    Parameters:
    -----------
        file_paths: non-empty list of paths to each file that will have statistics calculated. File names must include "deployment00##".
        test_name: string of QARTOD test name, i.e. "gross_range", "climatology".

    Returns:
    --------
        statistics: Pandas DataFrame, indexed by the 'deployment' entry of each summary, containing statistics on each parameter with a QARTOD test in order of deployment number as listed in file_paths, then statistics of the full record.

    Raises:
    -------
        ValueError: if file_paths is empty, or if a path does not contain a "deployment00##" tag.

    Version 23 Aug 2023, Kylene M Cooley
    """
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths is empty; at least one dataset file is required to calculate statistics.")

    # Group the files by deployment number before opening anything, so that a badly named
    # file is reported immediately instead of part way through the processing.
    # A dict keeps the deployments in the order they first appear in file_paths.
    files_by_deployment = {}
    for path in file_paths:
        match = re.search('deployment00[0-2][0-9]', path)
        if match is None:
            raise ValueError(f"Cannot find a 'deployment00##' tag in file path: {path}")
        deployment = match.group(0)[-2:]
        files_by_deployment.setdefault(deployment, []).append(path)

    # Summary statistics keyed by processing order ("0", "1", ...), then "all"
    statistics = {}
    parameters = []

    for m, (deployment, deployment_files) in enumerate(files_by_deployment.items()):
        opened = [xr.open_dataset(single_file) for single_file in deployment_files]
        try:
            # Merge only when a deployment is split across several files
            file_ds = merge_frames(opened) if len(opened) > 1 else opened[0]

            # Get parameters that have QARTOD executed from the dataset
            test_parameters = get_test_parameters(file_ds)
            parameters = list(test_parameters.keys())

            # Separate QARTOD test flags in the dataset by QARTOD test name
            file_ds = parse_qartod_executed(file_ds, parameters)

            print(f"Evaluating statistics on QARTOD flags for deployment {deployment}")
            statistics[f"{m}"] = qartod_summary_expanded(file_ds, parameters, deployment, test_name)
        finally:
            # Release the file handles once the summary for this deployment is computed
            for ds in opened:
                ds.close()

    # Add entry to summary statistics for the full data record after the last deployment
    opened = [xr.open_dataset(single_file) for single_file in file_paths]
    try:
        merged_ds = merge_frames(opened)

        # NOTE(review): as before, the merged record is summarized with the parameter list
        # of the last deployment processed -- confirm that this covers every deployment.
        print("Evaluating statistics on QARTOD flags for all deployments")
        merged_ds = parse_qartod_executed(merged_ds, parameters)
        statistics["all"] = qartod_summary_expanded(merged_ds, parameters, "all", test_name)
    finally:
        for ds in opened:
            ds.close()

    # Create data frame from dictionary, one row per summary, indexed by deployment
    statistics = pd.DataFrame.from_dict(statistics, orient='index')
    statistics = statistics.set_index('deployment')
    return statistics

In [8]:
# Flag statistics for the gross range test, per deployment and for the full record
gross_range_stats = collect_statistics(file_paths=expected_files, test_name="gross_range")
gross_range_stats

Evaluating statistics on QARTOD flags for deployment 01
Evaluating statistics on QARTOD flags for deployment 02
Evaluating statistics on QARTOD flags for deployment 03
Evaluating statistics on QARTOD flags for deployment 05
Evaluating statistics on QARTOD flags for deployment 06
Evaluating statistics on QARTOD flags for deployment 07
Evaluating statistics on QARTOD flags for deployment 09
Evaluating statistics on QARTOD flags for deployment 10
Evaluating statistics on QARTOD flags for deployment 11
Evaluating statistics on QARTOD flags for deployment 12
Evaluating statistics on QARTOD flags for deployment 13
Evaluating statistics on QARTOD flags for deployment 14
Evaluating statistics on QARTOD flags for deployment 15
Evaluating statistics on QARTOD flags for all deployments


Unnamed: 0_level_0,pco2_seawater total,seawater good,seawater good %,seawater suspect,seawater suspect %,seawater fail,seawater fail %
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01,1703,1703,100.0,0,0.0,0,0.0
02,3932,3932,100.0,0,0.0,0,0.0
03,4391,281,6.4,868,19.77,0,0.0
05,5271,513,9.73,32,0.61,0,0.0
06,2961,79,2.67,41,1.38,0,0.0
07,3134,167,5.33,156,4.98,0,0.0
09,3770,3770,100.0,0,0.0,0,0.0
10,4226,2812,66.54,378,8.94,0,0.0
11,9533,148,1.55,2932,30.76,0,0.0
12,3406,3406,100.0,0,0.0,0,0.0


In [9]:
# Write the gross range flag statistics to a CSV in the processed data folder,
# creating the folder first if it does not exist yet
folder_path = os.path.join(
    os.path.abspath('../data/processed'),
    method,
    stream,
    refdes,
)
os.makedirs(folder_path, exist_ok=True)
gross_range_stats.to_csv(
    folder_path+f"/gross_range-{refdes}-flag_statistics.csv",
    mode='w',
    na_rep='NaN',
)

#### Climatology test statistics

In [9]:
# Flag statistics for the climatology test, per deployment and for the full record
climatology_stats = collect_statistics(file_paths=expected_files, test_name="climatology")
climatology_stats

Evaluating statistics on QARTOD flags for deployment 01
Evaluating statistics on QARTOD flags for deployment 02
Evaluating statistics on QARTOD flags for deployment 03
Evaluating statistics on QARTOD flags for deployment 05
Evaluating statistics on QARTOD flags for deployment 06
Evaluating statistics on QARTOD flags for deployment 07
Evaluating statistics on QARTOD flags for deployment 09
Evaluating statistics on QARTOD flags for deployment 10
Evaluating statistics on QARTOD flags for deployment 11
Evaluating statistics on QARTOD flags for deployment 12
Evaluating statistics on QARTOD flags for deployment 13
Evaluating statistics on QARTOD flags for deployment 14
Evaluating statistics on QARTOD flags for deployment 15
Evaluating statistics on QARTOD flags for all deployments


Unnamed: 0_level_0,pco2_seawater total,seawater good,seawater good %,seawater suspect,seawater suspect %,seawater fail,seawater fail %
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01,1703,1685,98.94,18,1.06,0,0.0
02,3932,3928,99.9,4,0.1,0,0.0
03,4391,175,3.99,1832,41.72,0,0.0
05,5271,424,8.04,267,5.07,0,0.0
06,2961,31,1.05,294,9.93,0,0.0
07,3134,110,3.51,870,27.76,0,0.0
09,3770,3756,99.63,14,0.37,0,0.0
10,4226,2475,58.57,1084,25.65,0,0.0
11,9533,100,1.05,8789,92.2,0,0.0
12,3406,3406,100.0,0,0.0,0,0.0


In [11]:
# Write the climatology flag statistics to a CSV in the processed data folder,
# creating the folder first if it does not exist yet
folder_path = os.path.join(
    os.path.abspath('../data/processed'),
    method,
    stream,
    refdes,
)
os.makedirs(folder_path, exist_ok=True)
climatology_stats.to_csv(
    folder_path+f"/climatology-{refdes}-flag_statistics.csv",
    mode='w',
    na_rep='NaN',
)