# 07 - Generate statistics CSVs
This notebook calculates the QARTOD flag statistics for a dataset in a single pass, so that a representative set of statistics CSVs can be completed for each instrument class.

### Statistics for QARTOD tests in production

In [1]:
# Import libraries available from main conda channels or conda-forge
import xarray as xr
import pandas as pd
import numpy as np
import glob
import os
import re
import warnings
warnings.filterwarnings("ignore")

# Import dask tools and ProgressBar
import dask
from dask.diagnostics import ProgressBar

# Import qartod_testing project functions
from qartod_testing.data_processing import ooinet_gold_copy_request, get_test_parameters, parse_qartod_executed, qartod_summary_expanded

# Import OOI library functions
from ooi_data_explorations.common import merge_frames
from ooinet.M2M import get_deployments, get_annotations

In [2]:
# Request parameters: reference designator, data delivery method, and stream name
refdes, method, stream = (
    "GA01SUMO-RID16-03-CTDBPF000",
    "recovered_host",
    "ctdbp_cdef_dcl_instrument_recovered",
)

# The reference designator can be deconstructed into site, node, and sensor if needed:
# [site, node, sensor] = refdes.split('-', 2)

In [3]:
# Download the gold copy THREDDS datasets with the routine from this project's
# data_processing module. `files` holds the list of catalog URLs for the
# downloaded datasets.
files = ooinet_gold_copy_request(
    refdes,
    method,
    stream,
)

Downloading and Processing Data Files: 100%|██████████| 4/4 [00:35<00:00,  8.96s/it]


In [3]:
# Locate the expected-results netCDF files in the external data folder
external_root = os.path.abspath('../data/external')
folder_path = os.path.join(external_root, method, stream, refdes)

# sorted() puts the local test files in alphanumeric order
expected_files = sorted(glob.glob(folder_path+'/*.nc'))

In [5]:
# Display the sorted list of local expected-results files as a sanity check
expected_files
# Uncomment to call get_deployments for this reference designator instead:
# get_deployments(refdes)

['/home/jovyan/code/qartod_testing/data/external/recovered_host/ctdbp_cdef_dcl_instrument_recovered/GA01SUMO-RID16-03-CTDBPF000/deployment0001_GA01SUMO-RID16-03-CTDBPF000-recovered_host-ctdbp_cdef_dcl_instrument_recovered_20150315T213005.410000-20151126T093009.589000.nc',
 '/home/jovyan/code/qartod_testing/data/external/recovered_host/ctdbp_cdef_dcl_instrument_recovered/GA01SUMO-RID16-03-CTDBPF000/deployment0002_GA01SUMO-RID16-03-CTDBPF000-recovered_host-ctdbp_cdef_dcl_instrument_recovered_20151114T210608.612000-20160817T235952.761000.nc',
 '/home/jovyan/code/qartod_testing/data/external/recovered_host/ctdbp_cdef_dcl_instrument_recovered/GA01SUMO-RID16-03-CTDBPF000/deployment0002_GA01SUMO-RID16-03-CTDBPF000-recovered_host-ctdbp_cdef_dcl_instrument_recovered_20160818T000004.760000-20161108T092111.791000.nc',
 '/home/jovyan/code/qartod_testing/data/external/recovered_host/ctdbp_cdef_dcl_instrument_recovered/GA01SUMO-RID16-03-CTDBPF000/deployment0003_GA01SUMO-RID16-03-CTDBPF000-recovered_

In [6]:
# Display the annotation records returned for this reference designator
get_annotations(refdes)

Unnamed: 0,@class,id,subsite,node,sensor,method,stream,beginDT,endDT,annotation,exclusionFlag,source,qcFlag,parameters
0,.AnnotationRecord,653,GA01SUMO,,,,,1426454700000,1448532480000,Deployment 1: Buoy log files showed that seawa...,False,lgarzio@marine.rutgers.edu,0,[]
1,.AnnotationRecord,681,GA01SUMO,RID16,03-CTDBPF000,,,1477532820000,1483561800000,Deployment 3: Only partial recovered_instrumen...,False,lgarzio@marine.rutgers.edu,0,[]


#### Gross range test statistics

In [7]:
def collect_statistics(file_paths, test_name):
    """
    Calculate QARTOD flag statistics for each deployment and for the full record.

    Files are grouped by the deployment number found in their path. Each group is
    opened (and merged when a deployment is split across several files), the QARTOD
    flags are separated by test, and a summary is calculated for `test_name`. A final
    summary is then calculated from all files merged together. The summaries are
    organized in a DataFrame.

    Parameters:
    -----------
        file_paths: list of paths to each file that will have statistics calculated.
            File names must include "deployment00##"; the pattern used here only
            matches deployment numbers 00 through 29.
        test_name: string of QARTOD test name, i.e. "gross_range", "climatology".

    Returns:
    --------
        statistics: Pandas DataFrame containing statistics on each parameter with a
            QARTOD test, one row per deployment in order of first appearance in
            `file_paths`, followed by one row for the full record ("all").

    Raises:
    -------
        ValueError: if `file_paths` is empty, or if a path does not contain a
            recognizable deployment number.

    Version 23 Aug 2023, Kylene M Cooley
    """
    file_paths = list(file_paths)
    if not file_paths:
        # Without this guard the full-record step below would run with no
        # parameters defined and fail with an unrelated error.
        raise ValueError("file_paths is empty: no data files to calculate statistics from")

    # Group the files by deployment number up front, before any file is opened,
    # so that a badly named file is reported immediately. Dictionaries keep
    # insertion order, so deployments are processed in order of first appearance.
    files_by_deployment = {}
    for path in file_paths:
        tags = re.findall('deployment00[0-2][0-9]', path)
        if not tags:
            raise ValueError(f"Cannot determine deployment number from file path: {path}")
        files_by_deployment.setdefault(tags[0][-2:], []).append(path)

    # Summary results keyed by row label ("0", "1", ..., then "all")
    summaries = {}
    parameters = None

    for row, (deployment, deployment_files) in enumerate(files_by_deployment.items()):
        opened = []
        try:
            for path in deployment_files:
                opened.append(xr.open_dataset(path))

            # A deployment split across several files is merged into one dataset
            file_ds = merge_frames(opened) if len(opened) > 1 else opened[0]

            # Get parameters that have QARTOD executed from the dataset
            test_parameters = get_test_parameters(file_ds)
            parameters = list(test_parameters.keys())

            # Separate QARTOD test flags in the dataset by QARTOD test name
            file_ds = parse_qartod_executed(file_ds, parameters)

            print(f"Evaluating statistics on QARTOD flags for deployment {deployment}")
            summaries[f"{row}"] = qartod_summary_expanded(file_ds, parameters, deployment, test_name)
        finally:
            # Release the file handles once the summary has been calculated
            for dataset in opened:
                dataset.close()

    # Add entry for the full data record after the last deployment. The files are
    # re-opened rather than reused so the merged dataset starts from unmodified data.
    # NOTE(review): the parameter list from the last deployment processed is reused
    # here, as in the original implementation - confirm this is intended.
    opened = []
    try:
        for path in file_paths:
            opened.append(xr.open_dataset(path))
        merged_ds = merge_frames(opened)

        print("Evaluating statistics on QARTOD flags for all deployments")
        merged_ds = parse_qartod_executed(merged_ds, parameters)
        summaries["all"] = qartod_summary_expanded(merged_ds, parameters, "all", test_name)
    finally:
        for dataset in opened:
            dataset.close()

    # Create data frame from dictionary, indexed by deployment label
    statistics = pd.DataFrame.from_dict(summaries, orient='index')
    statistics = statistics.set_index('deployment')
    return statistics

In [8]:
# Calculate gross range flag statistics for each deployment and the full record,
# then display the resulting table
gross_range_stats = collect_statistics(expected_files, "gross_range")
gross_range_stats

Evaluating statistics on QARTOD flags for deployment 01
Evaluating statistics on QARTOD flags for deployment 02
Evaluating statistics on QARTOD flags for deployment 03
Evaluating statistics on QARTOD flags for all deployments


Unnamed: 0_level_0,sea_water_electrical_conductivity total,conductivity good,conductivity good %,conductivity suspect,conductivity suspect %,conductivity fail,conductivity fail %,sea_water_temperature total,temperature good,temperature good %,...,salinity suspect %,salinity fail,salinity fail %,sea_water_pressure total,pressure good,pressure good %,pressure suspect,pressure suspect %,pressure fail,pressure fail %
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01,24529,24529,100.0,0,0.0,0,0.0,24529,24529,100.0,...,0.0,0,0.0,24529,24013,97.9,516,2.1,0,0.0
02,2588451,2588362,100.0,89,0.0,0,0.0,2588451,2588451,100.0,...,0.38,0,0.0,2588451,2565415,99.11,23036,0.89,0,0.0
03,6714,6714,100.0,0,0.0,0,0.0,6714,6714,100.0,...,0.0,0,0.0,6714,6592,98.18,122,1.82,0,0.0
all,2619694,2619605,100.0,89,0.0,0,0.0,2619694,2619694,100.0,...,0.37,0,0.0,2619694,2596020,99.1,23674,0.9,0,0.0


In [9]:
# Write the gross range flag statistics to a CSV in the processed data folder,
# creating the folder first if it does not exist yet
processed_root = os.path.abspath('../data/processed')
folder_path = os.path.join(processed_root, method, stream, refdes)
os.makedirs(folder_path, exist_ok=True)
gross_range_stats.to_csv(
    folder_path+f"/gross_range-{refdes}-flag_statistics.csv",
    na_rep='NaN',
    mode='w',
)

#### Climatology test statistics

In [10]:
# Calculate climatology flag statistics for each deployment and the full record,
# then display the resulting table
climatology_stats = collect_statistics(expected_files, "climatology")
climatology_stats

Evaluating statistics on QARTOD flags for deployment 01
Evaluating statistics on QARTOD flags for deployment 02
Evaluating statistics on QARTOD flags for deployment 03
Evaluating statistics on QARTOD flags for all deployments


Unnamed: 0_level_0,sea_water_electrical_conductivity total,sea_water_temperature total,temperature good,temperature good %,temperature suspect,temperature suspect %,temperature fail,temperature fail %,sea_water_practical_salinity total,salinity good,salinity good %,salinity suspect,salinity suspect %,salinity fail,salinity fail %,sea_water_pressure total
deployment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
01,,24529,24529,100.0,0,0.0,0,0.0,24529,24529,100.0,0,0.0,0,0.0,
02,,2588451,2573544,99.42,14907,0.58,0,0.0,2588451,2482734,95.92,105717,4.08,0,0.0,
03,,6714,6714,100.0,0,0.0,0,0.0,6714,6266,93.33,448,6.67,0,0.0,
all,,2619694,2604787,99.43,14907,0.57,0,0.0,2619694,2513529,95.95,106165,4.05,0,0.0,


In [11]:
# Write the climatology flag statistics to a CSV in the processed data folder,
# creating the folder first if it does not exist yet
processed_root = os.path.abspath('../data/processed')
folder_path = os.path.join(processed_root, method, stream, refdes)
os.makedirs(folder_path, exist_ok=True)
climatology_stats.to_csv(
    folder_path+f"/climatology-{refdes}-flag_statistics.csv",
    na_rep='NaN',
    mode='w',
)