# 4.2: Sort missed data streams
Notebook 4.1 created text files containing the reference designator and data stream pairs that were not added to the missed test CSVs. These data streams either didn't have a matching test in the Gross Range and Climatology Test lookup tables, or the data for the first available deployment could not be loaded into the workspace. 
This notebook completes the following tasks:
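    <ul><li>Sort strings of refdes-datastream pairs into one of two lists based on the issue printed to the output: no dataset available, or no existing QC lookup table</li>
        <li>List the reference designators that were skipped</li>
        <li>Filter out data streams where an automated QC test is not expected, based on keywords in the stream name</li>
    </ul>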

In [1]:
# Import libraries
import re

In [2]:
def find_no_dataset(path_to_file):
    """Return refdes-stream matches from lines reporting a missing dataset.

    Lists all data streams where data from the first available
    deployment did not load in notebook 4.1.

    Parameters
    ----------
    path_to_file : str or path-like
        Text file holding the output captured from notebook 4.1.

    Returns
    -------
    list of list of str
        One entry per line that contains "No dataset available". Each
        entry is the list returned by ``findall`` for that line, so it
        is empty when the line holds no refdes-stream pattern.
    """
    # Refdes laid out as 8-5-2-9 characters, then the stream name.
    pattern = re.compile(r".{8}-.{5}-.{2}-.{9}-[a-z_0-9]+")
    all_missing = []
    # Context manager guarantees the handle is closed, even on error.
    with open(path_to_file, "r") as file:
        for text in file:
            if "No dataset available" in text:
                all_missing.append(pattern.findall(text))
    return all_missing

def find_no_qc_lookup(path_to_file):
    """Return refdes-stream matches from lines reporting no QC lookup table.

    Lists all data streams where no qcConfig objects remained after
    filtering in notebook 4.1.

    Parameters
    ----------
    path_to_file : str or path-like
        Text file holding the output captured from notebook 4.1.

    Returns
    -------
    list of list of str
        One entry per line that contains "No existing qc-lookup table".
        Each entry is the list returned by ``findall`` for that line,
        so it is empty when the line holds no refdes-stream pattern.
    """
    # Refdes laid out as 8-5-2-9 characters, then the stream name.
    pattern = re.compile(r".{8}-.{5}-.{2}-.{9}-[a-z_0-9]+")
    all_missing = []
    # Context manager guarantees the handle is closed, even on error.
    with open(path_to_file, "r") as file:
        for text in file:
            if "No existing qc-lookup table" in text:
                all_missing.append(pattern.findall(text))
    return all_missing

def find_skipped_refdes(path_to_file):
    """Return the reference designators found on lines marked as skipped.

    Lists all refdes that were skipped. Per the notebook 4.1 notes,
    these had "MOPAK" or "FDCHP" in the refdes string; this function
    itself only looks for the word "skipped" on each line.

    Parameters
    ----------
    path_to_file : str or path-like
        Text file holding the output captured from notebook 4.1.

    Returns
    -------
    list of list of str
        One entry per line that contains "skipped". Each entry is the
        list returned by ``findall`` for that line, so it is empty when
        the line holds no refdes pattern.
    """
    # Refdes laid out as 8-5-2-9 characters separated by hyphens.
    pattern = re.compile(r".{8}-.{5}-.{2}-.{9}")
    all_skipped = []
    # Context manager guarantees the handle is closed, even on error.
    with open(path_to_file, "r") as file:
        for text in file:
            if "skipped" in text:
                all_skipped.append(pattern.findall(text))
    return all_skipped

def filter_streams(missing_qcConfig):
    """Remove streams where an automated QC test is not expected.

    A stream is dropped when its name contains one of the keywords
    "metadata", "diagnostic" or "blank".

    Parameters
    ----------
    missing_qcConfig : iterable
        Stream entries to filter. Each entry may be a string or a
        sequence of strings (the per-line lists produced by the
        ``find_*`` helpers in this notebook).

    Returns
    -------
    list
        The entries of ``missing_qcConfig`` that contain none of the
        keywords, in their original order and original form.
    """
    drop = ("metadata", "diagnostic", "blank")

    def _is_dropped(entry):
        # Treat a bare string as a one-element entry so both input
        # shapes go through the same substring check.
        names = [entry] if isinstance(entry, str) else entry
        return any(kw in name for name in names for kw in drop)

    # The previous `x in drop` produced a single bool that was then used
    # as an index, so nothing was actually filtered.
    return [entry for entry in missing_qcConfig if not _is_dropped(entry)]

In [3]:
# Array and site whose notebook 4.1 output file is sorted below.
array, site = "GS", "GS01SUMO"
# Alternative input location, kept for reference:
# path = f"../data/processed/{array}_tests_completed1/{site}-typescript.txt"
path = f"../data/processed/{array}_tests_completed2/{site}-output.txt"

In [4]:
# Collect the reference designators on lines marked "skipped" in the
# output file at `path`.
skipped = find_skipped_refdes(path)
# Bare expression so the notebook displays the result.
skipped

[['GS01SUMO-SBD12-08-FDCHPA000'], ['GS01SUMO-SBD11-01-MOPAK0000']]

In [5]:
# Collect the refdes-stream pairs on lines reporting
# "No dataset available" in the output file at `path`.
no_deploy = find_no_dataset(path)
# Bare expression so the notebook displays the result.
no_deploy

[]

In [6]:
# Collect the refdes-stream pairs on lines reporting
# "No existing qc-lookup table" in the output file at `path`.
no_qcConfig = find_no_qc_lookup(path)
# Bare expression so the notebook displays the result.
no_qcConfig

[['GS01SUMO-SBD11-05-SPKIRB000-spkir_abj_dcl_instrument_recovered'],
 ['GS01SUMO-SBD11-05-SPKIRB000-spkir_abj_dcl_instrument'],
 ['GS01SUMO-RII11-02-ADCPSN010-adcps_jln_stc_instrument_recovered'],
 ['GS01SUMO-RII11-02-ADCPSN010-adcp_engineering'],
 ['GS01SUMO-RII11-02-ADCPSN010-adcp_velocity_earth'],
 ['GS01SUMO-RII11-02-ADCPSN010-adcps_jln_stc_instrument']]

In [7]:
# Filter for untested streams
from numpy import mean
def remove_streams(stream_list, skip_kw):
    """Return the stream names that contain none of the skip keywords.

    Parameters
    ----------
    stream_list : iterable of sequence of str
        Entries whose first element is the refdes-stream string, i.e.
        the per-line lists produced by the ``find_*`` helpers.
    skip_kw : iterable of str
        Substrings that mark a stream to be left out. An empty
        collection keeps every stream.

    Returns
    -------
    list of str
        The first element of every kept entry, in the original order.
    """
    filtered_list = []
    for stream in stream_list:
        # An entry is empty when the regex found nothing on that line;
        # there is no name to test, so skip it rather than raise.
        if not stream:
            continue
        name = stream[0]
        # Plain substring test replaces averaging str.find() results,
        # which turned into NaN (dropping everything) for empty skip_kw.
        if any(kw in name for kw in skip_kw):
            continue
        filtered_list.append(name)
    return filtered_list

# Substrings of stream names; any stream whose name contains one of
# these is left out when the list is passed to remove_streams.
skip_kw = [
    "power",
    "metadata",
    "blank",
    "diagnostic",
    "dcl_eng",
    "cpm_eng",
    "metbk_hourly",
    "hyd_o",
    "wavss_a_dcl_fourier",
    "wavss_a_dcl_motion",
    "wavss_a_dcl_non_dir",
    "mopak_o_dcl_rate",
    "wave_burst",
    "wfp_eng",
    "offset",
    "sio_eng",
    "glider_eng",
    "glider_gps",
    "adcp_config",
    "imodem_control",
]

# Keep only the streams from no_qcConfig whose names contain none of
# the keywords in skip_kw; the result is a flat list of strings.
filtered_noQC = remove_streams(no_qcConfig, skip_kw)
# Bare expression so the notebook displays the filtered list.
filtered_noQC

['GS01SUMO-SBD11-05-SPKIRB000-spkir_abj_dcl_instrument_recovered',
 'GS01SUMO-SBD11-05-SPKIRB000-spkir_abj_dcl_instrument',
 'GS01SUMO-RII11-02-ADCPSN010-adcps_jln_stc_instrument_recovered',
 'GS01SUMO-RII11-02-ADCPSN010-adcp_engineering',
 'GS01SUMO-RII11-02-ADCPSN010-adcp_velocity_earth',
 'GS01SUMO-RII11-02-ADCPSN010-adcps_jln_stc_instrument']