In [None]:
# File to write raw/corrected fDOM and turbidity, plus stage, out as Trainset-format csv chunks; stage is already corrected

# Libraries
import scipy.io as sio
import numpy as np
from scipy.signal import find_peaks
from os.path import dirname, join as pjoin
import sys
import datetime
import csv

# Tools I've developed along the way to aid the project
import sys
sys.path.insert(1,'../')
import Tools.data_movement as dm 
import Tools.data_processing as dp
from auxiliary_functions import extract_runoff

# Read each preprocessed (julian-time) timeseries from disk
fDOM_raw_data = dm.read_in_preprocessed_timeseries(
    '/Users/zachfogg/Desktop/DB-SRRW/Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv'
)
fDOM_cor_data = dm.read_in_preprocessed_timeseries(
    '/Users/zachfogg/Desktop/DB-SRRW/Data/converted_data/julian_format/fDOM_corrected_10.1.2011-9.4.2020.csv'
)
turb_raw_data = dm.read_in_preprocessed_timeseries(
    '/Users/zachfogg/Desktop/DB-SRRW/Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv'
)
stage_data = dm.read_in_preprocessed_timeseries(
    '/Users/zachfogg/Desktop/DB-SRRW/Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv'
)
turb_cor_data = dm.read_in_preprocessed_timeseries(
    '/Users/zachfogg/Desktop/DB-SRRW/Data/converted_data/julian_format/turbidity_corrected_10.1.2011-9.4.2020.csv'
)

# Export the first 300,000 samples as six csv files of 50,000 samples each.
# NOTE: write_data_to_trainset is defined in a later cell, so that cell has
# to be executed before this loop can run.
for i in range(0,300000,50000):
    write_data_to_trainset(
        fDOM_raw_data=fDOM_raw_data,
        fDOM_cor_data=fDOM_cor_data,
        stage_data=stage_data,
        turb_raw_data=turb_raw_data,
        turb_cor_data=turb_cor_data,
        out_file=f"/Users/zachfogg/Desktop/DB-SRRW/Data/manual_annotating_data/non_annotated_csv/5_timeseries_{i/1000}k-{(i+50000)/1000}k.csv",
        has_flags=False,
        is_julian_time=True,
        data_start=i,
        data_end=i+50000,
    )

In [4]:
# Spacing expected between two consecutive corrected-fDOM samples
time_delta = datetime.timedelta(minutes=15)

# Every element is a [time_before_gap, time_after_gap] pair of datetimes
missing_fDOM_cor = []

# Start one interval before the first sample so the first sample itself
# is never reported as the end of a gap
prev_time = dp.julian_to_datetime(fDOM_cor_data[0,0]) - time_delta
for entry in fDOM_cor_data:
    time = dp.julian_to_datetime(entry[0])
    # Any spacing other than exactly one interval is recorded as a gap
    if time - prev_time != time_delta:
        missing_fDOM_cor.append([prev_time, time])
    prev_time = time
    
def missing_in_cor(time):
    """
    Report whether `time` lies strictly inside any recorded gap.

    Gaps are read from the module-level list `missing_fDOM_cor`, whose
    elements are [gap_start, gap_end] pairs. A time equal to either
    endpoint is not considered missing. Each matching gap prints a
    "flagged" line for the given time.

    time:    value to test against the recorded gaps
    returns: True if at least one gap contains `time`, else False
    """
    found = False
    for gap_start, gap_end in missing_fDOM_cor:
        if gap_start < time < gap_end:
            print(time, "flagged")
            found = True
    return found



def _first_match_from(series, start, timestamp, series_name):
    """
    Return the first index >= start whose entry timestamp equals `timestamp`.

    series:      sequence of [timestamp, value, flag] entries
    start:       index the forward scan begins at
    timestamp:   timestamp to search for
    series_name: label used in the error message

    Raises ValueError if no entry from `start` onward matches.
    """
    for idx in range(start, len(series)):
        if series[idx][0] == timestamp:
            return idx
    raise ValueError(
        "Data align failed at idx: {}, no {} entry matches fDOM time: {}".format(
            start, series_name, timestamp))


def write_data_to_trainset(fDOM_raw_data : np.ndarray, 
                           fDOM_cor_data : np.ndarray,
                           stage_data : np.ndarray,
                           turb_raw_data : np.ndarray, 
                           turb_cor_data : np.ndarray,
                           out_file : str, 
                           has_flags : bool = False, 
                           is_julian_time : bool = True, 
                           data_start : int = 0, 
                           data_end : int =  sys.maxsize) -> None:
    """ 
    Function takes in 5 timeseries, adjusts and aligns timeseries,
    combines data, formats to Trainset specification, and writes 
    out combined dataset to given out file location in csv format
    
    Every series is aligned to the raw fDOM timestamps. For each raw fDOM
    index i, the other series are scanned forward from index i for an entry
    with the same timestamp. When missing_in_cor() reports the timestamp as
    missing from the corrected fDOM data, the raw fDOM entry is used in its
    place. Alignment is performed over the whole input, not only over the
    range that is written.
    
    Relies on the module-level function missing_in_cor being defined.
    
    *_data:         timeseries
    out_file:       file path name to write data out to
    has_flags:      does the data have flags or not?
    is_julian_time: is data in julian time ?, else datetime
    data_start:     first index in data to start writing from
    data_end:       index to write up to (exclusive)
    
    Raises ValueError if a series has no entry matching a raw fDOM timestamp.
    """
    series = [fDOM_raw_data, fDOM_cor_data, stage_data, turb_raw_data, turb_cor_data]
    
    # Add empty flags if applicable 
    if not has_flags:
        series = [dp.add_flags(data) for data in series]
    
    # Convert from julian time to datetime objects
    if is_julian_time:
        series = [[[dp.julian_to_datetime(entry[0]), entry[1], entry[2]] for entry in data]
                  for data in series]
    
    # Adjust data that is off by one second
    series = [dp.correct_one_second_error(data) for data in series]
    fDOM_raw_data, fDOM_cor_data, stage_data, turb_raw_data, turb_cor_data = series
    
    # Align data to fDOM
    fDOM_cor_filtered = []
    turb_raw_filtered = []
    turb_cor_filtered = []
    stage_filtered = []
    
    for i in range(len(fDOM_raw_data)):
        fDOM_time = fDOM_raw_data[i][0]
        
        if missing_in_cor(fDOM_time):
            # No corrected sample exists for this time: fall back to the raw entry
            fDOM_cor_filtered.append(fDOM_raw_data[i])
        else:
            j = _first_match_from(fDOM_cor_data, i, fDOM_time, 'fDOM_cor')
            fDOM_cor_filtered.append(fDOM_cor_data[j])
        
        j = _first_match_from(turb_raw_data, i, fDOM_time, 'turb_raw')
        turb_raw_filtered.append(turb_raw_data[j])
        
        j = _first_match_from(turb_cor_data, i, fDOM_time, 'turb_cor')
        turb_cor_filtered.append(turb_cor_data[j])
        
        j = _first_match_from(stage_data, i, fDOM_time, 'stage')
        stage_filtered.append(stage_data[j])
    
    # Series in the order their rows are written for each timestamp
    aligned = [('fDOM_raw', fDOM_raw_data),
               ('fDOM_cor', fDOM_cor_filtered),
               ('stage', stage_filtered),
               ('turb_raw', turb_raw_filtered),
               ('turb_cor', turb_cor_filtered)]
    
    # Clamp the requested range to the data that is actually available
    first = max(data_start, 0)
    last = min(data_end, len(fDOM_raw_data))
    
    # Write out data to csv; the with-block closes the file on exit
    with open(out_file, 'w',newline='') as outfile:
        writer = csv.writer(outfile,delimiter=',')
        writer.writerow(['series', 'timestamp', 'value', 'label'])
        
        for i in range(first, last):
            for name, data in aligned:
                entry = data[i]
                # Convert to ISO 8601 format
                timestamp = entry[0].isoformat() + '.000Z'
                writer.writerow([name, timestamp, f'{entry[1]:.5f}', entry[2]])


