In [1]:
# Import libraries and data 
import scipy.io as sio
import copy
import numpy as np
from scipy.signal import find_peaks
from os.path import dirname, join as pjoin
import datetime
import csv
import math
import sys
sys.path.insert(1,'../')
import Tools.data_processing as dp
import Tools.data_movement as dm 
from auxiliary_functions import extract_runoff, get_stage_events, detect_edges

# Load the raw fDOM, stage and turbidity series from the preprocessed (julian_format) CSV files.
# NOTE(review): later cells call helpers (get_candidates, detect_stage_rises, detect_flat_plat, ...)
# that are not imported above; they are defined in the cell marked `In [2]` at the end of this
# file, so that cell has to be executed before the cells that follow this one.
fDOM_raw_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv')
stage_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv')
turb_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv')
# Replace stage_data with the version aligned to the fDOM series (alignment logic lives in
# Tools.data_processing; presumably it matches timestamps — verify against that module).
stage_data = dp.align_stage_to_fDOM(fDOM_raw_data, stage_data)

In [3]:
# For every sample, record how far away the nearest stage rise is in each direction:
#   s_indexed[:, 0] -> samples elapsed since the most recent rise at or before the sample
#   s_indexed[:, 1] -> samples remaining until the next rise at or after the sample
# A value of -1 means there is no rise on that side of the sample.
s_indices = detect_stage_rises(stage_data[:,1])


def _samples_since_rise(rise_flags):
    """Walk rise_flags in order and return, per position, the number of steps
    since the last flag equal to 1 (0 on a flagged position, -1 before the first one)."""
    distances = []
    steps = -1
    for flagged in rise_flags:
        if flagged == 1:
            steps = 0
        elif steps != -1:
            steps += 1
        distances.append(steps)
    return distances


s_indexed = np.zeros((s_indices.shape[0], 2))
# Forward walk gives the distance to the previous rise ...
s_indexed[:, 0] = _samples_since_rise(s_indices)
# ... and the same walk over the reversed flags gives the distance to the next rise.
s_indexed[:, 1] = _samples_since_rise(s_indices[::-1])[::-1]

# Get turb and fDOM peaks
# Peak-detection hyperparameters; the keys are the ones read by get_candidates.
fDOM_cand_params = {'prom' : [8,None],
                    'width': [5, None],
                    'wlen' : 300,
                    'dist' : 20,
                    'rel_h': .6}

turb_cand_params = {'prom' : [6,None],
                    'width': [None, None],
                    'wlen' : 200,
                    'dist' : 1,
                    'rel_h': .6}

fDOM_peaks, fDOM_props = get_candidates(fDOM_raw_data, fDOM_cand_params)
turb_peaks, turb_props = get_candidates(turb_data, turb_cand_params)
# Keep independent copies of the unfiltered turbidity candidates; a later cell reads them.
turb_peaks_org = copy.deepcopy(turb_peaks)
turb_props_org = copy.deepcopy(turb_props)

# Remove peaks that occur during a flat plateau (window = 100 samples, threshold = 40)
turb_flat_plat = detect_flat_plat(turb_data, 100, 40)
# Sample indices flagged as plateau; kept as a plain list because return_turb_cand reads it.
turb_flat_plat_indxs = np.flatnonzero(turb_flat_plat == 1).tolist()

# Positions (within turb_peaks) of the peaks that are NOT on a plateau. Looking the peak up
# in the plateau mask is O(1) per peak, instead of scanning the index list for every peak.
take_indices = np.flatnonzero(turb_flat_plat[turb_peaks] != 1)

turb_peaks = np.take(turb_peaks, take_indices)
for key in turb_props:
    turb_props[key] = np.take(turb_props[key], take_indices)

# Convert peaks and props to useable structure and assign values from s_indexed
# Each entry = [index_of_peak, left_ips, right_ips, X, Y, flag]
#   left_ips / right_ips are rounded outwards (floor / ceil) to whole sample indices,
#   X / Y are the s_indexed distances to the previous / next stage rise, flag starts as None.
fDOM_cand = [[peak, math.floor(fDOM_props['left_ips'][i]), math.ceil(fDOM_props['right_ips'][i]),s_indexed[peak,0], s_indexed[peak,1] ,None] for i,peak in enumerate(fDOM_peaks)]
org_turb_cand = [[peak, math.floor(turb_props['left_ips'][i]), math.ceil(turb_props['right_ips'][i]),s_indexed[peak,0], s_indexed[peak,1] ,None] for i,peak in enumerate(turb_peaks)]

def return_turb_cand():
    """
    Detect turbidity candidate peaks on the current contents of the notebook-level
    `turb_data` and return them in the candidate-list structure.

    Reads the notebook globals turb_data, turb_cand_params, turb_flat_plat_indxs and
    s_indexed. Peaks whose index appears in turb_flat_plat_indxs are discarded.

    return : list of [index_of_peak, left_ips, right_ips, X, Y, flag] entries, where
             left_ips/right_ips are floored/ceiled to whole indices, X and Y come from
             s_indexed[peak, 0] and s_indexed[peak, 1], and flag is None
    """
    turb_peaks, turb_props = get_candidates(turb_data, turb_cand_params)

    # Build the lookup once per call: membership in a set is O(1), whereas testing each peak
    # against the raw index list rescans that list for every peak.
    flat_positions = set(turb_flat_plat_indxs)
    take_indices = np.array(
        [i for i, peak in enumerate(turb_peaks) if peak not in flat_positions],
        dtype=np.intp,  # integer dtype keeps np.take valid even when nothing survives
    )

    turb_peaks = np.take(turb_peaks, take_indices)
    for key in turb_props:
        turb_props[key] = np.take(turb_props[key], take_indices)

    turb_cand = [[peak, math.floor(turb_props['left_ips'][i]), math.ceil(turb_props['right_ips'][i]),s_indexed[peak,0], s_indexed[peak,1] ,None] for i,peak in enumerate(turb_peaks)]

    return turb_cand

In [20]:
# Reload the raw turbidity series from disk (same CSV as the initial load), replacing turb_data.
# NOTE(review): the interpolation loop in the following cell overwrites turb_data[:, 1] in place,
# so this looks like a reset step to run before repeating that loop — confirm.
turb_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv')


In [4]:
# Maximum allowed distance (in samples) between a peak and a stage rise:
# x applies to the rise behind the peak (candidate[3]), y to the rise ahead of it (candidate[4]).
x = 10
y = 3

flag = False
count = 0
while not flag:
    count += 1
    # Re-detect candidates on turb_data as it currently stands (earlier passes modify it)
    turb_cand = return_turb_cand()

    # 'P' = a stage rise lies within range on either side, 'F' = no stage rise in range
    for peak in turb_cand:
        rise_behind = peak[3] != -1 and peak[3] <= x
        rise_ahead = peak[4] != -1 and peak[4] <= y
        peak[5] = 'P' if (rise_behind or rise_ahead) else 'F'

    # The 'F' candidates are the ones counted and interpolated away as PP
    turb_PP_peaks = [peak for peak in turb_cand if peak[5] == 'F']
    PP_count = len(turb_PP_peaks)

    # This is the last pass once nothing is left to remove, or after 20 passes
    if PP_count == 0 or count == 20:
        flag = True
    else:
        print('Num PP: ', PP_count)

    # Replace the samples strictly between left_ips and right_ips with a straight line
    # joining the values at those two boundaries
    for peak in turb_PP_peaks:
        left_ips, right_ips = peak[1], peak[2]
        left_ips_val = turb_data[left_ips,1]
        right_ips_val = turb_data[right_ips,1]
        span = right_ips - left_ips

        increment = (right_ips_val - left_ips_val) / (span)

        for j, i in enumerate(range(left_ips+1, right_ips), start=1):
            turb_data[i,1] = left_ips_val + (increment * j)
        
# Plot data



Num PP:  489
Num PP:  103
Num PP:  42
Num PP:  14
Num PP:  4
Num PP:  1


In [7]:
# Candidate-peak hyperparameters for the labeling export below.
# NOTE(review): fDOM_cand_params is reassigned here but is not read anywhere in this cell.
fDOM_cand_params = {'prom' : [1.5,None],
                    'width': [None, None],
                    'wlen' : 200,
                    'dist' : 1,
                    'rel_h': .5}

# These are the params that were used to label turb cand 0-100k
turb_cand_params = {'prom' : [6,None],
                    'width': [None, None],
                    'wlen' : 200,
                    'dist' : 1,
                    'rel_h': .6}
turb_org_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv')
# Get candidate peaks of the freshly loaded (unmodified) turbidity series.
# NOTE: despite the fDOM_* names, these hold turbidity candidates detected with turb_cand_params.
fDOM_peaks, fDOM_props = get_candidates(turb_org_data, turb_cand_params)

# Remove peaks that occur during a flat plateau (window = 100 samples, threshold = 40),
# with the plateau mask computed on the current turb_data
turb_flat_plat = detect_flat_plat(turb_data, 100, 40)
# Sample indices flagged as plateau; kept as a plain list because return_turb_cand reads it.
turb_flat_plat_indxs = np.flatnonzero(turb_flat_plat == 1).tolist()

# Positions (within turb_peaks_org) of the peaks that are NOT on a plateau. Looking the peak up
# in the plateau mask is O(1) per peak, instead of scanning the index list for every peak.
take_indices = np.flatnonzero(turb_flat_plat[turb_peaks_org] != 1)

turb_peaks_org = np.take(turb_peaks_org, take_indices)
for key in turb_props_org:
    turb_props_org[key] = np.take(turb_props_org[key], take_indices)

# Turn every peak into a single-sample "event" (the row of the series at the peak index) and
# collect the timestamps (column 0) of its left/right boundaries. The boundaries come from the
# interpolated positions left_ips / right_ips, rounded outwards to whole sample indices.
# Boundary timestamps are de-duplicated and sorted ascending.

# Candidates detected on the unmodified turbidity series (stored under the fDOM_* names)
fDOM_events = [np.array(turb_org_data[peak]) for peak in fDOM_peaks]
fDOM_lb = sorted({turb_org_data[math.floor(fDOM_props['left_ips'][i]), 0]
                  for i, _ in enumerate(fDOM_peaks)})
fDOM_rb = sorted({turb_org_data[math.ceil(fDOM_props['right_ips'][i]), 0]
                  for i, _ in enumerate(fDOM_peaks)})

# Original turbidity candidates (after plateau filtering), read from the current turb_data
turb_events = [np.array(turb_data[peak]) for peak in turb_peaks_org]
turb_lb = sorted({turb_data[math.floor(turb_props_org['left_ips'][i]), 0]
                  for i, _ in enumerate(turb_peaks_org)})
turb_rb = sorted({turb_data[math.ceil(turb_props_org['right_ips'][i]), 0]
                  for i, _ in enumerate(turb_peaks_org)})

# Merge the single-sample peak events into their series, flagged 't_opp'.
# (dp.merge_data is a project helper; presumably it returns rows of [time, value, flag] with ''
# as the flag for unmatched samples — verify in Tools.data_processing.)
# Note that the "fDOM" slot is filled with the unmodified turbidity series.
fDOM_merged = dp.merge_data(turb_org_data, fDOM_events, 't_opp', '')
turb_merged = dp.merge_data(turb_data, turb_events, 't_opp', '')

# Overwrite the flag of every row whose timestamp is a left/right boundary of a candidate.
fDOM_merged = merge_additional_data(fDOM_merged, fDOM_lb, 'left_base')
fDOM_merged = merge_additional_data(fDOM_merged, fDOM_rb, 'right_base')

turb_merged = merge_additional_data(turb_merged, turb_lb, 'left_base')
turb_merged = merge_additional_data(turb_merged, turb_rb, 'right_base')


# Extract the stage samples marked as rising edge and merge them back flagged 'rise'.
stage_edge_data = stage_rises_to_data(s_indices, stage_data)
stage_data_merged = dp.merge_data(stage_data, stage_edge_data, 'rise','')

# Write the three flagged series to a single CSV.
# NOTE(review): the meaning of the two booleans and of 0 / 100000 is defined by
# dm.write_data_to_trainset; judging by the file name they likely select samples 0-100k — confirm.
dm.write_data_to_trainset(fDOM_merged,
                          stage_data_merged,
                          turb_merged,
                          '../Data/anomaly_data/temp_data/turb_corr_test_0k-100k.csv',
                          True,
                          True,
                          0,
                          100000)

In [2]:
def get_candidates(data: np.ndarray, params):
    """
    Find the candidate peaks of a timeseries with scipy.signal.find_peaks.

    These are the peaks that should be scanned for out of order peaks (oop). The
    hyperparameters are meant to keep skyrocketing peaks/local fluctuations out of the
    result: although they are oop, they are caught by their respective algorithms.

    data   : timeseries to scan for peaks; the values are read from column 1
    params : dict with the keys 'dist', 'prom', 'width', 'wlen' and 'rel_h', forwarded as
             find_peaks' distance, prominence, width, wlen and rel_height
    return : (peaks, props) exactly as returned by find_peaks - the peak indices and the
             properties of those peaks
    """
    # height/threshold are left unbounded; passing them still makes find_peaks report the
    # corresponding properties.
    search_options = {
        'height': (None, None),
        'threshold': (None, None),
        'distance': params['dist'],
        'prominence': params['prom'],
        'width': params['width'],
        'wlen': params['wlen'],
        'rel_height': params['rel_h'],
    }
    values = data[:, 1]
    return find_peaks(values, **search_options)

def event_from_props_ips(peaks : np.ndarray, props : dict, data : np.ndarray) -> list[np.ndarray]:
    """
    Build one multi-sample "event" per peak from the peak's left/right interpolated positions.

    peaks :  detected peak indices (only their count is used here)
    props :  detected peak properties; 'left_ips' and 'right_ips' are read
    data :   timeseries that the peaks are from
    return : list with one slice of data per peak, covering the rows from int(left_ips)
             through int(right_ips), both inclusive
    """
    def window_for(position):
        # int() truncates the fractional interpolated positions to whole row indices
        first_row = int(props['left_ips'][position])
        last_row = int(props['right_ips'][position])
        return data[first_row:last_row + 1, :]

    return [window_for(position) for position, _ in enumerate(peaks)]

def peak_of_event(event : np.ndarray) -> np.ndarray:
    """
    Find the peak sample of an event.

    event :  candidate event; each entry is a sample whose value sits at position 1
    return : the sample with the largest value; on ties the earliest such sample wins
    """
    highest = event[0]
    for sample in event:
        # strict comparison, so a later sample with an equal value does not replace the first
        if highest[1] < sample[1]:
            highest = sample
    return highest

def delete_missing_data_peaks(data, peaks, props, missing_file_path):
    """
    Delete peaks that occur during time periods designated as "missing data"

    data:              timeseries that peaks occurred in; timestamps are read from column 0
    peaks:             indices of peaks detected
    props:             properties associated with each peak (dict of per-peak arrays);
                       its values are replaced in place with the filtered arrays
    missing_file_path: path of a CSV file with one missing date range per row: start and end,
                       both formatted as '%Y-%m-%d %H:%M:%S'. Blank rows are ignored.
    return:            filtered peaks and props
    """
    time_format = '%Y-%m-%d %H:%M:%S'

    # Read every range and convert it with dp.datetime_to_julian so that it can be compared
    # against the timestamps in data. The file is only needed for this step.
    with open(missing_file_path, newline='') as file:
        missing_ranges = [
            (dp.datetime_to_julian(datetime.datetime.strptime(row[0], time_format)),
             dp.datetime_to_julian(datetime.datetime.strptime(row[1], time_format)))
            for row in csv.reader(file, delimiter=',')
            if row  # csv.reader yields [] for blank lines
        ]

    # Positions (within peaks) of the peaks whose timestamp lies outside every range;
    # both range bounds are inclusive.
    keep_indices = [
        position for position, idx in enumerate(peaks)
        if not any(start <= data[idx, 0] <= end for start, end in missing_ranges)
    ]
    # np.take needs integer indices; an explicit integer dtype also covers the empty case.
    keep_indices = np.asarray(keep_indices, dtype=np.intp)

    peaks = np.take(peaks, keep_indices, 0)

    # Remove properties for violating peaks
    for key in props:
        props[key] = np.take(props[key], keep_indices, 0)

    return peaks, props
    
def detect_stage_rises(data : np.ndarray):
    """
    Detect rising edges in stage, based on manually selected hyper parameters
    (window sizes 6 and 3, minimum rise .02, values compared after rounding to 2 decimals).

    data:   values (no timestamps) of each data point in the timeseries

    return: array, that is length of data, representing whether each data point is
            part of a rising edge or not: 1 = rising, 0 = not rising
    """
    n_points = len(data)
    decimals = 2
    min_rise = .02
    signals = np.zeros(n_points)

    def rises_over(first, last):
        # Rise between two samples, compared after rounding both values and their difference
        gain = round(data[last], decimals) - round(data[first], decimals)
        return round(gain, decimals) >= min_rise

    # Pass 1: mark every window whose last sample sits at least min_rise above its first one.
    # The 6-sample window catches larger, smoother edges, the 3-sample window sharper ones.
    # NOTE(review): the start positions stop one short of the last full window - confirm intended.
    for window_size in (6, 3):
        for start in range(n_points - window_size):
            if rises_over(start, start + window_size - 1):
                signals[start:start + window_size] = 1

    # Pass 2: unmark the middle sample of every strictly decreasing triple.
    for mid in range(1, n_points - 2):
        if data[mid + 1] < data[mid] < data[mid - 1]:
            signals[mid] = 0

    # Pass 3: unmark edges shorter than 3 points. Walking left to right, a marked sample that
    # follows an unmarked one is dropped when it stands alone or leads a run of exactly two
    # (the second sample of that run is then dropped as a lone point on the next step).
    for pos in range(1, n_points - 2):
        if signals[pos] != 1 or signals[pos - 1] != 0:
            continue
        lone_point = signals[pos + 1] == 0
        leads_pair = signals[pos + 1] == 1 and signals[pos + 2] == 0
        if lone_point or leads_pair:
            signals[pos] = 0
    return signals

def stage_rises_to_data(signals : np.ndarray, data : np.ndarray) -> np.ndarray:
    """
    Given stage rise signals, pick the matching rows out of the original data to
    obtain the rising edge samples (timestamp, value).

    signals: signal for each data point in timeseries: 1 == rising edge, else 0
    data:    stage data timeseries

    return: 2d array of all rising edge data points (the rows of data whose signal is 1)
    """
    rising_rows = [row for row, signal in enumerate(signals) if signal == 1]
    return np.take(data, rising_rows, 0)

def merge_additional_data(data : list, add_data, add_flag):
    """
    Merge additional data to timeseries data that already has flags.

    Walks data once, front to back, while consuming add_data in order: whenever the first
    element of a row equals the next pending add_data value, the row's flag (position 2) is
    overwritten with add_flag and the following add_data value becomes pending.

    data:     list of rows; position 0 is compared with add_data, position 2 holds the flag
    add_data: values to look for, in the order in which they occur in data
    add_flag: flag written to every matching row
    return:   data itself (rows are modified in place)
    """
    # NOTE(review): a pending value that never occurs in data stalls the cursor, so none of
    # the add_data values after it are flagged either.
    pending = 0
    for row in data:
        if pending >= len(add_data):
            break  # everything has been merged, the remaining rows stay untouched
        if row[0] == add_data[pending]:
            row[2] = add_flag
            pending += 1
    return data

def detect_flat_plat(data: np.ndarray, window: int, threshold: int):
    """
    Detect "flat plateaus" in a timeseries: consecutive datapoints at extreme amplitude.

    A run of `window` consecutive samples is a plateau when none of its values
    (column 1 of data) is below `threshold`.

    data:      timeseries to scan; the values are read from column 1
    window:    number of consecutive samples that make up a plateau
    threshold: value that no sample of the run may fall below
    return:    array, that is length of data: 1 = part of a detected plateau, 0 = not
    """
    n_samples = data.shape[0]
    signals = np.zeros(n_samples)

    # NOTE(review): the start positions stop one short of the last full window, so a plateau
    # made up of exactly the final `window` samples is not detected - confirm intended.
    for start in range(n_samples - window):
        stop = start + window
        dips_below = np.any(data[start:stop, 1] < threshold)
        if not dips_below:
            signals[start:stop] = 1
    return signals

def detect_pp(peaks : list[float],
               stage_peaks : list[float],
               hyperparams : dict):
    """
    Label the peaks of a timeseries as phantom peak (pp) or not, by checking whether a
    stage peak lies close enough to each of them.

    A peak is accepted when some stage peak's position 2 value falls inside
    [peak[2] - stage_before_threshold, peak[2] + stage_after_threshold); the lower bound is
    inclusive and the upper bound exclusive.

    peaks :       peaks from timeseries to detect pp in; position 2 of each peak is compared
    stage_peaks : peaks in stage to compare against; position 2 of each is compared
    hyperparams : dictionary containing the threshold hyperparameters
                  'stage_before_threshold' and 'stage_after_threshold'

    return :      list of [peak, label] with label 'accepted' (not pp) or 'rejected' (pp)
    """
    # NOTE(review): the annotations say list[float], but both sequences are indexed with [2].
    before = hyperparams['stage_before_threshold']
    after = hyperparams['stage_after_threshold']

    def has_stage_peak_nearby(peak):
        return any(
            (s_peak[2] >= peak[2] - before) and (s_peak[2] < peak[2] + after)
            for s_peak in stage_peaks
        )

    return [
        [peak, 'accepted' if has_stage_peak_nearby(peak) else 'rejected']
        for peak in peaks
    ]