In [80]:
# Import libraries and data 
import scipy.io as sio
import copy
import pickle
import numpy as np
from scipy.signal import find_peaks
from os.path import dirname, join as pjoin
import datetime
import csv
import math
import sys
sys.path.insert(1,'../')
import Tools.data_processing as dp
import Tools.data_movement as dm 
from auxiliary_functions import get_candidates, detect_flat_plat, detect_stage_rises

# Load the three preprocessed time series from the julian_format CSV exports.
# NOTE(review): the file names suggest the stage export ends 1.1.19 while fDOM and
# turbidity run to 9.4.2020 - confirm the differing coverage is intended.
fDOM_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv')
stage_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv')
turb_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv')
# Replace the raw stage series with a version aligned to the fDOM series
# (presumably so both share the same rows/timestamps - see dp.align_stage_to_fDOM).
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

### Detect and process stage rises

In [82]:
# Flag stage rises: an entry of s_indices equal to 1 marks a rise at that sample
s_indices = detect_stage_rises(stage_data[:,1])

# Build a per-sample lookup of the distance (in samples) to the neighbouring stage rises:
#   s_indexed[i, 0] -> samples elapsed since the most recent rise at or before i
#                      (-1 while no rise has been seen yet)
#   s_indexed[i, 1] -> samples remaining until the next rise at or after i
#                      (-1 once no further rise follows)
num_samples = s_indices.shape[0]
s_indexed = np.zeros((num_samples, 2))

# Forward pass fills column 0
since_rise = -1
for idx in range(num_samples):
    if s_indices[idx] == 1:
        since_rise = 0
    elif since_rise != -1:
        since_rise += 1
    s_indexed[idx, 0] = since_rise

# Backward pass fills column 1
until_rise = -1
for idx in reversed(range(num_samples)):
    if s_indices[idx] == 1:
        until_rise = 0
    elif until_rise != -1:
        until_rise += 1
    s_indexed[idx, 1] = until_rise

### Get optimal fDOM candidate sets

Methods to reduce the fDOM candidate set: <br>

- Combat detecting shallow peaks by making two calls to find_peaks with different parameters

- Avoid detecting false "peaks" created by plummeting (sharply dropping) values: do not consider peaks whose start and end both lie significantly below the floating mean of the data in the recent past


In [83]:
# Two parameter sets for the candidate search (presumably forwarded to scipy's find_peaks
# by get_candidates - confirm): one for wide, prominent peaks and one for narrow ones.
# The width ranges [15, None] and [0, 15] split the two searches at a width of 15.
large_peaks_params = dict(prom=[8, None], width=[15, None], wlen=200, dist=1, rel_h=.6)
small_peaks_params = dict(prom=[3, None], width=[0, 15], wlen=200, dist=1, rel_h=.6)

large_peaks, large_props = get_candidates(fDOM_data, large_peaks_params)
small_peaks, small_props = get_candidates(fDOM_data, small_peaks_params)

# Peaks inside the missing-data range 12-10-27 14:00:00 -> 12-10-31 18:00 are removed
# further down in this cell, after both candidate lists have been merged.

# The peak arrays are replaced by per-peak records, so capture the matching properties now.
# Record layout: [peak index, prominence, width, left_ips, right_ips]
large_indices = {large_peaks[i] for i in range(len(large_peaks))}
large_peaks = [
    [large_peaks[i],
     large_props['prominences'][i],
     large_props['widths'][i],
     large_props['left_ips'][i],
     large_props['right_ips'][i]]
    for i in range(len(large_peaks))
]

small_indices = {small_peaks[i] for i in range(len(small_peaks))}
small_peaks = [
    [small_peaks[i],
     small_props['prominences'][i],
     small_props['widths'][i],
     small_props['left_ips'][i],
     small_props['right_ips'][i]]
    for i in range(len(small_peaks))
]

# The two searches are expected to be disjoint; a peak index found by both is treated as an error
del_indices = large_indices & small_indices
if del_indices:
    print(len(del_indices))
    raise Exception("Sets should not overlap")

# Merge both record lists into a single candidate list (not yet ordered)
fDOM_cands = large_peaks + small_peaks

def sortCands(cand):
    """Sort key for candidate records: the peak's sample index (first element)."""
    peak_index = cand[0]
    return peak_index
# Order the merged candidates by peak sample index
fDOM_cands.sort(key = sortCands)

print(len(fDOM_cands))

# Drop every candidate whose peak index lies strictly between 17816 and 17849.
# These are the row indices time_to_idx reports elsewhere in this notebook for the
# boundaries of the missing-data range (12-10-27 14:00:00 -> 12-10-31 18:00:00);
# candidates sitting exactly on a boundary index are kept.
kept_cands = [cand for cand in fDOM_cands if cand[0] <= 17816 or cand[0] >= 17849]

fDOM_cands = copy.deepcopy(kept_cands)
print(len(fDOM_cands))

# removed_peaks = []
# non_removed_peaks = []

# For each peak, determine if its left ips and right ips are both below half of the last min(1000 points, start of data to peak)
# for peak in fDOM_cands:
#     window_back = min(peak[0], 5000)
    
#     mean = np.mean(fDOM_data[peak[0] - window_back: peak[0], 1]) * .8
#     if fDOM_data[math.floor(peak[3]),1] < mean and fDOM_data[math.ceil(peak[4]),1] < mean:
#         removed_peaks.append(peak)
#     else:
#         non_removed_peaks.append(peak)


    

821
818


In [41]:
# A candidate whose value sits very close to the local mean of a quiet neighbourhood is
# probably not a peak we care about: split the candidates into removed / non-removed lists.
# NOTE(review): candidates closer than win_len samples to either end of the series fail the
# bounds check and end up in NEITHER list - confirm that dropping them is intended.
removed_peaks = []
non_removed_peaks = []
win_len = 60
tolerance  = 2
for peak in fDOM_cands:
    window_start = peak[0] - win_len
    window_end = peak[0] + win_len
    if window_start <= 0 or window_end >= fDOM_data.shape[0]:
        continue

    # Values from win_len samples before the peak up to (excluding) win_len samples after it
    window = fDOM_data[window_start:window_end, 1]
    close_to_mean = abs(np.mean(window) - fDOM_data[peak[0], 1]) < tolerance
    if close_to_mean and np.std(window) < 4:
        removed_peaks.append(peak)
    else:
        non_removed_peaks.append(peak)
    


In [73]:
# Sizes of the removed, retained and complete candidate lists (one per line)
print(len(removed_peaks), len(non_removed_peaks), len(fDOM_cands), sep='\n')

0
0
818


In [128]:
# Experiment with calculating a peaks smoothness
def calc_roughness(x):
    """Return a roughness score for the 1-D sequence x.

    The first differences of x are standardised (zero mean, unit standard deviation);
    the score is the mean of the squared differences between consecutive standardised
    values, divided by 4.

    Returns 0 when no score can be formed: fewer than two samples (no first difference
    exists) or first differences with zero standard deviation (constant slope).
    """
    first_derivative = np.diff(x)
    # Without this guard np.std of an empty array yields nan (plus RuntimeWarnings);
    # nan is truthy, so the function would go on to return nan instead of 0.
    if first_derivative.size == 0:
        return 0

    std = np.std(first_derivative)
    if not std:
        return 0

    normalized_first_derivative = (first_derivative - np.mean(first_derivative)) / std
    roughness = (np.diff(normalized_first_derivative) ** 2) / 4
    return np.mean(roughness)


# Try calc_roughness on a zig-zag series, a nearly linear ramp and a series with large spikes
arr1 = np.array([0,2,-1,4,1,6,2,8,1,10])
arr2 = np.array([0,2,3,4,5])
arr3 = np.array([0,-22,3,110,-1000,6,-1,8,9,110])

for sample in (arr1, arr2, arr3):
    print(calc_roughness(sample))

0.9529820448307408
0.4444444444444446
0.8694362691525639


In [136]:
def calc_roughness3(x):
    fod = np.diff(x)
    return(np.mean(fod))
# Try calc_roughness3 on a steady ramp, a flat series and a noisy series ending in a plateau
arr1 = np.array([0,2,3,4,5,6,7,8,9,10,11,12])
arr2 = np.array([0,0,0])
arr3 = np.array([0,-22,3,1,5,6,-1,8,9,110,110,110,110,110,110])
for sample in (arr1, arr2, arr3):
    print(calc_roughness3(sample))

1.0909090909090908
0.0
12.285714285714286


In [175]:
def calc_roughness4(x):
    """Return the number of slope-direction switches in x divided by len(x).

    A switch is counted for each pair of consecutive first differences whose signs
    differ, unless the second difference of the pair is zero (flat step).
    The raw switch count is also printed.
    """
    directions = np.sign(np.diff(x))
    num_switch = sum(
        1
        for current, following in zip(directions, directions[1:])
        if current != following and following != 0
    )
    print(num_switch)
    return num_switch / len(x)

In [130]:
def calc_roughness2(x):
    """Return std / |mean| of the first differences of x, or 0 when that mean is zero."""
    steps = np.diff(x)
    average_step = np.mean(steps)
    if not average_step:
        return 0
    return np.std(steps) / np.abs(average_step)

# Try calc_roughness2 on a zig-zag series, a nearly linear ramp and a series with large spikes
arr1 = np.array([0,2,-1,4,1,6,2,8,1,10])
arr2 = np.array([0,2,3,4,5])
arr3 = np.array([0,-22,3,110,-1000,6,-1,8,9,110])

for sample in (arr1, arr2, arr3):
    print(calc_roughness2(sample))


4.675467891024384
0.34641016151377546
41.05140297176716


In [70]:
# Boundaries of the missing-data range, written as yy/mm/dd/HH:MM:SS
x1 = '12/10/27/14:00:00'
x2 = '12/10/31/18:00:00'

# Look up and print the row index of each boundary timestamp
for boundary in (x1, x2):
    print(time_to_idx(boundary))

17816
17849


In [67]:
def time_to_idx(time):
    """Return the row index in turb_data whose timestamp equals the given time.

    time: timestamp string formatted as 'yy/mm/dd/HH:MM:SS', e.g. '12/10/27/14:00:00'.

    The string is parsed, converted with dp.datetime_to_julian and compared for exact
    equality against column 0 of the module-level turb_data array.
    Returns the index of the first matching row, or -1 when no row matches.
    """
    # %H:%M:%S is spelled out instead of %X: %X is the locale's time representation,
    # so parsing would otherwise depend on the locale the kernel runs under.
    parsed = datetime.datetime.strptime(time, '%y/%m/%d/%H:%M:%S')
    julian = dp.datetime_to_julian(parsed)

    # One vectorised comparison over the timestamp column instead of a Python-level row scan
    matches = np.flatnonzero(turb_data[:, 0] == julian)
    if matches.size:
        return int(matches[0])
    return -1

# end = time_to_idx('12/08/09/15:30:00') 
# start = time_to_idx('12/08/08/18:15:00') 
# test_arr = turb_data[start:end+1, 1]
# print(calc_roughness4(np.around(test_arr,10)))


# start = time_to_idx('12/08/09/16:00:00')
# end = time_to_idx('12/08/10/00:30:00')
# test_arr = turb_data[start:end+1, 1]
# print(calc_roughness4(np.around(test_arr,10)))

# start = time_to_idx('12/08/02/18:30:00')
# end = time_to_idx('12/08/03/05:00:00')
# test_arr = turb_data[start:end+1, 1]
# print(calc_roughness4(np.around(test_arr,10)))

# start = time_to_idx('12/07/04/03:30:00')
# end = time_to_idx('12/07/04/14:45:00')
# test_arr = turb_data[start:end+1, 1]
# print(calc_roughness4(np.around(test_arr,10)))

# start = time_to_idx('12/06/25/14:15:00')
# end = time_to_idx('12/06/25/23:15:00')
# test_arr = turb_data[start:end+1, 1]
# print(calc_roughness4(np.around(test_arr,10)))

In [105]:
# Step-by-step version of the standardised-derivative roughness on a series with one huge spike
arr3 = np.array([0,0,3,1,5,6,-100000,8,1009,110])

# First differences and their mean / standard deviation
fd = np.diff(arr3)
m, std = np.mean(fd), np.std(fd)

# Standardise the differences, then take the squared consecutive differences divided by 4
x = (fd - m)/std
r = (np.diff(x) ** 2)/4

# Show the differences, their mean, the per-step roughness terms and the overall mean
for value in (fd, m, r, np.mean(r)):
    print(value)

[      0       3      -2       4       1 -100006  100008    1001    -899]
12.222222222222221
[1.01226672e-09 2.81185201e-09 4.04906690e-09 1.01226672e-09
 1.12489827e+00 4.49959309e+00 1.10251436e+00 4.06031430e-04]
0.8409264705134258


In [38]:
# Standard deviation of column 1 over the first ten fDOM rows
first_ten_values = fDOM_data[:10, 1]
print(np.std(first_ten_values))

0.14781603361113027


In [43]:
# Report how many candidates were filtered out, then print the timestamp of each one
print(len(removed_peaks))
for removed in removed_peaks:
    peak_time = fDOM_data[removed[0], 0]
    print(dp.julian_to_datetime(peak_time))

53
2012-08-26 13:15:00
2012-08-27 09:15:00
2012-09-01 08:45:00
2012-11-06 14:45:00
2012-11-06 15:45:00
2012-11-06 16:30:00
2012-11-06 17:15:00
2013-04-13 16:15:00
2013-04-13 17:15:00
2013-04-13 19:45:00
2013-04-13 21:15:00
2013-04-14 01:00:00
2013-04-14 19:30:00
2013-04-14 21:30:00
2013-04-14 22:15:00
2013-04-14 23:30:00
2013-04-17 09:00:00
2013-04-17 11:00:00
2013-04-18 06:30:00
2013-09-10 14:00:00
2013-10-16 12:30:00
2014-04-29 14:45:00
2014-07-22 13:15:00
2015-04-14 14:45:00
2015-09-22 13:00:00
2015-12-04 05:45:00
2016-12-12 09:00:00
2017-04-24 13:15:00
2017-12-15 07:30:00
2017-12-15 08:15:00
2017-12-15 09:00:00
2017-12-15 09:45:00
2017-12-15 10:30:00
2017-12-15 11:30:00
2017-12-15 12:30:00
2017-12-15 13:30:00
2017-12-15 14:30:00
2017-12-15 15:30:00
2017-12-18 02:45:00
2017-12-18 03:15:00
2017-12-18 06:30:00
2017-12-18 07:00:00
2018-01-30 11:45:00
2018-01-30 17:30:00
2018-03-04 14:15:00
2018-04-25 14:00:00
2018-04-26 00:45:00
2018-08-13 05:15:00
2018-08-13 07:00:00
2018-08-14 10:00:

In [86]:
# Select which candidate list the following cells work with.
# The filtered list is currently switched off, so every fDOM candidate is used.
# Note that this binds disp_peaks to the same list object as fDOM_cands (no copy is made).
# disp_peaks = non_removed_peaks
disp_peaks = fDOM_cands

In [85]:
# Show the first record of the selected list next to the first record of the full list
print(disp_peaks[0], fDOM_cands[0], sep='\n')

[616, 10.488100619999997, 45.436164568494746, 606.1435361583116, 651.5797007268063]
[616, 10.488100619999997, 45.436164568494746, 606.1435361583116, 651.5797007268063]


In [87]:
# Reshape every selected candidate record into
#   [peak index, left_ips, right_ips, samples since previous stage rise, samples until next stage rise]
# where the last two values are read from s_indexed at the peak's index.
reshaped_cands = []
for record in disp_peaks:
    peak_idx = record[0]
    reshaped_cands.append([peak_idx,
                           record[3],
                           record[4],
                           s_indexed[peak_idx, 0],
                           s_indexed[peak_idx, 1]])
fDOM_cands = reshaped_cands


In [79]:
print(len(fDOM_cands))

768


In [91]:
# Candidate-search parameters for turbidity (presumably [min, max] ranges handed on to
# the peak finder by get_candidates - confirm): prominence from 6, no width restriction.
turb_cand_params = {'prom' : [6,None],
                    'width': [None, None],
                    'wlen' : 200,
                    'dist' : 1,
                    'rel_h': .6}

# Get turbidity candidate peaks
turb_peaks, turb_props = get_candidates(turb_data, turb_cand_params)

# Remove peaks that occur during a flat plateau:
# collect every index that detect_flat_plat flags with 1 ...
turb_flat_plat = detect_flat_plat(turb_data, 100, 40)
turb_flat_plat_indxs = [i for i in range(turb_flat_plat.shape[0]) if turb_flat_plat[i] == 1]

# ... and keep the positions of the peaks that are not among them. The set turns each
# membership test into a constant-time lookup (a list would be rescanned for every peak).
flat_plat_lookup = set(turb_flat_plat_indxs)
take_indices = [i for i, peak in enumerate(turb_peaks) if peak not in flat_plat_lookup]

# Explicit integer dtype: an empty Python list would otherwise become a float array,
# which np.take rejects as an index.
take_indices = np.asarray(take_indices, dtype=np.intp)

# Apply the same selection to the peaks and to every property array describing them
turb_peaks = np.take(turb_peaks, take_indices)
for key in turb_props:
    turb_props[key] = np.take(turb_props[key], take_indices)

# For every candidate collect
#   * the data row at the peak itself (the "event"), and
#   * the timestamps (column 0) of the rows at floor(left_ips) and ceil(right_ips),
#     i.e. the left and right base of the peak.
# Base timestamps are de-duplicated and sorted in ascending order.
fDOM_events = [np.array(fDOM_data[cand[0]]) for cand in fDOM_cands]
fDOM_lb = sorted({fDOM_data[math.floor(cand[1]), 0] for cand in fDOM_cands})
fDOM_rb = sorted({fDOM_data[math.ceil(cand[2]), 0] for cand in fDOM_cands})

# Same for turbidity, where the base positions come from the peak properties
turb_events = [np.array(turb_data[peak]) for peak in turb_peaks]
turb_lb = sorted({turb_data[math.floor(turb_props['left_ips'][i]), 0]
                  for i in range(len(turb_peaks))})
turb_rb = sorted({turb_data[math.ceil(turb_props['right_ips'][i]), 0]
                  for i in range(len(turb_peaks))})

# Merge the peak events into their series under the labels 'f_opp' (fDOM) and 't_opp' (turbidity)
fDOM_merged = dp.merge_data(fDOM_data, fDOM_events, 'f_opp', '')
turb_merged = dp.merge_data(turb_data, turb_events, 't_opp', '')

# Add the left/right base timestamps of the fDOM candidates
fDOM_merged = dp.merge_additional_data(fDOM_merged, fDOM_lb, 'left_base')
fDOM_merged = dp.merge_additional_data(fDOM_merged, fDOM_rb, 'right_base')

# Add the left/right base timestamps of the turbidity candidates
turb_merged = dp.merge_additional_data(turb_merged, turb_lb, 'left_base')
turb_merged = dp.merge_additional_data(turb_merged, turb_rb, 'right_base')


# Turn the stage-rise flags into data rows and merge them into the stage series as 'rise'
stage_edge_data = dp.stage_rises_to_data(s_indices, stage_data)
stage_data_merged = dp.merge_data(stage_data, stage_edge_data, 'rise','')

# Write the three merged series to a single CSV.
# NOTE(review): the output path is an absolute, machine-specific location; it will not
# exist on other machines - consider a path relative to the project's Data directory.
# NOTE(review): 200000 / 300000 presumably select the exported row range (they match the
# "200k-300k" in the file name); the meaning of the two True flags is defined by
# dm.write_data_to_trainset - confirm both against that function.
dm.write_data_to_trainset(fDOM_merged,
                          stage_data_merged,
                          turb_merged,
                          '/Users/zachfogg/Desktop/DB-SRRW/Data/plot/fDOM_cand_200k-300k.csv',
                          True,
                          True,
                          200000,
                          300000)

In [68]:
# Lengths of the stage and fDOM series, one per line
print(len(stage_data), len(fDOM_data), sep='\n')

229620
229620


In [14]:
# Show the integers 0 through 9 as a list
print([*range(10)])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
