In [1]:
# Import libraries and data 
import scipy.io as sio
from scipy import ndimage
import copy
import pickle
import numpy as np
from scipy.signal import find_peaks
from os.path import dirname, join as pjoin
import datetime
import csv
import math
import sys
sys.path.insert(1,'../')
import Tools.data_processing as dp
import Tools.data_movement as dm 
from auxiliary_functions import get_candidates, detect_flat_plat, detect_stage_rises

# Load the fDOM, stage and turbidity series with the same reader, in the same order as listed
fDOM_data, stage_data, turb_data = (
    dm.read_in_preprocessed_timeseries(csv_path)
    for csv_path in (
        '../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv',
        '../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv',
        '../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv',
    )
)
# Re-bind stage_data to the output of the project's stage/fDOM alignment helper
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

In [2]:
# Keep an untouched deep copy of the loaded fDOM series: later cells re-bind fDOM_data
# to a smoothed version, and this copy is used to restore / export the original values.
fDOM_data_copy = copy.deepcopy(fDOM_data)


In [23]:
# Restore fDOM_data from the backup. A fresh deep copy is taken instead of aliasing the
# backup, so that an in-place edit of fDOM_data can never alter fDOM_data_copy.
fDOM_data = copy.deepcopy(fDOM_data_copy)

# Attempt to smooth data

In [3]:
def low_pass_filter(data, window_len):
    """
    Smooth a 1-D signal with a normalised, symmetric, tent-shaped kernel.

    The kernel ramps linearly from 1 up to 3 over window_len points and is then mirrored
    around its last point, so the window holds (2 * window_len) - 1 weights centred on the
    current measurement (middle of window). The weights are divided by their sum, so a
    constant signal is returned unchanged.

    Parameters:
        data: 1-D array-like of measurements (the kernel is 1-D, so ndimage.convolve
              rejects inputs of any other dimensionality)
        window_len: number of points on each side of the kernel including the centre;
                    must be >= 1 (1 leaves the signal unchanged)

    Returns:
        float ndarray of the same length as data holding the smoothed values. Edges are
        handled by ndimage.convolve's default boundary mode.

    Raises:
        ValueError: if window_len is smaller than 1
    """
    if window_len < 1:
        raise ValueError("window_len must be a positive integer")

    # Convolve in floating point: ndimage.convolve returns the input's dtype by default,
    # which would truncate the smoothed values of an integer signal
    samples = np.asarray(data, dtype=float)

    # np.pad is the public spelling; the np.lib.pad alias is gone in NumPy 2.0
    ramp = np.linspace(1, 3, window_len)
    kernel = np.pad(ramp, (0, window_len - 1), 'reflect')
    kernel = kernel / np.sum(kernel)
    return ndimage.convolve(samples, kernel)

# Smooth the fDOM values and rebuild the two-column array (column 0 kept, column 1 smoothed).
# Always start from the untouched backup (fDOM_data_copy) so that re-running this cell
# never smooths an already-smoothed signal.
SMOOTHING_WINDOW_LEN = 7
smoothed_signal = low_pass_filter(fDOM_data_copy[:,1], SMOOTHING_WINDOW_LEN)
fDOM_data = np.column_stack((fDOM_data_copy[:,0],smoothed_signal))

### Detect and process stage rises

In [4]:
# Get stage rises
s_indices = detect_stage_rises(stage_data[:,1])

def _steps_since_rise(flags):
    """
    For every position, count the steps elapsed since the most recent entry equal to 1.

    The count is 0 at an entry equal to 1 and -1 for positions that precede the first one.
    """
    distances = np.full(len(flags), -1.0)
    elapsed = -1
    for pos, flag in enumerate(flags):
        if flag == 1:
            elapsed = 0
        elif elapsed != -1:
            elapsed += 1
        distances[pos] = elapsed
    return distances

# Column 0: steps since the previous stage rise (scanning forwards).
# Column 1: steps until the next stage rise (the same scan run on the reversed flags).
# -1 marks positions with no stage rise in that direction.
s_indexed = np.column_stack((
    _steps_since_rise(s_indices),
    _steps_since_rise(s_indices[::-1])[::-1],
))

### Get optimal fDOM candidate sets

Methods used to reduce the set of fDOM candidates: <br>

- Combat detecting shallow peaks by making two calls to find_peaks with different parameters

- Combat detecting "peaks" that are merely artifacts of plummeting peaks by ignoring peaks whose start and end are both significantly below the floating mean of the data in the recent past


In [11]:
# large_peaks_params = {'prom' : [8,None],
#                     'width': [15, None],
#                     'wlen' : 200,
#                     'dist' : 1,
#                     'rel_h': .6}

# small_peaks_params = {'prom' : [3,None],
#                     'width': [0, 15],
#                     'wlen' : 200,
#                     'dist' : 1,
#                     'rel_h': .6}

# large_peaks_params = {'prom' : [8,None],
#                     'width': [50, None],
#                     'wlen' : 200,
#                     'dist' : 1,
#                     'rel_h': .6}

# Settings handed to get_candidates when detecting fDOM candidate peaks
small_peaks_params = dict(
    prom=[3, None],
    width=[None, None],
    wlen=200,
    dist=1,
    rel_h=.6,
)

# Inclusive [first, last] index ranges; candidates whose peak index falls inside are discarded
remove_ranges = [[17816, 17849], [108170,108200],[111364, 111381]]
def isInRange(indx):
    """Return True when indx lies inside any of the inclusive ranges in remove_ranges."""
    return any(bounds[0] <= indx <= bounds[1] for bounds in remove_ranges)

# large_peaks, large_props = get_candidates(fDOM_data, large_peaks_params)
# Detect fDOM candidate peaks on the (smoothed) series; fDOM_props is read below through
# its 'left_ips', 'right_ips' and 'prominences' entries, indexed per peak
fDOM_peaks, fDOM_props = get_candidates(fDOM_data, small_peaks_params)

# Remove peaks from missing data error range: 12-10-27 14:00:00 -> 12-10-31 18:00 

# Peaks will be deleted, so get the relevant properties right now
# temp_large = []
# large_indices = set()
# for i in range(len(large_peaks)):
#     temp_large.append([large_peaks[i], large_props['prominences'][i], large_props['widths'][i], large_props['left_ips'][i],large_props['right_ips'][i]])
#     large_indices.add(large_peaks[i])

def _fDOM_candidate_row(position, peak):
    """
    Build one candidate row:
    [peak index, floored left edge, ceiled right edge,
     s_indexed[peak, 0], s_indexed[peak, 1], prominence]
    """
    return [
        peak,
        math.floor(fDOM_props['left_ips'][position]),
        math.ceil(fDOM_props['right_ips'][position]),
        s_indexed[peak,0],
        s_indexed[peak,1],
        fDOM_props['prominences'][position],
    ]

fDOM_cands = [_fDOM_candidate_row(position, peak) for position, peak in enumerate(fDOM_peaks)]
# large_peaks = temp_large
# turb_cand = [[peak, math.floor(turb_props['left_ips'][i]), math.ceil(turb_props['right_ips'][i]),s_indexed[peak,0], s_indexed[peak,1] ,turb_props['prominences'][i]] for i,peak in enumerate(turb_peaks)]
# temp_small = []
# small_indices = set()
# for i in range(len(small_peaks)):
#     temp_small.append([small_peaks[i], math.floor(small_props['left_ips'][i]), math.ceil(small_props['right_ips'][i]), small_props['left_ips'][i],small_props['right_ips'][i]])
#     small_indices.add(small_peaks[i])
# small_peaks = temp_small
# temp_small = []
# small_indices = set()
# for i in range(len(small_peaks)):
#     temp_small.append([small_peaks[i], small_props['prominences'][i], small_props['widths'][i], small_props['left_bases'][i],small_props['right_bases'][i]])
#     small_indices.add(small_peaks[i])
# small_peaks = temp_small

# Combine both lists of peak, if there are duplicates, prefer the information from the larger peak
# del_indices = large_indices.intersection(small_indices)
# if len(del_indices):
#     print(len(del_indices))
#     raise Exception("Sets should not overlap")
    
# fDOM_cands = large_peaks + small_peaks
# fDOM_cands = small_peaks

# def sortCands(cand):
#     return cand[0]
# fDOM_cands.sort(key = sortCands)

# print(len(fDOM_cands))

# Drop every candidate whose peak index falls inside one of the excluded index ranges,
# and keep an independent deep copy of the surviving rows
fDOM_cands = copy.deepcopy([cand for cand in fDOM_cands if not isInRange(cand[0])])


In [12]:
# Keep the full candidate rows under disp_peaks, then rebuild fDOM_cands as the reduced rows
# [peak index, row[3], row[4], s_indexed[peak, 0], s_indexed[peak, 1]].
# Note: re-running only this cell derives disp_peaks from the already reduced rows.
disp_peaks = fDOM_cands
fDOM_cands = []
for row in disp_peaks:
    peak_idx = row[0]
    fDOM_cands.append([peak_idx, row[3], row[4], s_indexed[peak_idx,0], s_indexed[peak_idx,1]])


In [None]:
# Import ground truth values 
# NOTE(review): the path sits under ground_truths/turb/turb_pp although the file name
# refers to fDOM — confirm this is the intended location.
truth_fname = '../Data/labeled_data/ground_truths/turb/turb_pp/julian_time/fDOM_pp_0k-300k_labeled'

with open(truth_fname, 'r', newline = '') as f:
    reader = csv.reader(f, delimiter = ',')
    # truth entries in form: ['timestamp_of_peak', 'value_of_peak','label_of_peak','idx_of_peak']
    next(reader)  # skip the first row (presumably a header)
    truths = [[float(row[0]), float(row[1]), row[2], int(row[3])] for row in reader]
# The with-block closes the file on exit, so no explicit close() is needed

# The candidate list is named fDOM_cands (the previously referenced fDOM_cand is never
# defined). Labels and candidates are matched by position, so the lengths must agree.
assert len(truths) == len(fDOM_cands), "number of ground-truth rows does not match number of fDOM candidates"

# Nested Cross Validation for fDOM PP

In [83]:
# Define Hyperparameter ranges and training parameters and helper function
iterations = 7000   # random-search draws per split
num_splits = 5      # number of train/test splits

# Inclusive bounds of the random search for params['x'] and params['y']
x_bounds = [0, 30]
y_bounds = [0, 30]


# Per-split records, keyed by split number (starting at 1)
accumulated_test_metrics = {}

accumulated_test_results = {}

accumulated_best_params = {}

# NOTE(review): TimeSeriesSplit is not imported in any visible cell; it presumably comes from
# sklearn.model_selection and has to be imported before this cell runs — confirm.
# The candidate list is named fDOM_cands (the previously referenced fDOM_cand is never defined).
# The splits are materialised into a list because .split() yields them lazily: a bare
# generator would be exhausted after the training cell has run once.
train_test_split_indices = list(TimeSeriesSplit(num_splits).split(fDOM_cands))


def classify_fDOM_peaks(peaks, params):
    """
    Label every candidate as 'NPP' or 'PP'.

    A candidate is 'NPP' when its entry 3 is not -1 and at most params['x'], or its
    entry 4 is not -1 and at most params['y']; otherwise it is 'PP'. In this notebook
    entries 3 and 4 are filled from the two columns of s_indexed, where -1 stands for
    "no stage rise in that direction".

    Returns a list of [peak index, label] pairs in input order.
    """
    labelled = []
    for cand in peaks:
        if cand[3] != -1 and cand[3] <= params['x']:
            verdict = 'NPP'
        elif cand[4] != -1 and cand[4] <= params['y']:
            verdict = 'NPP'
        else:
            verdict = 'PP'
        labelled.append([cand[0], verdict])
    return labelled

def label_positives_negatives(predictions, truths):
    """
    Compare predicted labels with ground-truth labels and count confusion-matrix outcomes.

    'PP' is the positive class: any label other than 'NPP' is treated as positive, for
    predictions and truths alike. Rows are matched by position.

    Parameters:
        predictions: list of [peak index, predicted label] rows
        truths: list of rows whose entry 2 is the true label; needs at least as many
                rows as predictions

    Returns:
        (TP, TN, FP, FN, results), where results holds one new row per prediction in the
        form [peak index, predicted label, outcome] and outcome is 'TP', 'TN', 'FP' or 'FN'.
        The input rows are left unmodified.

    Raises:
        IndexError: if truths has fewer rows than predictions
    """
    if len(truths) < len(predictions):
        raise IndexError("truths must contain an entry for every prediction")

    TP = TN = FP = FN = 0
    results = []

    for prediction_row, truth_row in zip(predictions, truths):
        prediction = prediction_row[1]
        truth = truth_row[2]

        if prediction == 'NPP':
            if truth == 'NPP':
                TN += 1
                outcome = 'TN'
            else:
                FN += 1
                outcome = 'FN'
        else:
            if truth == 'NPP':
                FP += 1
                outcome = 'FP'
            else: #TODO: Comeback and evaluate if this makes sense: Algo could predict PP because X/Y was not optimal, while truth was PPP because of interference
                TP += 1
                outcome = 'TP'

        # list.append returns None, so appending its result stored None for every row and
        # also mutated the caller's predictions; build a fresh labelled row instead
        results.append(list(prediction_row) + [outcome])

    return (TP,TN,FP,FN,results)
    

562


In [None]:
overall_start = datetime.datetime.now()

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

def _score_confusion(TP, TN, FP, FN):
    """
    Return (f1_score, balanced_accuracy) for the given confusion counts.

    Ratios with an empty denominator (e.g. a fold without any positive example) count
    as 0.0 instead of raising ZeroDivisionError.
    """
    TPR = _safe_ratio(TP, TP + FN)
    TNR = _safe_ratio(TN, TN + FP)
    bal_acc = (TPR + TNR)/2
    f1_score = _safe_ratio(2 * TP, (2 * TP) + FP + FN)
    return f1_score, bal_acc

# Seeded generator so the random search (and therefore the reported metrics) is reproducible
RANDOM_SEED = 42
param_rng = np.random.default_rng(RANDOM_SEED)

# Progress is printed about ten times per split; max() keeps the step non-zero when iterations < 10
progress_step = max(1, iterations // 10)

split = 1

for train_val_indices, test_indices in train_test_split_indices:
    # The candidate list is named fDOM_cands (the previously referenced fDOM_cand is never defined)
    X_train, y_train = [fDOM_cands[i] for i in train_val_indices], [truths[i] for i in train_val_indices]
    X_test, y_test = [fDOM_cands[i] for i in test_indices], [truths[i] for i in test_indices]

    # Start below any achievable score so the first draw always initialises best_params;
    # otherwise a split whose scores are all 0 would reuse the previous split's parameters
    # (or hit a NameError on the first split)
    max_fold_metric = -1
    max_result = None
    best_params = None

    print("Split: ",split)

    split_start = datetime.datetime.now()

    for iteration in range(iterations):

        # Random grid search for hyperparams (both bounds inclusive)
        params = {
            'x': int(param_rng.integers(x_bounds[0], x_bounds[1], endpoint=True)),
            'y': int(param_rng.integers(y_bounds[0], y_bounds[1], endpoint=True)),
        }

        predictions = classify_fDOM_peaks(X_train, params)

        TP,TN,FP,FN,results = label_positives_negatives(predictions, y_train)

        f1_score, bal_acc = _score_confusion(TP, TN, FP, FN)

        # Model selection is driven by the training F1 score
        acc = f1_score
        if iteration and iteration % progress_step == 0: print(" {}/{} ".format(iteration, iterations), end = "")
        if acc > max_fold_metric:
            max_fold_metric = acc
            max_result = copy.deepcopy(results)
            best_params = copy.deepcopy(params)

    if best_params is None:
        raise RuntimeError("iterations must be at least 1 to select hyperparameters")

    # Test best parameters on testing data 
    test_predictions = classify_fDOM_peaks(X_test, best_params)
    TP,TN,FP,FN,results = label_positives_negatives(test_predictions, y_test)

    f1_score, bal_acc = _score_confusion(TP, TN, FP, FN)

    print('\nSplit: {}  F1: {:.4f} BA: {:.4f}  Params: {}  TP: {} TN: {} FP: {} FN: {}  Time: {}'.format(split, f1_score, bal_acc, best_params, TP, TN, FP, FN, datetime.datetime.now() - split_start))
    accumulated_test_metrics[split] = [f1_score, bal_acc] # Record test metrics of each split
    accumulated_test_results[split] = copy.deepcopy(results) # Record test results (FP,FN,TP,TN for each datapoint) for each split
    accumulated_best_params[split] = copy.deepcopy(best_params) # Record params uses in testing for each split

    split+=1

# Display Results
# mean_f1 / mean_ba hold the running totals over all splits; the division by the number
# of splits happens when printing
mean_f1 = sum(fold_metrics[0] for fold_metrics in accumulated_test_metrics.values())
mean_ba = sum(fold_metrics[1] for fold_metrics in accumulated_test_metrics.values())
num_recorded_splits = len(accumulated_test_metrics)

print("Mean Test F1: ", mean_f1/num_recorded_splits)
print("Mean Test BA: ", mean_ba/num_recorded_splits)

print("Training time: ", datetime.datetime.now() - overall_start)

# Persist the outcome of the cross-validation.
# Each with-block closes its file on exit, so no explicit close() is needed.

# Pickle the best params of the last split only
with open('./Experiments/fDOM_PP/best_params.pkl', 'wb') as pck_file:
    pickle.dump(accumulated_best_params[num_splits], pck_file)

# Pickle the per-datapoint test results of every split (dict keyed by split number)
with open('./Experiments/fDOM_PP/test_results.pkl', 'wb') as pck_file:
    pickle.dump(accumulated_test_results, pck_file)

# Pickle the test metrics [f1, balanced accuracy] of every split (dict keyed by split number)
with open('./Experiments/fDOM_PP/test_metrics.pkl', 'wb') as pck_file:
    pickle.dump(accumulated_test_metrics, pck_file)

In [10]:
# Settings handed to get_candidates when detecting turbidity candidate peaks
turb_cand_params = {'prom' : [6,None],
                    'width': [None, None],
                    'wlen' : 200,
                    'dist' : 1,
                    'rel_h': .6}

# Get turbidity candidate peaks
turb_peaks, turb_props = get_candidates(turb_data, turb_cand_params)

# Remove peaks that occur during a flat plateau 
turb_flat_plat = detect_flat_plat(turb_data, 100, 40)
turb_flat_plat_indxs = [i for i in range(turb_flat_plat.shape[0]) if turb_flat_plat[i] == 1]

# Set lookup keeps the membership test O(1) per peak; scanning the list for every peak
# was quadratic in the length of the series
flat_plat_lookup = set(turb_flat_plat_indxs)
take_indices = [i for i, peak in enumerate(turb_peaks) if peak not in flat_plat_lookup]

# An explicit integer dtype keeps np.take working when no peak survives (an empty list
# would otherwise become a float array, which np.take rejects as indices)
take_positions = np.asarray(take_indices, dtype=int)

# Keep only the surviving peaks, and the matching entry of every property array
turb_peaks = np.take(turb_peaks, take_positions)
for key in turb_props:
    turb_props[key] = np.take(turb_props[key], take_positions)

# For every fDOM candidate collect the peak sample plus the timestamps (column 0) of its
# left and right edges.
# disp_peaks is used because its rows are [peak index, left edge, right edge, ...]; after the
# conversion cell, fDOM_cands rows hold stage-rise distances in positions 1 and 2, which
# would be misread here as edge indices.
fDOM_events = []
fDOM_lb = []
fDOM_rb = []

for cand in disp_peaks:
    fDOM_events.append(np.array((fDOM_data[cand[0]])))
    fDOM_lb.append(fDOM_data[math.floor(cand[1]),0])
    fDOM_rb.append(fDOM_data[math.ceil(cand[2]),0])

# De-duplicate and sort the edge timestamps
fDOM_lb = sorted(set(fDOM_lb))
fDOM_rb = sorted(set(fDOM_rb))

# Peak samples of the remaining turbidity candidates
turb_events = [np.array((turb_data[peak])) for peak in turb_peaks]

# Unique, sorted timestamps (column 0) of the floored left / ceiled right edge of every peak
peak_positions = range(len(turb_peaks))
turb_lb = sorted({turb_data[math.floor(turb_props['left_ips'][pos]),0] for pos in peak_positions})
turb_rb = sorted({turb_data[math.ceil(turb_props['right_ips'][pos]),0] for pos in peak_positions})

# Merge the peak events into each series (presumably tagging them 'f_opp' / 't_opp' — confirm in dp.merge_data)
fDOM_merged = dp.merge_data(fDOM_data, fDOM_events, 'f_opp', '')
turb_merged = dp.merge_data(turb_data, turb_events, 't_opp', '')

# Add the left/right edge timestamps: fDOM first, then turbidity (same call order as before)
for edge_label, edge_times in (('left_base', fDOM_lb), ('right_base', fDOM_rb)):
    fDOM_merged = dp.merge_additional_data(fDOM_merged, edge_times, edge_label)

for edge_label, edge_times in (('left_base', turb_lb), ('right_base', turb_rb)):
    turb_merged = dp.merge_additional_data(turb_merged, edge_times, edge_label)


# Turn the detected stage rises into data rows and merge them into the stage series
stage_edge_data = dp.stage_rises_to_data(s_indices, stage_data)
stage_data_merged = dp.merge_data(stage_data, stage_edge_data, 'rise','')

# Export the merged smoothed fDOM series, the merged stage series and the original
# (unsmoothed) fDOM backup, merged with no events, to a CSV used for plotting.
# NOTE(review): the two True flags and the 200000 / 300000 arguments are positional; judging
# by the output file name ("200k-300k") the numbers presumably select the exported index
# range — confirm against dm.write_data_to_trainset.
dm.write_data_to_trainset(fDOM_merged,
                          stage_data_merged,
                          dp.merge_data(fDOM_data_copy, [], '',''),
                          '../Data/temp_plotting/fDOM_PP_smoothed_200k-300k.csv',
                          True,
                          True,
                          200000,
                          300000)