In [1]:
# Import libraries and data 
import scipy.io as sio
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
import copy
import pickle
import numpy as np
from scipy.signal import find_peaks
from os.path import dirname, join as pjoin
import datetime
import csv
import math
import sys
sys.path.insert(1,'../')
import Tools.data_processing as dp
import Tools.data_movement as dm 
from Tools.auxiliary_functions import get_candidates, detect_flat_plat

fDOM_raw_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv')
stage_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv')
turb_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv')
stage_data = dp.align_stage_to_fDOM(fDOM_raw_data, stage_data)

### Define the two stage rise detection algorithms being tested and helper function

In [2]:
def detect_stage_rises_one_window(data, params):
    slope_threshold = params['slope_threshold']
    duration_threshold = params['duration_threshold']
    
    signals = np.zeros(len(data))
    slopes = np.diff(data)
    continuous_rises = np.zeros(len(slopes))
    
    # For each point, how long has there been a sustained rise
    length = 0
    for i in range(len(slopes)):
        if slopes[i] >= slope_threshold:
            length +=1
        else:
            length = 0
        continuous_rises[i] = length
        
    # For each point, if the rise has been sustained for a sufficient period, label all points in that rise
    for i in range(len(continuous_rises)):
        rise_length = continuous_rises[i]
        if rise_length >= duration_threshold:
            signals[int(i - rise_length + 1):i+1] = 1

    return signals
                       
def detect_stage_rises_two_window(data, params):
    w1_slope_threshold = params['w1_slope_threshold']
    w2_slope_threshold = params['w2_slope_threshold']
    w1_duration_threshold = params['w1_duration_threshold']
    w2_duration_threshold = params['w2_duration_threshold']

    signals = np.zeros(len(data))
    slopes = np.diff(data)
    
    w1_continuous_rises = np.zeros(len(slopes))
    w2_continuous_rises = np.zeros(len(slopes))
    
    # For each point, how long has there been a sustained rise
    w1_length = 0
    w2_length = 0
    for i in range(len(slopes)):
        # Window 1
        if slopes[i] >= w1_slope_threshold:
            w1_length +=1
        else:
            w1_length = 0
        w1_continuous_rises[i] = w1_length
        
        # Window 2 
        if slopes[i] >= w2_slope_threshold:
            w2_length +=1
        else:
            w2_length = 0
            
        w2_continuous_rises[i] = w2_length
    
    # For each point, if the rise has been sustained for a sufficient period, label all points in that rise
    for i in range(len(w1_continuous_rises)):
        # Window 1
        rise_length = w1_continuous_rises[i]
        if rise_length >= w1_duration_threshold:
            signals[int(i - rise_length + 1):i+2] = 1
            
        # Window 2
        rise_length = w2_continuous_rises[i]
        if rise_length >= w2_duration_threshold:
            signals[int(i - rise_length + 1):i+2] = 1
    
    return signals

def detect_stage_rises_two_window_height(data, params):
    w1_slope_threshold = params['w1_slope_threshold']
    w2_slope_threshold = params['w2_slope_threshold']
    w1_duration_threshold = params['w1_duration_threshold']
    w2_duration_threshold = params['w2_duration_threshold']
    min_height = params['min_height']

    signals = np.zeros(len(data))
    slopes = np.diff(data)
    
    w1_continuous_rises = np.zeros(len(slopes))
    w2_continuous_rises = np.zeros(len(slopes))
    
    # For each point, how long has there been a sustained rise
    w1_length = 0
    w2_length = 0
    for i in range(len(slopes)):
        # Window 1
        if slopes[i] >= w1_slope_threshold:
            w1_length +=1
        else:
            w1_length = 0
        w1_continuous_rises[i] = w1_length
        
        # Window 2 
        if slopes[i] >= w2_slope_threshold:
            w2_length +=1
        else:
            w2_length = 0
            
        w2_continuous_rises[i] = w2_length
    
    # For each point, if the rise has been sustained for a sufficient period, label all points in that rise
    for i in range(len(w1_continuous_rises)):
        # Window 1
        rise_length = w1_continuous_rises[i]
        height_condition = abs(data[int(i - rise_length +1)] - data[i + 1]) >= min_height
        
        if (rise_length >= w1_duration_threshold) and height_condition:
            signals[int(i - rise_length + 1):i+2] = 1
            
        # Window 2
        rise_length = w2_continuous_rises[i]
        height_condition = abs(data[int(i - rise_length +1)] - data[i + 1]) >= min_height
        
        if (rise_length >= w2_duration_threshold) and height_condition:
            signals[int(i - rise_length + 1):i+2] = 1
    
    return signals

def label_positives_negatives(predictions, ground_truths):
    TP = TN = FP = FN = 0
    results = []
    for i in range(len(predictions)):
        prediction = predictions[i]
        ground_truth = ground_truths[i]

        if prediction == 1 and ground_truth == 1:
            results.append([prediction, 'TP'])
            TP +=1 
        elif prediction == 1 and ground_truth == 0:
            results.append([prediction, 'FP'])
            FP +=1 
        elif prediction == 0 and ground_truth == 1:
            results.append([prediction,'FN'])
            FN +=1 
        else:
            results.append([prediction, 'TN'])
            TN +=1
    return (TP,TN,FP,FN,results)

### Import ground truth

In [3]:
# ground_truth_path = '/Users/zachfogg/Desktop/newnew/stage_rise_labeled_0k-300k.csv'
ground_truth_path = "../Data/labeled_data/ground_truths/stage/stage_rise/stage_rise_labeled_0k-300k.csv"
ground_truth_signals = np.zeros(len(stage_data))
with open(ground_truth_path, 'r', newline = '') as gt_file:
    reader = csv.reader(gt_file, delimiter = ',')
    idx = 0 
    for row in reader:
        if row[0] == 'Stage':
            if row[3] == 'TP':
                ground_truth_signals[idx] = 1
            idx+=1

for i in range(1,len(ground_truth_signals)-1):
    if ground_truth_signals[i-1] == ground_truth_signals[i+1] == 0 and ground_truth_signals[i] == 1:
        ground_truth_signals[i] == 0

### Define Hyperparam bounds and training params

In [25]:
# Hyperparameter Bounds 
slope_threshold_bounds = (0,.1)
duration_threshold_bounds = (0,20)
min_height_bounds = (0,0.1)

best_params = {}

iterations = 688

num_splits = 5 # Number of train/val/test folds

train_test_split = TimeSeriesSplit(num_splits).split(stage_data[:,1])

accumulated_test_metrics = {}

accumulated_test_results = {}

overall_start = datetime.datetime.now()

accumulated_best_params = {}



In [26]:
overall_start = datetime.datetime.now()

split = 1 

for train_val_indices, test_indices in train_test_split: # Outer loop performs the "Forward-Chaining"
    # Get appropriate data subset
    X_train, y_train = np.take(stage_data[:,1], train_val_indices), np.take(ground_truth_signals, train_val_indices)
    X_test, y_test = np.take(stage_data[:,1], test_indices), np.take(ground_truth_signals, test_indices)

   
    max_fold_metric = 0
    max_result = None  
    max_acc = 0
    print("Split: ",split)
    # Optimize hyper parameters to the training data
    split_start = datetime.datetime.now()
    for iteration in range(iterations):
        
        # Random grid search
#         params['slope_threshold'] = np.random.uniform(slope_threshold_bounds[0], slope_threshold_bounds[1])
#         params['duration_threshold'] = np.random.randint(duration_threshold_bounds[0], duration_threshold_bounds[1]+1)        
        params = {}
        params['w1_slope_threshold'] = np.random.uniform(slope_threshold_bounds[0], slope_threshold_bounds[1])
        params['w1_duration_threshold'] = np.random.randint(duration_threshold_bounds[0], duration_threshold_bounds[1]+1)
        
        params['w2_slope_threshold'] = np.random.uniform(slope_threshold_bounds[0], slope_threshold_bounds[1])
#         params['w2_duration_threshold'] = np.random.randint(duration_threshold_bounds[0], duration_threshold_bounds[1]+1)
        params['w2_duration_threshold'] = 2
        
        # Detect stage rises 
        detected_signals = detect_stage_rises_two_window(X_train, params)
        
        # Results will hold: [signal, result in comparison to ground truth]
        TP,TN,FP,FN,results = label_positives_negatives(detected_signals, y_train)
        
        TPR = TP/(TP + FN)
        TNR = TN/(TN + FP)
        bal_acc = (TPR + TNR)/2 
        f1_score = (2 * TP)/((2 * TP) + FP + FN)

        acc = f1_score
        if iteration and iteration % int(iterations/10) == 0: print(" {}/{} ".format(iteration, iterations), end = "")
        if acc > max_acc: 
            max_acc = acc
            max_result = copy.deepcopy(results)
            best_params = copy.deepcopy(params)
#             print('I: {}  F1: {:.4f} BA: {:.4f}  Params: {}  TP: {} TN: {} FP: {} FN: {}'.format(iteration, f1_score, bal_acc, params, TP, TN, FP, FN))
        
    # Test best parameters on testing data 
    predicted_signals = detect_stage_rises_two_window(X_test, best_params)
    
    TP,TN,FP,FN,results = label_positives_negatives(predicted_signals, y_test)
    
    TPR = TP/(TP + FN)
    TNR = TN/(TN + FP)
    
    bal_acc = (TPR + TNR)/2 
    f1_score = (2 * TP)/((2 * TP) + FP + FN)
    
#     print("Split: {} test scores, f1: {} BA: {} Time: {}".format(split, f1_score, bal_acc, datetime.datetime.now() - split_start))
    print('\nSplit: {}  F1: {:.4f} BA: {:.4f}  Params: {}  TP: {} TN: {} FP: {} FN: {}  Time: {}'.format(split, f1_score, bal_acc, best_params, TP, TN, FP, FN, datetime.datetime.now() - split_start))
    accumulated_test_metrics[split] = [f1_score, bal_acc] # Record test metrics of each split
    accumulated_test_results[split] = copy.deepcopy(results) # Record test results (FP,FN,TP,TN for each datapoint) for each split
    accumulated_best_params[split] = copy.deepcopy(best_params) # Record params uses in testing for each split
    
    split+=1

mean_f1 = 0
mean_ba = 0 

for key in accumulated_test_metrics:
    metrics = accumulated_test_metrics[key]
    mean_f1+=metrics[0]
    mean_ba+=metrics[1]

print("Mean Test F1: ", mean_f1/len(accumulated_test_metrics))
print("Mean Test BA: ", mean_ba/len(accumulated_test_metrics))

print(datetime.datetime.now() - overall_start)

# Pickle params from last fold
with open('./Past_Experiments/stage_algo_exp_results/algo3_best_params.pkl', 'wb') as pck_file:
    pickle.dump(accumulated_best_params[num_splits], pck_file)
    pck_file.close()
    
# Pickle results from last fold 
with open('./Past_Experiments/stage_algo_exp_results/algo3_test_results.pkl', 'wb') as pck_file:
    pickle.dump(accumulated_test_results, pck_file)
    pck_file.close()

# Pickle results from last fold 
with open('./Past_Experiments/stage_algo_exp_results/algo3_test_metrics.pkl', 'wb') as pck_file:
    pickle.dump(accumulated_test_metrics, pck_file)
    pck_file.close()
    
# Algo 5 is when we don't allow window of size < 5

Split:  1
 68/688  136/688  204/688  272/688  340/688  408/688  476/688  544/688  612/688  680/688 
Split: 1  F1: 0.9035 BA: 0.9887  Params: {'w1_slope_threshold': 0.09973508451591305, 'w1_duration_threshold': 9, 'w2_slope_threshold': 0.0027122093054720354, 'w2_duration_threshold': 2}  TP: 1123 TN: 36907 FP: 221 FN: 19  Time: 0:01:07.738097
Split:  2
 68/688  136/688  204/688  272/688  340/688  408/688  476/688  544/688  612/688  680/688 
Split: 2  F1: 0.9034 BA: 0.9666  Params: {'w1_slope_threshold': 0.002071041670009577, 'w1_duration_threshold': 5, 'w2_slope_threshold': 0.02960505743036176, 'w2_duration_threshold': 2}  TP: 888 TN: 37192 FP: 130 FN: 60  Time: 0:02:15.969185
Split:  3
 68/688  136/688  204/688  272/688  340/688  408/688  476/688  544/688  612/688  680/688 
Split: 3  F1: 0.9033 BA: 0.9858  Params: {'w1_slope_threshold': 0.002692193427462497, 'w1_duration_threshold': 3, 'w2_slope_threshold': 0.03481963497731524, 'w2_duration_threshold': 2}  TP: 1004 TN: 37051 FP: 191 FN:

In [6]:
"""
Alg 4: {'w1_slope_threshold': 0.08070955276214427, 'w1_duration_threshold': 7, 'w2_slope_threshold': 0.0028358577231109906, 'w2_duration_threshold': 3, 'min_height': 0.029811829887152044}
"""
"""
Algo 3: 
{'w1_slope_threshold': 0.0025548972795833794, 'w1_duration_threshold': 4, 'w2_slope_threshold': 0.024178567948212826, 'w2_duration_threshold': 12, 'min_height': 0.0007260027409820013}

Algo 2: Mean Test F1:  0.874357384629945
Mean Test BA:  0.9232266661025357
"""
params = {'w1_slope_threshold': 0.0027973820056520827, 'w1_duration_threshold': 4, 'w2_slope_threshold': 0.018505985197238586, 'w2_duration_threshold': 2}
predicted_signals = detect_stage_rises_two_window(stage_data[:,1], params)
    
TP,TN,FP,FN,results = label_positives_negatives(predicted_signals, ground_truth_signals)

print((2 * TP)/((2 * TP) + FP + FN))
print("FP:,", FP, " FN: ", FN)

0.9398932112890923
FP:, 437  FN:  351


In [23]:
x = np.array([[1,2,3,4,-1],[1,2,3,4,-1]])
y = np.argmax(x)
print(y)

3


In [None]:
params = {'w1_slope_threshold': 0.0027973820056520827, 'w1_duration_threshold': 4, 'w2_slope_threshold': 0.018505985197238586, 'w2_duration_threshold': 2}
with open('./Hyperparameters/Stage/detect_stage_rises_params.pkl', 'wb') as pck_file:
    pickle.dump(params, pck_file)
    

In [23]:
# Print out signals to reconsider labels
# Get stage rises
# fDOM_cand_params = {'prom' : [4,None],
#                     'width': [None, None],   These are the params for fDOM pp cand
#                     'wlen' : 200,
#                     'dist' : 1,
#                     'rel_h': .6}

fDOM_cand_params = {'prom' : [4,None],
                    'width': [None, None],
                    'wlen' : 200,
                    'dist' : 1,
                    'rel_h': .6}

# turb_cand_params = {'prom' : [6,None],       These are the params that were used to label turb cand 0-100k
#                     'width': [None, None],
#                     'wlen' : 200,
#                     'dist' : 1,
#                     'rel_h': .6}

turb_cand_params = {'prom' : [6,None],
                    'width': [None, None],    # was 2 for skyrocketing
                    'wlen' : 200,
                    'dist' : 1,
                    'rel_h': .6}   # was .5 for skyrocketing 

# Get fDOM and turb candiate peaks
fDOM_peaks, fDOM_props = get_candidates(fDOM_raw_data, fDOM_cand_params)
turb_peaks, turb_props = get_candidates(turb_data, turb_cand_params)

# Remove peaks that occur during a flat plateau 
turb_flat_plat = detect_flat_plat(turb_data, 100, 40)
turb_flat_plat_indxs = []
for i in range(turb_flat_plat.shape[0]):
    if turb_flat_plat[i] == 1:
        turb_flat_plat_indxs.append(i)

take_indices = []
for i,peak in enumerate(turb_peaks):
    if peak not in turb_flat_plat_indxs:
        take_indices.append(i)

turb_peaks = np.take(turb_peaks, take_indices)
for key in turb_props:
    turb_props[key] = np.take(turb_props[key], take_indices)

# Iterate through peaks and turn into short 3 point "events" by flagging the data point to either side of a peak
# fDOM_events = []
# for peak in fDOM_peaks:
#             fDOM_events.append(np.array((fDOM_raw_data[peak-1], fDOM_raw_data[peak], fDOM_raw_data[peak+1])))
fDOM_events = []
fDOM_lb = []
fDOM_rb = []

for i,peak in enumerate(fDOM_peaks):
            fDOM_events.append(np.array((fDOM_raw_data[peak])))
#             fDOM_lb.append(fDOM_raw_data[fDOM_props['left_bases'][i],0])
#             fDOM_rb.append(fDOM_raw_data[fDOM_props['right_bases'][i],0])
            fDOM_lb.append(fDOM_raw_data[math.floor(fDOM_props['left_ips'][i]),0])
            fDOM_rb.append(fDOM_raw_data[math.ceil(fDOM_props['right_ips'][i]),0])
            
fDOM_lb = list(set(fDOM_lb))
fDOM_lb.sort()
fDOM_rb = list(set(fDOM_rb))
fDOM_rb.sort()

turb_events = []
turb_lb = []
turb_rb = []
for i,peak in enumerate(turb_peaks):
            turb_events.append(np.array((turb_data[peak])))
            turb_lb.append(turb_data[math.floor(turb_props['left_ips'][i]),0])
            turb_rb.append(turb_data[math.ceil(turb_props['right_ips'][i]),0])
            
turb_lb = list(set(turb_lb))
turb_lb.sort()
turb_rb = list(set(turb_rb))
turb_rb.sort()            

fDOM_merged = dp.merge_data(fDOM_raw_data, fDOM_events, 'f_opp', '')
turb_merged = dp.merge_data(turb_data, turb_events, 't_opp', '')

fDOM_merged = dp.merge_additional_data(fDOM_merged, fDOM_lb, 'left_base')
fDOM_merged = dp.merge_additional_data(fDOM_merged, fDOM_rb, 'right_base')

turb_merged = dp.merge_additional_data(turb_merged, turb_lb, 'left_base')
turb_merged = dp.merge_additional_data(turb_merged, turb_rb, 'right_base')


stage_data_merged = []
for i in range(len(results)):
    stage_data_merged.append([stage_data[i,0],stage_data[i,1],results[i][1]])

dm.write_data_to_trainset(fDOM_merged,
                          stage_data_merged,
                          turb_merged,
                          '../Data/plot/stage_rise_algo_0k-100k.csv',
                          True,
                          True,
                          0,
                          100000)

In [36]:
print(accumulated_best_params[num_splits][0])

{'w1_slope_threshold': 0.08638011677049756, 'w1_duration_threshold': 10, 'w2_slope_threshold': 0.0034103401817676352, 'w2_duration_threshold': 2}
