# Augmenting time-series data
This notebook augments the existing time-series data in order to create more of it.

In [None]:
# Imports
import sys
sys.path.insert(1,'../')
import Tools.data_processing as dp
import Tools.data_movement as dm 
import pandas as pd
from datetime import datetime
from datetime import timedelta

## Helper Functions
The following functions provide useful tools for the augmentation process

In [None]:
# Helper functions
def make_small_change():
    """
    Placeholder for the step that alters a range of existing datapoints.

    Not implemented yet: the function currently takes no arguments and
    does no work.

    Planned param: the range of datapoints by index
    Planned return: the new data range to be appended to the data

    returns: None until the augmentation logic is written
    """
    return None


def next_time_entry(current_entry: float, step_minutes: float = 15) -> float:
    """
    This function returns the next time entry in julian time

    current_entry: a julian time float
    step_minutes: spacing between consecutive entries, in minutes. Defaults
        to 15, the step this notebook has always used, so existing calls
        keep their behavior.

    return: julian time `step_minutes` minutes after `current_entry`
    """

    # convert julian to a datetime-like value so the offset can be added with
    # timedelta (the conversion itself is delegated to Tools.data_processing)
    current_datetime = dp.julian_to_datetime(current_entry)

    # step forward by the sampling interval
    next_datetime = current_datetime + timedelta(minutes=step_minutes)

    # convert back to julian time for the caller
    return dp.datetime_to_julian(next_datetime)

## Loading in data
The knowledge-based approach uses the data in `Data/converted_data/julian_format/`, so that is where the data augmentation will go.

In [None]:
# Read in raw data: one julian-format CSV per measurement (fDOM, stage, turbidity).
# NOTE(review): judging by the file names, the stage file ends on 1.1.19 while
# the fDOM and turbidity files run to 9.4.2020 -- confirm the ranges.
fDOM_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv')
stage_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv')
turb_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv')

# align stage to fDOM; the result replaces stage_data for the rest of the notebook
# (presumably this makes the two series line up row-for-row -- verify in
# Tools.data_processing)
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

# read in labeled fDOM (ground-truth peak labels, julian time)
fDOM_labeled = pd.read_csv("../Data/labeled_data/ground_truths/fDOM/fDOM_all_julian_0k-300k.csv")

# read in labeled turbidity (ground-truth peak labels, julian time)
turb_labeled = pd.read_csv("../Data/labeled_data/ground_truths/turb/turb_pp/julian_time/turb_pp_0k-300k_labeled.csv")

# New data folder: directory intended for the augmented output
AUGMENT_DATA_PATH = '../Data/augmented_data/julian_format/'

In [None]:
# Wrap each loaded series in a pandas DataFrame with named columns so that
# rows can be selected by column label instead of by position.
def _to_timeseries_frame(series):
    """Return `series` as a DataFrame with columns "timestamp" and "value"."""
    frame = pd.DataFrame(series)
    frame.columns = ["timestamp", "value"]
    return frame


fDOM_raw = _to_timeseries_frame(fDOM_data)
turb_raw = _to_timeseries_frame(turb_data)
stage_raw = _to_timeseries_frame(stage_data)

In [None]:
# Visualize data: find where one labeled peak (row 88 of the labeled fDOM
# data) sits in the raw fDOM series by matching on its timestamp.
timestamp_of_peak = fDOM_labeled.loc[88, 'timestamp_of_peak']
timestamp_matches = fDOM_raw['timestamp'] == timestamp_of_peak
index_df = fDOM_raw[timestamp_matches]

# Row labels of the raw entries with exactly this timestamp; nothing is
# printed when the labeled timestamp has no counterpart in the raw data.
matching_rows = index_df.index.tolist()
if matching_rows:
    index_of_peak = matching_rows[0]

    print(index_of_peak)
    print(timestamp_of_peak)

## Augmenting Data
We will augment data for each type of peak, and for each measurement.

Starting with fDOM:
1. PLP (plummeting peak)
2. PP (phantom peak)
3. SKP (skyrocketing peak)

TODO: augment more peak types when they are labeled

With turbidity:
1. PP

With stage:
The augmentation approach for stage data has not been decided yet.


In [None]:
# create dataframe of anomaly peaks (every label except 'NAP') from labeled fDOM data
anom_peaks = fDOM_labeled[fDOM_labeled['label_of_peak'] != 'NAP']

# reset the index as we removed many values; from here on the loop index `i`
# is a position inside anom_peaks and is NOT a valid row label of fDOM_labeled
anom_peaks = anom_peaks.reset_index()

# create a dataframe to read in augmented fDOM data into
augmented_fDOM_raw = pd.DataFrame()

# timestamps of anomaly peaks that have no exact match in the raw fDOM data
missed_fDOM_peaks = []

# number of raw data points taken on each side of a peak when nothing overlaps
window = 5
last_peak_pos = anom_peaks.shape[0] - 1

# iterate over each peak
for i, row in anom_peaks.iterrows():
    # Start from the full window and shrink it wherever it would run into a
    # neighbouring peak's window.
    # NOTE(review): this assumes anom_peaks is ordered by ascending
    # idx_of_peak -- confirm against the labeled file.
    prev_dist = window
    next_dist = window
    peak_idx = row['idx_of_peak']

    if i < last_peak_pos:
        # the next peak also reaches back `window` points, so its window
        # starts at next_idx - window
        next_window_start = anom_peaks.loc[i + 1, 'idx_of_peak'] - window
        if peak_idx + window >= next_window_start:
            # stop one point short of the neighbour's window; never negative
            next_dist = max(0, next_window_start - peak_idx - 1)
    # TODO: for the last peak, ensure there are still `window` raw data points
    # after it to read

    if i > 0:
        # the previous peak reaches forward `window` points as well
        prev_window_end = anom_peaks.loc[i - 1, 'idx_of_peak'] + window
        if peak_idx - window <= prev_window_end:
            prev_dist = max(0, peak_idx - prev_window_end - 1)
    # TODO: for the first peak, check that there are `window` raw data points
    # before it

    # Get raw fDOM data points.
    # The timestamp must come from anom_peaks: `i` indexes the filtered,
    # re-indexed frame, so looking it up in fDOM_labeled would fetch an
    # unrelated (possibly NAP) row.
    timestamp_of_peak = anom_peaks.loc[i, 'timestamp_of_peak']

    index_df = fDOM_raw[fDOM_raw['timestamp'] == timestamp_of_peak]
    matching_rows = index_df.index.tolist()

    if matching_rows:
        index_of_peak = matching_rows[0]

        # use this timestamp to make a dataframe of raw stuff
        # iterate over fDOM raw

        # get previous "prev_dist" points
        #prev_points = pd.DataFrame(fDOM_raw)

        # get next "next_dist" points
        #next_points = pd.DataFrame()

        # for each peak, get previous and next 5 datapoints from raw and stage data
            # this works because we aligned the data

        # make changes to the data
        #changed_data = make_small_change()

        # append these to a new file, in format of the raw fDOM file
        #augmented_fDOM_raw.append(changed_data)

        # append these to a new file, in format of the stage/turb/fDOM file
        #augmented_stage_raw.append(changed_data)

        # append these to a new file, in format of labeled data (so purely the peak, mark the index of the peak when starting)
        #augmented_fDOM_labeled.append(changed_data)

    # TODO add this missed data into the overall data somehow
    else:
        # no raw entry has this exact timestamp; remember it as missed
        missed_fDOM_peaks.append(timestamp_of_peak)

print(len(missed_fDOM_peaks))