# Augmenting time-series data
In this file, the data is augmented in order to create more of it.

In [2]:
# Imports
import sys
sys.path.insert(1,'../')
import Tools.data_processing as dp
import Tools.data_movement as dm 
import pandas as pd
from datetime import datetime
from datetime import timedelta
import copy

## Helper Functions
The following functions provide useful tools for the augmentation process

In [1]:
# Helper functions
def create_data(fdom_range: pd.DataFrame, stage_range: pd.DataFrame, turb_range: pd.DataFrame, peak_index: int, datatype: str) -> tuple(pd.DataFrame, pd.DataFrame, pd.DataFrame, int):
    """
    Makes changes to the current datapoints, by modifying peaks, and adding in values if needed
    
    data: the range of datapoints by index in a dataframe

    peak_index: the index of the actual peak we are modifying

    datatype: "fdom" or "turb"

    returns: the new data range to be appended to the data
    """
    # make a copy of the modified data
    new_fdom = copy.deepcopy(fdom_range)
    new_stage = copy.deepcopy(stage_range)
    new_turb = copy.deepcopy(turb_range)

    # peak index can change when we add in x data
    new_fdom_peak_index = peak_index

    if datatype == "fdom":

        # augment fDOM

        # add entries to stage as needed

        # add entries to turb as needed
        pass

    else:

        # augment turb

        # add entries to stage as needed

        # add entries to fdom as needed
        pass

    return new_fdom, new_stage, new_turb, new_fdom_peak_index


def next_time_entry(current_entry: float) -> float:
    """
    This function returns the next time entry in julian time
    
    current_entry: a julina time float

    return: julian time + 15 minutes from past julian time
    """

    # convert julian to datetime
    date_time_init = dp.julian_to_datetime(current_entry)

    # find next date time (add 15 minutes)
    next_entry = date_time_init + timedelta(minutes=15)

    # convert date time to julian time
    final_julian_time = dp.datetime_to_julian(next_entry)

    # return julian time
    return final_julian_time


def reindex_augmented_data(data: pd.DataFrame, datatype: str) -> pd.DataFrame:
    """
    Reindex the augmented data so there are no overlaps

    data: the data to reindex
    datatype: fdom, turb, or stage

    returns: reindexed data
    """
    pass

NameError: name 'pd' is not defined

## Loading in data
The knowledge-based approach uses the data in `Data/converted_data/julian_format/`, so that is where the data augmentation will go.

In [4]:
# Read in raw data
fDOM_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv')
stage_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv')
turb_data = dm.read_in_preprocessed_timeseries('../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv')

# align stage to fDOM
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

# read in labeled fDOM
fDOM_labeled = pd.read_csv("../Data/labeled_data/ground_truths/fDOM/fDOM_all_julian_0k-300k.csv")

turb_labeled = pd.read_csv("../Data/labeled_data/ground_truths/turb/turb_pp/julian_time/turb_pp_0k-300k_labeled.csv")

# New data folder:
AUGMENT_DATA_PATH = '../Data/augmented_data/julian_format/'

In [6]:
# Convert data into pandas dataframes for better indexing:
fDOM_raw = pd.DataFrame(fDOM_data)
fDOM_raw.columns = ["timestamp", "value"]

turb_raw = pd.DataFrame(turb_data)
turb_raw.columns = ["timestamp", "value"]

stage_raw = pd.DataFrame(stage_data)
stage_raw.columns = ["timestamp", "value"]

In [7]:
# Visualize data
# timestamp_of_peak = fDOM_labeled.loc[88, 'timestamp_of_peak']
# index_df = fDOM_raw[fDOM_raw['timestamp'] == timestamp_of_peak]

# if len(index_df.index.to_list()) != 0:
#     index_of_peak = index_df.index.tolist()[0]

#     print(index_of_peak)
#     print(timestamp_of_peak)

9406
2456140.479166667


## Augmenting Data
We will augment data for each type of peak, and for each measurement.

Starting with fDOM:
1. PLP (plummeting peak)
2. PP (phantom peak)
3. SKP (skyrocketing peak)

TODO: augment more peak types when they are labeled

With turbidity:
1. PP

We start by creating our data frames to augment.

In [24]:
""" Helpful constants """
TIME_RANGE_INIT = 5 # the base distance of points to add between the peaks

""" Dataframes to be used by augmenter """

# labeled fDOM peaks
fdom_anon_peaks = fDOM_labeled[fDOM_labeled['label_of_peak'] != 'NAP']
fdom_anon_peaks = fdom_anon_peaks.reset_index() # reset index as values were removed

# labeled turb peaks
turb_anon_peaks = turb_labeled[turb_labeled['label_of_peak'] != "NPP"]
turb_anon_peaks = turb_anon_peaks.reset_index()

# new dataframes for augmented labeled/raw fDOM
augmented_fDOM_raw = pd.DataFrame(columns=['timestamp', 'value'])
augmented_fDOM_labeled = pd.DataFrame(columns=['timestamp_of_peak', 'value_of_peak', 'label_of_peak', 'idx_of_peak'])

# dataframe for augmented stage 
augmented_stage_raw = pd.DataFrame(columns=['timestamp', 'value'])

# dataframes for augmented raw/labeled turbidity
augmented_turb_raw = pd.DataFrame(columns=['timestamp', 'value'])
augmented_turb_labeled = pd.DataFrame(columns=['timestamp_of_peak', 'value_of_peak', 'label_of_peak', 'idx_of_peak'])

# a list of peaks that don't align with the fDOM raw file that was aligned with stage
# i believe its just peaks that don't align with stage in general for whatever reason
missed_fDOM_peaks = []

### Augmenting fDOM
The next codeblocks augment fDOM data.

In [26]:
# HACK: there are overlaps in time ranges, might not be an issue but it could be an issue
# TODO: turn this into a callable function when it is finished


# iterate over each peak
for i, row in fdom_anon_peaks.iterrows():
    # check to see if any overlap occurs between peaks
    prev_dist = TIME_RANGE_INIT
    next_dist = TIME_RANGE_INIT

    if i == 0:
        # we are at the first peak, check to see if there are 5 data points behind
        # TODO: implement this
        # note that its actually not needed
        pass 

    elif i + 1 < fdom_anon_peaks.shape[0]:
        # anywhere else in the middle, check for overlap
        # FIXME: not currently checking for overlaps, we might not need to tho

        # check next 5
        if row['idx_of_peak'] + TIME_RANGE_INIT >= fdom_anon_peaks.loc[i + 1,'idx_of_peak'] - TIME_RANGE_INIT: # -5 becase we go back 5 peaks too
            # change next_dist to whatever it needs to be 
            curr_dist_to_peak = abs(row['idx_of_peak'] - fdom_anon_peaks.loc[i + 1, 'idx_of_peak'] - TIME_RANGE_INIT)
            next_dist = curr_dist_to_peak - 1

        # check past 5
        if row['idx_of_peak'] - TIME_RANGE_INIT <= fdom_anon_peaks.loc[i - 1,'idx_of_peak'] + TIME_RANGE_INIT:
            curr_dist_to_peak = abs(row['idx_of_peak'] - fdom_anon_peaks.loc[i - 1, 'idx_of_peak'] - TIME_RANGE_INIT)
            prev_dist = curr_dist_to_peak - 1

    else:
        # if no next peak, we are at the last peak, ensure there are still 5 data points to read
        # TODO: implement this
        pass


    """Get raw fDOM data points"""
    timestamp_of_peak = fDOM_labeled.loc[i, 'timestamp_of_peak']

    # get index dataframes of each type
    # HACK: there has got to be a better way to do this
    fdom_index_df = fDOM_raw[fDOM_raw['timestamp'] == timestamp_of_peak]
    stage_index_df = stage_raw[stage_raw['timestamp'] == timestamp_of_peak]
    turb_index_df = turb_raw[turb_raw['timestamp'] == timestamp_of_peak]

    if len(fdom_index_df.index.to_list()) != 0:
        # get indices of each data type from index df's 
        index_of_peak = fdom_index_df.index.tolist()[0]
        stage_index = stage_index_df.index.tolist()[0]
        turb_index = turb_index_df.index.tolist()[0]

        # use this timestamp to make a dataframe of raw stuff
        # get data from fDOM_raw file
        fDOM_raw_time_range = pd.DataFrame(fDOM_raw.iloc[index_of_peak - prev_dist:index_of_peak + next_dist])

        # get stage data range
        stage_time_range = pd.DataFrame(stage_raw.iloc[stage_index - prev_dist:stage_index + next_dist])

        # get turbidity data range
        turb_time_range = pd.DataFrame(turb_raw.iloc[turb_index - prev_dist:turb_index + next_dist])

        # get augmented data
        fDOM_augmented, new_stage, new_turb, new_peak_index = create_data(fDOM_raw_time_range, stage_time_range, turb_time_range, index_of_peak, "fdom")

        # append these to a new file, in format of the raw fDOM file
        augmented_fDOM_labeled.append(fDOM_augmented)

        # append these to a new file, in format of the stage/turb/fDOM file
        augmented_stage_raw.append(new_stage)

        # append these to a new file, in format of labeled data (so purely the peak, mark the index of the peak when starting)
        #augmented_fDOM_labeled.append(changed_data)
    
    # TODO add this missed data into the overall data somehow
    else:
        # we missed some data points, append them to the missed data dataframe
        missed_fDOM_peaks.append(timestamp_of_peak)

print(fDOM_raw_time_range)
print(stage_time_range)
print(turb_time_range)

         timestamp      value
9665  2.456143e+06  33.203392
9666  2.456143e+06  43.953086
9667  2.456143e+06  40.934235
9668  2.456143e+06  40.071279
9669  2.456143e+06  44.637700
9670  2.456143e+06  41.017220
9671  2.456143e+06  47.057776
9672  2.456143e+06  41.434796
9673  2.456143e+06  49.846897
9674  2.456143e+06  44.133289
         timestamp     value
9665  2.456143e+06  0.081429
9666  2.456143e+06  0.081000
9667  2.456143e+06  0.080571
9668  2.456143e+06  0.080143
9669  2.456143e+06  0.079714
9670  2.456143e+06  0.079286
9671  2.456143e+06  0.078857
9672  2.456143e+06  0.078429
9673  2.456143e+06  0.078000
9674  2.456143e+06  0.078148
         timestamp     value
9665  2.456143e+06  0.851615
9666  2.456143e+06  0.487554
9667  2.456143e+06  0.525225
9668  2.456143e+06  0.684105
9669  2.456143e+06  0.801421
9670  2.456143e+06  0.521043
9671  2.456143e+06  0.910232
9672  2.456143e+06  0.558703
9673  2.456143e+06  0.659151
9674  2.456143e+06  0.650588


### Augment turbidity data
The following code blocks augment turbidity data

In [None]:
""" Augment turbidity data by calling previously written function """

## Move augmented data into csv files
The following codeblock creates csv files for the augmented data.

In [None]:
""" Augmented Data Paths """


In [None]:
# TODO: Implement this function
def write_augmented_data_to_csv():
    # call to_csv for each dataframe
    pass