# Augmenting time-series data
In this file, the data is augmented in order to create more of it.

In [1]:
# Imports
import sys
sys.path.insert(1,'../')
import Tools.data_processing as dp
import Tools.data_movement as dm 
import pandas as pd
from datetime import datetime
from datetime import timedelta
import copy
import random

## Helper Functions
The following functions provide useful tools for the augmentation process

In [2]:
def next_time_entry(current_entry: float) -> float:
    """
    This function returns the next time entry in julian time

    current_entry: a julina time float

    return: julian time + 15 minutes from past julian time
    """

    # convert julian to datetime
    date_time_init = dp.julian_to_datetime(current_entry)

    # find next date time (add 15 minutes)
    next_entry = date_time_init + timedelta(minutes=15)

    # convert date time to julian time
    final_julian_time = dp.datetime_to_julian(next_entry)

    # return julian time
    return final_julian_time


def reindex_augmented_data(data: pd.DataFrame, datatype: str) -> pd.DataFrame:
    """
    Reindex the augmented data so there are no overlaps

    data: the data to reindex
    datatype: fdom, turb, or stage

    returns: reindexed data
    """
    pass


## Loading in data
The knowledge-based approach uses the data in `Data/converted_data/julian_format/`, so that is where the data augmentation will go.

In [3]:
# Read in raw data
fDOM_data = dm.read_in_preprocessed_timeseries(
    "../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv"
)
stage_data = dm.read_in_preprocessed_timeseries(
    "../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv"
)
turb_data = dm.read_in_preprocessed_timeseries(
    "../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv"
)

# align stage to fDOM
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

# read in labeled fDOM
fDOM_labeled = pd.read_csv(
    "../Data/labeled_data/ground_truths/fDOM/fDOM_all_julian_0k-300k.csv"
)

turb_labeled = pd.read_csv(
    "../Data/labeled_data/ground_truths/turb/turb_pp/julian_time/turb_pp_0k-300k_labeled.csv"
)

# New data folder:
AUGMENT_DATA_PATH = "../Data/augmented_data/julian_format/"


In [4]:
# Convert data into pandas dataframes for better indexing:
fDOM_raw = pd.DataFrame(fDOM_data)
fDOM_raw.columns = ["timestamp", "value"]

turb_raw = pd.DataFrame(turb_data)
turb_raw.columns = ["timestamp", "value"]

stage_raw = pd.DataFrame(stage_data)
stage_raw.columns = ["timestamp", "value"]

In [None]:
# Visualize data
# timestamp_of_peak = fDOM_labeled.loc[88, 'timestamp_of_peak']
# index_df = fDOM_raw[fDOM_raw['timestamp'] == timestamp_of_peak]

# if len(index_df.index.to_list()) != 0:
#     index_of_peak = index_df.index.tolist()[0]

#     print(index_of_peak)
#     print(timestamp_of_peak)

## Augmenting Data
We will augment data for each type of peak, and for each measurement.

Starting with fDOM:
1. PLP (plummeting peak)
2. PP (phantom peak)
3. SKP (skyrocketing peak)

TODO: augment more peak types when they are labeled

With turbidity:
1. PP

We start by creating our data frames to augment.

In [5]:
""" Helpful constants """
TIME_RANGE_INIT = 5  # the base distance of points to add between the peaks

""" Dataframes to be used by augmenter """

# labeled fDOM peaks
fdom_anon_peaks = fDOM_labeled[fDOM_labeled["label_of_peak"] != "NAP"]
fdom_anon_peaks = fdom_anon_peaks.reset_index()  # reset index as values were removed

# labeled turb peaks
turb_anon_peaks = turb_labeled[turb_labeled["label_of_peak"] != "NPP"]
turb_anon_peaks = turb_anon_peaks.reset_index()

# new dataframes for augmented labeled/raw fDOM
augmented_fDOM_raw = pd.DataFrame(columns=["timestamp", "value"])
augmented_fDOM_labeled = pd.DataFrame(
    columns=["timestamp_of_peak", "value_of_peak", "label_of_peak", "idx_of_peak"]
)

# dataframe for augmented stage
augmented_stage_raw = pd.DataFrame(columns=["timestamp", "value"])

# dataframes for augmented raw/labeled turbidity
augmented_turb_raw = pd.DataFrame(columns=["timestamp", "value"])
augmented_turb_labeled = pd.DataFrame(
    columns=["timestamp_of_peak", "value_of_peak", "label_of_peak", "idx_of_peak"]
)

# variable to keep the last entry in the dataframe for stage
# defaults to the last entry that was in fdom/turb raw csv files, in julian format
prev_added_entry = 2459096.9583333335

# a list of peaks that don't align with the fDOM raw file that was aligned with stage
# i believe its just peaks that don't align with stage in general for whatever reason
missed_fDOM_peaks = []


In [17]:
def get_last_augment_index(dataframe) -> int:
    """
    Collects the last index of the augmented time series
    """
    return dataframe.shape[0]


### Augmenting fDOM
The next codeblocks augment fDOM data.

In [20]:
# HACK: there are overlaps in time ranges, might not be an issue but it could be an issue
# TODO: turn this into a callable function when it is finished


# iterate over each peak
for i, row in fdom_anon_peaks.iterrows():
    # check to see if any overlap occurs between peaks
    prev_dist = TIME_RANGE_INIT
    next_dist = TIME_RANGE_INIT

    if i == 0:
        # we are at the first peak, check to see if there are 5 data points behind
        # TODO: implement this
        # note that its actually not needed
        pass

    elif i + 1 < fdom_anon_peaks.shape[0]:
        # anywhere else in the middle, check for overlap
        # FIXME: not currently checking for overlaps, we might not need to tho

        # check next 5
        if (
            row["idx_of_peak"] + TIME_RANGE_INIT
            >= fdom_anon_peaks.loc[i + 1, "idx_of_peak"] - TIME_RANGE_INIT
        ):  # -5 becase we go back 5 peaks too
            # change next_dist to whatever it needs to be
            curr_dist_to_peak = abs(
                row["idx_of_peak"]
                - fdom_anon_peaks.loc[i + 1, "idx_of_peak"]
                - TIME_RANGE_INIT
            )
            next_dist = curr_dist_to_peak - 1

        # check past 5
        if (
            row["idx_of_peak"] - TIME_RANGE_INIT
            <= fdom_anon_peaks.loc[i - 1, "idx_of_peak"] + TIME_RANGE_INIT
        ):
            curr_dist_to_peak = abs(
                row["idx_of_peak"]
                - fdom_anon_peaks.loc[i - 1, "idx_of_peak"]
                - TIME_RANGE_INIT
            )
            prev_dist = curr_dist_to_peak - 1

    else:
        # if no next peak, we are at the last peak, ensure there are still 5 data points to read
        # TODO: implement this
        pass

    """Get raw fDOM data points"""
    timestamp_of_peak = fDOM_labeled.loc[i, "timestamp_of_peak"]
    label_of_peak = fDOM_labeled.loc[i, "label_of_peak"]

    # get index dataframes of each type
    # HACK: there has got to be a better way to do this
    fdom_index_df = fDOM_raw[fDOM_raw["timestamp"] == timestamp_of_peak]
    stage_index_df = stage_raw[stage_raw["timestamp"] == timestamp_of_peak]
    turb_index_df = turb_raw[turb_raw["timestamp"] == timestamp_of_peak]

    if len(fdom_index_df.index.to_list()) != 0:
        # get indices of each data type from index df's
        index_of_peak = fdom_index_df.index.tolist()[0]
        stage_index = stage_index_df.index.tolist()[0]
        turb_index = turb_index_df.index.tolist()[0]

        # use this timestamp to make a dataframe of raw stuff
        # get data from fDOM_raw file
        fDOM_raw_time_range = pd.DataFrame(
            fDOM_raw.iloc[index_of_peak - prev_dist : index_of_peak + next_dist]
        )

        # get stage data range
        stage_time_range = pd.DataFrame(
            stage_raw.iloc[stage_index - prev_dist : stage_index + next_dist]
        )

        # get turbidity data range
        turb_time_range = pd.DataFrame(
            turb_raw.iloc[turb_index - prev_dist : turb_index + next_dist]
        )

        # make a copy of the modified data
        new_fdom_raw = copy.deepcopy(fDOM_raw_time_range)
        new_stage = copy.deepcopy(stage_time_range)
        new_turb_raw = copy.deepcopy(turb_time_range)

        # peak index can change when we add in x data
        new_fdom_peak_index = -1
        new_peak_timestamp = -1

        # generate a random number to multiply the peak by, +- 0.1
        # set seed
        random.seed()
        random_val = random.uniform(-0.1, 0.1)

        new_peak_val = new_fdom_raw.loc[index_of_peak, "value"] * (1 + random_val)
        new_fdom_raw.loc[index_of_peak, "value"] = new_peak_val

        # insert necessary values into turb and stage

        # get the next possible timestamp
        new_time_entry = next_time_entry(prev_added_entry)

        # update all timestamps for augmented data
        for i, row in new_fdom_raw.iterrows():
            # if timestamps equal, we have the relative peak
            if new_fdom_raw.loc[i, "timestamp"] == timestamp_of_peak:
                # register index here
                new_fdom_peak_index = get_last_augment_index(augmented_fDOM_raw)
                new_peak_timestamp = new_time_entry

            # update timestamps
            new_fdom_raw.loc[i, "timestamp"] = new_time_entry
            new_stage.loc[i, "timestamp"] = new_time_entry
            new_turb_raw.loc[i, "timestamp"] = new_time_entry

            # get next time stamp
            new_time_entry = next_time_entry(new_time_entry)

        # add entries into raw fDOM
        augmented_fDOM_raw = pd.concat(
            [augmented_fDOM_raw, new_fdom_raw], ignore_index=True
        )

        # add entries to labeled fDOM
        augmented_fDOM_labeled = pd.concat(
            [
                augmented_fDOM_labeled,
                pd.DataFrame(
                    [
                        [
                            new_peak_timestamp,
                            new_peak_val,
                            label_of_peak,
                            new_fdom_peak_index,
                        ]
                    ]
                ),
            ],
            ignore_index=True,
        )

        # add entries to stage
        augmented_stage_raw = pd.concat(
            [augmented_stage_raw, new_stage], ignore_index=True
        )

        # add entries to turb
        augmented_turb_raw = pd.concat(
            [augmented_turb_raw, new_turb_raw], ignore_index=True
        )

        # update prev time entry
        prev_added_entry = new_time_entry

    # TODO add this missed data into the overall data somehow
    else:
        # we missed some data points, append them to the missed data dataframe
        missed_fDOM_peaks.append(timestamp_of_peak)


In [24]:
# Testing visualizing augmented data

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

print(augmented_fDOM_raw)
print(augmented_stage_raw)
print(augmented_turb_raw)

      timestamp    value 
0     2459096.969  26.345
1     2459096.979  26.606
2      2459096.99  26.867
3       2459097.0  26.935
4      2459097.01  26.914
5     2459097.021  27.507
6     2459097.031  26.851
7     2459097.042  26.711
8     2459097.052  26.634
9     2459097.062  26.644
10    2459096.969  26.345
11    2459096.979  26.606
12     2459096.99  26.867
13      2459097.0  26.935
14     2459097.01  26.914
15    2459097.021  24.525
16    2459097.031  26.851
17    2459097.042  26.711
18    2459097.052  26.634
19    2459097.062  26.644
20    2459096.969  26.345
21    2459096.979  26.606
22     2459096.99  26.867
23      2459097.0  26.935
24     2459097.01  26.914
25    2459097.021  27.153
26    2459097.031  26.851
27    2459097.042  26.711
28    2459097.052  26.634
29    2459097.062  26.644
30    2459096.969  26.345
31    2459096.979  26.606
32     2459096.99  26.867
33      2459097.0  26.935
34     2459097.01  26.914
35    2459097.021  25.315
36    2459097.031  26.851
37    245909

### Augment turbidity data
The following code blocks augment turbidity data

In [None]:
""" Augment turbidity data by calling previously written function """

## Move augmented data into csv files
The following codeblock creates csv files for the augmented data.

In [None]:
""" Augmented Data Paths """


In [None]:
# TODO: Implement this function
def write_augmented_data_to_csv():
    # call to_csv for each dataframe
    pass