# Augmenting time-series data
This notebook augments the labeled fDOM and turbidity time-series data in order to create additional synthetic samples.

In [None]:
# Imports
import sys

sys.path.insert(1, "../")
import Tools.data_processing as dp
import Tools.data_movement as dm
import Tools.augmentation_helpers as augment
import pandas as pd
from Tools.get_candidates import (
    get_cands_fDOM_NAP,
    get_cands_fDOM_PLP,
    get_cands_fDOM_PP,
    get_cands_fDOM_SKP,
    get_cands_fDOM_FPT,
    get_cands_fDOM_FSK,
    get_cands_turb_PP,
    get_cands_turb_SKP,
    get_cands_turb_FPT,
    get_cands_turb_NAP,
)


### Constants

We define the following constants for use with augmenting the data:

1. `TIME_RANGE_INIT`: the number of points before and after the relative peak that we take data from
2. `SAMPLES`: the number of augmented samples to generate, i.e. the number of times we pick a labeled peak and augment it
3. `STARTING_TIMESTAMP`: the timestamp to start all augmented data at. The default value is 15 minutes after the last data measurement from the original set up data given to the project devs. As of 2/16/22, this timestamp is correct.
4. `LOWER_BOUND_AMPLITUDE_MULTIPLIER`: the lower bound of the amplitude augment multiplier
5. `UPPER_BOUND_AMPLITUDE_MULTIPLIER`: the upper bound of the amplitude augment multiplier
6. `SMOOTH_LOWER_BOUND`: lower bound for number of points to cover when adding smoothing data
7. `SMOOTH_UPPER_BOUND`: upper bound for number of points to cover when adding smoothing data


In [None]:
""" Helpful constants """
TIME_RANGE_INIT = 30  # the base time range for peaks, in number of data points(MUST BE 1 OR HIGHER), this is used as a fallback
STARTING_TIMESTAMP = 2459096.9583333335
SAMPLES = 500 # the number of samples to generate

# for how much we change peaks by
LOWER_BOUND_AMPLITUDE_MULTIPLIER = -0.1
UPPER_BOUND_AMPLITUDE_MULTIPLIER = 0.1

SMOOTH_LOWER_BOUND = 200  # the minimum amount of data points to cover when smoothing
SMOOTH_UPPER_BOUND = 400  # the maximum amount of data points to cover when smoothing

# flat level average vals (for data smoothing)
FLAT_FDOM_VAL = 5
FLAT_TURB_VAL = 10
FLAT_STAGE_VAL = 0.1

# used for balancing classes, add any necessary peak classes for your data here
FDOM_PEAK_LABELS = ['PLP', 'SKP', 'PP', 'FPT', 'FSK', 'NAP']
TURB_PEAK_LABELS = ['PP', 'SKP', 'FPT', 'NAP']


## Loading in data
The knowledge-based approach uses the data in `Data/converted_data/julian_format/`, so that is the data the augmentation uses as its source.

In [None]:
# Locations of the raw time series and of the labeled ground truths.
fDOM_data_filename = (
    "../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv"
)
stage_data_filename = "../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv"
turb_data_filename = (
    "../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv"
)
fDOM_truths_filename = (
    "../Data/labeled_data/ground_truths/fDOM/fDOM_all_julian_0k-300k.csv"
)
turb_truths_filename = (
    "../Data/labeled_data/ground_truths/turb/turb_all_julian_0k-300k.csv"
)

# Load the three raw series (fDOM, then stage, then turbidity).
fDOM_data, stage_data, turb_data = [
    dm.read_in_preprocessed_timeseries(path)
    for path in (fDOM_data_filename, stage_data_filename, turb_data_filename)
]

# Stage is aligned to the fDOM series before it is used any further.
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

# Labeled peaks for fDOM and turbidity.
fDOM_labeled, turb_labeled = [
    pd.read_csv(path) for path in (fDOM_truths_filename, turb_truths_filename)
]

# Wrap each raw series in a DataFrame with named columns for easier indexing.
fDOM_raw, turb_raw, stage_raw = [
    pd.DataFrame(series) for series in (fDOM_data, turb_data, stage_data)
]
for frame in (fDOM_raw, turb_raw, stage_raw):
    frame.columns = ["timestamp", "value"]


## Augmenting Data
We will augment data for each type of peak, and for each measurement.

Starting with fDOM:
1. PLP (plummeting peak)
2. PP (phantom peak)
3. SKP (skyrocketing peak)
4. FPT (flat plateau)
5. FSK (flat sink)
6. NAP (non-anomaly peak)

TODO: augment more peak types when they are labeled

With turbidity:
1. PP
2. SKP
3. FPT
4. NAP

### Augmenting fDOM

In [None]:
#####################################################################################
#                                                                                   #
#                               DATAFRAME SETUP SECTION                             #
#                                                                                   #
#####################################################################################

# Accumulators for the augmented fDOM series and its peak labels.
augmented_fDOM_raw = pd.DataFrame(columns=["timestamp", "value"])
augmented_fDOM_labeled = pd.DataFrame(
    columns=["timestamp_of_peak", "value_of_peak", "label_of_peak", "idx_of_peak"]
)

# Stage and turbidity series that accompany the augmented fDOM series.
augmented_stage_raw_fdom = pd.DataFrame(columns=["timestamp", "value"])
augmented_turb_raw_fdom = pd.DataFrame(columns=["timestamp", "value"])

# Timestamp of the last entry added so far (julian format). It starts at
# STARTING_TIMESTAMP and is advanced by the augment helpers below.
prev_added_entry = STARTING_TIMESTAMP

# Number of selected peaks whose timestamp could not be found in the raw fDOM,
# stage or turbidity series; these are skipped instead of augmented.
missed_fDOM_peaks = 0

# ~~~~~ Collect starting and ending points of each fDOM peak type ~~~~~
fdom_pp_index_lookup = get_cands_fDOM_PP(fDOM_data_filename, fDOM_truths_filename)
fdom_skp_index_lookup = get_cands_fDOM_SKP(fDOM_data_filename, fDOM_truths_filename)
fdom_plp_index_lookup = get_cands_fDOM_PLP(fDOM_data_filename, fDOM_truths_filename)
# flat plateaus (FPT)
fdom_fpt_index_lookup = get_cands_fDOM_FPT()
# flat sinks (FSK)
fdom_fsk_index_lookup = get_cands_fDOM_FSK()
# non anomaly peaks (NAP)
fdom_NAP_index_lookup = get_cands_fDOM_NAP(fDOM_data_filename, fDOM_truths_filename)

# Candidate lookup table per label; any label not listed here uses the NAP table.
fdom_lookup_by_label = {
    "PP": fdom_pp_index_lookup,
    "PLP": fdom_plp_index_lookup,
    "SKP": fdom_skp_index_lookup,
    "FSK": fdom_fsk_index_lookup,
    "FPT": fdom_fpt_index_lookup,
}

# Labeled peaks grouped by class, built once so each iteration can draw directly
# from the requested class. Drawing uniformly from the filtered rows selects the
# same peaks as re-sampling the whole table until the label matches, but it
# cannot spin forever when a class has no labeled examples.
fdom_peaks_by_label = {
    label: fDOM_labeled[fDOM_labeled["label_of_peak"] == label]
    for label in FDOM_PEAK_LABELS
}

# to balance out classes
class_count = {label: 0 for label in FDOM_PEAK_LABELS}

for _ in range(SAMPLES):
    # find out which peak label we need to augment next to keep classes balanced
    next_label = augment.check_class_balance(class_count, FDOM_PEAK_LABELS)

    label_pool = fdom_peaks_by_label.get(next_label)
    if label_pool is None or label_pool.empty:
        raise ValueError(
            f"No labeled fDOM peaks with label {next_label!r}; remove it from "
            "FDOM_PEAK_LABELS or add labeled examples for it."
        )

    # select one peak of that class at random
    peak = label_pool.sample()
    peak_label = peak["label_of_peak"].iloc[0]

    # update class count (counted at selection time, so skipped peaks are included)
    class_count[peak_label] += 1

    cands_df = fdom_lookup_by_label.get(peak_label, fdom_NAP_index_lookup)

    # locate the peak's timestamp in each raw series
    peak_timestamp = peak["timestamp_of_peak"].iloc[0]
    peak_index_df = fDOM_raw[fDOM_raw["timestamp"] == peak_timestamp]
    stage_index_df = stage_raw[stage_raw["timestamp"] == peak_timestamp]
    turb_index_df = turb_raw[turb_raw["timestamp"] == peak_timestamp]

    # A peak can only be augmented when all three series contain its timestamp;
    # otherwise count it as missed rather than failing on an empty lookup.
    if peak_index_df.empty or stage_index_df.empty or turb_index_df.empty:
        missed_fDOM_peaks += 1
        continue

    peak_index = peak_index_df.index.tolist()[0]
    stage_index = stage_index_df.index.tolist()[0]
    turb_index = turb_index_df.index.tolist()[0]

    # get the beginning and end of the peak; distances are cast to int, matching
    # the turbidity cell
    left, right = augment.get_ends_of_peak(cands_df, peak_index)
    prev_dist = int(abs(peak_index - left))
    next_dist = int(abs(peak_index - right))

    # build the temp dataframes to concat to the main df
    new_fdom_raw, new_stage, new_turb_raw = augment.build_temp_dataframes(
        fDOM_raw,
        stage_raw,
        turb_raw,
        prev_dist,
        next_dist,
        peak_index,
        stage_index,
        turb_index,
    )

    # actual data augmentation here
    new_fdom_raw = augment.augment_data(
        new_fdom_raw,
        peak_index,
        LOWER_BOUND_AMPLITUDE_MULTIPLIER,
        UPPER_BOUND_AMPLITUDE_MULTIPLIER,
    )

    # SMOOTH DATA
    # ensure that main augmented df has more than 1 row, else no data to smooth
    if augmented_fDOM_raw.shape[0] > 1:
        (
            augmented_fDOM_raw,
            augmented_stage_raw_fdom,
            augmented_turb_raw_fdom,
            prev_added_entry,
        ) = augment.smooth_data(
            augmented_fDOM_raw,
            augmented_stage_raw_fdom,
            augmented_turb_raw_fdom,
            prev_added_entry,
            SMOOTH_LOWER_BOUND,
            SMOOTH_UPPER_BOUND,
            FLAT_FDOM_VAL,
            FLAT_TURB_VAL,
            FLAT_STAGE_VAL,
        )

    # update the dataframes to set new indices and timestamps
    (
        new_label,
        new_fdom_raw,
        new_stage,
        new_turb_raw,
        prev_added_entry,
    ) = augment.update_dataframes(
        prev_added_entry,
        new_fdom_raw,
        peak_index,
        prev_dist,
        augmented_fDOM_raw,
        new_stage,
        new_turb_raw,
        peak_label,
    )

    # append the new segment and its label to the accumulated frames
    (
        augmented_fDOM_labeled,
        augmented_fDOM_raw,
        augmented_stage_raw_fdom,
        augmented_turb_raw_fdom,
    ) = augment.concat_dataframes(
        augmented_fDOM_labeled,
        augmented_fDOM_raw,
        augmented_stage_raw_fdom,
        augmented_turb_raw_fdom,
        new_fdom_raw,
        new_stage,
        new_turb_raw,
        new_label,
    )

# print the class count for debugging
print('FINAL COUNTS OF AUGMENTED PEAKS BY CLASS')
print('________________________________________')
print(class_count)
# the counts above include selected peaks that were skipped; report how many
print('fDOM peaks skipped (timestamp missing from a raw series):', missed_fDOM_peaks)

### Augment turbidity data

In [None]:
#####################################################################################
#                                                                                   #
#                               DATAFRAME SETUP SECTION                             #
#                                                                                   #
#####################################################################################

# Accumulators for the augmented turbidity series and its peak labels.
augmented_turb_raw = pd.DataFrame(columns=["timestamp", "value"])
augmented_turb_labeled = pd.DataFrame(
    columns=["timestamp_of_peak", "value_of_peak", "label_of_peak", "idx_of_peak"]
)

# Stage and fDOM series that accompany the augmented turbidity series.
augmented_stage_raw_turb = pd.DataFrame(columns=["timestamp", "value"])
augmented_fdom_raw_turb = pd.DataFrame(columns=["timestamp", "value"])

# Timestamp of the last entry added so far; restarted at STARTING_TIMESTAMP for
# the turbidity run and advanced by the augment helpers below.
prev_added_entry = STARTING_TIMESTAMP

# Number of selected peaks whose timestamp could not be found in the raw
# turbidity, stage or fDOM series; these are skipped instead of augmented.
missed_turb_peaks = 0

# ~~~~~~ Collect starting and ending points of each turbidity peak type ~~~~~~
turb_pp_index_lookup = get_cands_turb_PP(turb_data_filename, turb_truths_filename)
turb_skp_index_lookup = get_cands_turb_SKP(turb_data_filename, turb_truths_filename)
turb_fpt_index_lookup = get_cands_turb_FPT(turb_data_filename, turb_truths_filename)
# non anomaly peaks (NAP)
turb_NAP_index_lookup = get_cands_turb_NAP(turb_data_filename, turb_truths_filename)

# Candidate lookup table per label; any label not listed here uses the NAP table.
turb_lookup_by_label = {
    "SKP": turb_skp_index_lookup,
    "PP": turb_pp_index_lookup,
    "FPT": turb_fpt_index_lookup,
}

# Labeled peaks grouped by class, built once so each iteration can draw directly
# from the requested class. Drawing uniformly from the filtered rows selects the
# same peaks as re-sampling the whole table until the label matches, but it
# cannot spin forever when a class has no labeled examples.
turb_peaks_by_label = {
    label: turb_labeled[turb_labeled["label_of_peak"] == label]
    for label in TURB_PEAK_LABELS
}

# for class balancing
class_count = {label: 0 for label in TURB_PEAK_LABELS}

for _ in range(SAMPLES):
    # get next label to augment
    next_label = augment.check_class_balance(class_count, TURB_PEAK_LABELS)

    label_pool = turb_peaks_by_label.get(next_label)
    if label_pool is None or label_pool.empty:
        raise ValueError(
            f"No labeled turbidity peaks with label {next_label!r}; remove it "
            "from TURB_PEAK_LABELS or add labeled examples for it."
        )

    # select one peak of that class at random
    peak = label_pool.sample()
    peak_label = peak["label_of_peak"].iloc[0]

    # update class count (counted at selection time, so skipped peaks are included)
    class_count[peak_label] += 1

    # get lookup table for peak beginning and ends
    cands_df = turb_lookup_by_label.get(peak_label, turb_NAP_index_lookup)

    # locate the peak's timestamp in each raw series (NOTE THAT PEAK IS TURB)
    peak_timestamp = peak["timestamp_of_peak"].iloc[0]
    peak_index_df = turb_raw[turb_raw["timestamp"] == peak_timestamp]
    stage_index_df = stage_raw[stage_raw["timestamp"] == peak_timestamp]
    fdom_index_df = fDOM_raw[fDOM_raw["timestamp"] == peak_timestamp]

    # A peak can only be augmented when all three series contain its timestamp;
    # otherwise count it as missed rather than failing on an empty lookup.
    if peak_index_df.empty or stage_index_df.empty or fdom_index_df.empty:
        missed_turb_peaks += 1
        continue

    peak_index = peak_index_df.index.tolist()[0]
    stage_index = stage_index_df.index.tolist()[0]
    fdom_index = fdom_index_df.index.tolist()[0]

    # get beginning and ending of peak indices from lookup cands df
    left, right = augment.get_ends_of_peak(cands_df, peak_index)
    prev_dist = int(abs(peak_index - left))
    next_dist = int(abs(peak_index - right))

    new_fdom_raw, new_stage, new_turb_raw = augment.build_temp_dataframes(
        fDOM_raw,
        stage_raw,
        turb_raw,
        prev_dist,
        next_dist,
        fdom_index,
        stage_index,
        peak_index,
    )

    # augment data (the turbidity segment is the one being modified here)
    new_turb_raw = augment.augment_data(
        new_turb_raw,
        peak_index,
        LOWER_BOUND_AMPLITUDE_MULTIPLIER,
        UPPER_BOUND_AMPLITUDE_MULTIPLIER,
    )

    # SMOOTH DATA
    # ensure that main augmented df has more than 1 row, else no data to smooth
    if augmented_turb_raw.shape[0] > 1:
        (
            augmented_fdom_raw_turb,
            augmented_stage_raw_turb,
            augmented_turb_raw,
            prev_added_entry,
        ) = augment.smooth_data(
            augmented_fdom_raw_turb,
            augmented_stage_raw_turb,
            augmented_turb_raw,
            prev_added_entry,
            SMOOTH_LOWER_BOUND,
            SMOOTH_UPPER_BOUND,
            FLAT_FDOM_VAL,
            FLAT_TURB_VAL,
            FLAT_STAGE_VAL,
        )

    # UPDATE FRAMES (runs for every found peak, not only when smoothing ran)
    # NOTE(review): the fDOM frames and the turbidity peak index are passed in
    # the same positions the fDOM cell uses -- confirm that this is what
    # update_dataframes expects when the peak belongs to turbidity.
    (
        new_label,
        new_fdom_raw,
        new_stage,
        new_turb_raw,
        prev_added_entry,
    ) = augment.update_dataframes(
        prev_added_entry,
        new_fdom_raw,
        peak_index,
        prev_dist,
        augmented_fdom_raw_turb,
        new_stage,
        new_turb_raw,
        peak_label,
    )

    # CONCAT FRAMES
    (
        augmented_turb_labeled,
        augmented_fdom_raw_turb,
        augmented_stage_raw_turb,
        augmented_turb_raw,
    ) = augment.concat_dataframes(
        augmented_turb_labeled,
        augmented_fdom_raw_turb,
        augmented_stage_raw_turb,
        augmented_turb_raw,
        new_fdom_raw,
        new_stage,
        new_turb_raw,
        new_label,
    )

# print the class count for debugging
print('FINAL COUNTS OF AUGMENTED PEAKS BY CLASS')
print('________________________________________')
print(class_count)
# the counts above include selected peaks that were skipped; report how many
print('turbidity peaks skipped (timestamp missing from a raw series):', missed_turb_peaks)


## Move augmented data into csv files
The following codeblock creates csv files for the augmented data.

In [None]:
""" Augmented Data Paths """
# NOTE: Before running this cell, be sure that the following paths exist, the folders must be there to allow the data to be written
# trainset
trainset_fdom_path = "../Data/augmented_data/trainset_plotting/fdom/"
trainset_turb_path = "../Data/augmented_data/trainset_plotting/turb/"

# unlabeled data
unlabeled_fdom_path = "../Data/augmented_data/fdom/unlabeled/"
unlabeled_turb_path = "../Data/augmented_data/turb/unlabeled/"

# labeled data
labeled_fdom_path = "../Data/augmented_data/fdom/labeled/"
labeled_turb_path = "../Data/augmented_data/turb/labeled/"

# write to normal julian csv
augment.write_augmented_data_to_csv(
    labeled_fdom_path,
    unlabeled_fdom_path,
    labeled_turb_path,
    unlabeled_turb_path,
    augmented_fDOM_labeled,
    augmented_fDOM_raw,
    augmented_turb_raw_fdom,
    augmented_stage_raw_fdom,
    augmented_turb_labeled,
    augmented_turb_raw,
    augmented_fdom_raw_turb,
    augmented_stage_raw_turb,
)

# write to trainset
augment.write_to_trainset_csv(
    augmented_fDOM_raw,
    augmented_turb_raw_fdom,
    augmented_stage_raw_fdom,
    trainset_fdom_path,
    augmented_turb_raw,
    augmented_fdom_raw_turb,
    augmented_stage_raw_turb,
    trainset_turb_path,
)
