# Data Merging
This file's main goal is to merge all labeled timeline data into a single source to allow for easier data augmentation, as well a classifier that can detect all types of anomalies in one, rather than one classifier for each one.

In [11]:
# Imports
import csv
import pandas as pd
import numpy as np
import copy

## Data Labels Reference:
### fDOM:

### Turbidity:

### Stage:


The following functions are helpers for the rest of the merging process.

In [2]:
def load_labeled_dataset(filename):
    """ Read in labeled data and return the read in data """
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader)
        truths = [[float(row[0]), float(row[1]), row[2], int(row[3])] for row in reader] 
        f.close()  

    return truths 

In [3]:
# Load in all of the datasets:
fDOM_PLP_path = '../Data/labeled_data/ground_truths/fDOM/fDOM_PLP/julian_time/fDOM_PLP_0k-300k.csv'
fDOM_SKP_path = '../Data/labeled_data/ground_truths/fDOM/fDOM_SKP/julian_time/fDOM_SKP_0k-300k.csv'
fDOM_PP_path = '../Data/labeled_data/ground_truths/fDOM/fDOM_PP/julian_time/fDOM_PP_0k-300k.csv'

# load in dataset from original function (MIGHT BE UNNECESSARY)
fDOM_PLP_truths = load_labeled_dataset(fDOM_PLP_path)
fDOM_SKP_truths = load_labeled_dataset(fDOM_SKP_path)
fDOM_PP_truths = load_labeled_dataset(fDOM_PP_path)

# Load in dataframes
fDOM_PLP_df = pd.read_csv(fDOM_PLP_path)
fDOM_SKP_df = pd.read_csv(fDOM_SKP_path)
fDOM_PP_df = pd.read_csv(fDOM_PP_path)

In [4]:
# Visualize these dataframes
print("PLP Head:")
print(fDOM_PLP_df.head())

print("\nSKP Head:")
print(fDOM_SKP_df.head())

print("\nPP Head:")
print(fDOM_PP_df.head())

PLP Head:
   timestamp_of_peak  value_of_peak label_of_peak  idx_of_peak
0       2.456064e+06      112.40602          NPLP         2083
1       2.456077e+06      113.10874          NPLP         3270
2       2.456077e+06       84.50452          NPLP         3276
3       2.456077e+06       90.15410          NPLP         3294
4       2.456077e+06       96.68559          NPLP         3300

SKP Head:
   timestamp_of_peak  value_of_peak label_of_peak  idx_of_peak
0       2.456049e+06       28.46222          NSKP          616
1       2.456056e+06       38.09339          NSKP         1318
2       2.456063e+06       38.94278          NSKP         1993
3       2.456064e+06       43.10656          NSKP         2091
4       2.456077e+06       20.55849          NSKP         3269

PP Head:
   timestamp_of_peak  value_of_peak label_of_peak  idx_of_peak
0       2.456045e+06       26.71488           NPP          207
1       2.456049e+06       27.74371           NPP          617
2       2.456052e+06    

In [53]:
# SET PRECENDENCE OF PEAKS
# skyrocketing <- phantom <- plummeting
TOP_LEVEL_PEAK = fDOM_SKP_df
TOP_ACRO = "SKP"
TOP_NO_ACRO = "NSKP"

SECOND = fDOM_PP_df
SECOND_ACRO = "PP"
SECOND_NO_ACRO = "NPP"

THIRD = fDOM_PLP_df
THIRD_ACRO = "PLP"
THIRD_NO_ACRO = "NPLP"

TOP_LEVEL_PEAK = TOP_LEVEL_PEAK.truncate(after=513)

THIRD = THIRD.truncate(after=513)

print(TOP_LEVEL_PEAK.shape)
print(SECOND.shape)
print(THIRD.shape)

(514, 4)
(514, 4)
(514, 4)


## Making dataframes the same size

## Begin Merging Process
We start by creating a new dataframe, which is equivalent to the top level peak dataframe (based on precendence)

In [54]:
# create new pandas dataframe
fDOM_df = copy.deepcopy(TOP_LEVEL_PEAK)


# merge second into the first
fDOM_df['label_of_peak'] = np.where(fDOM_df['label_of_peak'] == TOP_NO_ACRO, SECOND['label_of_peak'], fDOM_df['label_of_peak'])

# merge third into first and second
fDOM_df['label_of_peak'] = np.where(fDOM_df['label_of_peak'] == SECOND_NO_ACRO, THIRD['label_of_peak'], fDOM_df['label_of_peak'])

# rename thirds not peak label with NAP for "Not Anomaly Peak"
fDOM_df['label_of_peak'] = np.where(fDOM_df['label_of_peak'] == THIRD_NO_ACRO, "NAP", fDOM_df['label_of_peak'])

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(fDOM_df)


     timestamp_of_peak  value_of_peak label_of_peak  idx_of_peak
0         2.456049e+06       28.46222           NAP          616
1         2.456056e+06       38.09339           NAP         1318
2         2.456063e+06       38.94278           NAP         1993
3         2.456064e+06       43.10656           NAP         2091
4         2.456077e+06       20.55849           NAP         3269
5         2.456077e+06       50.91167           NAP         3279
6         2.456077e+06       50.87842           NAP         3298
7         2.456077e+06       61.43491           NAP         3305
8         2.456077e+06       48.39317           NAP         3345
9         2.456081e+06       45.57343           NAP         3721
10        2.456083e+06       31.40129           NAP         3901
11        2.456100e+06       23.85191           NAP         5526
12        2.456100e+06       25.05013            PP         5537
13        2.456104e+06       51.68857            PP         5918
14        2.456106e+06   