# Data Merging
This file's main goal is to merge all labeled timeline data into a single source. This allows for easier data augmentation, as well as a single classifier that can detect all types of anomalies at once, rather than one classifier for each type.

In [1]:
# Imports
import csv
import pandas as pd
import numpy as np
import copy

## Data Labels Reference:
### fDOM:

### Turbidity:

### Stage:


The following functions are helpers for the rest of the merging process.

In [2]:
def load_labeled_dataset(filename):
    """Read a labeled-peak CSV file and return its data rows as typed lists.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank row is converted to
    ``[float(col0), float(col1), col2, int(col3)]``. For the ground-truth
    files loaded in this notebook those columns are the peak's timestamp,
    value, label and index.

    Args:
        filename: Path to a comma-delimited CSV file.

    Returns:
        A list with one four-element list per data row. The list is empty
        when the file contains no data rows (including a completely empty
        file).

    Raises:
        ValueError: If a numeric column cannot be parsed.
        IndexError: If a non-blank row has fewer than four columns.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        # csv.reader yields [] for blank lines (e.g. a trailing newline); skip them.
        return [
            [float(row[0]), float(row[1]), row[2], int(row[3])]
            for row in reader
            if row
        ]

In [3]:
# Locations of the labeled fDOM ground-truth CSVs (julian-time versions),
# one file per peak type.
fDOM_PLP_path = '../Data/labeled_data/ground_truths/fDOM/fDOM_PLP/julian_time/fDOM_PLP_0k-300k.csv'
fDOM_SKP_path = '../Data/labeled_data/ground_truths/fDOM/fDOM_SKP/julian_time/fDOM_SKP_0k-300k.csv'
fDOM_PP_path = '../Data/labeled_data/ground_truths/fDOM/fDOM_PP/julian_time/fDOM_PP_0k-300k.csv'

# Row-list form of each dataset, read with the original helper
# (kept for now; possibly not needed once the dataframes are in use).
fDOM_PLP_truths = load_labeled_dataset(fDOM_PLP_path)
fDOM_SKP_truths = load_labeled_dataset(fDOM_SKP_path)
fDOM_PP_truths = load_labeled_dataset(fDOM_PP_path)

# DataFrame form of each dataset, indexed by the peak's timestamp so that
# rows from different files can be matched on time.
fDOM_PLP_df = pd.read_csv(fDOM_PLP_path).set_index('timestamp_of_peak')
fDOM_SKP_df = pd.read_csv(fDOM_SKP_path).set_index('timestamp_of_peak')
fDOM_PP_df = pd.read_csv(fDOM_PP_path).set_index('timestamp_of_peak')

In [4]:
# Preview the first rows of each dataframe under its own heading.
head_sections = (
    ("PLP Head:", fDOM_PLP_df),
    ("\nSKP Head:", fDOM_SKP_df),
    ("\nPP Head:", fDOM_PP_df),
)
for heading, frame in head_sections:
    print(heading)
    print(frame.head())

PLP Head:
                   value_of_peak label_of_peak  idx_of_peak
timestamp_of_peak                                          
2.456064e+06           112.40602          NPLP         2083
2.456077e+06           113.10874          NPLP         3270
2.456077e+06            84.50452          NPLP         3276
2.456077e+06            90.15410          NPLP         3294
2.456077e+06            96.68559          NPLP         3300

SKP Head:
                   value_of_peak label_of_peak  idx_of_peak
timestamp_of_peak                                          
2.456049e+06            28.46222          NSKP          616
2.456056e+06            38.09339          NSKP         1318
2.456063e+06            38.94278          NSKP         1993
2.456064e+06            43.10656          NSKP         2091
2.456077e+06            20.55849          NSKP         3269

PP Head:
                   value_of_peak label_of_peak  idx_of_peak
timestamp_of_peak                                          
2.456045e

## Peak Precedence
The following code block sets the order of precedence of the peak types.

In [5]:
# SET PRECEDENCE OF PEAKS
# skyrocketing <- phantom <- plummeting
# Each rank holds: the dataframe, the label acronym for a peak of that
# type, and the acronym used for the matching "N..." label.
TOP, TOP_ACRO, TOP_NO_ACRO = fDOM_SKP_df, "SKP", "NSKP"
SECOND, SECOND_ACRO, SECOND_NO_ACRO = fDOM_PP_df, "PP", "NPP"
THIRD, THIRD_ACRO, THIRD_NO_ACRO = fDOM_PLP_df, "PLP", "NPLP"

# Report (rows, columns) of each dataframe, highest precedence first.
for ranked_frame in (TOP, SECOND, THIRD):
    print(ranked_frame.shape)

(637, 3)
(514, 3)
(536, 3)


## Aligning timeline data correctly
All three of the dataframes have a different number of observations, so we need to align them correctly in order to merge the data. As a first step, we look for timestamps that appear in more than one dataframe.

In [6]:
# Find timestamps that occur in more than one dataframe by inner-joining
# each pair of dataframes on the timestamp index.
TOP_TIME, SECOND_TIME, THIRD_TIME = TOP, SECOND, THIRD

overlap_checks = [
    ("Top and Second", TOP_TIME, SECOND_TIME),
    ("Top and third", TOP_TIME, THIRD_TIME),
    ("Second and third", SECOND_TIME, THIRD_TIME),
]
for position, (title, left, right) in enumerate(overlap_checks):
    # Blank separator between reports, but not before the first one.
    if position:
        print("\n")
    print(title)
    print(left.merge(right, on='timestamp_of_peak', copy=False))


Top and Second
    timestamp_of_peak  value_of_peak_x label_of_peak_x  idx_of_peak_x  \
0        2.456083e+06         31.40129            NSKP           3901   
1        2.456167e+06         41.67178            NSKP          11924   
2        2.456179e+06         59.58092            NSKP          13137   
3        2.456245e+06         44.51830            NSKP          19071   
4        2.456283e+06         25.82408            NSKP          21975   
5        2.456333e+06         39.10495             SKP          26731   
6        2.456397e+06         16.17749            NSKP          32854   
7        2.456437e+06         39.71421            NSKP          36704   
8        2.456458e+06         71.80204            NSKP          38774   
9        2.456624e+06         42.52520            NSKP          54683   
10       2.456633e+06         30.29573            NSKP          55527   
11       2.456649e+06         41.06024            NSKP          57071   
12       2.456664e+06         27.074

## Begin Merging Process
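We start by creating a new dataframe, which is equivalent to the top-level peak dataframe (based on precedence). The three dataframes are concatenated in precedence order and sorted by timestamp; where a timestamp appears more than once, only the entry from the highest-precedence dataframe is kept.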

In [6]:
# Merge all three dataframes: stack them in precedence order, then sort by
# timestamp. The stable sort keeps rows with equal timestamps in stacking
# order, so the highest-precedence row comes first among ties.
stacked = pd.concat([TOP, SECOND, THIRD]).sort_values(
    by=['timestamp_of_peak'], kind='stable'
)

# For each repeated timestamp keep only its first (highest-precedence) row.
is_repeat = stacked.index.duplicated(keep='first')
df = stacked[~is_repeat]


# Temporarily lift pandas' display limits so the whole frame is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)

                   value_of_peak label_of_peak  idx_of_peak
timestamp_of_peak                                          
2.456045e+06            26.71488           NPP          207
2.456049e+06            28.46222          NSKP          616
2.456049e+06            27.74371           NPP          617
2.456052e+06            26.34231           NPP          909
2.456056e+06            38.09339          NSKP         1318
2.456056e+06            37.29078           NPP         1319
2.456063e+06            38.94278          NSKP         1993
2.456063e+06            37.94644           NPP         1996
2.456064e+06           112.40602          NPLP         2083
2.456064e+06            43.10656          NSKP         2091
2.456064e+06            41.00051           NPP         2093
2.456077e+06            20.55849          NSKP         3269
2.456077e+06           113.10874          NPLP         3270
2.456077e+06            84.50452          NPLP         3276
2.456077e+06            50.91167        