# Multiclass Peak Detection in fDOM

This file combines all of the fDOM detection scripts into a single classifier that detects all peak types. It also leverages the augmented data created previously.

## Structure

The core structure of the project is to run all of the individual classifiers and, when one of them detects a peak, have it alert the overall classifier "manager", which then records that peak as an anomaly peak detected by that classifier.


In [None]:
# Imports
import scipy.io as sio
from sklearn.model_selection import TimeSeriesSplit
import seaborn as sn
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import datetime
import sys

sys.path.insert(1, "../")
import Tools.data_processing as dp
import Tools.data_movement as dm
from get_cands import get_all_cands_fDOM, get_all_truths_fDOM

# import classifiers
from fdom_classifiers.fDOM_PLP import fDOM_PLP_Classifier

# TODO: uncomment these when the classes are written
# from fdom_classifiers.fDOM_FPT import fDOM_FPT_Classifier
# from fdom_classifiers.fDOM_FSK import fDOM_FSK_Classifier
# from fdom_classifiers.fDOM_PP import fDOM_PP_Classifier
from fdom_classifiers.fDOM_SKP import fDOM_SKP_Classifier


## Training parameters and helper functions


In [None]:
# Number of passes of the main training loop that are run for every split.
ITERATIONS = 7_000
# Number of folds requested from TimeSeriesSplit.
NUM_SPLITS = 5


In [None]:
# TODO: REMOVE THIS WHEN FINISHED MODIFYING
# Development aid: load IPython's autoreload extension and switch it to mode 2
# so that edits to imported modules are picked up before each cell runs,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Input file locations, all relative to the parent directory of this notebook.

# --- Observed (non-augmented) sensor series ---
fdom_raw_data = "../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv"
turb_raw_data = "../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv"
stage_raw_data = "../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv"

# --- Ground-truth labels for the observed fDOM series ---
fdom_labeled = "../Data/labeled_data/ground_truths/fDOM/fDOM_all_julian_0k-300k.csv"

# --- Augmented series and their labels ---
# NOTE: the augmented turbidity file is stored under the fdom/ augmented folder.
fdom_raw_augmented = "../Data/augmented_data/fdom/unlabeled/unlabeled_fdom.csv"
turb_augmented_raw_data = "../Data/augmented_data/fdom/unlabeled/unlabeled_turb.csv"
fdom_labeled_augmented = "../Data/augmented_data/fdom/labeled/labeled_fdom_peaks.csv"

In [None]:
# Read the observed fDOM, stage and turbidity series (in that order) with the
# project's preprocessed-timeseries reader.
fDOM_data, stage_data, turb_data = (
    dm.read_in_preprocessed_timeseries(path)
    for path in (fdom_raw_data, stage_raw_data, turb_raw_data)
)
# Replace the stage series with the version aligned against the fDOM series.
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

# Read the augmented fDOM and turbidity series and wrap each in a NumPy array.
# The second positional argument (True) presumably marks the file as
# augmented-format input -- TODO confirm against Tools.data_movement.
augmented_fdom_data, augmented_turb_data = (
    np.array(dm.read_in_timeseries(path, True))
    for path in (fdom_raw_augmented, turb_augmented_raw_data)
)


## Get Candidates and truths

In [None]:
# Candidate peaks for the observed (non-augmented) fDOM series.
cands = get_all_cands_fDOM(fdom_raw_data, fdom_labeled)
print(cands.shape)

# Ground-truth rows read from the labelled fDOM file.
truths = get_all_truths_fDOM(fdom_labeled)
print(truths.shape)

# Candidates and truths must have the same shape: later cells pair them up
# by position.
assert cands.shape == truths.shape


In [None]:
# Candidate peaks for the augmented fDOM series. The trailing True presumably
# selects the augmented-data code path -- TODO confirm in get_cands.
cands_augmented = get_all_cands_fDOM(fdom_raw_augmented, fdom_labeled_augmented, True)
print(cands_augmented.shape)

truths_augmented = get_all_truths_fDOM(fdom_labeled_augmented, True)

# Some labelled augmented peaks (FPT, NFPT, FSK, NFSK and some others, per the
# original note) have no matching candidate. Keep only the truth rows whose
# "idx_of_peak" also appears among the candidates.
has_matching_cand = truths_augmented["idx_of_peak"].isin(cands_augmented["idx_of_peak"])
truths_augmented = truths_augmented[has_matching_cand]
print(truths_augmented.shape)

# After filtering, the shapes must agree so rows can be paired by position.
assert cands_augmented.shape == truths_augmented.shape


In [None]:
# Stack the observed and augmented tables row-wise, observed rows first, so
# there is one candidates DataFrame and one truths DataFrame. Row labels are
# not reset (ignore_index is left at its default).
cands = pd.concat([cands, cands_augmented], axis=0)
truths = pd.concat([truths, truths_augmented], axis=0)

In [None]:
# Turn both DataFrames into plain nested Python lists (one inner list per
# row); the training loop indexes them positionally.
cands, truths = cands.values.tolist(), truths.values.tolist()

# Report how many candidates there are (the bare count is printed a second
# time, as in the original cell).
print(f"{len(cands)} candidates in provided data.")
print(len(cands))

## Create Classifiers


In [None]:
# The PLP classifier is built from fDOM and turbidity series plus the raw and
# labelled file paths for both the observed and the augmented data.
# Turbidity: observed series followed by the augmented series.
turb_data_total = np.concatenate([turb_data, augmented_turb_data])

# NOTE(review): the fDOM argument is the observed series only (fDOM_data),
# whereas the turbidity argument includes the augmented data and the SKP
# classifier is given the concatenated fdom_total. Confirm this asymmetry is
# intended by fDOM_PLP_Classifier.
plp_classifer = fDOM_PLP_Classifier(
    fDOM_data, turb_data_total,
    fdom_raw_data, fdom_labeled,
    fdom_raw_augmented, fdom_labeled_augmented,
)


In [None]:
# fDOM: observed series followed by the augmented series.
fdom_total = np.concatenate([fDOM_data, augmented_fdom_data])

# The SKP classifier takes the combined fDOM series plus the raw and labelled
# file paths for both the observed and the augmented data.
skp_classifier = fDOM_SKP_Classifier(
    fdom_total,
    fdom_raw_data, fdom_labeled,
    fdom_raw_augmented, fdom_labeled_augmented,
)

## Splitting data into testing and training

## Training Loop

In [None]:
# Cross-validated training of the PLP and SKP classifiers.
#
# TimeSeriesSplit yields NUM_SPLITS (train, test) index pairs over the
# candidate list. For every fold that has at least one non-"NAP" label in both
# its train and test portions, each classifier is run for ITERATIONS
# iterations over the training candidates and scored after every iteration.
train_test_split_indices = TimeSeriesSplit(NUM_SPLITS).split(cands)

# Timing / bookkeeping names. They are not read in this cell; they are kept
# because later cells may rely on them.
overall_start = datetime.datetime.now()

split = 1
divide_by_zero_errs = 0

# Progress is reported about every tenth of the run. Clamp the step to 1 so an
# ITERATIONS value below 10 cannot cause a modulo-by-zero.
progress_step = max(1, int(ITERATIONS / 10))

for train_val_indices, test_indices in train_test_split_indices:
    X_train = [cands[i] for i in train_val_indices]
    y_train = [truths[i] for i in train_val_indices]
    X_test = [cands[i] for i in test_indices]
    y_test = [truths[i] for i in test_indices]

    max_fold_metric = 0
    max_result = None
    print("\nSplit: ", split)

    split_start = datetime.datetime.now()

    # Count the truth rows whose label field (index 2) is anything other than
    # "NAP", i.e. the positive examples available in each half of the fold.
    num_pos_test = sum(1 for row in y_test if row[2] != "NAP")
    num_pos_train = sum(1 for row in y_train if row[2] != "NAP")

    print(f'Num Pos in Test: {num_pos_test}')
    print(f'Num Pos in Train: {num_pos_train}')

    # A fold without positives on either side cannot be trained or scored
    # meaningfully, so it is skipped.
    if num_pos_test >= 1 and num_pos_train >= 1:

        # main training loop
        for iteration in range(ITERATIONS):
            # start the iteration for each classifier (resets predictions, generates params)
            plp_classifer.start_iteration()
            skp_classifier.start_iteration()

            # Classify every training candidate; i is the position of the peak
            # within X_train. The per-sample results are not combined yet.
            for i, peak in enumerate(X_train):
                plp_result = plp_classifer.classify_sample(i, peak)
                skp_result = skp_classifier.classify_sample(i, peak)

            # print out info for user
            if iteration and iteration % progress_step == 0:
                print(" {}/{} ".format(iteration, ITERATIONS), end="")

            # Score this iteration.
            # NOTE(review): the full truths list is passed here, not y_train.
            # With default TimeSeriesSplit settings the training indices start
            # at 0 and are contiguous, so truths[i] equals y_train[i] for the
            # i values used above -- confirm test_results only reads those.
            plp_classifer.test_results(truths, iteration, ITERATIONS)
            skp_classifier.test_results(truths, iteration, ITERATIONS)

    # Advance the fold counter for every fold, including skipped ones, so the
    # printed "Split" number always matches the TimeSeriesSplit fold.
    split += 1

# print a newline char for better display
print("\n")

print("PLP CLASSIFIER INFO:")
print(f"ACC: {plp_classifer.best_acc}")
print(f"f1: {plp_classifer.best_f1_score}")
print("\n")

print("SKP CLASSIFIER INFO:")
print(f"ACC: {skp_classifier.best_acc}")
print(f"f1: {skp_classifier.best_f1_score}")


## Display Metrics
