# Multiclass Knowledge Engineering Peak Detection in Turbidity


In [None]:
# Imports
from sklearn.model_selection import TimeSeriesSplit
import seaborn as sn
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
)
import pandas as pd
import numpy as np
import datetime
import copy
import matplotlib.pyplot as plt
import pickle
import progressbar

import Tools.data_processing as dp
import Tools.data_movement as dm
from Tools.get_all_cands import get_all_cands_turb, get_all_truths

# import classifiers
from Anomaly_Detection.Multiclass_Detection.turbidity_classifiers.turb_PP import (
    turb_PP_Classifier,
)

from Anomaly_Detection.Multiclass_Detection.turbidity_classifiers.turb_FPT import (
    turb_FPT_Classifier,
)

from Anomaly_Detection.Multiclass_Detection.turbidity_classifiers.turb_SKP import (
    turb_SKP_Classifier,
)

# Silence warnings: anything emitted through ``warnings.warn`` from here on is
# routed to a function that ignores it.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


warnings.warn = warn


## Training parameters and helper functions


In [None]:
# Cross-validation setup: number of TimeSeriesSplit folds, and how many
# parameter-search iterations are run on each fold's training data.
NUM_SPLITS = 5
ITERATIONS = 4_000

# Dataset selection switches.
#   USE_AUGMENTED_DATA  - append the augmented candidates/truths to the originals
#   CLASS_BALANCED_ONLY - replace the dataset with the augmented data alone
USE_AUGMENTED_DATA = True
CLASS_BALANCED_ONLY = True

# Passed to the PP classifier. Presumably the Julian timestamp at which the
# augmented series begins -- confirm against turb_PP_Classifier.
AUGMENT_DATA_BEGIN_TIMESTAMP = 2459096.9583333335

# Class labels, in the row/column order used for the confusion matrix.
DATA_LABELS = ["SKP", "PP", "FPT", "NAP"]

# Useful functions
def get_prediction(skp_pred, pp_pred, fpt_pred):
    """
    Collapse the three per-classifier outputs into one label.

    Peak precedence is SKP, then PP, then FPT: the first classifier (in that
    order) whose output equals its own peak label wins. If none of them
    reports its peak type, the sample is labelled "NAP".
    """
    by_precedence = (
        (skp_pred, "SKP"),
        (pp_pred, "PP"),
        (fpt_pred, "FPT"),
    )

    for verdict, peak_label in by_precedence:
        if verdict == peak_label:
            return verdict

    return "NAP"


## Paths to data


In [None]:
# filenames

# Original (non-augmented) raw time series.
turb_raw_data_path = (
    "Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv"
)
fdom_raw_data_path = "Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv"
stage_raw_data_path = "Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv"

# Augmented raw time series.
turb_raw_augmented_path = "Data/augmented_data/turb/unlabeled/unlabeled_turb.csv"
fdom_augmented_raw_data_path = "Data/augmented_data/turb/unlabeled/unlabeled_fdom.csv"
stage_augmented_data_path = "Data/augmented_data/turb/unlabeled/unlabeled_stage.csv"

# Labeled turbidity peaks: original ground truths, then the augmented labels.
turb_labeled_path = "Data/labeled_data/ground_truths/turb/turb_all_julian_0k-300k.csv"
turb_labeled_augmented_path = "Data/augmented_data/turb/labeled/labeled_turb_peaks.csv"

# FPT lookup table
turb_fpt_lookup_path = "Data/augmented_data/turb/fpt_lookup.csv"


In [None]:
# Load data
# Original series, read in the order fDOM -> stage -> turbidity.
fDOM_data, stage_data, turb_data = (
    dm.read_in_preprocessed_timeseries(csv_path)
    for csv_path in (fdom_raw_data_path, stage_raw_data_path, turb_raw_data_path)
)

# Stage is aligned against the fDOM series before it is used anywhere else.
stage_data = dp.align_stage_to_fDOM(fDOM_data, stage_data)

# Augmented series, read in the order fDOM -> turbidity -> stage and wrapped
# in numpy arrays. The second argument (True) is forwarded to
# read_in_timeseries as-is; see Tools.data_movement for what it selects.
augmented_fdom_data, augmented_turb_data, augmented_stage_data = (
    np.array(dm.read_in_timeseries(csv_path, True))
    for csv_path in (
        fdom_augmented_raw_data_path,
        turb_raw_augmented_path,
        stage_augmented_data_path,
    )
)


## Get candidates and truths

The following cell loads in raw non-augmented data that is not class-balanced.


In [None]:
# Candidates found in the original raw turbidity data, and the ground-truth
# rows from the labeled file.
cands = get_all_cands_turb(turb_raw_data_path, turb_labeled_path)
truths = get_all_truths(turb_labeled_path)

# Both frames must have the same shape before they are used side by side.
assert cands.shape == truths.shape

print(f"Total number of original data candidates: {len(cands)}")


## Augmented data

The following block appends the augmented data to the raw dataset, which makes the classes mostly (but not perfectly!) balanced.


In [None]:
if not USE_AUGMENTED_DATA:
    # No augmentation: the classifiers are preprocessed on the original raw
    # series only.
    turb_total, fdom_total, stage_total = turb_data, fDOM_data, stage_data

else:
    cands_augmented = get_all_cands_turb(
        turb_raw_augmented_path, turb_labeled_augmented_path, True, turb_fpt_lookup_path
    )

    # Keep only the augmented truth rows whose peak index also shows up among
    # the augmented candidates.
    truths_augmented = get_all_truths(turb_labeled_augmented_path)
    has_candidate = truths_augmented["idx_of_peak"].isin(cands_augmented["idx_of_peak"])
    truths_augmented = truths_augmented[has_candidate]

    assert cands_augmented.shape == truths_augmented.shape

    print(f"Total number of augmented candidates: {len(cands_augmented)}")

    # Original rows first, augmented rows after them.
    cands = pd.concat([cands, cands_augmented])
    truths = pd.concat([truths, truths_augmented])

    # The raw series handed to the classifiers for preprocessing get the
    # augmented series appended in the same way.
    turb_total = np.concatenate((turb_data, augmented_turb_data))
    fdom_total = np.concatenate((fDOM_data, augmented_fdom_data))
    stage_total = np.concatenate((stage_data, augmented_stage_data))


## Class-balanced training and testing

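Alternatively, use only the augmented data, so that both training and testing are class-balanced. When `CLASS_BALANCED_ONLY` is set, this cell replaces the candidates, truths, and raw series assembled above.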


In [None]:
if CLASS_BALANCED_ONLY:
    # Rebuild the dataset from the augmented files alone; whatever cands/truths
    # held before this point is discarded.
    cands = get_all_cands_turb(
        turb_raw_augmented_path, turb_labeled_augmented_path, True, turb_fpt_lookup_path
    )
    truths = get_all_truths(turb_labeled_augmented_path)

    # Some labeled augmented rows have no candidate (FPT, NFPT, FSK, NFSK, some
    # others); drop them so truths and cands line up.
    in_cands = truths["idx_of_peak"].isin(cands["idx_of_peak"])
    truths = truths[in_cands]

    assert cands.shape == truths.shape

    print(f"Total number of candidates: {len(cands)}")

    # The classifiers are preprocessed on the augmented series only.
    turb_total, fdom_total, stage_total = (
        augmented_turb_data,
        augmented_fdom_data,
        augmented_stage_data,
    )


## Convert candidates and truths into lists for the classifiers

The classifiers operate on plain Python lists rather than DataFrames, so the candidates and truths are converted here.


In [None]:
# Unwrap both DataFrames into lists of row lists.
cands, truths = cands.values.tolist(), truths.values.tolist()

print(f"{len(cands)} candidates in provided data.")


## Create Classifiers


In [None]:
# PP classifier: built from the combined turbidity, fDOM and stage series,
# plus the timestamp constant AUGMENT_DATA_BEGIN_TIMESTAMP.
pp_classifier = turb_PP_Classifier(
    turb_total, fdom_total, stage_total, AUGMENT_DATA_BEGIN_TIMESTAMP
)


In [None]:
# SKP classifier: besides the combined turbidity series it is given the raw
# and labeled CSV paths for both the original and the augmented data.
# NOTE(review): augmented_only=True is hard-coded here, while the dataset
# itself is selected by USE_AUGMENTED_DATA / CLASS_BALANCED_ONLY -- confirm
# this flag should follow those switches if they are ever changed.
skp_classifier = turb_SKP_Classifier(
    turb_total,
    turb_raw_data_path,
    turb_labeled_path,
    turb_raw_augmented_path,
    turb_labeled_augmented_path,
    augmented_only=True,
)


In [None]:
# FPT classifier: built from the combined turbidity series alone.
fpt_classifier = turb_FPT_Classifier(turb_total)


## Training Loop


In [None]:
# Per-fold outputs, keyed by fold number (1-based position in the
# TimeSeriesSplit sequence). Folds that are skipped leave no entry.
accumulated_test_metrics = {}

accumulated_test_results = {}

accumulated_best_params = {}

accumulated_cfmxs = {}
accumulated_train_cfmxs = {}  # not filled by this cell

# split data
tss = TimeSeriesSplit(NUM_SPLITS)

overall_start = datetime.datetime.now()

divide_by_zero_errs = 0  # not updated by this cell

# The fold number comes from enumerate so that it always matches the fold's
# position in the split sequence, even when an earlier fold was skipped.
for split, (train_val_indices, test_indices) in enumerate(tss.split(cands), start=1):
    X_train = [cands[i] for i in train_val_indices]
    y_train = [truths[i] for i in train_val_indices]

    X_test = [cands[i] for i in test_indices]
    y_test = [truths[i] for i in test_indices]

    # Ground-truth labels live in column 2 of every truth row. They do not
    # change during the fold, so extract them once instead of per iteration.
    y_train_labels = [row[2] for row in y_train]
    y_test_labels = [row[2] for row in y_test]

    max_fold_metric = 0
    max_result = None

    # print out info for user
    print("\nSplit: ", split)
    split_start = datetime.datetime.now()
    num_pos_test = sum(label != "NAP" for label in y_test_labels)
    num_pos_train = sum(label != "NAP" for label in y_train_labels)

    print(f"Num Pos in Test: {num_pos_test}")
    print(f"Num Pos in Train: {num_pos_train}")

    # A fold is only usable when both halves contain at least one real peak.
    if num_pos_test < 1 or num_pos_train < 1:
        print(f"Skipping split {split}: no non-NAP samples in train and/or test.")
        continue

    # instantiate the progress bar
    pbar = progressbar.ProgressBar(max_value=ITERATIONS)

    # main training loop
    for iteration in range(ITERATIONS):
        # Each classifier sets up its parameters for this iteration and
        # returns them (order of the calls: PP, SKP, FPT).
        params = {
            "pp": pp_classifier.start_iteration(),
            "skp": skp_classifier.start_iteration(),
            "fpt": fpt_classifier.start_iteration(),
        }

        pp_preds = pp_classifier.classify_samples(X_train)
        skp_preds = skp_classifier.classify_samples(X_train)
        fpt_preds = fpt_classifier.classify_samples(X_train)

        # Element 1 of each classifier result is its predicted label; the
        # three labels are merged by peak precedence.
        train_preds = [
            get_prediction(skp_pred[1], pp_pred[1], fpt_pred[1])
            for skp_pred, pp_pred, fpt_pred in zip(skp_preds, pp_preds, fpt_preds)
        ]

        # Plain accuracy is the only training metric that drives the search;
        # no other training score is computed.
        acc = accuracy_score(y_train_labels, train_preds)

        if acc > max_fold_metric:
            max_fold_metric = acc
            max_result = copy.deepcopy(train_preds)

            pp_classifier.got_best_results()
            skp_classifier.got_best_results()
            fpt_classifier.got_best_results()

        pp_classifier.end_of_iteration(y_train)
        skp_classifier.end_of_iteration(y_train)
        fpt_classifier.end_of_iteration(y_train)

        # iteration is 0-based; report the number of completed iterations so
        # the bar can reach max_value.
        pbar.update(iteration + 1)

    pbar.finish()

    # peak testing
    # NOTE(review): the extra True argument presumably makes each classifier
    # use its best parameters -- confirm in the classifier implementations.
    pp_preds = pp_classifier.classify_samples(X_test, True)
    skp_preds = skp_classifier.classify_samples(X_test, True)
    fpt_preds = fpt_classifier.classify_samples(X_test, True)

    test_preds = [
        get_prediction(skp_pred[1], pp_pred[1], fpt_pred[1])
        for skp_pred, pp_pred, fpt_pred in zip(skp_preds, pp_preds, fpt_preds)
    ]

    ######## GET SCORES ########
    # get confusion matrix (rows/columns ordered as DATA_LABELS)
    cfmx = confusion_matrix(y_test_labels, test_preds, labels=DATA_LABELS)

    # get acc score
    acc_score = accuracy_score(y_test_labels, test_preds)

    bal_acc = balanced_accuracy_score(y_test_labels, test_preds)

    f1 = f1_score(y_test_labels, test_preds, average="macro")

    precision = precision_score(y_test_labels, test_preds, average="macro")

    print(f"\nSplit {split} test Scores: F1: {f1}  BA: {bal_acc}  ACC: {acc_score}")

    accumulated_cfmxs[split] = copy.deepcopy(cfmx)

    accumulated_test_metrics[split] = {
        "f1": f1,
        "acc": acc_score,
        "ba": bal_acc,
        "precision": precision,
    }

    accumulated_test_results[split] = copy.deepcopy(test_preds)

    accumulated_best_params[split] = {
        "PP": copy.deepcopy(pp_classifier.best_params),
        "SKP": copy.deepcopy(skp_classifier.best_params),
        "FPT": copy.deepcopy(fpt_classifier.best_params),
    }

print("\n")

overall_end_time = datetime.datetime.now()


## Display Metrics


In [None]:
# Average the per-fold test metrics.
num_scored_folds = len(accumulated_test_metrics)

# These four names hold the running totals over all folds (as before); the
# division by the fold count happens when printing.
mean_f1 = sum(fold_metrics["f1"] for fold_metrics in accumulated_test_metrics.values())
mean_ba = sum(fold_metrics["ba"] for fold_metrics in accumulated_test_metrics.values())
mean_precision = sum(
    fold_metrics["precision"] for fold_metrics in accumulated_test_metrics.values()
)
mean_acc = sum(fold_metrics["acc"] for fold_metrics in accumulated_test_metrics.values())

if num_scored_folds:
    print("Mean Test F1: ", mean_f1 / num_scored_folds)
    print("Mean Test BA: ", mean_ba / num_scored_folds)
    print("Mean Test Acc: ", mean_acc / num_scored_folds)
    print("Mean Test Precision: ", mean_precision / num_scored_folds)
else:
    # Nothing was recorded (every fold was skipped); say so instead of
    # failing with a ZeroDivisionError.
    print("No folds were evaluated; no mean test metrics to report.")

# Dump the best parameters stored for every fold, one peak type at a time.
for split, params_by_peak in accumulated_best_params.items():
    print(f"\nParams from split {split}:")

    for peak, peak_params in params_by_peak.items():
        print(f"\nFor peak type {peak}:")

        for param in peak_params.keys():
            print(f"{param}, value: {peak_params[param]}")

# testing conf matrix: element-wise mean of the per-fold confusion matrices
num_labels = len(DATA_LABELS)
mean_cfmx = np.zeros((num_labels, num_labels))
for fold_cfmx in accumulated_cfmxs.values():
    mean_cfmx += fold_cfmx

# max(..., 1) leaves an all-zero matrix (rather than NaNs) when no fold was
# recorded.
mean_cfmx = mean_cfmx / max(len(accumulated_cfmxs), 1)

plt.figure(figsize=(10, 7))
plt.title(label="Turbidity Peak Detection Confusion Matrix")

sn.set(font_scale=1.5)

# Each cell is annotated with its mean count and, underneath, its share of
# the ground-truth row it belongs to.
group_counts = ["{0:0.0f}\n".format(value) for value in mean_cfmx.flatten()]

# Row-normalise. A class with no test samples has a zero row total; its cells
# are reported as 0.00% instead of dividing by zero and printing "nan%".
row_totals = mean_cfmx.sum(axis=1, keepdims=True)
percentages = np.divide(
    mean_cfmx,
    row_totals,
    out=np.zeros_like(mean_cfmx),
    where=row_totals != 0,
).flatten()
group_percentages = ["{0:.2%}".format(value) for value in percentages]

box_labels = [f"{v2}{v3}".strip() for v2, v3 in zip(group_counts, group_percentages)]
box_labels = np.asarray(box_labels).reshape(mean_cfmx.shape[0], mean_cfmx.shape[1])

plot = sn.heatmap(
    mean_cfmx.astype("float"),
    annot=box_labels,
    xticklabels=DATA_LABELS,
    yticklabels=DATA_LABELS,
    fmt="",
    cbar=False,
    annot_kws={"size": 14},
)

plt.xlabel("Predictions")
plt.ylabel("Ground Truths")

# Write the image before showing it, so the file does not depend on what the
# active backend does with the figure once it has been displayed.
plot.get_figure().savefig(
    "Anomaly_Detection/Multiclass_Detection/Experimental_Results/turb/conf-matrix.png"
)

plt.show()


In [None]:
# Pickle the best parameters, the test predictions and the test metrics
# collected per split, each to its own file.
pickle_targets = (
    (
        "Anomaly_Detection/Multiclass_Detection/Experimental_Results/turb/best_params.pkl",
        accumulated_best_params,
    ),
    (
        "Anomaly_Detection/Multiclass_Detection/Experimental_Results/turb/test_results.pkl",
        accumulated_test_results,
    ),
    (
        "Anomaly_Detection/Multiclass_Detection/Experimental_Results/turb/test_metrics.pkl",
        accumulated_test_metrics,
    ),
)

for pickle_path, payload in pickle_targets:
    # The with-statement closes the file on exit, so no explicit close() call
    # is needed.
    with open(pickle_path, "wb") as pck_file:
        pickle.dump(payload, pck_file)


### Statistics from individual classifiers


In [None]:
# Print the best accuracy and best F1 score stored on each classifier.
classifiers_by_peak = (
    ("SKP", skp_classifier),
    ("PP", pp_classifier),
    ("FPT", fpt_classifier),
)

for peak_name, classifier in classifiers_by_peak:
    print(f"{peak_name} CLASSIFIER INFO:")
    print(f"ACC: {classifier.best_acc!s}")
    print(f"f1: {classifier.best_f1_score!s}")
    print("\n")
