# Deep Learning Classification of Anomaly Peaks with 1D resnet
Using normal train/test split

In [1]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sys
from sklearn import preprocessing
from resnet import ResNet1D
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from torchsummary import summary
from sklearn.metrics import confusion_matrix, classification_report, f1_score, balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sn

sys.path.insert(1, "../")

from datasets import fdomDataset, fdomAugOnlyDataset


In [2]:
# Hyperparams
WINDOW_SIZE = 15 # the size of each data segment
TEST_SIZE = 0.10
SEED = 42
BATCH_SIZE = 32

In [3]:
# Paths to data files
fdom_raw_data = (
    "../Data/converted_data/julian_format/fDOM_raw_10.1.2011-9.4.2020.csv"
)
stage_raw_data = "../Data/converted_data/julian_format/stage_10.1.11-1.1.19.csv"
turb_raw_data = (
    "../Data/converted_data/julian_format/turbidity_raw_10.1.2011_9.4.2020.csv"
)

fdom_labeled = "../Data/labeled_data/ground_truths/fDOM/fDOM_all_julian_0k-300k.csv"

fdom_raw_augmented = "../Data/augmented_data/fdom/unlabeled/unlabeled_fdom.csv"
fdom_labeled_augmented = "../Data/augmented_data/fdom/labeled/labeled_fdom_peaks.csv"

turb_augmented_raw_data = "../Data/augmented_data/fdom/unlabeled/unlabeled_turb.csv"

stage_augmented_data_fn = "../Data/augmented_data/fdom/unlabeled/unlabeled_stage.csv"

fdom_fpt_lookup_path = "../Data/augmented_data/fdom/fpt_lookup.csv"
fdom_fsk_lookup_path = "../Data/augmented_data/fdom/fsk_lookup.csv"

In [4]:
# get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cpu


## Create dataset and dataloaders

In [5]:
# create dataset
classes = ["NAP", "FSK", "FPT", "PLP", "PP", "SKP"]
le = preprocessing.LabelEncoder()

targets = le.fit_transform(classes)

# # train on class balanced data
train_dataset = fdomAugOnlyDataset(
    le,
    fdom_raw_data,
    stage_raw_data,
    turb_raw_data,
    fdom_labeled,
    fdom_raw_augmented,
    stage_augmented_data_fn,
    turb_augmented_raw_data,
    fdom_labeled_augmented,
    window_size=WINDOW_SIZE,
    fpt_lookup_filename=fdom_fpt_lookup_path,
    fsk_lookup_filename=fdom_fsk_lookup_path,
)

# test on unbalanced data
test_dataset = fdomDataset(
    le,
    fdom_raw_data,
    stage_raw_data,
    turb_raw_data,
    fdom_labeled,
    # fdom_raw_augmented,
    # stage_augmented_data_fn,
    # turb_augmented_raw_data,
    # fdom_labeled_augmented,
    window_size=WINDOW_SIZE,
    fpt_lookup_filename=fdom_fpt_lookup_path,
    fsk_lookup_filename=fdom_fsk_lookup_path,
)


TypeError: fdomAugOnlyDataset.__init__() missing 4 required positional arguments: 'fdom_augmented_dir', 'stage_augmented_dir', 'turb_augmented_dir', and 'fdom_labeled_aug_dir'

### Split into training/testing
This should not be the final iteration, this is just to get initial results.

The following cell is a custom collate function which pads different length objects into one shape, allowing variable sized data.

In [None]:
# FIXME: this is broken, not batching correctly
# the batch is a tuple...
def collate_fn_padd(batch):
    '''
    Padds batch of variable length

    note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    '''
    ## get sequence lengths
    lengths = torch.tensor([ len(t) for t in batch ]).to(device)
    print(batch[0][0].shape)
    ## padd
    batch = [ torch.Tensor(t).to(device) for t in batch ] # this is the broken line, because dim 1 is variable length
    batch = torch.nn.utils.rnn.pad_sequence(batch)
    ## compute mask
    mask = (batch != 0).to(device)
    return batch, lengths, mask

In [None]:
# split data into training / testing
# train_size = int(0.85 * len(dataset))
# test_size = len(dataset) - train_size
# train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# create dataloaders
trainloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_padd
)
testloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn_padd
)


In [None]:
# init model
model = ResNet1D(
    in_channels=WINDOW_SIZE * 2 + 1,
    base_filters=64,
    kernel_size=16,
    stride=2,
    n_block=48,
    groups=1,  # check this
    n_classes=len(classes),
    downsample_gap=6,
    increasefilter_gap=12,
    verbose=False,
).to(device)

model = model.float()

# print a model summary
print(summary(model, (WINDOW_SIZE * 2 + 1, 6)))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1                [-1, 64, 6]          31,808
   MyConv1dPadSame-2                [-1, 64, 6]               0
       BatchNorm1d-3                [-1, 64, 6]             128
              ReLU-4                [-1, 64, 6]               0
            Conv1d-5                [-1, 64, 6]          65,600
   MyConv1dPadSame-6                [-1, 64, 6]               0
       BatchNorm1d-7                [-1, 64, 6]             128
              ReLU-8                [-1, 64, 6]               0
           Dropout-9                [-1, 64, 6]               0
           Conv1d-10                [-1, 64, 6]          65,600
  MyConv1dPadSame-11                [-1, 64, 6]               0
       BasicBlock-12                [-1, 64, 6]               0
      BatchNorm1d-13                [-1, 64, 6]             128
             ReLU-14                [-1

## Init loss and optimizer

In [None]:
# Optimizer/criterion
optimizer = optim.Adam(model.parameters(), lr=1e-3)

criterion = nn.CrossEntropyLoss().to(device)
all_loss = []


## Train Model

In [None]:
prog_bar = tqdm(trainloader, desc='Training', leave=False)
for i, batch in enumerate(prog_bar):
    x = batch[0].to(device)

    # squeeze y to flatten predictions into 1d tensor
    y = batch[1].squeeze().to(device)

    pred = model(x.float())

    loss = criterion(pred, y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    all_loss.append(loss.item())


  batch = [ torch.Tensor(t).to(device) for t in batch ] # this is the broken line, because dim 1 is variable length
                                               

(4, 2)




ValueError: expected sequence of length 4 at dim 1 (got 1)

## Test Model

In [None]:
# Test model
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

y_pred = []
y_true = []

prog_bar = tqdm(testloader, desc="Testing", leave=False)
with torch.no_grad():
    for i, batch in enumerate(prog_bar):
        x = batch[0].to(device)

        y = batch[1].squeeze().to(device)

        outs = model(x.float())

        _, preds = torch.max(outs, 1)

        for label, prediction in zip(y, preds):
            # convert label and prediction to current vals
            label = le.inverse_transform([label])[0]
            prediction = le.inverse_transform([prediction])[0]

            y_pred.append(prediction)
            y_true.append(label)

            if label == prediction:
                correct_pred[label] += 1  # this may not work
            total_pred[label] += 1

for classname, correct_count in correct_pred.items():
    # because of unbalanced data, we need to not print out any classes that didn't have any labels
    if total_pred[classname] != 0:
        accuracy = 100 * float(correct_count) / total_pred[classname]
        print(f"Accuracy for class: {classname:5s} is {accuracy:.1f} %")
    else:
        accuracy = 0.0
        print(f"CLASS NOT IN TEST BATCH: {classname:5s}")

# build conf matrix
conf = confusion_matrix(y_true, y_pred, labels=classes)
print(conf)

# review the classnames here
df_cm = pd.DataFrame(
    conf / conf.sum(axis=1)[:, np.newaxis], index=[i for i in classes], columns=[i for i in classes]
)

# classification report
acc_report = classification_report(y_true, y_pred)
print(acc_report)

# display conf matrix
plt.figure(figsize=(12, 7))

plt.xlabel("Ground Truths")
plt.ylabel("Predictions")
plt.title(label="fDOM Peak Detection Ratio Confusion Matrix")

sn.heatmap(df_cm, annot=True)


## Display metrics

In [None]:
# TODO: implement displaying metrics
f1_score = f1_score(y_true, y_pred, average=None)
bal_acc = balanced_accuracy_score(y_true, y_pred)

print(f1_score)
print(bal_acc)