In [1]:
import sys
import scipy.io
import scipy.signal as sig
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import math
import os
from torch.utils.data import WeightedRandomSampler
from torch.optim.lr_scheduler import StepLR, LambdaLR, SequentialLR
from DataHandlers.DiagEnum import DiagEnum
import DataHandlers.DiagEnum
import DataHandlers.SAFERDataset as SAFERDataset

sys.modules["SAFERDataset"] = SAFERDataset
sys.modules["DiagEnum"] = DataHandlers.DiagEnum

### Load the SAFER dataset

In [3]:
feas2_pt_data, feas2_ecg_data = SAFERDataset.load_feas_dataset(2, "dataframe_reload")

[Errno 2] No such file or directory: 'D:\\2022_23_DSiromani\\Feas2\\ECGs/filtered_dataframe_reload.pk'
Failed to load from pickle, regenerating files
Reading file ECGs/000000/saferF2_000002

  ecg_data = pd.read_csv(os.path.join(dataset_path, "rec_data_anon.csv"))
  for ind, file_path in ecg_data["file_path"].iteritems():
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ecg_data["data"].loc[ind] = record.p_signal[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ecg_data["adc_gain"].loc[ind] = record.adc_gain[0]


Reading file ECGs/023000/saferF2_023259

In [None]:
# Fill in the undecided data with zenicor labels - doesnt seem to help
feas2_ecg_data["class_index"] = feas2_ecg_data["class_index"].where(feas2_ecg_data["measDiag"] != DiagEnum.Undecided, feas2_ecg_data["tag_orig_Poor_Quality"])

In [4]:
feas2_pt_data["noRecs"] = feas2_ecg_data["ptID"].value_counts()
feas2_pt_data["noHQrecs"] = feas2_ecg_data[feas2_ecg_data["class_index"] == 0]["ptID"].value_counts()

In [5]:
feas2_ecg_data["class_index"].value_counts()

0    22794
1      465
Name: class_index, dtype: int64

In [6]:
(feas2_pt_data["noRecs"] - feas2_pt_data["noHQrecs"]).sum()

465.0

### Initialise the model

In [8]:
import torch.nn as nn
import torch
from torchvision.ops import sigmoid_focal_loss

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [10]:
# Check cuda
print(torch.cuda.is_available())

True


### Create Dataloaders

In [11]:
# Onehot encoding
from torch.utils.data import Dataset, DataLoader

class Dataset(torch.utils.data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, dataset):
        'Initialization'
        self.dataset = dataset

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.dataset.index)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        row = self.dataset.iloc[index]

        X = row["data"]
        y = row["class_index"]
        ind = row.name

        return X, y, ind

In [120]:
# For SAFER data
# Split train and test data according to each patient
def make_SAFER_dataloaders(pt_data, ecg_data, test_frac, val_frac, only_clean_training=True):
    pt_data["noLQrecs"] = pt_data["noRecs"] - pt_data["noHQrecs"]  # for Feas1 this might include stuff flagged by zenicor as noisy?
    train_patients = []
    test_patients = []
    val_patients = []

    test_val_frac = test_frac + val_frac
    val_second_frac = val_frac/test_val_frac

    for val, df in pt_data.groupby("noLQrecs"):
        print(f"processing {val}")
        print(f"number of patients {len(df.index)}")

        n = math.floor(len(df.index) * test_val_frac)
        res = ((len(df.index) * test_val_frac) - n)/test_val_frac
        n += np.random.binomial(res, test_val_frac)
        test_val = df.sample(n)

        n = math.floor(len(test_val.index) * val_second_frac)
        res = ((len(test_val.index) * val_second_frac) - n)/val_second_frac
        n += np.random.binomial(res, val_second_frac)

        val = test_val.sample(n)
        val_patients.append(val)
        test_patients.append(test_val[~test_val["ptID"].isin(val["ptID"])])
        train_patients.append(df[~df["ptID"].isin(test_val["ptID"])])


    train_pt_df = pd.concat(train_patients)
    test_pt_df = pd.concat(test_patients)
    val_pt_df = pd.concat(val_patients) if len(val_patients) > 0 else pd.DataFrame()

    print(f"Test high quality: {test_pt_df['noHQrecs'].sum()} low quality: {test_pt_df['noLQrecs'].sum()} ")
    print(f"Train high quality: {train_pt_df['noHQrecs'].sum()} low quality: {train_pt_df['noLQrecs'].sum()} ")
    print(f"Validation high quality: {val_pt_df['noHQrecs'].sum()} low quality: {val_pt_df['noLQrecs'].sum()} ")

    train_dataloader = None
    test_dataloader = None
    val_dataloader = None

    train_dataset = None
    test_dataset = None
    val_dataset = None

    if not train_pt_df.empty:
        # get ECG datasets
        train_dataset = ecg_data[(ecg_data["ptID"].isin(train_pt_df["ptID"]))] # & (ecg_data["measDiag"] != DiagEnum.Undecided)]
        # Normalise
        train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())

        class_counts = torch.tensor(train_dataset["class_index"].value_counts().values.astype(np.float32))
        a = class_counts[0]/(class_counts[0] + class_counts[1])

        sampler = WeightedRandomSampler(torch.tensor([a, 1-a]), 2)

        if only_clean_training:
            torch_dataset_train = Dataset(train_dataset[train_dataset["class_index"] == 0])
        else:
            torch_dataset_train = Dataset(train_dataset)

        train_dataloader = DataLoader(torch_dataset_train, batch_size=32, pin_memory=True, sampler=sampler)

    if not test_pt_df.empty:
        test_dataset = ecg_data[(ecg_data["ptID"].isin(test_pt_df["ptID"])) & (ecg_data["measDiag"] != DiagEnum.Undecided)]
        test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())
        torch_dataset_test = Dataset(test_dataset)
        test_dataloader = DataLoader(torch_dataset_test, batch_size=32, shuffle=True, pin_memory=True)


    if not val_pt_df.empty:
        val_dataset = ecg_data[(ecg_data["ptID"].isin(val_pt_df["ptID"])) & (ecg_data["measDiag"] != DiagEnum.Undecided)]
        val_dataset["data"] = (val_dataset["data"] - val_dataset["data"].map(lambda x: x.mean()))/val_dataset["data"].map(lambda x: x.std())
        torch_dataset_val = Dataset(val_dataset)
        val_dataloader = DataLoader(torch_dataset_val, batch_size=32, shuffle=True, pin_memory=True)

    return train_dataloader, test_dataloader, val_dataloader, train_dataset, test_dataset, val_dataset

train_dataloader, test_dataloader, val_dataloader, train_dataset, test_dataset, val_dataset = make_SAFER_dataloaders(feas2_pt_data, feas2_ecg_data, test_frac=0.15, val_frac=0.25)

processing 0.0
number of patients 211
processing 1.0
number of patients 23
processing 2.0
number of patients 10
processing 3.0
number of patients 6
processing 4.0
number of patients 6
processing 5.0
number of patients 7
processing 6.0
number of patients 4
processing 7.0
number of patients 3
processing 9.0
number of patients 4
processing 10.0
number of patients 1
processing 11.0
number of patients 1
processing 12.0
number of patients 1
processing 13.0
number of patients 1
processing 14.0
number of patients 1
processing 15.0
number of patients 1
processing 17.0
number of patients 1
processing 18.0
number of patients 1
processing 22.0
number of patients 1
processing 27.0
number of patients 1
processing 28.0
number of patients 1
processing 34.0
number of patients 1
processing 43.0
number of patients 1
Test high quality: 2112.0 low quality: 16.0 
Train high quality: 20610.0 low quality: 449.0 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())


In [73]:
train_dataset["class_index"].value_counts()

0    20551
1      443
Name: class_index, dtype: int64

In [74]:
test_dataset["class_index"].value_counts()

0    2157
1      22
Name: class_index, dtype: int64

In [None]:
### Use only the labelled portions of feas2 data for training and testing

feas2_ecg_data_no_undecided = feas2_ecg_data[feas2_ecg_data["measDiag"] != DiagEnum.Undecided]

feas2_pt_data_no_undecided = feas2_pt_data.copy()
feas2_pt_data_no_undecided["noRecs"] = feas2_ecg_data_no_undecided["ptID"].value_counts()
feas2_pt_data_no_undecided["noHQRecs"] = feas2_ecg_data_no_undecided[feas2_ecg_data_no_undecided["measDiag"] != DiagEnum.PoorQuality]["ptID"].value_counts()

train_dataloader, test_dataloader, train_dataset, test_dataset = make_SAFER_dataloaders(feas2_pt_data_no_undecided, feas2_ecg_data_no_undecided, test_frac=0.2, only_clean_training=False)

In [None]:
feas2_ecg_data_unbalanced = feas2_ecg_data_unbalanced.drop(train_ecgs.index)
feas2_ecg_data_unbalanced["class_index"] = feas2_ecg_data_unbalanced["measDiag"].map(lambda x: int(x == DiagEnum.PoorQuality))

torch_dataset_test_unbalanced = Dataset(feas2_ecg_data_unbalanced)
test_unbalanced_dataloader = DataLoader(torch_dataset_test_unbalanced, batch_size=32, shuffle=True, pin_memory=True)

### Prepare for training

In [121]:
# Now import a model
import Models.NoiseCNN
import importlib
importlib.reload(Models.NoiseCNN)
from Models.NoiseCNN import CNN

In [122]:
if torch.cuda.is_available():
    print("Using Cuda")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

Using Cuda


In [127]:
num_epochs = 15
model = CNN().to(device)

# Use weightings to handle class imbalance

class_counts = torch.tensor(train_dataset["class_index"].value_counts().values.astype(np.float32))
a = class_counts[0]/(class_counts[0] + class_counts[1])

class binary_focal_loss(nn.Module):

    def __init__(self, _alpha, _gamma):
        super(binary_focal_loss, self).__init__()
        self.BCE_loss = torch.nn.BCEWithLogitsLoss(reduction="none")
        self.alpha = _alpha
        self.gamma = _gamma

    def forward(self, pred, targets):
        bce = self.BCE_loss(pred, targets)
        prob_correct = torch.exp(-bce)
        loss_unweighted = (1.0 - prob_correct)**self.gamma * bce
        loss_weighted = torch.where(targets == 1,
                           loss_unweighted * self.alpha,
                           loss_unweighted * (1-self.alpha))
        return torch.mean(loss_weighted)

loss_func = binary_focal_loss(0.5, 0)

sampler = WeightedRandomSampler(torch.tensor([a, 1-a]), 2)

optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
scheduler = StepLR(optimizer, step_size=6, gamma=0.5)

In [128]:
import copy
model = model.to(device)

def train(model):
    best_test_loss = 100
    best_epoch = -1
    best_model = copy.deepcopy(model).cpu()

    losses = []

    for epoch in range(num_epochs):
        total_loss = 0
        print(f"starting epoch {epoch} ...")
        # Train
        num_batches = 0
        model.train()
        for i, (signals, labels, _) in enumerate(train_dataloader):
            signals = torch.unsqueeze(signals.to(device), 1).float()
            labels = labels.float()

            optimizer.zero_grad()
            output = model(signals).to("cpu")[:, 0]
            # print("Output and labels")
            # print(output)
            # print(labels)
            loss = loss_func(output, labels)
            # print(f"Loss: {loss}")
            loss.backward()
            optimizer.step()
            num_batches += 1
            total_loss += float(loss)

        print(f"Epoch {epoch} finished with average loss {total_loss/num_batches}")
        print("Testing ...")
        # Test
        num_test_batches = 0
        test_loss = 0
        with torch.no_grad():
            model.eval()
            for i, (signals, labels, _) in enumerate(test_dataloader):
                signals = torch.unsqueeze(signals.to(device), 1).float()
                labels = labels.float()
                output = model(signals).to("cpu")[:, 0]
                # print("Output and labels")
                # print(output)
                # print(labels)
                loss = loss_func(output, labels)
                # print(f"Loss: {loss}")
                test_loss += float(loss)
                num_test_batches += 1

        print(f"Average test loss: {test_loss/num_test_batches}")
        losses.append([total_loss/num_batches, test_loss/num_test_batches])

        if test_loss/num_test_batches < best_test_loss:
            best_model = copy.deepcopy(model).cpu()
            best_test_loss = test_loss/num_test_batches
            best_epoch = epoch
        else:
            if best_epoch + 10 <= epoch:
                return best_model, losses

        scheduler.step()


    return best_model, losses

model, losses = train(model)
model = model.to(device)

starting epoch 0 ...
Epoch 0 finished with average loss 0.31923335790634155
Testing ...
Average test loss: 0.3471134379506111
starting epoch 1 ...
Epoch 1 finished with average loss 0.3120512366294861
Testing ...
Average test loss: 0.3470748960971832
starting epoch 2 ...
Epoch 2 finished with average loss 0.31248217821121216
Testing ...
Average test loss: 0.34730226546525955
starting epoch 3 ...
Epoch 3 finished with average loss 0.3066568076610565
Testing ...
Average test loss: 0.3472636640071869
starting epoch 4 ...
Epoch 4 finished with average loss 0.30651408433914185
Testing ...
Average test loss: 0.34794066846370697
starting epoch 5 ...
Epoch 5 finished with average loss 0.3015957772731781
Testing ...
Average test loss: 0.34653452783823013
starting epoch 6 ...
Epoch 6 finished with average loss 0.3187685012817383
Testing ...
Average test loss: 0.347627155482769
starting epoch 7 ...
Epoch 7 finished with average loss 0.308758407831192
Testing ...
Average test loss: 0.3467519730329

In [None]:
# Save a model
torch.save(model.state_dict(), "TrainedModels/CNN_AlexNet_LSTM_Bidirectional_Final_States.pt")
train_dataset.to_pickle("TrainedModels/CNN_AlexNet_LSTM_Bidirectional_Final_States_feas2_train_set.pk")

In [None]:
# Load a model
model = CNN().to(device)
model.load_state_dict(torch.load("TrainedModels/CNN_AlexNet_LSTM_Bidirectional_Final_States.pt", map_location=device))

In [None]:
# Load a training set and get a test set from its exclusion
train_dataset = pd.read_pickle("TrainedModels/CNN_AlexNet_LSTM_Bidirectional_Final_States_feas2_train_set.pk")

train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())
torch_dataset_train = Dataset(train_dataset)
train_dataloader = DataLoader(torch_dataset_train, batch_size=32, shuffle=True, pin_memory=True)

test_pt_df = feas2_pt_data[~feas2_pt_data["ptID"].isin(train_dataset["ptID"])]

if not test_pt_df.empty:
    test_dataset = test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided]
    test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())
    torch_dataset_test = Dataset(test_dataset)
    test_dataloader = DataLoader(torch_dataset_test, batch_size=32, shuffle=True, pin_memory=True)

In [None]:
# Garbage collection - in case of CUDA out of memory error
import gc
model = None
signals = None
labels = None
gc.collect() # Python thing
torch.cuda.empty_cache()

### Model testing

In [125]:
test_dataset["prediction"] = None

def get_predictions(model, dataloader, dataset):
    model.eval()

    true_labels = []
    predictions = []

    outputs = []
    inds = []

    with torch.no_grad():
        for i, (signals, labels, ind) in enumerate(dataloader):
            signals = torch.unsqueeze(signals.to(device), 1).float()
            labels = labels.detach().numpy()
            true_labels.append(labels)

            optimizer.zero_grad()
            output = model(signals).detach().to("cpu").numpy()

            prediction = output # np.argmax(output, axis=-1)
            predictions.append(prediction)

            for i, o in zip(ind, output):
                outputs.append(o[0])
                inds.append(int(i))

    dataset["prediction"] = pd.Series(data=outputs, index=inds)

    predictions = np.concatenate(predictions)
    true_labels = np.concatenate(true_labels)

    return predictions, true_labels

predictions, true_labels = get_predictions(model, test_dataloader, test_dataset)
conf_mat = confusion_matrix(true_labels, predictions > 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["prediction"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


In [126]:
#ConfusionMatrixDisplay.from_predictions(true_labels, predictions, display_labels=["sufficint quality", "insufficient quality"], cmap="inferno")

# Same as the below function (as described in CinC)
def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

"""
def bin_F1_score(conf_mat, ind):
    return conf_mat[ind, ind]/(conf_mat[ind, ind] + 0.5 * (conf_mat[0, 1] + conf_mat[1, 0]))

print(f"Normal F1: {bin_F1_score(conf_mat, 0)}")
print(f"Noisy F1: {bin_F1_score(conf_mat, 1)}")
"""
print("Confusion matrix:")
print(conf_mat)

print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Noisy F1: {F1_ind(conf_mat, 1)}")

Confusion matrix:
[[34 22]
 [35 23]]
Sensitivity: 0.39655172413793105
Specificity: 0.6071428571428571
Normal F1: 0.544
Noisy F1: 0.44660194174757284


In [None]:
from sklearn.metrics import precision_recall_curve
p, r, d = precision_recall_curve(true_labels, predictions)

point = np.argmin(np.abs(d - 0.5))
p_point = p[point]
r_point = r[point]

fig = go.Figure()
fig.add_trace(go.Scatter(x=r, y=p, hovertext=[f"decision boundary: {x:.2f}" for x in d]))

fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig.show()

In [None]:
plot_dataset = test_dataset[(test_dataset["prediction"] > 0) & (test_dataset["class_index"] == 0)]

from matplotlib.ticker import AutoMinorLocator

def plot_ecg(x, fs=500):
    sample_len = x.shape[0]
    time_axis = np.arange(sample_len)/fs

    y_step = 2

    cuts = [0, sample_len//3, (sample_len*2)//3, sample_len-1]

    fig, ax = plt.subplots(3, 1, figsize=(8, 6))
    for j in range(3):
        ax[j].plot(time_axis[cuts[j]:cuts[j+1]], x[cuts[j]:cuts[j+1]])
        ax[j].set_xlabel("Time")
        ax[j].set_xlim((time_axis[cuts[j]], time_axis[cuts[j+1]]))

        t_s = time_axis[cuts[j]]
        t_f = time_axis[cuts[j+1]]
        time_ticks = np.arange(t_s - t_s%0.2, t_f + (0.2 - t_f%0.2), 0.2)
        decimal_labels = ~np.isclose(time_ticks, np.round(time_ticks))
        time_labels = np.round(time_ticks).astype(int).astype(str)
        time_labels[decimal_labels] = ""

        ax[j].set_xticks(time_ticks, labels=time_labels)
        ax[j].set_yticks(np.arange(x.min()-y_step, x.max()+y_step, y_step))

        # ax[j].xaxis.set_major_formatter(plt.NullFormatter())
        # ax[j].yaxis.set_major_formatter(plt.NullFormatter())

        ax[j].xaxis.set_minor_locator(AutoMinorLocator(5))
        ax[j].yaxis.set_minor_locator(AutoMinorLocator(5))

        ax[j].set_ylim((x.min()-y_step, x.max()+y_step))
        ax[j].set_xlim((t_s, t_f))

        ax[j].grid(which='major', linestyle='-', linewidth='0.2', color='black')
        ax[j].grid(which='minor', linestyle='-', linewidth='0.2', color='lightgray')

    fig.tight_layout()
    # plt.savefig("test_ecg_plot.png", dpi=300)
    # plt.show()

c = DiagEnum.CannotExcludePathology

for _, ecg in plot_dataset[plot_dataset["measDiag"].map(lambda x: x.value) == 3].sample(frac=1).iterrows():
    print(ecg[["ptDiag", "tag_orig_Poor_Quality", "poss_AF_tag", "measDiag", "prediction"]])
    plot_ecg(ecg["data"], 300)
    plt.show()

In [None]:
# Used to generate figures for the report

def sigmoid(x):
    return 1/ (1 + np.exp(-x))

def plot_ecg_section_examples(xs, ranges, titles, fs=300):
    fig, ax = plt.subplots(len(xs), 1, figsize=(6, 4))

    for j, (x, r, t) in enumerate(zip(xs, ranges, titles)):
        sample_len = r[1] - r[0]
        time_axis = np.arange(sample_len)/fs

        ax[j].plot(time_axis, x[r[0]:r[1]])
        ax[j].set_xlabel("Time")
        ax[j].set_xlim((time_axis[0], time_axis[-1]))

        t_s = time_axis[0]
        t_f = time_axis[-1]
        time_ticks = np.arange(t_s - t_s%0.2, t_f + (0.2 - t_f%0.2), 0.2)
        decimal_labels = ~np.isclose(time_ticks, np.round(time_ticks))
        time_labels = np.round(time_ticks).astype(int).astype(str)
        time_labels[decimal_labels] = ""
        ax[j].set_xticks(time_ticks, time_labels)

        ax[j].set_title(t)

        ax[j].yaxis.set_major_formatter(plt.NullFormatter())

        ax[j].xaxis.set_minor_locator(AutoMinorLocator(5))
        ax[j].yaxis.set_minor_locator(AutoMinorLocator(5))

        ax[j].grid(which='major', linestyle='-', linewidth='0.5', color='black')
        ax[j].grid(which='minor', linestyle='-', linewidth='0.5', color='lightgray')

    fig.tight_layout()
    plt.show()
    # plt.savefig("TMRFigures/cnn_false_positive_examples.png")

ecg_ind_list = [1248, 12804]
ranges = [(0, 3010), (3000, 6010)]

xs = plot_dataset.loc[ecg_ind_list]["data"].tolist()

titles = plot_dataset.loc[ecg_ind_list].apply(lambda x: f"{x['measDiag'].name}, p(noisy) = {sigmoid(x['prediction']):.3f}", axis=1)   # ["measDiag"].map(lambda x: x.name).tolist()
print(len(titles))

plot_ecg_section_examples(xs, ranges, titles)

### Perform cross validation

In [86]:
# Cross Validation dataset construction for SAFER data
# Split train and test data according to each patient
feas2_pt_data["noLQrecs"] = feas2_pt_data["noRecs"] - feas2_pt_data["noHQrecs"]

num_folds = 5
test_pt_folds = [[] for _ in range(num_folds)]

sorted_pts = feas2_pt_data.sort_values("noLQrecs", axis=0)
group_num = 0

# Go around the folds and assign patients to each
for _, pt in sorted_pts.iterrows():
    test_pt_folds[group_num].append(pt)
    group_num = (group_num + 1) % num_folds

test_pt_folds = [pd.DataFrame(fold) for fold in test_pt_folds]
train_pt_folds = [feas2_pt_data[~feas2_pt_data["ptID"].isin(fold["ptID"])] for fold in test_pt_folds]

conf_mats = []

num_epochs = 10

for i, (train_pt_df, test_pt_df) in enumerate(zip(train_pt_folds, test_pt_folds)):
    print(f"Fold {i}")
    train_df = feas2_ecg_data[feas2_ecg_data["ptID"].isin(train_pt_df["ptID"])]
    test_df = feas2_ecg_data[(feas2_ecg_data["ptID"].isin(test_pt_df["ptID"])) & (feas2_ecg_data["measDiag"] != DiagEnum.Undecided)]

    torch_dataset_train = Dataset(train_df)
    torch_dataset_test = Dataset(test_df)

    print(train_df["class_index"].value_counts())
    print(test_df["class_index"].value_counts())

    train_dataloader = DataLoader(torch_dataset_train, batch_size=32, shuffle=True, pin_memory=True)
    test_dataloader = DataLoader(torch_dataset_test, batch_size=32, shuffle=True, pin_memory=True)

    model = CNN().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00002)
    scheduler = StepLR(optimizer, step_size=8, gamma=0.5)

    num_batches = len(train_dataloader)
    num_test_batches = len(test_dataloader)

    model, losses = train(model)
    model = model.to(device)

    predictions, true_labels = get_predictions(model, test_dataloader, test_df)
    conf_mat = confusion_matrix(true_labels, predictions > 0)

    print(conf_mat)

    conf_mats.append(conf_mat)

Fold 0
0    18243
1      364
Name: class_index, dtype: int64
1    101
0     96
Name: class_index, dtype: int64
starting epoch 0 ...
Epoch 0 finished with average loss 0.0072191464426668035
Testing ...
Average test loss: 0.04729761076825006
starting epoch 1 ...
Epoch 1 finished with average loss 0.006027607661687939
Testing ...
Average test loss: 0.04210886039904186
starting epoch 2 ...
Epoch 2 finished with average loss 0.004987984370437044
Testing ...
Average test loss: 0.036189168957727294
starting epoch 3 ...
Epoch 3 finished with average loss 0.0042832474935563004
Testing ...
Average test loss: 0.035378552973270416
starting epoch 4 ...
Epoch 4 finished with average loss 0.003657153038079194
Testing ...
Average test loss: 0.03588196182889598
starting epoch 5 ...
Epoch 5 finished with average loss 0.0034248441484951994
Testing ...
Average test loss: 0.0334846382694585
starting epoch 6 ...
Epoch 6 finished with average loss 0.0029262187092603405
Testing ...
Average test loss: 0.038977

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


[[77 19]
 [ 4 97]]
Fold 1
0    18128
1      429
Name: class_index, dtype: int64
0    83
1    36
Name: class_index, dtype: int64
starting epoch 0 ...
Epoch 0 finished with average loss 0.007020425285887102
Testing ...
Average test loss: 0.03930687624961138
starting epoch 1 ...
Epoch 1 finished with average loss 0.0061765740693803745
Testing ...
Average test loss: 0.030746130738407373
starting epoch 2 ...
Epoch 2 finished with average loss 0.005350805167108774
Testing ...
Average test loss: 0.027397357393056154
starting epoch 3 ...
Epoch 3 finished with average loss 0.004753655232970827
Testing ...
Average test loss: 0.03136730333790183
starting epoch 4 ...
Epoch 4 finished with average loss 0.004179629686103877
Testing ...
Average test loss: 0.0219472695607692
starting epoch 5 ...
Epoch 5 finished with average loss 0.0036557715970236037
Testing ...
Average test loss: 0.0299018993973732
starting epoch 6 ...
Epoch 6 finished with average loss 0.003486844927979762
Testing ...
Average test 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


Epoch 0 finished with average loss 0.006556240876890578
Testing ...
Average test loss: 0.036216942593455315
starting epoch 1 ...
Epoch 1 finished with average loss 0.0053969075340988505
Testing ...
Average test loss: 0.02984806173481047
starting epoch 2 ...
Epoch 2 finished with average loss 0.0045434621463834885
Testing ...
Average test loss: 0.02626833397274216
starting epoch 3 ...
Epoch 3 finished with average loss 0.004010197097087346
Testing ...
Average test loss: 0.03198685341825088
starting epoch 4 ...
Epoch 4 finished with average loss 0.003633790676430903
Testing ...
Average test loss: 0.02926559226276974
starting epoch 5 ...
Epoch 5 finished with average loss 0.003087441179155345
Testing ...
Average test loss: 0.04695548536255956
starting epoch 6 ...
Epoch 6 finished with average loss 0.0029179069779072067
Testing ...
Average test loss: 0.03300327931841215
starting epoch 7 ...
Epoch 7 finished with average loss 0.0027218349579494847
Testing ...
Average test loss: 0.0344175873

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


[[212  85]
 [  7  77]]
Fold 3
0    18352
1      254
Name: class_index, dtype: int64
1    211
0    171
Name: class_index, dtype: int64
starting epoch 0 ...
Epoch 0 finished with average loss 0.005953045097448546
Testing ...
Average test loss: 0.06884775155534346
starting epoch 1 ...
Epoch 1 finished with average loss 0.00497480205266618
Testing ...
Average test loss: 0.06763351957003276
starting epoch 2 ...
Epoch 2 finished with average loss 0.00417460449267166
Testing ...
Average test loss: 0.07970229256898165
starting epoch 3 ...
Epoch 3 finished with average loss 0.00354773373362253
Testing ...
Average test loss: 0.08476139542957146
starting epoch 4 ...
Epoch 4 finished with average loss 0.0029527926729524773
Testing ...
Average test loss: 0.1131293053428332
starting epoch 5 ...
Epoch 5 finished with average loss 0.0027213863631601596
Testing ...
Average test loss: 0.09249040391296148
starting epoch 6 ...
Epoch 6 finished with average loss 0.0025322476861585584
Testing ...
Average te

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


[[108  63]
 [ 30 181]]
Fold 4
0    18201
1      432
Name: class_index, dtype: int64
0    129
1     33
Name: class_index, dtype: int64
starting epoch 0 ...
Epoch 0 finished with average loss 0.007128834956857547
Testing ...
Average test loss: 0.022524835697064798
starting epoch 1 ...
Epoch 1 finished with average loss 0.006142732488809355
Testing ...
Average test loss: 0.02084058220498264
starting epoch 2 ...
Epoch 2 finished with average loss 0.005095461859107401
Testing ...
Average test loss: 0.020143028581514955
starting epoch 3 ...
Epoch 3 finished with average loss 0.004315795180408562
Testing ...
Average test loss: 0.022115352874000866
starting epoch 4 ...
Epoch 4 finished with average loss 0.0038698757585301025
Testing ...
Average test loss: 0.02639660966815427
starting epoch 5 ...
Epoch 5 finished with average loss 0.0035194192068910106
Testing ...
Average test loss: 0.022788536657268803
starting epoch 6 ...
Epoch 6 finished with average loss 0.003065909781492448
Testing ...
Ave

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


In [88]:
f1_scores_normal = [F1_ind(c, 0) for c in conf_mats]
f1_scores_noisy = [F1_ind(c, 1) for c in conf_mats]

print(f"Mean F1 normal: {np.mean(f1_scores_normal)}")
print(f"Mean F1 noisy: {np.mean(f1_scores_noisy)}")
print(f"Individual F1 scores (noisy): {f1_scores_noisy}")

Mean F1 normal: 0.8312086798789824
Mean F1 noisy: 0.7518072931526905
Individual F1 scores (noisy): [0.8940092165898618, 0.85, 0.6260162601626016, 0.7956043956043956, 0.5934065934065934]


In [None]:
for c in conf_mats:
    print(c)

### Test on the noise stress test database

In [None]:
# load stress test noise data
import wfdb
import os
import scipy.signal

noise_stress_test_db = "mit-bih-noise-stress-test-database"
stress_test_files = ["118e24", "119e24", "118e06", "118e00", "118e_6", "119e06", "119e00", "119e_6"]

labels = []
noise_level = []
samples = []

# Additionally band pass filter
def filter_ecg(x, fs):
    b, a = scipy.signal.butter(3, [0.66, 50], 'band', fs=fs)
    x = scipy.signal.filtfilt(b, a, x, padlen=150)
    x = (x - min(x)) / (max(x) - min(x))
    return x

for file in stress_test_files:
    try:
        print(f"Reading file: {file}")
        data = wfdb.io.rdrecord(os.path.join(noise_stress_test_db, file))
        all_data_v1 = data.p_signal[:,1]
        # Resample to 300Hz
        all_data_v1 = scipy.signal.resample(all_data_v1, int(all_data_v1.shape[0] * 300/data.fs))
        # all_data_v1 = filter_ecg(all_data_v1, data.fs)
        # all_data_v1 = adaptive_gain_norm(all_data_v1, 501)

        sec_len = 300 * 30  # 30s segments
        i = 1
        while i * sec_len < all_data_v1.shape[0]:
            s = all_data_v1[(i-1)*sec_len:i*sec_len]
            samples.append(s)
            noise_level.append(file.split("e")[-1])

            if i * 30 < 300:
                labels.append("N")
            elif (i * 30 - 300) % 240 > 120 or (i * 30 - 300) % 240 == 0:
                labels.append("N")
            elif (i * 30 - 300) % 240 <= 120:
                labels.append("~")

            i += 1

    except ValueError:
        print("error, scipping file")
        continue


nst_df = pd.DataFrame({"data": samples, "class": labels, "noise_level": noise_level})
pk_path = "mit-bih-noise-stress-test-database/database.pk"
nst_df.to_pickle(pk_path)

In [None]:
nst_df["class_index"] = (nst_df["class"] == "~").astype(int)

class NSTDataset(torch.utils.data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, dataset):
        'Initialization'
        self.dataset = dataset

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.dataset.index)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        row = self.dataset.iloc[index]

        X = row["data"]  # The only dataset and nst dataset difference is in this line!
        y = row["class_index"]

        return X, y

# Normalise the data
nst_df["data"] = (nst_df["data"] - nst_df["data"].map(lambda x: x.mean()))/nst_df["data"].map(lambda x: x.std())

torch_dataset_nst = NSTDataset(nst_df)
nst_dataloader = DataLoader(torch_dataset_nst, batch_size=32, shuffle=True, pin_memory=True)

In [None]:
model.eval()

true_labels = []
predictions = []

false_positives = []
true_negatives = []

with torch.no_grad():
    for i, (signals, labels) in enumerate(nst_dataloader):
        signals = torch.unsqueeze(signals.to(device), 1).float()
        # fft = torch.abs(torch.fft.fft(signals))
        # signals = torch.cat([signals, fft], dim=1)
        labels = labels.detach().numpy()
        true_labels.append(labels)

        optimizer.zero_grad()
        output = model(signals).detach().to("cpu").numpy()

        prediction = np.argmax(output, axis=-1)
        false_positive = np.logical_and(labels == 0, prediction == 1)
        false_positives.append(signals[false_positive, 0, :].cpu().detach().numpy())

        true_negative = np.logical_and(labels == 1, prediction == 0)
        true_negatives.append(signals[true_negative, 0, :].cpu().detach().numpy())

        predictions.append(np.argmax(output, axis=-1))

predictions = np.concatenate(predictions)
true_labels = np.concatenate(true_labels)
false_positives = np.concatenate(false_positives, axis=0)
true_negatives = np.concatenate(true_negatives, axis=0)

In [None]:
ConfusionMatrixDisplay.from_predictions(true_labels, predictions, display_labels=["sufficint quality", "insufficient quality"], cmap="inferno")

conf_mat = confusion_matrix(true_labels, predictions)

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Other F1: {F1_ind(conf_mat, 1)}")
# print(f"AF F1: {F1_ind(conf_mat, 2)}")
# print(f"Noisy F1: {F1_ind(conf_mat, 3)}")

print(f"Average F1 score: {sum([F1_ind(conf_mat, i) for i in range(2)])/2}")

In [None]:
index = 40
print(false_positives.shape)

fig = go.Figure(go.Scatter(y=false_positives[index]))
fig.show()

In [None]:
index = 15
print(true_negatives.shape)

fig = go.Figure(go.Scatter(y=true_negatives[index]))
fig.show()