In [3]:
import scipy.io
import scipy.signal as sig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

import DataHandlers.CinCDataset as CinCDataset
import DataHandlers.SAFERDataset as SAFERDataset

import importlib
importlib.reload(SAFERDataset)
importlib.reload(CinCDataset)

from DataHandlers.DiagEnum import DiagEnum, feas1DiagToEnum

import matplotlib
matplotlib.rcParams["text.usetex"] = False

# A fudge because I moved the files
sys.modules["SAFERDataset"] = SAFERDataset
sys.modules["CinCDataset"] = CinCDataset

### Load the data

In [5]:
feas2_pt_data, feas2_ecg_data = SAFERDataset.load_feas_dataset(2, "dataframe")
feas2_ecg_data = feas2_ecg_data[feas2_ecg_data["length"] == 9120]

In [None]:
feas1_pt_data, feas1_ecg_data = SAFERDataset.load_feas_dataset(1, ecg_meas_diag=[d for d in DiagEnum if d != DiagEnum.Undecided])

In [None]:
# Load specially cleaned data

feas2_ecg_data = pd.read_pickle(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas2\ECGs\filtered_dataframe.pk")
feas2_pt_data = pd.read_csv(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas2\pt_data_anon.csv")
feas2_pt_data["ptID"] += 10000
feas2_ecg_data["ptID"] += 10000

feas1_ecg_data_clean = pd.read_pickle(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas1\ECGs\clean_ecg_dataset.pk")
feas1_pt_data = pd.read_csv(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas1\pt_data_anon.csv")
print(len(feas1_ecg_data_clean.index))

feas2_ecg_data_clean = pd.read_pickle(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas2\ECGs\clean_ecg_dataset.pk")
feas2_ecg_data_clean["ptID"] += 10000
print(len(feas2_ecg_data_clean.index))

all_clean_data = pd.concat([feas2_ecg_data_clean, feas1_ecg_data_clean], ignore_index=True)
all_clean_pt = pd.concat([feas2_pt_data[feas2_pt_data["ptID"].isin(feas2_ecg_data_clean["ptID"])], feas1_pt_data[feas1_pt_data["ptID"].isin(feas1_ecg_data_clean["ptID"])]])

all_clean_pt.set_index("ptID", drop=False, inplace=True)
all_clean_pt["noRecs"] = all_clean_data["ptID"].value_counts()
all_clean_pt["noHQrecs"] = all_clean_pt["noRecs"]
all_clean_pt.head()

### Setup the model

In [7]:
import torch.nn as nn
import torch

In [49]:
class CVAE(nn.Module):

    def __init__(self, z_dim):
        super(CVAE, self).__init__()
        self.z_dim = z_dim

        self.conv_section1 = nn.Sequential(
            nn.Conv1d(1, 16, 19, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.Conv1d(16, 16, 19, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(16)
        )

        self.conv_section2 = nn.Sequential(
            nn.Conv1d(16, 16, 19, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(16),
            nn.Conv1d(16, 16, 19, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(16)
        )

        self.conv_section3 = nn.Sequential(
            nn.Conv1d(16, 32, 9, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Conv1d(32, 32, 9, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(32)
        )

        self.conv_section4 = nn.Sequential(
            nn.Conv1d(32, 32, 9, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Conv1d(32, 32, 9, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(32)
        )

        self.conv_section5 = nn.Sequential(
            nn.Conv1d(32, 64, 7, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Conv1d(64, 64, 7, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        """
        self.conv_section6 = nn.Sequential(
            nn.Conv1d(64, 64, 19, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Conv1d(64, 64, 19, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )
        """

        self.conv_section7 = nn.Sequential(
            nn.Conv1d(64, 80, 7, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(80),
            nn.Conv1d(80, 80, 7, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(80)
        )

        self.encoder_linear = nn.Linear(5120, z_dim*2)
        self.decoder_linear = nn.Linear(z_dim, 5120)

        self.decoder_batchnorm = nn.BatchNorm1d(5120)

        self.transconv_section1 = nn.Sequential(
            nn.ConvTranspose1d(16, 1, 19, padding=9, stride=1),
        )

        self.transconv_section2 = nn.Sequential(
            nn.ConvTranspose1d(16, 16, 20, padding=9, stride=2),
            nn.ReLU(),
            nn.BatchNorm1d(16),
        )

        self.transconv_section3 = nn.Sequential(
            nn.ConvTranspose1d(32, 16, 20, padding=9, stride=2),
            nn.ReLU(),
            nn.BatchNorm1d(16),
        )

        self.transconv_section4 = nn.Sequential(
            nn.ConvTranspose1d(48, 32, 10, padding=4, stride=2),
            nn.ReLU(),
            nn.BatchNorm1d(32),
        )

        self.transconv_section5 = nn.Sequential(
            nn.ConvTranspose1d(64, 48, 10, padding=4, stride=2),
            nn.ReLU(),
            nn.BatchNorm1d(48),
        )

        self.transconv_section6 = nn.Sequential(
            nn.ConvTranspose1d(64, 64, 8, padding=3, stride=2),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        self.transconv_section7 = nn.Sequential(
            nn.ConvTranspose1d(80, 64, 7, padding=3, stride=1),
            nn.ReLU(),
            nn.BatchNorm1d(64)
        )

        self.dropout = nn.Dropout()

    def encode(self, x):
        # [1, 2048]
        x = self.conv_section1(x)
        x = nn.functional.max_pool1d(x, 2)

        # [16, 1024]
        x = self.conv_section2(x) + x
        x = nn.functional.max_pool1d(x, 2)

        # [32, 512]
        x = self.conv_section3(x)
        x = nn.functional.max_pool1d(x, 2)

        # [32, 256]
        x = self.conv_section4(x) + x
        x = nn.functional.max_pool1d(x, 2)

        # [64, 128]
        x = self.conv_section5(x)
        x = nn.functional.max_pool1d(x, 2)

        # [64, 64]
        # x = self.conv_section6(x)

        # [64, 64]
        x = self.conv_section7(x)

        # [80, 64]
        x = torch.flatten(x, -2)

        # [5120]
        x = self.encoder_linear(x)

        # Apply exp activation function to variances
        x[:, self.z_dim:] = torch.exp(x[:, self.z_dim:])

        return x

    def decode(self, z):
        # [z_dim]
        z = self.decoder_linear(z)
        z = self.decoder_batchnorm(z)
        z = torch.nn.functional.relu(z)

        # [5120]
        z = torch.reshape(z, (-1, 80, 64))
        # [80, 64]
        z = self.transconv_section7(z)
        # print(z.shape)
        # [64, 64]
        z = self.transconv_section6(z)
        # print(z.shape)
        # [64, 64]
        z = self.transconv_section5(z)
        # print(z.shape)
        # [64, 128]
        z = self.transconv_section4(z)
        # print(z.shape)
        # [48, 256]
        z = self.transconv_section3(z)
        # print(z.shape)
        # [32, 512]
        z = self.transconv_section2(z)
        # print(z.shape)
        # [16, 1024]
        z = self.transconv_section1(z)
        # print(z.shape)
        # [1, 2048]
        return z


    def forward(self, x):
        batch_size = x.shape[0]
        device = x.device

        z_dist = self.encode(x)

        z = torch.randn((batch_size, self.z_dim)).to(device) * torch.abs(z_dist[:, self.z_dim:]) + z_dist[:, :self.z_dim]

        # x = self.decode(z)

        return z_dist

In [22]:
# Onehot encoding
from torch.utils.data import Dataset, DataLoader

class Dataset(torch.utils.data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, dataset):
        'Initialization'
        self.dataset = dataset

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.dataset.index)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        row = self.dataset.iloc[index]

        X = row["data"]
        y = row["class_index"]
        ind = row.name

        return X, y, ind

class IDDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

        ids = pd.unique(self.dataset["ptID"])
        for id in ids:
            num_samples = pd.value_counts(self.dataset["ptID"] == id)[True]
            pos = self.dataset[self.dataset["ptID"] == id].sample(num_samples)
            neg = self.dataset[self.dataset["ptID"] != id].sample(num_samples)

            self.dataset.loc[self.dataset["ptID"] == id, "pos_sample"] = pos.index.values
            self.dataset.loc[self.dataset["ptID"] == id, "neg_sample"] = neg.index.values

    def __len__(self):
        return len(self.dataset.index)

    def __getitem__(self, index):
        # Select sample
        anchor = self.dataset.iloc[index]
        Xa = anchor["data"]
        ya = anchor["ptID"]
        ind = anchor.name

        pos = self.dataset.loc[anchor["pos_sample"]]
        Xp = pos["data"]

        neg = self.dataset.loc[anchor["neg_sample"]]
        Xn = neg["data"]
        yn = neg["ptID"]

        return Xa, Xp, Xn, ya, yn, ind

In [23]:
def split_to_segments(dataset, new_len, orig_len, overlap=0):
    sections = []

    step = int(round(new_len * (1 - overlap)))
    num_sections = (orig_len - (new_len - step)) // step
    for _, series in dataset.iterrows():
        for i in range(num_sections):
            section_series = series.copy()
            section_series["data"] = section_series["data"][i*step: i*step + new_len]
            section_series["rec_ind"] = series.name
            section_series["rec_pos"] = i
            # Keep all other data (ptid, measDiag etc the same for each section as the source ECG)
            sections.append(section_series)

    return pd.DataFrame(sections).reset_index()

In [112]:
# For SAFER data
# Split train and test data according to each patient
def make_SAFER_dataloaders(pt_data, ecg_data, test_frac, only_clean_training=True):
    pt_data["noLQrecs"] = pt_data["noRecs"] - pt_data["noHQrecs"]  # for Feas1 this might include stuff flagged by zenicor as noisy?
    train_patients = []
    test_patients = []

    for val, df in pt_data.groupby("noLQrecs"):
        # print(f"processing {val}")
        # print(f"number of patients {len(df.index)}")
        test = df.sample(frac=test_frac)
        test_patients.append(test)
        train_patients.append(df[~df["ptID"].isin(test["ptID"])])

    train_pt_df = pd.concat(train_patients)
    test_pt_df = pd.concat(test_patients)

    print(f"Test high quality: {test_pt_df['noHQrecs'].sum()} low quality: {test_pt_df['noLQrecs'].sum()} ")
    print(f"Train high quality: {train_pt_df['noHQrecs'].sum()} low quality: {train_pt_df['noLQrecs'].sum()} ")

    train_dataloader = None
    test_dataloader = None

    train_dataset = None
    test_dataset = None

    if not train_pt_df.empty:
        # get ECG datasets
        train_dataset = split_to_segments(ecg_data[ecg_data["ptID"].isin(train_pt_df["ptID"])], 2048, 9120, 0.5)
        # Normalise
        train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())

        if only_clean_training:
            torch_dataset_train = Dataset(train_dataset[train_dataset["class_index"] == 0])
        else:
            torch_dataset_train = Dataset(train_dataset)

        train_dataloader = DataLoader(torch_dataset_train, batch_size=64, shuffle=True, pin_memory=True)

    if not test_pt_df.empty:
        test_dataset = split_to_segments(ecg_data[ecg_data["ptID"].isin(test_pt_df["ptID"])], 2048, 9120, 0.5)
        test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())
        torch_dataset_test = Dataset(test_dataset)
        test_dataloader = DataLoader(torch_dataset_test, batch_size=64, shuffle=True, pin_memory=True)

    return train_dataloader, test_dataloader, train_dataset, test_dataset


train_dataloader, test_dataloader, train_dataset, test_dataset = make_SAFER_dataloaders(feas2_pt_data, feas2_ecg_data, test_frac=0.2, only_clean_training=False)

Test high quality: 4650 low quality: 101 
Train high quality: 18029 low quality: 479 
Empty DataFrame
Columns: [index]
Index: []


KeyError: 'data'

In [29]:
from sklearn.model_selection import train_test_split

def make_safer_id_dataloaders(pt_data, ecg_data, test_frac, only_clean_training=True):
    test_patients = pt_data.sample(frac=0.2)

    if only_clean_training:
        ecg_segments = split_to_segments(ecg_data[(ecg_data["class_index"] == 0) & (~ecg_data["ptID"].isin(test_patients["ptID"]))], 2048, 9120, 0.5)
    else:
        ecg_segments = split_to_segments(ecg_data[(~ecg_data["ptID"].isin(test_patients["ptID"]))], 2048, 9120, 0.5)

    train_dataset, test_dataset = train_test_split(ecg_segments, stratify=ecg_segments["ptID"], test_size=test_frac)

    if not train_dataset.empty:
        train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())
        torch_dataset_train = IDDataset(train_dataset)
        train_dataloader = DataLoader(torch_dataset_train, batch_size=64, shuffle=True, pin_memory=True)

    if not test_dataset.empty:
        test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())
        torch_dataset_test = IDDataset(test_dataset)
        test_dataloader = DataLoader(torch_dataset_test, batch_size=64, shuffle=True, pin_memory=True)

    return train_dataloader, test_dataloader, train_dataset, test_dataset

train_dataloader, test_dataloader, train_dataset, test_dataset = make_safer_id_dataloaders(feas2_pt_data, feas2_ecg_data, 0.2)

In [None]:
# If we want noisy and clean test_data for evaluation, after training and testing on only clean data in training loop
_, noisy_test_dataloader, _, noisy_test_dataset = make_SAFER_dataloaders(feas2_pt_data, feas2_ecg_data[~feas2_ecg_data["measID"].isin(train_dataset["measID"])], test_frac=1, only_clean_training=False)

In [None]:
# Setup dataloaders for only the clean data
train_dataloader, test_dataloader, train_dataset, test_dataset = make_SAFER_dataloaders(all_clean_pt, all_clean_data, test_frac=0.2, only_clean_training=False)

### Set up the losses

In [72]:
def dist(x, y):
    return torch.sqrt(torch.sum((x - y) ** 2, dim=-1))

triplet_margin = 0.1

def triplet_latent_loss(za, zp, zn):
    print(za)
    print(zp)
    print(zn)
    loss = dist(za, zp) - dist(za, zn) + triplet_margin
    print(loss)
    return torch.mean(torch.where(loss > 0, loss, 0))

def kl_latent_loss(z_mean, z_std):
    # The regularization loss based on kl divergence of the latent distribution from N(0, 1)
    vars = z_std ** 2
    means = z_mean

    return 1/50 * torch.mean( - torch.log(vars) + vars + means ** 2 - 1)

In [58]:
if torch.cuda.is_available():
    print("Using Cuda")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

Using Cuda


In [81]:
num_epochs = 10
z_dim = 128

model = CVAE(z_dim).to(device)

loss_func = lambda z: kl_latent_loss(z[:, :z_dim], z[:, z_dim:])

optimizer = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.8)
num_batches = len(train_dataloader)
num_test_batches = len(test_dataloader)

### Train on feas 1 and 2

In [66]:
import math

# warning: changing these chunk sizes may reload feas1 data from scratch, which will take ages
chunk_size = 20000
num_chunks = math.ceil(162515 / chunk_size )

def get_feas1_dataloader(chunk_num):
    feas1_pt_data, feas1_ecg_data = SAFERDataset.load_feas_dataset(1, f"dataframe_{chunk_num}.pk", ecg_range=[chunk_size * chunk_num, chunk_size * (chunk_num + 1)])
    train_dataset = split_to_segments(feas1_ecg_data, 2048, 9120, 0.5)
    train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())

    torch_dataset_train = Dataset(train_dataset)
    train_dataloader = DataLoader(torch_dataset_train, batch_size=128, shuffle=True, pin_memory=True)

    return train_dataloader

In [82]:
model = model.to(device)
import copy

best_test_loss = 100
best_model = copy.deepcopy(model).cpu()

for epoch in range(num_epochs):
    total_loss = 0
    print(f"starting epoch {epoch} ...")
    # Train
    model.train()

    for ds_ind in range(num_chunks + 1):
        print(f"training on dataset: {ds_ind}")
        if ds_ind == 0:
            train_dataloader_part = train_dataloader
        else:
            train_dataloader_part = get_feas1_dataloader(ds_ind-1)

        for i, (signals, _, _) in enumerate(train_dataloader_part):
            signals = torch.unsqueeze(signals.to(device), 1).float()

            optimizer.zero_grad()
            output, latents = model(signals)
            loss = loss_func(output, signals, latents)

            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)

            optimizer.step()
            total_loss += float(loss)

        print(f"Total loss {total_loss/num_batches}")

    print(f"Epoch {epoch} finished with average loss {total_loss/num_batches}")
    print("Testing ...")
    # Test
    test_loss = 0
    with torch.no_grad():
        model.eval()
        for i, (signals, _, _)in enumerate(test_dataloader):
            signals = torch.unsqueeze(signals.to(device), 1).float()

            output, latents = model (signals)
            loss = loss_func(output, signals, latents)
            test_loss += float(loss)

    print(f"Average test loss: {test_loss/num_test_batches}")

    if test_loss/num_test_batches < best_test_loss:
        best_model = copy.deepcopy(model).cpu()
        best_test_loss = test_loss/num_test_batches

model = best_model

starting epoch 0 ...


NameError: name 'num_chunks' is not defined

In [33]:
from torch.profiler import profile, ProfilerActivity
from tqdm import tqdm

### Train only using feas2

In [83]:
model = model.to(device)
import copy

batch_size = 64  # NB this doesn't actually modify the batch size

best_test_loss = 100
best_model = copy.deepcopy(model).cpu()

for epoch in range(num_epochs):
    total_loss = 0
    print(f"starting epoch {epoch} ...")
    # Train
    model.train()

    for (anchor_sig, pos_sig, neg_sig, anchor_label, neg_label, _) in train_dataloader:
        print(anchor_sig.shape)
        print(pos_sig.shape)
        print(neg_sig.shape)

        if anchor_sig.shape[0] != pos_sig.shape[0] or pos_sig.shape[0] != neg_sig.shape[0]:
            # print("Not equal number of anchors, positive, and negatives")
            continue

        if torch.any(torch.isnan(anchor_sig)) or torch.any(torch.isnan(pos_sig)) or torch.any(torch.isnan(neg_sig)):
            print("Input is nan!")
            continue

        batch_len = anchor_sig.shape[0]

        all_signals = torch.concat([anchor_sig, pos_sig, neg_sig], dim=0)
        all_signals = torch.unsqueeze(all_signals.to(device), 1).float()

        optimizer.zero_grad()
        latents = model(all_signals)

        print(latents)

        loss = loss_func(latents)
        triplet_loss = triplet_latent_loss(latents[:batch_len, :z_dim],
                                           latents[batch_len:2*batch_len, :z_dim],
                                           latents[2*batch_len:, :z_dim])

        print(f"loss: {loss:0.3f}, triplet_loss: {triplet_loss:0.3f}")

        combined_loss = loss + 0.1 * triplet_loss
        combined_loss.backward()

        # nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1, norm_type=2)

        optimizer.step()
        total_loss += float(combined_loss.detach())

    print(f"Epoch {epoch} finished with average loss {total_loss/num_batches}")
    print("Testing ...")
    # Test
    test_loss = 0
    with torch.no_grad():
        model.eval()
        for i, (anchor_sig, pos_sig, neg_sig, anchor_label, neg_label, _) in enumerate(test_dataloader):
            if anchor_sig.shape[0] != pos_sig.shape[0] or pos_sig.shape[0] != neg_sig.shape[0]:
                print("Not equal number of anchors, positive, and negatives")
                continue

            batch_len = anchor_sig.shape[0]

            all_signals = torch.concat([anchor_sig, pos_sig, neg_sig], dim=0)
            all_signals = torch.unsqueeze(all_signals.to(device), 1).float()

            latents = model(all_signals)

            loss = loss_func(output, all_signals, latents)
            triplet_loss = triplet_latent_loss(latents[:batch_len, :z_dim],
                                               latents[batch_len:2*batch_len, :z_dim],
                                               latents[2*batch_len:, :z_dim])

            total_loss = loss + triplet_loss
            test_loss += float(total_loss)

            print(f"loss: {loss:0.3f}, triplet_loss: {triplet_loss:0.3f}")

    print(f"Average test loss: {test_loss/num_test_batches}")

    if test_loss/num_test_batches < best_test_loss:
        best_model = copy.deepcopy(model).cpu()
        best_test_loss = test_loss/num_test_batches

model = best_model

starting epoch 0 ...
torch.Size([64, 2048])
torch.Size([64, 2048])
torch.Size([64, 2048])
tensor([[ 2.2585e-01,  2.9277e-01, -6.2979e-02,  ...,  1.3004e+00,
          3.5853e+00,  2.3914e-01],
        [ 6.5220e-01,  9.3021e-02, -3.6025e-01,  ...,  1.3581e+00,
          1.0197e+00,  1.3384e+00],
        [ 3.4962e-01,  1.6490e-01, -6.1851e-01,  ...,  4.4539e-01,
          9.0657e-01,  7.2466e-01],
        ...,
        [-4.3428e-01,  1.5238e-01,  7.6054e-01,  ...,  1.3600e+00,
          6.4126e-01,  4.0571e-01],
        [ 4.0193e-01,  4.8054e-01,  3.0405e-01,  ...,  5.2683e-01,
          1.3961e-01,  1.9934e+00],
        [-2.0811e-03, -6.6920e-01,  1.1409e+00,  ...,  4.7770e-01,
          5.6657e-01,  1.1697e-01]], device='cuda:0', grad_fn=<CopySlices>)
tensor([[ 0.2259,  0.2928, -0.0630,  ..., -0.0912, -0.1720, -0.3667],
        [ 0.6522,  0.0930, -0.3603,  ...,  0.4173,  0.3177, -0.0026],
        [ 0.3496,  0.1649, -0.6185,  ..., -0.5375,  0.0275,  0.4648],
        ...,
        [ 0.6566

KeyboardInterrupt: 

In [115]:
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for i, (_, _, _) in enumerate(train_dataloader): # , total=len(train_dataset)):
        if i > 10:
            break

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=25))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        83.52%      50.130ms        99.54%      59.742ms       4.979ms      44.081ms        72.84%      59.961ms       4.997ms            12  
                                       aten::pin_memory         1.16%     698.000us         7.38%       4.430ms     123.056us     810.000us         1.34%       2.570ms      71.389us            36  
         

### Train using the NST dataset

In [13]:
model = model.to(device)
import copy

best_test_loss = 100
best_model = copy.deepcopy(model).cpu()

for epoch in range(num_epochs):
    total_loss = 0
    print(f"starting epoch {epoch} ...")
    # Train
    model.train()
    for i, (signals, clean_signals, _) in enumerate(train_dataloader):
        signals = torch.unsqueeze(signals.to(device), 1).float()
        clean_signals = torch.unsqueeze(clean_signals.to(device), 1).float()

        optimizer.zero_grad()
        output, latents = model(signals)
        loss = loss_func(output, clean_signals, latents.to("cpu"))

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)

        optimizer.step()
        total_loss += float(loss)

    print(f"Epoch {epoch} finished with average loss {total_loss/num_batches}")
    print("Testing ...")
    # Test
    test_loss = 0
    with torch.no_grad():
        model.eval()
        for i, (signals, clean_signals, _)in enumerate(test_dataloader):
            signals = torch.unsqueeze(signals.to(device), 1).float()
            clean_signals = torch.unsqueeze(clean_signals.to(device), 1).float()

            output, latents = model (signals)
            loss = loss_func(output, clean_signals, latents.to("cpu"))
            test_loss += float(loss)

    print(f"Average test loss: {test_loss/num_test_batches}")

    if test_loss/num_test_batches < best_test_loss:
        best_model = copy.deepcopy(model).cpu()
        best_test_loss = test_loss/num_test_batches

model = best_model

starting epoch 0 ...
Epoch 0 finished with average loss 0.400227868819938
Testing ...
Average test loss: 0.37143253578859214
starting epoch 1 ...
Epoch 1 finished with average loss 0.40309759667691064
Testing ...
Average test loss: 0.373511717599981
starting epoch 2 ...
Epoch 2 finished with average loss 0.4044528840219273
Testing ...
Average test loss: 0.3732363511534298
starting epoch 3 ...
Epoch 3 finished with average loss 0.4039087159668698
Testing ...
Average test loss: 0.3708088310325847
starting epoch 4 ...
Epoch 4 finished with average loss 0.40508778261787753
Testing ...
Average test loss: 0.37695128426832314
starting epoch 5 ...
Epoch 5 finished with average loss 0.40080102489275093
Testing ...
Average test loss: 0.38048596767818227
starting epoch 6 ...
Epoch 6 finished with average loss 0.40437007563955646
Testing ...
Average test loss: 0.37527622019543366
starting epoch 7 ...
Epoch 7 finished with average loss 0.40649035835967345
Testing ...
Average test loss: 0.3758788319

In [51]:
model = model.to(device)  # if train finished use this to put back on the GPU

In [60]:
model = best_model.to(device)  # if train did not finish use this to take the best intermediate result

In [48]:
# Save a model
torch.save(model.state_dict(), "TrainedModels/Autoencoder_ID_just_encode_10_epochs.pt")
# train_dataset.to_pickle("TrainedModels/Autoencoder_50_epochs_nst_train_set.pk")

In [15]:
z_dim = 128
model = CVAE(z_dim).to(device)
model.load_state_dict(torch.load("TrainedModels/Autoencoder_new_6_epochs_all_feas1_feas2.pt", map_location=device))

<All keys matched successfully>

In [20]:
# Reload the training and dataset with the model so we don't test on stuff we trained on
train_dataset = pd.read_pickle("TrainedModels/Autoencoder_new_6_epochs_all_feas1_feas2_train_set.pk")

train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())
torch_dataset_train = Dataset(train_dataset)
train_dataloader = DataLoader(torch_dataset_train, batch_size=32, shuffle=True, pin_memory=True)

test_pt_df = feas2_pt_data[~feas2_pt_data["ptID"].isin(train_dataset["ptID"])]

if not test_pt_df.empty:
    test_dataset = test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided]
    test_dataset = split_to_segments(feas2_ecg_data[feas2_ecg_data["ptID"].isin(test_pt_df["ptID"])], 2048, 9120, 0.5)
    test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())
    torch_dataset_test = Dataset(test_dataset)
    test_dataloader = DataLoader(torch_dataset_test, batch_size=32, shuffle=True, pin_memory=True)

### Plot the TSNE for some of the training patients

In [59]:
embeddings = []

with torch.no_grad():
    for i, (anchor_sig, pos_sig, neg_sig, anchor_label, neg_label, _) in tqdm(enumerate(train_dataloader), total = len(train_dataset)//64):
        if i > 100:
            break

        if anchor_sig.shape[0] != pos_sig.shape[0] or pos_sig.shape[0] != neg_sig.shape[0]:
            # print("Not equal number of anchors, positive, and negatives")
            continue

        batch_len = anchor_sig.shape[0]

        all_signals = torch.concat([anchor_sig, pos_sig, neg_sig], dim=0)
        all_signals = torch.unsqueeze(all_signals.to(device), 1).float()

        latents = model(all_signals)

        for i, l in enumerate(anchor_label):
            embeddings.append({"ptID": l.detach().cpu(), "feature": latents[i, :z_dim].detach().cpu()})
            embeddings.append({"ptID": l.detach().cpu(), "feature": latents[batch_len + i, :z_dim].detach().cpu()})

        for i, l in enumerate(neg_label):
            embeddings.append({"ptID": l.detach().cpu(), "feature": latents[2*batch_len + i, :z_dim].detach().cpu()})

embedding_df = pd.DataFrame(embeddings)
print(embedding_df.head())

 24%|██▎       | 101/427 [00:34<01:50,  2.94it/s]

         ptID                                            feature
0  tensor(57)  [tensor(-0.3639), tensor(0.7346), tensor(0.462...
1  tensor(57)  [tensor(-0.0949), tensor(0.9462), tensor(0.262...
2  tensor(24)  [tensor(0.6876), tensor(0.9295), tensor(0.4696...
3  tensor(24)  [tensor(0.5577), tensor(0.6108), tensor(0.7485...
4  tensor(10)  [tensor(0.2424), tensor(-0.1201), tensor(-0.45...





In [67]:
from sklearn.manifold import TSNE

feature_matrix = np.array([np.array(v) for v in embedding_df["feature"].values])
pt_ids = [int(v) for v in embedding_df["ptID"].values]

patients = np.unique(pt_ids)

print("starting tsne")
tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=30)

X_embedded = tsne.fit_transform(feature_matrix)

for p in patients:
    plt.scatter(X_embedded[pt_ids == p, 0], X_embedded[pt_ids == p, 1], marker="x", label=p)

plt.legend()
plt.show()

starting tsne


### Now try classifying the test data

In [69]:
from sklearn.svm import SVC

classifier = SVC()
classifier = classifier.fit(feature_matrix, pt_ids)

In [70]:
test_embeddings = []

with torch.no_grad():
    for i, (anchor_sig, pos_sig, neg_sig, anchor_label, neg_label, _) in tqdm(enumerate(test_dataloader), total = len(test_dataset)//64):

        if anchor_sig.shape[0] != pos_sig.shape[0] or pos_sig.shape[0] != neg_sig.shape[0]:
            # print("Not equal number of anchors, positive, and negatives")
            continue

        batch_len = anchor_sig.shape[0]

        all_signals = torch.concat([anchor_sig, pos_sig, neg_sig], dim=0)
        all_signals = torch.unsqueeze(all_signals.to(device), 1).float()

        latents = model(all_signals)

        for i, l in enumerate(anchor_label):
            test_embeddings.append({"ptID": l.detach().cpu(), "feature": latents[i, :z_dim].detach().cpu()})
            test_embeddings.append({"ptID": l.detach().cpu(), "feature": latents[batch_len + i, :z_dim].detach().cpu()})

        for i, l in enumerate(neg_label):
            test_embeddings.append({"ptID": l.detach().cpu(), "feature": latents[2*batch_len + i, :z_dim].detach().cpu()})

test_embedding_df = pd.DataFrame(test_embeddings)
print(test_embedding_df.head())

 95%|█████████▌| 101/106 [00:13<00:00,  7.38it/s]

         ptID                                            feature
0   tensor(1)  [tensor(0.3270), tensor(0.0010), tensor(1.5780...
1   tensor(1)  [tensor(-0.5078), tensor(0.1782), tensor(1.598...
2  tensor(41)  [tensor(-0.1397), tensor(0.4097), tensor(1.332...
3  tensor(41)  [tensor(0.5691), tensor(0.2803), tensor(1.3708...
4   tensor(6)  [tensor(-0.1465), tensor(0.8696), tensor(0.393...





In [71]:
test_matrix = np.array([np.array(v) for v in test_embedding_df["feature"].values])
targets = [int(v) for v in test_embedding_df["ptID"].values]

prediction = classifier.predict(test_matrix)

In [75]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(targets, prediction)

accuracy = np.trace(conf_mat) / np.sum(conf_mat)
accuracy_per_person = np.diag(conf_mat) / np.sum(conf_mat, axis=1)

print(f"Overall Accuracy: {accuracy}")

plt.plot(accuracy_per_person)
plt.show()

Overall Accuracy: 0.8509694719471947


In [78]:
best_patient = np.argmin(accuracy_per_person)
best_patient_conf_mat = [[conf_mat[best_patient, best_patient], np.sum(conf_mat[best_patient, best_patient+1:]) + np.sum(conf_mat[best_patient, :best_patient])],
                         [np.sum(conf_mat[best_patient+1:, best_patient]) + np.sum(conf_mat[:best_patient, best_patient]),
                          np.sum(conf_mat[:best_patient, :best_patient]) + np.sum(conf_mat[:best_patient, best_patient+1:]) + np.sum(conf_mat[best_patient+1:, :best_patient]) + np.sum(conf_mat[best_patient+1:, best_patient+1:])]]

print(best_patient_conf_mat)

[[185, 133], [73, 19001]]


### For comparison try with a nearest neighbour classifier

In [79]:
from sklearn.neighbors import KNeighborsClassifier

nn_classify = KNeighborsClassifier()

nn_classify.fit(feature_matrix, pt_ids)
prediction = nn_classify.predict(test_matrix)

In [80]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(targets, prediction)

accuracy = np.trace(conf_mat) / np.sum(conf_mat)
accuracy_per_person = np.diag(conf_mat) / np.sum(conf_mat, axis=1)

print(f"Overall Accuracy: {accuracy}")

plt.plot(accuracy_per_person)
plt.show()

Overall Accuracy: 0.7937809405940595


In [81]:
best_patient = np.argmin(accuracy_per_person)
best_patient_conf_mat = [[conf_mat[best_patient, best_patient], np.sum(conf_mat[best_patient, best_patient+1:]) + np.sum(conf_mat[best_patient, :best_patient])],
                         [np.sum(conf_mat[best_patient+1:, best_patient]) + np.sum(conf_mat[:best_patient, best_patient]),
                          np.sum(conf_mat[:best_patient, :best_patient]) + np.sum(conf_mat[:best_patient, best_patient+1:]) + np.sum(conf_mat[best_patient+1:, :best_patient]) + np.sum(conf_mat[best_patient+1:, best_patient+1:])]]

print(best_patient_conf_mat)

[[172, 172], [110, 18938]]


### Reconstruction for clean samples

In [78]:
# Plot test data reconstruction
test_dataset["reconstruction"] = None
mse_only_loss = lambda truth, pred: torch.mean((truth - pred) ** 2, dim=(1,2))

with torch.no_grad():
    model.eval()

    r_err = []
    inds = []
    reconstructions = []

    for i, (signals, _, ind) in enumerate(test_dataloader):
        signals = torch.unsqueeze(signals.to(device), 1).float()
        # signals_clean = torch.unsqueeze(signals_clean.to(device), 1).float()
        # labels = labels.type(torch.LongTensor)

        output, latents  = model(-signals)
        loss = mse_only_loss(output, signals).detach().cpu().numpy()

        output = output.detach().cpu().numpy()

        for i, o, l in zip(ind, output[:, 0, :], loss):
            r_err.append(l)
            reconstructions.append(o)
            inds.append(int(i))


test_dataset["r_err"] = pd.Series(data=r_err, index=inds)
test_dataset["reconstruction"] = pd.Series(data=reconstructions, index=inds)

In [63]:
test_df = test_dataset

In [64]:
test_df.head()

Unnamed: 0,index,ptID,age,ptDiag,ptDiagRev1,ptDiagRev2,ptDiagRev3,cardRev,measDiag,measDiagRev1,...,perhapsAF,measID,data,file_path,class_index,length,rec_ind,rec_pos,reconstruction,r_err
0,590,8,72.5,DiagEnum.NoAF,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,591,"[-0.49644204545732973, -0.6246603189506869, -0...",ECGs/000000/saferF2_000591,0,9120,590,0,"[0.27881688, 0.18085302, 0.30135077, 0.2448078...",1.414751
1,590,8,72.5,DiagEnum.NoAF,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,591,"[0.6110765438273132, 0.6934087028050593, 0.796...",ECGs/000000/saferF2_000591,0,9120,590,1,"[0.2756654, 0.16722995, 0.3430887, 0.32528114,...",1.502876
2,590,8,72.5,DiagEnum.NoAF,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,591,"[4.487455908826463, 5.457356318373902, 6.04539...",ECGs/000000/saferF2_000591,0,9120,590,2,"[0.28695422, 0.1368554, 0.22762185, 0.20471534...",1.426323
3,590,8,72.5,DiagEnum.NoAF,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,591,"[-0.19036774215761504, -0.13728679542462185, -...",ECGs/000000/saferF2_000591,0,9120,590,3,"[0.34878796, 0.11950919, 0.31936407, 0.1697063...",1.354892
4,590,8,72.5,DiagEnum.NoAF,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,591,"[-0.2725175881175217, -0.3385749643142289, -0....",ECGs/000000/saferF2_000591,0,9120,590,4,"[0.373186, 0.29026645, 0.4235611, 0.34632695, ...",1.427138


In [24]:
test_df["reconstruction"].iloc[0]

nan

In [65]:
from matplotlib.ticker import AutoMinorLocator
import matplotlib
matplotlib.use('TkAgg')

def plot_ecg_and_reconstruction(x, r, fs=300, n_split=3):
    sample_len = x.shape[0]
    time_axis = np.arange(sample_len)/fs

    cuts = np.round(np.linspace(0, sample_len-1, n_split+1)).astype(int)

    fig, ax = plt.subplots(n_split, 1, figsize=(16, 10), squeeze=False)
    for j in range(n_split):
        ax[j][0].plot(time_axis[cuts[j]:cuts[j+1]], x[cuts[j]:cuts[j+1]])
        ax[j][0].plot(time_axis[cuts[j]:cuts[j+1]], r[cuts[j]:cuts[j+1]])
        ax[j][0].set_xlabel("Time")
        ax[j][0].set_xlim((time_axis[cuts[j]], time_axis[cuts[j+1]]))

        t_s = time_axis[cuts[j]]
        t_f = time_axis[cuts[j+1]]
        time_ticks = np.arange(t_s - t_s%0.2, t_f + (0.2 - t_f%0.2), 0.2)
        decimal_labels = ~np.isclose(time_ticks, np.round(time_ticks))
        time_labels = np.round(time_ticks).astype(int).astype(str)
        time_labels[decimal_labels] = ""

        ax[j][0].set_xticks(time_ticks, time_labels)

        ax[j][0].xaxis.set_major_formatter(plt.NullFormatter())
        ax[j][0].yaxis.set_major_formatter(plt.NullFormatter())

        ax[j][0].xaxis.set_minor_locator(AutoMinorLocator(5))
        ax[j][0].yaxis.set_minor_locator(AutoMinorLocator(5))

        ax[j][0].grid(which='major', linestyle='-', linewidth='0.5', color='black')
        ax[j][0].grid(which='minor', linestyle='-', linewidth='0.5', color='lightgray')

    plt.show()

for _, ecg in test_df[test_df["measDiag"] == DiagEnum.NoAF].iterrows():
    # print(ecg)
    print(ecg[["ptDiag", "measDiag", "tag_orig_Poor_Quality", "poss_AF_tag", "r_err"]])
    plot_ecg_and_reconstruction(ecg["data"], -ecg["reconstruction"], n_split=1)

ptDiag                   DiagEnum.NoAF
measDiag                 DiagEnum.NoAF
tag_orig_Poor_Quality                0
poss_AF_tag                          1
r_err                         1.565233
Name: 637, dtype: object
ptDiag                   DiagEnum.NoAF
measDiag                 DiagEnum.NoAF
tag_orig_Poor_Quality                0
poss_AF_tag                          1
r_err                         1.475245
Name: 638, dtype: object
ptDiag                   DiagEnum.NoAF
measDiag                 DiagEnum.NoAF
tag_orig_Poor_Quality                0
poss_AF_tag                          1
r_err                         1.915962
Name: 639, dtype: object
ptDiag                   DiagEnum.NoAF
measDiag                 DiagEnum.NoAF
tag_orig_Poor_Quality                0
poss_AF_tag                          1
r_err                         1.953902
Name: 640, dtype: object
ptDiag                   DiagEnum.NoAF
measDiag                 DiagEnum.NoAF
tag_orig_Poor_Quality                0
pos

KeyboardInterrupt: 

In [62]:
def plot_ecg_and_reconstruction_for_classes(xs, rs, titles, fs=300):
    fig, ax = plt.subplots(len(xs), 1, figsize=(6, 7))

    for j, (x, r, t) in enumerate(zip(xs, rs, titles)):
        sample_len = x.shape[0]
        time_axis = np.arange(sample_len)/fs

        ax[j].plot(time_axis, x)
        ax[j].plot(time_axis, r)
        ax[j].set_xlabel("Time")
        ax[j].set_xlim((time_axis[0], time_axis[-1]))

        ax[j].set_xticks(np.arange(time_axis[0], time_axis[-1]+0.2,0.2))
        ax[j].set_title(t)

        ax[j].xaxis.set_major_formatter(plt.NullFormatter())
        ax[j].yaxis.set_major_formatter(plt.NullFormatter())

        ax[j].xaxis.set_minor_locator(AutoMinorLocator(5))
        ax[j].yaxis.set_minor_locator(AutoMinorLocator(5))

        ax[j].grid(which='major', linestyle='-', linewidth='0.5', color='black')
        ax[j].grid(which='minor', linestyle='-', linewidth='0.5', color='lightgray')

    fig.tight_layout()
    # plt.show()
    plt.savefig("TMRFigures/cvae_reconst_examples_large_dataset.png")

ecg_ind_list = [3195, 2916, 1563, 1561]  # 2192 # 441 # 315

xs = test_df.loc[ecg_ind_list]["data"].tolist()
rs = test_df.loc[ecg_ind_list]["reconstruction"].tolist()
titles = test_df.loc[ecg_ind_list].apply(lambda x: f"{x['measDiag'].name} e = {x['r_err']:.3f}", axis=1)   # ["measDiag"].map(lambda x: x.name).tolist()
print(len(titles))

plot_ecg_and_reconstruction_for_classes(xs, rs, titles)

4


### Latent space exploration

In [43]:
# Try some latent space exploration

with torch.no_grad():
    model.eval()
    for i, (signals, _, _) in enumerate(test_dataloader):
        signals = torch.unsqueeze(signals.to(device), 1).float()
        # fft = torch.abs(torch.fft.fft(signals))
        # signals = torch.cat([signals, fft], dim=1)
        # labels = labels.type(torch.LongTensor)

        latent_position = model.encode(signals)
        latent_position = latent_position.detach().cpu().numpy()
        signals_np = signals.detach().cpu().numpy()

        print(latent_position.shape)

        break

index = 3
latent_positions = np.zeros((10, *latent_position.shape), dtype=np.float32)
for i in range(10):
    latent_positions[i, :, :] += latent_position
    latent_positions[i, :, index] = i * 4 - 2

signals = []

with torch.no_grad():
    model.eval()
    for l in latent_positions:
        latent = torch.from_numpy(l[:, :60]).to(device)
        signal = model.decode(latent)
        signals.append(signal.detach().cpu().numpy())

(32, 120)


### Interpolate between a noisy and clean ECG!

In [171]:
# noisy 441 # clean 315

noisy_latent = test_dataset.loc[441]["latent_encoding"][:60]
clean_latent = test_dataset.loc[315]["latent_encoding"][:60]

latent_sequence = np.linspace(noisy_latent, clean_latent, 32)
latent_sequence = torch.from_numpy(latent_sequence).to(device)

ecgs = model.decode(latent_sequence).detach().cpu().numpy()
print("plotting")

for ecg in np.flip(ecgs[:, 0, :], axis=0):
    plt.plot(ecg)
    plt.show()

plotting


### Find the reconstruction error for noisy and clean samples

In [66]:
# For NST data
# test_whole_ecgs_rec_err = test_dataset.groupby("rec_ind").agg({"r_err": "mean", "noise_level": lambda x: x.iloc[0]})
# For safer data
test_whole_ecgs_rec_err = test_dataset.groupby("rec_ind").agg({"r_err": "mean" ,"measDiag": lambda x: x.iloc[0]})
test_whole_ecgs_rec_err

Unnamed: 0_level_0,r_err,measDiag
rec_ind,Unnamed: 1_level_1,Unnamed: 2_level_1
590,1.461073,DiagEnum.Undecided
591,1.734338,DiagEnum.Undecided
592,1.272624,DiagEnum.Undecided
593,1.588059,DiagEnum.Undecided
594,1.382735,DiagEnum.Undecided
...,...,...
23182,1.404204,DiagEnum.Undecided
23183,1.498458,DiagEnum.Undecided
23184,1.509076,DiagEnum.Undecided
23185,1.468974,DiagEnum.Undecided


In [123]:
# A little bit of faff to plot the results from small and large datasets on one axis, for the TMR, not sure it made it into the report in the end
test_whole_ecgs_rec_err.to_pickle("TrainedModels/Autoecoder_small_dataset.pk")

In [41]:
test_not_undecided = test_whole_ecgs_rec_err

fig = plt.figure(figsize=(6, 4), dpi=300)
plt.scatter(test_not_undecided["measDiag"].map(lambda x: x.value), test_not_undecided["r_err"], marker='+')
plt.xticks([e.value for e in pd.unique(test_not_undecided["measDiag"])], [e.name for e in pd.unique(test_not_undecided["measDiag"])])
plt.ylabel("Reconstruction error")
plt.xlabel("Measurement diagnosis")
plt.tight_layout()
plt.show()

# plt.savefig("TMRFigures/cvae_reconst_err_large_dataset.png")

KeyError: 'measDiag'

In [68]:
test_whole_ecgs_rec_err["measDiag"].value_counts()

DiagEnum.Undecided      4569
DiagEnum.NoAF            172
DiagEnum.PoorQuality      99
DiagEnum.AF                3
Name: measDiag, dtype: int64

In [71]:
# Safer data

test_not_undecided = test_whole_ecgs_rec_err
# test_not_undecided_2 = pd.read_pickle("TrainedModels/Autoecoder_large_dataset.pk")

fig = plt.figure(figsize=(6, 4), dpi=300)

enum_order = [DiagEnum.NoAF, DiagEnum.AF, DiagEnum.PoorQuality]
data = [test_not_undecided[test_not_undecided["measDiag"] == e]["r_err"] for e in enum_order]

# print(test_not_undecided_2["measDiag"].value_counts())
# data_2 = [test_not_undecided_2[test_not_undecided_2["measDiag"] == e]["r_err"] for e in enum_order]

plt.violinplot(data)    # quantiles=[[0.25, 0.75]]*4, showmedians=True)
# plt.violinplot(data_2)
plt.xticks([1, 2, 3], [e.name for e in enum_order])
plt.ylabel("Reconstruction error")
plt.xlabel("Measurement diagnosis")
plt.tight_layout()
plt.show()

# plt.savefig("TMRFigures/cvae_reconst_err_small_dataset.png")

In [54]:
# NST data

test_not_undecided = test_whole_ecgs_rec_err

fig = plt.figure(figsize=(6, 4), dpi=300)

noise_levels = np.sort(pd.unique(test_not_undecided["noise_level"]))
print(noise_levels)
data = [test_not_undecided[test_not_undecided["noise_level"] == e]["r_err"] for e in noise_levels]


plt.violinplot(data)    # quantiles=[[0.25, 0.75]]*4, showmedians=True)
plt.xticks([1, 2, 3, 4, 5])
plt.ylabel("Reconstruction error")
plt.xlabel("Measurement diagnosis")
plt.tight_layout()
plt.show()

# plt.savefig("TMRFigures/cvae_reconst_err_small_dataset.png")

['00' '06' '12' '18' '_6']


### Sample cross validation code for SAFER (not yet applied to anything)

## Classification from the latent space

In [72]:
from sklearn.svm import SVC

with torch.no_grad():
    model.eval()
    latents = []
    inds = []

    for i, (signals, _, ind) in enumerate(train_dataloader):
        signals = torch.unsqueeze(signals.to(device), 1).float()
        # fft = torch.abs(torch.fft.fft(signals))
        # signals = torch.cat([signals, fft], dim=1)
        # labels = labels.type(torch.LongTensor)

        latent_position = model.encode(signals)
        latent_position = latent_position.detach().cpu().numpy()
        signals_np = signals.detach().cpu().numpy()

        for i, l in zip(ind, latent_position):
            latents.append(l)
            inds.append(i)

train_dataset["latent_encoding"] = pd.Series(data=latents, index=inds)
svc_train_df = train_dataset.dropna(subset=["latent_encoding"])

In [76]:
svc_train_df = svc_train_df[svc_train_df["measDiag"] != DiagEnum.Undecided]
train_dataset

Unnamed: 0,index,ptID,age,ptDiag,ptDiagRev1,ptDiagRev2,ptDiagRev3,cardRev,measDiag,measDiagRev1,...,unlikelyAF,perhapsAF,measID,data,file_path,class_index,length,rec_ind,rec_pos,latent_encoding
0,0,1,79.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,1,"[-0.25368181611599944, -0.45841503541331713, -...",ECGs/000000/saferF2_000001,0,9120,0,0,
1,0,1,79.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,1,"[0.2026567814899965, 0.184902954707474, 0.1763...",ECGs/000000/saferF2_000001,0,9120,0,1,
2,0,1,79.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,1,"[0.0664598504863543, 0.0779387026676995, 0.087...",ECGs/000000/saferF2_000001,0,9120,0,2,
3,0,1,79.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,1,"[1.3574057452010926, 1.0019610647315154, 0.645...",ECGs/000000/saferF2_000001,0,9120,0,3,
4,0,1,79.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,1,"[-0.025626811413227674, -0.038046522783888745,...",ECGs/000000/saferF2_000001,0,9120,0,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128907,23258,288,70.5,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,23259,"[-0.801575410206255, -0.7448302284482392, -0.6...",ECGs/023000/saferF2_023259,0,9120,23258,2,
128908,23258,288,70.5,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,23259,"[-0.9279949051780753, -0.9494823052111254, -0....",ECGs/023000/saferF2_023259,0,9120,23258,3,
128909,23258,288,70.5,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,23259,"[0.4556109704687446, 0.3097769511612076, 0.185...",ECGs/023000/saferF2_023259,0,9120,23258,4,
128910,23258,288,70.5,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.NoAF,DiagEnum.Undecided,1,DiagEnum.Undecided,DiagEnum.Undecided,...,0,0,23259,"[0.21881221826302832, 0.4723944748082791, 0.78...",ECGs/023000/saferF2_023259,0,9120,23258,5,


#### Visualise the data with scatter plots and T-SNE

In [69]:
latent_list = list(svc_train_df["latent_encoding"].map(lambda x: x[:60].tolist()).values)
latent_df = pd.DataFrame(latent_list, index=svc_train_df.index)
print(latent_df.columns)

latent_ind = 0

# scatter plot
for i in range(60):
    plt.figure(figsize=(6, 4), dpi=300)
    for d in [DiagEnum.NoAF, DiagEnum.PoorQuality, DiagEnum.AF, DiagEnum.CannotExcludePathology]:
        plt.scatter(latent_df[svc_train_df["measDiag"] == d][0], latent_df[svc_train_df["measDiag"] == d][i], marker="x", label=d.name)
    plt.legend()
    plt.ylabel(f"latent mean {i}")
    plt.xlabel(f"latent mean 0")
    plt.show()

RangeIndex(start=0, stop=60, step=1)


KeyboardInterrupt: 

In [74]:
from sklearn.manifold import TSNE

latent_matrix = np.array(list(svc_train_df["latent_encoding"].map(lambda x: x[:60].tolist()).values))
latent_classes = svc_train_df["measDiag"].values

print("starting tsne")
tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=30)

X_embedded = tsne.fit_transform(latent_matrix)

for d in [DiagEnum.NoAF, DiagEnum.PoorQuality, DiagEnum.AF, DiagEnum.CannotExcludePathology]:
    plt.scatter(X_embedded[latent_classes == d, 0], X_embedded[latent_classes == d, 1], marker="x", label=d.name)

plt.legend()
plt.show()

starting tsne


ValueError: perplexity must be less than n_samples

#### Group all the segments together

In [36]:
svc_train_df = svc_train_df[svc_train_df["measDiag"] != DiagEnum.Undecided]

def concatenate_means(x):
    mean_series = x.map(lambda x: x[:60])
    return np.concatenate(mean_series.tolist())

full_ecg_train_df = svc_train_df.groupby("rec_ind").agg({"latent_encoding": concatenate_means, "measDiag": lambda x: x.iloc[0], "class_index": lambda x: x.iloc[0]})
full_ecg_train_df.iloc[0]["latent_encoding"].shape

(420,)

In [37]:
# Try a T-SNE now all the segments are together

latent_matrix = np.array(list(full_ecg_train_df["latent_encoding"].map(lambda x: x.tolist()).values))
latent_classes = full_ecg_train_df["measDiag"].values

print("starting tsne")
tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=30)

X_embedded = tsne.fit_transform(latent_matrix)

for d in [DiagEnum.NoAF, DiagEnum.PoorQuality, DiagEnum.AF, DiagEnum.CannotExcludePathology]:
    plt.scatter(X_embedded[latent_classes == d, 0], X_embedded[latent_classes == d, 1], marker="x", label=d.name)

plt.legend()
plt.show()

starting tsne


In [39]:
train_matrix = np.vstack(full_ecg_train_df["latent_encoding"].values)
targets =  np.array(full_ecg_train_df["class_index"].astype(int).values)

print(train_matrix.shape)

# class weightings?
classifier = SVC()
classifier = classifier.fit(train_matrix, targets)

(655, 420)


### Testing

In [40]:
test_dataset["latent_encoding"] = None
inds = []
latents = []

with torch.no_grad():
    model.eval()
    for i, (signals, _,  ind) in enumerate(test_dataloader):
        signals = torch.unsqueeze(signals.to(device), 1).float()
        # fft = torch.abs(torch.fft.fft(signals))
        # signals = torch.cat([signals, fft], dim=1)
        # labels = labels.type(torch.LongTensor)

        latent_position = model.encode(signals)
        latent_position = latent_position.detach().cpu().numpy()

        for i, l in zip(ind, latent_position):
            inds.append(int(i))
            latents.append(l)

test_dataset["latent_encoding"] = pd.Series(data=latents, index=inds)

In [42]:
from sklearn.svm import SVC

full_ecg_test_df = test_dataset.groupby("rec_ind").agg({"latent_encoding": concatenate_means, "measDiag": lambda x: x.iloc[0], "class_index": lambda x: x.iloc[0], "measID": lambda x: x.iloc[0]})
full_ecg_no_undecided_test_df = full_ecg_test_df[full_ecg_test_df["measDiag"] != DiagEnum.Undecided]

test_matrix = np.vstack(full_ecg_no_undecided_test_df["latent_encoding"].values)
targets =  np.array(full_ecg_no_undecided_test_df["class_index"].astype(int).values)
print(test_matrix.shape)

prediction = classifier.predict(test_matrix)

full_ecg_no_undecided_test_df["prediction"] = prediction

(586, 420)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_ecg_no_undecided_test_df["prediction"] = prediction


In [43]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(full_ecg_no_undecided_test_df["class_index"].astype(int), full_ecg_no_undecided_test_df["prediction"].astype(int))
print("Confusion matrix:")
print(conf_mat)

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Noisy F1: {F1_ind(conf_mat, 1)}")

[[280 142]
 [ 41 123]]
Sensitivity: 0.75
Specificity: 0.6635071090047393
Normal F1: 0.7537012113055181
Noisy F1: 0.5734265734265734


In [51]:
false_positives = full_ecg_no_undecided_test_df[(full_ecg_no_undecided_test_df["class_index"] == 0) & (full_ecg_no_undecided_test_df["prediction"] == 1)]

for _, ecg in feas2_ecg_data[feas2_ecg_data["measID"].isin(false_positives["measID"])]["data"].iteritems():
    plot_ecg_and_reconstruction(ecg, ecg)
    plt.show()

  for _, ecg in feas2_ecg_data[feas2_ecg_data["measID"].isin(false_positives["measID"])]["data"].iteritems():


KeyboardInterrupt: 