In [1]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import numpy as np
import pandas as pd

import sys
import os

import scipy.signal
from ecgdetectors import Detectors

import math
from DataHandlers.DiagEnum import DiagEnum
import DataHandlers.DiagEnum
import DataHandlers.SAFERDataset as SAFERDataset
import DataHandlers.CinC2020Dataset as CinC2020Dataset
import DataHandlers.CinC2020Enums
import importlib
import DataHandlers.CinCDataset as CinCDataset
import DataHandlers.DataAugmentations as DataAugmentations
from multiprocesspandas import applyparallel
importlib.reload(SAFERDataset)
importlib.reload(CinC2020Dataset)

import DataHandlers.DataProcessUtilities
importlib.reload(DataHandlers.DataProcessUtilities)
from DataHandlers.DataProcessUtilities import *
import Utilities.Plotting
importlib.reload(Utilities.Plotting)
from Utilities.Plotting import *

# A fudge because I moved the files
sys.modules["SAFERDataset"] = SAFERDataset
sys.modules["CinC2020Dataset"] = CinC2020Dataset
sys.modules["DiagEnum"] = DataHandlers.DiagEnum
sys.modules["CinC2020Enums"] = DataHandlers.CinC2020Enums
sys.modules["CinCDataset"] = CinCDataset

In [3]:
from scipy.special import softmax

print(softmax(np.array([-0.63554263, 0.14963047, -0.018686041])))

[0.19818147 0.43456966 0.36724887]


In [2]:
import torch
from torch import nn

enable_cuda = True

if torch.cuda.is_available() and enable_cuda:
    print("Using Cuda")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

Using Cuda


### Load SAFER data

In [5]:
feas2_pt_data, feas2_ecg_data = SAFERDataset.load_feas_dataset(2, "dataframe_reload")
feas2_ecg_data["measID"] += 300000
feas2_ecg_data.index = feas2_ecg_data["measID"]

D:\2022_23_DSiromani\Feas2\ECGs/filtered_dataframe_reload.pk


In [45]:
from sklearn.metrics import confusion_matrix
confusion_matrix(feas2_ecg_data[feas2_ecg_data["measDiag"] != DiagEnum.Undecided]["tag_orig_Poor_Quality"], feas2_ecg_data[feas2_ecg_data["measDiag"] != DiagEnum.Undecided]["class_index"])

array([[768, 342],
       [  8, 123]], dtype=int64)

In [63]:
feas2_ecg_data["feas"] = 2

In [6]:
def reduce_normals_all_other_af(pt_data, ecg_data):
    accepted_meas_diags = [DiagEnum.AF, DiagEnum.NoAF, DiagEnum.HeartBlock]
    ecg_data = ecg_data[(ecg_data["measDiag"].isin(accepted_meas_diags)) | (ecg_data["measID"] < 20000) | (ecg_data["not_tagged_ign_wide_qrs"] == 0)]
    pt_data = pt_data[pt_data["ptID"].isin(ecg_data["ptID"])]

    return pt_data, ecg_data

# warning: changing these chunk sizes may reload feas1 data from scratch, which will take ages
chunk_size = 20000
num_chunks = math.ceil(162515 / chunk_size )

def load_feas1_chunk_range(chunk_range=(0, num_chunks)):
    ecg_data = []
    pt_data = []

    for chunk_num in range(chunk_range[0], chunk_range[1]):
        feas1_pt_data, feas1_ecg_data = SAFERDataset.load_feas_dataset(1, f"dataframe_{chunk_num}.pk")

        ecg_data.append(feas1_ecg_data)
        pt_data.append(feas1_pt_data)

    feas1_ecg_data = pd.concat(ecg_data)
    feas1_ecg_data["feas"] = 1
    feas1_ecg_data["rri_len"] = feas1_ecg_data["rri_feature"].map(lambda x: x[x > 0].shape[-1])
    feas1_pt_data = pd.concat(pt_data).drop_duplicates()

    return feas1_ecg_data, feas1_pt_data

In [58]:
feas1_ecg_data, feas1_pt_data = load_feas1_chunk_range((0, num_chunks))

D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_0.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_1.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_2.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_3.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_4.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_5.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_6.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_7.pk.pk
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_8.pk.pk


In [7]:
def prepare_safer_data(pt_data, ecg_data):
    if "length" in ecg_data:
        ecg_data = ecg_data[ecg_data["length"] == 9120]

    ecg_data = ecg_data[ecg_data["measDiag"] != DiagEnum.PoorQuality]
    # ecg_data = ecg_data[ecg_data["tag_orig_Poor_Quality"] == 0]

    ecg_data = ecg_data[ecg_data["rri_len"] > 5]


    pt_data.index = pt_data["ptID"]
    ecg_data = SAFERDataset.generate_af_class_labels(ecg_data)
    pt_data = SAFERDataset.add_ecg_class_counts(pt_data, ecg_data)

    return pt_data, ecg_data

In [11]:
# just use feas2
safer_ecg_data = feas2_ecg_data
safer_ecg_data["ffReview_sent"] = -1
safer_ecg_data["ffReview_remain"] = -1
safer_pt_data = feas2_pt_data

safer_pt_data, safer_ecg_data = prepare_safer_data(safer_pt_data, safer_ecg_data)

In [61]:
# Just use feas1 to prepare test and validation datasets (The train is best handled with a DatasetSequenceIterator)
feas1_pt_data, feas1_ecg_data = prepare_safer_data(feas1_pt_data, feas1_ecg_data)
feas1_ecg_data["class_index"].value_counts()

feas1_ecg_data_test = feas1_ecg_data[feas1_ecg_data["ptID"].isin(test_pts["ptID"])]
feas1_ecg_data_val = feas1_ecg_data[feas1_ecg_data["ptID"].isin(val_pts["ptID"])]

print(feas1_ecg_data_test["class_index"].value_counts())
print(feas1_ecg_data_val["class_index"].value_counts())

0    22274
2      377
1      102
Name: class_index, dtype: int64
0    22862
2      452
1      126
Name: class_index, dtype: int64


In [358]:
# Undersample normals and produce a dataloader
feas1_ecg_data_train_norm = feas1_ecg_data[(feas1_ecg_data["class_index"] == 0) & (feas1_ecg_data["ptID"].isin(train_pts["ptID"]))].sample(frac=0.3)
feas1_ecg_data_train_not_norm = feas1_ecg_data[(feas1_ecg_data["class_index"] != 0) & (feas1_ecg_data["ptID"].isin(train_pts["ptID"]))]

feas1_ecg_data_test_norm = feas1_ecg_data[(feas1_ecg_data["class_index"] == 0) & (feas1_ecg_data["ptID"].isin(test_pts["ptID"]))].sample(frac=0.3)
feas1_ecg_data_test_not_norm = feas1_ecg_data[(feas1_ecg_data["class_index"] != 0) & (feas1_ecg_data["ptID"].isin(test_pts["ptID"]))]

feas1_ecg_data_test_undersamp = pd.concat([feas1_ecg_data_test_norm, feas1_ecg_data_test_not_norm])
feas1_ecg_data_train_undersamp = pd.concat([feas1_ecg_data_train_norm, feas1_ecg_data_train_not_norm])
print(feas1_ecg_data_train_undersamp["class_index"].value_counts())

feas1_train_dataloader_undersamp = get_dataloaders(feas1_ecg_data_train_undersamp, 64)
feas1_test_dataloader_undersamp = get_dataloaders(feas1_ecg_data_test_undersamp, 64)

0    31274
2     2281
1      513
Name: class_index, dtype: int64


In [163]:
doc_path = r"C:\Users\daniel\Documents"

feas1_ecg_data_test.to_pickle(os.path.join(doc_path, "feas1_test_27_mar.pk"))
feas1_ecg_data_val.to_pickle(os.path.join(doc_path, "feas1_val_27_mar.pk"))

In [64]:
safer_ecg_data = pd.concat([feas2_ecg_data, feas1_ecg_data])
safer_pt_data = pd.concat([feas2_pt_data, feas1_pt_data])

safer_pt_data, safer_ecg_data = prepare_safer_data(safer_pt_data, safer_ecg_data)

In [9]:
safer_ecg_data.groupby("feas")["class_index"].value_counts()

feas  class_index
2     0              19513
      2                757
      1                 16
Name: class_index, dtype: int64

In [66]:
# Plot a heartrate histogram for AF and not AF
fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
ax.hist(safer_ecg_data["heartrate"][(safer_ecg_data["measDiag"] != DiagEnum.AF) & (safer_ecg_data["feas"] == 1)], alpha=0.7, density=True, label="Normal or Other Rhythm")
ax.hist(safer_ecg_data["heartrate"][(safer_ecg_data["measDiag"] == DiagEnum.AF) & (safer_ecg_data["feas"] == 1)], alpha=0.7, density=True, label="AF")
ax.set_xlabel("Heartrate (bpm)")
ax.set_ylabel("Frequency proportion")
ax.legend()

fig.tight_layout()
fig.show()

In [16]:
# Cut out high and low heartrates - I dont think this makes a difference so havent been doing it mostly
safer_ecg_data = safer_ecg_data[(safer_ecg_data["heartrate"] < 120) & (safer_ecg_data["heartrate"] > 50)]

In [None]:
for _, ecg in safer_ecg_data[safer_ecg_data["feas"] == 1].sample(frac=1).iterrows():
    print(ecg[["measDiag", "class_index", "heartrate", "r_peaks"]])
    plot_ecg(ecg["data"], r_peaks=ecg["r_peaks"], fs=300, n_split=3)
    plt.show()

In [25]:
# Plot the 1 feas2 AF example with high heartrate!

plot_ecg(safer_ecg_data.loc[310209]["data"], r_peaks=safer_ecg_data.loc[310209]["r_peaks"], fs=300, n_split=3, figsize=(6, 5), export_quality=True)
plot_ecg_spectrogram(safer_ecg_data.loc[310209]["data"], fs=300, n_split=3, figsize=(6, 5), export_quality=True, cut_range=(2, 18))
plt.show()

### Load CinC 2020 data

In [5]:
import DataHandlers.CinC2020Dataset as CinC2020Dataset
import importlib
importlib.reload(CinC2020Dataset)

df = CinC2020Dataset.load_dataset(save_name="dataframe_2")

In [7]:
# At the moment we only select data with length which can be truncated to 3000 samples (10s)
def select_length(df):
    df_within_range = df[(df["length"] <= 5000) & (df["length"] >= 3000)].copy()
    df_within_range["data"] = df_within_range["data"].map(lambda x: x[:3000])
    df_within_range["length"] = df_within_range["data"].map(lambda x: x.shape[0])
    return df_within_range

df = select_length(df)

In [67]:
# Plot a heartrate histogram for AF and not AF
fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
ax.hist(df["heartrate"][(df["measDiag"] != DiagEnum.AF)], alpha=0.7, density=True, label="Normal or Other Rhythm")
ax.hist(df["heartrate"][(df["measDiag"] == DiagEnum.AF)], alpha=0.7, density=True, label="AF")
ax.set_xlabel("Heartrate (bpm)")
ax.set_ylabel("Frequency proportion")
ax.legend()

fig.tight_layout()
fig.show()

In [26]:
noise = noise_df.sample()["data"].iloc[0] * np.random.normal(scale=1)

for _, ecg in df.iterrows():
    noise_scale = np.random.normal(scale=0.2)
    noise = noise_df.sample()["data"].iloc[0] * noise_scale
    print(noise_scale)
    plot_ecg(ecg["data"], figsize=(5, 2), export_quality=True)
    plot_ecg(ecg["data"] + noise, figsize=(5, 2), export_quality=True)
    plot_ecg(noise, figsize=(5, 2), export_quality=True)
    plt.show()

-0.04636585375203334
-0.12941265829123266


KeyboardInterrupt: 

In [83]:

for _, ecg in df[df["class_index"] == 0].iterrows():
    plot_ecg(ecg["data"][:1500], figsize=(5, 2.5), export_quality=True)
    plt.show()

KeyboardInterrupt: 

In [4]:
df.groupby("dataset")["class_index"].value_counts()

dataset               class_index
cpsc_2018             2               2047
                      0                984
                      1                903
cpsc_2018_extra       2                364
                      0                350
                      1                113
georgia               2               5257
                      0               3508
                      1                566
ptb-xl                0              10692
                      2               9349
                      1               1514
st_petersburg_incart  0              10955
                      1               1010
                      2                363
Name: class_index, dtype: int64

### Load noise from MIT database

In [16]:
import wfdb
import os
from scipy import signal

noises = ["em", "ma"]
noise_dfs = []
mit_dataset_path = "Datasets/mit-bih-noise-stress-test-database"

f_low = 0.67
f_high = 25

def split_signal(data, split_len):
    data_splits = []
    splits = np.arange(0, data["data"].shape[0], split_len)

    for i, (start, end) in enumerate(zip(splits, splits[1:])):
        data_split = data.copy()
        data_split["data"] = data["data"][start:end]
        data_split["data"] = (data_split["data"] - data_split["data"].mean())/ data_split["data"].std()

        data_split.name = i
        data_splits.append(data_split)

    return data_splits


for n_path in noises:
    rec = wfdb.rdrecord(os.path.join(mit_dataset_path, n_path))
    sig = np.concatenate([rec.p_signal[:, 0], rec.p_signal[:, 1]])

    bandpass = signal.butter(3, [f_low, f_high], 'bandpass', fs=rec.fs, output='sos')
    notch = signal.butter(3, [48, 52], 'bandstop', fs=rec.fs, output='sos')

    sig = filter_and_norm(sig, bandpass)
    sig = filter_and_norm(sig, notch)

    sig = resample(sig, rec.fs, 300)
    sig_series = pd.Series(data={"data": sig, "fs": 300, "noise_type": n_path})

    split_signals = split_signal(sig_series, 3000)
    split_signals = pd.DataFrame(split_signals)

    noise_dfs.append(split_signals)

noise_df = pd.concat(noise_dfs, ignore_index=True)

### Load CinC2017 Dataset

In [11]:
importlib.reload(CinCDataset)
import DataHandlers.DataProcessUtilities
importlib.reload(DataHandlers.DataProcessUtilities)
from DataHandlers.DataProcessUtilities import *

cinc2017_df = CinCDataset.load_cinc_dataset()

100%|██████████| 8528/8528 [00:04<00:00, 1971.58it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ecg_data["rri_len"] = ecg_data["rri_feature"].map(lambda x: x[x > 0].shape[-1])


In [12]:
cinc2017_df = cinc2017_df[cinc2017_df["length"] == 9000]
cinc2017_df["measDiag"].value_counts()

DiagEnum.NoAF                      3694
DiagEnum.CannotExcludePathology    1649
DiagEnum.AF                         504
DiagEnum.PoorQuality                123
Name: measDiag, dtype: int64

In [13]:
cinc2017_df = cinc2017_df[cinc2017_df["length"] == 9000]
cinc2017_df = cinc2017_df[cinc2017_df["measDiag"] != DiagEnum.PoorQuality]

In [14]:
cinc2017_df["class_index"].value_counts()

0    5847
Name: class_index, dtype: int64

In [17]:
# Plot a heartrate histogram for AF and not AF
fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
ax.hist(cinc2017_df["heartrate"][(cinc2017_df["class_index"] != 1)], alpha=0.7, density=True, label="Normal or Other Rhythm")
ax.hist(cinc2017_df["heartrate"][(cinc2017_df["class_index"] == 1)], alpha=0.7, density=True, label="AF")
ax.set_xlabel("Heartrate (bpm)")
ax.set_ylabel("Frequency proportion")
ax.legend()

fig.tight_layout()
fig.show()

In [81]:
ecgs = cinc2017_df[(cinc2017_df["class_index"] == 2) & (cinc2017_df["heartrate"] > 120)]

for _, ecg in ecgs.iterrows():
    plot_ecg(ecg["data"][:3000], 300, n_split=1, r_peaks=ecg["r_peaks"], figsize=(6, 2.5), export_quality=True)
    plot_ecg_spectrogram(ecg["data"][:3000], 300, n_split=1, cut_range=[2, 18], figsize=(6, 2.5), export_quality=True)
    plot_ecg_poincare(ecg["rri_feature"][:10], 10)# ecg["rri_len"])
    plt.show()

KeyboardInterrupt: 

In [46]:
ecg = cinc2017_df.loc["A02650"]

plot_ecg(ecg["data"], 300, n_split=3, r_peaks=ecg["r_peaks"], figsize=(6, 2.5), export_quality=True)
plot_ecg_drr(ecg["rri_feature"], ecg["rri_len"], export_quality=True)# ecg["rri_len"])

### Generate dataloaders

In [15]:
mapper = CinC2020Dataset.CinC2020DiagMapper()
num_unique_classes = len(mapper.diag_desc.index)

# Note this only gets used for CinC data - the safer data labels were decided to have different meanings
def class_index_map(diag):
    if diag == DiagEnum.NoAF:
        return 0
    elif diag == DiagEnum.AF:
        return 1
    elif diag == DiagEnum.CannotExcludePathology:
        return 2
    elif diag == DiagEnum.Undecided:
        return 0

In [16]:
cinc2017_df["class_index"] = cinc2017_df["measDiag"].map(class_index_map)

In [10]:
# Onehot encoding
from torch.utils.data import Dataset, DataLoader

class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, dataset):
        'Initialization'
        self.dataset = dataset
        self.noise_prob = 0
        self.temp_warp = 0

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.dataset.index)

    def set_noise_prob(self, prob, power_std, noise_df):
        self.noise_prob = prob
        self.noise_power_std = power_std
        self.noise_df = noise_df

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        row = self.dataset.iloc[index]

        data = row["data"]
        rri = row["rri_feature"]
        rri_len = row["rri_len"]

        warp = np.random.binomial(1, self.temp_warp)
        if warp:
            data, r_peaks = DataAugmentations.temporal_warp(data, row["r_peaks_hamilton"])
            rri = get_rri_feature(r_peaks, 20)

        add_noise = np.random.binomial(1, self.noise_prob)
        if add_noise:
            noise = noise_df.sample()["data"].iloc[0] * np.random.normal(scale=self.noise_power_std)
            data += noise

        X = (data, rri, rri_len)
        y = row["class_index"]
        ind = row.name

        return X, y, ind

In [11]:
# For SAFER data
# Split train and test data according to each patient
# Note this function stratifies for AF and non AF!
def generate_patient_splits(pt_data, test_frac, val_frac):
    train_patients = []
    test_patients = []
    val_patients = []

    test_val_frac = test_frac + val_frac
    val_second_frac = val_frac/test_val_frac

    for val, df in pt_data.groupby("noAFRecs"):
        print(f"processing {val}")
        print(f"number of patients {len(df.index)}")



        n = math.floor(len(df.index) * test_val_frac)
        if  test_val_frac > 0:
            res = ((len(df.index) * test_val_frac) - n)/test_val_frac
        else:
            res = 0
        n += np.random.binomial(res, test_val_frac)
        test_val = df.sample(n)

        n = math.floor(len(test_val.index) * val_second_frac)
        if  val_second_frac > 0:
            res = ((len(test_val.index) * val_second_frac) - n)/val_second_frac
        else:
            res = 0
        n += np.random.binomial(res, val_second_frac)
        val = test_val.sample(n)
        val_patients.append(val)

        test_patients.append(test_val[~test_val["ptID"].isin(val["ptID"])])
        train_patients.append(df[~df["ptID"].isin(test_val["ptID"])])

    train_pt_df = pd.concat(train_patients)
    test_pt_df = pd.concat(test_patients)
    val_pt_df = pd.concat(val_patients)

    return train_pt_df, test_pt_df, val_pt_df


def make_SAFER_dataloaders(pt_data, ecg_data, test_frac, val_frac, batch_size=128):
    train_pt_df, test_pt_df, val_pt_df = generate_patient_splits(pt_data, test_frac, val_frac)

    print(f"Test AF: {test_pt_df['noAFRecs'].sum()} Normal: {test_pt_df['noNormalRecs'].sum()} Other: {test_pt_df['noOtherRecs'].sum()}")
    print(f"Train AF: {train_pt_df['noAFRecs'].sum()} Normal: {train_pt_df['noNormalRecs'].sum()} Other: {train_pt_df['noOtherRecs'].sum()}")
    print(f"Val AF: {val_pt_df['noAFRecs'].sum()} Normal: {val_pt_df['noNormalRecs'].sum()} Other: {val_pt_df['noOtherRecs'].sum()}")

    train_dataloader = None
    test_dataloader = None
    val_dataloader = None

    train_dataset = None
    test_dataset = None
    val_dataset = None

    if not train_pt_df.empty:
        # get ECG datasets
        train_dataset = ecg_data[ecg_data["ptID"].isin(train_pt_df["ptID"])]
        # Normalise
        train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())
        torch_dataset_train = Dataset(train_dataset)
        train_dataloader = DataLoader(torch_dataset_train, batch_size=batch_size, shuffle=True, pin_memory=True)

    if not test_pt_df.empty:
        test_dataset = ecg_data[(ecg_data["ptID"].isin(test_pt_df["ptID"]))]
        test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())
        torch_dataset_test = Dataset(test_dataset)
        test_dataloader = DataLoader(torch_dataset_test, batch_size=batch_size, shuffle=True, pin_memory=True)

    if not val_pt_df.empty:
        val_dataset = ecg_data[(ecg_data["ptID"].isin(val_pt_df["ptID"]))]
        val_dataset["data"] = (val_dataset["data"] - val_dataset["data"].map(lambda x: x.mean()))/val_dataset["data"].map(lambda x: x.std())
        torch_dataset_val = Dataset(val_dataset)
        val_dataloader = DataLoader(torch_dataset_val, batch_size=batch_size, shuffle=True, pin_memory=True)

    return train_dataloader, test_dataloader, val_dataloader, train_dataset, test_dataset, val_dataset

In [70]:
train_dataloader_safer, test_dataloader_safer, val_dataloader_safer, train_dataset_safer, test_dataset_safer, val_dataset_safer = make_SAFER_dataloaders(safer_pt_data, safer_ecg_data, test_frac=0.15, val_frac=0.15, batch_size=32)

processing 0.0
number of patients 2366
processing 1.0
number of patients 12
processing 2.0
number of patients 11
processing 3.0
number of patients 4
processing 4.0
number of patients 5
processing 5.0
number of patients 3
processing 6.0
number of patients 1
processing 8.0
number of patients 2
processing 9.0
number of patients 2
processing 10.0
number of patients 3
processing 11.0
number of patients 3
processing 18.0
number of patients 2
processing 19.0
number of patients 2
processing 22.0
number of patients 2
processing 26.0
number of patients 1
processing 29.0
number of patients 2
processing 35.0
number of patients 2
processing 39.0
number of patients 1
processing 45.0
number of patients 1
processing 53.0
number of patients 1
processing 62.0
number of patients 1
processing 80.0
number of patients 1
processing 94.0
number of patients 1
Test AF: 155.0 Normal: 24905.0 Other: 853.0
Train AF: 498.0 Normal: 118092.0 Other: 2360.0
Val AF: 176.0 Normal: 25902.0 Other: 713.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())


NameError: name 'Dataset' is not defined

In [8]:
def get_dataloaders(dataset, batch_size=32):
    torch_dataset = Dataset(dataset)
    dataloader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    return dataloader

In [16]:
# validate on Feas2 and train/test on feas1
val_dataset_safer = safer_ecg_data[safer_ecg_data["feas"] == 2]
val_dataloader_safer = get_dataloaders(val_dataset_safer)

In [17]:
val_dataset_safer["class_index"].value_counts()

0    19513
2      757
1       16
Name: class_index, dtype: int64

In [9]:
### Make dataloaders for CinC data - separate cpsc as the validation set
from sklearn.model_selection import train_test_split

val_dataset = df[df["dataset"] == "cpsc_2018"]
train_dataset, test_dataset = train_test_split(df[df["dataset"] != "cpsc_2018"], test_size=0.15, stratify=df[df["dataset"] != "cpsc_2018"]["class_index"])
# test_dataset, val_dataset = train_test_split(test_dataset, test_size=0.5, stratify=test_dataset["class_index"])

test_dataset = test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided]  # Should just remove any errors in loading the dataset
val_dataset = val_dataset[val_dataset["measDiag"] != DiagEnum.Undecided]  # Should just remove any errors in loading the dataset

torch_dataset_test = Dataset(test_dataset)
test_dataloader = DataLoader(torch_dataset_test, batch_size=128, shuffle=True, pin_memory=True)

torch_dataset_val = Dataset(val_dataset)
val_dataloader = DataLoader(torch_dataset_val, batch_size=128, shuffle=True, pin_memory=True)

torch_dataset_train = Dataset(train_dataset)
# torch_dataset_train.temp_warp = 0.2
# torch_dataset_train.set_noise_prob(0.1, 0.2, noise_df)
train_dataloader = DataLoader(torch_dataset_train, batch_size=128, shuffle=True, pin_memory=True)

In [20]:
print(train_dataset["class_index"].value_counts())

0    21679
2    13033
1     2722
Name: class_index, dtype: int64


In [90]:
# Set the proportion of AF samples in the test data to that of the train data

val_df_counts = val_dataset["class_index"].value_counts()
train_df_counts = train_dataset["class_index"].value_counts()

train_not_af = train_df_counts.loc[2] + train_df_counts.loc[0]
val_not_af = val_df_counts.loc[2] + val_df_counts.loc[0]

val_af_wanted = int(round((train_df_counts.loc[1]/train_not_af) * val_not_af))

wanted_af_samples = val_dataset[val_dataset["class_index"] == 1].sample(val_af_wanted)
val_dataset = pd.concat([val_dataset[val_dataset["class_index"] != 1], wanted_af_samples])

torch_dataset_val = Dataset(val_dataset)
val_dataloader = DataLoader(torch_dataset_val, batch_size=32, shuffle=True, pin_memory=True)

In [63]:
### CinC2017 data
from sklearn.model_selection import train_test_split

test_size = 0.15
val_size = 0.15

train_dataset_2017, test_val = train_test_split(cinc2017_df.dropna(subset="class_index"), test_size=test_size + val_size, stratify=cinc2017_df["class_index"].dropna())
test_dataset_2017, val_dataset_2017 = train_test_split(test_val, test_size=val_size/(test_size + val_size), stratify=test_val["class_index"])

# test_dataset_2017 = test_dataset_2017[test_dataset_2017["measDiag"] != DiagEnum.Undecided]  # Should just remove any errors in loading the dataset

torch_dataset_test = Dataset(test_dataset_2017)
test_dataloader_2017 = DataLoader(torch_dataset_test, batch_size=32, shuffle=True, pin_memory=True)

torch_dataset_train = Dataset(train_dataset_2017)
train_dataloader_2017 = DataLoader(torch_dataset_train, batch_size=32, shuffle=True, pin_memory=True)

torch_dataset_val = Dataset(val_dataset_2017)
val_dataloader_2017 = DataLoader(torch_dataset_val, batch_size=32, shuffle=True, pin_memory=True)

In [64]:
print(train_dataset_2017["class_index"].value_counts())
print(test_dataset_2017["class_index"].value_counts())
print(val_dataset_2017["class_index"].value_counts())

0    2585
2    1154
1     353
Name: class_index, dtype: int64
0    554
2    247
1     76
Name: class_index, dtype: int64
0    555
2    248
1     75
Name: class_index, dtype: int64


In [184]:
# Save the CinC2017 data splits for consistent results!
train_dataset_2017.to_pickle("TrainedModels/19_May_cinc_2017_train.pk")
test_dataset_2017.to_pickle("TrainedModels/19_May_cinc_2017_test.pk")
val_dataset_2017.to_pickle("TrainedModels/19_May_cinc_2017_val.pk")

In [251]:
train_dataset_2017 = pd.read_pickle("TrainedModels/19_May_cinc_2017_train.pk")
test_dataset_2017 = pd.read_pickle("TrainedModels/19_May_cinc_2017_test.pk")
val_dataset_2017 = pd.read_pickle("TrainedModels/19_May_cinc_2017_val.pk")

train_dataloader_2017 = get_dataloaders(train_dataset_2017, 32)
test_dataloader_2017 = get_dataloaders(test_dataset_2017, 32)
val_dataloader_2017 = get_dataloaders(val_dataset_2017, 32)

In [131]:
### Use whole CinC2017 as a test
dataset_2017 = cinc2017_df[cinc2017_df["measDiag"] != DiagEnum.Undecided].dropna(subset="class_index")

torch_dataset = Dataset(dataset_2017)
dataloader_2017 = DataLoader(torch_dataset, batch_size=32, shuffle=True, pin_memory=True)

In [18]:
val_dataset_2017["class_index"].value_counts()

0    555
2    248
1     75
Name: class_index, dtype: int64

### Use the noise detector to filter the datasets

In [13]:
# Filter noisy things out of SAFER
import Models.NoiseCNN
import importlib

importlib.reload(Models.NoiseCNN)
from Models.NoiseCNN import CNN, hyperparameters

noiseDetector = CNN(**hyperparameters).to(device)
noiseDetector.load_state_dict(torch.load("TrainedModels/CNN_16_may_final_no_undecided.pt", map_location=device))
noiseDetector.eval()

def add_noise_predictions(nd, dataloader, dataset):
    noise_ps = []
    inds = []

    with torch.no_grad():
        for i, (signals, labels, ind) in enumerate(dataloader):
            signal = signals[0].to(device).float()
            noise_prob = nd(torch.unsqueeze(signal, 1)).detach().to("cpu").numpy()

            for i, n in zip(ind, noise_prob):
                if type(i) == str:
                    inds.append(i)
                else:
                    inds.append(i.item())
                noise_ps.append(float(n))

    if dataset is not None:
        dataset["noise_probs"] = pd.Series(data=noise_ps, index=inds)
    else:
        return pd.Series(data=noise_ps, index=inds)

In [19]:
add_noise_predictions(noiseDetector, val_dataloader_safer, val_dataset_safer)
# add_noise_predictions(noiseDetector, test_dataloader_safer, test_dataset_safer)
# add_noise_predictions(noiseDetector, train_dataloader_safer, train_dataset_safer)

In [20]:
# Remove the noisy samples
# train_dataset_safer_clean = train_dataset_safer[train_dataset_safer["noise_probs"] < 0]
# test_dataset_safer_clean = test_dataset_safer[test_dataset_safer["noise_probs"] < 0]
val_dataset_safer_clean = val_dataset_safer[val_dataset_safer["noise_probs"] < 0]

# print(len(train_dataset_safer_clean.index))
# print(len(test_dataset_safer_clean.index))
print(len(val_dataset_safer_clean.index))

# train_dataloader_safer_clean = get_dataloaders(train_dataset_safer_clean)
# test_dataloader_safer_clean = get_dataloaders(test_dataset_safer_clean)
val_dataloader_safer_clean = get_dataloaders(val_dataset_safer_clean)

17861


In [252]:
add_noise_predictions(noiseDetector, train_dataloader_2017, train_dataset_2017)
add_noise_predictions(noiseDetector, test_dataloader_2017, test_dataset_2017)
add_noise_predictions(noiseDetector, val_dataloader_2017, val_dataset_2017)

In [186]:
print(train_dataset_2017["class_index"].value_counts())
print(test_dataset_2017["class_index"].value_counts())
print(val_dataset_2017["class_index"].value_counts())

0    2585
2    1154
1     353
Name: class_index, dtype: int64
0    554
2    247
1     76
Name: class_index, dtype: int64
0    555
2    248
1     75
Name: class_index, dtype: int64


In [253]:
# Remove the noisy samples
thresh = 0.3

train_dataset_2017_clean = train_dataset_2017[train_dataset_2017["noise_probs"] < 0]
test_dataset_2017_clean = test_dataset_2017[test_dataset_2017["noise_probs"] < 0]
val_dataset_2017_clean = val_dataset_2017[val_dataset_2017["noise_probs"] < 0]

print(train_dataset_2017_clean["class_index"].value_counts())
print(test_dataset_2017_clean["class_index"].value_counts())
print(val_dataset_2017_clean["class_index"].value_counts())


train_dataloader_2017_clean = get_dataloaders(train_dataset_2017_clean)
test_dataloader_2017_clean = get_dataloaders(test_dataset_2017_clean)
val_dataset_2017_clean = val_dataset_2017[val_dataset_2017["noise_probs"] < 0]

0    1955
2     857
1     255
Name: class_index, dtype: int64
0    405
2    172
1     50
Name: class_index, dtype: int64
0    392
2    178
1     50
Name: class_index, dtype: int64


In [9]:
import threading

class DatasetSequenceIterator:

    def __init__(self, data_loading_functions, batch_sizes, filter=lambda x:x):
        self.dl_functions = data_loading_functions

        self.dataset = None
        self.next_dataset = None

        self.dataloader_iterator = None
        self.next_dataloader_iterator = None

        self.next_dataset_loaded = False
        self.dataloader_thread = None

        self.filter = filter

        self.batch_sizes = batch_sizes
        self.dl_index = 0

    def __iter__(self):
        self.dl_index = -1
        self.dataloader_thread = threading.Thread(target=self.load_next_dataset)
        self.dataloader_thread.start()
        self.dataloader_thread.join()
        self.swap_to_next_dataset()
        self.dl_index += 1
        self.dataloader_thread = threading.Thread(target=self.load_next_dataset)
        self.dataloader_thread.start()
        print(self.dl_index)
        return self

    def __len__(self):
        # TODO make this return the right value
        return 100

    def swap_to_next_dataset(self):
        self.dataset = self.next_dataset
        self.dataloader_iterator = self.next_dataloader_iterator
        self.next_dataset_loaded = False

    def load_next_dataset(self):
        if self.dl_index + 1 < len(self.dl_functions):
            print(f"Loading dataset {self.dl_index + 1}")
            self.next_dataset = self.dl_functions[self.dl_index + 1]()
            self.next_dataset = self.filter(self.next_dataset)

            torch_dataset = Dataset(self.next_dataset)
            self.next_dataloader_iterator = iter(DataLoader(torch_dataset, batch_size=self.batch_sizes[self.dl_index], shuffle=True, pin_memory=True))
            self.next_dataset_loaded = True
        else:
            print("Finished loading all datasets")
            self.next_dataset_loaded = False
            return None



    def __next__(self):
        try:
            ret = next(self.dataloader_iterator)
        except StopIteration:
            print("stop_iteration")
            if self.dl_index >= len(self.dl_functions):
                # We have gone through all the datasets
                print("Completed all datasets")
                raise StopIteration
            else:

                if not self.next_dataset_loaded:
                    print("waiting_for_next_dataset")
                    self.dataloader_thread.join()

                self.swap_to_next_dataset()
                self.dl_index += 1
                self.dataloader_thread = threading.Thread(target=self.load_next_dataset)
                self.dataloader_thread.start()
                ret = next(self.dataloader_iterator)

        return ret

In [16]:
# Testing the DatasetSequenceIterator by dividing feas1 into two parts

def load_feas1_first_half():
    ecg_data, pt_data = load_feas1_chunk_range((0, 1))
    return prepare_safer_data(pt_data, ecg_data)[1]

def load_feas1_second_half():
    ecg_data, pt_data = load_feas1_chunk_range((4, 5))
    return prepare_safer_data(pt_data, ecg_data)[1]

def load_feas1_nth_chuck(n):
    ecg_data, pt_data = load_feas1_chunk_range((n, n+1))
    ecg_data.index = ecg_data["measID"]
    pt_data.index = pt_data["ptID"]
    return prepare_safer_data(pt_data, ecg_data)[1]

loading_functions = [lambda n=n: load_feas1_nth_chuck(n) for n in range(num_chunks)]

feas1_dataloader_entire = DatasetSequenceIterator(loading_functions, [128 for n in range(num_chunks)])

In [49]:
num_ecgs = 0
for i, (signals, labels, _) in enumerate(feas1_train_dataloader):
    signal = signals[0].to(device).float()
    rris = signals[1].to(device).float()
    rri_len = signals[2].to(device).float()

    num_ecgs += signal.shape[0]

print(num_ecgs)

Loading dataset 0
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_0.pk.pk


Exception in thread Thread-10 (load_next_dataset):
Traceback (most recent call last):
  File "C:\Users\daniel\AppData\Local\Programs\Python\Python310\lib\threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "C:\Users\daniel\AppData\Local\Programs\Python\Python310\lib\threading.py", line 946, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\daniel\AppData\Local\Temp\ipykernel_18972\2813032984.py", line 47, in load_next_dataset
  File "C:\Users\daniel\AppData\Local\Temp\ipykernel_18972\2109776680.py", line 6, in filter_train_pts
NameError: name 'train_pts' is not defined


filtering 2967 ECGs out
   ptID   age         ptDiag          ptDiagRev1          ptDiagRev2  \
0     1  71.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.Undecided   
1     1  71.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.Undecided   
2     1  71.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.Undecided   
3     1  71.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.Undecided   
4     1  71.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.Undecided   

           ptDiagRev3  cardRev            measDiag        measDiagRev1  \
0  DiagEnum.Undecided        0  DiagEnum.Undecided  DiagEnum.Undecided   
1  DiagEnum.Undecided        0  DiagEnum.Undecided  DiagEnum.Undecided   
2  DiagEnum.Undecided        0  DiagEnum.Undecided  DiagEnum.Undecided   
3  DiagEnum.Undecided        0  DiagEnum.Undecided  DiagEnum.Undecided   
4  DiagEnum.Undecided        0  DiagEnum.Undecided  DiagEnum.Undecided   

         measDiagRev2  ...                                               data  \
0  DiagEnum.Undec

TypeError: 'NoneType' object is not an iterator

D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_1.pk.pk


In [10]:
pt_data = SAFERDataset.load_pt_dataset(1)
ecg_data = SAFERDataset.load_ecg_csv(1, pt_data, ecg_range=None, ecg_meas_diag=None, feas2_offset=10000, feas2_ecg_offset=200000)

ecg_data["feas"] = 1
ecg_data["length"] = 9120
ecg_data["rri_len"] = 20

pt_data, ecg_data = prepare_safer_data(pt_data, ecg_data)
# train_pts, test_pts, val_pts = generate_patient_splits(pt_data, 0.15, 0.15)

In [18]:
feas1_noise_predictions = add_noise_predictions(noiseDetector, feas1_dataloader_entire, ecg_data)

Loading dataset 0
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_0.pk.pk
Loading dataset 10

D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_1.pk.pk
stop_iteration
waiting_for_next_dataset
Loading dataset 2
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_2.pk.pk
stop_iteration
waiting_for_next_dataset
Loading dataset 3
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_3.pk.pk
stop_iteration
waiting_for_next_dataset
Loading dataset 4
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_4.pk.pk
stop_iteration
waiting_for_next_dataset
Loading dataset 5
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_5.pk.pk
stop_iteration
waiting_for_next_dataset
Loading dataset 6
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_6.pk.pk
stop_iteration
waiting_for_next_dataset
Loading dataset 7
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_7.pk.pk
stop_iteration
waiting_for_next_dataset
Loading dataset 8
D:\2022_23_DSiromani\Feas1\ECGs/filtered_dataframe_8.pk.pk
stop_iteration
Finished l

In [27]:
ecg_data["noise_prediction"] = ecg_data["noise_probs"]

In [29]:
ecg_data.dropna(subset=["noise_prediction"])
(ecg_data[ecg_data['poss_AF_tag'] == 1]["noise_prediction"] < 0).value_counts(dropna=False)

True     5860
False    4345
Name: noise_prediction, dtype: int64

In [None]:
print(f"number of noisy ECGs: {(feas1_noise_predictions > 0).sum()}")
feas1_path = r"D:\2022_23_DSiromani\Feas1"
feas1_noise_predictions.to_pickle(os.path.join(feas1_path, "ECGs/feas1_noise_predictions.pk"))

In [14]:
feas1_path = r"D:\2022_23_DSiromani\Feas1"
feas1_noise_predictions = pd.read_pickle(os.path.join(feas1_path, "ECGs/feas1_noise_predictions.pk"))

In [145]:
zenicor_conf_mat = confusion_matrix(feas1_ecg_data_test["class_index"], feas1_ecg_data_test["poss_AF_tag"])
print_results(zenicor_conf_mat)

Confusion matrix:
[[18614   496     0]
 [   21    77     0]
 [  260   206     0]]
Sensitivity: 0.786
Specificity: 0.964

Normal F1: 0.980
AF F1: 0.176
Other F1: 0.000


In [152]:
ecg_data["noise_prediction"] = feas1_noise_predictions
print(ecg_data[ecg_data["noise_prediction"] < 0]["class_index"].value_counts())
print(ecg_data["class_index"].value_counts())

0    122688
2      2355
1       470
Name: class_index, dtype: int64
0    149586
2      3120
1       745
Name: class_index, dtype: int64


In [26]:
feas1_train_data = ecg_data[ecg_data["ptID"].isin(train_pts["ptID"])]
feas1_train_data_clean = feas1_train_data[feas1_train_data["noise_prediction"] < 0]

NameError: name 'train_pts' is not defined

In [34]:
for pts in [train_pts, test_pts, val_pts]:
    print(pts["noNormalRecs"].sum())
    print(pts["noAFRecs"].sum())
    print(pts["noOtherRecs"].sum())
    print("")

105156.0
567.0
2207.0

22464.0
134.0
562.0

21966.0
121.0
410.0



In [357]:
feas1_path = r"D:\2022_23_DSiromani\Feas1"
train_pts.to_pickle(os.path.join(feas1_path, "all_feas1_train_pts_27_may.pk"))
test_pts.to_pickle(os.path.join(feas1_path, "all_feas1_test_pts_27_may.pk"))
val_pts.to_pickle(os.path.join(feas1_path, "all_feas1_val_pts_27_may.pk"))

In [60]:
feas1_path = r"D:\2022_23_DSiromani\Feas1"
train_pts = pd.read_pickle(os.path.join(feas1_path, "all_feas1_train_pts_27_may.pk"))
test_pts = pd.read_pickle(os.path.join(feas1_path, "all_feas1_test_pts_27_may.pk"))
val_pts = pd.read_pickle(os.path.join(feas1_path, "all_feas1_val_pts_27_may.pk"))

In [43]:
feas1_ecg_data_test = pd.read_pickle(os.path.join(feas1_path, "ECGs/feas1_test_27_mar.pk"))
feas1_ecg_data_test = feas1_ecg_data_test[feas1_ecg_data_test["rri_len"] > 5]

feas1_ecg_data_val = pd.read_pickle(os.path.join(feas1_path, "ECGs/feas1_val_27_mar.pk"))
feas1_ecg_data_val = feas1_ecg_data_val[feas1_ecg_data_val["rri_len"] > 5]

In [155]:
feas1_ecg_data_test["noise_prediction"] = feas1_noise_predictions
feas1_ecg_data_test_clean = feas1_ecg_data_test[feas1_ecg_data_test["noise_prediction"] < 0]

feas1_ecg_data_val["noise_prediction"] = feas1_noise_predictions
feas1_ecg_data_val_clean = feas1_ecg_data_val[feas1_ecg_data_val["noise_prediction"] < 0]

In [156]:
feas1_ecg_data_val["class_index"].value_counts() - feas1_ecg_data_val_clean["class_index"].value_counts()

0    3711
2      61
1      36
Name: class_index, dtype: int64

In [179]:
ecg_data[(feas1_noise_predictions[ecg_data.index] < 0)]

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [47]:
# Create some a filter function to select data from each partition
def filter_train_pts(ecg_data):
    print(f"filtering {(feas1_noise_predictions[ecg_data.index] > 0).sum()} ECGs out")
    print(ecg_data.head())
    print(feas1_noise_predictions.head())
    return ecg_data[(ecg_data["ptID"].isin(train_pts["ptID"])) & (feas1_noise_predictions[ecg_data.index] < 0)]

def filter_test_pts(ecg_data):
    return ecg_data[ecg_data["ptID"].isin(test_pts["ptID"]) & (feas1_noise_predictions[ecg_data.index] < 0)]

def filter_val_pts(ecg_data):
    return ecg_data[ecg_data["ptID"].isin(test_pts["ptID"]) & (feas1_noise_predictions[ecg_data.index] < 0)]

feas1_train_dataloader = DatasetSequenceIterator(loading_functions, [64 for n in range(num_chunks)], filter=filter_train_pts)
feas1_test_dataloader = get_dataloaders(feas1_ecg_data_test)
feas1_val_dataloader = get_dataloaders(feas1_ecg_data_val)

In [48]:
feas1_test_dataloader_clean = get_dataloaders(feas1_ecg_data_test_clean)
feas1_val_dataloader_clean = get_dataloaders(feas1_ecg_data_val_clean)

NameError: name 'feas1_ecg_data_test_clean' is not defined

In [35]:
print(feas1_ecg_data_test_clean["class_index"].value_counts())
print(feas1_ecg_data_val_clean["class_index"].value_counts())

print("  ")

print(feas1_ecg_data_test["class_index"].value_counts())
print(feas1_ecg_data_val["class_index"].value_counts())

0    19247
2      325
1       60
Name: class_index, dtype: int64
0    19151
2      391
1       90
Name: class_index, dtype: int64
  
0    22274
2      377
1      102
Name: class_index, dtype: int64
0    22862
2      452
1      126
Name: class_index, dtype: int64


### Prepare for training

In [138]:
del model
del feas1_train_dataloader
del feas1_test_dataloader
torch.cuda.empty_cache()

import gc
gc.collect()

5522

In [70]:
import Models.SpectrogramTransformer
importlib.reload(Models.SpectrogramTransformer)
# from Models.SpectrogramTransformer import TransformerModel
import Models.SpectrogramTransformerAttentionPooling
importlib.reload(Models.SpectrogramTransformerAttentionPooling)
from Models.SpectrogramTransformerAttentionPooling import TransformerModel

In [72]:
from torch.optim.lr_scheduler import StepLR, LambdaLR, SequentialLR

In [71]:
n_head = 4
n_fft = 128
embed_dim = 128 # int(n_fft/2)
n_inp_rri = 64

model = TransformerModel(3, embed_dim, n_head, 512, 6, n_fft, n_inp_rri, device=device).to(device)

(2, 18)


In [73]:
class focal_loss(nn.Module):

    def __init__(self, weights, gamma=2, label_smoothing=0):
        super(focal_loss, self).__init__()
        self.ce_loss = nn.CrossEntropyLoss(reduction="none", label_smoothing=label_smoothing)
        self.weights = weights
        self.gamma = gamma

    def forward(self, pred, targets):
        ce = self.ce_loss(pred, targets)
        pt = torch.exp(-ce)

        loss_sum = torch.sum(((1-pt) ** self.gamma) * ce * self.weights[targets])
        norm_factor = torch.sum(self.weights[targets])
        return loss_sum/norm_factor

In [74]:
def warmup(current_step: int):
    if current_step < number_warmup_batches:
        # print(current_step / number_warmup_batches ** 1.5)
        return current_step / number_warmup_batches ** 1.5
    else:
        # print(1/math.sqrt(current_step))
        return 1/math.sqrt(current_step)  # 1 / (10 ** (float(number_warmup_epochs - current_step)))


In [25]:
class_counts = torch.tensor(train_dataset["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
print(class_weights)

loss_func = focal_loss(class_weights, 2) # nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1) # focal_loss(class_weights, 2, 0.05) #
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

number_warmup_batches = 600
def warmup(current_step: int):
    if current_step < number_warmup_batches:
        # print(current_step / number_warmup_batches ** 1.5)
        return current_step / number_warmup_batches ** 1.5
    else:
        # print(1/math.sqrt(current_step))
        return 1/math.sqrt(current_step)  # 1 / (10 ** (float(number_warmup_epochs - current_step)))

scheduler = LambdaLR(optimizer, lr_lambda=warmup)
# scheduler = SequentialLR(optimizer, [warmup_scheduler, scheduler], [number_warmup_epochs])

NameError: name 'train_dataset' is not defined

In [270]:
# Remake scheduler before retraining on SAFER

"""
class_counts = torch.tensor(train_dataset_safer_clean["class_index"].value_counts().sort_index().values.astype(np.float32))
"""

"""
# just approximate weights using feas2 rather than computing for feas 1 - these might be fundamentally different because in feas2 the cardiologist stopped labelling after the first AF from a patient therefore fewer AF.
class_counts = torch.tensor(val_dataset_safer["class_index"].value_counts().sort_index().values.astype(np.float32))
"""

"""
# Use all of feas1 to compute the class counts - precomputed values for next time: tensor([0.0043, 0.7924, 0.2033])
class_counts = torch.tensor(feas1_ecg_data["class_index"].value_counts().sort_index().values.astype(np.float32))

class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
"""

# class_counts = torch.tensor(ecg_data[ecg_data["noise_prediction"] < 0]["class_index"].value_counts().sort_index().values.astype(np.float32))
class_counts = torch.tensor(feas1_ecg_data_train_undersamp["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)

# class_weights = torch.tensor([0.0043, 0.7924, 0.2033])

print(class_weights)

loss_func = focal_loss(class_weights, gamma=2, label_smoothing=0)

optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)

number_warmup_batches = 600
def warmup(current_step: int):
    if current_step < number_warmup_batches:
        # print(current_step / number_warmup_batches ** 1.5)
        return current_step / number_warmup_batches ** 1.5
    else:
        # print(1/math.sqrt(current_step))
        return 1/math.sqrt(current_step)  # 1 / (10 ** (float(number_warmup_epochs - current_step)))

scheduler = LambdaLR(optimizer, lr_lambda=warmup)

NameError: name 'feas1_ecg_data_train_undersamp' is not defined

In [175]:
# Remake scheduler before retraining on CinC2017

class_counts = torch.tensor(train_dataset_2017["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
print(class_weights)

loss_func = nn.CrossEntropyLoss(weight=class_weights) # multiclass_cross_entropy_loss

optimizer = torch.optim.Adam(model.parameters(), lr=0.00004)

number_warmup_batches = 600
def warmup(current_step: int):
    if current_step < number_warmup_batches:
        # print(current_step / number_warmup_batches ** 1.5)
        return current_step / number_warmup_batches ** 1.5
    else:
        # print(1/math.sqrt(current_step))
        return 1/math.sqrt(current_step)  # 1 / (10 ** (float(number_warmup_epochs - current_step)))

scheduler = LambdaLR(optimizer, lr_lambda=warmup)

tensor([0.0947, 0.6933, 0.2121])


In [107]:
# Train the model I stole

import OtherModels.Prna.physionet2020_submission.model
importlib.reload(OtherModels.Prna.physionet2020_submission.model)
from OtherModels.Prna.physionet2020_submission.model import CTN
import OtherModels.Prna.physionet2020_submission.optimizer
importlib.reload(OtherModels.Prna.physionet2020_submission.optimizer)
from OtherModels.Prna.physionet2020_submission.optimizer import NoamOpt

# Train prna's transformer
n_head = 8
n_fft = 128
embed_dim = 128 # int(n_fft/2)
n_inp_rri = 64

class_counts = torch.tensor(train_dataset["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
print(class_weights)

model = CTN(256, n_head, 2048, 4, 0.1, 64, 0, 0, 3).to(device)

# Initialize parameters with Glorot / fan_avg.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# optimizer = NoamOpt(256, 1, 4000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
loss_func = nn.CrossEntropyLoss(weight=class_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

number_warmup_batches = 2
def warmup(current_step: int):
    return 1 / (10 ** (float(number_warmup_batches - current_step)))
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup)

scheduler = SequentialLR(optimizer, [warmup_scheduler, scheduler], [number_warmup_batches])

tensor([0.1427, 0.7559, 0.1014])


In [265]:
from torch.profiler import profile, tensorboard_trace_handler
from tqdm import tqdm

import copy
model = model.to(device)
model.fix_transformer_params(fix_spec=False, fix_rri=False)
num_epochs = 40

def train(model, train_dataloader, test_dataloader):
    best_test_loss = 100
    best_epoch = -1
    best_model = copy.deepcopy(model).cpu()

    losses = []


    for epoch in range(num_epochs):
        total_loss = 0
        print(f"starting epoch {epoch} ...")
        # Train
        num_batches = 0
        model.train()
        for i, (signals, labels, _) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            signal = signals[0].to(device).float()
            rris = signals[1].to(device).float()
            rri_len = signals[2].to(device).float()

            if torch.any(torch.isnan(signal)):
                print("Signals are nan")
                continue

            if torch.any(torch.isnan(rris)):
                print("Signals are nan")
                continue

            labels = labels.long()
            optimizer.zero_grad()
            output = model(signal, rris, rri_len).to("cpu")

            if torch.any(torch.isnan(output)):
                print(signal)
                print(rris)
                print(rri_len)
                print(output)
                raise ValueError

            loss = loss_func(output, labels)
            if torch.isnan(loss):
                raise ValueError
            loss.backward()
            # nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
            optimizer.step()
            scheduler.step()
            num_batches += 1
            total_loss += float(loss)

        print(num_batches)

        print(f"Epoch {epoch} finished with average loss {total_loss/num_batches}")
        # writer.add_scalar("Loss/train", total_loss/num_batches, epoch)
        print("Testing ...")
        # Test
        num_test_batches = 0
        test_loss = 0
        with torch.no_grad():
            model.eval()
            for i, (signals, labels, _) in enumerate(test_dataloader):
                signal = signals[0].to(device).float()
                rris = signals[1].to(device).float()
                rri_len = signals[2].to(device).float()

                if torch.any(torch.isnan(signal)):
                    print("Signals are nan")
                    continue

                labels = labels.long()
                output = model(signal, rris, rri_len).to("cpu")
                loss = loss_func(output, labels)
                test_loss += float(loss)
                num_test_batches += 1

        print(f"Average test loss: {test_loss/num_test_batches}")
        losses.append([total_loss/num_batches, test_loss/num_test_batches])
        # writer.add_scalar("Loss/test", test_loss/num_t est_batches, epoch)

        if test_loss/num_test_batches < best_test_loss:
            best_model = copy.deepcopy(model).cpu()
            best_test_loss = test_loss/num_test_batches
            best_epoch = epoch
        else:
            if best_epoch + 5 <= epoch:
                return best_model, losses

    return best_model, losses

model, losses = train(model, feas1_train_dataloader_undersamp, feas1_test_dataloader_undersamp)
model = model.to(device)

In [276]:
losses = np.load("TrainedModels/Transformer_23_May_cinc_train_attention_pooling_no_augmentation_smoothing_training_curve.npy")

# "C:\Users\daniel\Documents\CambridgeSoftwareProjects\ecg-signal-quality\TrainedModels\Transformer_23_May_feas1_train_attention_pooling_augmentation_smoothing_retrain.npy"

In [362]:
# plot the training curve (1 axis only)
fig, ax = plt.subplots(figsize=(6, 4))

train_l = ax.plot([l[0] for l in losses], label="training loss")
val_l = ax.plot([l[1] for l in losses], label="validation loss", color="#ff7f0e")

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# ax.set_ylim(bottom=0)
ax.set_xlim(left=0)

ax.set_xlabel("Epoch number")
ax.set_ylabel("Loss")

ax.legend()

fig.tight_layout()
plt.show()

In [181]:
losses_np = np.array(losses)
np.save("TrainedModels/Transformer_24_May_cinc_2017_train_attention_pooling_augmentation_smoothing", losses_np)

In [179]:
# Save a model
torch.save(model.state_dict(), "TrainedModels/Transformer_24_May_cinc_2017_train_attention_pooling_augmentation_smoothing.pt")

# train_dataset_safer.to_pickle("TrainedModels/Transformer_15_Mar_train.pk")
# test_dataset_safer.to_pickle("TrainedModels/Transformer_15_Mar_test.pk")
# val_dataset_safer.to_pickle("TrainedModels/Transformer_15_Mar_val.pk")
# train_pt_df.to_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_train.pk")
# val_pt_df.to_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_val.pk")
# test_pt_df.to_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_test.pk")

In [122]:
train_dataset.to_pickle("TrainedModels/Transformer_13_May_cinc_trained_initial_train.pk")
test_dataset.to_pickle("TrainedModels/Transformer_13_May_cinc_trained_initial_test.pk")
val_dataset.to_pickle("TrainedModels/Transformer_13_May_cinc_trained_initial_val.pk")

In [5]:
train_dataset_safer = pd.read_pickle("TrainedModels/Transformer_13_Mar_train.pk")
test_dataset_safer = pd.read_pickle("TrainedModels/Transformer_13_Mar_test.pk")
val_dataset_safer = pd.read_pickle("TrainedModels/Transformer_13_Mar_val.pk")

In [13]:
train_dataloader_safer = get_dataloaders(train_dataset_safer)
test_dataloader_safer = get_dataloaders(test_dataset_safer)
val_dataloader_safer = get_dataloaders(val_dataset_safer)

In [144]:
train_dataset_2017.to_pickle("TrainedModels/Transformer_13_May_cinc_2017_trained_train.pk")
test_dataset_2017.to_pickle("TrainedModels/Transformer_13_May_cinc_2017_trained_test.pk")

In [34]:
# Set this for safer cross validation later
cinc_model_path = "TrainedModels/Transformer_15_Mar_cinc_trained_noise_augmentation.pt"

In [149]:
# Load a model
# model = TransformerModel(2, embed_dim, n_head, 1024, 4, 47, n_fft).to(device)
model.load_state_dict(torch.load("TrainedModels/Transformer_27_may_feas1_train_attention_pooling_augmentation_smoothing_nk_beats_retrain.pt", map_location=device))

# train_pt_df = pd.read_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_train.pk")
# val_pt_df = pd.read_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_val.pk")
# test_pt_df = pd.read_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_test.pk")


<All keys matched successfully>

In [None]:
# Load dataset
train_pt_df = pd.read_pickle("TrainedModels/Transformer_13_May_cinc_2017_trained_train.pk")

In [254]:
pt_data

Unnamed: 0_level_0,ptID,age,ptDiag,ptDiagRev1,ptDiagRev2,ptDiagRev3,cardRev,noRecs,noHQrecs,noRecsPossAF,feas,noNormalRecs,noAFRecs,noOtherRecs,pt_prediction_af
ptID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1,71.0,4,6,6,6,0,118,118,8,1,118.0,0.0,0.0,
2,2,71.0,4,6,6,6,0,117,116,4,1,116.0,0.0,0.0,
3,3,75.0,4,6,6,6,0,30,30,2,1,25.0,0.0,0.0,
4,4,73.0,4,6,6,6,0,59,59,2,1,59.0,0.0,0.0,
5,5,70.0,4,6,6,6,0,118,91,38,1,89.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137,2137,72.0,4,6,6,6,0,95,95,1,1,93.0,0.0,0.0,
2138,2138,65.0,4,6,6,6,0,57,57,0,1,57.0,0.0,0.0,
2139,2139,78.0,4,6,6,6,0,51,51,0,1,51.0,0.0,0.0,
2140,2140,69.0,4,4,4,4,1,60,60,2,1,59.0,0.0,1.0,


In [None]:
# Tonights training schedule

# Cross Validation dataset construction for SAFER data
# Split train and test data according to each patient


def get_train_and_val_feas1_sets(test_pts, val_pts):
    # Have to load all the data!
    feas1_ecg_data, feas1_pt_data = load_feas1_chunk_range((0, num_chunks))

    # Just use feas1 to prepare test and validation datasets (The train is best handled with a DatasetSequenceIterator)
    feas1_pt_data, feas1_ecg_data = prepare_safer_data(feas1_pt_data, feas1_ecg_data)
    feas1_ecg_data["class_index"].value_counts()

    feas1_ecg_data_test = feas1_ecg_data[feas1_ecg_data["ptID"].isin(test_pts["ptID"])]
    feas1_ecg_data_val = feas1_ecg_data[feas1_ecg_data["ptID"].isin(val_pts["ptID"])]

    print(feas1_ecg_data_test["class_index"].value_counts())
    print(feas1_ecg_data_val["class_index"].value_counts())

    del feas1_ecg_data
    del feas1_pt_data

    return feas1_ecg_data_test, feas1_ecg_data_val


def split_patients_2_groups(pt_data, frac):
    split_ratios = np.array([frac, 1-frac])

    splits = [[], []]

    af_counts = np.zeros(2, dtype=int)
    total_counts = np.zeros(2, dtype=int)

    total_total_count = 0
    total_af_count = 0

    for _, pt in pt_data.sample(frac=1).iterrows():
        total_total_count += pt["noHQrecs"]
        total_af_count += pt["noAFRecs"]

        exp_total_counts = total_total_count * split_ratios
        exp_af_counts = total_af_count * split_ratios

        af_rec_mat = np.diag(np.array([pt["noAFRecs"] for _ in range(2)]))
        hq_rec_mat = np.diag(np.array([pt["noHQrecs"] for _ in range(2)]))

        loss =  np.sum(np.abs(af_counts[None, :] + af_rec_mat - exp_af_counts[None, :]) + np.abs(total_counts[None, :] + hq_rec_mat - exp_total_counts[None, :]), axis=-1)
        best_fold = np.argmin(loss)

        splits[best_fold].append(pt)
        af_counts[best_fold] += pt["noAFRecs"]
        total_counts[best_fold] += pt["noHQrecs"]

    return pd.DataFrame(splits[0]), pd.DataFrame(splits[1])


pt_data = SAFERDataset.load_pt_dataset(1)
ecg_data = SAFERDataset.load_ecg_csv(1, pt_data, ecg_range=None, ecg_meas_diag=None, feas2_offset=10000, feas2_ecg_offset=200000)

ecg_data["feas"] = 1
ecg_data["length"] = 9120
ecg_data["rri_len"] = 20

pt_data, ecg_data = prepare_safer_data(pt_data, ecg_data)

ecg_data["noise_prediction"] = feas1_noise_predictions
ecg_data = ecg_data[ecg_data["noise_prediction"] < 0]  # Remove noisy samples here itself

# now set the counts in pt
pt_data["noHQrecs"] = ecg_data["ptID"].value_counts()
pt_data["noAFRecs"] = ecg_data[ecg_data["class_index"] == 1]["ptID"].value_counts()
pt_data["noHQrecs"].fillna(0, inplace=True)
pt_data["noAFRecs"].fillna(0, inplace=True)


num_cv = 5
num_folds = num_cv  # We'll produce a validation set from the test set in each fold!
pt_folds = [[] for _ in range(num_folds)]

af_counts = np.zeros(num_folds, dtype=int)
total_counts = np.zeros(num_folds, dtype=int)

total_total_count = 0
total_af_count = 0


# Go around the folds and assign patients to each
for _, pt in pt_data.sample(frac=1).iterrows():
    total_total_count += pt["noHQrecs"]
    total_af_count += pt["noAFRecs"]

    exp_total_counts = total_total_count * 1.0/num_folds
    exp_af_counts = total_af_count * 1.0/num_folds

    af_rec_mat = np.diag(np.array([pt["noAFRecs"] for _ in range(num_folds)]))
    hq_rec_mat = np.diag(np.array([pt["noHQrecs"] for _ in range(num_folds)]))

    loss =  np.sum(np.abs(af_counts[None, :] + af_rec_mat - exp_af_counts) + np.abs(total_counts[None, :] + hq_rec_mat - exp_total_counts), axis=-1)
    best_fold = np.argmin(loss)

    pt_folds[best_fold].append(pt)
    af_counts[best_fold] += pt["noAFRecs"]
    total_counts[best_fold] += pt["noHQrecs"]


test_pt_folds = [pd.DataFrame(fold) for fold in pt_folds]
val_pt_folds = []
train_pt_folds = []

for test_pt_f in test_pt_folds:
    val_pt_f, train_pt_f = split_patients_2_groups(pt_data[~pt_data["ptID"].isin(test_pt_f["ptID"])], 0.125)
    val_pt_folds.append(val_pt_f)
    train_pt_folds.append(train_pt_f)


for f in test_pt_folds:
    print(f["noAFRecs"].sum(), f["noHQrecs"].sum())

print("")

for f in val_pt_folds:
    print(f["noAFRecs"].sum(), f["noHQrecs"].sum())

conf_mats = []

num_epochs = 5

for i, (train_pt_df, test_pt_df, val_pt_df) in enumerate(zip(train_pt_folds, test_pt_folds, val_pt_folds)):
    print("======")
    print(f"Fold {i}")
    # Create some a filter function to select data from each partition
    def filter_train_pts(ecg_data):
        return ecg_data[(ecg_data["ptID"].isin(train_pt_df["ptID"])) & (feas1_noise_predictions[ecg_data.index] < 0)]

    feas1_train_dataloader = DatasetSequenceIterator(loading_functions, [64 for n in range(num_chunks)], filter=filter_train_pts)
    feas1_ecg_data_test, feas1_ecg_data_val = get_train_and_val_feas1_sets(test_pt_df, val_pt_df)

    feas1_test_dataloader = get_dataloaders(feas1_ecg_data_test, 64)
    feas1_val_dataloader = get_dataloaders(feas1_ecg_data_val, 64)


    class_counts = torch.tensor(ecg_data[ecg_data["ptID"].isin(train_pt_df["ptID"])]["class_index"].value_counts().values.astype(np.float32))
    class_weights = (1/class_counts)
    class_weights /= torch.sum(class_weights)
    print(class_weights)

    print(feas1_ecg_data_test["class_index"].value_counts())
    print(feas1_ecg_data_val["class_index"].value_counts())

    #  now get the model
    model = TransformerModel(3, embed_dim, n_head, 512, 6, n_fft, n_inp_rri, device=device).to(device)
    model.load_state_dict(torch.load("TrainedModels/Transformer_27_May_feas1_train_attention_pooling_augmentation_smoothing_no_noisy_nk_beats.pt", map_location=device))

    loss_func = focal_loss(class_weights, gamma=2, label_smoothing=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)
    number_warmup_batches = 600
    scheduler = LambdaLR(optimizer, lr_lambda=warmup)

    model, losses = train(model, feas1_train_dataloader, feas1_val_dataloader)
    model = model.to(device)

    losses_np = np.array(losses)
    np.save(f"TrainedModels/Transformer_29_May_final_cross_validate_{i}_training_curve", losses_np)

    torch.save(model.state_dict(), f"TrainedModels/Transformer_29_May_final_cross_validate_{i}_training_curve.pt")
    feas1_ecg_data_test.to_pickle(f"C:/Users/daniel/Documents/feas1_train_set_cross_validate_{i}.pk")

    predictions, true_labels = get_predictions(model, feas1_test_dataloader, feas1_ecg_data_test)
    conf_mat = confusion_matrix(true_labels, np.argmax(predictions, axis=1))

    print(conf_mat)
    print(losses_np)

    conf_mats.append(conf_mat)

### Model testing

In [25]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, multilabel_confusion_matrix

def get_predictions(model, dataloader, dataset):

    attentions = []

    """
    def hook(module, x, y):
        for a in y[1]:
            attentions.append(a.detach().cpu().numpy())

    attention_hook = model.attention_pooling.attn.register_forward_hook(hook)
    """

    model.eval()

    true_labels = []
    predictions = []

    outputs = []
    inds = []

    with torch.no_grad():
        for i, (signals, labels, ind) in enumerate(dataloader):
            signal = signals[0].to(device).float()
            rris = signals[1].to(device).float()
            rri_len = signals[2].to(device).float()

            labels = labels.long().detach().numpy()
            true_labels.append(labels)

            output = model(signal, rris, rri_len).detach().to("cpu").numpy() # rris).detach().to("cpu").numpy()

            prediction = output # np.argmax(output, axis=-1)
            predictions.append(prediction)

            for i, o in zip(ind, output):
                outputs.append(o)
                if isinstance(i, str):
                    inds.append(i)
                else:
                    inds.append(i.item())

    dataset["prediction"] = pd.Series(data=outputs, index=inds)
    # dataset["attention"] = pd.Series(data=attentions, index=inds)

    predictions = np.concatenate(predictions)
    true_labels = np.concatenate(true_labels)

    # attention_hook.remove()

    return predictions, true_labels

# predictions, true_labels = get_predictions(model, feas1_val_dataloader_clean, feas1_ecg_data_val_clean)
# conf_mat = confusion_matrix(true_labels, np.argmax(predictions, axis=1))

In [77]:
feas1_ecg_data_test["noise_probs"] = feas1_ecg_data_test["noise_prediction"]
feas1_ecg_data_val["noise_probs"] = feas1_ecg_data_val["noise_prediction"]

def get_noise_free_conf_mat(dataset):
   return confusion_matrix(dataset[dataset["noise_probs"] < 0]["class_index"], dataset[dataset["noise_probs"] < 0]["prediction"].map(np.argmax))

noise_free_conf_mat = get_noise_free_conf_mat(feas1_ecg_data_val)

KeyError: 'noise_prediction'

In [11]:
def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

def print_results(conf_mat):
    print("Confusion matrix:")
    print(conf_mat)

    print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1]):0.3f}")
    print(f"Specificity: {(conf_mat[0, 0] + conf_mat[0, 2] + conf_mat[2, 0] + conf_mat[2, 2])/(np.sum(conf_mat[0]) + np.sum(conf_mat[2])):0.3f}")
    print("")

    print(f"Normal F1: {F1_ind(conf_mat, 0):0.3f}")
    print(f"AF F1: {F1_ind(conf_mat, 1):0.3f}")
    print(f"Other F1: {F1_ind(conf_mat, 2):0.3f}")

    print()

print_results(conf_mat)

NameError: name 'conf_mat' is not defined

In [153]:
# Print noise free conf mats
print_results(noise_free_conf_mat)

Confusion matrix:
[[18559    68   524]
 [    4    52    34]
 [   81   140   170]]
Sensitivity: 0.578
Specificity: 0.989

Normal F1: 0.982
AF F1: 0.297
Other F1: 0.304



In [27]:
from sklearn.metrics import confusion_matrix

In [197]:
misclassified_inds = feas1_ecg_data_val[feas1_ecg_data_val["prediction"].map(np.argmax) != feas1_ecg_data_val["class_index"]].index.values

In [382]:
feas1_val_interesting = feas1_ecg_data_val[(feas1_ecg_data_val["class_index"] != 0) | (feas1_ecg_data_val["class_index"] != feas1_ecg_data_val["prediction"].map(np.argmax))]
feas1_val_interesting_dataloader = get_dataloaders(feas1_val_interesting, 64)

In [168]:
from sklearn.metrics import precision_recall_curve
from scipy.special import softmax

fig, ax = plt.subplots(figsize=(6, 4), dpi=250)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_xlabel("Recall")
ax.set_ylabel("Precision")

labels = ["Normal", "AF", "Other"]

for i in range(3):
    p, r, d = precision_recall_curve((feas1_ecg_data_val_clean["class_index"] == i),  feas1_ecg_data_val_clean["prediction"].map(lambda x: softmax(x)[i]))
    if i == 1:
        p_af = p
        r_af = r
        d_af = d

    ax.plot(r, p, label=labels[i])
    # plt.xlim((0, 1.1))
    # plt.ylim((0, 1.1))

    # closest_point_to_0_final = np.argmin(np.abs(d))
    # ax.plot(r[closest_point_to_0_final], p[closest_point_to_0_final], "o", color="#ff7f0e", label=r"$p(AF) = 0.5$")

ax.legend()
plt.show()
# fig.savefig("FinalReportFigs/CNN_NoiseDetect_precision_recall.png")

In [171]:
max_prec = np.argmax(p_af)
print(p_af[max_prec])
print(r_af[max_prec])
print(d_af[max_prec])

1.0
0.022222222222222223
0.80841523


In [172]:
for p in p_af:
    print(p)

0.004584352078239609
0.004584585604401202
0.004584819154355578
0.004585052728106373
0.004585286325657224
0.00458551994701177
0.0045857535921736475
0.0045859872611464965
0.004586220953933958
0.004586454670539673
0.004586688410967281
0.004586922175220427
0.0045871559633027525
0.004587389775217901
0.004587623610969518
0.004587857470561248
0.004588091353996737
0.004588325261279633
0.004588559192413582
0.0045887931474022335
0.004589027126249235
0.004589261128958238
0.004589495155532891
0.004589729205976847
0.004589963280293758
0.004590197378487275
0.0045904315005610525
0.0045906656465187455
0.0045908998163640075
0.004591134010100494
0.004591368227731864
0.004591602469261772
0.004591836734693878
0.004592071024031839
0.004592305337279314
0.004592539674439965
0.004592774035517453
0.0045930084205154376
0.004593242829437583
0.004593477262287551
0.004593711719069008
0.004593946199785616
0.004594180704441042
0.00459441523303895
0.00459464978558301
0.004594884362076888
0.004595118962524252
0.004595

In [401]:
importlib.reload(Utilities.Plotting)
from Utilities.Plotting import *

In [210]:
import scipy.signal

dataset = feas1_ecg_data_val_clean
dataset["class_prediction"] = dataset["prediction"].map(lambda x: np.argmax(x))
#  & (dataset["noise_probs"] < 0)

selection = dataset[(dataset["class_prediction"] == 2) & (dataset["class_index"] == 1)]


for ecg_ind, ecg in selection.sample(frac=1).iterrows():
    print(ecg_ind)
    print(ecg[["measDiag", "prediction", "class_index"]])
    # filtered_ecg = scipy.signal.sosfiltfilt(sos, ecg["data"], padlen=150)

    plot_ecg(ecg["data"], 300, n_split=3, r_peaks=ecg["r_peaks"])
    plot_ecg_spectrogram(ecg["data"], 300, n_split=3, cut_range=[2, 18], figsize=(6, 2.5), export_quality=True)
    plot_ecg_drr(ecg["rri_feature"], ecg["rri_len"], export_quality=True)

    plt.show()

74351
measDiag                                 DiagEnum.AF
prediction     [-0.28620133, -0.32692084, 0.7239914]
class_index                                        1
Name: 74351, dtype: object
74431
measDiag                                DiagEnum.AF
prediction     [-0.7460612, 0.22486098, 0.49314404]
class_index                                       1
Name: 74431, dtype: object
74435
measDiag                                 DiagEnum.AF
prediction     [-0.017127324, -0.4723771, 0.8629926]
class_index                                        1
Name: 74435, dtype: object
74385
measDiag                                 DiagEnum.AF
prediction     [-0.76330775, 0.16884518, 0.48620653]
class_index                                        1
Name: 74385, dtype: object


KeyboardInterrupt: 

In [112]:
conf_mat_initial_transformer = np.array([[[ 717,   41,  226],
 [  88,  686,  129],
 [ 456,  184, 1407]], # CinC2020
[[   0, 2237, 1457],
 [   0,  437,   67],
 [   0, 1094,  555]],  # CinC2017
[[7384, 2281, 9848],
 [   0,    8,    8],
 [ 249,  151,  357]],  # Safer Feas2
[[ 8502,  2400, 11362],
 [    2,    50,    67],
 [  223,    75,   256]]])  # safer feas1

conf_mat_fine_tuned = np.array([[[  0, 320, 419],
 [  0,  91,  10],
 [  0, 171, 159]],  # CinC 2017
[[14918,  1543,  3052],
 [    3,     7,     6],
 [  562,    79,   116]],  # SAFER feas 2
[[17558,  2061,  2829],
 [    7,    92,    34],
 [  343,    60,   159]]])  # SAFER feas 1


conf_mat_final = np.array([[[20413,   376,  1475],
 [   13,    82,    38],
 [  203,    73,   282]],  # SAFER feas1
[[19113,    19,   381],
 [    7,     5,     4],
 [  428,    67,   262]]]) # SAFER feas2


for c in conf_mat_final:
    plot_confusion_matrix_2(c, ["Normal", "AF", "Other Rhythm"], colour="Blues")

In [403]:
# Plot with attention maps

import scipy.signal

dataset = feas1_val_interesting
dataset["class_prediction"] = dataset["prediction"].map(lambda x: np.argmax(x))
#  & (dataset["noise_probs"] < 0)

selection = dataset[(dataset["class_prediction"] == 1) & (dataset["class_index"] == 1)]

for ecg_ind, ecg in selection.dropna(subset=["attention"]).sample(frac=1).iterrows():
    print(ecg_ind)
    print(ecg[["measDiag", "prediction", "class_index"]])
    print(ecg["attention"].shape)
    # for 10s cut attention to 96
    # filtered_ecg = scipy.signal.sosfiltfilt(sos, ecg["data"], padlen=150)
    plot_ecg_with_attention(ecg["data"][:3000], 300, n_split=1, attention=ecg["attention"][:, :, :96])# , figsize=(6, 4), export_quality=True)
    # plot_ecg_spectrogram(ecg["data"], 300, n_split=3, cut_range=[2, 18])
    # plot_ecg_poincare(ecg["rri_feature"], ecg["rri_len"])
    plt.show()

83267
measDiag                                DiagEnum.AF
prediction     [-1.5628628, 0.83602333, 0.26011002]
class_index                                       1
Name: 83267, dtype: object
(4, 286, 286)
Plotting attention
74349
measDiag                                DiagEnum.AF
prediction     [-1.6065241, 0.97179544, 0.25990623]
class_index                                       1
Name: 74349, dtype: object
(4, 286, 286)
Plotting attention
150683
measDiag                                  DiagEnum.AF
prediction     [-1.6618122, 0.85432255, -0.005189167]
class_index                                         1
Name: 150683, dtype: object
(4, 286, 286)
Plotting attention
146007
measDiag                               DiagEnum.AF
prediction     [-2.0863369, 1.270616, 0.093946874]
class_index                                      1
Name: 146007, dtype: object
(4, 286, 286)
Plotting attention
150667
measDiag                               DiagEnum.AF
prediction     [-1.8283519, 0.92098296, 0.18493

KeyboardInterrupt: 

## Inspect the attention mechanism (not very useful yet)

In [383]:
# model.transformer_encoder.layers.
from plotly.subplots import make_subplots
fig = make_subplots(rows=2, cols=1)

def patch_attention(m):
    forward_orig = m.forward

    def wrap(*args, **kwargs):
        kwargs['need_weights'] = True
        kwargs['average_attn_weights'] = False

        return forward_orig(*args, **kwargs)

    m.forward = wrap

attentions = []
inds = []

def save_outputs(module, x, y):
    for att in y[1]:
        attentions.append(att.cpu().numpy())

patch_attention(model.transformer_encoder.layers[0].self_attn)
attention_hook = model.transformer_encoder.layers[0].self_attn.register_forward_hook(save_outputs)

model.eval()
with torch.no_grad():
    for i, (signals, labels, ind) in enumerate(feas1_val_interesting_dataloader):
        signal = signals[0].to(device).float()
        rris = signals[1].to(device).float()
        rri_len = signals[2].to(device).float()

        labels = labels.long().detach().numpy()
        for i in ind:
            if isinstance(i, str):
                inds.append(i)
            else:
                inds.append(i.item())

        output = model(signal, rris, rri_len) # rris).detach().to("cpu").numpy()
        # plot_ecg(signal[0].cpu().numpy())
        # plt.show()

attention_hook.remove()
# attentions = []

In [385]:
# len(attentions)
len(feas1_val_interesting)

2032

In [386]:
feas1_val_interesting["attention"] = pd.Series(data=attentions, index=inds)

In [191]:
attention_hook.remove()

### Inspect the attention pooling weights

In [170]:
# model.transformer_encoder.layers.
from plotly.subplots import make_subplots
fig = make_subplots(rows=2, cols=1)

attentions = None

def hook(module, x, y):
    global attentions
    print("hook")
    attentions = y[1].detach().to("cpu").numpy()

attention_hook = model.attention_pooling.attn.register_forward_hook(hook)

with torch.no_grad():
    for i, (signals, labels, ind) in enumerate(test_dataloader_safer):
        print(signals.shape)
        signals = torch.transpose(signals.to(device), 0, 1).float()
        # fft = torch.abs(torch.fft.fft(signals))
        # signals = torch.cat([signals, fft], dim=1)
        labels = labels.long().detach().numpy()
        output = model(signals).detach().to("cpu").numpy()

        if labels[0] == 0:
            print(attentions.shape)
            fig = make_subplots(2, 1)
            fig.add_trace(go.Scatter(y=signals[:, 0].detach().to("cpu").numpy()), row=1, col=1)
            for j in range(attentions.shape[-2]):
                fig.add_trace(go.Scatter(y=attentions[0, j, :]), row=2, col=1)
            fig.show()

        if i == 10:
            break

attention_hook.remove()

torch.Size([128, 9120])
hook


AttributeError: 'NoneType' object has no attribute 'append'

### Inspect the final classification layers

In [405]:
fc1_weight = model.decoder1.weight.data
fc2_weight = model.decoder2.weight.data

plt.imshow(fc1_weight.cpu().numpy())
plt.show()

In [406]:
dataset = feas1_ecg_data_val
dataset["class_prediction"] = dataset["prediction"].map(lambda x: np.argmax(x))
selection = dataset[(dataset["class_prediction"] == 1) & (dataset["class_index"] == 1) & (dataset["noise_probs"]< 0)]

In [69]:
np_signal = np.vstack(selection["data"].values)
np_rri = np.vstack(selection["rri_feature"].values)
rri_len = selection["rri_len"].values

In [70]:
signal = torch.tensor(np_signal, dtype=torch.float, device=device)
rri = torch.tensor(np_rri, dtype=torch.float, device=device)
rri_lens = torch.tensor(rri_len, device=device)

encoder_out = None

def get_encoding(module, x, y):
    global encoder_out
    print("hook")
    print(x[0].shape)
    encoder_out = x[0].detach().to("cpu")

encoding_hook = model.decoder1.register_forward_hook(get_encoding)

output = model(signal, rri, rri_lens)

encoding_hook.remove()

# Now recreate the output from just the RRI or ECG and see which makes the biggest impact
fc1_weight = model.decoder1.weight.data.to("cpu")
fc2_weight = model.decoder2.weight.data.to("cpu")


ecg_out = fc1_weight[:, :128] @ torch.unsqueeze(encoder_out[:, :128], dim=-1)
rri_out = fc1_weight[:, 128:] @ torch.unsqueeze(encoder_out[:, 128:], dim=-1)

print(ecg_out.shape)
print(ecg_out.shape)

ecg_out = nn.functional.relu(ecg_out)
rri_out = nn.functional.relu(rri_out)

ecg_out = fc2_weight @ ecg_out
rri_out = fc2_weight @ rri_out

print(ecg_out.shape)
print(ecg_out.shape)

plt.figure()
plt.title("ECG Signal Outputs")
plt.imshow(ecg_out)
plt.colorbar()

plt.figure()
plt.title("RRI Sequence Outputs")
plt.imshow(rri_out)
plt.colorbar()

plt.show()

hook
torch.Size([17, 192])
torch.Size([17, 128, 1])
torch.Size([17, 128, 1])
torch.Size([17, 3, 1])
torch.Size([17, 3, 1])


### TSNE the encoder outputs

In [71]:
from sklearn.manifold import TSNE

In [128]:
encoder_out = []
class_indexes = []
inds = []

def get_encoding(module, x, y):
    encoder_out.append(x[0].detach().to("cpu").numpy())

encoding_hook = model.decoder1.register_forward_hook(get_encoding)

dataloader = test_dataloader_safer

with torch.no_grad():
        for i, (signals, labels, ind) in enumerate(dataloader):
            signal = signals[0].to(device).float()
            rris = signals[1].to(device).float()
            rri_len = signals[2].to(device).float()

            labels = labels.long().detach().numpy()
            class_indexes.append(labels)

            output = model(signal, rris, rri_len).detach().to("cpu").numpy()
            inds.append(ind.cpu().detach().numpy())

encoding_hook.remove()

encoder_out = np.concatenate(encoder_out, axis=0)
class_indexes = np.concatenate(class_indexes, axis=0)
inds = np.concatenate(inds, axis=0)

test_dataset_safer["encodings"] = pd.Series(data=[encoder_out[i] for i in range(encoder_out.shape[0])], index=inds)

tsne = TSNE(perplexity=30)
embeddings = tsne.fit_transform(encoder_out)

print(embeddings.shape)



(4078, 2)


In [132]:
tsne = TSNE(perplexity=10)
embeddings = tsne.fit_transform(encoder_out)

plt.scatter(embeddings[:, 0], embeddings[:, 1], c=class_indexes, marker="x")
plt.colorbar()



### Performance Evaluation for SAFER (patient wise)

In [79]:
# Compute predictions for the entire feas1 dataset, with the cross validated models

feas1_ecg_predictions = []

for i in range(5):
    model = TransformerModel(3, embed_dim, n_head, 512, 6, n_fft, n_inp_rri, device=device).to(device)
    model.load_state_dict(torch.load(f"TrainedModels/Transformer_29_May_final_cross_validate_{i}_training_curve.pt", map_location=device))
    model.eval()

    feas1_ecg_data_test = pd.read_pickle(f"C:/Users/daniel/Documents/feas1_train_set_cross_validate_{i}.pk")
    feas1_ecg_data_test_all = feas1_ecg_data[feas1_ecg_data["ptID"].isin(feas1_ecg_data_test["ptID"])]
    print(feas1_ecg_data_test_all["class_index"].value_counts())

    feas1_test_dataloader = get_dataloaders(feas1_ecg_data_test_all, 64)

    get_predictions(model, feas1_test_dataloader, feas1_ecg_data_test_all)
    feas1_ecg_predictions.append(feas1_ecg_data_test_all.drop("data", axis=1))

feas1_ecg_predictions = pd.concat(feas1_ecg_predictions)

(2, 18)
0    32499
2      707
1      218
Name: class_index, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


(2, 18)
0    30217
2      584
1       86
Name: class_index, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


(2, 18)
0    29726
2      656
1      168
Name: class_index, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


(2, 18)
0    28603
2      687
1      113
Name: class_index, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


(2, 18)
0    28336
2      476
1      156
Name: class_index, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


In [81]:
feas1_ecg_predictions.to_pickle("C:/Users/daniel/Documents/feas1_train_set_cross_validate_all_predictions_even_noisy.pk")

In [12]:
feas1_ecg_predictions = pd.read_pickle("C:/Users/daniel/Documents/feas1_train_set_cross_validate_all_predictions_even_noisy.pk")

In [15]:
feas1_ecg_predictions["noise_prediction"] = feas1_noise_predictions

In [17]:
(feas1_ecg_predictions[feas1_ecg_predictions["poss_AF_tag"] == 1]["noise_prediction"].dropna() < 0).value_counts()

True     5860
False    4323
Name: noise_prediction, dtype: int64

In [18]:
(ecg_data[ecg_data["poss_AF_tag"] == 1]["noise_prediction"] <= 0).value_counts()

KeyError: 'noise_prediction'

In [36]:
ecg_data["prediction"] = feas1_ecg_predictions["prediction"]

In [29]:
def print_binary_results(conf_mat):
    print("Confusion matrix:")
    print(conf_mat)

    print(f"Sensitivity: {conf_mat[1, 1] / np.sum(conf_mat[1]):0.3f}")
    print(f"Specificity: {(conf_mat[0, 0]) / np.sum(conf_mat[0]):0.3f}")
    print("")

    print(f"Normal F1: {F1_ind(conf_mat, 0):0.3f}")
    print(f"AF F1: {F1_ind(conf_mat, 1):0.3f}")

In [30]:
def get_total_num_review(pt_ordered_ecg_groups):
    total_num_review = 0
    for _, g in pt_ordered_ecg_groups:
        total_num_review += (g["ECGIsAF"].cumsum() == 0).sum()
    return total_num_review

def get_pt_conf_mat(pt_data, pt_has_af_review, ground_truth):
    pt_data.loc[:, "pt_prediction_af"] = False
    pt_data.loc[:, "pt_prediction_af"] = pt_has_af_review
    pt_data["pt_prediction_af"].fillna(False, axis=0, inplace=True)

    return confusion_matrix(ground_truth, pt_data["pt_prediction_af"].astype(bool))

In [21]:
pt_data = feas1_pt_data

NameError: name 'feas1_pt_data' is not defined

In [31]:
pt_data["ptDiag"].value_counts()
allowed_pt_diags = [1, 2, 4 ] # [1, 2, 4 ], for feas 1   [1, 3] for feas 2
af_pt_diag = 2  # 2 for feas 1, 1 for feas 2

pt_data_limited_diag = pt_data[pt_data["ptDiag"].isin(allowed_pt_diags)] # .isin([4, 2, 1])] # limit diagnoses - avoid screening failure and such!   feas2 limitations are [1, 3]

dataset = feas1_ecg_predictions[feas1_ecg_predictions["noise_prediction"] < 100].dropna(subset=["prediction"])
dataset["prob_af"] = dataset["prediction"].map(lambda x: softmax(x)[1])
dataset["ECGIsAF"] = (dataset["measDiag"] == DiagEnum.AF)

ground_truth = dataset.groupby("ptID")["ECGIsAF"].any()  # Any patient with at least 1 ECG signal labelled as AF is also AF - some patients were diagnosed with ECGs which were not included in the data?
pt_in_dataset = pt_data.loc[ground_truth.index][pt_data["ptDiag"].isin(allowed_pt_diags)]
dataset = dataset[dataset["ptID"].isin(pt_in_dataset["ptID"])]
ground_truth = dataset.groupby("ptID")["ECGIsAF"].any()  # Any patient with at least 1 ECG signal labelled as AF is also AF - some patients were diagnosed with ECGs which were not included in the data?

print(ground_truth.value_counts())
print(pt_in_dataset["ptDiag"].value_counts())  # how did these 2 get their AF diagnosis with no AF ECGs!

False    2043
True       48
Name: ECGIsAF, dtype: int64
4    2025
2      64
1       2
Name: ptDiag, dtype: int64


  pt_in_dataset = pt_data.loc[ground_truth.index][pt_data["ptDiag"].isin(allowed_pt_diags)]


In [32]:
# compute the number of patients who have at least one of the signals labelled as AF
dataset["class_prediction"] = dataset["prediction"].map(np.argmax)
dataset["pred_af"] = dataset["class_prediction"] == 1
pt_diagnoses = dataset.groupby("ptID")["pred_af"].any()

pt_in_dataset.loc[:, "pt_prediction_af"] = pt_diagnoses

val_patients = pt_in_dataset.dropna(subset=["pt_prediction_af"])
pt_conf_mat = confusion_matrix((val_patients["ptDiag"] == af_pt_diag).values, val_patients["pt_prediction_af"].astype(bool))

print_binary_results(pt_conf_mat)

print(f"Max number of ECGs which must be reviewed: {val_patients[val_patients['pt_prediction_af'] == 1]['noHQrecs'].sum()}")

Confusion matrix:
[[1897  130]
 [  14   50]]
Sensitivity: 0.781
Specificity: 0.936

Normal F1: 0.963
AF F1: 0.410
Max number of ECGs which must be reviewed: 15011


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_in_dataset.loc[:, "pt_prediction_af"] = pt_diagnoses


In [33]:
# Alternatively, lets see if the cardiologist reviews only a fixed number of ECGs in order of prob AF.
dataset["prob_af"] = dataset["prediction"].map(lambda x: softmax(x)[1])
dataset["ECGIsAF"] = (dataset["measDiag"] == DiagEnum.AF)
pt_ordered_ecg_groups = dataset.sort_values("prob_af", ascending=False).groupby("ptID")

num_ecgs_review = 1000
pt_has_af_limited_review = pt_ordered_ecg_groups.head(num_ecgs_review).groupby("ptID")["ECGIsAF"].any()

pt_conf_mat = get_pt_conf_mat(pt_in_dataset, pt_has_af_limited_review, ground_truth)
print_binary_results(pt_conf_mat)

Confusion matrix:
[[2043    0]
 [   0   48]]
Sensitivity: 1.000
Specificity: 1.000

Normal F1: 1.000
AF F1: 1.000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = pt_has_af_review
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data["pt_prediction_af"].fillna(False, axis=0, inplace=True)


In [41]:
# Plot curve as a function of number of reviews per patient
from matplotlib.ticker import MaxNLocator, MultipleLocator

num_patients = len(val_patients.index)
max_num_reviews = 8
num_af_found = np.zeros(max_num_reviews)
num_review = np.arange(max_num_reviews)
num_review_avg = np.array([get_total_num_review(pt_ordered_ecg_groups.head(n).groupby("ptID")) for n in num_review])/num_patients

for i, num_ecgs_review in enumerate(num_review):
    if i == 0:
        continue
        # IDK why but i dont get a decent answer for 0
    pt_has_af_limited_review = pt_ordered_ecg_groups.head(num_ecgs_review).groupby("ptID")["ECGIsAF"].any()

    pt_conf_mat = get_pt_conf_mat(pt_in_dataset, pt_has_af_limited_review, ground_truth)
    num_af_found[i] = pt_conf_mat[1, 1]

fig, ax = plt.subplots(figsize=(6, 4), dpi=250)
ax.plot(num_review_avg, num_af_found)
ax.plot([0, num_review_avg.max()], [48, 48], linestyle="--")
for x, y, T in zip(num_review_avg, num_af_found, num_review):
    ax.annotate(f"{T}", (x, y))

ax.set_ylabel("AF patients detected")
ax.set_xlabel("Average ECGs reviewed per patient")

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.xaxis.set_minor_locator(MultipleLocator(0.2))
ax.yaxis.set_minor_locator(MultipleLocator(1))

fig.tight_layout()

plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = pt_has_af_review
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data["pt_prediction_af"].fillna(False, axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [42]:
# Plot num reviews per AF diagnosis vs num AF diagnosis
total_num_review = np.array([get_total_num_review(pt_ordered_ecg_groups.head(n).groupby("ptID")) for n in num_review]) # num_review * num_patients
num_review_per_af = total_num_review[1:]/num_af_found[1:]

fig, ax = plt.subplots(figsize=(6, 4), dpi=250)
ax.plot(num_af_found[1:], num_review_per_af)
ax.plot([48, 48], [0, num_review_per_af.max()],  linestyle="--")

ax.set_ylabel("Reviews per AF patient")
ax.set_xlabel("AF patients found")

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.xaxis.set_minor_locator(MultipleLocator(1))
ax.yaxis.set_minor_locator(MultipleLocator(5))

# ax.set_xlim(left=0)
ax.set_ylim(bottom=0)

fig.tight_layout()

plt.show()

In [43]:
total_num_review_limited = total_num_review
num_af_found_limited = num_af_found
num_review_per_af_limited = num_review_per_af

In [203]:
fig, ax = plt.subplots(figsize=(6, 4), dpi=250)
ax.plot(num_review, total_num_review)

ax.set_ylabel("Total reviews")
ax.set_xlabel(r"Max ECGs reviewed per patient, $n$")

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# ax.xaxis.set_minor_locator(MultipleLocator(1))
ax.yaxis.set_minor_locator(MultipleLocator(200))

ax.set_xlim(left=0)
ax.set_ylim(bottom=0)

fig.tight_layout()

plt.show()

In [99]:
total_num_review

array([   0, 1412, 2450, 3245, 3884, 4407, 4852, 5221], dtype=int64)

In [102]:
num_review * num_patients

array([    0,  2091,  4182,  6273,  8364, 10455, 12546, 14637])

In [40]:
# What if we only use Zenicor flagged ECGs - The system has much worse sensitivity than Zenicor therefore not much benefit to not sending it through Zenicor first!
# This is good actually!

dataset_flagged = dataset[(dataset["poss_AF_tag"] == 1)]
pt_ordered_ecg_groups = dataset_flagged.sort_values("prob_af", ascending=False).groupby("ptID")

num_ecgs_review = 3
pt_has_af_limited_review = pt_ordered_ecg_groups.head(num_ecgs_review).groupby("ptID")["ECGIsAF"].any()
print(get_total_num_review(pt_ordered_ecg_groups))

pt_conf_mat = get_pt_conf_mat(pt_in_dataset, pt_has_af_limited_review, ground_truth)
print_binary_results(pt_conf_mat)

8591
Confusion matrix:
[[2043    0]
 [   2   46]]
Sensitivity: 0.958
Specificity: 1.000

Normal F1: 1.000
AF F1: 0.979


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = pt_has_af_review
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data["pt_prediction_af"].fillna(False, axis=0, inplace=True)


##### Thresholding - set a limit on p(AF)

In [200]:
plt.hist(dataset[dataset["ECGIsAF"]]["prob_af"], alpha=0.7)
plt.hist(dataset[~dataset["ECGIsAF"]]["prob_af"], alpha=0.7)
plt.show()

In [50]:
T = 0.5
dataset_thresh = dataset[(dataset["prob_af"] > T) & (dataset["poss_AF_tag"] == 1)]
pt_ordered_ecg_groups_thresh = dataset_thresh.sort_values("prob_af", ascending=False).groupby("ptID")
pt_has_af_thresholded_review = pt_ordered_ecg_groups_thresh["ECGIsAF"].any()
print(pt_has_af_thresholded_review.value_counts())

pt_conf_mat = get_pt_conf_mat(pt_in_dataset, pt_has_af_thresholded_review, ground_truth)

print_binary_results(pt_conf_mat)

False    80
True     39
Name: ECGIsAF, dtype: int64
Confusion matrix:
[[2043    0]
 [   9   39]]
Sensitivity: 0.812
Specificity: 1.000

Normal F1: 0.998
AF F1: 0.897


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = pt_has_af_review
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data["pt_prediction_af"].fillna(False, axis=0, inplace=True)


In [51]:
# Plot curve as a function of number of reviews per patient
n_points = 101
Ts = np.linspace(0, 1, n_points)
num_af_found = np.zeros(n_points)
num_reviews = np.zeros(n_points)

for i, T in enumerate(Ts):
    dataset_thresh = dataset[(dataset["prob_af"] > T) & (dataset["poss_AF_tag"] == 1) & (dataset["noise_prediction"] < 100)]
    pt_ordered_ecg_groups_thresh = dataset_thresh.sort_values("prob_af", ascending=False).groupby("ptID")
    pt_has_af_thresholded_review = pt_ordered_ecg_groups_thresh["ECGIsAF"].any()

    pt_conf_mat = get_pt_conf_mat(pt_in_dataset, pt_has_af_thresholded_review, ground_truth)
    num_af_found[i] = pt_conf_mat[1, 1]
    num_reviews[i] = get_total_num_review(pt_ordered_ecg_groups_thresh)

fig, ax = plt.subplots(figsize=(6, 4), dpi=250)
ax.plot(num_reviews/len(pt_in_dataset.index), num_af_found)
for x, y, T in zip(num_reviews[::20], num_af_found[::20], Ts[::20]):
    ax.annotate(f"{T:.2f}", (x/len(pt_in_dataset.index), y))

ax.plot([0, 4], [48, 48], linestyle="--")

ax.set_ylabel("AF patients detected")
ax.set_xlabel("ECGs reviewed per patient")

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_xlim(left=0, right=1.5)
ax.set_ylim(bottom=0)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

ax.xaxis.set_minor_locator(MultipleLocator(0.2))
ax.yaxis.set_minor_locator(MultipleLocator(1))

fig.tight_layout()

plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data.loc[:, "pt_prediction_af"] = pt_has_af_review
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_data["pt_prediction_af"].fillna(False, axis=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [45]:
print(num_af_found[15])
print(num_reviews[15])
print(Ts[15])

IndexError: index 15 is out of bounds for axis 0 with size 8

In [48]:
# total_num_review_limited
# num_af_found_limited
num_review_per_af_limited

array([ 32.8372093 ,  54.44444444,  70.54347826,  82.63829787,
        91.8125    , 101.08333333, 108.77083333])

In [55]:
# Plot num reviews per AF diagnosis vs num AF diagnosis
total_num_review = num_reviews
num_review_per_af = total_num_review/num_af_found


fig, ax = plt.subplots(figsize=(5, 3.5), dpi=250)
ax.plot([48, 48], [0, num_review_per_af_limited.max()],  linestyle="--", color="#ff7f0e")
ax.plot(num_af_found_limited[1:], num_review_per_af_limited, color="#2ca02c", label="Limited reviews per patient")
ax.plot(num_af_found, num_review_per_af, color="#1f77b4", label="Probability threshold")

# ax.legend()

ax.set_ylabel("Reviews per AF patient")
ax.set_xlabel("AF patients found")

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_xlim(left=40)
ax.set_ylim(bottom=0)

ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.xaxis.set_minor_locator(MultipleLocator(1))
ax.yaxis.set_minor_locator(MultipleLocator(5))

fig.tight_layout()

plt.show()

  num_review_per_af = total_num_review/num_af_found


In [143]:
num_af_found[14:17]

array([48., 48., 47.])

In [141]:
num_review_per_af[15]

42.645833333333336

Whats going on with the AF

In [29]:
pt_data = SAFERDataset.load_pt_dataset(1)
ecg_data = SAFERDataset.load_ecg_csv(1, pt_data, ecg_range=None, ecg_meas_diag=None, feas2_offset=10000, feas2_ecg_offset=200000)

ecg_data["ECGIsAF"] = ecg_data["measDiag"] == DiagEnum.AF
print(ecg_data.groupby("ptID")["ECGIsAF"].any().value_counts())

ecg_data["feas"] = 1
ecg_data["length"] = 9120
ecg_data["rri_len"] = 20

# this removes a number of the AF samples, IDK where the other AF samples are going!
pt_data, ecg_data = prepare_safer_data(pt_data, ecg_data)
# train_pts, test_pts, val_pts = generate_patient_splits(pt_data, 0.15, 0.15)

False    2086
True       55
Name: ECGIsAF, dtype: int64


In [28]:
ecg_data["ECGIsAF"] = ecg_data["measDiag"] == DiagEnum.AF
ecg_data.groupby("ptID")["ECGIsAF"].any().value_counts()

False    2083
True       48
Name: ECGIsAF, dtype: int64

In [35]:
ecg_data

Unnamed: 0_level_0,ptID,age,ptDiag,ptDiagRev1,ptDiagRev2,ptDiagRev3,cardRev,measDiag,measDiagRev1,measDiagRev2,...,perhapsAF,measID,data,adc_gain,file_path,class_index,length,ECGIsAF,feas,rri_len
measID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,71.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,1,,,ECGs/000000/saferF1_000001,0,9120,False,1,20
2,1,71.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,2,,,ECGs/000000/saferF1_000002,0,9120,False,1,20
3,1,71.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,3,,,ECGs/000000/saferF1_000003,0,9120,False,1,20
4,1,71.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,4,,,ECGs/000000/saferF1_000004,0,9120,False,1,20
5,1,71.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,5,,,ECGs/000000/saferF1_000005,0,9120,False,1,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162511,2141,70.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,162511,,,ECGs/162000/saferF1_162511,0,9120,False,1,20
162512,2141,70.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,162512,,,ECGs/162000/saferF1_162512,0,9120,False,1,20
162513,2141,70.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,162513,,,ECGs/162000/saferF1_162513,0,9120,False,1,20
162514,2141,70.0,DiagEnum.NoAF,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,0,DiagEnum.Undecided,DiagEnum.Undecided,DiagEnum.Undecided,...,0,162514,,,ECGs/162000/saferF1_162514,0,9120,False,1,20


In [55]:
ecg_data = ecg_data.loc[feas1_noise_predictions.index][feas1_noise_predictions < 0]

In [56]:
ecg_data["ECGIsAF"] = ecg_data["measDiag"] == DiagEnum.AF
ecg_data.groupby("ptID")["ECGIsAF"].any().value_counts()

False    2067
True       41
Name: ECGIsAF, dtype: int64