In [1]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import numpy as np
import pandas as pd

import sys
import os

from ecgdetectors import Detectors

import math
from DataHandlers.DiagEnum import DiagEnum
import DataHandlers.DiagEnum
import DataHandlers.SAFERDataset as SAFERDataset
import DataHandlers.CinC2020Dataset as CinC2020Dataset
import DataHandlers.CinC2020Enums
import importlib
import DataHandlers.CinCDataset as CinCDataset
import DataHandlers.DataAugmentations as DataAugmentations
from multiprocesspandas import applyparallel
importlib.reload(SAFERDataset)

# A fudge because I moved the files
sys.modules["SAFERDataset"] = SAFERDataset
sys.modules["CinC2020Dataset"] = CinC2020Dataset
sys.modules["DiagEnum"] = DataHandlers.DiagEnum
sys.modules["CinC2020Enums"] = DataHandlers.CinC2020Enums
sys.modules["CinCDataset"] = CinCDataset

In [15]:
import torch
from torch import nn
from torch import functional as F

if torch.cuda.is_available():
    print("Using Cuda")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

Using Cuda


### Load SAFER data

In [5]:
def get_rri_feature(x, n_beats=60):
    diffs = np.diff(x)[1:-1] # Discard first and last in case it is a false detect
    L = diffs.shape[0]
    if L > n_beats-1:
        return diffs[:n_beats-1]
    else:
        return np.pad(diffs, (0, n_beats-1-L), constant_values=0)

def get_r_peaks(x, detector):
    import numpy as np
    from scipy.signal import windows
    half_window_size = 50
    raw_peak_pos = np.array(detector.hamilton_detector(x["data"]))

    # Now find the closest peak to this value
    padded_data = np.pad(x["data"], half_window_size, constant_values=0)
    window = windows.hamming(2*half_window_size)  # window to bias the detector towards a nearby max rather than a far away one
    # Note the offset of half window size to account for the padding
    signal_segs = np.array([padded_data[rpp:rpp + 2*half_window_size] for rpp in raw_peak_pos]) * window[None, :]
    max_positions = np.argmax(signal_segs, axis=1) + raw_peak_pos - half_window_size
    return max_positions

In [56]:
feas2_pt_data, feas2_ecg_data = SAFERDataset.load_feas_dataset(2, "dataframe_heartrate")

In [57]:
def filter_func(pt_data, ecg_data):
    accepted_meas_diags = [DiagEnum.AF, DiagEnum.NoAF, DiagEnum.HeartBlock]
    ecg_data = ecg_data[(ecg_data["measDiag"].isin(accepted_meas_diags)) | (ecg_data["measID"] < 20000) | (ecg_data["not_tagged_ign_wide_qrs"] == 0)]
    pt_data = pt_data[pt_data["ptID"].isin(ecg_data["ptID"])]

    return pt_data, ecg_data

# warning: changing these chunk sizes may reload feas1 data from scratch, which will take ages
chunk_size = 20000
num_chunks = math.ceil(162515 / chunk_size )

ecg_data = []
pt_data = []

for chunk_num in range(num_chunks):
    feas1_pt_data, feas1_ecg_data = SAFERDataset.load_feas_dataset(1, f"dataframe_{chunk_num}_heartrate", filter_func=filter_func)
    print(len(feas1_ecg_data.index))
    feas1_pt_data, feas1_ecg_data = filter_func(feas1_pt_data, feas1_ecg_data)

    ecg_data.append(feas1_ecg_data)
    pt_data.append(feas1_pt_data)

feas1_ecg_data = pd.concat(ecg_data)
feas1_ecg_data["feas"] = 1
feas1_pt_data = pd.concat(pt_data)

19999
20000
19991
20000
20000
19943
20000
20000
2516


In [118]:
selection = safer_ecg_data[(safer_ecg_data["measDiag"] == DiagEnum.NoAF)]

for _, ecg in selection.sample(frac=1).iterrows():
    print(ecg["data"].shape)
    print(ecg["rri_feature"])
    print(ecg["measDiag"])
    plot_ecg(ecg["data"], 300, r_peaks=ecg["r_peaks"], n_split=3)# , attention=ecg["attention"][0][0][0], num_segments=ecg["attention"].shape[-1])
    plot_ecg_poincare(ecg["rri_feature"])
    plt.show()

(3000,)
[0.62       1.27333333 0.52       1.23666667 0.53       0.47333333
 0.81       0.55       0.46666667 0.80333333 0.55       1.29666667
 0.52666667 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.        ]
DiagEnum.NoAF
(3000,)
[0.56       0.58333333 0.49666667 0.66666667 0.57333333 0.58666667
 0.54       0.65333333 0.58       0.60666667 0.5        0.72666667
 0.40666667 0.82666667 0.40666667 0.84       0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.        

KeyboardInterrupt: 

In [58]:
safer_ecg_data = pd.concat([feas2_ecg_data, feas1_ecg_data])
safer_pt_data = pd.concat([feas2_pt_data, feas1_pt_data])

safer_ecg_data = safer_ecg_data[safer_ecg_data["length"] == 9120]
safer_ecg_data = safer_ecg_data[safer_ecg_data["measDiag"] != DiagEnum.PoorQuality]
safer_ecg_Data = safer_ecg_data[safer_ecg_data["tag_orig_Poor_Quality"] == 0]

In [None]:
# No need to load any data if training on all of feas1, if just training on a small section then use this
feas1_pt_data, feas1_ecg_data = SAFERDataset.load_feas_dataset(1, "dataframe_0.pk")
feas1_pt_data_card_reviewed, feas1_ecg_data_card_reviewed = SAFERDataset.load_feas_dataset(1, "dataframe", ecg_meas_diag=[DiagEnum.AF, DiagEnum.NoAF, DiagEnum.CannotExcludePathology, DiagEnum.HeartBlock])

# Remove duplicates
feas1_pt_data = feas1_pt_data[~feas1_pt_data["ptID"].isin(feas1_pt_data_card_reviewed["ptID"])]
feas1_ecg_data = feas1_ecg_data[~feas2_ecg_data["measID"].isin(feas1_ecg_data_card_reviewed["measID"])]

feas1_ecg_data["feas"] = 1
feas1_ecg_data_card_reviewed["feas"] = 1

safer_ecg_data = pd.concat([feas2_ecg_data, feas1_ecg_data, feas1_ecg_data_card_reviewed], ignore_index=True)
safer_pt_data = pd.concat([feas2_pt_data, feas1_pt_data, feas1_pt_data_card_reviewed])

safer_ecg_data = safer_ecg_data[safer_ecg_data["length"] == 9120]
safer_ecg_data = safer_ecg_data[safer_ecg_data["measDiag"] != DiagEnum.PoorQuality]

In [49]:
# If training on all of Feas1 then just load the patient data
feas1_pt_data = SAFERDataset.load_pt_dataset(1)
feas1_dummy_ecg_data = SAFERDataset.load_ecg_csv(1, feas1_pt_data, None, None, 0)  # the dummy data to hold a place
# safer_ecg_data = pd.concat([feas2_ecg_data, feas1_dummy_ecg_data], ignore_index=True)

In [59]:
safer_ecg_data = SAFERDataset.generate_af_class_labels(safer_ecg_data)

In [60]:
safer_pt_data["noNormalRecs"] = safer_ecg_data[safer_ecg_data["class_index"] == 0]["ptID"].value_counts()
safer_pt_data["noAFRecs"] = safer_ecg_data[safer_ecg_data["class_index"] == 1]["ptID"].value_counts()
safer_pt_data["noOtherRecs"] = safer_ecg_data[safer_ecg_data["class_index"] == 2]["ptID"].value_counts()

In [61]:
safer_pt_data["noAFRecs"] = safer_pt_data["noAFRecs"].fillna(0)
safer_pt_data["noNormalRecs"] = safer_pt_data["noNormalRecs"].fillna(0)
safer_pt_data["noOtherRecs"] = safer_pt_data["noOtherRecs"].fillna(0)

In [62]:
print(safer_ecg_data["class_index"].value_counts())

0    45581
2     3868
1      757
Name: class_index, dtype: int64


In [63]:
detectors = Detectors(300)
safer_ecg_data["r_peaks_hamilton"] = safer_ecg_data.apply_parallel(get_r_peaks, detector=detectors)

100%|██████████| 50206/50206 [00:49<00:00, 1022.81it/s]


In [138]:
# Test to cut just the centre portion of SAFER data

def cut_ecg_center(ecg, cut_size):
    ecg_len = ecg["data"].shape[0]
    return ecg["data"][int((ecg_len - cut_size)/2):-int((ecg_len - cut_size)/2)]

def cut_ecg_adjust_r_peaks(ecg, cut_size):
    ecg_len = ecg["data"].shape[0]
    shifted_peaks = ecg["r_peaks_hamilton"] - int((ecg_len - cut_size)/2)
    return shifted_peaks[np.logical_and(shifted_peaks >= 0, shifted_peaks < cut_size)]

safer_ecg_data["r_peaks"] = safer_ecg_data.apply(lambda x: cut_ecg_adjust_r_peaks(x, 3000), axis=1)
safer_ecg_data["data"] = safer_ecg_data.apply(lambda x: cut_ecg_center(x, 3000), axis=1)

In [64]:
safer_ecg_data["rri_feature"] = (safer_ecg_data["r_peaks_hamilton"].map(np.array)/300).map(lambda x: get_rri_feature(x, 60))

In [65]:
fewer_than_5_beats_found = safer_ecg_data["rri_feature"].map(lambda x: np.sum(x == -1) > 55)
print(fewer_than_5_beats_found.value_counts())
safer_ecg_data = safer_ecg_data[~fewer_than_5_beats_found]

False    50206
Name: rri_feature, dtype: int64


In [66]:
def validate_r_peaks(x):
    try:
        if type(x) == np.ndarray:
            return True
        else:
            print(x)
            return False
    except(Exception):
        print(x)
        return False


In [67]:
def validate_data(x):
    try:
        return x.shape[0] == 9120
    except(Exception):
        return False

safer_ecg_data = safer_ecg_data[safer_ecg_data["data"].map(validate_data)]

In [68]:
print(safer_ecg_data["heartrate"].min())
print(safer_ecg_data["heartrate"].max())

plt.hist(safer_ecg_data["heartrate"][safer_ecg_data["measDiag"] != DiagEnum.AF], alpha=0.7, density=True)
plt.hist(safer_ecg_data["heartrate"][safer_ecg_data["measDiag"] == DiagEnum.AF], alpha=0.7, density=True)
plt.show()

1.9736842105263157
179.60526315789474


In [15]:
# Cut out high and low heartrates - this doesnt make a difference I swear
safer_ecg_data = safer_ecg_data[(safer_ecg_data["heartrate"] < 120) & (safer_ecg_data["heartrate"] > 50)]

In [18]:
# Check the F1 score of Zenicor - its not even that bad! - I cant actually check this because Ive filtered on zenicor labels and used zenicor for the labelling

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, multilabel_confusion_matrix

conf_mat = confusion_matrix(safer_ecg_data["class_index"], safer_ecg_data["poss_AF_tag"])
print(conf_mat)

# Same as the below function (as described in CinC)
def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print("Confusion matrix:")
print(conf_mat)

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"AF F1: {F1_ind(conf_mat, 1)}")
print(f"Other F1: {F1_ind(conf_mat, 2)}")

[[37056     0     0]
 [   60   697     0]
 [ 3584 11295     0]]
Confusion matrix:
[[37056     0     0]
 [   60   697     0]
 [ 3584 11295     0]]
Normal F1: 0.9531354493543907
AF F1: 0.10934190916934662
Other F1: 0.0


In [69]:
# Normalise the RRI features
safer_ecg_data["rri_feature_unpadded"] = safer_ecg_data["rri_feature"].map(lambda x: x[x > 0])
mean_feature = safer_ecg_data["rri_feature_unpadded"].map(lambda x: x.mean()).mean()
std_feature = safer_ecg_data["rri_feature_unpadded"].map(lambda x: x.std()).mean()

safer_ecg_data["rri_feature"] = (safer_ecg_data["rri_feature"] - mean_feature)/std_feature

  mean_feature = safer_ecg_data["rri_feature_unpadded"].map(lambda x: x.mean()).mean()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


### Load CinC 2020 data

In [181]:
import DataHandlers.CinC2020Dataset as CinC2020Dataset
import importlib
importlib.reload(CinC2020Dataset)

df = CinC2020Dataset.load_dataset(save_name="dataframe")

In [182]:
# At the moment we only select data with length which can be truncated to 3000 samples (10s)

def select_length(df):
    df_within_range = df[(df["length"] <= 5000) & (df["length"] >= 3000)].copy()
    df_within_range["data"] = df_within_range["data"].map(lambda x: x[:3000])
    df_within_range["length"] = df_within_range["data"].map(lambda x: x.shape[0])
    return df_within_range

df = select_length(df)
df["length"].value_counts()

3000    48030
Name: length, dtype: int64

In [183]:
print(df["heartrate"].min())
print(df["heartrate"].max())

plt.hist(df["heartrate"])
plt.show()

18.0
186.0


In [184]:
df["r_peaks_hamilton"] = df.apply_parallel(get_r_peaks, detector=detectors)

100%|██████████| 48030/48030 [00:22<00:00, 2154.14it/s]


In [14]:
df["r_peaks_hamilton"] = df["r_peaks_hamilton"].map(lambda x: x[x < 3000])

In [187]:
df["rri_feature"] = (df["r_peaks_hamilton"]/df["fs"]).map(lambda x: get_rri_feature(x, 20))

In [188]:
# Show a breakdown of counts from each dataset
df["dataset"] = df["filepath"].map(lambda x: x.split(os.sep)[-3])
df.groupby("dataset")["class_index"].value_counts()

dataset               class_index
cpsc_2018             2               2373
                      1                904
                      0                661
cpsc_2018_extra       2                715
                      1                113
                      0                  3
georgia               2               7041
                      0               1735
                      1                568
ptb-xl                2              10646
                      0               9400
                      1               1514
st_petersburg_incart  0               5841
                      2               5506
                      1               1010
Name: class_index, dtype: int64

In [237]:
# Normalise the RRI features
df["rri_feature_unpadded"] = df["rri_feature"].map(lambda x: x[x > 0])
mean_feature = df["rri_feature_unpadded"].map(lambda x: x.mean()).mean()
std_feature = df["rri_feature_unpadded"].map(lambda x: x.std()).mean()

df["rri_feature"] = (df["rri_feature"] - mean_feature)/std_feature

  mean_feature = df["rri_feature_unpadded"].map(lambda x: x.mean()).mean()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [238]:
df["rri_feature"]

0        [-3.5642334312905386, -1.6056394196769463, -1....
1        [0.007320354593068717, 0.03612320770503396, 0....
2        [-3.7658534030742894, -3.5930362844025017, -2....
3        [0.41056029816057343, -2.556133572371776, -3.1...
5        [0.26654603260075205, 1.0730259197357594, 0.75...
                               ...                        
56053    [2.5707742815579175, 0.3241517388246787, -0.82...
56054    [0.4105602981605744, 0.5833774168323601, -2.12...
56055    [0.4969688574964663, -1.6344422727889116, 2.05...
56056    [0.8138002417280772, 1.0730259197357614, 1.044...
56057    [2.39795716288613, 1.0442230666237944, 1.30344...
Name: rri_feature, Length: 48030, dtype: object

### Load noise from MIT database

In [127]:
import wfdb
import os
from scipy import signal

noises = ["em", "ma"]
noise_dfs = []
mit_dataset_path = "Datasets/mit-bih-noise-stress-test-database"

f_low = 0.67
f_high = 25

def filter_and_norm(x, sos):
    x_filt = signal.sosfiltfilt(sos, x, padlen=150)
    x_norm = (x_filt - x_filt.mean()) / x_filt.std()
    return x_norm

def resample(x, orig_fs, resample_rate):
    resample_len = int(round(x.shape[-1] * resample_rate/orig_fs))
    return x if (x.shape[-1] == resample_len) else signal.resample(x, resample_len)

def split_signal(data, split_len):
    data_splits = []
    splits = np.arange(0, data["data"].shape[0], split_len)

    for i, (start, end) in enumerate(zip(splits, splits[1:])):
        data_split = data.copy()
        data_split["data"] = data["data"][start:end]
        data_split["data"] = (data_split["data"] - data_split["data"].mean())/ data_split["data"].std()

        data_split.name = i
        data_splits.append(data_split)

    return data_splits


for n_path in noises:
    rec = wfdb.rdrecord(os.path.join(mit_dataset_path, n_path))
    sig = np.concatenate([rec.p_signal[:, 0], rec.p_signal[:, 1]])

    bandpass = signal.butter(3, [f_low, f_high], 'bandpass', fs=rec.fs, output='sos')
    notch = signal.butter(3, [48, 52], 'bandstop', fs=rec.fs, output='sos')

    sig = filter_and_norm(sig, bandpass)
    sig = filter_and_norm(sig, notch)

    sig = resample(sig, rec.fs, 300)
    sig_series = pd.Series(data={"data": sig, "fs": 300, "noise_type": n_path})

    split_signals = split_signal(sig_series, 3000)
    split_signals = pd.DataFrame(split_signals)

    noise_dfs.append(split_signals)

noise_df = pd.concat(noise_dfs, ignore_index=True)

### Load CinC2017 Dataset

In [2]:
cinc2017_df = CinCDataset.load_cinc_dataset()

In [3]:
cinc2017_df = cinc2017_df[cinc2017_df["length"] == 9000]
cinc2017_df = cinc2017_df[cinc2017_df["class"] != DiagEnum.PoorQuality]

In [6]:
detectors = Detectors(300)

cinc2017_df["r_peaks"] = cinc2017_df.apply_parallel(get_r_peaks, detector=detectors)
cinc2017_df["r_peaks"] = cinc2017_df["r_peaks"].map(np.array)
cinc2017_df["heartrate"] = cinc2017_df.apply(lambda e: (len(e["r_peaks"]) / (e["length"] / 300)) * 60, axis=1)

cinc2017_df["rri_feature"] = (cinc2017_df["r_peaks"]/300).map(get_rri_feature)

100%|██████████| 5854/5854 [00:03<00:00, 1911.90it/s]


In [33]:
# Normalise the RRI features
cinc2017_df["rri_feature_unpadded"] = cinc2017_df["rri_feature"].map(lambda x: x[x > 0])
mean_feature = cinc2017_df["rri_feature_unpadded"].map(lambda x: x.mean()).mean()
std_feature = cinc2017_df["rri_feature_unpadded"].map(lambda x: x.std()).mean()

cinc2017_df["rri_feature"] = (cinc2017_df["rri_feature"] - mean_feature)/std_feature

  mean_feature = cinc2017_df["rri_feature_unpadded"].map(lambda x: x.mean()).mean()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [7]:
cinc2017_df["measDiag"] = cinc2017_df["class"]

### Generate dataloaders

In [8]:
mapper = CinC2020Dataset.CinC2020DiagMapper()
num_unique_classes = len(mapper.diag_desc.index)

# Note this only gets used for CinC data - the safer data labels were decided to have different meanings
def class_index_map(diag):
    if diag == DiagEnum.NoAF:
        return 0
    elif diag == DiagEnum.AF:
        return 1
    elif diag == DiagEnum.CannotExcludePathology:
        return 2
    elif diag == DiagEnum.Undecided:
        return 0

In [9]:
cinc2017_df["class_index"] = cinc2017_df["class"].map(class_index_map)

In [35]:
# Onehot encoding
from torch.utils.data import Dataset, DataLoader

class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, dataset):
        'Initialization'
        self.dataset = dataset
        self.noise_prob = 0
        self.temp_warp = 0


    def __len__(self):
        'Denotes the total number of samples'
        return len(self.dataset.index)

    def set_noise_prob(self, prob, power_std, noise_df):
        self.noise_prob = prob
        self.noise_power_std = power_std
        self.noise_df = noise_df

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        row = self.dataset.iloc[index]

        data = row["data"]
        rri = row["rri_feature"]

        warp = np.random.binomial(1, self.temp_warp)
        if warp:
            data, r_peaks = DataAugmentations.temporal_warp(data, row["r_peaks_hamilton"])
            rri = get_rri_feature(r_peaks, 20)

        add_noise = np.random.binomial(1, self.noise_prob)
        if add_noise:
            noise = noise_df.sample()["data"].iloc[0] * np.random.normal(scale=self.noise_power_std)
            data += noise

        X = (data, rri)
        y = row["class_index"]
        ind = row.name

        return X, y, ind

In [36]:
# For SAFER data
# Split train and test data according to each patient
# Note this function stratifies for AF and non AF!
def generate_patient_splits(pt_data, test_frac, val_frac):
    train_patients = []
    test_patients = []
    val_patients = []

    test_val_frac = test_frac + val_frac
    val_second_frac = val_frac/test_val_frac

    for val, df in pt_data.groupby("noAFRecs"):
        print(f"processing {val}")
        print(f"number of patients {len(df.index)}")



        n = math.floor(len(df.index) * test_val_frac)
        if  test_val_frac > 0:
            res = ((len(df.index) * test_val_frac) - n)/test_val_frac
        else:
            res = 0
        n += np.random.binomial(res, test_val_frac)
        test_val = df.sample(n)

        n = math.floor(len(test_val.index) * val_second_frac)
        if  val_second_frac > 0:
            res = ((len(test_val.index) * val_second_frac) - n)/val_second_frac
        else:
            res = 0
        n += np.random.binomial(res, val_second_frac)
        val = test_val.sample(n)
        val_patients.append(val)

        test_patients.append(test_val[~test_val["ptID"].isin(val["ptID"])])
        train_patients.append(df[~df["ptID"].isin(test_val["ptID"])])

    train_pt_df = pd.concat(train_patients)
    test_pt_df = pd.concat(test_patients)
    val_pt_df = pd.concat(val_patients)

    return train_pt_df, test_pt_df, val_pt_df


def make_SAFER_dataloaders(pt_data, ecg_data, test_frac, val_frac, batch_size=128):
    train_pt_df, test_pt_df, val_pt_df = generate_patient_splits(pt_data, test_frac, val_frac)

    print(f"Test AF: {test_pt_df['noAFRecs'].sum()} Normal: {test_pt_df['noNormalRecs'].sum()} Other: {test_pt_df['noOtherRecs'].sum()}")
    print(f"Train AF: {train_pt_df['noAFRecs'].sum()} Normal: {train_pt_df['noNormalRecs'].sum()} Other: {train_pt_df['noOtherRecs'].sum()}")
    print(f"Val AF: {val_pt_df['noAFRecs'].sum()} Normal: {val_pt_df['noNormalRecs'].sum()} Other: {val_pt_df['noOtherRecs'].sum()}")

    train_dataloader = None
    test_dataloader = None
    val_dataloader = None

    train_dataset = None
    test_dataset = None
    val_dataset = None

    if not train_pt_df.empty:
        # get ECG datasets
        train_dataset = ecg_data[ecg_data["ptID"].isin(train_pt_df["ptID"])]
        # Normalise
        train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())
        torch_dataset_train = Dataset(train_dataset)
        train_dataloader = DataLoader(torch_dataset_train, batch_size=batch_size, shuffle=True, pin_memory=True)

    if not test_pt_df.empty:
        test_dataset = ecg_data[(ecg_data["ptID"].isin(test_pt_df["ptID"]))]
        test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())
        torch_dataset_test = Dataset(test_dataset)
        test_dataloader = DataLoader(torch_dataset_test, batch_size=batch_size, shuffle=True, pin_memory=True)

    if not val_pt_df.empty:
        val_dataset = ecg_data[(ecg_data["ptID"].isin(val_pt_df["ptID"]))]
        val_dataset["data"] = (val_dataset["data"] - val_dataset["data"].map(lambda x: x.mean()))/val_dataset["data"].map(lambda x: x.std())
        torch_dataset_val = Dataset(val_dataset)
        val_dataloader = DataLoader(torch_dataset_val, batch_size=batch_size, shuffle=True, pin_memory=True)

    return train_dataloader, test_dataloader, val_dataloader, train_dataset, test_dataset, val_dataset

In [None]:
train_dataloader_safer, test_dataloader_safer, val_dataloader_safer, train_dataset_safer, test_dataset_safer, val_dataset_safer = make_SAFER_dataloaders(safer_pt_data, safer_ecg_data, test_frac=0.15, val_frac=0.15, batch_size=32)

In [37]:
def get_dataloaders(dataset, batch_size=32):
    torch_dataset = Dataset(dataset)
    dataloader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    return dataloader

In [70]:
# validate on Feas2 and train/test on feas1
val_dataset_safer = safer_ecg_data[safer_ecg_data["feas"] == 2]
val_dataloader_safer = get_dataloaders(val_dataset_safer)
train_dataloader_safer, test_dataloader_safer, _, train_dataset_safer, test_dataset_safer, _ = make_SAFER_dataloaders(safer_pt_data, safer_ecg_data[safer_ecg_data["feas"] == 1], test_frac=0.15, val_frac=0, batch_size=32)

processing 0.0
number of patients 2078
processing 1.0
number of patients 13
processing 2.0
number of patients 7
processing 3.0
number of patients 2
processing 4.0
number of patients 1
processing 5.0
number of patients 3
processing 6.0
number of patients 1
processing 8.0
number of patients 4
processing 9.0
number of patients 1
processing 10.0
number of patients 2
processing 11.0
number of patients 1
processing 17.0
number of patients 1
processing 18.0
number of patients 1
processing 19.0
number of patients 1
processing 22.0
number of patients 1
processing 23.0
number of patients 1
processing 26.0
number of patients 1
processing 28.0
number of patients 1
processing 35.0
number of patients 1
processing 38.0
number of patients 1
processing 45.0
number of patients 1
processing 53.0
number of patients 1
processing 62.0
number of patients 1
processing 80.0
number of patients 1
processing 94.0
number of patients 1
Test AF: 137.0 Normal: 6851.0 Other: 558.0
Train AF: 553.0 Normal: 37470.0 Other

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataset["data"] = (train_dataset["data"] - train_dataset["data"].map(lambda x: x.mean()))/train_dataset["data"].map(lambda x: x.std())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset["data"] = (test_dataset["data"] - test_dataset["data"].map(lambda x: x.mean()))/test_dataset["data"].map(lambda x: x.std())


In [197]:
full_dataloader_safer = get_dataloaders(safer_ecg_data, 128)

In [38]:
full_cinc_dataloader = get_dataloaders(cinc2017_df, 128)
cinc2017_df["noise_probs"] = None

In [71]:
# Filter noisy things out of SAFER
# Now import a model
import Models.NoiseCNN
import importlib

importlib.reload(Models.NoiseCNN)
from Models.NoiseCNN import CNN

noiseDetector = CNN().to(device)
noiseDetector.load_state_dict(torch.load("TrainedModels/CNN_8_Mar.pt", map_location=device))
noiseDetector.eval()

def add_noise_predictions(nd, dataloader, dataset):
    noise_ps = []
    inds = []

    with torch.no_grad():
        for i, (signals, labels, ind) in enumerate(dataloader):
            signal = signals[0].to(device).float()
            noise_prob = nd(torch.unsqueeze(signal, 1)).detach().to("cpu").numpy()

            for i, n in zip(ind, noise_prob):
                inds.append(int(i))
                noise_ps.append(float(n))

    dataset["noise_probs"] = pd.Series(data=noise_ps, index=inds)

In [41]:
add_noise_predictions(noiseDetector, full_cinc_dataloader, cinc2017_df)

In [147]:
add_noise_predictions(noiseDetector, full_dataloader_safer, safer_ecg_data)

In [148]:
(safer_ecg_data["noise_probs"] > 0).value_counts()

False    35855
True     14351
Name: noise_probs, dtype: int64

In [72]:
add_noise_predictions(noiseDetector, val_dataloader_safer, val_dataset_safer)
add_noise_predictions(noiseDetector, test_dataloader_safer, test_dataset_safer)
add_noise_predictions(noiseDetector, train_dataloader_safer, train_dataset_safer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["noise_probs"] = pd.Series(data=noise_ps, index=inds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["noise_probs"] = pd.Series(data=noise_ps, index=inds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["noise_probs"] = pd.Series(data=noise_ps, index=inds)


In [81]:
plt.hist(train_dataset_safer["noise_probs"])
plt.show()

In [73]:
# Remove the noisy samples
train_dataset_safer_clean = train_dataset_safer[train_dataset_safer["noise_probs"] < 0]
test_dataset_safer_clean = test_dataset_safer[test_dataset_safer["noise_probs"] < 0]
val_dataset_safer_clean = val_dataset_safer[val_dataset_safer["noise_probs"] < 0]

print(len(train_dataset_safer_clean.index))
print(len(test_dataset_safer_clean.index))
print(len(val_dataset_safer_clean.index))

train_dataloader_safer_clean = get_dataloaders(train_dataset_safer_clean)
test_dataloader_safer_clean = get_dataloaders(test_dataset_safer_clean)
val_dataloader_safer_clean = get_dataloaders(val_dataset_safer_clean)

16858
3382
16508


In [42]:
plt.hist(cinc2017_df["noise_probs"])
plt.show()

In [None]:
# Load the whole of the test and val sets, but leave the train to be loaded in training
# train_pt_df, test_pt_df, val_pt_df = generate_patient_splits(safer_pt_data, test_frac=0.15, val_frac=0.15)

batch_size = 32
# warning: changing these chunk sizes may reload feas1 data from scratch, which will take ages
chunk_size = 20000
num_chunks = math.ceil(162515 / chunk_size )

test_df = []
val_df = []

def get_feas1_dataset_from_pt(feas1_ecg_data, pt_df):
    dataset = feas1_ecg_data[feas1_ecg_data["ptID"].isin(pt_df.index)]
    dataset["feas"] = 1
    dataset = SAFERDataset.generate_af_class_labels(dataset)
    dataset["data"] = (dataset["data"] - dataset["data"].map(lambda x: x.mean()))/dataset["data"].map(lambda x: x.std())
    dataset = SAFERDataset.reload_r_peaks(dataset, 1, f"dataframe_{chunk_num}_heartrates.pk")
    dataset["rri_feature"] = (dataset["r_peaks"]/300).map(get_rri_feature)
    return dataset

def get_dataloaders(dataset, batch_size=32):
    torch_dataset = Dataset(dataset)
    dataloader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    return dataloader


for chunk_num in range(num_chunks):
    _, feas1_ecg_data = SAFERDataset.load_feas_dataset(1, f"dataframe_{chunk_num}.pk", ecg_range=[chunk_size * chunk_num, chunk_size * (chunk_num + 1)])

    print(len(feas1_ecg_data.index))

    test_df.append(get_feas1_dataset_from_pt(feas1_ecg_data, test_pt_df))
    val_df.append(get_feas1_dataset_from_pt(feas1_ecg_data, val_pt_df))

test_dataset_safer = pd.concat(test_df)
val_dataset_safer = pd.concat(val_df)

test_dataset_safer["length"] = test_dataset_safer["data"].map(lambda x: x.shape[0])
val_dataset_safer["length"] = val_dataset_safer["data"].map(lambda x: x.shape[0])

test_dataset_safer = test_dataset_safer[test_dataset_safer["length"] == 9120]
val_dataset_safer = val_dataset_safer[val_dataset_safer["length"] == 9120]

test_dataloader_safer = get_dataloaders(test_dataset_safer)
val_dataloader_safer = get_dataloaders(val_dataset_safer)

In [241]:
### Make dataloaders for CinC data - separate cpsc as the validation set
from sklearn.model_selection import train_test_split

val_dataset = df[df["dataset"] == "cpsc_2018"]
train_dataset, test_dataset = train_test_split(df[df["dataset"] != "cpsc_2018"], test_size=0.15, stratify=df[df["dataset"] != "cpsc_2018"]["class_index"])
# test_dataset, val_dataset = train_test_split(test_dataset, test_size=0.5, stratify=test_dataset["class_index"])

test_dataset = test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided]  # Should just remove any errors in loading the dataset
val_dataset = val_dataset[val_dataset["measDiag"] != DiagEnum.Undecided]  # Should just remove any errors in loading the dataset

torch_dataset_test = Dataset(test_dataset)
test_dataloader = DataLoader(torch_dataset_test, batch_size=128, shuffle=True, pin_memory=True)

torch_dataset_val = Dataset(val_dataset)
val_dataloader = DataLoader(torch_dataset_val, batch_size=128, shuffle=True, pin_memory=True)

torch_dataset_train = Dataset(train_dataset)
# torch_dataset_train.temp_warp = 0.2
# torch_dataset_train.set_noise_prob(0.1, 0.2, noise_df)
train_dataloader = DataLoader(torch_dataset_train, batch_size=128, shuffle=True, pin_memory=True)

In [242]:
train_dataset["class_index"].value_counts()

2    20322
0    14432
1     2724
Name: class_index, dtype: int64

In [243]:
test_dataset["class_index"].value_counts()

2    3586
0    2547
1     481
Name: class_index, dtype: int64

In [23]:
# Set the proportion of AF samples in the test data to that of the train data

val_df_counts = val_dataset["class_index"].value_counts()
train_df_counts = train_dataset["class_index"].value_counts()

train_not_af = train_df_counts.loc[2] + train_df_counts.loc[0]
val_not_af = val_df_counts.loc[2] + val_df_counts.loc[0]

val_af_wanted = int(round((train_df_counts.loc[1]/train_not_af) * val_not_af))

wanted_af_samples = val_dataset[val_dataset["class_index"] == 1].sample(val_af_wanted)
val_dataset = pd.concat([val_dataset[val_dataset["class_index"] != 1], wanted_af_samples])

torch_dataset_val = Dataset(val_dataset)
val_dataloader = DataLoader(torch_dataset_val, batch_size=32, shuffle=True, pin_memory=True)

In [43]:
### CinC2017 data
from sklearn.model_selection import train_test_split

train_dataset_2017, test_dataset_2017 = train_test_split(cinc2017_df, test_size=0.2, stratify=cinc2017_df["class_index"])
test_dataset_2017 = test_dataset_2017[test_dataset_2017["class"] != DiagEnum.Undecided]  # Should just remove any errors in loading the dataset

torch_dataset_test = Dataset(test_dataset_2017)
test_dataloader_2017 = DataLoader(torch_dataset_test, batch_size=32, shuffle=True, pin_memory=True)

torch_dataset_train = Dataset(train_dataset_2017)
train_dataloader_2017 = DataLoader(torch_dataset_train, batch_size=32, shuffle=True, pin_memory=True)

In [44]:
test_dataset_2017["class_index"].value_counts()

0    739
2    331
1    101
Name: class_index, dtype: int64

### Loading all Feas 1 data

### Prepare for training

In [23]:
import Models.SpectrogramTransformer
importlib.reload(Models.SpectrogramTransformer)
from Models.SpectrogramTransformer import TransformerModel

In [24]:
from torch.optim.lr_scheduler import StepLR, LambdaLR, SequentialLR

In [25]:
n_head = 4
n_fft = 128
embed_dim = 128 # int(n_fft/2)
n_inp_rri = 64

model = TransformerModel(3, embed_dim, n_head, 512, 6, n_fft, n_inp_rri, multiquery=False, device=device).to(device)

In [286]:
class focal_loss(nn.Module):

    def __init__(self, weights, gamma=2, label_smoothing=0):
        super(focal_loss, self).__init__()
        self.ce_loss = nn.CrossEntropyLoss(reduction="none", label_smoothing=label_smoothing)
        self.weights = weights
        self.gamma = gamma

    def forward(self, pred, targets):
        ce = self.ce_loss(pred, targets)
        pt = torch.exp(-ce)

        loss_sum = torch.sum(((1-pt) ** self.gamma) * ce * self.weights[targets])
        norm_factor = torch.sum(self.weights[targets])
        return loss_sum/norm_factor

In [314]:

class_counts = torch.tensor(train_dataset["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
print(class_weights)

loss_func = focal_loss(class_weights, 2, 0.01) # nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1) # focal_loss(class_weights, 2) #
optimizer = torch.optim.Adam(model.parameters(), lr=0.00045)

scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

number_warmup_epochs = 2
def warmup(current_step: int):
    return 1 / (10 ** (float(number_warmup_epochs - current_step)))
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup)

scheduler = SequentialLR(optimizer, [warmup_scheduler, scheduler], [number_warmup_epochs])

tensor([0.1427, 0.7560, 0.1013])


In [None]:
# Test the focal loss vs cross entropy

f_loss = focal_loss(class_weights, gamma=2)
pred = torch.tensor([[0.0, -3.0, 0.0]])
target = torch.tensor([1])

print(f_loss(pred, target))
print(loss_func(pred, target))

In [94]:
# Remake scheduler before retraining on SAFER

class_counts = torch.tensor(train_dataset_safer_clean["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
print(class_weights)

loss_func = nn.CrossEntropyLoss(weight=class_weights) # focal_loss(class_weights, gamma=0.5) # nn.CrossEntropyLoss(weight=class_weights) # focal_loss(class_weights, 0)#  # multiclass_cross_entropy_loss

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

number_warmup_epochs = 2
def warmup(current_step: int):
    return 1 / (10 ** (float(number_warmup_epochs - current_step)))
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup)

scheduler = SequentialLR(optimizer, [warmup_scheduler, scheduler], [number_warmup_epochs])

tensor([0.0226, 0.7974, 0.1800])


In [49]:
# Remake scheduler before retraining on CinC2017

class_counts = torch.tensor(train_dataset_2017["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
print(class_weights)

loss_func = nn.CrossEntropyLoss(weight=class_weights) # multiclass_cross_entropy_loss

optimizer = torch.optim.Adam(model.parameters(), lr=0.00004)
scheduler = StepLR(optimizer, step_size=4, gamma=0.5)

number_warmup_epochs = 1
def warmup(current_step: int):
    return 1 / (10 ** (float(number_warmup_epochs - current_step)))
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup)

scheduler = SequentialLR(optimizer, [warmup_scheduler, scheduler], [number_warmup_epochs])

tensor([0.0946, 0.6941, 0.2113])


In [107]:
# Train the model I stole

import OtherModels.Prna.physionet2020_submission.model
importlib.reload(OtherModels.Prna.physionet2020_submission.model)
from OtherModels.Prna.physionet2020_submission.model import CTN
import OtherModels.Prna.physionet2020_submission.optimizer
importlib.reload(OtherModels.Prna.physionet2020_submission.optimizer)
from OtherModels.Prna.physionet2020_submission.optimizer import NoamOpt

# Train prna's transformer
n_head = 8
n_fft = 128
embed_dim = 128 # int(n_fft/2)
n_inp_rri = 64

class_counts = torch.tensor(train_dataset["class_index"].value_counts().sort_index().values.astype(np.float32))
class_weights = (1/class_counts)
class_weights /= torch.sum(class_weights)
print(class_weights)

model = CTN(256, n_head, 2048, 4, 0.1, 64, 0, 0, 3).to(device)

# Initialize parameters with Glorot / fan_avg.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# optimizer = NoamOpt(256, 1, 4000, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
loss_func = nn.CrossEntropyLoss(weight=class_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

number_warmup_epochs = 2
def warmup(current_step: int):
    return 1 / (10 ** (float(number_warmup_epochs - current_step)))
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup)

scheduler = SequentialLR(optimizer, [warmup_scheduler, scheduler], [number_warmup_epochs])

tensor([0.1427, 0.7559, 0.1014])


In [95]:
from torch.profiler import profile, tensorboard_trace_handler
from tqdm import tqdm

import copy
model = model.to(device)
model.fix_transformer_params(True)
num_epochs = 40

def train(model, train_dataloader, test_dataloader):
    best_test_loss = 100
    best_epoch = -1
    best_model = copy.deepcopy(model).cpu()

    losses = []


    for epoch in range(num_epochs):
        total_loss = 0
        print(f"starting epoch {epoch} ...")
        # Train
        num_batches = 0
        model.train()
        for i, (signals, labels, _) in enumerate(train_dataloader):
            signal = signals[0].to(device).float()
            rris = signals[1].to(device).float()
            # fft = torch.abs(torch.fft.fft(signals))
            # signals = torch.cat([signals, fft], dim=1)

            if torch.any(torch.isnan(signal)):
                print("Signals are nan")
                continue

            if torch.any(torch.isnan(rris)):
                print("Signals are nan")
                continue

            labels = labels.long()
            optimizer.zero_grad()
            output = model(signal, rris).to("cpu")
            loss = loss_func(output, labels)
            if torch.isnan(loss):
                raise ValueError
            loss.backward()
            optimizer.step()
            num_batches += 1
            total_loss += float(loss)

        print(num_batches)

        print(f"Epoch {epoch} finished with average loss {total_loss/num_batches}")
        # writer.add_scalar("Loss/train", total_loss/num_batches, epoch)
        print("Testing ...")
        # Test
        num_test_batches = 0
        test_loss = 0
        with torch.no_grad():
            model.eval()
            for i, (signals, labels, _) in enumerate(test_dataloader):
                signal = signals[0].to(device).float()
                rris = signals[1].to(device).float()
                # fft = torch.abs(torch.fft.fft(signals))
                # signals = torch.cat([signals, fft], dim=1)
                if torch.any(torch.isnan(signal)):
                    print("Signals are nan")
                    continue

                labels = labels.long()
                output = model(signal, rris).to("cpu")
                loss = loss_func(output, labels)
                test_loss += float(loss)
                num_test_batches += 1

        print(f"Average test loss: {test_loss/num_test_batches}")
        losses.append([total_loss/num_batches, test_loss/num_test_batches])
        # writer.add_scalar("Loss/test", test_loss/num_t est_batches, epoch)

        if test_loss/num_test_batches < best_test_loss:
            best_model = copy.deepcopy(model).cpu()
            best_test_loss = test_loss/num_test_batches
            best_epoch = epoch
        else:
            if best_epoch + 5 <= epoch:
                return best_model, losses

        scheduler.step()

    return best_model, losses

model, losses = train(model, train_dataloader_safer_clean, test_dataloader_safer_clean)
model = model.to(device)

starting epoch 0 ...
527
Epoch 0 finished with average loss 0.7679003183031896
Testing ...
Average test loss: 0.6861964869049361
starting epoch 1 ...
527
Epoch 1 finished with average loss 0.5443412433086808
Testing ...
Average test loss: 0.5507056811508143
starting epoch 2 ...




527
Epoch 2 finished with average loss 0.4735345988051941
Testing ...
Average test loss: 0.5669324518936985
starting epoch 3 ...
527
Epoch 3 finished with average loss 0.4238542332340463
Testing ...
Average test loss: 0.5193882791922902
starting epoch 4 ...
527
Epoch 4 finished with average loss 0.3901673754433301
Testing ...
Average test loss: 0.5469455613561396
starting epoch 5 ...
527
Epoch 5 finished with average loss 0.37505323089927833
Testing ...
Average test loss: 0.5793545262993507
starting epoch 6 ...
527
Epoch 6 finished with average loss 0.3514704283899442
Testing ...
Average test loss: 0.579801801943554
starting epoch 7 ...
527
Epoch 7 finished with average loss 0.3109986720574875
Testing ...
Average test loss: 0.5673762518771976
starting epoch 8 ...
527
Epoch 8 finished with average loss 0.2992399547538902
Testing ...
Average test loss: 0.6032288415744057


In [297]:
# Save a model
torch.save(model.state_dict(), "TrainedModels/Transformer_17_Mar_cinc_trained_label_smooth.pt")

# train_dataset_safer.to_pickle("TrainedModels/Transformer_15_Mar_train.pk")
# test_dataset_safer.to_pickle("TrainedModels/Transformer_15_Mar_test.pk")
# val_dataset_safer.to_pickle("TrainedModels/Transformer_15_Mar_val.pk")
# train_pt_df.to_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_train.pk")
# val_pt_df.to_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_val.pk")
# test_pt_df.to_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_test.pk")

In [5]:
train_dataset_safer = pd.read_pickle("TrainedModels/Transformer_13_Mar_train.pk")
test_dataset_safer = pd.read_pickle("TrainedModels/Transformer_13_Mar_test.pk")
val_dataset_safer = pd.read_pickle("TrainedModels/Transformer_13_Mar_val.pk")

In [13]:
train_dataloader_safer = get_dataloaders(train_dataset_safer)
test_dataloader_safer = get_dataloaders(test_dataset_safer)
val_dataloader_safer = get_dataloaders(val_dataset_safer)

In [34]:
# Set this for safer cross validation later
cinc_model_path = "TrainedModels/Transformer_15_Mar_cinc_trained_noise_augmentation.pt"

In [26]:
# Load a model
# model = TransformerModel(2, embed_dim, n_head, 1024, 4, 47, n_fft).to(device)
model.load_state_dict(torch.load("TrainedModels/Transformer_17_Mar_cinc_trained_label_smooth.pt", map_location=device))

# train_pt_df = pd.read_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_train.pk")
# val_pt_df = pd.read_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_val.pk")
# test_pt_df = pd.read_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_all_safer_trained_average_warped_test.pk")

# Should load the test data as well

<All keys matched successfully>

### Model testing

In [100]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, multilabel_confusion_matrix

def get_predictions(model, dataloader, dataset):

    attentions = []

    """
    def hook(module, x, y):
        for a in y[1]:
            attentions.append(a.detach().cpu().numpy())

    attention_hook = model.attention_pooling.attn.register_forward_hook(hook)
    """

    model.eval()

    true_labels = []
    predictions = []

    outputs = []
    inds = []

    with torch.no_grad():
        for i, (signals, labels, ind) in enumerate(dataloader):
            signal = signals[0].to(device).float()
            rris = signals[1].to(device).float()
            # fft = torch.abs(torch.fft.fft(signals))
            # signals = torch.cat([signals, fft], dim=1)
            labels = labels.long().detach().numpy()
            true_labels.append(labels)

            output = model(signal, rris).detach().to("cpu").numpy() # rris).detach().to("cpu").numpy()

            prediction = output # np.argmax(output, axis=-1)
            predictions.append(prediction)

            for i, o in zip(ind, output):
                outputs.append(o)
                inds.append(int(i))

    dataset["prediction"] = pd.Series(data=outputs, index=inds)
    # dataset["attention"] = pd.Series(data=attentions, index=inds)

    predictions = np.concatenate(predictions)
    true_labels = np.concatenate(true_labels)

    # attention_hook.remove()

    return predictions, true_labels

predictions, true_labels = get_predictions(model, val_dataloader_safer, val_dataset_safer)
conf_mat = confusion_matrix(true_labels, np.argmax(predictions, axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


In [103]:
def get_noise_free_conf_mat(dataset):
   return confusion_matrix(dataset[dataset["noise_probs"] < 0]["class_index"], dataset[dataset["noise_probs"] < 0]["prediction"].map(np.argmax))

noise_free_conf_mat = get_noise_free_conf_mat(val_dataset_safer)

In [230]:
abnormal_heartrate_free_conf_mat = confusion_matrix(val_dataset_safer[(val_dataset_safer["heartrate"] < 120) & (val_dataset_safer["noise_probs"] < 0)]["class_index"], val_dataset_safer[(val_dataset_safer["heartrate"] < 120) & (val_dataset_safer["noise_probs"] < 0)]["prediction"].map(np.argmax))

In [101]:
def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

def print_results(conf_mat):
    print("Confusion matrix:")
    print(conf_mat)

    print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
    print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

    print(f"Normal F1: {F1_ind(conf_mat, 0)}")
    print(f"AF F1: {F1_ind(conf_mat, 1)}")
    print(f"Other F1: {F1_ind(conf_mat, 2)}")

print_results(conf_mat)

Confusion matrix:
[[17970   110  1457]
 [    3     2    11]
 [  320    63   375]]
Sensitivity: 0.125
Specificity: 0.9197932128781287
Normal F1: 0.9500396510705789
AF F1: 0.020942408376963352
Other F1: 0.28835063437139563


In [104]:
# Print noise free conf mats
print_results(noise_free_conf_mat)

Confusion matrix:
[[15173    39   700]
 [    3     2     9]
 [  233    55   294]]
Sensitivity: 0.14285714285714285
Specificity: 0.9535570638511816
Normal F1: 0.9688707257111842
AF F1: 0.03636363636363636
Other F1: 0.37097791798107255


In [218]:
print_results(abnormal_heartrate_free_conf_mat)

Confusion matrix:
[[14950    89   865]
 [    3     2     9]
 [  803   192   975]]
Sensitivity: 0.14285714285714285
Specificity: 0.9400150905432596
Normal F1: 0.9444093493367025
AF F1: 0.013468013468013467
Other F1: 0.5106048703849175


In [125]:
val_dataset_safer["class_index"].value_counts()

0    19537
2      758
1       16
Name: class_index, dtype: int64

In [107]:
test_dataset_safer_clean = test_dataset_safer[test_dataset_safer["noise_probs"] < 0]

In [105]:
from scipy import signal

def plot_ecg_spectrogram(x, fs=300, n_split=1):
    sample_len = x.shape[0]
    time_axis = np.arange(sample_len)/fs

    freq_axis, time_axis, stft = signal.stft(x, nperseg=128, noverlap=3*128/4)
    time_axis = time_axis/fs
    freq_axis = freq_axis * fs
    stft = np.log(np.abs(stft))
    cuts = np.round(np.linspace(0, stft.shape[-1]-1, n_split+1)).astype(int)

    fig, ax = plt.subplots(n_split, 1, figsize=(12, 5), squeeze=False)
    for j in range(n_split):
        ax[j][0].imshow(np.flipud(stft[:, cuts[j]:cuts[j+1]]), extent=[time_axis[cuts[j]], time_axis[cuts[j+1]], freq_axis[0], freq_axis[-1]], aspect=0.02)
        ax[j][0].set_xlabel("Time")

In [106]:
def plot_ecg_poincare(rri):
    fig = plt.figure()
    plt.plot(rri[1:], rri[:-1])
    plt.xlabel("RR interval n")
    plt.ylabel("RR interval n-1")
    plt.show()

In [107]:
dataset = val_dataset_safer
dataset["class_prediction"] = dataset["prediction"].map(lambda x: np.argmax(x))
selection = dataset[(dataset["class_prediction"] == 2) & (dataset["class_index"] == 1) & (dataset["noise_probs"] < 0)]

from matplotlib.ticker import AutoMinorLocator

def plot_ecg(x, fs=300, n_split=1, r_peaks=None, attention=None, num_segments=None):
    sample_len = x.shape[0]
    time_axis = np.arange(sample_len)/fs

    cuts = np.round(np.linspace(0, sample_len-1, n_split+1)).astype(int)

    fig, ax = plt.subplots(n_split, 1, figsize=(12, 5), squeeze=False)
    for j in range(n_split):
        ax[j][0].plot(time_axis[cuts[j]:cuts[j+1]], x[cuts[j]:cuts[j+1]])

        if r_peaks is not None:
            ax[j][0].plot(time_axis[r_peaks], x[r_peaks], "x")

        if attention is not None:
            print("Plotting attention")
            attention_step = (sample_len-1)/num_segments
            attention_gain = 0.5/attention.max()
            alpha = attention_gain * attention

            for i in range(num_segments):
                ax[j][0].axvspan(time_axis[math.floor(i*attention_step)],
                                 time_axis[math.floor((i+1)*attention_step)], color='green',
                                 alpha=alpha[i])

        ax[j][0].set_xlabel("Time")
        ax[j][0].set_xlim((time_axis[cuts[j]], time_axis[cuts[j+1]]))

        t_s = time_axis[cuts[j]]
        t_f = time_axis[cuts[j+1]]
        time_ticks = np.arange(t_s - t_s%0.2, t_f + (0.2 - t_f%0.2), 0.2)
        decimal_labels = ~np.isclose(time_ticks, np.round(time_ticks))
        time_labels = np.round(time_ticks).astype(int).astype(str)
        time_labels[decimal_labels] = ""

        ax[j][0].set_xticks(time_ticks, time_labels)


        ax[j][0].xaxis.set_minor_locator(AutoMinorLocator(5))
        ax[j][0].yaxis.set_minor_locator(AutoMinorLocator(5))

        ax[j][0].grid(which='major', linestyle='-', linewidth='0.5', color='black')
        ax[j][0].grid(which='minor', linestyle='-', linewidth='0.5', color='lightgray')

c = DiagEnum.CannotExcludePathology

for _, ecg in selection.sample(frac=1).iterrows():
    print(ecg[["measDiag", "prediction", "class_index"]])
    plot_ecg(ecg["data"], 300, n_split=3, r_peaks=ecg["r_peaks_hamilton"])# , attention=ecg["attention"][0][0][0], num_segments=ecg["attention"].shape[-1])
    plot_ecg_spectrogram(ecg["data"], 300, n_split=3)
    plot_ecg_poincare(ecg["rri_feature"])
    plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["class_prediction"] = dataset["prediction"].map(lambda x: np.argmax(x))


measDiag                                DiagEnum.AF
prediction     [-1.2193077, -0.23880465, 1.9670563]
class_index                                       1
Name: 214763, dtype: object
measDiag                              DiagEnum.AF
prediction     [1.2572961, -1.7592877, 2.0414455]
class_index                                     1
Name: 215725, dtype: object
measDiag                              DiagEnum.AF
prediction     [-1.159541, 0.32918468, 1.3688726]
class_index                                     1
Name: 206928, dtype: object
measDiag                                 DiagEnum.AF
prediction     [0.20005289, -0.107040234, 0.7029492]
class_index                                        1
Name: 219862, dtype: object
measDiag                                DiagEnum.AF
prediction     [-0.7543493, -0.16705689, 1.0699196]
class_index                                       1
Name: 211215, dtype: object
measDiag                               DiagEnum.AF
prediction     [-0.6771606, 0.48215356

In [56]:
selection["chal_diag_num"].map(lambda x: [mapper.mapToDesc(a) for a in x])

41                                  [1st degree av block]
48      [premature atrial contraction, supraventricula...
80                             [left bundle branch block]
82                            [right bundle branch block]
117                                 [1st degree av block]
                              ...                        
6756                                [1st degree av block]
6760                                [1st degree av block]
6811                          [right bundle branch block]
6832                          [right bundle branch block]
6855    [left bundle branch block, premature atrial co...
Name: chal_diag_num, Length: 337, dtype: object

In [57]:
selection["measDiag"].value_counts()

DiagEnum.HeartBlock                197
DiagEnum.CannotExcludePathology    140
Name: measDiag, dtype: int64

In [58]:
val_dataset["measDiag"].value_counts()

DiagEnum.HeartBlock                1491
DiagEnum.AF                         904
DiagEnum.CannotExcludePathology     882
DiagEnum.NoAF                       661
Name: measDiag, dtype: int64

In [178]:
ecg["attention"][0][0][0]

array([0.00351115, 0.0042624 , 0.00406908, 0.00400482, 0.0029262 ,
       0.00268505, 0.0044071 , 0.00310645, 0.00343307, 0.00364956,
       0.00370946, 0.00388232, 0.00260756, 0.00358304, 0.00438776,
       0.00428047, 0.0037303 , 0.00366007, 0.00439963, 0.00436396,
       0.00433379, 0.00336815, 0.00448245, 0.00451077, 0.00345731,
       0.00343545, 0.00408583, 0.00296606, 0.00390116, 0.00313194,
       0.00361513, 0.00349916, 0.0032706 , 0.00350706, 0.00353945,
       0.00419786, 0.00418801, 0.00349962, 0.00367059, 0.00421098,
       0.00274811, 0.00376888, 0.00347172, 0.00342216, 0.00446853,
       0.00362137, 0.00359131, 0.00260848, 0.00288552, 0.00412255,
       0.00440143, 0.00392521, 0.00424863, 0.00366803, 0.00299748,
       0.00375879, 0.00283509, 0.00329737, 0.0037809 , 0.003316  ,
       0.00314023, 0.00334355, 0.00356446, 0.00341068, 0.00346567,
       0.00252791, 0.00350228, 0.00352529, 0.00304219, 0.00385171,
       0.00352847, 0.00349656, 0.00411592, 0.00440683, 0.00402

### Try employing the noise detector

### SAFER cross validation

In [42]:

"""
import copy
model = TransformerModel(3, embed_dim, n_head, 512, 4, n_fft, n_inp_rri, multiquery=False, device=device).to(device)
model = model.to(device)
model.fix_transformer_params(False)
num_epochs = 40

model, losses = train(model, train_dataloader, test_dataloader)
model = model.to(device)

predictions, true_labels = get_predictions(model, test_dataloader, test_dataset)
conf_mat = confusion_matrix(true_labels, np.argmax(predictions, axis=1))

print("Finished training on CinC")
print(conf_mat)

print("\n========================\n")

# Save a model
torch.save(model.state_dict(), "TrainedModels/Transformer_spectrogram_small_fft_cut_rri_average.pt")
cinc_model_path = "TrainedModels/Transformer_spectrogram_small_fft_cut_rri_average.pt"
train_dataset.to_pickle("TrainedModels/Transformer_spectrogram_small_fft_cut_rri_cinc_trained_train_set.pk")

"""
# Cross Validation dataset construction for SAFER data
# Split train and test data according to each patient
# cinc_model_path = "TrainedModels/Transformer_spectrogram_small_fft_cut_rri_average.pt"

num_folds = 5
test_pt_folds = [[] for _ in range(num_folds)]

sorted_pts = safer_pt_data.sort_values("noAFRecs", axis=0)
group_num = 0

# Go around the folds and assign patients to each
for _, pt in sorted_pts.iterrows():
    test_pt_folds[group_num].append(pt)
    group_num = (group_num + 1) % num_folds

test_pt_folds = [pd.DataFrame(fold) for fold in test_pt_folds]
train_pt_folds = [safer_pt_data[~safer_pt_data["ptID"].isin(fold["ptID"])] for fold in test_pt_folds]

conf_mats = []

for i, (train_pt_df, test_pt_df) in enumerate(zip(train_pt_folds, test_pt_folds)):
    print(f"Fold {i}")
    train_df = safer_ecg_data[safer_ecg_data["ptID"].isin(train_pt_df["ptID"])]
    test_df = safer_ecg_data[(safer_ecg_data["ptID"].isin(test_pt_df["ptID"]))]
    print(test_df["class_index"].value_counts())

    torch_dataset_train = Dataset(train_df)
    torch_dataset_test = Dataset(test_df)

    train_dataloader = DataLoader(torch_dataset_train, batch_size=32, shuffle=True, pin_memory=True)
    test_dataloader = DataLoader(torch_dataset_test, batch_size=32, shuffle=True, pin_memory=True)

    model = TransformerModel(3, embed_dim, n_head, 512, 6, n_fft, n_inp_rri, multiquery=False, device=device).to(device)
    # Load pretrained model on CinC data
    model.load_state_dict(torch.load(cinc_model_path, map_location=device))
    model.fix_transformer_params(True)  # Only train the final classification head

    num_epochs = 20
    # Remake scheduler before retraining on SAFER
    class_counts = torch.tensor(train_df["class_index"].value_counts().sort_index().values.astype(np.float32))
    class_weights = (1/class_counts)
    class_weights /= torch.sum(class_weights)
    print(class_weights)

    loss_func = nn.CrossEntropyLoss(weight=class_weights) # multiclass_cross_entropy_loss

    optimizer = torch.optim.Adam(model.parameters(), lr=0.00006)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

    number_warmup_epochs = 1
    def warmup(current_step: int):
        return 1 / (10 ** (float(number_warmup_epochs - current_step)))
    warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup)

    scheduler = SequentialLR(optimizer, [warmup_scheduler, scheduler], [number_warmup_epochs])

    num_batches = len(train_dataloader)
    num_test_batches = len(test_dataloader)

    model, losses = train(model, train_dataloader, test_dataloader)
    model = model.to(device)

    test_df["prediction"] = None
    predictions, true_labels = get_predictions(model, test_dataloader, test_df)
    conf_mat = confusion_matrix(true_labels, np.argmax(predictions, axis=1))

    print(conf_mat)

    conf_mats.append(conf_mat)

Fold 0
0    7773
2    2906
1      93
Name: class_index, dtype: int64
tensor([0.0210, 0.9275, 0.0514])
starting epoch 0 ...
1310
Epoch 0 finished with average loss 1.0005992817514726
Testing ...
Average test loss: 0.9733702008908156
starting epoch 1 ...




1310
Epoch 1 finished with average loss 0.7874476455550158
Testing ...
Average test loss: 0.6695074533673352
starting epoch 2 ...
1310
Epoch 2 finished with average loss 0.7022962516273251
Testing ...
Average test loss: 0.6448957873204342
starting epoch 3 ...
1310
Epoch 3 finished with average loss 0.6781395497326632
Testing ...
Average test loss: 0.625849836304209
starting epoch 4 ...
1310
Epoch 4 finished with average loss 0.66206099320459
Testing ...
Average test loss: 0.6144771417630532
starting epoch 5 ...
1310
Epoch 5 finished with average loss 0.6488265996215907
Testing ...
Average test loss: 0.6085624280510034
starting epoch 6 ...
1310
Epoch 6 finished with average loss 0.644036285010458
Testing ...
Average test loss: 0.6078586491703634
starting epoch 7 ...
1310
Epoch 7 finished with average loss 0.6341695040350652
Testing ...
Average test loss: 0.6131853669528083
starting epoch 8 ...
1310
Epoch 8 finished with average loss 0.6314672520028726
Testing ...
Average test loss: 0.59

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


[[6530  103 1140]
 [   1   60   32]
 [ 721  292 1893]]
Fold 1
0    7499
2    3046
1     251
Name: class_index, dtype: int64
tensor([0.0162, 0.9435, 0.0403])
starting epoch 0 ...
1310
Epoch 0 finished with average loss 1.0254583619250597
Testing ...
Average test loss: 0.8795041779618291
starting epoch 1 ...




1310
Epoch 1 finished with average loss 0.7879701555571483
Testing ...
Average test loss: 0.714662831254612
starting epoch 2 ...
1310
Epoch 2 finished with average loss 0.6845855759186599
Testing ...
Average test loss: 0.7515870203545107
starting epoch 3 ...
1310
Epoch 3 finished with average loss 0.65780495010487
Testing ...
Average test loss: 0.8080123509087506
starting epoch 4 ...
1310
Epoch 4 finished with average loss 0.6386490387202219
Testing ...
Average test loss: 0.8168073794721852
starting epoch 5 ...
1310
Epoch 5 finished with average loss 0.6230013792523901
Testing ...
Average test loss: 0.8713171713098267
starting epoch 6 ...
1310
Epoch 6 finished with average loss 0.6199904277465725
Testing ...
Average test loss: 0.8685038698142802


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


[[6137  113 1249]
 [   6  102  143]
 [ 737  293 2016]]
Fold 2
0    7654
2    3065
1     241
Name: class_index, dtype: int64
tensor([0.0165, 0.9423, 0.0412])
starting epoch 0 ...
1305
Epoch 0 finished with average loss 0.9844564447914503
Testing ...
Average test loss: 1.0678713161813274
starting epoch 1 ...




1305
Epoch 1 finished with average loss 0.758573219865218
Testing ...
Average test loss: 0.8622738026929666
starting epoch 2 ...
1305
Epoch 2 finished with average loss 0.6871162034770995
Testing ...
Average test loss: 0.8078404231425972
starting epoch 3 ...
1305
Epoch 3 finished with average loss 0.6604604596150789
Testing ...
Average test loss: 0.7779394469872856
starting epoch 4 ...
1305
Epoch 4 finished with average loss 0.6460717118562866
Testing ...
Average test loss: 0.799132599036478
starting epoch 5 ...
1305
Epoch 5 finished with average loss 0.6397852709581112
Testing ...
Average test loss: 0.7811684252656236
starting epoch 6 ...
1305
Epoch 6 finished with average loss 0.6332267932850739
Testing ...
Average test loss: 0.8026521360126946
starting epoch 7 ...
1305
Epoch 7 finished with average loss 0.6206510029304987
Testing ...
Average test loss: 0.7796072541972291
starting epoch 8 ...
1305
Epoch 8 finished with average loss 0.6186455018789833
Testing ...
Average test loss: 0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


[[6044   95 1515]
 [   7  111  123]
 [ 670  377 2018]]
Fold 3
0    6944
2    3023
1     142
Name: class_index, dtype: int64
tensor([0.0190, 0.9326, 0.0484])
starting epoch 0 ...
1331
Epoch 0 finished with average loss 0.9952851227581904
Testing ...
Average test loss: 0.9776310884877096
starting epoch 1 ...




1331
Epoch 1 finished with average loss 0.7767781089518323
Testing ...
Average test loss: 0.747394289302675
starting epoch 2 ...
1331
Epoch 2 finished with average loss 0.6919177849542638
Testing ...
Average test loss: 0.7237302591151829
starting epoch 3 ...
1331
Epoch 3 finished with average loss 0.6597601039059443
Testing ...
Average test loss: 0.7114893982210492
starting epoch 4 ...
1331
Epoch 4 finished with average loss 0.6536946935840875
Testing ...
Average test loss: 0.6815207509111755
starting epoch 5 ...
1331
Epoch 5 finished with average loss 0.6387995858106462
Testing ...
Average test loss: 0.6847022759103323
starting epoch 6 ...
1331
Epoch 6 finished with average loss 0.6358838240111528
Testing ...
Average test loss: 0.6728915966858592
starting epoch 7 ...
1331
Epoch 7 finished with average loss 0.6293886829640613
Testing ...
Average test loss: 0.6806842289582083
starting epoch 8 ...
1331
Epoch 8 finished with average loss 0.6206947488725678
Testing ...
Average test loss: 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


[[5514  123 1307]
 [   5   97   40]
 [ 712  390 1921]]
Fold 4
0    7214
2    2900
1      34
Name: class_index, dtype: int64
tensor([0.0223, 0.9220, 0.0556])
starting epoch 0 ...
1330
Epoch 0 finished with average loss 0.9981341232930807
Testing ...
Average test loss: 0.9266512298359061
starting epoch 1 ...




1330
Epoch 1 finished with average loss 0.7826685312547181
Testing ...
Average test loss: 0.6809049009714486
starting epoch 2 ...
1330
Epoch 2 finished with average loss 0.6945825639860075
Testing ...
Average test loss: 0.6537907642763365
starting epoch 3 ...
1330
Epoch 3 finished with average loss 0.6736083517845411
Testing ...
Average test loss: 0.6188608310038939
starting epoch 4 ...
1330
Epoch 4 finished with average loss 0.657848092319822
Testing ...
Average test loss: 0.6322421730970437
starting epoch 5 ...
1330
Epoch 5 finished with average loss 0.6477175208522861
Testing ...
Average test loss: 0.6077798468605528
starting epoch 6 ...
1330
Epoch 6 finished with average loss 0.642714230100015
Testing ...
Average test loss: 0.6079698264505129
starting epoch 7 ...
1330
Epoch 7 finished with average loss 0.6381319710634705
Testing ...
Average test loss: 0.6124712046962114
starting epoch 8 ...
1330
Epoch 8 finished with average loss 0.638067139304222
Testing ...
Average test loss: 0.6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = None


[[6065   56 1093]
 [   0   22   12]
 [ 740  238 1922]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["prediction"] = pd.Series(data=outputs, index=inds)


In [43]:
f1_scores_normal = [F1_ind(c, 0) for c in conf_mats]
f1_scores_af = [F1_ind(c, 1) for c in conf_mats]
# f1_scores_other = [F1_ind(c, 2) for c in conf_mats]

print(f"Mean F1 normal: {np.mean(f1_scores_normal)}")
print(f"Mean F1 af: {np.mean(f1_scores_af)}")
# print(f"Mean F1 other: {np.mean(f1_scores_other)}")
print(f"Individual F1 scores (af): {f1_scores_af}")

Mean F1 normal: 0.8532044833876433
Mean F1 af: 0.22817265811875792
Individual F1 scores (af): [0.21897810218978103, 0.26877470355731226, 0.26941747572815533, 0.2579787234042553, 0.12571428571428572]


In [44]:
[c for c in conf_mats]

[array([[6530,  103, 1140],
        [   1,   60,   32],
        [ 721,  292, 1893]], dtype=int64),
 array([[6137,  113, 1249],
        [   6,  102,  143],
        [ 737,  293, 2016]], dtype=int64),
 array([[6044,   95, 1515],
        [   7,  111,  123],
        [ 670,  377, 2018]], dtype=int64),
 array([[5514,  123, 1307],
        [   5,   97,   40],
        [ 712,  390, 1921]], dtype=int64),
 array([[6065,   56, 1093],
        [   0,   22,   12],
        [ 740,  238, 1922]], dtype=int64)]

In [59]:
best_train_df = safer_ecg_data[safer_ecg_data["ptID"].isin(train_pt_folds[4]["ptID"])]
best_test_df = safer_ecg_data[(safer_ecg_data["ptID"].isin(test_pt_folds[4]["ptID"]))]

In [60]:
torch_dataset_train = Dataset(best_train_df)
torch_dataset_test = Dataset(best_test_df)

train_dataloader = DataLoader(torch_dataset_train, batch_size=128, shuffle=True, pin_memory=True)
test_dataloader = DataLoader(torch_dataset_test, batch_size=128, shuffle=True, pin_memory=True)

False    7180
True     2968
Name: noise_probs, dtype: int64

In [163]:
for i, (train_pt_df, test_pt_df) in enumerate(zip(train_pt_folds, test_pt_folds)):
    best_train_df = safer_ecg_data[safer_ecg_data["ptID"].isin(train_pt_df["ptID"])]
    best_test_df = safer_ecg_data[(safer_ecg_data["ptID"].isin(test_pt_df["ptID"])) & (safer_ecg_data["measDiag"] != DiagEnum.Undecided)]

    print(f"Fold {i}")
    print(best_test_df["tag_orig_Poor_Quality"].value_counts())
    print(best_train_df["tag_orig_Poor_Quality"].value_counts())

Fold 0
0    839
1     21
Name: tag_orig_Poor_Quality, dtype: int64
0    3466
1     166
Name: tag_orig_Poor_Quality, dtype: int64
Fold 1
0    906
1     50
Name: tag_orig_Poor_Quality, dtype: int64
0    3399
1     137
Name: tag_orig_Poor_Quality, dtype: int64
Fold 2
0    878
1     24
Name: tag_orig_Poor_Quality, dtype: int64
0    3427
1     163
Name: tag_orig_Poor_Quality, dtype: int64
Fold 3
0    971
1     65
Name: tag_orig_Poor_Quality, dtype: int64
0    3334
1     122
Name: tag_orig_Poor_Quality, dtype: int64
Fold 4
0    985
1     46
Name: tag_orig_Poor_Quality, dtype: int64
0    3320
1     141
Name: tag_orig_Poor_Quality, dtype: int64


In [None]:
torch_dataset_train = Dataset(train_df)
torch_dataset_test = Dataset(test_df)

train_dataloader = DataLoader(torch_dataset_train, batch_size=128, shuffle=True, pin_memory=True)
test_dataloader = DataLoader(torch_dataset_test, batch_size=128, shuffle=True, pin_memory=True)

### Inspect the attention mechanism

In [172]:
# model.transformer_encoder.layers.
from plotly.subplots import make_subplots
fig = make_subplots(rows=2, cols=1)

def hook(module, x, y):
    fig = make_subplots(rows=2, cols=1)
    fig.add_trace(go.Heatmap(z=x[0][:, 0, :].cpu().numpy()), row=1, col=1)
    fig.add_trace(go.Heatmap(z=y[0][:, 0, :].cpu().numpy()), row=2, col=1)
    fig.show()

attention_hook = model.transformer_encoder.layers[0].self_attn.register_forward_hook(hook)

with torch.no_grad():
    for i, (signals, labels, ind) in enumerate(test_dataloader):
        print(signals.shape)
        fig = go.Figure()
        fig.add_trace(go.Scatter(y=signals[0]))
        fig.show()
        signals = torch.transpose(signals.to(device), 0, 1).float()
        # fft = torch.abs(torch.fft.fft(signals))
        # signals = torch.cat([signals, fft], dim=1)
        labels = labels.long().detach().numpy()

        output = model(signals).detach().to("cpu").numpy()
        break

attention_hook.remove()

torch.Size([32, 3000])


In [167]:
attention_hook.remove()

### Inspect the attention pooling weights

In [170]:
# model.transformer_encoder.layers.
from plotly.subplots import make_subplots
fig = make_subplots(rows=2, cols=1)

attentions = None

def hook(module, x, y):
    global attentions
    print("hook")
    attentions = y[1].detach().to("cpu").numpy()

attention_hook = model.attention_pooling.attn.register_forward_hook(hook)

with torch.no_grad():
    for i, (signals, labels, ind) in enumerate(test_dataloader_safer):
        print(signals.shape)
        signals = torch.transpose(signals.to(device), 0, 1).float()
        # fft = torch.abs(torch.fft.fft(signals))
        # signals = torch.cat([signals, fft], dim=1)
        labels = labels.long().detach().numpy()
        output = model(signals).detach().to("cpu").numpy()

        if labels[0] == 0:
            print(attentions.shape)
            fig = make_subplots(2, 1)
            fig.add_trace(go.Scatter(y=signals[:, 0].detach().to("cpu").numpy()), row=1, col=1)
            for j in range(attentions.shape[-2]):
                fig.add_trace(go.Scatter(y=attentions[0, j, :]), row=2, col=1)
            fig.show()

        if i == 10:
            break

attention_hook.remove()

torch.Size([128, 9120])
hook


AttributeError: 'NoneType' object has no attribute 'append'