In [1]:
import enum
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib
matplotlib.use('TkAgg')

from scipy import signal
from DataHandlers.DiagEnum import DiagEnum
import DataHandlers.SAFERDataset as SAFERDataset
import DataHandlers.CinC2020Dataset as CinC2020Dataset
import DataHandlers.CinCDataset as CinCDataset
from ecgdetectors import Detectors

In [5]:
df = CinC2020Dataset.load_dataset(save_name="dataframe")

In [None]:
import shutil

search_dir = r"C:\Users\daniel\Documents\CambridgeSoftwareProjects\ecg-signal-quality\st-petersburg-incart-12-lead-arrhythmia-database-1.0.0\files"
save_dir = r"C:\Users\daniel\Documents\CambridgeSoftwareProjects\ecg-signal-quality\CinC2020Data\training\st_petersburg_incart\g1"

# Copy every ".atr" annotation file found under search_dir into save_dir,
# rewriting the two-digit index just before the extension as a zero-padded
# four-digit index (e.g. "X01.atr" -> "X0001.atr").
# shutil.copy is used instead of shelling out to `copy`: it works on any OS,
# copes with spaces in paths and raises on failure instead of failing silently.
os.makedirs(save_dir, exist_ok=True)

for root, dirs, files in os.walk(search_dir):
    for file in files:
        if file.endswith(".atr"):
            g = os.path.join(root, file)
            file_ind = int(file[-6:-4])
            save_path = os.path.join(save_dir, file[:-6] + f"{file_ind:04}" + file[-4:])
            shutil.copy(g, save_path)

In [32]:
# Root folder of the CinC 2020 data; the processed dataframe is pickled below it.
dataset_path = r"C:\Users\daniel\Documents\CambridgeSoftwareProjects\ecg-signal-quality\CinC2020Data"

# Apply the project's diagnosis mapping to df (return value is not used here;
# presumably it fills the "measDiag" column in place -- TODO confirm).
CinC2020Dataset.map_cinc_diagnoses(df)
# Derive "class_index" by mapping each measDiag value through
# CinC2020Dataset.diagnosis_to_class_list.
df["class_index"] = df["measDiag"].map(CinC2020Dataset.diagnosis_to_class_list)
# NOTE(review): the file name ends in ".pk.pk" and the f-string has no
# placeholders -- this looks like a typo for "filtered_dataframe.pk". Confirm
# what the loader expects before renaming, since existing files use this name.
df.to_pickle(os.path.join(dataset_path, f"training/filtered_dataframe.pk.pk"))

In [9]:
def select_length(frame=None, min_length=3000, max_length=5000):
    """Keep recordings of an acceptable length and crop them to a common size.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with a "length" column and a "data" column of sliceable signals
        exposing ``.shape``. When omitted, the notebook-level ``df`` is used,
        which preserves the original zero-argument behaviour.
    min_length : int, default 3000
        Shortest accepted recording (inclusive); kept recordings are cropped
        to their first ``min_length`` samples.
    max_length : int, default 5000
        Longest accepted recording (inclusive).

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with "data" cropped and "length"
        recomputed from the cropped data. The input frame is not modified.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level dataframe

    in_range = (frame["length"] <= max_length) & (frame["length"] >= min_length)
    df_within_range = frame[in_range].copy()
    df_within_range["data"] = df_within_range["data"].map(lambda x: x[:min_length])
    df_within_range["length"] = df_within_range["data"].map(lambda x: x.shape[0])
    return df_within_range

df = select_length()
df["length"].value_counts()

3000    48030
Name: length, dtype: int64

In [7]:
df["measDiag"].value_counts()

DiagEnum.CannotExcludePathology    19214
DiagEnum.NoAF                      15712
DiagEnum.AF                         9907
DiagEnum.HeartBlock                 7970
DiagEnum.Undecided                   116
Name: measDiag, dtype: int64

In [7]:
# dataset = CinCDataset.load_cinc_dataset()
pt_dataset, dataset = SAFERDataset.load_feas_dataset(feas=2, save_name="dataframe", force_reload=False, process=True, force_reprocess=False) # , ecg_meas_diag=[e for e in DiagEnum if e != DiagEnum.Undecided])
# pt_dataset, dataset = SAFERDataset.load_feas_dataset(2, save_name="dataframe_laptop")

In [5]:
# Load the cached ECG dataframe and the patient table directly from disk.
# The patient table is bound to `pt_dataset` (the name used by the
# SAFERDataset loader cell); it was previously misspelt `pt_datset`.
dataset = pd.read_pickle(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas2\ECGs\filtered_dataframe.pk")
pt_dataset = pd.read_csv(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas2\pt_data_anon.csv")

In [8]:
print("Number of recordings by length")
print(dataset["length"].value_counts())
# dataset = dataset[dataset["length"] == 9120]
print(f"total number of recordings: {len(dataset.index)}")
dataset["class"] = dataset["measDiag"]

Number of recordings by length


NameError: name 'dataset' is not defined

In [9]:
dataset["class"].value_counts()

DiagEnum.Undecided                 22018
DiagEnum.NoAF                        758
DiagEnum.PoorQuality                 465
DiagEnum.AF                           16
DiagEnum.CannotExcludePathology        2
Name: class, dtype: int64

In [14]:
# "Clean" subset: heart rate strictly between 50 and 120, median R-peak height
# above 2, and a measured diagnosis of either AF or NoAF.
dataset_clean = dataset[
    (dataset["heartrate"] > 50)
    & (dataset["heartrate"] < 120)
    & (dataset["r_peak_height"].map(np.median) > 2)
    & ((dataset["measDiag"] == DiagEnum.NoAF) | (dataset["measDiag"] == DiagEnum.AF))
]
dataset_clean["class"].value_counts()

DiagEnum.NoAF    615
DiagEnum.AF       15
Name: class, dtype: int64

In [15]:
dataset_clean.to_pickle(r"C:\Users\daniel\Documents\2022_23_DSiromani\Feas2\ECGs\clean_ecg_dataset.pk")

In [19]:
def compareEnums(series, e):
    """Return a boolean Series marking entries whose ``.value`` equals ``e.value``.

    The comparison is made on the underlying values rather than on the
    members themselves, so every element of ``series`` (and ``e``) must
    expose a ``.value`` attribute.
    """
    target_value = e.value
    member_values = series.map(lambda member: member.value)
    return member_values == target_value

In [19]:
df[(df["diag_num"].str[0] == 0) & (df["measDiag"] == DiagEnum.CannotExcludePathology)]

Series([], Name: filepath, dtype: int64)

## Plotting example time series for each class

In [8]:
# Plot with proper ECG grid in matplotlib

from matplotlib.ticker import AutoMinorLocator

def plot_ecg(x, fs=500, r_peaks=None):
    """Plot a signal as three stacked strips on an ECG-paper style grid.

    The signal is cut into three consecutive thirds, one per subplot. Major
    x ticks are placed every 0.2 time units (only whole-number ticks are
    labelled) and major y ticks every 2 amplitude units; both axes get five
    minor divisions per major interval. The figure is created but neither
    shown nor returned; call ``plt.show()`` afterwards.

    Parameters
    ----------
    x : array-like with ``.shape``, ``.min()`` and ``.max()``
        The samples to plot (indexed along the first axis).
    fs : number, default 500
        Sampling frequency used to convert sample indices to time.
    r_peaks : sequence of sample indices, optional
        Positions to mark with crosses. Any sequence convertible to an
        integer array is accepted, including an empty one.
    """
    sample_len = x.shape[0]
    time_axis = np.arange(sample_len)/fs

    y_step = 2
    # Loop-invariant y limits, shared by all three strips.
    y_low = x.min() - y_step
    y_high = x.max() + y_step

    # The final cut is sample_len-1 so that it is a valid index into time_axis.
    cuts = [0, sample_len//3, (sample_len*2)//3, sample_len-1]

    if r_peaks is not None:
        # np.array([]) is float64, which cannot be used as an index; force an
        # integer dtype so recordings without detected peaks still plot.
        r_peaks = np.asarray(r_peaks, dtype=int)

    fig, ax = plt.subplots(3, 1, figsize=(8, 6))
    for j in range(3):
        t_s = time_axis[cuts[j]]
        t_f = time_axis[cuts[j+1]]

        ax[j].plot(time_axis[cuts[j]:cuts[j+1]], x[cuts[j]:cuts[j+1]])
        if r_peaks is not None:
            # All peaks are drawn on every strip; the x limits set below hide
            # the ones that fall outside this strip.
            ax[j].plot(time_axis[r_peaks], x[r_peaks], "x")
        ax[j].set_xlabel("Time")

        # Ticks every 0.2, starting at or before t_s and ending after t_f;
        # ticks that are not (close to) whole numbers get an empty label.
        time_ticks = np.arange(t_s - t_s%0.2, t_f + (0.2 - t_f%0.2), 0.2)
        decimal_labels = ~np.isclose(time_ticks, np.round(time_ticks))
        time_labels = np.round(time_ticks).astype(int).astype(str)
        time_labels[decimal_labels] = ""

        ax[j].set_xticks(time_ticks, labels=time_labels)
        ax[j].set_yticks(np.arange(y_low, y_high, y_step))

        ax[j].xaxis.set_minor_locator(AutoMinorLocator(5))
        ax[j].yaxis.set_minor_locator(AutoMinorLocator(5))

        # Limits are set after the ticks so the tick calls cannot widen the view.
        ax[j].set_ylim((y_low, y_high))
        ax[j].set_xlim((t_s, t_f))

        ax[j].grid(which='major', linestyle='-', linewidth='0.2', color='black')
        ax[j].grid(which='minor', linestyle='-', linewidth='0.2', color='lightgray')

    fig.tight_layout()

# Challenge class index whose recordings should be browsed.
c = 0

# Walk through every recording whose "chal_diag_num" contains c, in random
# order, printing its labels and showing its ECG with the R peaks marked.
for _, ecg in (
    df[df["chal_diag_num"].map(lambda class_ids: c in class_ids)]
    .sample(frac=1)
    .iterrows()
):
    print(ecg[["measDiag", "diag_num"]])
    plot_ecg(ecg["data"], 300, ecg["r_peaks"])
    plt.show()

measDiag    DiagEnum.NoAF
diag_num      [426783006]
Name: 29249, dtype: object
measDiag    DiagEnum.CannotExcludePathology
diag_num             [164934002, 426783006]
Name: 26851, dtype: object
measDiag    DiagEnum.NoAF
diag_num      [426783006]
Name: 45560, dtype: object
measDiag    DiagEnum.NoAF
diag_num      [426783006]
Name: 38341, dtype: object
measDiag    DiagEnum.NoAF
diag_num      [426783006]
Name: 44312, dtype: object
measDiag    DiagEnum.NoAF
diag_num      [426783006]
Name: 42079, dtype: object
measDiag                 DiagEnum.CannotExcludePathology
diag_num    [164865005, 251120003, 270492004, 426783006]
Name: 29983, dtype: object
measDiag                DiagEnum.CannotExcludePathology
diag_num    [284470004, 426783006, 429622005, 55930002]
Name: 22050, dtype: object
measDiag                      DiagEnum.CannotExcludePathology
diag_num    [164865005, 39732003, 426783006, 446358003, 54...
Name: 40312, dtype: object
measDiag    DiagEnum.NoAF
diag_num      [426783006]
Name: 5

KeyboardInterrupt: 

In [44]:
# Check the average adc gain
# One normalised histogram of "adc_gain" per measured diagnosis.
# The number of subplots follows the number of groups actually present, and
# the loop variable is called `group` so it does not overwrite the
# notebook-level `df` dataframe.
diag_groups = list(dataset.groupby("measDiag", sort=False))
fig, ax = plt.subplots(len(diag_groups), squeeze=False)

for j, (i, group) in enumerate(diag_groups):
    ax[j, 0].hist(group["adc_gain"], bins=np.arange(0, 2, 0.2), density=True)
    ax[j, 0].set_title(i.name)

fig.tight_layout()
plt.show()

In [8]:
# Check the heart rate
# R-peak detector bank constructed with 300 (the recordings' sampling rate in Hz).
detectors = Detectors(300)

# Single-recording sanity check, kept for reference:
# r_peaks = detectors.pan_tompkins_detector(dataset["data"].loc[0])
# print(r_peaks)
#
# plt.plot(dataset["data"].loc[0])
# plt.plot(r_peaks, dataset["data"].loc[0][r_peaks], "rx")
# plt.show()

# Run the Pan-Tompkins detector on every recording and store the detected
# peak positions as a numpy array per row.
dataset["r_peaks"] = dataset["data"].map(
    lambda recording: np.array(detectors.pan_tompkins_detector(recording))
)

In [10]:
def get_heartrate(r_peaks, sig_len=30.4):
    """Return the mean heart rate in beats per minute.

    The rate is the number of detected peaks divided by the signal duration
    ``sig_len`` (in seconds), scaled to a per-minute figure.
    """
    beats_per_second = len(r_peaks) / sig_len
    return beats_per_second * 60

dataset["heartrate"] = dataset["r_peaks"].map(get_heartrate)

In [11]:
dataset["heartrate"].max()

177.63157894736844

In [18]:
# One normalised histogram of "heartrate" per measured diagnosis.
# The number of subplots follows the number of groups actually present, and
# the loop variable is called `group` so it does not overwrite the
# notebook-level `df` dataframe.
diag_groups = list(dataset.groupby("measDiag", sort=False))
fig, ax = plt.subplots(len(diag_groups), squeeze=False)

for j, (i, group) in enumerate(diag_groups):
    ax[j, 0].hist(group["heartrate"], bins=np.arange(0, 210, 10), density=True)
    ax[j, 0].set_title(i.name)

fig.tight_layout()
plt.show()

In [12]:
import scipy.ndimage
# Check amplitude of R peaks
window = np.ones(3)/3

def get_rpeaks_weighted_avg(search_range, w, peaks, ecg):
    """Return, for each peak, the largest smoothed sample value around it.

    For every entry of ``peaks`` a neighbourhood of ``search_range`` samples
    centred on the peak is read from ``ecg`` (indices wrap modulo the signal
    length and are truncated to integers). Each neighbourhood is correlated
    with the weights ``w`` and its maximum is returned.

    Returns an array with one value per peak.
    """
    half_span = (search_range - 1) / 2
    # Shape (search_range, n_peaks): one column of sample positions per peak.
    sample_positions = np.linspace(peaks - half_span, peaks + half_span, search_range)
    wrapped_indices = np.mod(sample_positions, ecg.shape[0]).astype(int)

    # Transpose so each row holds the neighbourhood of one peak.
    neighbourhoods = ecg[wrapped_indices].T
    smoothed = scipy.ndimage.correlate1d(neighbourhoods, w)
    return smoothed.max(axis=1)

"""
peak_heights = get_rpeaks_weighted_avg(51, window, np.array(dataset.iloc[0]["r_peaks"]), dataset.iloc[0]["data"])
print(peak_heights)
print(dataset.iloc[0]["r_peaks"])

plot_ecg(dataset.iloc[0]["data"], fs=300)
plt.show()
"""

dataset["r_peak_height"] = dataset.apply(lambda x: get_rpeaks_weighted_avg(50, window, x["r_peaks"], x["data"]), axis=1)

In [13]:
# One normalised histogram of the median R-peak height per measured diagnosis.
# The number of subplots follows the number of groups actually present, and
# the loop variable is called `group` so it does not overwrite the
# notebook-level `df` dataframe.
diag_groups = list(dataset.groupby("measDiag", sort=False))
fig, ax = plt.subplots(len(diag_groups), squeeze=False)

for j, (i, group) in enumerate(diag_groups):
    ax[j, 0].hist(group["r_peak_height"].map(np.median), bins=np.arange(0, 10, 1), density=True)
    ax[j, 0].set_title(i.name)

fig.tight_layout()
plt.show()

In [20]:
# Class whose recordings are shown in a num_rows x num_cols grid of traces.
c = DiagEnum.AF

num_rows = 4
num_cols = 4

fig = make_subplots(rows=num_rows, cols=num_cols)

class_rows = dataset[compareEnums(dataset["class"], c)]
# Never request more samples than the class contains (sample() would raise).
num_class_samples = min(num_cols * num_rows, len(class_rows))

for i, (_, sample) in enumerate(class_rows.sample(num_class_samples).iterrows()):
    # Fill the grid column by column. Both indices are derived from num_rows
    # so the placement stays inside the grid when num_rows != num_cols.
    fig.add_trace(go.Scatter(y=sample["data"]), row=i % num_rows + 1, col=i // num_rows + 1)

fig.update_layout(height=1000)
fig.update_xaxes(title="sample number")
fig.update_yaxes(title="amplitude")
fig.show()

NameError: name 'compareEnums' is not defined

## Plotting samples from each class with their DFTs below

In [None]:
# Top row: raw traces of a few recordings of class c.
# Bottom row: log10 magnitude of each recording's DFT against frequency.
c = DiagEnum.PoorQuality
num_cols = 3
num_rows = 2

num_class_samples = num_cols
fig = make_subplots(rows=num_rows, cols=num_cols)

for i, (_, sample) in enumerate(dataset[compareEnums(dataset["class"], c)].sample(num_class_samples).iterrows()):
    fig.add_trace(go.Scatter(y=sample["data"]), row=1, col = i + 1)
    # fft/fftfreq return the non-negative frequencies first and the negative
    # ones after; fftshift reorders both into ascending frequency so the line
    # trace does not jump back across the whole plot.
    fft = np.fft.fftshift(np.log10(np.abs(np.fft.fft(sample["data"]))))
    fftfreq = np.fft.fftshift(np.fft.fftfreq(len(sample["data"]), d=1.0/300.0))

    fig.add_trace(go.Scatter(y=fft, x=fftfreq), row = 2, col = i + 1)

fig.update_layout(height=1000)
fig.show()

## Plotting samples with their STFT below

In [12]:
# Top row: raw traces of a few random recordings from df.
# Bottom row: log10 magnitude of each recording's STFT as a heatmap.
num_cols = 3
num_rows = 2

num_class_samples = num_cols
fig = make_subplots(rows=num_rows, cols=num_cols)

for i, (_, sample) in enumerate(df.sample(num_class_samples).iterrows()):
    grid_col = i + 1
    fig.add_trace(go.Scatter(y=sample["data"]), row=1, col=grid_col)

    # No sampling frequency is passed, so the returned axes use SciPy's default.
    f_axis, t_axis, stft = signal.stft(sample["data"], nperseg=64)
    log_magnitude = np.log10(np.abs(stft))
    fig.add_trace(go.Heatmap(z=log_magnitude, y=f_axis, x=t_axis), row=2, col=grid_col)

fig.update_layout(height=1000)
fig.show()

## Plotting samples from each class with their wavelet transform below

These plots don't really work well, and so far I have made no other use of wavelet transforms.

### First, a discrete wavelet transform (I don't yet understand how this works)

In [None]:
# NOTE(review): `pywt` is not imported in any cell visible in this notebook;
# confirm `import pywt` exists, otherwise this cell raises NameError.
# Evaluate the 'sym4' wavelet at level 1; the three returned arrays are
# unpacked as phi, psi and the x grid they are sampled on (presumably the
# scaling and wavelet functions -- confirm against the PyWavelets docs).
phi, psi, x = pywt.Wavelet('sym4').wavefun(1)
# First figure: phi against x.
plt.plot(x, phi)
plt.show()

# Second figure: psi against x.
plt.plot(x, psi)
plt.show()

In [None]:
# Plot a few recordings of class c and print the shapes of their discrete
# wavelet decomposition coefficients.
c = "~"
num_cols = 3
num_rows = 2

num_class_samples = num_cols
fig = make_subplots(rows=num_rows, cols=num_cols)

for i, (_, sample) in enumerate(dataset[dataset["class"] == c].sample(num_class_samples).iterrows()):
    fig.add_trace(go.Scatter(y=sample["data"][0]), row=1, col = i + 1)
    # wavedec returns a list of coefficient arrays, one per decomposition
    # level, and their lengths differ. The list is iterated directly because
    # wrapping it in np.array() fails for ragged input on current NumPy.
    wavelets = pywt.wavedec(sample["data"][0], 'sym4')
    for wavelet in wavelets:
        print(wavelet.shape)

## Try a continuous wavelet transform from scipy

In [None]:
# Continuous wavelet transform of a few recordings of class c.
# The author notes that the result looks suspicious; treat these plots as exploratory.
# NOTE(review): signal.cwt and signal.ricker appear to be deprecated in recent
# SciPy releases -- confirm they exist in the installed version.

c = "N"
num_cols = 3
num_rows = 2

num_class_samples = num_cols
fig = make_subplots(rows=num_rows, cols=num_cols)

for i, (_, sample) in enumerate(dataset[dataset["class"] == c].sample(num_class_samples).iterrows()):
    # Top row: the raw trace (first element of the "data" entry).
    fig.add_trace(go.Scatter(y=sample["data"][0]), row=1, col = i + 1)
    # 20 wavelet widths evenly spaced between 1 and 100.
    widths = np.linspace(1, 100, 20)
    cwtmatr = signal.cwt(sample["data"][0], signal.ricker, widths)
    # Only the magnitude of the transform is plotted.
    wavelets_sample = np.abs(cwtmatr)
    print(wavelets_sample.shape)

    # Bottom row: heatmap of the transform magnitude.
    fig.add_trace(go.Heatmap(z=wavelets_sample), row = 2, col = i + 1)

fig.update_layout(height=1000)
fig.show()

### Check the performance of Zenicor's system

In [12]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(dataset["class_index"], dataset["tag_orig_Poor_Quality"])
print(conf_mat)

def F1_ind(conf_mat, ind):
    """Return the F1 score of class ``ind`` from a square confusion matrix.

    Computed as twice the diagonal entry divided by the sum of the class's
    row total and column total.
    """
    diagonal_count = conf_mat[ind, ind]
    row_total = np.sum(conf_mat[ind])
    column_total = np.sum(conf_mat[:, ind])
    return (2 * diagonal_count) / (row_total + column_total)

print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Noisy F1: {F1_ind(conf_mat, 1)}")

[[22337   457]
 [  342   123]]
Sensitivity: 0.2645161290322581
Specificity: 0.9799508642625252
Normal F1: 0.9824291337716887
Noisy F1: 0.23540669856459331


### The Noise Stress Test Database

In [None]:
dataset = pd.read_pickle("mit-bih-noise-stress-test-database/database.pk")
print(dataset["class"].value_counts())

In [None]:
plot_ecg(dataset[dataset["class"] == "N"]["data"].iloc[0][:3000])

### NeuroKit analysis
I don't know for sure what use this will be, but it might be cool.
From a signal-processing point of view, this could also be useful for comparing how certain things are done and what effect noise has on them.

In [5]:
import neurokit2 as nk

In [9]:
ecgs = [nk.ecg_simulate(duration=1, heart_rate=70, sampling_rate=250, noise=0.2, random_state=i) for i in range(10)]

fig = go.Figure()
for ecg in ecgs:
    fig.add_trace(go.Scatter(y=ecg))
fig.show()

In [16]:
ecg = nk.ecg_simulate(duration=2, heart_rate=70, sampling_rate=360, noise=0.2, random_state=1)[220:220+216]

plt.plot(ecg)
plt.xlabel("sample number")
plt.title("Simulated Heartbeat")
plt.show()

In [10]:
signals, info = nk.ecg_process(dataset[dataset["class"] == "N"].iloc[100]["data"][0], sampling_rate=250)
nk.ecg_plot(signals, sampling_rate=250)

NameError: name 'dataset' is not defined

In [None]:
# Peak detection and HRV must use the same sampling rate: the peaks are sample
# indices, so a different rate in nk.hrv would mis-scale every interval
# (the HRV call previously used 100 while the peaks were detected at 250).
# NOTE(review): confirm that `ecg` really is sampled at this rate.
ecg_sampling_rate = 250

peaks, info = nk.ecg_peaks(ecg, sampling_rate=ecg_sampling_rate)
print(peaks[peaks["ECG_R_Peaks"] != 0])
nk.hrv(peaks, sampling_rate=ecg_sampling_rate, show=True)

Simulate ECGs!

In [None]:
simulated_ecg = nk.ecg_simulate(duration=15, sampling_rate=300, heart_rate=70)
plt.plot(simulated_ecg)

### PCA Analysis

In [None]:
from sklearn.decomposition import PCA
from ecgdetectors import Detectors

In [None]:
X = dataset["data"].values

In [None]:
# Cut a fixed-length window around every detected R peak of every recording.
detectors = Detectors(300)

beat_window = 300
half_window = beat_window // 2

slices = []
for i, (_, series) in enumerate(dataset.iterrows()):
    print(f"Processing ecg {i}/{len(dataset.index)}\r", end="")
    r_peaks = detectors.hamilton_detector(series["data"])
    # Pad so a full window fits around any peak, including those at the very
    # start or end. Original sample k sits at index k + half_window after padding.
    padded_data = np.pad(series["data"], (half_window, beat_window - half_window), mode="reflect")
    # In padded coordinates the window centred on peak p starts at index p.
    # Previously the unpadded window bounds were applied to the padded signal,
    # so the slices were shifted away from the peak and early peaks produced
    # negative start indices.
    slices.extend(padded_data[int(p):int(p) + beat_window] for p in r_peaks)

In [None]:
print(slices[0].shape[0])

In [None]:
slices = filter(lambda x: x.shape[0] == beat_window, slices)

In [None]:
slices = list(slices)
print(len(slices))

In [None]:
X = np.stack(slices)

pca = PCA(20)
pca.fit(X)
components = pca.components_
print(components.shape)

In [None]:
# Show each PCA component as its own trace in a num_rows x num_cols grid,
# filled row by row.
num_rows = 5
num_cols = 4

num_class_samples = num_cols * num_rows
fig = make_subplots(rows=num_rows, cols=num_cols)

for i, comp in enumerate(components):
    grid_row, grid_col = divmod(i, num_cols)
    fig.add_trace(go.Scatter(y=comp), row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=1000)
fig.update_xaxes(title="sample number")
fig.update_yaxes(title="amplitude")
fig.show()