In [2]:
import matplotlib
matplotlib.use('TkAgg')
import ecg_noise_detector.src.ecg_noise_detector.noiseDetector as nd

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [2]:
# Use this if changing noiseDetector.py
import importlib
importlib.reload(nd)

<module 'ecg_noise_detector.src.ecg_noise_detector.noiseDetector' from 'C:\\Users\\daniel\\Documents\\CambridgeSoftwareProjects\\ecg-signal-quality\\ecg_noise_detector\\src\\ecg_noise_detector\\noiseDetector.py'>

### Load the dataset

In [3]:
dataset = pd.read_pickle("CinC2017Data/database.pk")

In [5]:
dataset["length"] = dataset["data"].map(lambda arr: arr.shape[-1])
# select only the 30s length records
dataset = dataset[dataset["length"] == 9000]
dataset["data"] = dataset["data"].map(lambda d: d[0])

In [6]:
# dataset["onehot"] = dataset["class"].map(generate_onehot)
def generate_index(c):
    if c == "N":
        return 0
    if c == "O":
        return 0
    if c == "A":
        return 0
    if c == "~":
        return 1

dataset["class_index"] = dataset["class"].map(generate_index)

train_dataset, test_dataset = train_test_split(dataset, test_size=0.15, stratify=dataset["class_index"])

print(len(test_dataset.index))

897


In [7]:
print(dataset["class"].value_counts())

N    3695
O    1655
A     504
~     123
Name: class, dtype: int64


### Load the SAFER data

In [4]:
from enum import Enum
import os

class DiagEnum(Enum):
    AF = 1
    CannotExcludePathology = 2
    NoAF = 3
    PoorQuality = 4
    ScreeningFailure = 5
    Undecided = 6
    ReviewersDisagree = -1

feas2_path = r"D:\2022_23_DSiromani\Feas2"

feas2_pt_data = pd.read_csv(os.path.join(feas2_path, "pt_data_anon.csv"))
feas2_ecg_data = pd.read_pickle(os.path.join(feas2_path, r"ECGs\filtered_dataframe.pk"))

print(feas2_ecg_data.head())

   ptID   age         ptDiag          ptDiagRev1     ptDiagRev2  \
0     1  79.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.NoAF   
1     1  79.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.NoAF   
2     1  79.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.NoAF   
3     1  79.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.NoAF   
4     1  79.0  DiagEnum.NoAF  DiagEnum.Undecided  DiagEnum.NoAF   

           ptDiagRev3  cardRev            measDiag        measDiagRev1  \
0  DiagEnum.Undecided        1  DiagEnum.Undecided  DiagEnum.Undecided   
1  DiagEnum.Undecided        1  DiagEnum.Undecided  DiagEnum.Undecided   
2  DiagEnum.Undecided        1  DiagEnum.Undecided  DiagEnum.Undecided   
3  DiagEnum.Undecided        1  DiagEnum.Undecided  DiagEnum.Undecided   
4  DiagEnum.Undecided        1  DiagEnum.Undecided  DiagEnum.Undecided   

         measDiagRev2  ...  poss_AF_tag not_tagged  not_tagged_ign_wide_qrs  \
0  DiagEnum.Undecided  ...            0          1                       

In [48]:
feas2_ecg_data["class_index"] = feas2_ecg_data["measDiag"].map(lambda x: int(x == DiagEnum.PoorQuality))

In [49]:
# Split train and test data according to each patient
test_frac = 0.2

feas2_pt_data["noLQrecs"] = feas2_pt_data["noRecs"] - feas2_pt_data["noHQrecs"]
train_patients = []
test_patients = []

for val, df in feas2_pt_data.groupby("noLQrecs"):
    print(f"processing {val}")
    test = df.sample(frac=test_frac)
    test_patients.append(test)
    train_patients.append(df[~df["ptID"].isin(test["ptID"])])

train_pt_df = pd.concat(train_patients)
test_pt_df = pd.concat(test_patients)

print(f"Test high quality: {test_pt_df['noHQrecs'].sum()} low quality: {test_pt_df['noLQrecs'].sum()} ")
print(f"Train high quality: {train_pt_df['noHQrecs'].sum()} low quality: {train_pt_df['noLQrecs'].sum()} ")

processing 0
processing 1
processing 2
processing 3
processing 4
processing 5
processing 6
processing 7
processing 8
processing 9
processing 10
processing 11
processing 12
processing 13
processing 14
processing 15
processing 16
processing 17
processing 21
processing 26
processing 27
processing 33
Test high quality: 4642 low quality: 101 
Train high quality: 18037 low quality: 479 


In [50]:
train_ecg_df = feas2_ecg_data[feas2_ecg_data["ptID"].isin(train_pt_df["ptID"])]
test_ecg_df = feas2_ecg_data[feas2_ecg_data["ptID"].isin(test_pt_df["ptID"])]

### An aside on optimising the length transform

In [19]:
lt_yoav = nd._length_transfrom(test_dataset["data"][0], 100)

In [20]:
lt_mine = nd._length_transform_faster(test_dataset["data"][0], 100)

In [53]:
def test_code():
    ecgs = test_dataset["data"].head(20)
    nd.is_noisy_batch(ecgs, fs=300)

import cProfile
cProfile.run('test_code()')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  p = pd.Series(index=ecgs.index)


Processing ecg: A04315
Processing ecg: A03779
Processing ecg: A05397
Processing ecg: A03823
Processing ecg: A00227
Processing ecg: A08170
Processing ecg: A03236
Processing ecg: A07489
Processing ecg: A04197
Processing ecg: A00860
Processing ecg: A04877
Processing ecg: A05200
Processing ecg: A01613
Processing ecg: A02253
Processing ecg: A05897
Processing ecg: A05041
Processing ecg: A07776
Processing ecg: A06002
Processing ecg: A07440
Processing ecg: A01792
         5192772 function calls (5113124 primitive calls) in 4.064 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    4.064    4.064 3875118959.py:1(test_code)
     2840    0.001    0.000    0.012    0.000 <__array_function__ internals>:177(all)
     3360    0.004    0.000    0.021    0.000 <__array_function__ internals>:177(amax)
     8480    0.015    0.000    0.058    0.000 <__array_function__ internals>:177(any)
     1720    0.001    0.000   

### Run Yoav's classifier and analyse the results

In [7]:
ecgs = test_dataset["data"]
test_dataset["predictions"] = nd.is_noisy_batch(ecgs, fs=300, filter=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  p = pd.Series(index=ecgs.index)


Processing ecg: A00503
Processing ecg: A06949
Processing ecg: A05398
Processing ecg: A08103
Processing ecg: A05609
Processing ecg: A02815
Processing ecg: A07811
Processing ecg: A06506
Processing ecg: A06564
Processing ecg: A08068
Processing ecg: A08203
Processing ecg: A06871
Processing ecg: A00812
Processing ecg: A03623
Processing ecg: A00256
Processing ecg: A03910
Processing ecg: A03932
Processing ecg: A07280
Processing ecg: A07013
Processing ecg: A00022
Processing ecg: A02472
Processing ecg: A08055
Processing ecg: A07819
Processing ecg: A01137
Processing ecg: A07482
Processing ecg: A00415
Processing ecg: A04376
Processing ecg: A04510
Processing ecg: A03410
Processing ecg: A03156
Processing ecg: A07249
Processing ecg: A05010
Processing ecg: A08245
Processing ecg: A08237
Processing ecg: A03766
Processing ecg: A00189
Processing ecg: A00281
Processing ecg: A01817
Processing ecg: A00863
Processing ecg: A00908
Processing ecg: A03745
Processing ecg: A04434
Processing ecg: A03262
Processing 

In [9]:
test_dataset["predictions"] = test_dataset["predictions"].astype(int)

Plot examples of the data with window by window classification of the errors

In [29]:
i = 100

print(test_dataset.iloc[i]["class"])
nd.plot_ecg(test_dataset.iloc[i]["data"], fs=300)
plt.show()

O


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<Figure size 1169x827 with 3 Axes>

In [10]:
conf_mat = confusion_matrix(test_dataset["class_index"], test_dataset["predictions"])

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Other F1: {F1_ind(conf_mat, 1)}")

ConfusionMatrixDisplay.from_predictions(test_dataset["class_index"], test_dataset["predictions"], display_labels=["sufficient quality", "insufficient quality"], cmap="inferno")
plt.show()

Normal F1: 0.556920556920557
Other F1: 0.055846422338568937


In [11]:
print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

Sensitivity: 0.8888888888888888
Specificity: 0.38680318543799774


In [32]:
false_positives = test_dataset[(test_dataset["predictions"] == 1) & (test_dataset["class_index"] == 0)]
false_negatives = test_dataset[(test_dataset["predictions"] == 0) & (test_dataset["class_index"] == 1)]
noisy = test_dataset[(test_dataset["predictions"] == 1) & (test_dataset["class_index"] == 1)]

In [25]:
nd.plot_ecg(false_positives.iloc[50]["data"], fs=300)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<Figure size 1169x827 with 3 Axes>

In [30]:
nd.plot_ecg(false_negatives.iloc[0]["data"], fs=300)


Trying to unpickle estimator SVC from version 1.0.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



<Figure size 1169x827 with 3 Axes>

In [33]:
nd.plot_ecg(noisy.iloc[0]["data"], fs=300)


Trying to unpickle estimator SVC from version 1.0.1 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



<Figure size 1169x827 with 3 Axes>

In [35]:
# Try on SAFER data
ecgs = feas2_ecg_data["data"]
feas2_ecg_data["predictions"] = nd.is_noisy_batch(ecgs, fs=300, filter=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  p = pd.Series(index=ecgs.index)


Processing ecg: 0
Processing ecg: 1
Processing ecg: 2
Processing ecg: 3
Processing ecg: 4
Processing ecg: 5
Processing ecg: 6
Processing ecg: 7
Processing ecg: 8
Processing ecg: 9
Processing ecg: 10
Processing ecg: 11
Processing ecg: 12
Processing ecg: 13
Processing ecg: 14
Processing ecg: 15
Processing ecg: 16
Processing ecg: 17
Processing ecg: 18
Processing ecg: 19
Processing ecg: 20
Processing ecg: 21
Processing ecg: 22
Processing ecg: 23
Processing ecg: 24
Processing ecg: 25
Processing ecg: 26
Processing ecg: 27
Processing ecg: 28
Processing ecg: 29
Processing ecg: 30
Processing ecg: 31
Processing ecg: 32
Processing ecg: 33
Processing ecg: 34
Processing ecg: 35
Processing ecg: 36
Processing ecg: 37
Processing ecg: 38
Processing ecg: 39
Processing ecg: 40
Processing ecg: 41
Processing ecg: 42
Processing ecg: 43
Processing ecg: 44
Processing ecg: 45
Processing ecg: 46
Processing ecg: 47
Processing ecg: 48
Processing ecg: 49
Processing ecg: 50
Processing ecg: 51
Processing ecg: 52
Pro

In [41]:
feas2_ecg_data["predictions"] = feas2_ecg_data["predictions"].astype(int)
feas2_ecg_data["class_index"] = feas2_ecg_data["measDiag"].map(lambda x: int(x == DiagEnum.PoorQuality))

In [42]:
conf_mat = confusion_matrix(feas2_ecg_data["class_index"], feas2_ecg_data["predictions"])

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Noisy F1: {F1_ind(conf_mat, 1)}")

ConfusionMatrixDisplay.from_predictions(feas2_ecg_data["class_index"], feas2_ecg_data["predictions"], display_labels=["sufficient quality", "insufficient quality"], cmap="inferno")
plt.show()

Noisy F1: 0.08801341156747695


In [43]:
false_positives = feas2_ecg_data[(feas2_ecg_data["predictions"] == 1) & (feas2_ecg_data["class_index"] == 0)]

In [46]:
nd.plot_ecg(false_positives.iloc[70]["data"], fs=300)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<Figure size 1169x827 with 3 Axes>

## Try Training the SVM on the physionet data ourselves and see what the results are

In [51]:
# Extract features for 5s segments of the data using Yoav's code

ecg_features = []
i = 0

def process_ecgs(dataset):
    i = 0
    ecg_features = []

    for ind, x in dataset.iterrows():
        print(f"Processing ecg {i}/{len(dataset.index)}\r", end="")
        df = nd._process_ecg(x["data"], fs=300, filter=False)
        df["class_index"] = x["class_index"]
        ecg_features.append(df)
        i += 1

    return pd.concat(ecg_features, keys=dataset.index)


train_dataset = process_ecgs(train_ecg_df)

Processing ecg 18515/18516

In [60]:
test_dataset = process_ecgs(test_ecg_df)

Processing ecg 386/4743

KeyboardInterrupt: 

In [62]:
pk_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
train_dataset.to_pickle(os.path.join(pk_path, "safer_yeov_processed.pk"))
test_dataset.to_pickle(os.path.join(pk_path, "safer_yeov_processed_test.pk"))

In [None]:
pk_path = "CinC2017Data/database_yeov_processed.pk"
df.to_pickle(pk_path)

In [13]:
train_dataset = pd.read_pickle("CinC2017Data/safer_yeov_processed.pk")

In [None]:
pk_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
train_dataset = pd.read_pickle(os.path.join(pk_path, "safer_yeov_processed.pk"))
test_dataset = pd.read_pickle(os.path.join(pk_path, "safer_yeov_processed_test.pk"))

In [63]:
# Normalise some features to have 0 mean and variance 1 (other elements are already limited/normalised)

var_1_features = ["sSQI", "kSQI"]
for f in var_1_features:
    train_dataset[f] = (train_dataset[f] - train_dataset[f].mean())/train_dataset[f].var()

In [64]:
# train the model
class_weights = 1/train_dataset["class_index"].value_counts()
class_weights /= np.sum(class_weights)
print(class_weights)

model = SVC(class_weight=class_weights.to_dict())
# See documentation for default values e.g. use rbf, regularising C = 1
features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]

model = model.fit(train_dataset[features].values, train_dataset["class_index"].values)

0    0.020631
1    0.979369
Name: class_index, dtype: float64


In [67]:
predictions = model.predict(test_dataset[features].values)

test_dataset["predictions"] = predictions
# Select the values with more than 50% noisy as overall noisy
results_df = test_dataset.groupby(level=0).mean()
# results_df["predictions"] = results_df["predictions"].round()

ConfusionMatrixDisplay.from_predictions(results_df["class_index"], results_df["predictions"].round(), display_labels=["sufficint quality", "insufficient quality"], cmap="inferno")
plt.show()
conf_mat = confusion_matrix(results_df["class_index"], results_df["predictions"].round())

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Other F1: {F1_ind(conf_mat, 1)}")

Normal F1: 0.963556531284303
Other F1: 0.11702127659574468


In [69]:
from sklearn.metrics import precision_recall_curve
import plotly.graph_objects as go

p, r, d = precision_recall_curve(results_df["class_index"], results_df["predictions"])

F1 = 2 * p * r /(p + r)

fig = go.Figure()
fig.add_trace(go.Scatter(x=r, y=p, hovertext=[f"decision boundary: {x:.2f}\nF1 score: {f:.03f}" for x, f in zip(d, F1)]))

fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig.show()

# Why are the F values different!

In [15]:
def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

In [17]:
# Train test split per ECG (not per 5s segment as they could overlap)

ind = np.unique(np.array(df.index.get_level_values(0)))
classes = np.array([df["class_index"].loc[i, 0] for i in ind])

from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=5)

conf_mats = []

for i, (train_ind, test_ind) in enumerate(kf.split(ind, classes)):
    print(f"======== Split {i} ========")
    train_dataset = df.loc[ind[train_ind]]
    test_dataset = df.loc[ind[test_ind]]

    model = SVC(class_weight=class_weights.to_dict())
    model = model.fit(train_dataset[features].values, train_dataset["class_index"].values)

    predictions = model.predict(test_dataset[features].values)

    test_dataset["predictions"] = predictions
    results_df = test_dataset.groupby(level=0).mean()
    results_df["predictions"] = results_df["predictions"].round()

    conf_mat = confusion_matrix(results_df["class_index"], results_df["predictions"])
    conf_mats.append(conf_mat)



In [18]:
print(f"Normal F1: {np.mean([F1_ind(c, 0) for c in conf_mats])}")
print(f"Poor quality F1: {np.mean([F1_ind(c, 1) for c in conf_mats])}")

Normal F1: 0.8402917113679687
Poor quality F1: 0.11499547655338753


### Try putting features for each section into the classifier as one input

In [70]:
# Construct test and train matrices

features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]
train_matrix = []
train_targets = []
indexes = []

for i in set(train_dataset.index.get_level_values(0)):
    indexes.append(i)
    train_matrix.append(train_dataset.loc[i][features].values.flatten())
    train_targets.append(train_dataset.loc[(i, 0)]["class_index"])

train_matrix = np.array(train_matrix)
train_targets = np.array(train_targets)

print(train_matrix.shape)
print(train_targets.shape)


test_matrix=  []
test_targets = []
for i in set(test_dataset.index.get_level_values(0)):
    test_matrix.append(test_dataset.loc[i][features].values.flatten())
    test_targets.append(test_dataset.loc[(i, 0)]["class_index"])


test_matrix = np.array(test_matrix)
test_targets = np.array(test_targets)

print(test_matrix.shape)

(18516, 36)
(18516,)
(4743, 36)


In [77]:
# train the model
class_weights = 1/train_dataset["class_index"].value_counts()
class_weights /= np.sum(class_weights)
print(class_weights)

model = SVC(class_weight=class_weights.to_dict(), probability=True)
# See documentation for default values e.g. use rbf, regularising C = 1
features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]

model = model.fit(train_matrix, train_targets)

0    0.020631
1    0.979369
Name: class_index, dtype: float64


In [79]:
predictions = model.predict_proba(test_matrix)

ConfusionMatrixDisplay.from_predictions(test_targets, np.round(predictions[:, 1]), display_labels=["sufficient quality", "insufficient quality"], cmap="inferno")
plt.show()
conf_mat = confusion_matrix(test_targets, np.round(predictions[:, 1]))

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Other F1: {F1_ind(conf_mat, 1)}")

Normal F1: 0.9911730298840795
Other F1: 0.0


In [80]:
p, r, d = precision_recall_curve(test_targets, predictions[:, 1])

F1 = 2 * p * r /(p + r)

fig = go.Figure()
fig.add_trace(go.Scatter(x=r, y=p, hovertext=[f"decision boundary: {x:.2f}\nF1 score: {f:.03f}" for x, f in zip(d, F1)]))

fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig.show()

In [74]:
print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

Sensitivity: 0.3373493975903614
Specificity: 0.934763948497854


In [30]:
false_positives = np.array(list(set(test_dataset.index.get_level_values(0))))[(test_targets == 0) * (predictions == 1)]
print(false_positives[0])

print(test_matrix[(test_targets == 0) * (predictions == 1)][1])

plt.plot(dataset["data"].loc[false_positives[1]])
plt.show()

A07546
[-0.42255649 -1.55240177  0.4495139   0.81001201  0.5         0.66666667
 -0.87647898 -1.74441361  0.31188967  0.86828259  0.57142857  0.63636364
 -0.72636483 -2.10796461  0.39596124  0.90054756  0.4375      0.69565217
 -0.82719949 -1.62955888  0.2995643   0.98486546  0.58333333  0.63157895
 -0.74217845 -1.59165107  0.33631894  0.94724882  0.53846154  0.65
 -0.50393047 -1.45371067  0.31330429  0.84007719  0.63636364  0.61111111]


Why am I having poorer accuracy here?
 - Is it because the SVM is not able to understand so many variables