In [None]:
import matplotlib
matplotlib.use('TkAgg')
import ecg_noise_detector.src.ecg_noise_detector.noiseDetector as nd


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from DiagEnum import DiagEnum

In [None]:
# Use this if changing noiseDetector.py
import importlib
importlib.reload(nd)

### Load the dataset

In [None]:
dataset = pd.read_pickle("CinC2017Data/database.pk")

In [None]:
# Number of samples in each recording (size of the last array axis).
dataset["length"] = dataset["data"].map(lambda arr: arr.shape[-1])
# Select only the 30s length records (9000 samples).
# .copy() detaches the filtered frame so the column assignment below is not a
# chained assignment on a slice of the original frame (SettingWithCopyWarning).
dataset = dataset[dataset["length"] == 9000].copy()
# Keep only the first row of each stored array.
dataset["data"] = dataset["data"].map(lambda d: d[0])

In [None]:
# dataset["onehot"] = dataset["class"].map(generate_onehot)
def generate_index(c):
    """Map a class label to a binary noise index.

    The labels "N", "O" and "A" map to 0 and "~" maps to 1. Any other label
    falls through and yields None.
    """
    for label, index in (("N", 0), ("O", 0), ("A", 0), ("~", 1)):
        if c == label:
            return index
    return None

# Binary target per record: see generate_index for the label mapping.
dataset["class_index"] = dataset["class"].map(generate_index)

# Stratified 85/15 split. A fixed random_state makes the split (and every
# number derived from it) reproducible after a kernel restart.
train_dataset, test_dataset = train_test_split(
    dataset, test_size=0.15, stratify=dataset["class_index"], random_state=42
)

print(len(test_dataset.index))

In [None]:
print(dataset["class"].value_counts())

### Load the SAFER data

In [None]:
from DiagEnum import DiagEnum
import SAFERDataset

In [None]:
feas2_pt_data, feas2_ecg_data = SAFERDataset.load_feas_dataset(2, "dataframe")

In [None]:
# Index the patient table by patient ID so the per-patient counts below align on it.
feas2_pt_data.index = feas2_pt_data["ptID"]

count_columns = ["noRecs", "noHQrecs", "noHQrecsNotUndecided"]
is_high_quality = feas2_ecg_data["class_index"] == 0
is_decided = feas2_ecg_data["measDiag"] != DiagEnum.Undecided

# Records per patient: all, high quality, and high quality with a decided diagnosis.
feas2_pt_data["noRecs"] = feas2_ecg_data["ptID"].value_counts()
feas2_pt_data["noHQrecs"] = feas2_ecg_data.loc[is_high_quality, "ptID"].value_counts()
feas2_pt_data["noHQrecsNotUndecided"] = feas2_ecg_data.loc[is_high_quality & is_decided, "ptID"].value_counts()

# Patients that do not appear in a count get 0 instead of NaN.
feas2_pt_data[count_columns] = feas2_pt_data[count_columns].fillna(0)

In [None]:
feas2_ecg_data["class_index"].value_counts()

In [None]:
(feas2_pt_data["noRecs"] - feas2_pt_data["noHQrecs"]).sum()

In [None]:
# For SAFER data
# Split train and test data according to each patient
def make_SAFER_dataloaders(pt_data, ecg_data, test_frac, only_clean_training=True, random_state=None):
    """Split the SAFER ECG records into train and test sets, patient by patient.

    Patients are grouped by their number of low-quality records and a fraction
    ``test_frac`` of every group is sampled into the test set, so no patient
    contributes records to both sets. Each returned record is normalised to
    zero mean and unit standard deviation.

    Args:
        pt_data: per-patient DataFrame with "ptID", "noRecs" and "noHQrecs"
            columns. A "noLQrecs" column is added to it in place.
        ecg_data: per-record DataFrame with "ptID", "data" and "measDiag"
            columns. It is not modified.
        test_frac: fraction of the patients of each group sampled for testing.
        only_clean_training: accepted for compatibility; currently unused.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        ``(train_dataset, test_dataset)``. Either is ``None`` when no patient
        was assigned to it. Records with an Undecided diagnosis are dropped
        from the test set only.
    """
    pt_data["noLQrecs"] = pt_data["noRecs"] - pt_data["noHQrecs"]  # for Feas1 this might include stuff flagged by zenicor as noisy?

    def _normalised(records):
        # Work on a copy: assigning into a filtered slice of ecg_data is a
        # chained assignment (SettingWithCopyWarning) and may be silently lost.
        records = records.copy()
        records["data"] = records["data"].map(lambda x: (x - x.mean()) / x.std())
        return records

    train_patients = []
    test_patients = []

    for val, df in pt_data.groupby("noLQrecs"):
        print(f"processing {val}")
        print(f"number of patients {len(df.index)}")
        test = df.sample(frac=test_frac, random_state=random_state)
        test_patients.append(test)
        train_patients.append(df[~df["ptID"].isin(test["ptID"])])

    train_pt_df = pd.concat(train_patients)
    test_pt_df = pd.concat(test_patients)

    print(f"Test high quality: {test_pt_df['noHQrecs'].sum()} low quality: {test_pt_df['noLQrecs'].sum()} ")
    print(f"Train high quality: {train_pt_df['noHQrecs'].sum()} low quality: {train_pt_df['noLQrecs'].sum()} ")

    train_dataset = None
    test_dataset = None

    if not train_pt_df.empty:
        train_dataset = _normalised(ecg_data[ecg_data["ptID"].isin(train_pt_df["ptID"])])

    if not test_pt_df.empty:
        test_dataset = _normalised(
            ecg_data[(ecg_data["ptID"].isin(test_pt_df["ptID"])) & (ecg_data["measDiag"] != DiagEnum.Undecided)])

    return train_dataset, test_dataset

train_ecg_df, test_ecg_df = make_SAFER_dataloaders(feas2_pt_data, feas2_ecg_data, test_frac=0.2, only_clean_training=False, )

In [None]:
# For SAFER data
# Split train and test data according to each patient
def make_SAFER_dataloaders(pt_data, ecg_data, test_frac, val_frac, only_clean_training=True):
    """Split the SAFER ECG records into train, test and validation sets by patient.

    Patients are visited in table order and each one is greedily assigned to
    the split that keeps the per-split counts of low-quality records and of
    high-quality decided records closest to the target fractions. Ties go to
    train, then test, then validation.

    Args:
        pt_data: per-patient DataFrame with "ptID", "noRecs", "noHQrecs" and
            "noHQrecsNotUndecided" columns. A "noLQrecs" column is added in place.
        ecg_data: per-record DataFrame with "ptID" and "measDiag" columns.
        test_frac: target fraction of the data for the test split.
        val_frac: target fraction of the data for the validation split.
        only_clean_training: accepted for compatibility; currently unused.

    Returns:
        ``(train_dataset, test_dataset, val_dataset)`` as independent copies of
        the matching rows of ``ecg_data``. A split that received no patients is
        ``None``. Records with an Undecided diagnosis are dropped from the test
        and validation sets only.
    """
    pt_data["noLQrecs"] = pt_data["noRecs"] - pt_data[
        "noHQrecs"]  # for Feas1 this might include stuff flagged by zenicor as noisy?

    # Target share of each split, in the order train, test, val.
    fracs = np.array([1 - test_frac - val_frac, test_frac, val_frac])
    n_splits = len(fracs)

    split_patients = [[] for _ in range(n_splits)]
    lq_counts = np.zeros(n_splits, dtype=int)
    total_counts = np.zeros(n_splits, dtype=int)

    running_lq_count = 0
    running_total_count = 0

    for _, pt in pt_data.iterrows():
        n_lq = pt["noLQrecs"]
        n_hq = pt["noHQrecsNotUndecided"]

        running_total_count += n_hq
        running_lq_count += n_lq

        exp_total_counts = running_total_count * fracs
        exp_lq_counts = running_lq_count * fracs

        # Loss of putting this patient into each split: absolute deviation of
        # the resulting counts from the expected counts, summed over splits.
        losses = []
        for split in range(n_splits):
            one_hot = np.zeros(n_splits)
            one_hot[split] = 1
            losses.append(np.sum(
                np.abs(lq_counts + n_lq * one_hot - exp_lq_counts)
                + np.abs(total_counts + n_hq * one_hot - exp_total_counts)))

        # argmin returns the first minimum, i.e. train before test before val.
        best_split = int(np.argmin(losses))
        split_patients[best_split].append(pt)
        lq_counts[best_split] += int(n_lq)
        total_counts[best_split] += int(n_hq)

    def _patients_frame(patients):
        # An empty list would give a frame without columns and make the
        # summary prints below fail with KeyError, so keep the column layout.
        if patients:
            return pd.DataFrame(patients)
        return pd.DataFrame(columns=pt_data.columns)

    train_pt_df, test_pt_df, val_pt_df = (_patients_frame(p) for p in split_patients)

    print(f"Test high quality: {test_pt_df['noHQrecsNotUndecided'].sum()} low quality: {test_pt_df['noLQrecs'].sum()} ")
    print(
        f"Train high quality: {train_pt_df['noHQrecsNotUndecided'].sum()} low quality: {train_pt_df['noLQrecs'].sum()} ")
    print(f"Val high quality: {val_pt_df['noHQrecsNotUndecided'].sum()} low quality: {val_pt_df['noLQrecs'].sum()}")

    train_dataset = None
    test_dataset = None
    val_dataset = None

    # .copy() so callers can add columns (e.g. predictions) without writing
    # into a slice of ecg_data.
    if not train_pt_df.empty:
        train_dataset = ecg_data[(ecg_data["ptID"].isin(train_pt_df["ptID"]))].copy()

    if not test_pt_df.empty:
        test_dataset = ecg_data[
            (ecg_data["ptID"].isin(test_pt_df["ptID"])) & (ecg_data["measDiag"] != DiagEnum.Undecided)].copy()

    if not val_pt_df.empty:
        val_dataset = ecg_data[
            (ecg_data["ptID"].isin(val_pt_df["ptID"])) & (ecg_data["measDiag"] != DiagEnum.Undecided)].copy()

    return train_dataset, test_dataset, val_dataset

train_dataset, test_dataset, val_dataset = make_SAFER_dataloaders(feas2_pt_data, feas2_ecg_data, test_frac=0.15, val_frac=0.15, only_clean_training=False)

### An aside on optimising the length transform

In [None]:
lt_yoav = nd._length_transfrom(test_dataset["data"][0], 100)

In [None]:
lt_mine = nd._length_transform_faster(test_dataset["data"][0], 100)

In [None]:
def test_code():
    """Run the batch noise detector on the first 20 test ECGs (workload for the profiler below)."""
    ecgs = test_dataset["data"].head(20)
    nd.is_noisy_batch(ecgs, fs=300)

import cProfile
# Profile test_code() and print the cProfile report.
cProfile.run('test_code()')

### Run Yoav's classifier and analyse the results

In [None]:
ecgs = val_dataset["data"]
val_dataset["predictions"] = nd.is_noisy_batch(ecgs, fs=300, filter=False)

In [None]:
val_dataset["predictions"] = val_dataset["predictions"].astype(int)

Plot examples of the data, with window-by-window classification of the errors.

In [None]:
i = 100

print(test_dataset.iloc[i]["class"])
nd.plot_ecg(test_dataset.iloc[i]["data"], fs=300)
plt.show()

In [None]:
conf_mat = confusion_matrix(val_dataset["class_index"], val_dataset["predictions"])

def F1_ind(conf_mat, ind):
    """Return the F1 score of class `ind` from a confusion matrix.

    Computed as 2 * TP / (row sum + column sum) for row/column `ind`.
    """
    true_positives = conf_mat[ind, ind]
    row_total = np.sum(conf_mat[ind])
    column_total = np.sum(conf_mat[:, ind])
    return (2 * true_positives) / (row_total + column_total)

print(conf_mat)
print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Other F1: {F1_ind(conf_mat, 1)}")

# ConfusionMatrixDisplay.from_predictions(test_dataset["class_index"], test_dataset["predictions"], display_labels=["sufficient quality", "insufficient quality"], cmap="inferno")
# plt.show()

In [None]:
print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

In [None]:
# The predictions (and conf_mat) were computed on val_dataset, so the error
# analysis has to use the same frame; test_dataset has no "predictions" column
# at this point.
predicted_noisy = val_dataset["predictions"] == 1
predicted_clean = val_dataset["predictions"] == 0
labelled_noisy = val_dataset["class_index"] == 1
labelled_clean = val_dataset["class_index"] == 0

false_positives = val_dataset[predicted_noisy & labelled_clean]
false_negatives = val_dataset[predicted_clean & labelled_noisy]
noisy = val_dataset[predicted_noisy & labelled_noisy]

In [None]:
nd.plot_ecg(false_positives.iloc[50]["data"], fs=300)

In [None]:
nd.plot_ecg(false_negatives.iloc[0]["data"], fs=300)

In [None]:
nd.plot_ecg(noisy.iloc[0]["data"], fs=300)

In [None]:
# Try on SAFER data
ecgs = feas2_ecg_data["data"]
feas2_ecg_data["predictions"] = nd.is_noisy_batch(ecgs, fs=300, filter=False)

In [None]:
feas2_ecg_data["predictions"] = feas2_ecg_data["predictions"].astype(int)
feas2_ecg_data["class_index"] = feas2_ecg_data["measDiag"].map(lambda x: int(x == DiagEnum.PoorQuality))

In [None]:
conf_mat = confusion_matrix(feas2_ecg_data["class_index"], feas2_ecg_data["predictions"])

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Noisy F1: {F1_ind(conf_mat, 1)}")

ConfusionMatrixDisplay.from_predictions(feas2_ecg_data["class_index"], feas2_ecg_data["predictions"], display_labels=["sufficient quality", "insufficient quality"], cmap="inferno")
plt.show()

In [None]:
false_positives = feas2_ecg_data[(feas2_ecg_data["predictions"] == 1) & (feas2_ecg_data["class_index"] == 0)]

In [None]:
nd.plot_ecg(false_positives.iloc[70]["data"], fs=300)

## Try Training the SVM ourselves and see what the results are

### First, split the ECGs into segments and compute the features

In [None]:
# import ecg_noise_detector.src.ecg_noise_detector.noiseDetector as nd
import importlib
importlib.reload(nd)

In [None]:
train_ecg_df = train_dataset
test_ecg_df = test_dataset
val_ecg_df = val_dataset

In [None]:
train_ecg_df["measDiag"].value_counts()

In [None]:
# Extract features for 5s segments of the data using Yoav's code

def process_ecgs(dataset):
    """Compute segment-level features for every ECG record in `dataset`.

    Each row's "data" is passed to ``nd._process_ecg`` (fs=300, no filtering).
    The returned frame is presumably one row per segment — TODO confirm against
    noiseDetector. The record's class_index, index label, measDiag and the
    start sample of each segment are attached to it.

    Returns:
        The per-record frames concatenated with the record index as the outer
        index level.
    """
    n_records = len(dataset.index)
    stride = int(2.5*300)  # segment starts are 2.5 s apart at 300 Hz
    ecg_features = []

    for i, (ind, x) in enumerate(dataset.iterrows()):
        # Single-line progress indicator (the \r rewrites the same line).
        print(f"Processing ecg {i}/{n_records}\r", end="")
        df = nd._process_ecg(x["data"], fs=300, filter=False)
        df["class_index"] = x["class_index"]
        df["ecg_ind"] = ind

        # Start sample of each segment; the last start leaves room for a full 5 s window.
        df["ecg_start"] = np.arange(0, int(len(x["data"]) + (- 5 + 2.5)*300), stride)
        df["measDiag"] = x["measDiag"]
        ecg_features.append(df)

    return pd.concat(ecg_features, keys=dataset.index)

train_dataset = process_ecgs(train_ecg_df)

In [None]:
# Extract features for 5s segments of the data using Yoav's code
from multiprocessing import Pool
import tqdm

def process_ecgs_multicore(dataset, processes=12):
    """Compute segment-level features for every ECG record using a process pool.

    Args:
        dataset: DataFrame of ECG records; each ``(index, row)`` pair from
            ``iterrows()`` is handed to ``nd._process_single_ecg``.
        processes: number of worker processes (default 12, as before).

    Returns:
        The per-record results concatenated with the record index as the outer
        index level.
    """
    ecg_features = []

    with Pool(processes=processes) as pool:
        # imap (not imap_unordered) yields results in input order. The concat
        # below labels them with dataset.index, so any other order would attach
        # features to the wrong record.
        results = pool.imap(nd._process_single_ecg, dataset.iterrows(), chunksize=32)
        for f in tqdm.tqdm(results, total=len(dataset.index)):
            ecg_features.append(f)

    return pd.concat(ecg_features, keys=dataset.index)

# train_dataset = process_ecgs_multicore(train_ecg_df)
test_dataset = process_ecgs_multicore(test_ecg_df)
val_dataset = process_ecgs_multicore(val_ecg_df)

In [None]:
import os
pk_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
train_dataset.to_pickle(os.path.join(pk_path, "safer_yeov_processed_4.pk"))
test_dataset.to_pickle(os.path.join(pk_path, "safer_yeov_processed_test_4.pk"))
val_dataset.to_pickle(os.path.join(pk_path, "safer_yeov_processed_val_4.pk"))

### Load the dataset of precomputed features

In [None]:
import os
pk_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
train_dataset = pd.read_pickle(os.path.join(pk_path, "safer_yeov_processed_4.pk"))
test_dataset = pd.read_pickle(os.path.join(pk_path, "safer_yeov_processed_test_4.pk"))
val_dataset = pd.read_pickle(os.path.join(pk_path, "safer_yeov_processed_val_4.pk"))

In [None]:
# Drop segments whose record has an Undecided diagnosis. .copy() makes each
# result an independent frame, so later column assignments (normalisation,
# predictions) are not chained assignments on a slice.
train_dataset = train_dataset[train_dataset["measDiag"] != DiagEnum.Undecided].copy()
test_dataset = test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided].copy()
val_dataset = val_dataset[val_dataset["measDiag"] != DiagEnum.Undecided].copy()
# A full dataset for cross validation/other processing
total_dataset = pd.concat([train_dataset, test_dataset, val_dataset])

In [None]:
# Normalise some features to have 0 mean and variance 1 (other elements are already limited/normalised)

var_1_features = ["sSQI", "kSQI"]
for f in var_1_features:
    # The statistics come from the combined (train + test + val) frame and are
    # taken before any of the frames is rescaled.
    feature_mean = total_dataset[f].mean()
    feature_std = total_dataset[f].std()
    for frame in (train_dataset, test_dataset, val_dataset, total_dataset):
        frame[f] = (frame[f] - feature_mean)/feature_std

In [None]:
# Save the data as a matlab file with the raw ECG segments for the signal processing selection/relabelling

mat_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
cut_size = int(2.5 * 300)

total_dataset['ecg'] = total_dataset.apply(lambda x: feas2_ecg_data.loc[x["ecg_ind"]]["data"][x["ecg_start"]:x["ecg_start"]+cut_size], axis=1)
matlab_dict = total_dataset.to_dict("list")

import scipy.io

scipy.io.savemat(os.path.join(mat_path, "safer_yeov_processed_norm_3.mat"), matlab_dict, appendmat=True, format='5', long_field_names=False, do_compression=False, oned_as='row')

In [None]:
# Load selection/relabelling results from Matlab

import scipy.io
import os

mat_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
rejection_list = scipy.io.loadmat(os.path.join(mat_path, "safer_yeov_processed_rejection_total.mat"))["noisy_samples"][:, 0]

In [None]:
# Attach the per-segment noise score loaded from Matlab.
# NOTE(review): rejection_list is assumed to be in total_dataset row order — confirm against the Matlab export.
total_dataset['sig_proc_noise_score'] = rejection_list
train_dataset = train_dataset[train_dataset["measDiag"] != DiagEnum.Undecided].copy()
test_dataset = test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided].copy()
# The following cells read the score from train_dataset, which never received
# it, so copy it across by index label (the test scores are loaded separately
# from their own .mat file further down).
train_dataset["sig_proc_noise_score"] = total_dataset["sig_proc_noise_score"].reindex(train_dataset.index)

In [None]:
# Investigate the results

plt.hist(train_dataset[train_dataset["class_index"] == 0]["sig_proc_noise_score"], bins=np.linspace(0, 750, 75))
plt.hist(train_dataset[train_dataset["class_index"] == 1]["sig_proc_noise_score"], bins=np.linspace(0, 750, 75))
plt.show()

In [None]:
signal_proc_sel  = ((train_dataset['sig_proc_noise_score'] < 50) & (train_dataset["class_index"] == 0)) | ((train_dataset["sig_proc_noise_score"] > 100) & (train_dataset["class_index"] == 1))
print(signal_proc_sel.value_counts())

train_dataset_selected = train_dataset[signal_proc_sel]

# signal_proc_relabel = train_dataset["sig_proc_noise_score"] > 100
# train_dataset["class_index_sig_proc"] = signal_proc_relabel.astype(int)

In [None]:
signal_proc_sel = ((total_dataset['sig_proc_noise_score'] < 50) & (total_dataset["class_index"] == 0)) | (
            (total_dataset["sig_proc_noise_score"] > 100) & (total_dataset["class_index"] == 1
))
print(signal_proc_sel.value_counts())
total_dataset_selected = total_dataset[signal_proc_sel]

# signal_proc_relabel = train_dataset["sig_proc_noise_score"] > 100
# train_dataset["class_index_sig_proc"] = signal_proc_relabel.astype(int)

In [None]:
train_dataset_selected = train_dataset

In [None]:
df = total_dataset

In [None]:
# train the model
class_weights = 1/train_dataset_selected["class_index"].value_counts()
class_weights /= np.sum(class_weights)
print(class_weights)

model = SVC(class_weight=class_weights.to_dict())
# See documentation for default values e.g. use rbf, regularising C = 1
features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]

model = model.fit(train_dataset_selected[features].values, train_dataset_selected["class_index"].values)

In [None]:
predictions = model.predict(test_dataset[features].values)

predictions_series = pd.Series(data=predictions, index=test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided].index)
test_dataset["predictions"] = predictions_series

# Select the values with more than 50% noisy as overall noisy
results_df = test_dataset[test_dataset["measDiag"] != DiagEnum.Undecided].groupby(level=0).mean()

conf_mat = confusion_matrix(results_df["class_index"], results_df["predictions"].round())
print(conf_mat)

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Noisy F1: {F1_ind(conf_mat, 1)}")

In [None]:
# How good is the signal processing alone

sig_proc_results_df = train_dataset[train_dataset["measDiag"] != DiagEnum.Undecided].groupby(level=0).mean()
conf_mat = confusion_matrix(sig_proc_results_df["class_index"], sig_proc_results_df["sig_proc_noise_score"] > 20)

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Noisy F1: {F1_ind(conf_mat, 1)}")

In [None]:
from sklearn.metrics import precision_recall_curve
import plotly.graph_objects as go

p, r, d = precision_recall_curve(sig_proc_results_df["class_index"], sig_proc_results_df["sig_proc_noise_score"])

F1 = 2 * p * r /(p + r)

fig = go.Figure()
fig.add_trace(go.Scatter(x=r, y=p, hovertext=[f"decision boundary: {x:.2f}\nF1 score: {f:.03f}" for x, f in zip(d, F1)]))

fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig.show()

# Why are the F values different!

In [None]:
def F1_ind(conf_mat, ind):
    """Return the F1 score of class `ind` from a confusion matrix.

    Computed as 2 * TP / (row sum + column sum) for row/column `ind`.
    """
    numerator = 2 * conf_mat[ind, ind]
    denominator = np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind])
    return numerator/denominator

In [None]:
mat_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
cut_size = int(2.5 * 300)

test_dataset['ecg'] = test_dataset.apply(lambda x: feas2_ecg_data.loc[x["ecg_ind"]]["data"][x["ecg_start"]:x["ecg_start"]+cut_size], axis=1)
matlab_dict = test_dataset.to_dict("list")

import scipy.io

scipy.io.savemat(os.path.join(mat_path, "safer_yeov_processed_norm_3_test.mat"), matlab_dict, appendmat=True, format='5', long_field_names=False, do_compression=False, oned_as='row')

In [None]:
import scipy.io
import os

mat_path = r"D:\2022_23_DSiromani\Feas2\ECGs"
rejection_list = scipy.io.loadmat(os.path.join(mat_path, "safer_yeov_processed_rejection_test.mat"))["noisy_samples"][:, 0]
test_dataset['sig_proc_noise_score'] = rejection_list

In [None]:
signal_proc_sel  = ((test_dataset['sig_proc_noise_score'] < 50) & (test_dataset["class_index"] == 0)) | ((test_dataset["sig_proc_noise_score"] > 100) & (test_dataset["class_index"] == 1))
print(signal_proc_sel.value_counts())

test_dataset_selected = test_dataset[signal_proc_sel]

In [None]:
df = total_dataset_selected

In [None]:
# df = pd.concat([train_dataset, test_dataset])
# Look up the patient ID of every segment from the record it was cut from.
# Series.map does the lookup by index label in one vectorised pass instead of
# a Python loop over every row.
df["ptID"] = df["ecg_ind"].map(feas2_ecg_data["ptID"])

In [None]:
# Per-patient counts taken from df (one row per segment), aligned on the patient index.
feas2_pt_data["noRecs"] = df["ptID"].value_counts()
feas2_pt_data["noLQrecs"] = df[df["class_index"] == 1]["ptID"].value_counts()
# Patients missing from a count get 0 rather than NaN, so sorting and later
# integer arithmetic on these columns are well defined.
feas2_pt_data[["noRecs", "noLQrecs"]] = feas2_pt_data[["noRecs", "noLQrecs"]].fillna(0)

num_folds = 5

sorted_pts = feas2_pt_data.sort_values("noLQrecs", axis=0)

# Go around the folds and assign patients to each (round robin over the sorted
# patients). Slicing keeps the column layout even if a fold ends up empty.
test_pt_folds = [sorted_pts.iloc[k::num_folds] for k in range(num_folds)]
train_pt_folds = [feas2_pt_data[~feas2_pt_data["ptID"].isin(fold["ptID"])] for fold in test_pt_folds]

features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]

conf_mats = []

decided = df["measDiag"] != DiagEnum.Undecided

for i, (train_pt_df, test_pt_df) in enumerate(zip(train_pt_folds, test_pt_folds)):
    print(f"======== Split {i} ========")
    # Fold-local names: the notebook-level train_dataset/test_dataset must not
    # be overwritten here, because later cells read them and would otherwise
    # silently get the last fold's subsets.
    fold_train = df[df["ptID"].isin(train_pt_df["ptID"]) & decided]
    fold_test = df[df["ptID"].isin(test_pt_df["ptID"]) & decided].copy()

    model = SVC(class_weight='balanced')
    model = model.fit(fold_train[features].values, fold_train["class_index"].values)

    predictions = model.predict(fold_test[features].values)
    fold_test["predictions"] = predictions

    # Average the segment labels/predictions per record; a record counts as
    # noisy when its mean prediction rounds to 1. Only the two numeric columns
    # are averaged, since the frame also holds non-numeric columns.
    results_df = fold_test.groupby(level=0)[["class_index", "predictions"]].mean()
    results_df["predictions"] = results_df["predictions"].round()

    # labels=[0, 1] keeps the matrix 2x2 even if a fold lacks one class.
    conf_mat = confusion_matrix(results_df["class_index"], results_df["predictions"], labels=[0, 1])
    conf_mats.append(conf_mat)

In [None]:
df["class_index"].value_counts(dropna=False)

In [None]:
num_cv = 4
num_folds = 2 * num_cv  # twice to produce the val and test for each fold!
fold_positions = [[] for _ in range(num_folds)]

lq_counts = np.zeros(num_folds, dtype=int)
total_counts = np.zeros(num_folds, dtype=int)

total_total_count = 0
total_lq_count = 0

# Missing counts are treated as 0; a NaN would poison the losses and cannot be
# added to the integer accumulators.
# NOTE(review): noLQrecs may have been recomputed from segment-level rows in an
# earlier cell, while noHQrecsNotUndecided counts records — confirm the units match.
pt_counts = feas2_pt_data[["noLQrecs", "noHQrecsNotUndecided"]].fillna(0)

# Go around the patients and greedily assign each to the fold that keeps the
# fold sizes closest to an even share.
for pos, (n_lq, n_hq) in enumerate(zip(pt_counts["noLQrecs"], pt_counts["noHQrecsNotUndecided"])):
    n_total = n_hq + n_lq
    total_total_count += n_total
    total_lq_count += n_lq

    exp_total_counts = total_total_count * 1.0/num_folds
    exp_lq_counts = total_lq_count * 1.0/num_folds

    # Row k is the outcome of adding this patient to fold k. The candidate
    # total adds HQ + LQ records, matching how total_counts is accumulated.
    candidate = np.eye(num_folds)
    loss = np.sum(np.abs(lq_counts[None, :] + n_lq * candidate - exp_lq_counts)
                  + np.abs(total_counts[None, :] + n_total * candidate - exp_total_counts), axis=-1)
    best_fold = np.argmin(loss)

    fold_positions[best_fold].append(pos)
    lq_counts[best_fold] += int(n_lq)
    total_counts[best_fold] += int(n_total)

# iloc keeps the column layout even for a fold that received no patients.
pt_folds = [feas2_pt_data.iloc[positions] for positions in fold_positions]

# First num_cv folds are test folds, the remaining num_cv are validation folds.
test_pt_folds = pt_folds[:num_cv]
val_pt_folds = pt_folds[num_cv:]
train_pt_folds = [feas2_pt_data[(~feas2_pt_data["ptID"].isin(test_fold["ptID"])) & (~feas2_pt_data["ptID"].isin(val_fold["ptID"]))] for test_fold, val_fold in zip(test_pt_folds, val_pt_folds)]

for f in test_pt_folds:
    print(f["noLQrecs"].sum(), f["noHQrecsNotUndecided"].sum())

for f in val_pt_folds:
    print(f["noLQrecs"].sum(), f["noHQrecsNotUndecided"].sum())

conf_mats = []

num_epochs = 30  # not used by the SVM loop below

decided = df["measDiag"] != DiagEnum.Undecided

for i, (train_pt_df, test_pt_df, val_pt_df) in enumerate(zip(train_pt_folds, test_pt_folds, val_pt_folds)):
    print(f"Fold {i}")
    train_df = df[(df["ptID"].isin(train_pt_df["ptID"])) & decided]
    # Copies, because a predictions column is added to these two frames below.
    test_df = df[(df["ptID"].isin(test_pt_df["ptID"])) & decided].copy()
    val_df = df[(df["ptID"].isin(val_pt_df["ptID"])) & decided].copy()

    print(train_df["class_index"].value_counts())
    print(test_df["class_index"].value_counts())
    print(val_df["class_index"].value_counts())

    model = SVC(class_weight='balanced')
    model = model.fit(train_df[features].values, train_df["class_index"].values)

    predictions = model.predict(val_df[features].values)
    print(len(predictions))

    test_predictions = model.predict(test_df[features].values)

    # Record-level result: mean over the record's segments, rounded to 0/1.
    val_df["predictions"] = predictions
    results_df = val_df.groupby(level=0)[["class_index", "predictions"]].mean()
    results_df["predictions"] = results_df["predictions"].round()

    # Test results come from test_df (the original grouped val_df a second time).
    test_df["predictions"] = test_predictions
    test_results_df = test_df.groupby(level=0)[["class_index", "predictions"]].mean()
    test_results_df["predictions"] = test_results_df["predictions"].round()

    print(results_df["predictions"].value_counts(dropna=False))

    # labels=[0, 1] keeps the matrix 2x2 even if a fold lacks one class.
    conf_mat = confusion_matrix(results_df["class_index"], results_df["predictions"], labels=[0, 1])
    conf_mats.append(conf_mat)

In [None]:
print(f"Normal F1: {np.mean([F1_ind(c, 0) for c in conf_mats])}")
print(f"Poor quality F1: {np.mean([F1_ind(c, 1) for c in conf_mats])}")

In [None]:
[F1_ind(c, 1) for c in conf_mats]

### Try putting features for each section into the classifier as one input

In [None]:
# Construct test and train matrices

features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]


def _collect_record_features(segments, feature_names):
    """Gather one flattened feature vector and one label per record.

    `segments` is indexed by (record id, segment number). Records are visited
    in set-iteration order of the unique record ids. Returns three parallel
    lists: the flattened feature values of all the record's segments, the
    class_index of the record's segment 0, and the record ids.
    """
    record_ids = list(set(segments.index.get_level_values(0)))
    vectors = [segments.loc[rec][feature_names].values.flatten() for rec in record_ids]
    labels = [segments.loc[(rec, 0)]["class_index"] for rec in record_ids]
    return vectors, labels, record_ids


train_matrix, train_targets, indexes = _collect_record_features(train_dataset, features)
train_matrix = np.array(train_matrix)
train_targets = np.array(train_targets)

print(train_matrix.shape)
print(train_targets.shape)

test_matrix, test_targets = _collect_record_features(test_dataset, features)[:2]
test_matrix = np.array(test_matrix)
test_targets = np.array(test_targets)

val_matrix, val_targets = _collect_record_features(val_dataset, features)[:2]
val_matrix = np.array(val_matrix)
val_targets = np.array(val_targets)

print(test_matrix.shape)

In [None]:
# train the model
class_weights = 1/train_dataset["class_index"].value_counts()
class_weights /= np.sum(class_weights)
print(class_weights)

model = SVC(class_weight=class_weights.to_dict(), probability=True)
# See documentation for default values e.g. use rbf, regularising C = 1
features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]

model = model.fit(train_matrix, train_targets)

In [None]:
predictions = model.predict_proba(val_matrix)

ConfusionMatrixDisplay.from_predictions(val_targets, np.round(predictions[:, 1]), display_labels=["sufficient quality", "insufficient quality"], cmap="inferno")
plt.show()
conf_mat = confusion_matrix(val_targets, np.round(predictions[:, 1]))

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(conf_mat)
print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Other F1: {F1_ind(conf_mat, 1)}")

In [None]:
p, r, d = precision_recall_curve(val_targets, predictions[:, 1])

F1 = 2 * p * r /(p + r)

fig = go.Figure()
fig.add_trace(go.Scatter(x=r, y=p, hovertext=[f"decision boundary: {x:.2f}\nF1 score: {f:.03f}" for x, f in zip(d, F1)]))

fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig.show()

In [None]:
print(f"Sensitivity: {conf_mat[1, 1]/np.sum(conf_mat[1])}")
print(f"Specificity: {conf_mat[0, 0]/np.sum(conf_mat[0])}")

In [None]:
# Record ids in the same set-iteration order that was used to build test_matrix.
test_record_ids = np.array(list(set(test_dataset.index.get_level_values(0))))

# Hard class predictions for the *test* records. The `predictions` variable
# holds class probabilities for the validation set, so it cannot be compared
# with test_targets.
test_predictions = model.predict(test_matrix)
false_positive_mask = (test_targets == 0) & (test_predictions == 1)

false_positives = test_record_ids[false_positive_mask]
print(false_positives[0])

print(test_matrix[false_positive_mask][1])

# NOTE(review): test_dataset is built from the SAFER records at this point, so
# the raw signal is read from feas2_ecg_data rather than the CinC `dataset` frame — confirm.
plt.plot(feas2_ecg_data["data"].loc[false_positives[1]])
plt.show()

Why am I getting poorer accuracy here?
 - Is it because the SVM cannot cope with so many input variables?

### Put all the segments into an LSTM classifier

In [None]:
import torch
import torch.nn as nn

In [None]:
from torch.utils.data import Dataset, DataLoader

class Dataset(torch.utils.data.Dataset):
    """PyTorch dataset backed by a DataFrame with "data", "class_index" and "ind" columns."""

    def __init__(self, dataset):
        """Keep a reference to the backing DataFrame."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataset.index)

    def __getitem__(self, index):
        """Return the (data, class_index, ind) triple of the row at position `index`."""
        sample = self.dataset.iloc[index]
        return sample["data"], sample["class_index"], sample["ind"]

In [None]:
features = ["sSQI", "kSQI", "pSQI", "basSQI", "bSQI", "rSQI"]


def _collect_record_features(segments, feature_names):
    """Gather one flattened feature vector and one label per record.

    `segments` is indexed by (record id, segment number). Records are visited
    in set-iteration order of the unique record ids. Returns three parallel
    lists: the flattened feature values of all the record's segments, the
    class_index of the record's segment 0, and the record ids.
    """
    record_ids = list(set(segments.index.get_level_values(0)))
    vectors = [segments.loc[rec][feature_names].values.flatten() for rec in record_ids]
    labels = [segments.loc[(rec, 0)]["class_index"] for rec in record_ids]
    return vectors, labels, record_ids


# One row per record: flattened segment features, label and record id.
train_matrix, train_targets, indexes = _collect_record_features(train_dataset, features)
train_df = pd.DataFrame({"data": train_matrix, "class_index": train_targets, "ind": indexes})
train_dataloader = DataLoader(Dataset(train_df), batch_size=32, shuffle=True, pin_memory=True)

test_matrix, test_targets, indexes = _collect_record_features(test_dataset, features)
test_df = pd.DataFrame({"data": test_matrix, "class_index": test_targets, "ind": indexes})
test_dataloader = DataLoader(Dataset(test_df), batch_size=32, shuffle=True, pin_memory=True)

In [None]:
if torch.cuda.is_available():
    print("Using Cuda")
    device = torch.device("cuda")
else:
    print("Using CPU")
    device = torch.device("cpu")

In [None]:
class LSTM_Classifier(nn.Module):
    """Bidirectional LSTM over a sequence of feature vectors, producing one logit per sequence.

    Args:
        n_input: number of features per sequence step (default 6).
        n_hidden: LSTM hidden size per direction (default 16).
        num_layers: number of stacked LSTM layers (default 2).
    """

    def __init__(self, n_input=6, n_hidden=16, num_layers=2):
        super(LSTM_Classifier, self).__init__()
        self.lstm_n_hidden = n_hidden
        self.lstm_n_input = n_input
        self.lstm_n_layers = num_layers
        self.lstm = nn.LSTM(input_size=self.lstm_n_input, hidden_size=self.lstm_n_hidden, bidirectional=True, batch_first=True, num_layers=self.lstm_n_layers)

        # Classifier head on the concatenated forward/backward final states.
        self.linear1 = nn.Linear(self.lstm_n_hidden * 2, self.lstm_n_hidden*2)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(self.lstm_n_hidden*2, 1)

    def init_lstm_hidden(self, batch_size, device):
        """Return zero (hidden, cell) states of shape (2 * num_layers, batch_size, n_hidden)."""
        # Two directions per layer; derived from num_layers instead of a literal 4.
        n_states = 2 * self.lstm_n_layers
        hidden_state = torch.zeros(n_states, batch_size, self.lstm_n_hidden, device=device)
        cell_state = torch.zeros(n_states, batch_size, self.lstm_n_hidden, device=device)
        return (hidden_state, cell_state)

    def forward(self, x):
        """Map x of shape (batch, steps, n_input) to logits of shape (batch,)."""
        # Fresh zero states on every call, so nothing carries over between batches.
        _, (h, _) = self.lstm(x, self.init_lstm_hidden(x.shape[0], x.device))
        # h is (2 * num_layers, batch, hidden). Keep the last layer's forward
        # and backward final states and concatenate them: (batch, 2 * hidden).
        h = torch.flatten(torch.transpose(h, 0, 1)[:, -2:, :], 1, 2)

        pred = self.linear1(h)
        pred = self.activation(pred)
        pred = self.linear2(pred)
        return pred[:, 0]


In [None]:
num_epochs = 10
model = LSTM_Classifier().to(device)

# Use weightings to handle class imbalance.
# Look the counts up by class label: value_counts() is ordered by frequency,
# so positional indexing only works while class 1 is the rarer class.
class_counts = train_df["class_index"].value_counts()
n_negative = float(class_counts.get(0, 0))
n_positive = float(class_counts.get(1, 0))
if n_positive == 0:
    raise ValueError("train_df contains no class 1 records; cannot compute pos_weight")

# Same weighting as before: (positives + negatives) / positives. The tensor is
# created on `device` so it sits alongside the model outputs.
class_weights = torch.tensor((n_positive + n_negative) / n_positive, dtype=torch.float32, device=device)
loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)

In [None]:
import copy
model = model.to(device)

def train(model):
    """Train `model` and keep the weights from the epoch with the lowest test loss.

    Uses the notebook-level train_dataloader, test_dataloader, optimizer,
    loss_func, device and num_epochs.

    Returns:
        ``(best_model, losses)``: best_model is a CPU deep copy of the model
        taken at the epoch with the lowest average test loss; losses holds
        ``[average train loss, average test loss]`` for every epoch.
    """
    def _prepare(batch_features, batch_labels):
        # Flat per-record vectors -> (batch, steps, 6) sequences on the target device.
        inputs = batch_features.to(device).float()
        inputs = torch.reshape(inputs, (inputs.shape[0], -1, 6))
        return inputs, batch_labels.float().to(device)

    # Start from +inf so the first evaluated epoch always becomes the baseline
    # (a fixed sentinel such as 100 would reject any model whose loss is above it).
    best_test_loss = float("inf")
    best_model = copy.deepcopy(model).cpu()

    losses = []

    for epoch in range(num_epochs):
        total_loss = 0
        print(f"starting epoch {epoch} ...")
        # Train
        num_batches = 0
        model.train()
        for batch_features, batch_labels, _ in train_dataloader:
            inputs, labels = _prepare(batch_features, batch_labels)

            optimizer.zero_grad()
            output = model(inputs)
            loss = loss_func(output, labels)
            loss.backward()
            optimizer.step()
            num_batches += 1
            total_loss += float(loss)

        print(f"Epoch {epoch} finished with average loss {total_loss/num_batches}")
        print("Testing ...")
        # Test
        num_test_batches = 0
        test_loss = 0
        with torch.no_grad():
            model.eval()
            for batch_features, batch_labels, _ in test_dataloader:
                inputs, labels = _prepare(batch_features, batch_labels)
                output = model(inputs)
                loss = loss_func(output, labels)
                test_loss += float(loss)
                num_test_batches += 1

        average_test_loss = test_loss/num_test_batches
        print(f"Average test loss: {average_test_loss}")

        if average_test_loss < best_test_loss:
            best_model = copy.deepcopy(model).cpu()
            best_test_loss = average_test_loss

        losses.append([total_loss/num_batches, average_test_loss])

    return best_model, losses

model, losses = train(model)
model = model.to(device)

In [None]:
# Plot test data reconstruction
# Collect the model output (logit) for every test record, keyed by record id.
predicted_logits = {}

with torch.no_grad():
    model.eval()
    for signals, _, ind in test_dataloader:
        signals = signals.to(device).float()
        signals = torch.reshape(signals, (signals.shape[0], -1, 6))

        output = model(signals).detach().cpu().numpy()

        for record_id, logit in zip(ind, output):
            predicted_logits[int(record_id)] = logit

# `ind` is the record id stored in the "ind" column, not a row label of test_df,
# so match on that column; records without a prediction stay NaN.
test_df["prediction"] = test_df["ind"].map(predicted_logits)

test_df_predicted = test_df.dropna(subset=["prediction"])

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

test_targets = test_df_predicted["class_index"].to_numpy(dtype=float)
predictions = sigmoid(test_df_predicted["prediction"].to_numpy(dtype=float))

ConfusionMatrixDisplay.from_predictions(test_targets, np.round(predictions), display_labels=["sufficient quality", "insufficient quality"], cmap="inferno")
plt.show()
conf_mat = confusion_matrix(test_targets, np.round(predictions))

def F1_ind(conf_mat, ind):
    return (2 * conf_mat[ind, ind])/(np.sum(conf_mat[ind]) + np.sum(conf_mat[:, ind]))

print(f"Normal F1: {F1_ind(conf_mat, 0)}")
print(f"Other F1: {F1_ind(conf_mat, 1)}")

In [None]:
from sklearn.metrics import precision_recall_curve
import plotly.graph_objects as go

p, r, d = precision_recall_curve(test_targets, predictions)

F1 = 2 * p * r /(p + r)

fig = go.Figure()
fig.add_trace(go.Scatter(x=r, y=p, hovertext=[f"decision boundary: {x:.2f}\nF1 score: {f:.03f}" for x, f in zip(d, F1)]))

fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig.show()