Steps of the pipeline in this notebook:

1. Load raw data
2. Create preprocessed data
3. Get predictions of a model
4. Dimensionality reduction (UMAP)
5. Find the cluster of interest among the false negatives (FN) in the dimensionality-reduced space
6. Plot the cluster of interest in raw data space

In [9]:
import pandas as pd
import os
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Single seed reused for the class-balancing sample, the train/test split and the SVM
random_seed = 10

In [2]:
# 1. Load raw data

def load_all_partitions(directory):
    """Load every CSV file found directly in `directory` into one DataFrame.

    Files are read in sorted path order. `glob.glob` returns matches in
    arbitrary (filesystem-dependent) order, and the row order of the result
    feeds the seeded sampling and splitting done later, so an unsorted read
    would make those steps irreproducible across machines.

    Parameters
    ----------
    directory : str or path-like
        Folder containing the ``*.csv`` partition files.

    Returns
    -------
    pandas.DataFrame
        Rows of all partitions stacked in file order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If `directory` contains no ``*.csv`` file.
    """
    all_files = sorted(glob.glob(os.path.join(directory, '*.csv')))
    if not all_files:
        # pd.concat would also raise ValueError here, but with the opaque
        # message "No objects to concatenate"; name the real problem instead.
        raise ValueError(f"No CSV files found in '{directory}'")
    df_list = [pd.read_csv(file) for file in all_files]
    return pd.concat(df_list, ignore_index=True)

x_raw = load_all_partitions('data/AF-Raw-Data/AF Data/all_raw_data_csv')

# Remove duplicate rows and rows in which any value is >= 1800.
# The threshold check runs over every column, Class_Label included.
n_before = x_raw.shape[0]
x_raw = x_raw.drop_duplicates()
# Reduce along axis=1 instead of transposing: same boolean mask per row,
# without materialising a transposed copy of the whole frame.
x_raw = x_raw[(x_raw < 1800).all(axis=1)]
print(f"Removed {n_before - x_raw.shape[0]} rows with values > 1800 or are duplicate")

# Balance the dataset by down-sampling class 0 to the size of class 1.
# NOTE: .sample() raises if class 0 has fewer rows than class 1.
class_1 = x_raw[x_raw['Class_Label'] == 1]
class_0 = x_raw[x_raw['Class_Label'] == 0].sample(len(class_1), random_state=random_seed)
x_raw = pd.concat([class_1, class_0])
print(f"Balanced dataset: {x_raw.shape[0]} samples in total")

# Fresh 0..n-1 index, reused as a stable per-sample identifier that survives
# the shuffling done by train_test_split.
x_raw = x_raw.reset_index(drop=True)
x_raw['Sample_id'] = x_raw.index

# Split the class label from the features and split the data into train and test.
# Sample_id is kept on both sides so that features and labels can be matched up later.
y_raw = x_raw[['Class_Label', 'Sample_id']]
x_raw = x_raw.drop(columns=['Class_Label'])
x_raw_train, x_raw_test, y_train, y_test = train_test_split(x_raw, y_raw, test_size=0.2, random_state=random_seed)

Removed 136070 rows with values > 1800 or are duplicate
Balanced dataset: 109272 samples in total


In [3]:
# 2. Create preprocessed data

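# Bins of 50 milliseconds are created covering the R-R intervals. For each sample, the number of
# R-R intervals falling into each bin is counted.
# NOTE: this description originally stated 30 bins covering 200 ms up to 1700 ms, but the edges used
# below, np.arange(200, 1751, 50), produce 31 bins covering 200 ms up to 1750 ms.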

# Function to bin and count intervals for a row
def bin_row(row, bin_edges):
    """Count how many values of a single row fall into each bin.

    The last entry of `row` is left out of the counting; for the frames
    passed in by this notebook that entry is the `Sample_id` column.
    Bins are half-open, ``[edge_i, edge_i+1)``. Values below the first edge
    or at/above the last edge are not counted.

    Returns a 1-D integer array with ``len(bin_edges) - 1`` counts.
    """
    values = row[:-1].values
    positions = np.digitize(values, bins=bin_edges, right=False)
    # digitize yields 0 for values below the first edge and len(bin_edges)
    # for values at/above the last one; only positions 1..len(bin_edges)-1
    # correspond to real bins.
    counts_with_overflow = np.bincount(positions, minlength=len(bin_edges) + 1)
    return counts_with_overflow[1:len(bin_edges)]

def preprocess_into_bins(x_data, y_data, bin_edges=None):
    """Convert each sample's raw values into a vector of per-bin counts.

    Every column of `x_data` except `Sample_id` is treated as a value to be
    binned. Bins are half-open, ``[edge_i, edge_i+1)``; values below the
    first edge or at/above the last edge are not counted.

    Parameters
    ----------
    x_data : pandas.DataFrame
        One row per sample; must contain a `Sample_id` column (in any
        position).
    y_data : any
        Unused; kept so existing calls keep working.
    bin_edges : array-like, optional
        Monotonically increasing bin edges. Defaults to
        ``np.arange(200, 1751, 50)``, i.e. 31 bins of width 50 from 200 to 1750.

    Returns
    -------
    pandas.DataFrame
        Indexed like `x_data`, with columns ``bin_1 .. bin_N`` holding the
        counts, followed by `Sample_id`.
    """
    if bin_edges is None:
        bin_edges = np.arange(200, 1751, 50)  # ms
    bin_edges = np.asarray(bin_edges)
    n_bins = len(bin_edges) - 1

    # Select the values by column name rather than by "everything but the
    # last column", so the result does not depend on where Sample_id sits.
    intervals = x_data.drop(columns=['Sample_id']).to_numpy()

    # Bin the whole matrix in one call instead of one Python call per row.
    # Index 0 (below first edge) and len(bin_edges) (at/above last edge)
    # are out-of-range markers and are never counted.
    bin_indices = np.digitize(intervals, bins=bin_edges, right=False)
    counts = np.empty((len(x_data), n_bins), dtype=np.int64)
    for bin_number in range(1, n_bins + 1):
        counts[:, bin_number - 1] = (bin_indices == bin_number).sum(axis=1)

    x_bins = pd.DataFrame(
        counts,
        index=x_data.index,
        columns=[f'bin_{i + 1}' for i in range(n_bins)],
    )
    x_bins['Sample_id'] = x_data['Sample_id']
    return x_bins

x_bins_train = preprocess_into_bins(x_raw_train, y_train)
x_bins_test = preprocess_into_bins(x_raw_test, y_test)

# Sanity checks: binning must keep one row per sample and must not disturb
# the Sample_id alignment that later steps rely on.
assert len(x_bins_train) == len(x_raw_train) and len(x_bins_test) == len(x_raw_test)
assert x_bins_train['Sample_id'].equals(x_raw_train['Sample_id'])
assert x_bins_test['Sample_id'].equals(x_raw_test['Sample_id'])


In [10]:
# 3. Get predictions of a model

# Separate the bin-count features from the Sample_id bookkeeping column
train_features = x_bins_train.drop(columns=['Sample_id'])
test_features = x_bins_test.drop(columns=['Sample_id'])
train_labels = y_train['Class_Label']
test_labels = y_test['Class_Label']

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler().fit(train_features)
x_bins_train_scaled = scaler.transform(train_features)
x_bins_test_scaled = scaler.transform(test_features)

# RBF-kernel SVM trained on the scaled training features
svm = SVC(kernel='rbf', random_state=random_seed).fit(x_bins_train_scaled, train_labels)
y_pred = svm.predict(x_bins_test_scaled)

# Score the predictions on the held-out test split
acc_svm = accuracy_score(test_labels, y_pred)
f1_svm = f1_score(test_labels, y_pred)

print(f"Accuracy: {round(acc_svm,4)}, F1: {round(f1_svm,4)}")

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and binary targets

Accuracy: 0.953, F1: 0.9531
