In [1]:
import os
import json
import pandas as pd

# Directory containing the JSON files
ekg_dir = 'ekg_files'

# List to store all DataFrames
dfs = []
for filename in os.listdir(ekg_dir):
    if filename.endswith('.json'):
        with open(os.path.join(ekg_dir, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Take the first lead (usually "I")
        if 'leads' in data and len(data['leads']) > 0:
            lead = data['leads'][0]
            signal = lead.get('signal', [])
            # Remove None values from the signal
            clean_signal = [x for x in signal if x is not None]
            df_tmp = pd.DataFrame({'Signal': clean_signal})
            dfs.append([filename.replace('_','/').replace('.json',''), df_tmp])

df = pd.read_csv('DANE_mpsi.csv', sep='\t', encoding='utf-8')
kg_to_label = dict(zip(df['KG'], df['zgon']))

# Prepare dataset: list of (signal, label) tuples
signal_label_dataset = []
for kg, signal_df in dfs:
    if kg in kg_to_label:
        label = kg_to_label[kg]
        signal = signal_df['Signal'].values
        signal_label_dataset.append((kg,signal, label))

In [2]:
import numpy as np
from scipy.signal import find_peaks

def extract_rr_intervals(signal, fs=320):
    # Find R-peaks (simple thresholding, may need tuning)
    peaks, _ = find_peaks(signal, distance=fs*0.2)  # at least 200ms between peaks
    rr_intervals = np.diff(peaks) / fs  # in seconds
    return rr_intervals

# For all signals in signal_label_dataset
rr_label_dataset = [(filename, extract_rr_intervals(sig, fs=320), label) for filename,sig, label in signal_label_dataset]

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For all signals in signal_label_dataset
rr_label_dataset = [(filename, extract_rr_intervals(sig, fs=320), label) for filename, sig, label in signal_label_dataset]

# Prepare features and labels
X = [rr for _, rr, label in rr_label_dataset]
y = np.array([label for _, rr, label in rr_label_dataset])

# Pad sequences to the same length for the classifier
max_len = max(len(rr) for rr in X) # Find max length
X_padded = np.array([np.pad(rr, (0, max_len - len(rr)), 'constant', constant_values=0) for rr in X])

pipeline = Pipeline([
	('imputer', SimpleImputer(strategy='mean')),
	# ('scaler', StandardScaler())
])	

X_transformed = pipeline.fit_transform(X_padded)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=1)

models = [
    ('Decision Tree', DecisionTreeClassifier(max_depth=10, random_state=42)),
    ('Random Forest', RandomForestClassifier(max_depth=2, min_samples_leaf=25, random_state=42, n_estimators=500)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=42)),
    ('AdaBoost', AdaBoostClassifier(n_estimators=200, random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('SVM (RBF)', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
]

for name, model in models:
	# Train the model
	model.fit(X_train, y_train)
	
	# Evaluate the model
	train_score = model.score(X_train, y_train)
	test_score = model.score(X_test, y_test)
	
	print(f"{name} train accuracy: {train_score:.3f}")
	print(f"{name} test accuracy: {test_score:.3f}")

Decision Tree train accuracy: 0.848
Decision Tree test accuracy: 0.532
Random Forest train accuracy: 0.683
Random Forest test accuracy: 0.506
Gradient Boosting train accuracy: 0.946
Gradient Boosting test accuracy: 0.506
AdaBoost train accuracy: 0.797
AdaBoost test accuracy: 0.532
Logistic Regression train accuracy: 0.641
Logistic Regression test accuracy: 0.519
SVM (RBF) train accuracy: 0.651
SVM (RBF) test accuracy: 0.519
KNN train accuracy: 0.689
KNN test accuracy: 0.506
Naive Bayes train accuracy: 0.486
Naive Bayes test accuracy: 0.468


In [4]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'degree': [2, 3, 4],
    'coef0': [0.0, 0.1, 0.5, 1.0] 
}

# Initialize base model
svc = SVC(probability=True, random_state=42)

# Perform grid search
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)
print("Test set accuracy:", grid_search.score(X_test, y_test))

Best parameters: {'C': 10, 'coef0': 1.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Best cross-validated accuracy: 0.5777777777777777
Test set accuracy: 0.4810126582278481


Worse than guessing...