In [48]:
import os
import json
import pandas as pd


# Directory containing the JSON files
ekg_dir = 'ekg_files'

# List to store all DataFrames
dfs = []
for filename in os.listdir(ekg_dir):
    if filename.endswith('.json'):
        with open(os.path.join(ekg_dir, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Take the first lead (usually "I")
        if 'leads' in data and len(data['leads']) > 0:
            lead = data['leads'][0]
            signal = lead.get('signal', [])
            # Remove None values from the signal
            clean_signal = [x for x in signal if x is not None]
            df_tmp = pd.DataFrame({'Signal': clean_signal})
            dfs.append([filename.replace('_','/').replace('.json',''), df_tmp])

df = pd.read_csv('DANE_mpsi.csv', sep='\t', encoding='utf-8')
kg_to_label = dict(zip(df['KG'], df['zgon']))

# Prepare dataset: list of (signal, label) tuples
signal_label_dataset = []
for kg, signal_df in dfs:
    if kg in kg_to_label:
        label = kg_to_label[kg]
        signal = signal_df['Signal'].values
        signal_label_dataset.append((kg,signal, label))

In [50]:
import numpy as np
from scipy.signal import find_peaks

def extract_rr_intervals(signal, fs=320):
    # Find R-peaks (simple thresholding, may need tuning for real data)
    peaks, _ = find_peaks(signal, distance=fs*0.2)  # at least 200ms between peaks
    rr_intervals = np.diff(peaks) / fs  # in seconds
    return rr_intervals

# For all signals in signal_label_dataset
rr_label_dataset = [(filename,extract_rr_intervals(sig, fs=320), label) for filename,sig, label in signal_label_dataset]

In [96]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Prepare features and labels
X = [rr for _,rr, label in rr_label_dataset]
y = [label for _,rr, label in rr_label_dataset]

# Pad sequences to the same length for the classifier


# Find max length
max_len = max(len(rr) for rr in X)
X_padded = np.array([np.pad(rr, (0, max_len - len(rr)), 'constant', constant_values=0) for rr in X])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=1)

# Train Decision Tree
clf = DecisionTreeClassifier(max_depth=10,random_state=42)
clf.fit(X_train, y_train)

# Evaluate
score = clf.score(X_test, y_test)
train_score = clf.score(X_train, y_train)
print(f"Decision Tree train accuracy: {train_score:.3f}")
print(f"Decision Tree test accuracy: {score:.3f}")

rf_clf = RandomForestClassifier(max_depth=2,min_samples_leaf=25,random_state=42, n_estimators=500)
rf_clf.fit(X_train, y_train)

rf_train_score = rf_clf.score(X_train, y_train)
rf_score = rf_clf.score(X_test, y_test)
print(f"Random Forest train accuracy: {rf_train_score:.3f}")
print(f"Random Forest test accuracy: {rf_score:.3f}")


Decision Tree train accuracy: 0.848
Decision Tree test accuracy: 0.532
Random Forest train accuracy: 0.683
Random Forest test accuracy: 0.506
