In [17]:
import os
import json
import pandas as pd

# Directory containing the JSON files
ekg_dir = 'ekg_files'

# List to store all DataFrames
dfs = []
for filename in os.listdir(ekg_dir):
    if filename.endswith('.json'):
        with open(os.path.join(ekg_dir, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Take the first lead (usually "I")
        if 'leads' in data and len(data['leads']) > 0:
            lead = data['leads'][0]
            signal = lead.get('signal', [])
            # Remove None values from the signal
            clean_signal = [x for x in signal if x is not None]
            df_tmp = pd.DataFrame({'Signal': clean_signal})
            dfs.append([filename.replace('_','/').replace('.json',''), df_tmp])

In [18]:
len(dfs)

60

In [19]:
min_length = float('inf')
min_kg = None
for kg, signal_df in dfs:
    if len(signal_df) < min_length:
        min_length = len(signal_df)
        min_kg = kg
        
print(f"Shortest signal is from {min_kg} with length {min_length}")

for i in range(len(dfs)):
    dfs[i][1] = dfs[i][1].iloc[:min_length].reset_index(drop=True)

Shortest signal is from 39879/2021 with length 761


In [20]:
len(dfs)

60

In [3]:
df = pd.read_csv('DANE_mpsi.csv', sep='\t', encoding='utf-8')

kg_to_label = dict(zip(df['KG'], df['zgon']))

signal_label_dataset = []
for kg, signal_df in dfs:
    if kg in kg_to_label:
        label = kg_to_label[kg]
        signal = signal_df['Signal'].values
        signal_label_dataset.append((signal, label))

In [14]:
len(signal_label_dataset)

41

In [25]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch_ecg.models.ecg_seq_lab_net import ECG_SEQ_LAB_NET
from torch_ecg.cfg import CFG
from sklearn.model_selection import train_test_split



# Dataset wrapper
class SignalDataset(Dataset):
    def __init__(self, signal_label_dataset):
        self.data = []
        self.labels = []
        for signal, label in signal_label_dataset:
            signal = torch.tensor(signal, dtype=torch.float32)
            if signal.shape[0] != 761:
                raise ValueError("Each signal must be of length 761")
            self.data.append(signal)
            self.labels.append(torch.tensor(label, dtype=torch.float32))
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Load dataset
dataset = SignalDataset(signal_label_dataset)
X = torch.stack([s for s, _ in dataset])  # shape [batch, 761]
y = torch.stack([l for _, l in dataset])  # shape [batch]

# Prepare model config
config = CFG(
    input_len=761,
    input_channels=1,
    classes=1,
)

# Initialize ECG model
embedder = ECG_SEQ_LAB_NET(config,n_leads=1)
embedder.eval()

# Get embeddings
with torch.no_grad():
    X_input = X.unsqueeze(1)  # [batch, 1, 761]
    output = embedder(X_input)  # [batch, 1, 761]
    embeddings = output.mean(dim=-1)  # [batch, 1] → mean pooling
    embeddings = embeddings.squeeze(1)  # [batch]

# Simple MLP classifier
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.net(x).squeeze(-1)

mlp = MLP(input_dim=95)  # embedding is 1D after mean pooling
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(mlp.parameters(), lr=1e-3)
# Split embeddings and labels into train and test sets
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42, stratify=y)

# Training loop
epochs = 1000
for epoch in range(epochs):
    mlp.train()
    optimizer.zero_grad()
    outputs = mlp(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 50 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

        # Evaluate on train set
        mlp.eval()
        with torch.no_grad():
            train_logits = mlp(X_train)
            train_probs = torch.sigmoid(train_logits)
            train_preds = (train_probs > 0.5).float()
            train_acc = (train_preds == y_train).float().mean().item()

            test_logits = mlp(X_test)
            test_probs = torch.sigmoid(test_logits)
            test_preds = (test_probs > 0.5).float()
            test_acc = (test_preds == y_test).float().mean().item()

            print(f"Train accuracy: {train_acc:.4f} | Test accuracy: {test_acc:.4f}")




Epoch 1, Loss: 0.7015
Train accuracy: 0.3750 | Test accuracy: 0.4444
Epoch 50, Loss: 0.6623
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 100, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 150, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 200, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 250, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 300, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 350, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 400, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 450, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 500, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 550, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 600, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 650, Loss: 0.6616
Train accuracy: 0.6250 | Test accuracy: 0.5556
Epoch 700

In [13]:
embeddings.shape

torch.Size([41, 95])