# ECG classification on the PTB-XL dataset
Dataset link: https://doi.org/10.13026/x4td-x982

## Set-Up

Install the dependencies.

In [1]:
!pip install wfdb



Import the external modules.

In [2]:
import os
import sys
import torch
import wfdb
import ast
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score

# Report the interpreter, the PyTorch build and the visible hardware,
# one item per line.
print(
    f'python {sys.version}',
    f'torch {torch.__version__}',
    f'CPUs: {os.cpu_count()}',
    f'GPUs: {torch.cuda.device_count()}',
    sep='\n',
)

python 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]
torch 2.1.0
CPUs: 48
GPUs: 4


Import the internal modules.

In [3]:
from src.model import Model

## Data

Define the path to the data files.

In [4]:
# Folder holding the dataset files, relative to the working directory.
# The loading cells read `ptbxl_database.csv` and the `records100/` folder from here.
path = 'data'

Load the labels: each record's SCP codes are parsed from the `scp_codes` column of `ptbxl_database.csv`.

In [5]:
def _scp_labels(raw_codes):
    """Convert one `scp_codes` cell (a Python-literal mapping stored as text) into a list of codes."""
    code_map = ast.literal_eval(raw_codes)
    # Only the keys are kept; leading/trailing underscores are trimmed from each code.
    return [code.strip('_') for code in set(code_map.keys())]


# Metadata table indexed by record id; `y` is an object array with one list of codes per record.
df = pd.read_csv(f'{path}/ptbxl_database.csv', index_col='ecg_id')
y = df['scp_codes'].apply(_scp_labels).values
print(y.shape)

(21837,)


Load the time series from the `records100` folder as an array of shape (records, channels, time steps).

In [6]:
# Folder that contains the low-resolution records.
records_dir = f'{path}/records100'

# `filename_lr` entries presumably already start with 'records100/' (TODO confirm
# against the CSV). Strip only that leading prefix instead of deleting every
# occurrence of the text, so the joined path has no doubled separator.
record_names = [f.removeprefix('records100/') for f in df['filename_lr']]

# `wfdb.rdsamp` returns a tuple whose first item is the signal array; each one
# is transposed so the stacked result is (records, channels, time steps).
x = np.array([
    np.transpose(wfdb.rdsamp(f'{records_dir}/{name}')[0])
    for name in record_names
])
print(x.shape)

(21837, 12, 1000)


Split the data using the `strat_fold` column: folds below 10 form the training set and fold 10 forms the test set.

In [7]:
# Boolean masks over the records, derived from the `strat_fold` column.
fold = df['strat_fold']
train_mask = fold < 10
test_mask = fold == 10

# Apply the same mask to signals and labels so they stay aligned.
x_train, y_train = x[train_mask], y[train_mask]
x_test, y_test = x[test_mask], y[test_mask]

Encode the labels as a binary indicator matrix with one column per code found in the training set.

In [8]:
# Fit on the training labels only, so the output columns are defined by the
# codes seen in training; the same fitted encoder is then applied to both splits.
mlb = MultiLabelBinarizer().fit(y_train)
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)
print(y_train.shape, y_test.shape, sep='\n')

(19634, 71)
(2203, 71)


Scale the time series using the overall mean and standard deviation of the training set.

In [9]:
# No axis is given, so one mean and one standard deviation are computed over
# every value of the training array; `keepdims=True` keeps them broadcastable.
mu = x_train.mean(keepdims=True)
sigma = x_train.std(keepdims=True)


def _standardize(values):
    """Centre and scale `values` with the training-set statistics."""
    return (values - mu) / sigma


# The test split is scaled with the training statistics, not its own.
x_train = _standardize(x_train)
x_test = _standardize(x_test)

## Baseline Model

Build the model.

In [10]:
# Input and output sizes are read off the prepared arrays.
n_channels = x_train.shape[1]
n_timesteps = x_train.shape[2]
n_labels = y_train.shape[1]

# Baseline variant: `hidden_size`, `num_layers` and `dropout` are passed as
# None (presumably unused by this model type — confirm in `src.model`).
model = Model(
    model_type='baseline',
    timesteps=n_timesteps,
    features=n_channels,
    num_classes=n_labels,
    filters=[128, 256, 128],
    kernel_sizes=[8, 5, 3],
    units=128,
    hidden_size=None,
    num_layers=None,
    dropout=None,
)

number of parameters: 318,279


Visualize the model.

In [11]:
# Print the layer-by-layer structure of the network held in `model.model`.
print(model.model)

DataParallel(
  (module): FCN(
    (fcn): ConvolutionalBranch(
      (model): Sequential(
        (Conv1d_0): Conv1d(12, 128, kernel_size=(8,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_0): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_0): ReLU()
        (Conv1d_1): Conv1d(128, 256, kernel_size=(5,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_1): BatchNorm1d(256, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_1): ReLU()
        (Conv1d_2): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_2): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_2): ReLU()
      )
    )
    (avg_pool): AdaptiveAvgPool1d(output_size=1)
    (max_pool): AdaptiveMaxPool1d(output_size=1)
    (batch_norm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (batch_norm2): Ba

Train the model.

In [12]:
# Optimisation settings for this run.
fit_settings = dict(learning_rate=0.01, batch_size=128, epochs=50, verbose=True)

# NOTE(review): the test split is passed to `fit` and a test loss is logged
# every epoch — confirm it is only monitored and not used to select the model.
model.fit(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    **fit_settings,
)

training on CUDA 0, 1, 2, 3
epoch: 1, train_time: 6.33, train_loss: 0.005405, test_time: 0.39, test_loss: 0.004899
epoch: 2, train_time: 4.00, train_loss: 0.003247, test_time: 0.39, test_loss: 0.001390
epoch: 3, train_time: 3.91, train_loss: 0.001075, test_time: 0.38, test_loss: 0.000794
epoch: 4, train_time: 3.91, train_loss: 0.000797, test_time: 0.38, test_loss: 0.000787
epoch: 5, train_time: 3.92, train_loss: 0.000734, test_time: 0.39, test_loss: 0.000702
epoch: 6, train_time: 4.05, train_loss: 0.000684, test_time: 0.38, test_loss: 0.000690
epoch: 7, train_time: 3.92, train_loss: 0.000638, test_time: 0.38, test_loss: 0.000628
epoch: 8, train_time: 3.92, train_loss: 0.000608, test_time: 0.38, test_loss: 0.000605
epoch: 9, train_time: 3.92, train_loss: 0.000591, test_time: 0.39, test_loss: 0.000614
epoch: 10, train_time: 3.93, train_loss: 0.000578, test_time: 0.39, test_loss: 0.000589
epoch: 11, train_time: 3.92, train_loss: 0.000567, test_time: 0.38, test_loss: 0.000571
epoch: 12, tr

Save the learning history.

In [13]:
# `to_csv` does not create missing folders, so make sure the output directory
# exists before the first result file is written to it.
os.makedirs('results', exist_ok=True)

# Persist the learning history (`model.history`) without the row index.
model.history.to_csv('results/history_baseline.csv', index=False)

Evaluate the model on the training set (macro-averaged ROC AUC).

In [14]:
# Predicted scores for the training records, followed by the ROC AUC
# averaged with equal weight over the label columns ('macro').
y_score_train = model.predict(x_train)
score_train = roc_auc_score(y_train, y_score_train, average='macro')
print(round(score_train, 3))

0.965


Save the training set predictions.

In [15]:
# One column per label, named after the binarizer's classes; the row index is not written.
train_scores_baseline = pd.DataFrame(data=y_score_train, columns=mlb.classes_)
train_scores_baseline.to_csv('results/y_score_train_baseline.csv', index=False)

Evaluate the model on the test set (macro-averaged ROC AUC).

In [16]:
# Predicted scores for the held-out records, followed by the ROC AUC
# averaged with equal weight over the label columns ('macro').
y_score_test = model.predict(x_test)
score_test = roc_auc_score(y_test, y_score_test, average='macro')
print(round(score_test, 3))

0.918


Save the test set predictions.

In [17]:
# One column per label, named after the binarizer's classes; the row index is not written.
test_scores_baseline = pd.DataFrame(data=y_score_test, columns=mlb.classes_)
test_scores_baseline.to_csv('results/y_score_test_baseline.csv', index=False)

## Proposed Model

Build the model.

In [18]:
# Input and output sizes are read off the prepared arrays.
n_channels = x_train.shape[1]
n_timesteps = x_train.shape[2]
n_labels = y_train.shape[1]

# Proposed variant: same convolutional settings as the baseline cell, plus
# values for `hidden_size`, `num_layers` and `dropout`.
model = Model(
    model_type='proposed',
    timesteps=n_timesteps,
    features=n_channels,
    num_classes=n_labels,
    filters=[128, 256, 128],
    kernel_sizes=[8, 5, 3],
    units=128,
    hidden_size=8,
    num_layers=1,
    dropout=0.8,
)

number of parameters: 351,183


Visualize the model.

In [19]:
# Print the layer-by-layer structure of the network held in `model.model`.
print(model.model)

DataParallel(
  (module): LSTM_FCN(
    (fcn): ConvolutionalBranch(
      (model): Sequential(
        (Conv1d_0): Conv1d(12, 128, kernel_size=(8,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_0): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_0): ReLU()
        (Conv1d_1): Conv1d(128, 256, kernel_size=(5,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_1): BatchNorm1d(256, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_1): ReLU()
        (Conv1d_2): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_2): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_2): ReLU()
      )
    )
    (lstm): RecurrentBranch(
      (model): Sequential(
        (LSTM_0): LSTM(1000, 8, batch_first=True)
        (Lambda_0): Lambda()
        (Dropout_0): Dropout(p=0.8, inplace=False)
      )
    )
    (avg_pool)

Train the model.

In [20]:
# Optimisation settings for this run.
fit_settings = dict(learning_rate=0.01, batch_size=128, epochs=50, verbose=True)

# NOTE(review): the test split is passed to `fit` and a test loss is logged
# every epoch — confirm it is only monitored and not used to select the model.
model.fit(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    **fit_settings,
)

training on CUDA 0, 1, 2, 3
epoch: 1, train_time: 4.21, train_loss: 0.005449, test_time: 0.40, test_loss: 0.004797
epoch: 2, train_time: 4.07, train_loss: 0.003203, test_time: 0.40, test_loss: 0.001313
epoch: 3, train_time: 4.07, train_loss: 0.001034, test_time: 0.40, test_loss: 0.000791
epoch: 4, train_time: 4.07, train_loss: 0.000794, test_time: 0.40, test_loss: 0.000810
epoch: 5, train_time: 4.08, train_loss: 0.000724, test_time: 0.40, test_loss: 0.000688
epoch: 6, train_time: 4.20, train_loss: 0.000674, test_time: 0.40, test_loss: 0.000675
epoch: 7, train_time: 4.08, train_loss: 0.000636, test_time: 0.40, test_loss: 0.000636
epoch: 8, train_time: 4.08, train_loss: 0.000604, test_time: 0.40, test_loss: 0.000603
epoch: 9, train_time: 4.08, train_loss: 0.000591, test_time: 0.40, test_loss: 0.000589
epoch: 10, train_time: 4.08, train_loss: 0.000572, test_time: 0.40, test_loss: 0.000577
epoch: 11, train_time: 4.08, train_loss: 0.000568, test_time: 0.40, test_loss: 0.000565
epoch: 12, tr

Save the learning history.

In [21]:
# Persist the learning history of the proposed model without the row index.
history_proposed = model.history
history_proposed.to_csv('results/history_proposed.csv', index=False)

Evaluate the model on the training set (macro-averaged ROC AUC).

In [22]:
# Predicted scores for the training records, followed by the ROC AUC
# averaged with equal weight over the label columns ('macro').
y_score_train = model.predict(x_train)
score_train = roc_auc_score(y_train, y_score_train, average='macro')
print(round(score_train, 3))

0.966


Save the training set predictions.

In [23]:
# One column per label, named after the binarizer's classes; the row index is not written.
train_scores_proposed = pd.DataFrame(data=y_score_train, columns=mlb.classes_)
train_scores_proposed.to_csv('results/y_score_train_proposed.csv', index=False)

Evaluate the model on the test set (macro-averaged ROC AUC).

In [24]:
# Predicted scores for the held-out records, followed by the ROC AUC
# averaged with equal weight over the label columns ('macro').
y_score_test = model.predict(x_test)
score_test = roc_auc_score(y_test, y_score_test, average='macro')
print(round(score_test, 3))

0.92


Save the test set predictions.

In [25]:
# One column per label, named after the binarizer's classes; the row index is not written.
test_scores_proposed = pd.DataFrame(data=y_score_test, columns=mlb.classes_)
test_scores_proposed.to_csv('results/y_score_test_proposed.csv', index=False)