# ECG classification on the PTB-XL dataset
Dataset link: https://doi.org/10.13026/kfzx-aw45

## Set-Up

Install the dependencies.

In [1]:
!pip install wfdb



In [2]:
import sys
# Print the version string of the Python interpreter running this kernel.
print(f'python {sys.version}')

python 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]


Import the external modules.

In [3]:
import torch
# Print the version of the installed PyTorch package.
print(f'torch {torch.__version__}')

torch 2.1.0


In [4]:
import os
# Number of CUDA devices reported by PyTorch; the training cells below
# multiply it by 128 to obtain the batch size.
num_devices = torch.cuda.device_count()
# Print the CPU count reported by `os.cpu_count()` and the CUDA device count.
print(f'CPU count: {os.cpu_count()}')
print(f'GPU count: {num_devices}')

CPU count: 48
GPU count: 4


In [5]:
import wfdb
import ast
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score

Import the internal modules.

In [6]:
from src.model import Model

## Data

Define the path to the data files.

In [7]:
# Directory (relative to the working directory) from which the cells below
# read `ptbxl_database.csv` and the waveform records.
path = 'data'

Load the record metadata and extract the labels of each record (the keys of its `scp_codes` dictionary).

In [8]:
# Read the record metadata, using the `ecg_id` column as the index.
df = pd.read_csv(f'{path}/ptbxl_database.csv', index_col='ecg_id')
# Each `scp_codes` entry is the string representation of a dict; the dict keys
# are the labels of the record. Dict keys are already unique, so they are
# listed directly (in insertion order) instead of going through a `set`, whose
# iteration order for strings can differ from one interpreter run to the next.
y = df['scp_codes'].apply(lambda x: list(ast.literal_eval(x).keys())).values
print(y.shape)

(21799,)


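Load the time series of the records listed in the `filename_lr` column, transposing each one so that the resulting array has shape (records, channels, time steps).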

In [9]:
def _read_record(filename):
    """Read one record listed in `filename_lr` and return its signal array, transposed."""
    # The "records100" prefix is removed from the file name before it is
    # appended to the data directory.
    record_name = filename.replace("records100", "")
    # `rdsamp` returns a sequence whose first element is the array of samples.
    return np.transpose(wfdb.rdsamp(f'{path}/{record_name}')[0])


# Stack the transposed signals of all records into a single array.
x = np.array([_read_record(filename) for filename in df['filename_lr']])
print(x.shape)

(21799, 12, 1000)


Split the data into a training set (`strat_fold` < 10) and a test set (`strat_fold` == 10).

In [10]:
# Boolean masks over the records, derived from the `strat_fold` column:
# folds below 10 are used for training, fold 10 is used for testing.
is_train = df['strat_fold'] < 10
is_test = df['strat_fold'] == 10

# Apply the same masks to the time series and to the labels.
x_train, x_test = x[is_train], x[is_test]
y_train, y_test = y[is_train], y[is_test]

Encode the labels as binary indicator vectors, fitting the encoder on the training labels only.

In [11]:
# Fit the binarizer on the training labels only (`fit` returns the fitted
# estimator), then encode both splits with it.
mlb = MultiLabelBinarizer().fit(y_train)
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)
# Print the shape of each encoded label matrix on its own line.
print(y_train.shape, y_test.shape, sep='\n')

(19601, 71)
(2198, 71)


Standardize the time series using the mean and standard deviation computed over the entire training set.

In [12]:
# Mean and standard deviation over all elements of the training set (no axis
# is given); `keepdims=True` keeps them as arrays that broadcast against the data.
mu = x_train.mean(keepdims=True)
sigma = x_train.std(keepdims=True)


def _standardize(values):
    """Center and scale `values` with the training-set statistics."""
    return (values - mu) / sigma


# The test set is scaled with the training-set statistics as well.
x_train = _standardize(x_train)
x_test = _standardize(x_test)

## Baseline Model

Build the model.

In [13]:
# Build the baseline variant of the project-local `Model` class.
# `features` is the size of axis 1 of the time series array and `num_classes`
# is the number of columns of the encoded label matrix.
# `units` and `dropout` are passed as None for this variant.
# NOTE(review): `filters` and `kernel_sizes` appear to configure the three
# Conv1d layers shown in the architecture printout below; confirm in src/model.
model = Model(
    features=x_train.shape[1],
    units=None,
    dropout=None,
    filters=[128, 256, 128],
    kernel_sizes=[8, 5, 3],
    num_classes=y_train.shape[1],
    model_type='baseline'
)

number of parameters: 294,215


Visualize the model.

In [14]:
# Print the `model` attribute of the wrapper, which shows the layer structure.
print(model.model)

DataParallel(
  (module): FCN(
    (fcn): ConvolutionalBranch(
      (model): Sequential(
        (Conv1d_0): Conv1d(12, 128, kernel_size=(8,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_0): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_0): ReLU()
        (Conv1d_1): Conv1d(128, 256, kernel_size=(5,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_1): BatchNorm1d(256, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_1): ReLU()
        (Conv1d_2): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_2): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_2): ReLU()
      )
    )
    (avg_pool): AdaptiveAvgPool1d(output_size=1)
    (max_pool): AdaptiveMaxPool1d(output_size=1)
    (batch_norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dropout): Dropout

Train the model.

In [15]:
# Train the baseline model. The batch size is 128 per CUDA device; the device
# count is clamped to at least 1 so that the batch size does not become 0 when
# no CUDA device is visible.
# NOTE(review): the test split is also passed to `fit`; confirm in src/model
# that it is only used to report the per-epoch test loss.
model.fit(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    learning_rate=0.01,
    batch_size=max(num_devices, 1) * 128,
    epochs=50,
    verbose=True
)

training on CUDA 0, 1, 2, 3
epoch: 1, train_time: 5.35, train_loss: 0.000618, test_time: 0.29, test_loss: 0.000233
epoch: 2, train_time: 2.91, train_loss: 0.000190, test_time: 0.29, test_loss: 0.000190
epoch: 3, train_time: 2.91, train_loss: 0.000171, test_time: 0.29, test_loss: 0.000182
epoch: 4, train_time: 2.91, train_loss: 0.000163, test_time: 0.29, test_loss: 0.000174
epoch: 5, train_time: 2.92, train_loss: 0.000154, test_time: 0.29, test_loss: 0.000175
epoch: 6, train_time: 2.93, train_loss: 0.000149, test_time: 0.29, test_loss: 0.000167
epoch: 7, train_time: 2.93, train_loss: 0.000144, test_time: 0.29, test_loss: 0.000163
epoch: 8, train_time: 2.92, train_loss: 0.000140, test_time: 0.29, test_loss: 0.000160
epoch: 9, train_time: 2.93, train_loss: 0.000138, test_time: 0.29, test_loss: 0.000156
epoch: 10, train_time: 2.94, train_loss: 0.000136, test_time: 0.29, test_loss: 0.000155
epoch: 11, train_time: 2.93, train_loss: 0.000134, test_time: 0.29, test_loss: 0.000158
epoch: 12, tr

Save the learning history.

In [16]:
# Write the `history` attribute of the model to a CSV file, without the index column.
model.history.to_csv('history_baseline.csv', index=False)

Evaluate the model on the training set.

In [17]:
# Predict on the training set and report the macro-averaged ROC AUC
# against the encoded training labels.
y_score_train = model.predict(x_train)
score_train = roc_auc_score(y_train, y_score_train, average='macro')
print(score_train)

0.9697236120147117


Save the training set predictions.

In [18]:
# Save the training set predictions to CSV, one column per class of the fitted binarizer.
pd.DataFrame(data=y_score_train, columns=mlb.classes_).to_csv('y_score_train_baseline.csv', index=False)

Evaluate the model on the test set.

In [19]:
# Predict on the test set and report the macro-averaged ROC AUC
# against the encoded test labels.
y_score_test = model.predict(x_test)
score_test = roc_auc_score(y_test, y_score_test, average='macro')
print(score_test)

0.9168235673151055


Save the test set predictions.

In [20]:
# Save the test set predictions to CSV, one column per class of the fitted binarizer.
pd.DataFrame(data=y_score_test, columns=mlb.classes_).to_csv('y_score_test_baseline.csv', index=False)

## Proposed Model

Build the model.

In [21]:
# Build the proposed variant of the project-local `Model` class; this rebinds
# `model`, so the baseline model object is no longer referenced by that name.
# `features` is the size of axis 1 of the time series array and `num_classes`
# is the number of columns of the encoded label matrix.
# NOTE(review): `units` and `dropout` appear to configure the recurrent branch
# (LSTM hidden size 128 and Dropout p=0.8 in the printout below); confirm in src/model.
model = Model(
    features=x_train.shape[1],
    units=[128, 128, 128],
    dropout=0.8,
    filters=[128, 256, 128],
    kernel_sizes=[8, 5, 3],
    num_classes=y_train.shape[1],
    model_type='proposed'
)

number of parameters: 640,455


Visualize the model.

In [22]:
# Print the `model` attribute of the wrapper, which shows the layer structure.
print(model.model)

DataParallel(
  (module): LSTM_FCN(
    (fcn): ConvolutionalBranch(
      (model): Sequential(
        (Conv1d_0): Conv1d(12, 128, kernel_size=(8,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_0): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_0): ReLU()
        (Conv1d_1): Conv1d(128, 256, kernel_size=(5,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_1): BatchNorm1d(256, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_1): ReLU()
        (Conv1d_2): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=same, bias=False)
        (BatchNorm1d_2): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True, track_running_stats=True)
        (ReLU_2): ReLU()
      )
    )
    (lstm): RecurrentBranch(
      (model): Sequential(
        (LSTM_0): LSTM(12, 128, batch_first=True)
        (Lambda_0): Lambda()
        (ReLU_0): ReLU()
        (Dropout_0): Dropout(p=0.8, inplace=False)
   

Train the model.

In [23]:
# Train the proposed model. The batch size is 128 per CUDA device; the device
# count is clamped to at least 1 so that the batch size does not become 0 when
# no CUDA device is visible.
# NOTE(review): the test split is also passed to `fit`; confirm in src/model
# that it is only used to report the per-epoch test loss.
model.fit(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    learning_rate=0.01,
    batch_size=max(num_devices, 1) * 128,
    epochs=50,
    verbose=True
)

training on CUDA 0, 1, 2, 3
epoch: 1, train_time: 8.02, train_loss: 0.000520, test_time: 0.51, test_loss: 0.000232
epoch: 2, train_time: 7.97, train_loss: 0.000178, test_time: 0.51, test_loss: 0.000185
epoch: 3, train_time: 7.97, train_loss: 0.000160, test_time: 0.51, test_loss: 0.000173
epoch: 4, train_time: 7.97, train_loss: 0.000152, test_time: 0.52, test_loss: 0.000169
epoch: 5, train_time: 7.96, train_loss: 0.000146, test_time: 0.51, test_loss: 0.000169
epoch: 6, train_time: 7.97, train_loss: 0.000141, test_time: 0.52, test_loss: 0.000160
epoch: 7, train_time: 8.02, train_loss: 0.000137, test_time: 0.51, test_loss: 0.000154
epoch: 8, train_time: 8.01, train_loss: 0.000136, test_time: 0.51, test_loss: 0.000159
epoch: 9, train_time: 8.00, train_loss: 0.000133, test_time: 0.51, test_loss: 0.000150
epoch: 10, train_time: 8.01, train_loss: 0.000131, test_time: 0.51, test_loss: 0.000155
epoch: 11, train_time: 8.00, train_loss: 0.000129, test_time: 0.51, test_loss: 0.000151
epoch: 12, tr

Save the learning history.

In [24]:
# Write the `history` attribute of the model to a CSV file, without the index column.
model.history.to_csv('history_proposed.csv', index=False)

Evaluate the model on the training set.

In [25]:
# Predict on the training set and report the macro-averaged ROC AUC
# against the encoded training labels.
y_score_train = model.predict(x_train)
score_train = roc_auc_score(y_train, y_score_train, average='macro')
print(score_train)

0.9718111344761367


Save the training set predictions.

In [26]:
# Save the training set predictions to CSV, one column per class of the fitted binarizer.
pd.DataFrame(data=y_score_train, columns=mlb.classes_).to_csv('y_score_train_proposed.csv', index=False)

Evaluate the model on the test set.

In [27]:
# Predict on the test set and report the macro-averaged ROC AUC
# against the encoded test labels.
y_score_test = model.predict(x_test)
score_test = roc_auc_score(y_test, y_score_test, average='macro')
print(score_test)

0.921477931035661


Save the test set predictions.

In [28]:
# Save the test set predictions to CSV, one column per class of the fitted binarizer.
pd.DataFrame(data=y_score_test, columns=mlb.classes_).to_csv('y_score_test_proposed.csv', index=False)