# setup

In [1]:
import os

# When the kernel is started from inside a `notebooks` directory, move to its
# parent so the relative paths used by the following cells (dataset folder,
# output folder) resolve from there. `os.path.basename` is used instead of
# splitting on '/' so the check also works with Windows path separators.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

In [2]:
import pandas as pd
import wfdb
import numpy as np
import h5py

from ecgprep import read_ecg
from tqdm import tqdm

In [3]:
# Root folder of the dataset, relative to the current working directory.
ds_folder = 'lobachevsky-university-electrocardiography-database-1.0.1'
# RECORDS has no header row; column 0 holds one record path per line, each
# relative to `ds_folder`.
records = pd.read_csv(os.path.join(ds_folder, 'RECORDS'), header=None)
# Lead names; their count fixes the lead dimension of the output datasets.
leads = ['avf', 'avl', 'avr', 'i', 'ii', 'iii', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6']
# Annotation symbols that become label channels, in channel order.
# NOTE(review): presumably WFDB wave codes (N = QRS, p = P wave, t = T wave) —
# confirm against the dataset documentation.
symbols = ['N', 'p', 't']

# Folder receiving the generated HDF5 file. `exist_ok=True` replaces the
# check-then-create pattern, which is racy and needs two filesystem calls.
out_folder = 'output'
os.makedirs(out_folder, exist_ok=True)


# generate

In [4]:
# Dimensions of the output arrays. `n_samples` is the fixed tracing length
# the datasets are allocated with.
n_files, n_leads, n_symbols = len(records), len(leads), len(symbols)
n_samples = 5000

# Output container; opening with mode 'w' creates the file or truncates an
# existing one. It stays open until it is closed explicitly.
h5f = h5py.File(os.path.join(out_folder, 'ludb.h5'), mode='w')

# x  -> signal values, indexed as (record, lead, sample)
# y  -> 0/1 wave mask, indexed as (record, lead, symbol, sample)
# id -> integer identifier of each record
x = h5f.create_dataset(
    'tracings',
    shape=(n_files, n_leads, n_samples),
    dtype='f8',
)
y = h5f.create_dataset(
    'annotations',
    shape=(n_files, n_leads, n_symbols, n_samples),
    dtype='f8',
)
id = h5f.create_dataset('exam_id', shape=(n_files,), dtype='i4')

In [5]:
# Fill the HDF5 datasets record by record: the tracing itself, a binary mask
# per (lead, symbol) marking the span of every annotated wave, and the record id.
# `total` lets tqdm display a real progress bar instead of a bare counter.
for ii, row in tqdm(records.iterrows(), total=len(records)):
    # read signal
    file_path = os.path.join(ds_folder, row[0])
    # Keep the per-record lead names in their own variable so the configured
    # `leads` list is not overwritten on every iteration.
    ecg, sample_rate, record_leads = read_ecg.read_ecg(file_path)

    # read annotations and prepare y
    label = np.zeros(shape=(len(record_leads), len(symbols), ecg.shape[1]))
    closed_flag = False  # set when a wave is missing its '(' or ')' delimiter
    for l, lead in enumerate(record_leads):
        # Read the annotation file that belongs to THIS lead. The original
        # hard-coded extension 'avf' copied the aVF annotations to every lead.
        # NOTE(review): assumes read_ecg returns the record's lead names as
        # strings and that the annotation file extension is the lower-case
        # lead name (e.g. '.ii', '.v1') — confirm against the dataset layout.
        ann = wfdb.rdann(file_path, extension=lead.lower())
        assert len(ann.sample) == len(ann.symbol)
        n_ann = len(ann.symbol)

        for idx, symbol in enumerate(ann.symbol):
            if symbol not in symbols:  # skip delimiters and symbols we do not label
                continue

            # A wave is "closed" when it is wrapped as '(' <symbol> ')'.
            # The bounds checks keep idx - 1 from wrapping around to the last
            # annotation and idx + 1 from running past the end of the list.
            has_onset = idx > 0 and ann.symbol[idx - 1] == '('
            has_offset = idx + 1 < n_ann and ann.symbol[idx + 1] == ')'
            if not (has_onset and has_offset):
                closed_flag = True

            # Fall back to the symbol's own sample when a delimiter is missing,
            # which yields an empty slice when both are absent.
            begin = ann.sample[idx - 1] if has_onset else ann.sample[idx]
            end = ann.sample[idx + 1] if has_offset else ann.sample[idx]

            # mark samples [begin, end) in the channel of the corresponding symbol
            label[l, symbols.index(symbol), begin:end] = 1

    # write
    x[ii, :, :] = ecg
    y[ii, :, :, :] = label
    # The record file name is its integer id.
    id[ii] = int(os.path.basename(file_path))
    # warning
    if closed_flag:
        print('file {} not in closed wave'.format(file_path))

2it [00:00, 18.23it/s]

98it [00:03, 26.57it/s]

file lobachevsky-university-electrocardiography-database-1.0.1/data/95 not in closed wave


107it [00:04, 20.53it/s]

file lobachevsky-university-electrocardiography-database-1.0.1/data/104 not in closed wave


116it [00:04, 23.67it/s]

file lobachevsky-university-electrocardiography-database-1.0.1/data/111 not in closed wave


200it [00:07, 25.45it/s]


In [6]:
# Close the HDF5 file that was opened for writing in the generate step.
h5f.close()