# setup

In [9]:
from ecgprep import preprocess, read_ecg
import h5py
import pandas as pd
import tqdm
import os
import matplotlib.pyplot as plt
import numpy as np

In [2]:
input_file = 'RECORDS.txt'
root_dir = 'WFDB_ShaoxingUniv'
out_file = 'ningbo.h5'

fmt = 'wfdb'

new_freq = 400
new_len = 4096
scale = 2
use_all_leads = True
remove_baseline = False
remove_powerline = None

# draft

In [3]:
all_leads = ['DI', 'DII', 'DIII', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

In [4]:
files = pd.read_csv(input_file, header=None).values.flatten()
folder = root_dir
n = len(files)

In [5]:
for i, f in enumerate(tqdm.tqdm(files)):
    break
i, f

  0%|          | 0/45152 [00:00<?, ?it/s]


(0, 'JS00001')

In [6]:
ecg, sample_rate, leads = read_ecg.read_ecg(os.path.join(folder, f), format = fmt)
ecg, sample_rate, leads

(array([[-0.254, -0.254, -0.254, ..., -0.034,  0.024,  0.005],
        [ 0.264,  0.264,  0.264, ..., -0.068, -0.049, -0.034],
        [ 0.517,  0.517,  0.517, ..., -0.034, -0.073, -0.039],
        ...,
        [ 0.81 ,  0.81 ,  0.81 , ..., -0.205, -0.2  , -0.171],
        [ 0.81 ,  0.81 ,  0.81 , ..., -0.2  , -0.195, -0.166],
        [ 0.527,  0.527,  0.527, ...,  0.102,  0.093,  0.112]]),
 500,
 ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'])

In [11]:
np.isnan(ecg).any()

False

In [12]:
ecg_preprocessed, new_rate, new_leads = preprocess.preprocess_ecg(ecg, sample_rate, all_leads, # different lead names
                                                                          new_freq=new_freq,
                                                                          new_len=new_len,
                                                                          scale=scale,
                                                                          use_all_leads=use_all_leads,
                                                                          remove_baseline=remove_baseline,
                                                                          remove_powerline=remove_powerline)
ecg_preprocessed, new_rate, new_leads

(array([[-0.45662999, -0.52572182, -0.49735178, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.47596687,  0.54495072,  0.52134413, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.93259686,  1.07067254,  1.01869592, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 1.46068218,  1.67124455,  1.60023106, ...,  0.        ,
          0.        ,  0.        ],
        [ 1.46065339,  1.67128349,  1.60013837, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.94859643,  1.09061578,  1.03516844, ...,  0.        ,
          0.        ,  0.        ]]),
 400,
 ['DI',
  'DII',
  'DIII',
  'AVR',
  'AVL',
  'AVF',
  'V1',
  'V2',
  'V3',
  'V4',
  'V5',
  'V6'])

In [13]:
np.isnan(ecg_preprocessed).any()

False

# assert

In [17]:
# for i, f in enumerate(tqdm.tqdm(files)):
#     ecg, sample_rate, leads = read_ecg.read_ecg(os.path.join(folder, f), format=fmt)
#     assert not np.isnan(ecg).any()
#     ecg_preprocessed, new_rate, new_leads = preprocess.preprocess_ecg(ecg, sample_rate, leads,
#                                                                             new_freq=new_freq,
#                                                                             new_len=new_len,
#                                                                             scale=scale,
#                                                                             use_all_leads=use_all_leads,
#                                                                             remove_baseline=remove_baseline,
#                                                                             remove_powerline=remove_powerline)
#     assert not np.isnan(ecg_preprocessed).any()

 23%|██▎       | 10365/45152 [01:10<03:55, 147.48it/s]


AssertionError: 

In [20]:
# np.isnan(ecg).any(), np.isnan(ecg_preprocessed).any()

(True, True)

# log

In [46]:
log = []
names = []
for i, f in enumerate(tqdm.tqdm(files)):
    ecg, sample_rate, leads = read_ecg.read_ecg(os.path.join(folder, f), format=fmt)
    ecg_preprocessed, new_rate, new_leads = preprocess.preprocess_ecg(ecg, sample_rate, leads,
                                                                            new_freq=new_freq,
                                                                            new_len=new_len,
                                                                            scale=scale,
                                                                            use_all_leads=use_all_leads,
                                                                            remove_baseline=remove_baseline,
                                                                            remove_powerline=remove_powerline)
    # log.append([f, np.isnan(ecg).any(), np.isnan(ecg_preprocessed).any()])
    log.append([np.isnan(ecg).any(), np.isnan(ecg_preprocessed).any()])
    names.append(f)

100%|██████████| 45152/45152 [04:02<00:00, 186.29it/s]


In [48]:
array = np.array(log)
array, names

(array([[False, False],
        [False, False],
        [False, False],
        ...,
        [False, False],
        [False, False],
        [False, False]]),
 ['JS00001',
  'JS00002',
  'JS00004',
  'JS00005',
  'JS00006',
  'JS00007',
  'JS00008',
  'JS00009',
  'JS00010',
  'JS00011',
  'JS00012',
  'JS00013',
  'JS00014',
  'JS00015',
  'JS00016',
  'JS00017',
  'JS00018',
  'JS00019',
  'JS00020',
  'JS00021',
  'JS00022',
  'JS00023',
  'JS00024',
  'JS00025',
  'JS00026',
  'JS00027',
  'JS00029',
  'JS00030',
  'JS00031',
  'JS00032',
  'JS00033',
  'JS00034',
  'JS00036',
  'JS00037',
  'JS00038',
  'JS00039',
  'JS00040',
  'JS00041',
  'JS00042',
  'JS00043',
  'JS00044',
  'JS00045',
  'JS00046',
  'JS00047',
  'JS00048',
  'JS00049',
  'JS00050',
  'JS00051',
  'JS00052',
  'JS00053',
  'JS00054',
  'JS00055',
  'JS00056',
  'JS00057',
  'JS00058',
  'JS00059',
  'JS00060',
  'JS00061',
  'JS00062',
  'JS00063',
  'JS00064',
  'JS00065',
  'JS00066',
  'JS00067',
  'JS0006

In [50]:
np.sum(array[:, 0]), np.sum(array[:, 1])

(97, 97)

In [52]:
np.unique(array[:, 0] == array[:, 1])

array([ True])

In [54]:
df = np.array(names)
df[array[:, 0]]

array(['JS10765', 'JS10767', 'JS10890', 'JS10951', 'JS11887', 'JS11897',
       'JS11956', 'JS12751', 'JS13181', 'JS14161', 'JS14343', 'JS14627',
       'JS14659', 'JS15624', 'JS16169', 'JS16222', 'JS16813', 'JS19309',
       'JS19708', 'JS20330', 'JS20656', 'JS21144', 'JS21617', 'JS21668',
       'JS21701', 'JS21853', 'JS21881', 'JS23116', 'JS23450', 'JS23482',
       'JS23588', 'JS23786', 'JS23950', 'JS24016', 'JS25106', 'JS25322',
       'JS25458', 'JS26009', 'JS26130', 'JS26145', 'JS26245', 'JS26605',
       'JS26793', 'JS26843', 'JS26977', 'JS27034', 'JS27170', 'JS27271',
       'JS27278', 'JS27407', 'JS27460', 'JS27835', 'JS27985', 'JS28075',
       'JS28648', 'JS28757', 'JS33280', 'JS34479', 'JS34509', 'JS34788',
       'JS34868', 'JS34879', 'JS35050', 'JS35065', 'JS35192', 'JS35654',
       'JS35727', 'JS36015', 'JS36018', 'JS36189', 'JS36244', 'JS36568',
       'JS36731', 'JS37105', 'JS37173', 'JS37176', 'JS37439', 'JS37592',
       'JS37609', 'JS37781', 'JS38231', 'JS38252', 

# physionet

In [57]:
ecg, sample_rate, leads = read_ecg.read_ecg('a-large-scale-12-lead-electrocardiogram-database-for-arrhythmia-study-1.0.0/WFDBRecords/11/113/JS10765', format=fmt)
ecg, sample_rate, leads, np.isnan(ecg).any()

(array([[ 2.0000e-02,  2.0000e-02,  2.0000e-02, ..., -1.8500e-01,
         -1.9500e-01, -2.4400e-01],
        [ 2.9000e-02,  2.9000e-02,  2.9000e-02, ..., -2.1000e-01,
         -2.0000e-01, -2.0000e-01],
        [ 1.0000e-02,  1.0000e-02,  1.0000e-02, ..., -2.4000e-02,
         -5.0000e-03,  4.4000e-02],
        ...,
        [-3.9000e-02, -3.9000e-02, -3.9000e-02, ...,  3.4000e-02,
          5.4000e-02,  5.9000e-02],
        [-1.6934e+01, -1.6934e+01, -1.6934e+01, ..., -4.1500e-01,
         -4.1500e-01, -3.9500e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, ...,  3.4000e-02,
          3.9000e-02,  3.9000e-02]]),
 500,
 ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'],
 True)

In [58]:
ecg_preprocessed, new_rate, new_leads = preprocess.preprocess_ecg(ecg, sample_rate, all_leads, # different lead names
                                                                          new_freq=new_freq,
                                                                          new_len=new_len,
                                                                          scale=scale,
                                                                          use_all_leads=use_all_leads,
                                                                          remove_baseline=remove_baseline,
                                                                          remove_powerline=remove_powerline)
ecg_preprocessed, new_rate, new_leads, np.isnan(ecg_preprocessed).any()

(array([[ 3.72808139e-02,  3.92626683e-02,  4.35779473e-02, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 5.28287699e-02,  5.88950905e-02,  5.91054670e-02, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 1.55479560e-02,  1.96324221e-02,  1.55275197e-02, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        ...,
        [-7.02247167e-02, -8.06592296e-02, -7.66685410e-02, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [-3.04849064e+01, -3.49897658e+01, -3.32906842e+01, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]),
 400,
 ['DI',
  'DII',
  'DIII',
  'AVR',
  'AVL',
  'AVF',
  'V1',
  'V2',
  'V3',
  'V4',
  'V5',
  'V6'],
 True)