# setup

In [1]:
import os

if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('..')

In [2]:
import numpy as np
import h5py
import pandas as pd
import pickle

from tqdm import tqdm

In [3]:
from data_loader.util import slide_and_cut_beat_aligned

In [4]:
from scipy import signal

def just_resample(data, sample_Fs = 400, resample_Fs = 500):
    sample_len = data.shape[1]
    resample_len = int(sample_len * (resample_Fs / sample_Fs))
    resample_data = signal.resample(data, resample_len, axis=1, window=None)

    return resample_data

In [5]:
import warnings
warnings.simplefilter('always')

# clean

In [6]:
code15 = h5py.File('/home/josegfer/datasets/code/output/code15.h5', 'r')
code15['tracings'], code15['exam_id']

(<HDF5 dataset "tracings": shape (345797, 4096, 12), type "<f4">,
 <HDF5 dataset "exam_id": shape (345797,), type "<i8">)

In [7]:
num_files = code15['tracings'].shape[0]
n_lead = 12
n_segment = 10
beat_length = 400
n_classes = 6

In [8]:
isoeletrico = []
alertas = []
erros = []

In [9]:
remove = []
for i in tqdm(range(num_files)):
    recording = code15['tracings'][i].T
    exam_id = code15['exam_id'][i]

    if (recording == 0).all(): # isoeletrico
        # print('isoeletrico idx: {}, exam_id: {}'.format(i, exam_id))
        isoeletrico.append([i, exam_id])
        remove.append(i)
        continue

    # resample
    recording = just_resample(recording, sample_Fs = 400, resample_Fs = 500)

    try: # noise
        with warnings.catch_warnings(record = True) as w:
            recording, info2save = slide_and_cut_beat_aligned(recording, 1, 5000, 500,
                                                        seg_with_r=False, beat_length=400)
            if w:
                # print('warning idx: {}, exam_id: {}'.format(i, exam_id))
                alertas.append([i, exam_id])
                remove.append(i)
                continue
    except:
        # print('error idx: {}, exam_id: {}'.format(i, exam_id))
        erros.append([i, exam_id])
        remove.append(i)
        continue

100%|██████████| 345797/345797 [1:50:45<00:00, 52.03it/s]  


In [10]:
len(isoeletrico), len(alertas), len(erros)

(688, 877, 112)

In [11]:
len(remove)

1677

In [12]:
with open('data/remove', 'wb') as fp:
    pickle.dump(remove, fp)

In [13]:
with open('data/remove', 'rb') as fp:
    lista = pickle.load(fp)
lista

[214,
 1015,
 1176,
 1565,
 1692,
 1753,
 2047,
 2171,
 2178,
 2238,
 2396,
 2465,
 2519,
 2655,
 2923,
 2933,
 3056,
 3123,
 3386,
 3557,
 3615,
 4120,
 4413,
 4553,
 6182,
 6190,
 6214,
 6234,
 6302,
 6481,
 6592,
 6749,
 6883,
 6888,
 6894,
 7585,
 7693,
 7712,
 8142,
 8238,
 8555,
 9381,
 9779,
 9839,
 9867,
 9931,
 9970,
 10378,
 10384,
 10472,
 10879,
 10905,
 11082,
 11488,
 11846,
 11928,
 12178,
 12307,
 12417,
 12487,
 12938,
 13436,
 13505,
 14294,
 14788,
 15043,
 15168,
 15304,
 15685,
 15756,
 15785,
 15851,
 15882,
 16630,
 16838,
 16939,
 16948,
 17040,
 17172,
 17250,
 17441,
 18086,
 18293,
 18433,
 18439,
 18589,
 19451,
 19717,
 19847,
 19909,
 20000,
 20477,
 20489,
 20579,
 20609,
 20906,
 21018,
 21595,
 22070,
 22371,
 22852,
 23113,
 23355,
 23440,
 23594,
 23711,
 23951,
 23953,
 24137,
 24217,
 24261,
 24565,
 24568,
 24663,
 24840,
 25031,
 25047,
 25455,
 25512,
 25700,
 25794,
 25982,
 26120,
 26577,
 26722,
 26727,
 26740,
 26769,
 26836,
 26886,
 27167,


In [14]:
code15.close()

# after

In [3]:
code15 = h5py.File('/home/josegfer/datasets/code/output/code15.h5', 'r')
code15['tracings'], code15['exam_id']

(<HDF5 dataset "tracings": shape (345797, 4096, 12), type "<f4">,
 <HDF5 dataset "exam_id": shape (345797,), type "<i8">)

In [4]:
metadata = pd.read_csv('/home/josegfer/datasets/code/data/exams.csv')
label_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']
metadata

Unnamed: 0,exam_id,age,is_male,nn_predicted_age,1dAVb,RBBB,LBBB,SB,ST,AF,patient_id,death,timey,normal_ecg,trace_file
0,1169160,38,True,40.160484,False,False,False,False,False,False,523632,False,2.098628,True,exams_part13.hdf5
1,2873686,73,True,67.059440,False,False,False,False,False,False,1724173,False,6.657529,False,exams_part13.hdf5
2,168405,67,True,79.621740,False,False,False,False,False,True,51421,False,4.282188,False,exams_part13.hdf5
3,271011,41,True,69.750260,False,False,False,False,False,False,1737282,False,4.038353,True,exams_part13.hdf5
4,384368,73,True,78.873460,False,False,False,False,False,False,331652,False,3.786298,False,exams_part13.hdf5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345774,1123951,33,True,35.893005,False,False,False,False,False,False,770553,False,2.189039,True,exams_part2.hdf5
345775,954704,73,False,68.169136,False,False,False,False,False,False,1044781,False,2.520546,False,exams_part2.hdf5
345776,589697,75,False,78.080810,False,False,False,False,False,False,1020589,False,3.304107,False,exams_part2.hdf5
345777,2780563,44,False,73.120636,False,False,False,False,False,False,178,False,7.339720,False,exams_part2.hdf5


In [5]:
indices, h5_indices, csv_indices = np.intersect1d(code15['exam_id'], metadata['exam_id'], assume_unique = True, return_indices = True)
indices, h5_indices, csv_indices

(array([      0,       0,       0, ..., 4416606, 4416611, 4416614]),
 array([ 20000,  40001,  60002, ..., 296146,  64497,  97636]),
 array([-305796, -285795, -265794, ...,  162700,  341804,   95898]))

In [6]:
with open('data/remove', 'rb') as fp:
    remove = pickle.load(fp)
len(remove), remove

(1677,
 [214,
  1015,
  1176,
  1565,
  1692,
  1753,
  2047,
  2171,
  2178,
  2238,
  2396,
  2465,
  2519,
  2655,
  2923,
  2933,
  3056,
  3123,
  3386,
  3557,
  3615,
  4120,
  4413,
  4553,
  6182,
  6190,
  6214,
  6234,
  6302,
  6481,
  6592,
  6749,
  6883,
  6888,
  6894,
  7585,
  7693,
  7712,
  8142,
  8238,
  8555,
  9381,
  9779,
  9839,
  9867,
  9931,
  9970,
  10378,
  10384,
  10472,
  10879,
  10905,
  11082,
  11488,
  11846,
  11928,
  12178,
  12307,
  12417,
  12487,
  12938,
  13436,
  13505,
  14294,
  14788,
  15043,
  15168,
  15304,
  15685,
  15756,
  15785,
  15851,
  15882,
  16630,
  16838,
  16939,
  16948,
  17040,
  17172,
  17250,
  17441,
  18086,
  18293,
  18433,
  18439,
  18589,
  19451,
  19717,
  19847,
  19909,
  20000,
  20477,
  20489,
  20579,
  20609,
  20906,
  21018,
  21595,
  22070,
  22371,
  22852,
  23113,
  23355,
  23440,
  23594,
  23711,
  23951,
  23953,
  24137,
  24217,
  24261,
  24565,
  24568,
  24663,
  24840,
  2503

In [10]:
remove_id = []
for h5_idx in tqdm(remove):
    exam_id = code15['exam_id'][h5_idx]
    remove_id.append(exam_id)

100%|██████████| 1677/1677 [00:00<00:00, 9426.70it/s]


In [11]:
with open('data/remove_id', 'wb') as fp:
    pickle.dump(remove_id, fp)