# setup

In [1]:
import os

if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('..')

In [2]:
import numpy as np
import h5py
import pandas as pd
import pickle

from tqdm import tqdm

In [3]:
from data_loader.util import slide_and_cut_beat_aligned

In [4]:
from scipy import signal

def just_resample(data, sample_Fs = 400, resample_Fs = 500):
    sample_len = data.shape[1]
    resample_len = int(sample_len * (resample_Fs / sample_Fs))
    resample_data = signal.resample(data, resample_len, axis=1, window=None)

    return resample_data

# read

In [5]:
code15 = h5py.File('/home/josegfer/datasets/code/output/code15.h5', 'r')
code15['tracings'], code15['exam_id']

(<HDF5 dataset "tracings": shape (345797, 4096, 12), type "<f4">,
 <HDF5 dataset "exam_id": shape (345797,), type "<i8">)

In [6]:
metadata = pd.read_csv('/home/josegfer/datasets/code/data/exams.csv')
label_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']
metadata

Unnamed: 0,exam_id,age,is_male,nn_predicted_age,1dAVb,RBBB,LBBB,SB,ST,AF,patient_id,death,timey,normal_ecg,trace_file
0,1169160,38,True,40.160484,False,False,False,False,False,False,523632,False,2.098628,True,exams_part13.hdf5
1,2873686,73,True,67.059440,False,False,False,False,False,False,1724173,False,6.657529,False,exams_part13.hdf5
2,168405,67,True,79.621740,False,False,False,False,False,True,51421,False,4.282188,False,exams_part13.hdf5
3,271011,41,True,69.750260,False,False,False,False,False,False,1737282,False,4.038353,True,exams_part13.hdf5
4,384368,73,True,78.873460,False,False,False,False,False,False,331652,False,3.786298,False,exams_part13.hdf5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345774,1123951,33,True,35.893005,False,False,False,False,False,False,770553,False,2.189039,True,exams_part2.hdf5
345775,954704,73,False,68.169136,False,False,False,False,False,False,1044781,False,2.520546,False,exams_part2.hdf5
345776,589697,75,False,78.080810,False,False,False,False,False,False,1020589,False,3.304107,False,exams_part2.hdf5
345777,2780563,44,False,73.120636,False,False,False,False,False,False,178,False,7.339720,False,exams_part2.hdf5


In [7]:
with open('data/remove_id', 'rb') as fp:
    remove = pickle.load(fp)
len(remove), remove

(1677,
 [1571227,
  139970,
  1420381,
  137793,
  774079,
  665629,
  3139585,
  957074,
  956212,
  1255590,
  3064998,
  712255,
  3066387,
  2945735,
  1104277,
  911536,
  1024570,
  158087,
  526680,
  650732,
  3068746,
  1174091,
  1614004,
  274029,
  3028402,
  490520,
  776733,
  3628183,
  412505,
  2842957,
  1588966,
  1141497,
  1524922,
  1382026,
  2904898,
  1434717,
  1578599,
  1668407,
  407959,
  333677,
  1355703,
  1236298,
  2858070,
  1367672,
  4415239,
  2980705,
  2851729,
  2663590,
  645777,
  296441,
  1197076,
  1295955,
  94078,
  775190,
  1230958,
  921078,
  739322,
  3213164,
  1289725,
  1471630,
  4244783,
  596099,
  3156116,
  1428647,
  725206,
  1290443,
  940003,
  644376,
  969433,
  763678,
  4411125,
  2938128,
  2742846,
  234435,
  1032192,
  2760887,
  855104,
  389984,
  945491,
  1319342,
  2744641,
  188075,
  53197,
  3211347,
  2857247,
  130315,
  776873,
  4235283,
  1648108,
  590389,
  0,
  116904,
  2521296,
  3092259,
  3411

In [8]:
metadata_clean = metadata.copy()
for exam_id in tqdm(remove):
    metadata_clean = metadata_clean.drop(index = metadata_clean[metadata_clean['exam_id'] == exam_id].index)

100%|██████████| 1677/1677 [00:33<00:00, 50.18it/s]


In [9]:
indices, h5_indices, csv_indices = np.intersect1d(code15['exam_id'], metadata_clean['exam_id'], assume_unique = True, return_indices = True)
indices, h5_indices, csv_indices

(array([      0,       0,       0, ..., 4416606, 4416611, 4416614]),
 array([ 20000,  40001,  60002, ..., 296146,  64497,  97636]),
 array([-305796, -285795, -265794, ...,  161929,  340164,   95447]))

In [10]:
output = []
for idx, row in tqdm(metadata_clean.iterrows()):
    exam_id = row['exam_id']
    h5_idx = h5_indices[indices == exam_id]
    output.append(h5_idx[0])
metadata_clean.insert(len(metadata_clean.columns), 'h5_idx', output)
metadata_clean = metadata_clean.reset_index(drop = True)

344120it [00:56, 6114.37it/s]


In [11]:
metadata_clean

Unnamed: 0,exam_id,age,is_male,nn_predicted_age,1dAVb,RBBB,LBBB,SB,ST,AF,patient_id,death,timey,normal_ecg,trace_file,h5_idx
0,1169160,38,True,40.160484,False,False,False,False,False,False,523632,False,2.098628,True,exams_part13.hdf5,270517
1,2873686,73,True,67.059440,False,False,False,False,False,False,1724173,False,6.657529,False,exams_part13.hdf5,270518
2,168405,67,True,79.621740,False,False,False,False,False,True,51421,False,4.282188,False,exams_part13.hdf5,270519
3,271011,41,True,69.750260,False,False,False,False,False,False,1737282,False,4.038353,True,exams_part13.hdf5,270520
4,384368,73,True,78.873460,False,False,False,False,False,False,331652,False,3.786298,False,exams_part13.hdf5,270521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344115,1123951,33,True,35.893005,False,False,False,False,False,False,770553,False,2.189039,True,exams_part2.hdf5,53847
344116,954704,73,False,68.169136,False,False,False,False,False,False,1044781,False,2.520546,False,exams_part2.hdf5,53848
344117,589697,75,False,78.080810,False,False,False,False,False,False,1020589,False,3.304107,False,exams_part2.hdf5,53849
344118,2780563,44,False,73.120636,False,False,False,False,False,False,178,False,7.339720,False,exams_part2.hdf5,53850


# loop

In [12]:
# num_files = code15['tracings'].shape[0]
num_files = len(metadata_clean)
n_lead = 12
n_segment = 10
beat_length = 400
n_classes = 6

In [13]:
h5f = h5py.File('data/code15bat.h5', 'w')

# x = h5f.create_dataset('recording', (num_files - len(remove), n_lead, n_segment, beat_length), dtype = code15['tracings'].dtype)
# r = h5f.create_dataset('ratio', (num_files - len(remove), 1, n_segment), dtype='f8')
# y = h5f.create_dataset('label', (num_files - len(remove), n_classes), dtype='bool')
# # w = h5f.create_dataset('weight', (num_files, n_classes), dtype='f8')
# id = h5f.create_dataset('exam_id', shape = (num_files - len(remove),), dtype = code15['exam_id'].dtype)
x = h5f.create_dataset('recording', (num_files, n_lead, n_segment, beat_length), dtype = code15['tracings'].dtype)
r = h5f.create_dataset('ratio', (num_files, 1, n_segment), dtype = 'f8')
y = h5f.create_dataset('label', (num_files, n_classes), dtype = 'bool')
id = h5f.create_dataset('exam_id', shape = (num_files, ), dtype = code15['exam_id'].dtype)

In [14]:
# ii = 0
# for i in tqdm(range(num_files)):
#     if i in remove:
#         continue
#     recording = code15['tracings'][i].T
#     exam_id = code15['exam_id'][i]

#     # resample
#     recording = just_resample(recording, sample_Fs = 400, resample_Fs = 500)

#     csv_idx = csv_indices[indices == exam_id]
#     assert exam_id == metadata['exam_id'].loc[csv_idx].values[0]
#     onehot = metadata[label_columns].loc[csv_idx].values

#     # slide and cut
#     scbeat, info2save = slide_and_cut_beat_aligned(recording, n_segment = 1, window_size = 5000, sampling_rate = 500, 
#                                                    seg_with_r = False, beat_length = 400)
#     x[ii, :, :, :] = np.transpose(scbeat, (0, 2, 1, 3))
#     r[ii, :, :] = info2save
#     y[ii, :] = onehot
#     id[ii] = exam_id
#     ii += 1

In [15]:
for idx, row in tqdm(metadata_clean.iterrows()):
    h5_idx = row['h5_idx']
    recording = code15['tracings'][h5_idx].T
    onehot = row[label_columns].to_numpy(dtype = 'bool')
    exam_id = code15['exam_id'][h5_idx]
    assert exam_id == row['exam_id']

    # resample
    recording = just_resample(recording, sample_Fs = 400, resample_Fs = 500)
    # slide and cut
    scbeat, info2save = slide_and_cut_beat_aligned(recording, n_segment = 1, window_size = 5000, sampling_rate = 500, 
                                                   seg_with_r = False, beat_length = 400)
    
    x[idx, :, :, :] = np.transpose(scbeat, (0, 2, 1, 3))
    r[idx, :, :] = info2save
    y[idx, :] = onehot
    id[idx] = exam_id

344120it [2:05:37, 45.66it/s]


In [16]:
code15.close()
h5f.close()