# setup

In [1]:
import os

if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('..')

In [98]:
import numpy as np
import h5py
import pandas as pd

from tqdm import tqdm

In [18]:
from data_loader.util import load_label_files, load_challenge_data, resample, slide_and_cut_beat_aligned, load_labels, load_weights

In [84]:
from scipy import signal

def just_resample(data, sample_Fs = 400, resample_Fs = 500):
    sample_len = data.shape[1]
    resample_len = int(sample_len * (resample_Fs / sample_Fs))
    resample_data = signal.resample(data, resample_len, axis=1, window=None)

    return resample_data

# draft

In [3]:
h5f = h5py.File('data/challenge2020.h5', 'r')

In [12]:
h5f['recording'], h5f['ratio'], h5f['label'], h5f['weight']

(<HDF5 dataset "recording": shape (42976, 12, 10, 400), type "<f8">,
 <HDF5 dataset "ratio": shape (42976, 1, 10), type "<f8">,
 <HDF5 dataset "label": shape (42976, 108), type "|b1">,
 <HDF5 dataset "weight": shape (42976, 108), type "<f8">)

In [14]:
h5f['ratio'][0], h5f['label'][0], h5f['weight'][0]

(array([[0.8825, 0.8825, 0.8825, 0.8825, 0.8825, 0.8825, 0.8825, 0.8825,
         0.8825, 0.8825]]),
 array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False, False]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [15]:
h5f.close()

# read

In [16]:
# Define the weights, the SNOMED CT code for the normal class, and equivalent SNOMED CT codes.
weights_file = 'weights.csv'
normal_class = '426783006'
equivalent_classes = [['713427006', '59118001'], ['284470004', '63593006'], ['427172004', '17338001']]

In [19]:
input_directory_label = '/home/josegfer/datasets/challenge2020/data'
label_dir = '/home/josegfer/datasets/challenge2020/data'
# Find the label files.
print('Finding label and output files...')
label_files = load_label_files(input_directory_label)

Finding label and output files...


In [45]:
print('Loading labels...')
classes, labels_onehot, labels = load_labels(label_files, normal_class,
                                                                    equivalent_classes)
[type(classes), len(classes)], [type(labels_onehot), labels_onehot.shape], [type(labels), len(labels)]

Loading labels...


([list, 108], [numpy.ndarray, (42976, 108)], [list, 42976])

In [46]:
print('Loading weights...')
weights = load_weights(weights_file, classes)
type(weights), weights.shape

Loading weights...
/home/josegfer/li2021bat


(numpy.ndarray, (108, 108))

In [47]:
### class for dataset
CPSC_classes = ['270492004', '164889003', '164909002', '284470004', '426783006',
                '713427006']  # "59118001" = "713427006"
CPSC_class_weight = np.zeros((108,))
for cla in CPSC_classes:
    CPSC_class_weight[classes.index(cla)] = 1
# CPSC_extra
CPSC_extra_excluded_classes = ['445118002', '39732003', '251146004', '698252002', '10370003', '164947007',
                                '111975006', '164917005', '47665007', '427393009', '426783006', '59931005']
CPSC_extra_class_weight = np.ones((108,))
for cla in CPSC_extra_excluded_classes:
    CPSC_extra_class_weight[classes.index(cla)] = 0
# PTB-XL
PTB_XL_excluded_classes = ['426627000', '427172004']  # , '17338001'
PTB_XL_class_weight = np.ones((108,))
for cla in PTB_XL_excluded_classes:
    PTB_XL_class_weight[classes.index(cla)] = 0
# G12ECG
G12ECG_excluded_classes = ['10370003', '164947007']
G12ECG_class_weight = np.ones((108,))
for cla in G12ECG_excluded_classes:
    G12ECG_class_weight[classes.index(cla)] = 0

# draft

In [48]:
num_files = len(label_files)
# num_files = 5
n_lead = 12
n_segment = 10
beat_length = 400
n_classes = len(classes)
num_files, n_classes

(42976, 108)

In [49]:
for i in tqdm(range(num_files)):
    break
    recording, header, name = load_challenge_data(label_files[i], label_dir)
    recording[np.isnan(recording)] = 0

    # divide ADC_gain and resample
    recording = resample(recording, header, 500)

    if name[0] == 'S' or name[0] == 'I': # PTB or St.P dataset
        pass
    elif name[0] == 'A': # CPSC
        class_weight = CPSC_class_weight
    elif name[0] == 'Q': # CPSC-extra
        class_weight = CPSC_extra_class_weight
    elif name[0] == 'H': # PTB-XL
        class_weight = PTB_XL_class_weight
    elif name[0] == 'E': # G12ECG
        class_weight = G12ECG_class_weight
    else:
        print('warning! not from one of the datasets')
        print(name)

    # slide and cut
    recording, info2save = slide_and_cut_beat_aligned(recording, 1, 5000, 500,
                                                    seg_with_r=False, beat_length=400)
    x[i, :, :, :] = np.transpose(recording, (0, 2, 1, 3))
    r[i, :, :] = info2save
    y[i, :] = labels_onehot[i]
    w[i, :] = class_weight

  0%|          | 0/42976 [00:00<?, ?it/s]


In [68]:
recording, header, name = load_challenge_data(label_files[i], label_dir)
name, header, type(recording), recording.shape

('A0001',
 ['A0001 12 500 7500\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 28 -1716 0 I\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 7 2029 0 II\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -21 3745 0 III\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -17 3680 0 aVR\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 24 -2664 0 aVL\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -7 -1499 0 aVF\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -290 390 0 V1\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -204 157 0 V2\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -96 -2555 0 V3\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -112 49 0 V4\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -596 -321 0 V5\n',
  'A0001.mat 16x1+24 1000.0(0)/mV 16 0 -16 -3112 0 V6\n',
  '# Age: 74\n',
  '# Sex: Male\n',
  '# Dx: 59118001\n',
  '# Rx: Unknown\n',
  '# Hx: Unknown\n',
  '# Sx: Unknown\n'],
 numpy.ndarray,
 (12, 7500))

In [59]:
recording[np.isnan(recording)] = 0
recording = resample(recording, header, 500)
recording.shape

(12, 7500)

In [61]:
if name[0] == 'S' or name[0] == 'I': # PTB or St.P dataset
    pass
elif name[0] == 'A': # CPSC
    class_weight = CPSC_class_weight
elif name[0] == 'Q': # CPSC-extra
    class_weight = CPSC_extra_class_weight
elif name[0] == 'H': # PTB-XL
    class_weight = PTB_XL_class_weight
elif name[0] == 'E': # G12ECG
    class_weight = G12ECG_class_weight
else:
    print('warning! not from one of the datasets')
    print(name)
class_weight

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0.])

In [69]:
recording, info2save = slide_and_cut_beat_aligned(recording, n_segment = 1, window_size = 5000, sampling_rate = 500,
                                                seg_with_r=False, beat_length=400)
recording.shape, info2save

((1, 10, 12, 400),
 array([[0.8825, 0.8825, 0.8825, 0.8825, 0.8825, 0.8825, 0.8825, 0.8825,
         0.8825, 0.8825]]))

# code15

In [101]:
code15 = h5py.File('/home/josegfer/datasets/code/output/code15.h5', 'r')
code15['tracings'], code15['exam_id']

(<HDF5 dataset "tracings": shape (345797, 4096, 12), type "<f4">,
 <HDF5 dataset "exam_id": shape (345797,), type "<i8">)

In [116]:
metadata = pd.read_csv('/home/josegfer/datasets/code/data/exams.csv')
label_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']
metadata

Unnamed: 0,exam_id,age,is_male,nn_predicted_age,1dAVb,RBBB,LBBB,SB,ST,AF,patient_id,death,timey,normal_ecg,trace_file
0,1169160,38,True,40.160484,False,False,False,False,False,False,523632,False,2.098628,True,exams_part13.hdf5
1,2873686,73,True,67.059440,False,False,False,False,False,False,1724173,False,6.657529,False,exams_part13.hdf5
2,168405,67,True,79.621740,False,False,False,False,False,True,51421,False,4.282188,False,exams_part13.hdf5
3,271011,41,True,69.750260,False,False,False,False,False,False,1737282,False,4.038353,True,exams_part13.hdf5
4,384368,73,True,78.873460,False,False,False,False,False,False,331652,False,3.786298,False,exams_part13.hdf5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345774,1123951,33,True,35.893005,False,False,False,False,False,False,770553,False,2.189039,True,exams_part2.hdf5
345775,954704,73,False,68.169136,False,False,False,False,False,False,1044781,False,2.520546,False,exams_part2.hdf5
345776,589697,75,False,78.080810,False,False,False,False,False,False,1020589,False,3.304107,False,exams_part2.hdf5
345777,2780563,44,False,73.120636,False,False,False,False,False,False,178,False,7.339720,False,exams_part2.hdf5


In [103]:
indices, h5_indices, csv_indices = np.intersect1d(code15['exam_id'], metadata['exam_id'], assume_unique = True, return_indices = True)
indices, h5_indices, csv_indices

(array([      0,       0,       0, ..., 4416606, 4416611, 4416614]),
 array([ 20000,  40001,  60002, ..., 296146,  64497,  97636]),
 array([-305796, -285795, -265794, ...,  162700,  341804,   95898]))

In [104]:
num_files = code15['tracings'].shape[0]
n_lead = 12
n_segment = 10
beat_length = 400
n_classes = 6

In [105]:
for i in tqdm(range(num_files)):
    break
    recording = code15['tracings'][i].T
    exam_id = code15['exam_id'][i]

    # resample
    recording = just_resample(recording, sample_Fs = 400, resample_Fs = 500)

    csv_idx = csv_indices[indices == exam_id]
    assert exam_id == metadata['exam_id'].loc[csv_idx].values[0]
    onehot = metadata[label_columns].loc[csv_idx].values

    # slide and cut
    scbeat, info2save = slide_and_cut_beat_aligned(recording, n_segment = 1, window_size = 5000, sampling_rate = 500, 
                                                   seg_with_r = False, beat_length = 400)
    x[i, :, :, :] = np.transpose(recording, (0, 2, 1, 3))
    r[i, :, :] = info2save
    y[i, :] = onehot
    id[i] = exam_id

  0%|          | 0/345797 [00:00<?, ?it/s]


In [106]:
# recording, header, name = load_challenge_data(label_files[i], label_dir)
recording = code15['tracings'][i].T
exam_id = code15['exam_id'][i]
type(recording), recording.shape, exam_id

(numpy.ndarray, (12, 4096), 590673)

In [107]:
recording = just_resample(recording, sample_Fs = 400, resample_Fs = 500)
recording.shape

(12, 5120)

In [124]:
csv_idx = csv_indices[indices == exam_id]
assert exam_id == metadata['exam_id'].loc[csv_idx].values[0]
onehot = metadata[label_columns].loc[csv_idx].values[0]
onehot

array([False, False, False, False, False, False])

In [90]:
scbeat, info2save = slide_and_cut_beat_aligned(recording, n_segment = 1, window_size = 5000, sampling_rate = 500, 
                                               seg_with_r = False, beat_length = 400)
scbeat.shape, info2save

((1, 10, 12, 400),
 array([[0.8275, 0.8275, 0.8275, 0.8275, 0.8275, 0.8275, 0.8275, 0.8275,
         0.8275, 0.8275]]))

In [125]:
code15.close()

# loop

In [184]:
code15 = h5py.File('/home/josegfer/datasets/code/output/code15.h5', 'r')

In [185]:
remove = []
for i in tqdm(range(num_files)):
    recording = code15['tracings'][i].T

    if (recording == 0).all(): # isoeletrico
        remove.append(i)
        continue
    
    try: # noise
        scbeat, info2save = slide_and_cut_beat_aligned(recording, n_segment = 1, window_size = 5000, sampling_rate = 500, 
                                                       seg_with_r = False, beat_length = 400)
    except:
        remove.append(i)
        continue

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  warn(
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  warn(
  warn(
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  return _methods._mean(a, axis=a

In [157]:
num_files = code15['tracings'].shape[0]
n_lead = 12
n_segment = 10
beat_length = 400
n_classes = 6

In [158]:
h5f = h5py.File('data/code15bat.h5', 'w')

x = h5f.create_dataset('recording', (num_files - len(remove), n_lead, n_segment, beat_length), dtype = code15['tracings'].dtype)
r = h5f.create_dataset('ratio', (num_files - len(remove), 1, n_segment), dtype='f8')
y = h5f.create_dataset('label', (num_files - len(remove), n_classes), dtype='bool')
# w = h5f.create_dataset('weight', (num_files, n_classes), dtype='f8')
id = h5f.create_dataset('exam_id', shape = (num_files - len(remove),), dtype = code15['exam_id'].dtype)

In [None]:
ii = 0
for i in tqdm(range(num_files)):
    if i in remove:
        continue
    recording = code15['tracings'][i].T
    exam_id = code15['exam_id'][i]

    # resample
    recording = just_resample(recording, sample_Fs = 400, resample_Fs = 500)

    csv_idx = csv_indices[indices == exam_id]
    assert exam_id == metadata['exam_id'].loc[csv_idx].values[0]
    onehot = metadata[label_columns].loc[csv_idx].values

    # slide and cut
    scbeat, info2save = slide_and_cut_beat_aligned(recording, n_segment = 1, window_size = 5000, sampling_rate = 500, 
                                                   seg_with_r = False, beat_length = 400)
    x[ii, :, :, :] = np.transpose(scbeat, (0, 2, 1, 3))
    r[ii, :, :] = info2save
    y[ii, :] = onehot
    id[ii] = exam_id
    ii += 1

In [183]:
code15.close()
h5f.close()