# setup

In [1]:
import os

# When the kernel starts inside the ``notebooks`` folder, move one level up.
# Presumably this is so the ``data_loader`` import and the relative paths used
# later ('weights.csv', 'data/...') resolve from the project root.
# os.path.basename is used instead of splitting on '/' so the check also works
# where the path separator is not '/'.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

In [2]:
import numpy as np
import h5py

from tqdm import tqdm

In [3]:
from data_loader.util import load_label_files, load_challenge_data, resample, slide_and_cut_beat_aligned, load_labels, load_weights

# Read labels and scoring weights

In [4]:
# Scoring configuration.
# File that is handed to load_weights further down.
weights_file = 'weights.csv'
# SNOMED CT code for the normal class.
normal_class = '426783006'
# Groups of SNOMED CT codes that are considered equivalent to each other.
equivalent_classes = [
    ['713427006', '59118001'],
    ['284470004', '63593006'],
    ['427172004', '17338001'],
]

In [5]:
# Directory holding the challenge data. The default is the path used when this
# notebook was originally run; set the CHALLENGE2020_DATA_DIR environment
# variable to point at another copy of the data without editing the notebook.
input_directory_label = os.environ.get(
    'CHALLENGE2020_DATA_DIR', '/home/josegfer/datasets/challenge2020/data')
# The recordings are loaded from the same directory as the label files.
label_dir = input_directory_label
# Find the label files.
print('Finding label and output files...')
label_files = load_label_files(input_directory_label)

Finding label and output files...


In [6]:
print('Loading labels...')
# load_labels returns three values. Later cells look SNOMED CT codes up in
# `classes` via classes.index(code) and read one row of `labels_onehot` per
# label file.
classes, labels_onehot, labels = load_labels(
    label_files,
    normal_class,
    equivalent_classes,
)

Loading labels...


In [7]:
print('Loading weights...')
# Load the weights from `weights_file` for the classes returned by load_labels.
# NOTE(review): presumably a class-by-class weight matrix (the next cell
# reduces it along axis 0) -- confirm against data_loader.util.load_weights.
weights = load_weights(weights_file, classes)

Loading weights...
/home/josegfer/li2021bat


In [8]:
# Boolean mask over the weight matrix columns: True where a column contains at
# least one non-zero entry, i.e. the class appears in the weight matrix.
indices = np.any(weights, axis=0)
# Element-wise complement of the mask above.
indices_unscored = np.logical_not(indices)

In [9]:
### Per-dataset class weights
# Every source dataset gets a 0/1 vector with one entry per element of
# `classes`; the conversion loop stores the vector of the matching source in
# the 'weight' dataset for each recording.
# The vector length is taken from len(classes) instead of a hard-coded 108 so
# that it always agrees with the (num_files, n_classes) 'weight' dataset.


def _class_weight(codes, base, value):
    """Build a float64 vector of length len(classes).

    Every entry starts as `base`; the entries belonging to the SNOMED CT
    `codes` are then set to `value`. The position of a code is found with
    classes.index(code), which raises if the code is not in `classes`.
    """
    weight = np.full((len(classes),), base, dtype='f8')
    for code in codes:
        weight[classes.index(code)] = value
    return weight


# CPSC: only the listed classes get weight 1, everything else 0.
CPSC_classes = ['270492004', '164889003', '164909002', '284470004', '426783006',
                '713427006']  # "59118001" = "713427006"
CPSC_class_weight = _class_weight(CPSC_classes, 0, 1)
# CPSC_extra: all classes get weight 1 except the excluded ones.
CPSC_extra_excluded_classes = ['445118002', '39732003', '251146004', '698252002', '10370003', '164947007',
                                '111975006', '164917005', '47665007', '427393009', '426783006', '59931005']
CPSC_extra_class_weight = _class_weight(CPSC_extra_excluded_classes, 1, 0)
# PTB-XL: all classes get weight 1 except the excluded ones.
PTB_XL_excluded_classes = ['426627000', '427172004']  # , '17338001'
PTB_XL_class_weight = _class_weight(PTB_XL_excluded_classes, 1, 0)
# G12ECG: all classes get weight 1 except the excluded ones.
G12ECG_excluded_classes = ['10370003', '164947007']
G12ECG_class_weight = _class_weight(G12ECG_excluded_classes, 1, 0)

# Convert every recording and write it to HDF5

In [10]:
# Sizes used for the HDF5 datasets created in the next cell.
n_classes = len(classes)      # one label / weight column per class
num_files = len(label_files)  # one row per label file (a small value such as 5 gives a quick dry run)
n_lead = 12
n_segment = 10
beat_length = 400

In [11]:
# Make sure the output folder exists: opening 'data/challenge2020.h5' for
# writing fails when the 'data' directory is missing.
os.makedirs('data', exist_ok=True)
# Mode 'w' creates the file and overwrites an existing one. The handle stays
# open while the loop fills the datasets and is closed in the last cell.
h5f = h5py.File('data/challenge2020.h5', 'w')
# One row per label file in every dataset.
x = h5f.create_dataset('recording', (num_files, n_lead, n_segment, beat_length), dtype='f8')
r = h5f.create_dataset('ratio', (num_files, 1, n_segment), dtype='f8')
y = h5f.create_dataset('label', (num_files, n_classes), dtype='bool')
w = h5f.create_dataset('weight', (num_files, n_classes), dtype='f8')

In [12]:
# Class-weight vector of each source dataset, keyed by the first character of
# the record name.
dataset_class_weight = {
    'A': CPSC_class_weight,        # CPSC
    'Q': CPSC_extra_class_weight,  # CPSC-extra
    'H': PTB_XL_class_weight,      # PTB-XL
    'E': G12ECG_class_weight,      # G12ECG
}
# Weight for records without a dataset-specific vector ('S' / 'I' = PTB or
# St.P dataset, and unrecognised names). Previously `class_weight` was simply
# not assigned for these records, so whatever vector the preceding record had
# used was written for them (or a NameError was raised on the first record).
# NOTE(review): all-ones (no class excluded) is assumed to be the intended
# weight for these records -- confirm.
default_class_weight = np.ones((n_classes,))

for i in tqdm(range(num_files)):
    recording, header, name = load_challenge_data(label_files[i], label_dir)
    # Replace NaN samples with 0 before any further processing.
    recording[np.isnan(recording)] = 0

    # divide ADC_gain and resample
    recording = resample(recording, header, 500)

    prefix = name[0]
    if prefix in dataset_class_weight:
        class_weight = dataset_class_weight[prefix]
    else:
        class_weight = default_class_weight
        if prefix not in ('S', 'I'):  # 'S' / 'I': PTB or St.P dataset
            print('warning! not from one of the datasets')
            print(name)

    # slide and cut
    recording, info2save = slide_and_cut_beat_aligned(recording, 1, 5000, 500,
                                                      seg_with_r=False, beat_length=beat_length)
    # Swap axes 1 and 2 of the 4-D result before storing it.
    # NOTE(review): presumably (1, segment, lead, beat) -> (1, lead, segment, beat),
    # to match the 'recording' dataset layout -- confirm.
    x[i, :, :, :] = np.transpose(recording, (0, 2, 1, 3))
    r[i, :, :] = info2save
    y[i, :] = labels_onehot[i]
    w[i, :] = class_weight

100%|██████████| 42976/42976 [26:39<00:00, 26.87it/s]   


In [13]:
# Close the HDF5 file that was opened for writing before the loop.
h5f.close()