In [1]:
import numpy as np
import pandas as pd
import torch
import joblib
import h5py
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings; warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
CLASSES = ['NORM', 'MI']

In [2]:
data = np.load('./data/data_raw.npz')

In [11]:
data.keys()

KeysView(NpzFile './data/data_raw.npz' with keys: patient212/s0434_re, patient195/s0337lre, patient151/s0206_re, patient116/s0302lre, patient075/s0327lre...)

In [15]:
meta = pd.read_csv("./data/meta.csv", usecols=["patient", "record_id", "sig_name", "Reason_for_admission"])
meta

Unnamed: 0,patient,record_id,sig_name,Reason_for_admission
0,patient001,s0010_re,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
1,patient001,s0014lre,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
2,patient001,s0016lre,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
3,patient002,s0015lre,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
4,patient003,s0017lre,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
...,...,...,...,...
544,patient292,s0555_re,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
545,patient292,s0556_re,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
546,patient293,s0557_re,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction
547,patient293,s0558_re,"['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', ...",Myocardial infarction


In [26]:
meta['key'] = meta['patient'] + '/' + meta['record_id']
meta['key']

0      patient001/s0010_re
1      patient001/s0014lre
2      patient001/s0016lre
3      patient002/s0015lre
4      patient003/s0017lre
              ...         
544    patient292/s0555_re
545    patient292/s0556_re
546    patient293/s0557_re
547    patient293/s0558_re
548    patient294/s0559_re
Name: key, Length: 549, dtype: object

In [49]:
def get_label(d):
    if d == 'Myocardial infarction':
        return np.array([0, 1])
    elif d == 'Healthy control':
        return np.array([1, 0])
    else:
        return np.array([0, 0])


0      [0, 1]
1      [0, 1]
2      [0, 1]
3      [0, 1]
4      [0, 1]
        ...  
517    [0, 1]
518    [0, 1]
519    [0, 1]
520    [0, 1]
521    [0, 1]
Name: label, Length: 522, dtype: object

In [33]:
meta = meta.dropna().reset_index(drop=True)

In [24]:
LENGTH = 1000
def crop(signal):
    return signal[0:LENGTH, :]

def downsample(signal, ratio=5):
    # Signal of size (L, 12) to (L/ratio, 12)
    return signal[::ratio, :]

In [75]:
records = []
labels = []

for key in tqdm(meta['key']):
    signal = data[key]
    
    # Drop the last 3 channels:
    signal = signal[:, :12]
    assert signal.shape[1] == 12
    
    # Downsample x10
    signal = downsample(signal, 10)
    
    # Crop
    signal = crop(signal)
    assert signal.shape == (LENGTH, 12)
        
    records.append(signal)
    
    # Get label:
    d = meta['Reason_for_admission'][meta['key'] == key].values[0]
    labels.append(get_label(d))

X, y = np.array(records), np.array(labels)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 522/522 [00:15<00:00, 33.98it/s]


In [76]:
y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]])

In [77]:
assert X.shape[0], y.shape[0]
    
print(X.shape)

(522, 1000, 12)


In [78]:
### SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

assert X_train.shape[1] == 1000

In [80]:
y_test.shape

(173, 2)

In [81]:
y_test.sum(axis=0)/len(y_test)

array([0.16763006, 0.69364162])

In [12]:
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)
y_train = y_train.astype(np.float64)
y_test = y_test.astype(np.float64)


In [13]:
joblib.dump(X_train, './data/X_train.joblib')
joblib.dump(y_train, './data/y_train.joblib')
joblib.dump(X_test, './data/X_test.joblib')
joblib.dump(y_test, './data/y_test.joblib')

['./data/y_test.joblib']

In [11]:
# X_train = joblib.load('./data/X_train.joblib')
# X_test = joblib.load('./data/X_test.joblib')
# y_train = joblib.load('./data/y_train.joblib')
# y_test = joblib.load('./data/y_test.joblib')
