In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pickle as pkl
import torch
import random
import gc
from datetime import datetime
import psutil


'''PREFLIGHT SETUP'''
from functools import partial
print_flush = partial(print, flush=True)
torch.manual_seed(1)
random.seed(1)
np.random.seed(1)
'''PREFLIGHT SETUP'''

'PREFLIGHT SETUP'

In [2]:
AF_ECG = np.zeros((6230, 2400))
labeled_perc=0.1
AF_subsample_idx = np.random.choice(range(AF_ECG.shape[0]), int(labeled_perc * AF_ECG.shape[0]), replace=False)


In [7]:
class Dataset_whole_limited_labeled(Dataset):
    def __init__(self, data_path, split, labeled_perc=0.1):
        super().__init__()
        self.data_path = data_path
        self.split = split
        self.labeled_perc = labeled_perc
        self.build_dataset()
        self.length = len(self.all_labels)
#         self.AF_subsample_idx = None
#         self.PVC_subsample_idx = None
#         self.NSR_subsample_idx = None
#         self.AF_idxs = None
#         self.PVC_idxs = None
#         self.NSR_idxs = None
        

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        
        if idx < self.AF_ECG.shape[0]:
            return self.AF_ECG[idx][None, :], self.AF_PPG[idx][None, :], self.AF_ECG[self.AF_idxs[idx]][None, :], self.AF_PPG[self.AF_idxs[idx]][None, :], self.all_labels[idx]
        elif idx >= self.AF_ECG.shape[0] and idx < (self.AF_ECG.shape[0] + self.PVC_ECG.shape[0]):
            offset = self.AF_ECG.shape[0]
            return self.PVC_ECG[idx-offset][None, :], self.PVC_PPG[idx-offset][None, :], self.PVC_ECG[self.PVC_idxs[idx-offset]][None, :], self.PVC_PPG[self.PVC_idxs[idx-offset]][None, :], self.all_labels[idx]
        else:
            offset = self.AF_ECG.shape[0] + self.PVC_ECG.shape[0]
            return self.NSR_ECG[idx-offset][None, :], self.NSR_PPG[idx-offset][None, :], self.NSR_ECG[self.NSR_idxs[idx-offset]][None, :], self.NSR_PPG[self.NSR_idxs[idx-offset]][None, :], self.all_labels[idx]

    def build_dataset(self):
        tstart = datetime.now()
        print_flush(f'\tloading... {self.split}')
        
        self.AF_ECG = torch.from_numpy(np.load(f'{self.data_path}/AF_v5/{self.split}_ECG_resampled2400.npy')).float()
        self.AF_PPG = torch.from_numpy(np.load(f'{self.data_path}/AF_v5/{self.split}_PPG_resampled2400.npy')).float()
        print_flush(f'\tAF loaded {self.split}')
        self.PVC_ECG = torch.from_numpy(np.load(f'{self.data_path}/PVC_v5/{self.split}_ECG_resampled2400.npy')).float()
        self.PVC_PPG = torch.from_numpy(np.load(f'{self.data_path}/PVC_v5/{self.split}_PPG_resampled2400.npy')).float()
        print_flush(f'\tPVC loaded {self.split}')
        self.NSR_ECG = torch.from_numpy(np.load(f'{self.data_path}/NSR_v5/{self.split}_ECG_resampled2400.npy')).float()
        self.NSR_PPG = torch.from_numpy(np.load(f'{self.data_path}/NSR_v5/{self.split}_PPG_resampled2400.npy')).float()
        print_flush(f'\tNSR loaded {self.split}')

        assert self.AF_ECG.shape == self.AF_PPG.shape
        assert self.NSR_ECG.shape == self.NSR_PPG.shape
        assert self.PVC_ECG.shape == self.PVC_PPG.shape

        print_flush(f'\tloading {self.split} finished t={datetime.now() - tstart}, mem used={psutil.virtual_memory()[3]/1000000000}')

        self.AF_subsample_idx = np.random.choice(range(self.AF_ECG.shape[0]), int(self.labeled_perc * self.AF_ECG.shape[0]), replace=False)
        self.PVC_subsample_idx = np.random.choice(range(self.PVC_ECG.shape[0]), int(self.labeled_perc * self.PVC_ECG.shape[0]), replace=False)
        self.NSR_subsample_idx = np.random.choice(range(self.NSR_ECG.shape[0]), int(self.labeled_perc * self.NSR_ECG.shape[0]), replace=False)

        if self.labeled_perc != 1:

            self.AF_idxs = np.repeat(self.AF_subsample_idx, int(1/self.labeled_perc)+1, axis=0)[:self.AF_ECG.shape[0]]
            self.PVC_idxs = np.repeat(self.PVC_subsample_idx, int(1/self.labeled_perc)+1, axis=0)[:self.PVC_ECG.shape[0]]
            self.NSR_idxs = np.repeat(self.NSR_subsample_idx, int(1/self.labeled_perc)+1, axis=0)[:self.NSR_ECG.shape[0]]

        else:
            self.AF_idxs = np.arange(self.AF_ECG.shape[0])
            self.PVC_idxs = np.arange(self.PVC_ECG.shape[0])
            self.NSR_idxs = np.arange(self.NSR_ECG.shape[0])

        AF_labels = np.ones((self.AF_ECG.shape[0]))
        NSR_labels = np.zeros((self.PVC_ECG.shape[0] + self.NSR_ECG.shape[0]))
        self.all_labels = torch.from_numpy(np.concatenate((AF_labels, NSR_labels), axis=0)).long()

        print(f'dataset built AF counts {len(AF_labels)}, NSR counts {len(NSR_labels)}, total counts {len(self.all_labels)}', flush=True)
        print(self.AF_idxs.shape, self.PVC_idxs.shape, self.NSR_idxs.shape, flush=True)
        print('Labeled perc check:', flush=True)
        print(len(np.unique(self.AF_idxs)) / self.AF_ECG.shape[0], flush=True)
        print(len(np.unique(self.PVC_idxs)) / self.PVC_ECG.shape[0], flush=True)
        print(len(np.unique(self.NSR_idxs)) / self.NSR_ECG.shape[0], flush=True)

In [8]:
train_dataset = Dataset_whole_limited_labeled('/labs/hulab/stark_stuff/ppg_ecg_project/data/', split='train', labeled_perc=0.1)


	loading... train
	AF loaded train
	PVC loaded train
	NSR loaded train
	loading train finished t=0:02:53.560513, mem used=230.94325248
dataset built AF counts 2757888, NSR counts 3014334, total counts 5772222
(2757888,) (1411158,) (1603176,)
Labeled perc check:
0.09090942054209598
0.09090973512533677
0.0909095445540602


In [9]:
1411158 + 1603176

3014334