## Data Formatting
- 1분 단위 Signal Segmentation (인접 window 간 10% overlap)
- Stress Class는 0(baseline, amusement, meditation)과 1(stress)의 binary classification으로 Formulation
- train, val, test split은 각각 10명, 2명, 3명의 Signal record로 지정

In [1]:
import glob
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import splev, splrep
from sklearn.model_selection import train_test_split
from tqdm import tqdm

- 필요 함수

In [2]:
def load_pickle(path):
    """Load a pickled object from ``path``, decoding legacy strings as latin-1.

    The ``latin1`` encoding lets 8-bit strings written by Python 2 (including
    pickled NumPy arrays) be read back under Python 3.

    Args:
        path: Location of the pickle file on disk.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code - only use this on files
    # that come from a trusted source.
    with open(path, 'rb') as file:  # pickles are binary data
        return pickle.load(file, encoding='latin1')

In [3]:
def interp_spline(ecg, step=1, k=3):
    """Resample a 1-D signal by evaluating a B-spline fitted through it.

    A degree-``k`` spline is fitted to ``ecg`` at the integer positions
    ``0 .. len(ecg) - 1`` and then evaluated at positions spaced
    ``len(ecg) / step`` apart, starting at 0.

    Args:
        ecg: Signal samples; only ``ecg.shape[0]`` and the values are used.
        step: Controls the output spacing (``len(ecg) / step``), so the result
            holds roughly ``step`` samples. With the default of 1 only the
            value at position 0 is returned.
        k: Degree of the spline passed to ``splrep``.

    Returns:
        Array with the spline evaluated at the new positions.
    """
    n_samples = ecg.shape[0]
    x_new = np.arange(0, n_samples, n_samples / step)
    interp_spline_method = splrep(np.arange(0, n_samples, 1), ecg, k=k)

    return splev(x_new, interp_spline_method)

In [4]:
def window(a, w = 4, o = 2, copy = False):
    """Cut a 1-D array into windows of ``w`` samples taken every ``o`` samples.

    Args:
        a: 1-D NumPy array to segment.
        w: Number of samples per window.
        o: Hop between the start positions of consecutive windows (every
            ``o``-th sliding window is kept), i.e. not the overlap itself.
        copy: When true, return an independent copy instead of a strided view.

    Returns:
        2-D array with one window per row. Unless ``copy`` is set the rows
        share memory with ``a``.
    """
    # Build every stride-1 sliding window as a zero-copy view, then keep each
    # o-th row.
    n_sliding = a.size - w + 1
    sliding = np.lib.stride_tricks.as_strided(
        a, shape=(n_sliding, w), strides=a.strides + a.strides
    )
    hopped = sliding[::o]
    return hopped.copy() if copy else hopped

- load subjects list

In [5]:
# Collect the entry names found directly under the WESAD data folder.
# os.path.basename works with either path separator, and sorting makes the
# order (which the index-based split below relies on) independent of the
# file system.
wesad_subjects = sorted(
    os.path.basename(subject_path)
    for subject_path in glob.glob('../00_Data/01_PPG2ECG/01_Original/04_WESAD/*')
)
# Drop the last entry - presumably a non-subject file (e.g. a readme) that
# sorts after the subject folders; TODO confirm against the data folder.
wesad_subjects = wesad_subjects[:-1]

- train, val, test subject split

In [7]:
# Positions inside wesad_subjects of the held-out validation and test subjects.
val_idx, test_idx = np.array((0, 1)), np.array((12, 13, 14))
(val_idx, test_idx)

(array([0, 1]), array([12, 13, 14]))

In [8]:
# Pick the validation and test subjects by position; every remaining subject
# is used for training.
val_subs = [wesad_subjects[idx] for idx in val_idx]
test_subs = [wesad_subjects[idx] for idx in test_idx]

# Filter instead of taking a set difference so that train_subs keeps the
# order of wesad_subjects and is identical from run to run.
held_out_subs = set(val_subs) | set(test_subs)
train_subs = [sub for sub in wesad_subjects if sub not in held_out_subs]

In [9]:
# Display the three subject lists to check the split.
train_subs, val_subs, test_subs

(['S17', 'S5', 'S3', 'S15', 'S14', 'S16', 'S13', 'S6', 'S4', 'S2'],
 ['S10', 'S11'],
 ['S7', 'S8', 'S9'])

- binary class mapping dict

In [10]:
# Map the original label ids 1-4 onto the binary target: id 2 becomes class 1
# (stress), ids 1, 3 and 4 become class 0.
# NOTE(review): ids 1/3/4 are presumably baseline/amusement/meditation in
# WESAD - confirm against the dataset readme.
binary_cls_map_dict = {1: 0, 2: 1, 3: 0, 4: 0}

- train dataset binary classification processing

In [11]:
# Cut every training subject's recording into windows, attach a binary label
# to each window and save each kept window as its own .npy file.
sampling_time = 60   # window length in seconds
overlap_ratio = 0.1  # fraction of a window shared with the next one
save_prefix = '../00_Data/04_Stress_Classification/01_train/'

# Sampling rates (samples per second) of the three streams.
ecg_sig_fs = 700
ppg_sig_fs = 64
label_sig_fs = 700

# Window length in samples for each stream.
ecg_target_frequency = ecg_sig_fs * sampling_time
ppg_target_frequency = ppg_sig_fs * sampling_time
label_target_frequency = label_sig_fs * sampling_time

# Despite the name these are hop sizes: the distance in samples between the
# starts of two consecutive windows (window length minus the overlapping part).
ecg_overlap_frequency = ecg_target_frequency - round(overlap_ratio * ecg_target_frequency)
ppg_overlap_frequency = ppg_target_frequency - round(overlap_ratio * ppg_target_frequency)
label_overlap_frequency = label_target_frequency - round(overlap_ratio * label_target_frequency)

for train_sub in tqdm(train_subs, total=len(train_subs)):
    data_path = '../00_Data/01_PPG2ECG/01_Original/04_WESAD/' + train_sub + '/' + train_sub + '.pkl'

    # load pickle
    data_dict = load_pickle(data_path)

    ecg_ori_sig = data_dict['signal']['chest']['ECG']
    ppg_ori_sig = data_dict['signal']['wrist']['BVP']
    label_ori_sig = data_dict['label']

    # windowing (the signals are 2-D, so column 0 is taken; labels are 1-D)
    ppg_seg_result = window(a=ppg_ori_sig[:,0], w=ppg_target_frequency, o=ppg_overlap_frequency)
    ecg_seg_result = window(a=ecg_ori_sig[:,0], w=ecg_target_frequency, o=ecg_overlap_frequency)
    label_seg_result = window(a=label_ori_sig, w=label_target_frequency, o=label_overlap_frequency)

    # The streams are windowed independently, so their window counts may
    # differ; only use indices that exist in all three.
    n_segments = min(len(ppg_seg_result), len(ecg_seg_result), len(label_seg_result))

    for i in range(n_segments):
        label_count = pd.Series(label_seg_result[i]).value_counts()
        label_index = np.array(label_count.index)
        label_count_num = label_count.values

        # Most frequent label id inside this window.
        major_label = label_index[np.argsort(label_count_num)[-1]]

        # Windows dominated by id 0 or ids >= 5 have no binary class and are
        # skipped.
        if major_label == 0 or major_label >= 5:
            continue
        label = binary_cls_map_dict[major_label]

        seg_dict = {
            'ECG': {
                'sig': ecg_seg_result[i],
                'sig_fs': ecg_sig_fs,
                'sig_time': sampling_time,
                'sig_len': len(ecg_seg_result[i]),
                'sig_info': 'Single',
                'units': None,
            },
            'PPG': {
                'sig': ppg_seg_result[i],
                'sig_fs': ppg_sig_fs,
                'sig_time': sampling_time,
                'sig_len': len(ppg_seg_result[i]),
                'units': None,
            },
            'label': label,
        }

        # File name: <subject>_<zero-padded window index>.npy
        save_filename = train_sub + '_' + str(i).zfill(3) + '.npy'
        save_path = save_prefix + save_filename
        np.save(save_path, seg_dict)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:39<00:00,  3.98s/it]


- val dataset

In [12]:
# Cut every validation subject's recording into windows, attach a binary label
# to each window and save each kept window as its own .npy file.
sampling_time = 60   # window length in seconds
overlap_ratio = 0.1  # fraction of a window shared with the next one
save_prefix = '../00_Data/04_Stress_Classification/02_val/'

# Sampling rates (samples per second) of the three streams.
ecg_sig_fs = 700
ppg_sig_fs = 64
label_sig_fs = 700

# Window length in samples for each stream.
ecg_target_frequency = ecg_sig_fs * sampling_time
ppg_target_frequency = ppg_sig_fs * sampling_time
label_target_frequency = label_sig_fs * sampling_time

# Despite the name these are hop sizes: the distance in samples between the
# starts of two consecutive windows (window length minus the overlapping part).
ecg_overlap_frequency = ecg_target_frequency - round(overlap_ratio * ecg_target_frequency)
ppg_overlap_frequency = ppg_target_frequency - round(overlap_ratio * ppg_target_frequency)
label_overlap_frequency = label_target_frequency - round(overlap_ratio * label_target_frequency)

for val_sub in tqdm(val_subs, total=len(val_subs)):
    data_path = '../00_Data/01_PPG2ECG/01_Original/04_WESAD/' + val_sub + '/' + val_sub + '.pkl'

    # load pickle
    data_dict = load_pickle(data_path)

    ecg_ori_sig = data_dict['signal']['chest']['ECG']
    ppg_ori_sig = data_dict['signal']['wrist']['BVP']
    label_ori_sig = data_dict['label']

    # windowing (the signals are 2-D, so column 0 is taken; labels are 1-D)
    ppg_seg_result = window(a=ppg_ori_sig[:,0], w=ppg_target_frequency, o=ppg_overlap_frequency)
    ecg_seg_result = window(a=ecg_ori_sig[:,0], w=ecg_target_frequency, o=ecg_overlap_frequency)
    label_seg_result = window(a=label_ori_sig, w=label_target_frequency, o=label_overlap_frequency)

    # The streams are windowed independently, so their window counts may
    # differ; only use indices that exist in all three.
    n_segments = min(len(ppg_seg_result), len(ecg_seg_result), len(label_seg_result))

    for i in range(n_segments):
        label_count = pd.Series(label_seg_result[i]).value_counts()
        label_index = np.array(label_count.index)
        label_count_num = label_count.values

        # Most frequent label id inside this window.
        major_label = label_index[np.argsort(label_count_num)[-1]]

        # Windows dominated by id 0 or ids >= 5 have no binary class and are
        # skipped.
        if major_label == 0 or major_label >= 5:
            continue
        label = binary_cls_map_dict[major_label]

        seg_dict = {
            'ECG': {
                'sig': ecg_seg_result[i],
                'sig_fs': ecg_sig_fs,
                'sig_time': sampling_time,
                'sig_len': len(ecg_seg_result[i]),
                'sig_info': 'Single',
                'units': None,
            },
            'PPG': {
                'sig': ppg_seg_result[i],
                'sig_fs': ppg_sig_fs,
                'sig_time': sampling_time,
                'sig_len': len(ppg_seg_result[i]),
                'units': None,
            },
            'label': label,
        }

        # File name: <subject>_<zero-padded window index>.npy
        save_filename = val_sub + '_' + str(i).zfill(3) + '.npy'
        save_path = save_prefix + save_filename
        np.save(save_path, seg_dict)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.49s/it]


- testset

In [13]:
# Cut every test subject's recording into windows, attach a binary label to
# each window and save each kept window as its own .npy file.
sampling_time = 60   # window length in seconds
overlap_ratio = 0.1  # fraction of a window shared with the next one
save_prefix = '../00_Data/04_Stress_Classification/03_test/'

# Sampling rates (samples per second) of the three streams.
ecg_sig_fs = 700
ppg_sig_fs = 64
label_sig_fs = 700

# Window length in samples for each stream.
ecg_target_frequency = ecg_sig_fs * sampling_time
ppg_target_frequency = ppg_sig_fs * sampling_time
label_target_frequency = label_sig_fs * sampling_time

# Despite the name these are hop sizes: the distance in samples between the
# starts of two consecutive windows (window length minus the overlapping part).
ecg_overlap_frequency = ecg_target_frequency - round(overlap_ratio * ecg_target_frequency)
ppg_overlap_frequency = ppg_target_frequency - round(overlap_ratio * ppg_target_frequency)
label_overlap_frequency = label_target_frequency - round(overlap_ratio * label_target_frequency)

for test_sub in tqdm(test_subs, total=len(test_subs)):
    data_path = '../00_Data/01_PPG2ECG/01_Original/04_WESAD/' + test_sub + '/' + test_sub + '.pkl'

    # load pickle
    data_dict = load_pickle(data_path)

    ecg_ori_sig = data_dict['signal']['chest']['ECG']
    ppg_ori_sig = data_dict['signal']['wrist']['BVP']
    label_ori_sig = data_dict['label']

    # windowing (the signals are 2-D, so column 0 is taken; labels are 1-D)
    ppg_seg_result = window(a=ppg_ori_sig[:,0], w=ppg_target_frequency, o=ppg_overlap_frequency)
    ecg_seg_result = window(a=ecg_ori_sig[:,0], w=ecg_target_frequency, o=ecg_overlap_frequency)
    label_seg_result = window(a=label_ori_sig, w=label_target_frequency, o=label_overlap_frequency)

    # The streams are windowed independently, so their window counts may
    # differ; only use indices that exist in all three.
    n_segments = min(len(ppg_seg_result), len(ecg_seg_result), len(label_seg_result))

    for i in range(n_segments):
        label_count = pd.Series(label_seg_result[i]).value_counts()
        label_index = np.array(label_count.index)
        label_count_num = label_count.values

        # Most frequent label id inside this window.
        major_label = label_index[np.argsort(label_count_num)[-1]]

        # Windows dominated by id 0 or ids >= 5 have no binary class and are
        # skipped.
        if major_label == 0 or major_label >= 5:
            continue
        label = binary_cls_map_dict[major_label]

        seg_dict = {
            'ECG': {
                'sig': ecg_seg_result[i],
                'sig_fs': ecg_sig_fs,
                'sig_time': sampling_time,
                'sig_len': len(ecg_seg_result[i]),
                'sig_info': 'Single',
                'units': None,
            },
            'PPG': {
                'sig': ppg_seg_result[i],
                'sig_fs': ppg_sig_fs,
                'sig_time': sampling_time,
                'sig_len': len(ppg_seg_result[i]),
                'units': None,
            },
            'label': label,
        }

        # File name: <subject>_<zero-padded window index>.npy
        save_filename = test_sub + '_' + str(i).zfill(3) + '.npy'
        save_path = save_prefix + save_filename
        np.save(save_path, seg_dict)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:10<00:00,  3.40s/it]


- label distribution

In [4]:
# Count how many saved validation segments carry each binary label.
npy_paths = glob.glob('../00_Data/04_Stress_Classification/02_val/*.npy')

# Every file stores one pickled dict, so unwrap it with .item() and read its
# 'label' entry.
npy_labels = [
    np.load(npy_path, allow_pickle=True).item()['label']
    for npy_path in npy_paths
]

pd.Series(npy_labels).value_counts()

0    85
1    26
Name: count, dtype: int64

## make partition

In [1]:
import glob
import numpy as np

- load train, val, test paths

In [2]:
# List the saved segment files of each split. glob gives no ordering
# guarantee, so the paths are sorted to keep the partition reproducible
# across machines and file systems.
trainset = sorted(glob.glob('../00_Data/04_Stress_Classification/01_train/*.npy'))
valset = sorted(glob.glob('../00_Data/04_Stress_Classification/02_val/*.npy'))
testset = sorted(glob.glob('../00_Data/04_Stress_Classification/03_test/*.npy'))

- partition

In [4]:
# Bundle the three path lists into one dict keyed by split name.
partition = {
    'trainset': trainset,
    'valset': valset,
    'testset': testset,
}

In [6]:
# Persist the partition dict. np.save pickles a dict into a 0-d object array,
# so read it back with np.load(..., allow_pickle=True).item().
np.save("./partition/partition_formatting.npy", partition)