- 각 파일당 환자 1명의 레코드로 취급
- train:val:test = 6:2:2로 split 수행
- train, val, test split 수행 시 각 환자 단위로 split되야하고 각 환자는 여러 label을 가지고 있으므로 label 분포 비교를 여러번 수행하여 train, val, test의 label 비율이 동일하도록 구현

## 01. Train, Test Split

In [1]:
import numpy as np
import glob
import scipy.io as io
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split

- 레이블 분포 비교를 위한 KLDivergence 함수 구현

In [2]:
def kl_divergence(p, q):
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

- load data paths

In [5]:
data_paths = glob.glob('../00_Data/02_Arrhythmia_Classification/01_Original/*.mat')

all_labels = []
for path in data_paths:
    data = io.loadmat(path)
    all_labels.extend(data['labels'].flatten())

- 부정맥 레이블 분포 산출

In [10]:
all_dist = (pd.Series(all_labels).value_counts().sort_index() / pd.Series(all_labels).value_counts().sum()).values
all_dist

array([0.31187136, 0.09449676, 0.08057317, 0.04653298, 0.12123348,
       0.34529225])

- seed를 변경해가며 테스트해서 Train, Test 세트의 분포를 전체 부정맥 레이블과 동일하게 근사

In [11]:
seed_list = []
kl_dist = []

for i in tqdm(range(100)):
    random_seed = np.random.randint(0, 10000000)
    trainset, testset,_ , _ = train_test_split(data_paths, data_paths, test_size=0.2, random_state=random_seed)

    train_class = []
    test_class = []

    for path in trainset:
        data = io.loadmat(path)
        train_class.extend(data['labels'].flatten())

    for path in testset:
        data = io.loadmat(path)
        test_class.extend(data['labels'].flatten())
        
    train_dist = (pd.Series(train_class).value_counts().sort_index() / pd.Series(train_class).value_counts().sum()).values
    test_dist = (pd.Series(test_class).value_counts().sort_index() / pd.Series(test_class).value_counts().sum()).values
    
    kld = kl_divergence(train_dist, test_dist)
    seed_list.append(random_seed)
    kl_dist.append(kld)


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:45<00:00,  2.26s/it]


- 가장 유사한 train, test 분포 확인

In [14]:
random_seed = seed_list[np.argsort(kl_dist)[0]]
trainset, testset,_ , _ = train_test_split(data_paths, data_paths, test_size=0.2, random_state=random_seed)

train_class = []
test_class = []

for path in trainset:
    data = io.loadmat(path)
    train_class.extend(data['labels'].flatten())

for path in testset:
    data = io.loadmat(path)
    test_class.extend(data['labels'].flatten())

train_dist = (pd.Series(train_class).value_counts().sort_index() / pd.Series(train_class).value_counts().sum()).values
test_dist = (pd.Series(test_class).value_counts().sort_index() / pd.Series(test_class).value_counts().sum()).values

In [15]:
train_dist

array([0.31749546, 0.09465372, 0.07805997, 0.04500587, 0.12247892,
       0.34230605])

In [16]:
test_dist

array([0.2893075 , 0.09386707, 0.09065611, 0.05265975, 0.11623675,
       0.35727282])

- 저장

In [19]:
partition_traintest = {}
partition_traintest['trainset'] = trainset
partition_traintest['testset'] = testset

In [20]:
len(partition_traintest['trainset']),len(partition_traintest['testset'])

(72, 19)

In [21]:
np.save('./partition/01_partition_traintest.npy',  partition_traintest)

## 02. Train, Val Split

In [1]:
import numpy as np
import glob
import scipy.io as io
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split

- KLD

In [2]:
def kl_divergence(p, q):
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

- load partition_traintest

In [12]:
partition = np.load('./partition/01_partition_traintest.npy', allow_pickle=True).item()

origin_trainset = partition['trainset']

all_labels = []
for path in origin_trainset:
    data = io.loadmat(path)
    all_labels.extend(data['labels'].flatten())

In [4]:
all_dist = (pd.Series(all_labels).value_counts().sort_index() / pd.Series(all_labels).value_counts().sum()).values

In [5]:
all_dist

array([0.31749546, 0.09465372, 0.07805997, 0.04500587, 0.12247892,
       0.34230605])

- search best seed

In [6]:
seed_list = []
kl_dist = []

for i in tqdm(range(100)):
    random_seed = np.random.randint(0, 10000000)
    trainset, valset,_ , _ = train_test_split(origin_trainset, origin_trainset, test_size=0.2, random_state=random_seed)

    train_class = []
    val_class = []

    for path in trainset:
        data = io.loadmat(path)
        train_class.extend(data['labels'].flatten())

    for path in valset:
        data = io.loadmat(path)
        val_class.extend(data['labels'].flatten())
        
    train_dist = (pd.Series(train_class).value_counts().sort_index() / pd.Series(train_class).value_counts().sum()).values
    val_dist = (pd.Series(val_class).value_counts().sort_index() / pd.Series(val_class).value_counts().sum()).values
    
    kld = kl_divergence(train_dist, val_dist)
    seed_list.append(random_seed)
    kl_dist.append(kld)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:58<00:00,  1.79s/it]


- 가장 유사한 train, val 분포 확인

In [7]:
random_seed = seed_list[np.argsort(kl_dist)[0]]
trainset, valset,_ , _ = train_test_split(origin_trainset, origin_trainset, test_size=0.2, random_state=random_seed)

train_class = []
val_class = []

for path in trainset:
    data = io.loadmat(path)
    train_class.extend(data['labels'].flatten())

for path in valset:
    data = io.loadmat(path)
    val_class.extend(data['labels'].flatten())

train_dist = (pd.Series(train_class).value_counts().sort_index() / pd.Series(train_class).value_counts().sum()).values
val_dist = (pd.Series(val_class).value_counts().sort_index() / pd.Series(val_class).value_counts().sum()).values

kld = kl_divergence(train_dist, val_dist)
seed_list.append(random_seed)
kl_dist.append(kld)

In [8]:
train_dist

array([0.32380383, 0.09406647, 0.07444301, 0.04834479, 0.12564332,
       0.33369858])

In [9]:
val_dist

array([0.29170626, 0.09705443, 0.09284648, 0.03135605, 0.10954255,
       0.37749423])

In [10]:
all_dist

array([0.31749546, 0.09465372, 0.07805997, 0.04500587, 0.12247892,
       0.34230605])

- make train, val, test partition

In [13]:
partition_trainval_test = {}
partition_trainval_test['trainset'] = trainset
partition_trainval_test['valset'] = valset
partition_trainval_test['testset'] = partition['testset']

In [15]:
len(partition_trainval_test['trainset']), len(partition_trainval_test['valset']), len(partition_trainval_test['testset'])

(57, 15, 19)

- save

In [18]:
np.save('./partition/02_partition_trainval_test.npy', partition_trainval_test)

## 3. Convert npy file
- 01, 02에서 작업한 train, val, test 파일을 npy 파일로 변환

In [1]:
import numpy as np
import scipy.io as io
import glob
from tqdm import tqdm

- 필요 함수

In [2]:
def make_data_dict(PPG_sig, PPG_fs, PPG_units, label):
    # make ecg dict
    ecg_dict = None
    
    # make ppg dict
    ppg_dict = {}
    ppg_dict['sig'] = PPG_sig
    ppg_dict['sig_fs'] = PPG_fs
    ppg_dict['sig_time'] = len(PPG_sig) // PPG_fs # seconds
    ppg_dict['sig_len'] = len(PPG_sig)
    ppg_dict['units'] = PPG_units
    ppg_dict['label'] = label
    
    # make personal info
    person_info = None
    
    # make final data_dict
    data_dict = {}
    data_dict['ECG'] = ecg_dict
    data_dict['PPG'] = ppg_dict
    data_dict['Personal_Info'] = person_info
    
    return data_dict

- load partition

In [3]:
partition = np.load('./partition/02_partition_trainval_test.npy', allow_pickle=True).item()

trainset = partition['trainset']
valset = partition['valset']
testset = partition['testset']

- trainset 처리

In [4]:
save_prefix = '../00_Data/02_Arrhythmia_Classification/02_Formatting/01_train/'

for path in tqdm(trainset, total=len(trainset)):
    original_data = io.loadmat(path)
    
    original_ppg_signal = original_data['ppgseg']
    label = original_data['labels']
    
    ppg_sig_fs = 100
    ppg_units = None
    
    for i in range(len(original_ppg_signal)):
        seg_ppg = np.transpose(original_ppg_signal[i:i+1], (1, 0))[:,0]
        seg_dict = make_data_dict(seg_ppg, ppg_sig_fs, ppg_units, label[i][0])
        
        save_filename = path.split('\\')[-1].split('.')[0] + '_' + str(i+1).zfill(4) + '.npy'
        save_path = save_prefix + save_filename
        
        np.save(save_path, seg_dict)

100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [00:30<00:00,  1.86it/s]


In [5]:
len(glob.glob('../00_Data/02_Arrhythmia_Classification/02_Formatting/01_train/*.npy'))

30117

- valset 처리

In [6]:
save_prefix = '../00_Data/02_Arrhythmia_Classification/02_Formatting/02_val/'

for path in tqdm(valset, total=len(valset)):
    original_data = io.loadmat(path)
    
    original_ppg_signal = original_data['ppgseg']
    label = original_data['labels']
    
    ppg_sig_fs = 100
    ppg_units = None
    
    for i in range(len(original_ppg_signal)):
        seg_ppg = np.transpose(original_ppg_signal[i:i+1], (1, 0))[:,0]
        seg_dict = make_data_dict(seg_ppg, ppg_sig_fs, ppg_units, label[i][0])
        
        save_filename = path.split('\\')[-1].split('.')[0] + '_' + str(i+1).zfill(4) + '.npy'
        save_path = save_prefix + save_filename
        
        np.save(save_path, seg_dict)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:07<00:00,  1.95it/s]


In [7]:
len(glob.glob('../00_Data/02_Arrhythmia_Classification/02_Formatting/02_val/*.npy'))

7367

- testset 처리

In [8]:
save_prefix = '../00_Data/02_Arrhythmia_Classification/02_Formatting/03_test/'

for path in tqdm(testset, total=len(testset)):
    original_data = io.loadmat(path)
    
    original_ppg_signal = original_data['ppgseg']
    label = original_data['labels']
    
    ppg_sig_fs = 100
    ppg_units = None
    
    for i in range(len(original_ppg_signal)):
        seg_ppg = np.transpose(original_ppg_signal[i:i+1], (1, 0))[:,0]
        seg_dict = make_data_dict(seg_ppg, ppg_sig_fs, ppg_units, label[i][0])
        
        save_filename = path.split('\\')[-1].split('.')[0] + '_' + str(i+1).zfill(4) + '.npy'
        save_path = save_prefix + save_filename
        
        np.save(save_path, seg_dict)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:09<00:00,  1.96it/s]


In [9]:
len(glob.glob('../00_Data/02_Arrhythmia_Classification/02_Formatting/03_test/*.npy'))

9343

- partition_formatting 작성

In [10]:
import glob
import numpy as np

In [11]:
trainset = glob.glob('../00_Data/02_Arrhythmia_Classification/02_Formatting/01_train/*.npy')
valset = glob.glob('../00_Data/02_Arrhythmia_Classification/02_Formatting/02_val/*.npy')
testset = glob.glob('../00_Data/02_Arrhythmia_Classification/02_Formatting/03_test//*.npy')

In [12]:
len(trainset), len(valset), len(testset)

(30117, 7367, 9343)

In [13]:
partition_formatting = {}
partition_formatting['trainset'] = trainset
partition_formatting['valset'] = valset
partition_formatting['testset'] = testset

In [14]:
np.save('./partition/03_partition_formatting.npy', partition_formatting)