This notebook creates a 'reduced' data set: the original DL data set is reduced by a factor of 10 by averaging random groups of 10 epochs. This was necessary to fit the data set into memory on an external cluster that was used for the hyperparameter search. The reduced data set was used only for the hyperparameter search and NOT for training the final models.

In [None]:
# Import packages
import sys, os, fnmatch, csv
import numpy as np
import h5py
import pandas as pd
import zarr

sys.path.insert(0, os.path.dirname(os.getcwd()))

In [35]:
from config import PATH_RAW_DATA, PATH_METADATA, PATH_DATA_PROCESSED_DL, PATH_DATA_PROCESSED_DL_REDUCED

## Load preprocessed data (non-reduced)

In [36]:
%%time

# Load all the metadata

from sklearn.model_selection import train_test_split

# Step 1: Get all the files in the output folder
file_names = os.listdir(PATH_DATA_PROCESSED_DL)

# Step 2: Get the full paths of the files (without extensions)
files = [
    os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL, file_name))[0]
    for file_name in fnmatch.filter(file_names, "*.zarr")
]

# Step 3: Load all the metadata
# Every "processed_raw_<name>.zarr" store is expected to have a sibling
# "processed_metadata_<name>.csv" file in the same folder.
frames = [
    pd.read_csv(feature_file.replace("processed_raw_", "processed_metadata_") + ".csv")
    for feature_file in files
]

# NOTE: the row index is not reset here, so index labels repeat across files.
df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group the subject is in
# (the age group is used as the age in months, times 30 for days, divided by 12 for years).
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# df_metadata[...] is chained assignment, which triggers a FutureWarning in
# pandas 2.x and no longer modifies the DataFrame under Copy-on-Write.
df_metadata['age_months'] = df_metadata['age_months'].fillna(df_metadata['age_group'])
df_metadata['age_days'] = df_metadata['age_days'].fillna(df_metadata['age_group'] * 30)
df_metadata['age_years'] = df_metadata['age_years'].fillna(df_metadata['age_group'] / 12)

# Step 5: List all the unique subject IDs
subject_ids = sorted(set(df_metadata["code"].tolist()))

CPU times: user 4.04 s, sys: 346 ms, total: 4.38 s
Wall time: 5.11 s


In [37]:
df_metadata

Unnamed: 0,code,cnt_path,cnt_file,age_group,age_days,age_months,age_years
0,23,/Volumes/Seagate Expansion Drive/ePodium/Data/...,023_35_mc_mmn36,35,1052.0,35.066667,2.922222
0,337,/Volumes/Seagate Expansion Drive/ePodium/Data/...,337_23_jc_mmn_36_wk,23,692.0,23.066667,1.922222
0,456,/Volumes/Seagate Expansion Drive/ePodium/Data/...,456_23_md_mmn36_wk,23,691.0,23.033333,1.919444
0,328,/Volumes/Seagate Expansion Drive/ePodium/Data/...,328_23_jc_mmn36_wk,23,699.0,23.300000,1.941667
0,314,/Volumes/Seagate Expansion Drive/ePodium/Data/...,314_29_mmn_36_wk,29,877.0,29.233333,2.436111
...,...,...,...,...,...,...,...
0,348,/Volumes/Seagate Expansion Drive/ePodium/Data/...,348_29_jc_mmn25_wk,29,858.0,28.600000,2.383333
0,9,/Volumes/Seagate Expansion Drive/ePodium/Data/...,009_23_jc_mmn58,23,692.0,23.066667,1.922222
0,751,/Volumes/Seagate Expansion Drive/ePodium/Data/...,751-452-29m-jr-mmn36,29,869.0,28.966667,2.413889
0,348,/Volumes/Seagate Expansion Drive/ePodium/Data/...,348_17_jc_mmn25_wk,17,512.0,17.066667,1.422222


In [38]:
def chunks(lst, n):
    """Lazily split a sliceable sequence into consecutive pieces of length n.

    The last piece is shorter when len(lst) is not a multiple of n.
    """
    start_positions = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in start_positions)
        
def average_epochs(epochs_per_average=10):
    """Write a reduced copy of the data set by averaging random groups of epochs.

    For every (subject, age group) combination in the module-level
    ``df_metadata``, the epochs of all matching
    ``processed_raw_<cnt_file>.zarr`` stores in ``PATH_DATA_PROCESSED_DL`` are
    pooled, shuffled along the first (epoch) axis and averaged in consecutive
    groups of ``epochs_per_average``. The last group may contain fewer epochs
    and is still averaged. The result is written to
    ``PATH_DATA_PROCESSED_DL_REDUCED`` as ``<ID>_<age_group>.zarr`` together
    with a one-row ``<ID>_<age_group>.csv`` that holds the metadata of the
    last pooled file, with ``cnt_file`` replaced by the new file name.

    Args:
        epochs_per_average: Number of epochs averaged into one output epoch.
            The default of 10 reproduces the original reduction factor.

    Raises:
        ValueError: If ``epochs_per_average`` is smaller than 1.
    """
    if epochs_per_average < 1:
        raise ValueError("epochs_per_average must be at least 1")

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(PATH_DATA_PROCESSED_DL_REDUCED, exist_ok=True)

    IDs = sorted(set(df_metadata["code"].tolist()))
    # Step 1: Iterate over subjects
    for ID in IDs:

        # Step 2: Find all files of a subject
        df_temp = df_metadata[df_metadata['code'] == ID]

        # Step 3: Find all the age groups the subject was found in
        ages_subject = sorted(set(df_temp['age_group'].tolist()))

        # Step 4: Loop over all the age groups the subject is in
        for age_group in ages_subject:
            df_age = df_temp[df_temp['age_group'] == age_group]

            # Step 5: Concatenate data of files in the same age group.
            # The arrays are collected first and concatenated once, which avoids
            # re-copying the growing buffer for every file. Casting to float64
            # matches the float64 buffer the data used to be accumulated in.
            recordings = []
            for cnt_file in df_age['cnt_file']:
                filename = os.path.join(PATH_DATA_PROCESSED_DL, 'processed_raw_' + cnt_file + '.zarr')
                data_signal = zarr.open(filename, mode='r')
                recordings.append(np.asarray(data_signal[:], dtype=np.float64))
            X_data = np.concatenate(recordings, axis=0)

            np.random.shuffle(X_data)  # Shuffle data for randomly picking epochs without replacement

            # Step 6: Average every group of epochs into a single epoch
            averaged = [np.mean(subset, axis=0) for subset in chunks(X_data, epochs_per_average)]
            if averaged:
                X_averaged_subsets = np.stack(averaged, axis=0)
            else:
                # No epochs at all: keep an empty array with the per-epoch shape of the data.
                X_averaged_subsets = np.zeros((0,) + X_data.shape[1:])

            file_name = f"{str(ID).zfill(3)}_{age_group}"
            zarr_name = file_name + ".zarr"
            csv_name = file_name + ".csv"

            # Step 7: Store the metadata of the last pooled file under the new name.
            # A copy is taken so the row in df_metadata itself is never modified.
            metadata_file = df_age.iloc[-1].copy()
            metadata_file['cnt_file'] = file_name
            path_metadata = os.path.join(PATH_DATA_PROCESSED_DL_REDUCED, csv_name)
            pd.DataFrame(metadata_file).transpose().to_csv(path_metadata, sep=',', index=False, header=True)

            # Step 8: Store the averaged epochs, one epoch per zarr chunk
            z_file = zarr.open(os.path.join(PATH_DATA_PROCESSED_DL_REDUCED, zarr_name),
                               mode='w',
                               shape=X_averaged_subsets.shape,
                               chunks=(1,) + X_averaged_subsets.shape[1:])
            z_file[:] = X_averaged_subsets
        
average_epochs()

## Load preprocessed data (reduced)

In [None]:
%%time

# Load all the metadata

from sklearn.model_selection import train_test_split

# Step 1: Get all the files in the output folder
file_names = os.listdir(PATH_DATA_PROCESSED_DL_REDUCED)

# Step 2: Get the full paths of the files (without extensions)
files = [
    os.path.splitext(os.path.join(PATH_DATA_PROCESSED_DL_REDUCED, file_name))[0]
    for file_name in fnmatch.filter(file_names, "*.zarr")
]

# Step 3: Load all the metadata
# In the reduced folder every "<name>.zarr" store has a "<name>.csv" next to it.
frames = [pd.read_csv(feature_file + ".csv") for feature_file in files]

# NOTE: the row index is not reset here, so index labels repeat across files.
df_metadata = pd.concat(frames)

# Step 4: Add missing age information based on the age group the subject is in
# (the age group is used as the age in months, times 30 for days, divided by 12 for years).
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# df_metadata[...] is chained assignment, which triggers a FutureWarning in
# pandas 2.x and no longer modifies the DataFrame under Copy-on-Write.
df_metadata['age_months'] = df_metadata['age_months'].fillna(df_metadata['age_group'])
df_metadata['age_days'] = df_metadata['age_days'].fillna(df_metadata['age_group'] * 30)
df_metadata['age_years'] = df_metadata['age_years'].fillna(df_metadata['age_group'] / 12)

# Step 5: List all the unique subject IDs
subject_ids = sorted(set(df_metadata["code"].tolist()))

In [32]:
df_metadata

Unnamed: 0,code,cnt_path,cnt_file,age_group,age_days,age_months,age_years
0,1,/Volumes/Seagate Expansion Drive/ePodium/Data/...,001_29,29,842.0,28.066667,2.338889
0,1,/Volumes/Seagate Expansion Drive/ePodium/Data/...,001_23,23,691.0,23.033333,1.919444
0,1,/Volumes/Seagate Expansion Drive/ePodium/Data/...,001_35,35,1047.0,34.9,2.908333
0,1,/Volumes/Seagate Expansion Drive/ePodium/Data/...,001_17,17,510.0,17.0,1.416667
0,1,/Volumes/Seagate Expansion Drive/ePodium/Data/...,001_41,41,1230.0,41.0,3.416667
0,1,/Volumes/Seagate Expansion Drive/ePodium/Data/...,001_11,11,329.0,10.966667,0.913889
0,1,/Volumes/Seagate Expansion Drive/ePodium/Data/...,001_47,47,1403.0,46.766667,3.897222
