# Non-EEG Biosignals Dataset

In [1]:
# Import the necessary libraries

from tensorflow import keras
import numpy as np
import pandas as pd
import sklearn
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib
from matplotlib.pyplot import plot as plt
from collections import Counter
import tensorflow as tf
import os
import glob

## Dataset chart for single modality: Heartrate

 The dataset consists of data collected from 20 different college (14 males, 6 females) subjects. <br /> It has data from 5 different modalities: Electrodermal Activity, Temperature, 3D-Accelerometer, Heart rate, and Arterial oxygen saturation. <br /> For the preliminary experiments, we begin by considering only one modality: Heart rate

In [4]:
# Creation of dataset chart with durations for different classes and subjects for Heartrate

df_hr_spo2 = pd.read_csv('HSBD_DS/Subjects_all_HR_SpO2.csv')

labels = list(np.unique(df_hr_spo2['Label']))
subjects = list(np.unique(df_hr_spo2['Subject']))

df_mod = df_hr_spo2.drop_duplicates(['Subject','Label']).sort_values(by=['Subject','Label']).reset_index(drop=True)
df_mod = df_mod.drop(columns=['Hour','Minute','Second','HeartRate','SpO2'])

df_mod['Frame count'] = list(df_hr_spo2.groupby(['Subject', 'Label']).size())

label_dict = {}
for i in labels:    
    label_dict[i] = list(df_mod[df_mod['Label']==i]['Frame count'])
    
df_chart = pd.DataFrame(label_dict)
df_chart['Subject'] = subjects
df_chart = df_chart.iloc[:, [7,3,2,4,0,5,1,6]] # Re-arranging classes to show in the order during data collection
df_chart = df_chart.style.set_caption('Dataset chart for 20 subjects with the duration of each class in seconds')
df_chart

Unnamed: 0,Subject,Relax_1,PhysicalStress,Relax_2,CognitiveStress,Relax_3,EmotionalStress,Relax_4
0,Subject_1,301,329,301,365,301,402,300
1,Subject_10,301,327,301,356,301,400,300
2,Subject_11,301,328,301,356,301,652,299
3,Subject_12,299,327,302,355,301,401,300
4,Subject_13,301,328,301,355,301,401,299
5,Subject_14,301,327,301,355,301,400,300
6,Subject_15,301,325,301,355,301,397,300
7,Subject_16,301,327,301,355,301,401,299
8,Subject_17,304,327,301,357,300,401,299
9,Subject_18,301,329,301,355,301,401,300


## Multimodal analysis

Sensors for Heartrate and SpO2 are sampled at a frequency of 1Hz found in 'HSBD_DS/Subjects_all_HR_SpO2.csv'  <br /> 
whereas the remaining sensors are sampled at 8Hz found in 'HSBD_DS/Subjects_all_AccTempEDA.csv'

In [5]:
# Loading the datasets and defining lists of labels, subjects, and different modalities according to availability from respective files

df_hr_spo2 = pd.read_csv('HSBD_DS/Subjects_all_HR_SpO2.csv')
df_acc_temp_eda = pd.read_csv('HSBD_DS/Subjects_all_AccTempEDA.csv')
df_acc_temp_eda = df_acc_temp_eda.dropna().reset_index(drop=True)
df_hr_spo2.sort_values(by='Subject')
label = ['Relax_1','Relax_2', 'Relax_3', 'Relax_4', 'PhysicalStress', 'CognitiveStress', 'EmotionalStress'] #
subjects = list(np.unique(df_hr_spo2['Subject']))
modals = ['HeartRate', 'SpO2', 'Temp', 'EDA', 'AccX', 'AccY', 'AccZ']
modals_1 = ['HeartRate', 'SpO2']
modals_2 = ['Temp', 'EDA', 'AccX', 'AccY', 'AccZ']

## Pre-processing data for one modality: Heartrate

Since the raw data has an unequal duration, for each subject and class, the duration is up/downsampled to 360s (6min). Further, to have segments of appropriate fixed durations, the data is chunked to 60s (1min) long segments. 

In [8]:
# Up/down sample raw data to 360s and chunk to fixed segments of 60s each (One modality)

def dataframe_prep_onemod(df, frame_rate=60, max_size=360):

    frame_rate = 60 #  Length of each segment
    max_size = 360 # Duration to which the overall duration has to be up/downsampled
    
    # Declare empty lists and dataframes to store processed data
    
    df_list = {} 
    arr_list = {}
    arr_interp = {}
    arr_reshape = {}
    df_final = pd.DataFrame()
    
    # Loop through all the subjects (20)
    for j in range(len(subjects)):
        
        #Loop though all the labels (7)
        for i in range(len(label)):
            
            var = 'df_'+ str(label[i]) +'_'+ str(subjects[j]) # Select one class per subject
            df_list[var] = df[(df['Subject'] == subjects[j]) & (df['Label'] == label[i])].reset_index(drop=True) 
                                                        # From the dataframe, select the data belonging to one class per subject
            
            arr_list[var] = np.array(df_list[var]['HeartRate']) # Obtain NumPy arrays for the selected data        
            arr_interp[var] = interp1d(np.arange(arr_list[var].size), arr_list[var]) # Set up an interpolation function
                                                                            # to up/sample to the defined duration            
            arr_reshape[var] = arr_interp[var](np.linspace(0, arr_list[var].size-1, max_size)) 
            
            arr_reshape[var] =  np.reshape(arr_reshape[var], (int(len(arr_reshape[var])/frame_rate), frame_rate)) # Divide sampled data into segments
            df_list[var] = pd.DataFrame(arr_reshape[var]) # A new dataframe to store the processed data
            
            # Adding a label to the data segments
            
            if label[i] == 'Relax_' + str(i+1):
                df_list[var]['Label'] = 0 # Setting all the 4 different Relax classes with '0' label
                df_list[var]['Label_ori'] = label[i]
            else:
                df_list[var]['Label'] = i-3 # Setting PhysicalStress:1, CognitiveStress:2, and EmotionalStress:3
                df_list[var]['Label_ori'] = label[i]
                
            df_final = pd.concat([df_final, df_list[var]]) # Concatentate over all classes and subjects
        
    return df_final

In [9]:
# Dataframe storing data for one modality: Heartrate

df_hr = dataframe_prep_onemod(df_hr_spo2, frame_rate=60, max_size=360)
df_hr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,Label,Label_ori
0,89.000000,88.164345,87.328691,87.000000,87.000000,87.000000,86.986072,86.150418,85.314763,85.000000,...,75.545961,74.710306,74.000000,74.000000,73.203343,73.000000,73.000000,72.696379,0,Relax_1
1,71.860724,71.025070,71.000000,70.353760,70.000000,70.000000,70.000000,70.000000,70.824513,71.000000,...,68.406685,68.000000,68.000000,68.000000,68.000000,68.000000,68.607242,69.000000,0,Relax_1
2,69.000000,69.114206,69.949861,70.785515,70.378830,70.000000,70.000000,70.000000,70.000000,70.000000,...,70.732591,71.000000,71.000000,71.000000,70.924791,70.089136,70.000000,70.000000,0,Relax_1
3,70.417827,71.000000,71.000000,71.000000,71.760446,72.000000,71.568245,71.000000,70.896936,70.061281,...,76.743733,77.000000,77.543175,78.000000,78.000000,77.949861,77.114206,77.000000,0,Relax_1
4,76.442897,76.000000,75.771588,75.000000,75.000000,74.264624,74.000000,74.000000,73.757660,73.000000,...,79.011142,79.846797,78.635097,76.963788,76.000000,75.810585,75.000000,75.000000,0,Relax_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,77.000000,77.000000,77.000000,77.584958,80.618384,82.000000,84.150418,86.651811,87.000000,89.640669,...,75.208914,75.000000,75.000000,75.000000,75.000000,75.000000,75.000000,74.409471,3,EmotionalStress
2,74.000000,74.000000,73.066852,74.000000,74.000000,74.000000,74.000000,74.000000,74.618384,75.000000,...,76.000000,76.000000,75.128134,75.000000,74.000000,74.000000,74.328691,75.000000,3,EmotionalStress
3,75.000000,75.000000,73.428969,73.000000,72.000000,72.000000,72.000000,72.000000,72.000000,73.169916,...,75.000000,75.610028,76.724234,77.000000,77.000000,78.000000,78.362117,80.000000,3,EmotionalStress
4,80.000000,80.000000,79.362117,79.000000,78.133705,78.000000,77.000000,77.000000,77.000000,77.000000,...,75.000000,75.000000,75.000000,75.000000,75.805014,76.000000,76.033426,76.852368,3,EmotionalStress


## Pre-processing data for all modalities

In [10]:
# Up/down sample raw data to 360s and chunk to fixed segments of 60s each (All modalities)

def dataframe_prep_allmod(frame_rate=60, max_size=360):

    df_list = {}
    arr_list = {}
    arr_interp = {}
    arr_reshape = {}
    df_mod_one = pd.DataFrame()
    df_mod_all = pd.DataFrame()
    df_fin = pd.DataFrame()
    data_modals = {}
    labels_modals = {}
    
    frame_rate = frame_rate
    max_size = max_size
    
    # Choose the dataset for a specific modality
    for k in range(len(modals)):
        
        # If the selected modality is Heartrate or SpO2. select first dataset
        if modals[k] in modals_1:
            df = df_hr_spo2
        
        # If the selecetd modality is any other, select second dataset
        elif modals[k] in modals_2:
            df = df_acc_temp_eda  
        
        # Loop through all the subjects (20)
        for j in range(len(subjects)):    
            
            #Loop though all the labels (7)
            for i in range(len(label)):      
                
                var = 'df_'+ str(label[i]) +'_'+ str(subjects[j]) # Select one class per subject
                df_list[var] = df[(df['Subject'] == subjects[j]) & (df['Label'] == label[i])].reset_index(drop=True)
                                                                # From the dataframe, select the data belonging to one class per subject
                arr_list[var] = np.array(df_list[var][modals[k]]) # Obtain NumPy arrays for the selected data     
                arr_interp[var] = interp1d(np.arange(arr_list[var].size), arr_list[var]) # Set up an interpolation function
                                                                     # to up/sample to the defined duration 
                arr_reshape[var] = arr_interp[var](np.linspace(0, arr_list[var].size-1, max_size))
                arr_reshape[var] =  np.reshape(arr_reshape[var], (int(len(arr_reshape[var])/frame_rate), frame_rate)) # Divide sampled data into segments        
        
                df_mod_one = pd.DataFrame(arr_reshape[var]) # A new dataframe to store the processed data         
                    
                # Adding a label to the data segments
                    
                if label[i] == 'Relax_' + str(i+1):
                    df_mod_one['Label'] = 0
                    df_mod_one['Label_ori'] = label[i]
                    
                else:                    
                    df_mod_one['Label'] = i-3
                    df_mod_one['Label_ori'] = label[i]
                 
                # Adding subject and modality information for completeness
                
                df_mod_one['Subject'] = subjects[j]    
                df_mod_one['Modality'] = modals[k]
                df_mod_all = pd.concat([df_mod_all, df_mod_one])
    
    # Rearranging and stacking to create the final dataframe
    
    data = df_mod_all[df_mod_all['Modality'] == modals[0]].iloc[:,:-1].values
    
    for i in range(1, len(modals)):
        data = np.dstack((data, (df_mod_all[df_mod_all['Modality'] == modals[i]].iloc[:,:-1].values)))     
    
    df_fin['Data'] = list(data[:,:-1])
    df_fin['Subject'] = list(data[:,-1,0])
    df_fin['Label'] = list(data[:,-3,0])
    df_fin['Label_ori'] = list(data[:,-2,0])
        
    return df_fin

### Balancing the sampling frequency of different sensors to combine different modalities together

Heartrate and SpO2 sensor data are sampled at 1Hz and EDA, Temp, and Acc sensors are sampled at 8Hz. </br>
In order to combine all the sensors together, Heartrate and SpO2 are upsampled to 8Hz to maintain uniformity

In [11]:
# Dataframe storing data for all modalities

df_allmod = dataframe_prep_allmod(frame_rate=60, max_size=360*8) # Here multiplying by 8 indicates the upsampling

In [12]:
df_allmod

Unnamed: 0,Data,Subject,Label,Label_ori
0,"[[89.0, 97.0, 30.1, 0.083, 0.76, -0.87, -0.1],...",Subject_1,0,Relax_1
1,"[[85.74782910732894, 96.0, 30.1, 0.083, 0.74, ...",Subject_1,0,Relax_1
2,"[[81.0, 96.0, 30.4, 0.083, 0.7, -0.90034734282...",Subject_1,0,Relax_1
3,"[[83.0, 97.0, 30.4, 0.083, 0.73, -0.89, -0.09]...",Subject_1,0,Relax_1
4,"[[81.99131642931573, 97.0, 30.4, 0.083, 0.7106...",Subject_1,0,Relax_1
...,...,...,...,...
6715,"[[73.0, 96.0, 33.9, 1.1124425147620716, -0.06,...",Subject_9,3,EmotionalStress
6716,"[[74.0, 96.0, 33.9, 1.0912000694685653, -0.06,...",Subject_9,3,EmotionalStress
6717,"[[78.0, 96.0, 33.9, 1.0635682528655783, 0.0052...",Subject_9,3,EmotionalStress
6718,"[[76.0, 96.0, 33.9, 1.0270635637374084, 0.0002...",Subject_9,3,EmotionalStress


## Splitting the dataframe into training and test subsets

In [13]:
# Leaving one subject out for all modalities

def train_test_loso(df, subject=0): # Input the dataframe and the subject to be selected as the test subject    
    df_one_train =  pd.DataFrame()
    df_one_test = pd.DataFrame()
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    
    # Loop through all the labels and select the data belonging to the selected subject as the test dataset
    for j in range(len(label)):
        data_train = df[(df['Subject']!=subjects[subject])]['Data'].values # Select train subjects
        data_train = np.array([np.array(xi) for xi in data_train]) # Obtain NumPy arrays for the selected data             
            
        data_test = df[(df['Subject']==subjects[subject])]['Data'].values # Select test subject  
        data_test = np.array([np.array(xi) for xi in data_test]) # Obtain NumPy arrays for the selected data         
        
        # Convert arrays back to a dataframe
        df_one_train['Data'] = list(data_train[:,:-2,:])
        df_one_train['Label'] = data_train[:,-2,0] 
        df_one_train['Label_ori'] = data_train[:,-1,0] 
            
        df_one_test['Data'] = list(data_test[:,:-2,:])
        df_one_test['Label'] = data_test[:,-2,0] 
        df_one_test['Label_ori'] = data_test[:,-1,0] 
            
        df_train = pd.concat([df_train, df_one_train]).reset_index(drop=True)
        df_test =  pd.concat([df_test, df_one_test]).reset_index(drop=True)
    
    return df_train, df_test

In [14]:
# Leaving one segment out for all modalities

def train_test_losego(df, seg_indice=0, num_seg=1):  # Input the dataframe and the segment to be left for the test dataset
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    df_one_train =  pd.DataFrame()
    df_one_test = pd.DataFrame()
    
    # Loop through all the subjects (20)
    for i in range(len(subjects)):
        # Loop through all labels(7)
        for j in range(len(label)):
            data_all = df[(df['Subject']==subjects[i]) & (df['Label_ori']==label[j])]['Data'].values           
            
            data_all = np.array([np.array(xi) for xi in data_all])
            indice =  seg_indice  #np.random.choice(data_all.shape[0], size=1, replace=False)           
            
            data_test = data_all[indice*8:(indice*8)+num_seg]
            data_train = np.delete(data_all, indice, axis=0)
            
            df_one_train['Data'] = list(data_train[:,:-2,:])
            df_one_train['Label'] = data_train[:,-2,0] 
            df_one_train['Label_ori'] = data_train[:,-1,0] 
            df_one_test['Data'] = list(data_test[:,:-2,:])
            df_one_test['Label'] = data_test[:,-2,0] 
            df_one_test['Label_ori'] = data_test[:,-1,0] 
            
            df_train = pd.concat([df_train, df_one_train]).reset_index(drop=True)
            df_test =  pd.concat([df_test, df_one_test]).reset_index(drop=True)
        
    return df_train, df_test

In [15]:
# Train and test data for leaving one subject out

df_loso_train, df_loso_test = train_test_loso(df_allmod, subject=0)
df_loso_test

Unnamed: 0,Data,Label,Label_ori
0,"[[89.0, 97.0, 30.1, 0.083, 0.76, -0.87, -0.1],...",0,Relax_1
1,"[[85.74782910732894, 96.0, 30.1, 0.083, 0.74, ...",0,Relax_1
2,"[[81.0, 96.0, 30.4, 0.083, 0.7, -0.90034734282...",0,Relax_1
3,"[[83.0, 97.0, 30.4, 0.083, 0.73, -0.89, -0.09]...",0,Relax_1
4,"[[81.99131642931573, 97.0, 30.4, 0.083, 0.7106...",0,Relax_1
...,...,...,...
2347,"[[75.0, 96.0, 33.5, 0.245, 0.8911705453282411,...",3,EmotionalStress
2348,"[[76.0, 95.28898923237233, 33.5, 0.243, 0.8926...",3,EmotionalStress
2349,"[[75.93192080583538, 95.0, 33.7, 0.24, 0.88979...",3,EmotionalStress
2350,"[[74.42514762070164, 95.0, 33.5, 0.238, 0.9, -...",3,EmotionalStress


In [16]:
# Train and test data for leaving first segment out

df_losego_train, df_losego_test = train_test_losego(df_allmod, seg_indice=0, num_seg=1)
df_losego_test

Unnamed: 0,Data,Label,Label_ori
0,"[[89.0, 97.0, 30.1, 0.083, 0.76, -0.87, -0.1],...",0,Relax_1
1,"[[125.0, 95.0, 32.4, 1.887, 0.52, 0.47, 0.56],...",0,Relax_2
2,"[[85.0, 96.0, 32.9, 1.723, -0.14, -0.65, 0.45]...",0,Relax_3
3,"[[76.0, 95.0, 33.7, 0.245, 0.6, -0.95, -0.04],...",0,Relax_4
4,"[[76.0, 96.0, 32.2, 0.092, 0.35, -0.23, -1.01]...",1,PhysicalStress
...,...,...,...
135,"[[95.0, 96.0, 33.7, 3.234, 1.16, -0.3, 0.53], ...",0,Relax_3
136,"[[86.0, 98.0, 33.9, 1.164, 1.02, -0.55, -0.18]...",0,Relax_4
137,"[[90.0, 96.0, 30.8, 0.408, 0.19, -0.22, -1.02]...",1,PhysicalStress
138,"[[94.0, 98.0, 32.0, 1.61, -0.17, -0.96, -0.25]...",2,CognitiveStress
