# Non-EEG Biosignals Dataset Analysis

In [7]:
import os
os.chdir('../')
%pwd

'/home/dheeraj/Projects/Explainable-AI-Non-EEG'

In [4]:
# Import the necessary libraries

from tensorflow import keras
import numpy as np
import pandas as pd
import sklearn
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib
from matplotlib.pyplot import plot as plt
from collections import Counter
import tensorflow as tf
import os
import glob

## Dataset chart for single modality: Heartrate

 The dataset consists of data collected from 20 different college (14 males, 6 females) subjects. <br /> It has data from 5 different modalities: Electrodermal Activity, Temperature, 3D-Accelerometer, Heart rate, and Arterial oxygen saturation. <br /> For the preliminary experiments, we begin by considering only one modality: Heart rate

In [13]:
# Creation of dataset chart with durations for different classes and subjects for Heartrate

dataset_df = pd.read_csv('data/dataset/Subjects_all_HR_SpO2.csv') # Importing the dataset
labels = list(np.unique(dataset_df['Label'])) # Creating a list of labels
subjects = list(np.unique(dataset_df['Subject'])) # Creating a list of subjects

df_mod = dataset_df.drop_duplicates(['Subject','Label']).sort_values(by=['Subject','Label']).reset_index(drop=True) # Removing duplicates
df_mod = df_mod.drop(columns=['Hour','Minute','Second','HeartRate','SpO2']) # Dropping unnecessary columns

df_mod['Frame count'] = list(dataset_df.groupby(['Subject', 'Label']).size()) # Calculating the duration of each class

label_dict = {}
for i in labels:    
    label_dict[i] = list(df_mod[df_mod['Label']==i]['Frame count'])
    
df_chart = pd.DataFrame(label_dict)
df_chart['Subject'] = subjects
df_chart = df_chart.iloc[:, [7,3,2,4,0,5,1,6]] # Re-arranging classes to show in the order during data collection
df_chart = df_chart.style.set_caption('Dataset chart for 20 subjects with the duration of each class in seconds')
df_chart


Unnamed: 0,Subject,Relax_1,PhysicalStress,Relax_2,CognitiveStress,Relax_3,EmotionalStress,Relax_4
0,Subject_1,301,329,301,365,301,402,300
1,Subject_10,301,327,301,356,301,400,300
2,Subject_11,301,328,301,356,301,652,299
3,Subject_12,299,327,302,355,301,401,300
4,Subject_13,301,328,301,355,301,401,299
5,Subject_14,301,327,301,355,301,400,300
6,Subject_15,301,325,301,355,301,397,300
7,Subject_16,301,327,301,355,301,401,299
8,Subject_17,304,327,301,357,300,401,299
9,Subject_18,301,329,301,355,301,401,300


## Multimodal analysis

Sensors for Heartrate and SpO2 are sampled at a frequency of 1Hz found in 'data/dataset/Subjects_all_HR_SpO2.csv'  <br /> 
whereas the remaining sensors are sampled at 8Hz found in 'data/dataset/Subjects_all_AccTempEDA.csv'

In [15]:
# Loading the datasets and defining lists of labels, subjects, and different modalities according to availability from respective files

df_hr_spo2 = pd.read_csv('data/dataset//Subjects_all_HR_SpO2.csv') # HeartRate, SpO2
df_acc_temp_eda = pd.read_csv('data/dataset/Subjects_all_AccTempEDA.csv') # AccX, AccY, AccZ, Temp, EDA
df_acc_temp_eda = df_acc_temp_eda.dropna().reset_index(drop=True) # Removing NaN
df_hr_spo2.sort_values(by='Subject') 
label = ['Relax_1','Relax_2', 'Relax_3', 'Relax_4', 'PhysicalStress', 'CognitiveStress', 'EmotionalStress'] 
subjects = list(np.unique(df_hr_spo2['Subject']))
modals = ['HeartRate', 'SpO2', 'Temp', 'EDA', 'AccX', 'AccY', 'AccZ']
modals_1 = ['HeartRate', 'SpO2']
modals_2 = ['Temp', 'EDA', 'AccX', 'AccY', 'AccZ']

## Pre-processing data for one modality: Heartrate

Since the raw data has an unequal duration, for each subject and class, the duration is up/downsampled to 360s (6min). Further, to have segments of appropriate fixed durations, the data is chunked to 60s (1min) long segments. 

In [None]:
# Up/down sample raw data to 360s and chunk to fixed segments of 60s each (One modality)

def dataframe_prep_onemod(df, frame_rate=60, max_size=360):

    frame_rate = 60 #  Length of each segment
    max_size = 360 # Duration to which the overall duration has to be up/downsampled
    
    # Declare empty lists and dataframes to store processed data
    
    df_list = {} 
    arr_list = {}
    arr_interp = {}
    arr_reshape = {}
    df_final = pd.DataFrame()
    
    # Loop through all the subjects (20)
    for j in range(len(subjects)):
        
        #Loop though all the labels (7)
        for i in range(len(label)):
            
            var = 'df_'+ str(label[i]) +'_'+ str(subjects[j]) # Select one class per subject
            df_list[var] = df[(df['Subject'] == subjects[j]) & (df['Label'] == label[i])].reset_index(drop=True) 
                                                        # From the dataframe, select the data belonging to one class per subject
            
            arr_list[var] = np.array(df_list[var]['HeartRate']) # Obtain NumPy arrays for the selected data        
            arr_interp[var] = interp1d(np.arange(arr_list[var].size), arr_list[var]) # Set up an interpolation function
                                                                            # to up/sample to the defined duration            
            arr_reshape[var] = arr_interp[var](np.linspace(0, arr_list[var].size-1, max_size)) 
            
            arr_reshape[var] =  np.reshape(arr_reshape[var], (int(len(arr_reshape[var])/frame_rate), frame_rate)) # Divide sampled data into segments
            df_list[var] = pd.DataFrame(arr_reshape[var]) # A new dataframe to store the processed data
            
            # Adding a label to the data segments
            
            if label[i] == 'Relax_' + str(i+1):
                df_list[var]['Label'] = 0 # Setting all the 4 different Relax classes with '0' label
                df_list[var]['Label_ori'] = label[i]
            else:
                df_list[var]['Label'] = i-3 # Setting PhysicalStress:1, CognitiveStress:2, and EmotionalStress:3
                df_list[var]['Label_ori'] = label[i]
                
            df_final = pd.concat([df_final, df_list[var]]) # Concatentate over all classes and subjects
        
    return df_final

In [19]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

class DataframePrepOneMod:
    def __init__(self, subjects, label):
        """
        Initializes the DataframePrepOneMod class.

        Parameters:
        - subjects (list): List of subjects.
        - label (list): List of labels.
        """
        self.subjects = subjects
        self.label = label

    def dataframe_prep_onemod(self, df, frame_rate=60, max_size=360):
        """
        Up/down samples raw data to 'max_size' and chunks to fixed segments of 'frame_rate' each for one modality.

        Parameters:
        - df (pd.DataFrame): Input DataFrame containing raw data.
        - frame_rate (int): Length of each segment (default: 60).
        - max_size (int): Duration to which the overall duration has to be up/downsampled (default: 360).

        Returns:
        - pd.DataFrame: Processed DataFrame with up/downsampled and segmented data.
        """
        # Declare empty lists and dataframes to store processed data
        df_list = {}
        df_final = pd.DataFrame()

        # Loop through all the subjects
        for j in range(len(self.subjects)):
            # Loop through all the labels
            for i in range(len(self.label)):
                var = f'df_{self.label[i]}_{self.subjects[j]}'  # Variable name based on label and subject
                df_list[var] = df[(df['Subject'] == self.subjects[j]) & (df['Label'] == self.label[i])].reset_index(drop=True)

                # Convert 'HeartRate' column to a NumPy array
                arr_list = np.array(df_list[var]['HeartRate'])

                # Interpolate and reshape the array
                # Interpolate the array
                arr_interp = interp1d(np.arange(arr_list.size), arr_list)
                arr_reshape = np.reshape(arr_interp(np.linspace(0, arr_list.size-1, max_size)),
                         (int(len(arr_interp(np.linspace(0, arr_list.size-1, max_size)))/frame_rate), frame_rate))


                df_list[var] = pd.DataFrame(arr_reshape)

                # Adding a label to the data segments
                df_list[var]['Label'] = 0 if 'Relax' in self.label[i] else i-3
                df_list[var]['Label_ori'] = self.label[i]

                df_final = pd.concat([df_final, df_list[var]])

        return df_final


In [20]:
# Instantiate the class
preprocessor = DataframePrepOneMod(subjects=subjects, label=label)

# Assuming 'your_dataframe' is the DataFrame you want to process
processed_dataframe = preprocessor.dataframe_prep_onemod(dataset_df)

In [21]:
processed_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,Label,Label_ori
0,89.000000,88.164345,87.328691,87.000000,87.000000,87.000000,86.986072,86.150418,85.314763,85.000000,...,75.545961,74.710306,74.000000,74.000000,73.203343,73.000000,73.000000,72.696379,0,Relax_1
1,71.860724,71.025070,71.000000,70.353760,70.000000,70.000000,70.000000,70.000000,70.824513,71.000000,...,68.406685,68.000000,68.000000,68.000000,68.000000,68.000000,68.607242,69.000000,0,Relax_1
2,69.000000,69.114206,69.949861,70.785515,70.378830,70.000000,70.000000,70.000000,70.000000,70.000000,...,70.732591,71.000000,71.000000,71.000000,70.924791,70.089136,70.000000,70.000000,0,Relax_1
3,70.417827,71.000000,71.000000,71.000000,71.760446,72.000000,71.568245,71.000000,70.896936,70.061281,...,76.743733,77.000000,77.543175,78.000000,78.000000,77.949861,77.114206,77.000000,0,Relax_1
4,76.442897,76.000000,75.771588,75.000000,75.000000,74.264624,74.000000,74.000000,73.757660,73.000000,...,79.011142,79.846797,78.635097,76.963788,76.000000,75.810585,75.000000,75.000000,0,Relax_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,77.000000,77.000000,77.000000,77.584958,80.618384,82.000000,84.150418,86.651811,87.000000,89.640669,...,75.208914,75.000000,75.000000,75.000000,75.000000,75.000000,75.000000,74.409471,3,EmotionalStress
2,74.000000,74.000000,73.066852,74.000000,74.000000,74.000000,74.000000,74.000000,74.618384,75.000000,...,76.000000,76.000000,75.128134,75.000000,74.000000,74.000000,74.328691,75.000000,3,EmotionalStress
3,75.000000,75.000000,73.428969,73.000000,72.000000,72.000000,72.000000,72.000000,72.000000,73.169916,...,75.000000,75.610028,76.724234,77.000000,77.000000,78.000000,78.362117,80.000000,3,EmotionalStress
4,80.000000,80.000000,79.362117,79.000000,78.133705,78.000000,77.000000,77.000000,77.000000,77.000000,...,75.000000,75.000000,75.000000,75.000000,75.805014,76.000000,76.033426,76.852368,3,EmotionalStress


In [76]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

class DataframePrepAllMod:
    def __init__(self, subjects, modals, modals_1, modals_2, df_hr_spo2, df_acc_temp_eda, label, frame_rate=60, max_size=360):
        """
        Initializes the DataframePrepAllMod class.

        Parameters:
        - subjects (list): List of subjects.
        - modals (list): List of modalities.
        - modals_1 (list): List of modalities for the first dataset.
        - modals_2 (list): List of modalities for the second dataset.
        - df_hr_spo2 (pd.DataFrame): DataFrame for Heartrate and SpO2 data.
        - df_acc_temp_eda (pd.DataFrame): DataFrame for accelerometer, temperature, and EDA data.
        - label (list): List of labels.
        - frame_rate (int): Length of each segment (default: 60).
        - max_size (int): Duration to which the overall duration has to be up/downsampled (default: 360).
        """
        self.subjects = subjects
        self.modals = modals
        self.modals_1 = modals_1
        self.modals_2 = modals_2
        self.df_hr_spo2 = df_hr_spo2
        self.df_acc_temp_eda = df_acc_temp_eda
        self.label = label
        self.frame_rate = frame_rate
        self.max_size = max_size

    def dataframe_prep_allmod(self):
        """
        Up/down samples raw data to 'max_size' and chunks to fixed segments of 'frame_rate' each for all modalities.

        Returns:
        - pd.DataFrame: Processed DataFrame with up/downsampled and segmented data for all modalities.
        """
        df_mod_all = pd.DataFrame()

        for k in range(len(self.modals)):
            if self.modals[k] in self.modals_1:
                df = self.df_hr_spo2
            elif self.modals[k] in self.modals_2:
                df = self.df_acc_temp_eda

            for j in range(len(self.subjects)):
                for i in range(len(self.label)):
                    var = f'df_{self.label[i]}_{self.subjects[j]}'
                    df_list = df[(df['Subject'] == self.subjects[j]) & (df['Label'] == self.label[i])].reset_index(drop=True)
                    arr_list = np.array(df_list[self.modals[k]])

                    arr_interp = interp1d(np.arange(arr_list.size), arr_list)
                    arr_reshape = np.reshape(arr_interp(np.linspace(0, arr_list.size-1, int(self.max_size))),
                                             (int(len(arr_interp(np.linspace(0, arr_list.size-1, int(self.max_size)))) / self.frame_rate), self.frame_rate))

                    df_mod_one = pd.DataFrame(arr_reshape)

                    df_mod_one['Label'] = 0 if 'Relax' in self.label[i] else i-3
                    df_mod_one['Label_ori'] = self.label[i]

                    df_mod_one['Subject'] = self.subjects[j]
                    df_mod_one['Modality'] = self.modals[k]
                    df_mod_all = pd.concat([df_mod_all, df_mod_one])

        data = df_mod_all[df_mod_all['Modality'] == self.modals[0]].iloc[:, :-1].values

        for i in range(1, len(self.modals)):
            data = np.dstack((data, (df_mod_all[df_mod_all['Modality'] == self.modals[i]].iloc[:, :-1].values)))

        df_fin = pd.DataFrame()
        df_fin['Data'] = list(data[:, :-1])
        df_fin['Subject'] = list(data[:, -1, 0])
        df_fin['Label'] = list(data[:, -3, 0])
        df_fin['Label_ori'] = list(data[:, -2, 0])
    
        return df_fin


In [77]:
# Create an instance of the class
preprocessor_allmod = DataframePrepAllMod(subjects, modals, modals_1, modals_2, df_hr_spo2, df_acc_temp_eda, label)

# Use the method to preprocess the data
processed_dataframe_allmod = preprocessor_allmod.dataframe_prep_allmod()

In [78]:
processed_dataframe_allmod

Unnamed: 0,Data,Subject,Label,Label_ori
0,"[[89.0, 97.0, 30.1, 0.083, 0.76, -0.87, -0.1],...",Subject_1,0,Relax_1
1,"[[71.86072423398329, 97.0, 30.8, 0.083, 0.72, ...",Subject_1,0,Relax_1
2,"[[69.0, 95.0, 31.2, 0.087, 0.14228412256267461...",Subject_1,0,Relax_1
3,"[[70.41782729805016, 95.0, 31.6, 0.087, 0.15, ...",Subject_1,0,Relax_1
4,"[[76.44289693593313, 96.0, 31.8, 0.087, 0.1554...",Subject_1,0,Relax_1
...,...,...,...,...
835,"[[77.0, 94.0, 34.5, 2.3835766016713094, 0.8621...",Subject_9,3,EmotionalStress
836,"[[74.0, 95.0, 34.3, 2.237, -0.1770194986072419...",Subject_9,3,EmotionalStress
837,"[[75.0, 96.0, 34.1, 1.77, -0.17, -0.9905292479...",Subject_9,3,EmotionalStress
838,"[[80.0, 96.40947075208913, 34.1, 1.43998050139...",Subject_9,3,EmotionalStress


In [84]:
import pandas as pd
import numpy as np

class LOSOValidator:
    def __init__(self, df):
        """
        Initializes the LOSOValidator class.

        Parameters:
        - df (pd.DataFrame): The input dataframe containing the data, where each row represents a data segment.
        """
        self.df = df
        self.df_train = pd.DataFrame()
        self.df_test = pd.DataFrame()

    def train_test_loso(self, test_subject=0):
        """
        Performs Leave-One-Subject-Out (LOSO) cross-validation.

        Parameters:
        - test_subject (int): The index of the subject to be selected as the test subject.

        Returns:
        - pd.DataFrame: Training dataset.
        - pd.DataFrame: Test dataset.
        """
        # Extract all unique subjects from the dataframe
        all_subjects = self.df['Subject'].unique()

        for label in self.df['Label_ori'].unique():
            data_train = self.df[self.df['Subject'] != all_subjects[test_subject]].copy()  # Select train subjects
            data_test = self.df[self.df['Subject'] == all_subjects[test_subject]].copy()  # Select test subject

            # Create dataframes for training and test sets
            df_one_train = self.create_dataframe(data_train, label)
            df_one_test = self.create_dataframe(data_test, label)

            # Concatenate with cumulative dataframes
            self.df_train = pd.concat([self.df_train, df_one_train]).reset_index(drop=True)
            self.df_test = pd.concat([self.df_test, df_one_test]).reset_index(drop=True)

        return self.df_train, self.df_test

    def create_dataframe(self, data, label):
        """
        Creates a dataframe from the provided data with the specified label.

        Parameters:
        - data (pd.DataFrame): Input data.
        - label (int): Label for the dataframe.

        Returns:
        - pd.DataFrame: Processed dataframe.
        """
        df_one = pd.DataFrame()
        df_one['Data'] = [np.array(xi[:-2, :]) for xi in data['Data'].values]
        df_one['Label'] = data['Data'].apply(lambda x: x[-2, 0])
        df_one['Label_ori'] = label

        df_one['Subject'] = data['Subject'].values[0]
 
        return df_one


In [85]:
# Assuming you have a DataFrame 'df' with the required columns ('Data', 'Subject', 'Label_ori')
# ...

# Example usage for Leave-One-Subject-Out (LOSO) cross-validation
cross_validator_loso = LOSOValidator(processed_dataframe_allmod)
train_loso, test_loso = cross_validator_loso.train_test_loso(test_subject=1)




In [86]:
train_loso

Unnamed: 0,Data,Label,Label_ori,Subject
0,"[[89.0, 97.0, 30.1, 0.083, 0.76, -0.87, -0.1],...",0.0,Relax_1,Subject_1
1,"[[71.86072423398329, 97.0, 30.8, 0.083, 0.72, ...",0.0,Relax_1,Subject_1
2,"[[69.0, 95.0, 31.2, 0.087, 0.14228412256267461...",0.0,Relax_1,Subject_1
3,"[[70.41782729805016, 95.0, 31.6, 0.087, 0.15, ...",0.0,Relax_1,Subject_1
4,"[[76.44289693593313, 96.0, 31.8, 0.087, 0.1554...",0.0,Relax_1,Subject_1
...,...,...,...,...
5581,"[[77.0, 94.0, 34.5, 2.3835766016713094, 0.8621...",3.0,EmotionalStress,Subject_1
5582,"[[74.0, 95.0, 34.3, 2.237, -0.1770194986072419...",3.0,EmotionalStress,Subject_1
5583,"[[75.0, 96.0, 34.1, 1.77, -0.17, -0.9905292479...",3.0,EmotionalStress,Subject_1
5584,"[[80.0, 96.40947075208913, 34.1, 1.43998050139...",3.0,EmotionalStress,Subject_1


In [87]:
test_loso

Unnamed: 0,Data,Label,Label_ori,Subject
0,"[[96.0, 97.0, 27.0, 3.251, 0.45, -1.01, -0.04]...",,Relax_1,Subject_10
1,"[[67.72144846796657, 95.0, 28.3, 3.676, 0.8511...",,Relax_1,Subject_10
2,"[[72.27855153203343, 95.0, 29.1, 3.776, 1.05, ...",,Relax_1,Subject_10
3,"[[75.16434540389969, 95.0, 29.7, 3.682, 1.0565...",,Relax_1,Subject_10
4,"[[78.55710306406687, 95.0, 30.1, 3.579, 1.0545...",,Relax_1,Subject_10
...,...,...,...,...
289,"[[84.0, 96.0, 33.1, 8.208, 0.733760445682451, ...",,EmotionalStress,Subject_10
290,"[[76.62952646239552, 96.0, 33.02479108635098, ...",,EmotionalStress,Subject_10
291,"[[67.0557103064067, 95.0, 33.1, 7.712, 0.58, -...",,EmotionalStress,Subject_10
292,"[[71.74094707520896, 96.0, 33.1, 7.617, 0.61, ...",,EmotionalStress,Subject_10


In [88]:
processed_dataframe_allmod

Unnamed: 0,Data,Subject,Label,Label_ori
0,"[[89.0, 97.0, 30.1, 0.083, 0.76, -0.87, -0.1],...",Subject_1,0,Relax_1
1,"[[71.86072423398329, 97.0, 30.8, 0.083, 0.72, ...",Subject_1,0,Relax_1
2,"[[69.0, 95.0, 31.2, 0.087, 0.14228412256267461...",Subject_1,0,Relax_1
3,"[[70.41782729805016, 95.0, 31.6, 0.087, 0.15, ...",Subject_1,0,Relax_1
4,"[[76.44289693593313, 96.0, 31.8, 0.087, 0.1554...",Subject_1,0,Relax_1
...,...,...,...,...
835,"[[77.0, 94.0, 34.5, 2.3835766016713094, 0.8621...",Subject_9,3,EmotionalStress
836,"[[74.0, 95.0, 34.3, 2.237, -0.1770194986072419...",Subject_9,3,EmotionalStress
837,"[[75.0, 96.0, 34.1, 1.77, -0.17, -0.9905292479...",Subject_9,3,EmotionalStress
838,"[[80.0, 96.40947075208913, 34.1, 1.43998050139...",Subject_9,3,EmotionalStress
