In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time 
import re
import csv
import scipy.io
# import biosppy
# import mne
# import neurokit2 as nk
import ast
import os
import scipy.io
from sklearn.preprocessing import LabelEncoder
import time
import datetime
from datetime import datetime
import glob
from scipy.stats import zscore, norm
# from neurokit2 import eda_phasic
from scipy.stats import linregress
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm

def TimeStamp_Conversion(ts):
  """
  Convert a Unix epoch timestamp given in milliseconds to seconds.

  parameters:
  -----
  ts = Epoch timestamp in milliseconds, as a number or a numeric string (example: "1.5789360034388428E12").

  Returns:
  -----
  Std_Unix = the same instant as a float epoch timestamp in seconds (example: 1578936003.4388428).
             A NaN input is returned as NaN.

  Raises:
  -----
  ValueError / TypeError when ts cannot be converted to float.

  """

  # Only the numeric conversion is needed here. Building a datetime from the
  # result (and discarding it) would raise on NaN or out-of-range values even
  # though the caller only wants the float.
  Std_Unix = float(ts) / 1000

  return Std_Unix

def column_formatting(Timestamp_DF):
  """
  Column names of Timestamp annotation excel have column index attached to column name as we only need column name we are parsing column names.

  Parameters:
  -----
  Timestamp_DF = Input the DF after reading the timestamp annotationexcel file to  get list of column names['A1- ECG baseline start','B1- ECG baseline end',.....].

  Returns:
  -----
  Parsed_ColumnNames = list of parsed column names. ['ECG baseline start','ECG baseline end',....]

  """

  Parsed_colnames = ['Subject_ID'] ## Column with Participant ID is not named, so declaring first column as Subject_ID to an empty list

  for index in range(1,len(Timestamp_DF.columns)): ## Looping through the list of timestamp annotation columns list
    column = Timestamp_DF.columns[index][4:].lstrip() ## Drop first 3 indices of each column and strip space(" ") if present as left most
    Parsed_colnames.append(column) ## appending each column name after parsing

  return Parsed_colnames ## returns list fo parsed col names


def Annotation_timestamp(timestamp_path, sheet_name):
  """
  Load the timestamp annotation sheet, rename its columns and convert every
  timestamp column from epoch milliseconds to epoch seconds.

  Parameters:
  -----
  timestamp_path = path of the timestamp annotation excel file.
  sheet_name = sheet to read from that file (passed to pandas.read_excel).

  Results:
  -----
  VR_TimeStamps_D = DataFrame whose first column is 'Subject_ID' and whose other
                    columns hold epoch timestamps in seconds.

  """

  # Read from the `timestamp_path` argument. The old body referred to
  # `Timestamp_path`, a name that is not defined inside this function.
  VR_TimeStamps_D = pd.read_excel(timestamp_path, sheet_name=sheet_name)

  # Replace the prefixed sheet headers with the parsed column names.
  VR_TimeStamps_D.columns = column_formatting(VR_TimeStamps_D)

  # Every column except the participant id holds millisecond timestamps.
  for col in VR_TimeStamps_D.columns:
    if col != 'Subject_ID':
      VR_TimeStamps_D[col] = VR_TimeStamps_D[col].map(TimeStamp_Conversion)

  return VR_TimeStamps_D

def Shimmers_csv2DF(path,filename):
  """
  Read one tab-separated Shimmer export and return it as a DataFrame together
  with the participant id taken from the file name.

  Expected file layout (as handled by this function):
    line 1  -> delimiter declaration, discarded
    line 2  -> column names
    line 3  -> units, discarded
    line 4+ -> samples

  Parameters:
  -----
  path = path to directory of shimmers file folder.

  filename = name of the file to be loaded.

  Results:
  -----
  Dataframe = samples as strings, one column per header entry, index reset to 0..n-1.
  Participant_ID = first standalone run of digits in the file name, as an int.

  Raises:
  -----
  IndexError if the file is empty or the file name contains no standalone number.

  """

  # newline='' is what the csv module asks for. Parsing with the tab delimiter
  # directly keeps a comma inside a field from being mistaken for a separator.
  with open(os.path.join(path, filename), 'r', newline='') as file:
    rows = list(csv.reader(file, delimiter='\t'))

  del rows[0] # first line only declares the delimiter

  # blank lines carry no data
  rows = [row for row in rows if row]

  # Extract subjectID from the file name for future use.
  # int() is used so that ids written with leading zeros are accepted.
  filename_parse = filename.replace("_", " ")
  Participant_ID = int(re.findall(r'\b\d+\b', filename_parse)[0])

  # The header row supplies the column names. Rows 0 and 1 (names and units)
  # are then dropped from the data itself.
  Dataframe = pd.DataFrame(rows, columns = rows[0])
  Dataframe = Dataframe.drop([0,1])
  Dataframe.reset_index(drop=True, inplace=True)

  return Dataframe, Participant_ID

def Unix_to_normal_time(ts):
  """
  Turn a Unix epoch timestamp in milliseconds into a human readable date/time string.

  parameters:
  -----
  ts = Epoch timestamp in milliseconds, number or numeric string (example: "1.5789360034388428E12").

  Returns:
  -----
  datetime_Input = string formatted as "MM/DD/YYYY, HH:MM:SS AM/PM" (12-hour clock).
                   datetime.fromtimestamp is called without a tz, so the value is
                   expressed in the local time zone of the machine.

  """

  # milliseconds -> seconds
  seconds_since_epoch = float(ts) / 1000

  # seconds -> datetime -> formatted text
  moment = datetime.fromtimestamp(seconds_since_epoch)
  return moment.strftime("%m/%d/%Y, %I:%M:%S %p")

def find_participant_id(filename):
  """
  Return the participant id embedded in a file name.

  Underscores are treated as word separators, and the first run of digits that
  stands alone as a word is returned as a string (example:
  'Subj_VR_1058_Session1_Shimmer_89C4.txt' -> '1058').

  Raises IndexError when the name holds no standalone number.
  """
  # Turn "_" into spaces so that \b sees a word boundary around the id.
  spaced_name = " ".join(filename.split("_"))
  standalone_numbers = re.findall(r'\b\d+\b', spaced_name)

  return standalone_numbers[0]


In [2]:
# Sanity Check 


def _shimmer_sanity_check(folder_path, timestamp_column):
    '''
    Print a short report for every .txt / .csv Shimmer file in folder_path:
    row and column counts, start and end time, duration and approximate
    sampling frequency.

    folder_path      -- directory holding the Shimmer files.
    timestamp_column -- name of the column holding epoch timestamps in milliseconds.

    Side effect: the working directory of the process is changed to folder_path.
    '''
    # Make the path absolute before changing directory, so that a relative
    # folder_path still points at the right place when the files are opened.
    folder_path = os.path.abspath(folder_path)
    os.chdir(folder_path)
    file_names = glob.glob('*.{}'.format('txt'))
    file_names.extend(glob.glob('*.{}'.format('csv')))

    for file_name in file_names:
        Dataframe, Participant_ID = Shimmers_csv2DF(folder_path, file_name)
        print("-- Participant ID", Participant_ID,"\n")
        print("     Number of Rows for Subject", Participant_ID, "is --", len(Dataframe))
        print("     Number of Columns for Subject", Participant_ID, "is --", Dataframe.shape[1])

        if len(Dataframe) == 0:
            # Nothing to time: report it and move on to the next file.
            print("     No samples found -- timing checks skipped", "\n")
            continue

        timestamps = Dataframe[timestamp_column]
        Starting_time_of_the_experiment = timestamps.iloc[0]
        ending_time_of_the_experiment = timestamps.iloc[-1]
        Duration_in_Seconds = round((float(ending_time_of_the_experiment) - float(Starting_time_of_the_experiment))/(1000))

        print("     Starting time -- ", Unix_to_normal_time(Starting_time_of_the_experiment))
        print("     Ending time -- ", Unix_to_normal_time(ending_time_of_the_experiment))
        print("     Approximate time in minutes -- ", round((Duration_in_Seconds/60)), "min")
        print("    ", Duration_in_Seconds , " Seconds")

        if Duration_in_Seconds > 0:
            print("     Frequency of the Dataset is approx ", round(len(Dataframe)/Duration_in_Seconds), "Hz", "\n")
        else:
            # A recording shorter than half a second rounds to 0 s; the
            # frequency estimate would divide by zero.
            print("     Frequency of the Dataset cannot be estimated (duration rounds to 0 s)", "\n")


def GSR_Sanity_check(gsr_folder_path):
    '''
    Sanity report for the GSR recordings in gsr_folder_path.

    Check for sampling rates
    Check for their start time and end time
    Check for Length of each dataframe

    Timestamps are read from the 'Shimmer_89C4_Timestamp_Unix_CAL' column.
    Side effect: the working directory is changed to gsr_folder_path.
    '''
    _shimmer_sanity_check(gsr_folder_path, 'Shimmer_89C4_Timestamp_Unix_CAL')


def ECG_Sanity_check(ecg_folder_path):
    '''
    Sanity report for the ECG recordings in ecg_folder_path.

    Check for sampling rates
    Check for their start time and end time
    Check for Length of each dataframe

    Timestamps are read from the 'Shimmer_CB7C_Timestamp_Unix_CAL' column.
    Side effect: the working directory is changed to ecg_folder_path.
    '''
    _shimmer_sanity_check(ecg_folder_path, 'Shimmer_CB7C_Timestamp_Unix_CAL')
    

In [4]:
# Print the sanity report for every ECG recording found in this folder.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
ecg_folder_path = 'C:/Users/govindd1/Desktop/New_Food_VR/ECG_DATA/ECG_DATA'
ECG_Sanity_check(ecg_folder_path)

-- Participant ID 1056 

     Number of Rows for Subject 1056 is -- 889607
     Number of Columns for Subject 1056 is -- 28
     Starting time --  03/10/2020, 11:25:53 AM
     Ending time --  03/10/2020, 11:55:09 AM
     Approximate time in minutes --  29 min
     1755  Seconds
     Frequency of the Dataset is approx  507 Hz 

-- Participant ID 1058 

     Number of Rows for Subject 1058 is -- 885567
     Number of Columns for Subject 1058 is -- 28
     Starting time --  03/10/2020, 01:07:36 PM
     Ending time --  03/10/2020, 01:36:45 PM
     Approximate time in minutes --  29 min
     1749  Seconds
     Frequency of the Dataset is approx  506 Hz 

-- Participant ID 793 

     Number of Rows for Subject 793 is -- 1011473
     Number of Columns for Subject 793 is -- 28
     Starting time --  02/18/2020, 01:02:29 PM
     Ending time --  02/18/2020, 01:35:46 PM
     Approximate time in minutes --  33 min
     1997  Seconds
     Frequency of the Dataset is approx  506 Hz 

-- Participant 

In [6]:
# Print the sanity report for every GSR recording found in this folder.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
gsr_folder_path = 'C:/Users/govindd1/Desktop/New_Food_VR/GSR_DATA/GSR_DATA'
GSR_Sanity_check(gsr_folder_path)

-- Participant ID 1058 

     Number of Rows for Subject 1058 is -- 223935
     Number of Columns for Subject 1058 is -- 17
     Starting time --  03/10/2020, 01:07:34 PM
     Ending time --  03/10/2020, 01:36:45 PM
     Approximate time in minutes --  29 min
     1750  Seconds
     Frequency of the Dataset is approx  128 Hz 

-- Participant ID 942 

     Number of Rows for Subject 942 is -- 284172
     Number of Columns for Subject 942 is -- 17
     Starting time --  02/19/2020, 12:40:10 PM
     Ending time --  02/19/2020, 01:17:10 PM
     Approximate time in minutes --  37 min
     2221  Seconds
     Frequency of the Dataset is approx  128 Hz 

-- Participant ID 962 

     Number of Rows for Subject 962 is -- 225120
     Number of Columns for Subject 962 is -- 17
     Starting time --  03/06/2020, 12:09:32 PM
     Ending time --  03/06/2020, 12:38:51 PM
     Approximate time in minutes --  29 min
     1759  Seconds
     Frequency of the Dataset is approx  128 Hz 

-- Participant ID 1

In [7]:
# get meta table from both GSR and ECG files

def get_meta_table(gsr_folder_path, ecg_folder_path,
                   gsr_timestamp_column='Shimmer_89C4_Timestamp_Unix_CAL',
                   ecg_timestamp_column='Shimmer_CB7C_Timestamp_Unix_CAL'):
    '''
    Build one table row per (GSR file, ECG file) pair that belongs to the same
    participant, holding the first and last timestamp of both recordings.

    Parameters:
    -----
    gsr_folder_path = directory holding the GSR Shimmer files (.txt / .csv).
    ecg_folder_path = directory holding the ECG Shimmer files (.txt / .csv).
    gsr_timestamp_column = column of the GSR files holding epoch timestamps in milliseconds.
    ecg_timestamp_column = column of the ECG files holding epoch timestamps in milliseconds.

    Returns:
    -----
    MetaData_df = DataFrame with the columns
        Participant_ID,
        Unix_Starting_time_of_GSR_signal, Unix_Ending_time_of_GSR_signal,
        Unix_Starting_time_of_ECG_signal, Unix_Ending_time_of_ECG_signal,
        Normal_Starting_time_of_GSR_signal, Normal_Ending_time_of_GSR_signal,
        Normal_Starting_time_of_ECG_signal, Normal_Ending_time_of_ECG_signal
    The Unix_* values are kept exactly as read from the files; the Normal_*
    values are the formatted strings produced by Unix_to_normal_time.

    Side effects: prints one line per matched pair and leaves the working
    directory of the process set to ecg_folder_path.
    '''
    # Resolve both folders before any chdir. Otherwise a relative ECG path
    # would be interpreted relative to the GSR folder.
    gsr_folder_path = os.path.abspath(gsr_folder_path)
    ecg_folder_path = os.path.abspath(ecg_folder_path)

    os.chdir(gsr_folder_path)
    gsr_list_csv = glob.glob('*.{}'.format('txt'))
    gsr_list_csv.extend(glob.glob('*.{}'.format('csv')))

    os.chdir(ecg_folder_path)
    ECG_list_csv = glob.glob('*.{}'.format('txt'))
    ECG_list_csv.extend(glob.glob('*.{}'.format('csv')))

    column_order = [
        'Participant_ID',
        'Unix_Starting_time_of_GSR_signal',
        'Unix_Ending_time_of_GSR_signal',
        'Unix_Starting_time_of_ECG_signal',
        'Unix_Ending_time_of_ECG_signal',
        'Normal_Starting_time_of_GSR_signal',
        'Normal_Ending_time_of_GSR_signal',
        'Normal_Starting_time_of_ECG_signal',
        'Normal_Ending_time_of_ECG_signal',
    ]
    rows = []

    for i in gsr_list_csv:
        gsr_participant = find_participant_id(i)
        GSR_Dataframe = None  # loaded lazily, at most once per GSR file

        for j in ECG_list_csv:
            if find_participant_id(j) != gsr_participant:
                continue

            print(i, " -<< matches >>- ", j)

            if GSR_Dataframe is None:
                GSR_Dataframe, Participant_ID = Shimmers_csv2DF(gsr_folder_path, i)
            gsr_timestamps = GSR_Dataframe[gsr_timestamp_column]
            Unix_Starting_time_of_GSR_signal = gsr_timestamps.iloc[0]
            Unix_Ending_time_of_GSR_signal = gsr_timestamps.iloc[-1]

            ECG_Dataframe, _ = Shimmers_csv2DF(ecg_folder_path, j)
            ecg_timestamps = ECG_Dataframe[ecg_timestamp_column]
            Unix_Starting_time_of_ECG_signal = ecg_timestamps.iloc[0]
            Unix_Ending_time_of_ECG_signal = ecg_timestamps.iloc[-1]

            rows.append({
                'Participant_ID': Participant_ID,
                'Unix_Starting_time_of_GSR_signal': Unix_Starting_time_of_GSR_signal,
                'Unix_Ending_time_of_GSR_signal': Unix_Ending_time_of_GSR_signal,
                'Unix_Starting_time_of_ECG_signal': Unix_Starting_time_of_ECG_signal,
                'Unix_Ending_time_of_ECG_signal': Unix_Ending_time_of_ECG_signal,
                'Normal_Starting_time_of_GSR_signal': Unix_to_normal_time(Unix_Starting_time_of_GSR_signal),
                'Normal_Ending_time_of_GSR_signal': Unix_to_normal_time(Unix_Ending_time_of_GSR_signal),
                'Normal_Starting_time_of_ECG_signal': Unix_to_normal_time(Unix_Starting_time_of_ECG_signal),
                'Normal_Ending_time_of_ECG_signal': Unix_to_normal_time(Unix_Ending_time_of_ECG_signal),
            })

    # Build the frame in one go. DataFrame.append no longer exists in pandas >= 2.0.
    MetaData_df = pd.DataFrame(rows, columns=column_order)

    return MetaData_df
    
       

In [8]:
# Build the start/end time table for every participant that has both a GSR and an ECG file.
meta_data = get_meta_table(gsr_folder_path, ecg_folder_path)

Subj_VR_1058_Session1_Shimmer_89C4.txt  -<< matches >>-  Subj_VR_1058_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_942_Session1_Shimmer_89C4.txt  -<< matches >>-  Subj_VR_942_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_962_Session1_Shimmer_89C4.txt  -<< matches >>-  Subj_VR_962_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_1056_Session1_Shimmer_89C4_Calibrated_SD.csv  -<< matches >>-  Subj_VR_1056_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_793_Session1_Shimmer_89C4_Calibrated_SD.csv  -<< matches >>-  Subj_VR_793_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_946_Session1_Shimmer_89C4_Calibrated_SD.csv  -<< matches >>-  Subj_VR_946_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_961_Session1_Shimmer_89C4_Calibrated_SD.csv  -<< matches >>-  Subj_VR_961_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_963_Session1_Shimmer_89C4_Calibrated_SD.csv  -<< matches >>-  Subj_VR_963_Session1_Shimmer_CB7C_Calibrated_SD.csv
Subj_VR_966_Session1_Shimmer_89C4_Calibrated_SD.csv  -<< match

In [9]:
# Display the metadata table built in the previous cell.
meta_data

Unnamed: 0,Participant_ID,Unix_Starting_time_of_GSR_signal,Unix_Ending_time_of_GSR_signal,Unix_Starting_time_of_ECG_signal,Unix_Ending_time_of_ECG_signal,Normal_Starting_time_of_GSR_signal,Normal_Ending_time_of_GSR_signal,Normal_Starting_time_of_ECG_signal,Normal_Ending_time_of_ECG_signal
0,1058,1583860054947.5403,1583861805220.9778,1583860056037.2925,1583861805111.5112,"03/10/2020, 01:07:34 PM","03/10/2020, 01:36:45 PM","03/10/2020, 01:07:36 PM","03/10/2020, 01:36:45 PM"
1,942,1582134010151.764,1582136230753.3264,1582134011222.0764,1582136229563.8733,"02/19/2020, 12:40:10 PM","02/19/2020, 01:17:10 PM","02/19/2020, 12:40:11 PM","02/19/2020, 01:17:09 PM"
2,962,1583514572002.6245,1583516331080.7495,1583514573063.7207,1583516331184.845,"03/06/2020, 12:09:32 PM","03/06/2020, 12:38:51 PM","03/06/2020, 12:09:33 PM","03/06/2020, 12:38:51 PM"
3,1056,1583853952862.915,1583855709198.8523,1583853953909.3018,1583855709208.13,"03/10/2020, 11:25:52 AM","03/10/2020, 11:55:09 AM","03/10/2020, 11:25:53 AM","03/10/2020, 11:55:09 AM"
4,793,1582048948726.2268,1582050946944.9768,1582048949851.471,1582050946800.6897,"02/18/2020, 01:02:28 PM","02/18/2020, 01:35:46 PM","02/18/2020, 01:02:29 PM","02/18/2020, 01:35:46 PM"
5,946,1583770136243.3472,1583772216368.3472,1583770137301.6355,1583772216371.9482,"03/09/2020, 12:08:56 PM","03/09/2020, 12:43:36 PM","03/09/2020, 12:08:57 PM","03/09/2020, 12:43:36 PM"
6,961,1582739312855.957,1582741386473.1443,1582739313932.7087,1582741386587.0056,"02/26/2020, 12:48:32 PM","02/26/2020, 01:23:06 PM","02/26/2020, 12:48:33 PM","02/26/2020, 01:23:06 PM"
7,963,1582648889498.9624,1582650810741.15,1582648890565.2466,1582650810793.7622,"02/25/2020, 11:41:29 AM","02/25/2020, 12:13:30 PM","02/25/2020, 11:41:30 AM","02/25/2020, 12:13:30 PM"
8,966,1583774299086.1511,1583776375593.9636,1583774300110.138,1583776375596.466,"03/09/2020, 01:18:19 PM","03/09/2020, 01:52:55 PM","03/09/2020, 01:18:20 PM","03/09/2020, 01:52:55 PM"
9,984,1582733631263.794,1582738190974.7314,1582733632377.777,1582738190371.9175,"02/26/2020, 11:13:51 AM","02/26/2020, 12:29:50 PM","02/26/2020, 11:13:52 AM","02/26/2020, 12:29:50 PM"
