# Concatenating Data (and upsampling ACC data)

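This notebook builds a single dataframe per participant containing the BVP, EDA, TEMP and ACC (X, Y and Z) signals, aligned on the 64 Hz BVP time grid. The lower-rate signals (EDA and TEMP at 4 Hz, ACC at 32 Hz) are not resampled at this stage: rows of the 64 Hz grid that have no sample for a given signal are left as missing values, so that each signal can be filtered before any gaps are filled. Repeating the EDA/TEMP values and upsampling (interpolating) the ACC data to 64 Hz to match the BVP sampling frequency are deferred until after that filtering step.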

In [2]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime, timedelta
from scipy.signal import resample

In [4]:
# Work from the Empatica export folder: the loader functions in this notebook
# build participant paths relative to os.getcwd().
empatica_dir = 'C:\\Users\\katgm\\Rutgers University\\Michelle Chen - Rutgers_Neuropsych_Lab\\COVID_Fatigue\\RC_award\\Data\\Empatica'
os.chdir(empatica_dir)
os.getcwd()

'C:\\Users\\katgm\\Rutgers University\\Michelle Chen - Rutgers_Neuropsych_Lab\\COVID_Fatigue\\RC_award\\Data\\Empatica'

In [50]:
def _sample_timestamps(init_time, rate_hz, n_samples):
    """Return ``n_samples`` timestamps spaced ``1 / rate_hz`` seconds apart.

    The step is expressed in whole microseconds, which is exact for the
    4, 32 and 64 Hz rates used here, so the grids of the different signals
    coincide exactly and can be merged on timestamp equality.
    """
    step = timedelta(microseconds=1_000_000 // rate_hz)
    return pd.date_range(start=init_time, periods=n_samples, freq=step)


def concatenate_lab_data(SubjID):
    """Merge one participant's EDA, BVP, ACC and TEMP files into one table.

    The files are read from ``<cwd>/<SubjID>/<SubjID>_baseline/``. In every
    file the first two rows are treated as a header and dropped: the first
    value of ``EDA.csv`` is used as the recording start time (Unix seconds)
    for all four signals, and the second row is not read (presumably the
    sampling rate -- the rates are hard-coded below; confirm against the
    export format if a different device is used).

    Parameters
    ----------
    SubjID : str
        Participant folder name, e.g. ``'Cov4'``.

    Returns
    -------
    pandas.DataFrame
        One row per BVP sample with the columns ``timestamp``,
        ``BVP (64 Hz)``, ``EDA (4 Hz)``, ``TEMP (4 Hz)``, ``ACC X (32 Hz)``,
        ``ACC Y (32 Hz)`` and ``ACC Z (32 Hz)``. Rows that have no sample of
        a lower-rate signal hold NaN in that signal's column(s).
    """
    # os.path.join keeps the lookup portable instead of hard-coding '\\'.
    wd = os.path.join(os.getcwd(), SubjID, SubjID + '_baseline')

    def read_values(file_name):
        return pd.read_csv(os.path.join(wd, file_name), header=None).to_numpy()

    eda = read_values('EDA.csv').flatten()
    bvp = read_values('BVP.csv').flatten()
    acc = read_values('ACC.csv')  # stays 2-D: one column per axis
    temp = read_values('TEMP.csv').flatten()

    # All signals are assumed to share the start time stored in EDA.csv.
    # fromtimestamp() yields a naive datetime in the machine's local time.
    init_time = datetime.fromtimestamp(eda[0])

    # Drop the two header rows so that only samples remain.
    eda = eda[2:]
    bvp = bvp[2:]
    acc = acc[2:]
    temp = temp[2:]

    eda_df = pd.DataFrame(data=eda, columns=['EDA (4 Hz)'])
    eda_df['timestamp'] = _sample_timestamps(init_time, 4, len(eda))

    bvp_df = pd.DataFrame(data=bvp, columns=['BVP (64 Hz)'])
    bvp_df['timestamp'] = _sample_timestamps(init_time, 64, len(bvp))

    acc_df = pd.DataFrame(data=acc, columns=['ACC X (32 Hz)', 'ACC Y (32 Hz)', 'ACC Z (32 Hz)'])
    acc_df['timestamp'] = _sample_timestamps(init_time, 32, len(acc))

    temp_df = pd.DataFrame(data=temp, columns=['TEMP (4 Hz)'])
    temp_df['timestamp'] = _sample_timestamps(init_time, 4, len(temp))

    # Left-join everything onto the 64 Hz BVP grid. Missing values are kept
    # on purpose: the signals should be filtered before gaps are filled
    # (forward-fill for EDA/TEMP, interpolation for ACC).
    merged_df = (
        pd.merge(bvp_df, eda_df, on='timestamp', how='left')
        .merge(acc_df, on='timestamp', how='left')
        .merge(temp_df, on='timestamp', how='left')
    )

    new_cols = ['timestamp', 'BVP (64 Hz)', 'EDA (4 Hz)', 'TEMP (4 Hz)',
                'ACC X (32 Hz)', 'ACC Y (32 Hz)', 'ACC Z (32 Hz)']

    return merged_df[new_cols]

In [36]:
# Build the merged signal table for every participant recording.
# The calls run in the order the IDs are listed.
cov4, cov7, cov8, cov13, cov14, cov20, cov19, cov22, cov23 = (
    concatenate_lab_data(subj_id)
    for subj_id in (
        'Cov4', 'Cov7', 'Cov8', 'Cov13', 'Cov14',
        'Cov20', 'Cov19', 'Cov22', 'Cov23',
    )
)

In [52]:
# Build the merged signal table for the 'CovTest' recording.
covtest = concatenate_lab_data('CovTest')

In [53]:
# Now we need to add the fatigue ratings.
# `ewd` is the folder holding the E-Prime exports; get_ratings() reads each
# participant's ratings from <ewd><SubjID>\<SubjID>_fatigue_ratings.txt.
# NOTE(review): this path is under 'Users\Owner', whereas the Empatica folder
# passed to os.chdir() earlier is under 'Users\katgm' -- confirm which profile
# applies on the machine running the notebook.
ewd = 'C:\\Users\\Owner\\Rutgers University\\Michelle Chen - Rutgers_Neuropsych_Lab\\COVID_Fatigue\\RC_award\\Data\\E-Prime\\'

def get_ratings(SubjID, ratings_dir=None):
    """Read a participant's fatigue ratings from their E-Prime text export.

    The file ``<ratings_dir>/<SubjID>/<SubjID>_fatigue_ratings.txt`` is parsed
    as UTF-16LE text split on ``':'``. Only rows whose label contains
    ``'FatigueRating'`` are kept, and every value that can be converted to an
    integer becomes one rating, in file order.

    Parameters
    ----------
    SubjID : str
        Participant folder name, e.g. ``'Cov4'``.
    ratings_dir : str, optional
        Folder containing one sub-folder per participant. Defaults to the
        module-level ``ewd`` path.

    Returns
    -------
    pandas.DataFrame
        Seven rows with the columns ``Block`` (0-6) and ``Rating``.

    Raises
    ------
    ValueError
        If the file does not yield exactly seven integer ratings.
    """
    base_dir = ewd if ratings_dir is None else ratings_dir
    ratings_path = os.path.join(base_dir, SubjID, SubjID + '_fatigue_ratings.txt')

    # The header line has one field and the data lines have two, so pandas
    # uses the text before ':' as the row label; lines with a different
    # number of fields than expected are skipped rather than raising.
    epr = pd.read_csv(ratings_path, sep=':', encoding='UTF-16LE', on_bad_lines='skip')
    epr = epr.filter(regex='FatigueRating', axis=0)

    raw_values = epr.iloc[:, 0].tolist() if epr.shape[1] else []

    clean = []
    for value in raw_values:
        try:
            clean.append(int(value))
        except (TypeError, ValueError, OverflowError):
            # Matching rows whose value is not an integer (missing or text)
            # are not ratings; skip only those instead of hiding any error.
            continue

    n_blocks = 7
    if len(clean) != n_blocks:
        raise ValueError(
            "Expected " + str(n_blocks) + " fatigue ratings for " + str(SubjID)
            + " but found " + str(len(clean)) + " in " + ratings_path
        )

    return pd.DataFrame({'Block': list(range(n_blocks)), 'Rating': clean})

In [54]:
#we also need to extract the tags for each block

def add_blocks(SubjID, data):
    """Label each sample with its block number and fatigue rating.

    Event tags are read from ``<cwd>/<SubjID>/<SubjID>_baseline/tags.csv``
    (Unix timestamps in the first column, no header). Exactly nine tags are
    expected: tags 0 and 1 delimit the baseline period (block 0), and tags
    2 to 8 delimit six consecutive task blocks (blocks 1-6).

    Rows later than the last tag are dropped. Rows before the first tag and
    between tags 1 and 2 are kept, with ``<NA>`` as block and rating, so they
    remain available for pre-processing.

    Parameters
    ----------
    SubjID : str
        Participant folder name, e.g. ``'Cov4'``.
    data : pandas.DataFrame
        Table with a ``timestamp`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        Copy of ``data`` with ``Block`` and ``Fatigue_Rating`` columns added,
        or ``None`` (after printing a message) when the tag file does not
        contain nine tags.
    """
    # os.path.join keeps the lookup portable instead of hard-coding '\\'.
    wd = os.path.join(os.getcwd(), SubjID, SubjID + '_baseline')
    tags = pd.read_csv(os.path.join(wd, 'tags.csv'), header=None)

    if len(tags) != 9:
        print("Incorrect number of tags. There are " + str(len(tags)) + " when there should be 9.")
        return None

    # Only the first column is used. Series.map replaces the deprecated
    # DataFrame.applymap and behaves the same on older pandas versions.
    tag_times = tags.iloc[:, 0].map(datetime.fromtimestamp)

    # Discard everything recorded after the final tag.
    remove = data[data['timestamp'] > tag_times.iloc[8]].index
    df = data.drop(remove)

    # (start tag, end tag) per block: the baseline runs from tag 0 to tag 1,
    # then each task block runs from one tag to the next, starting at tag 2.
    block_windows = [(0, 1)] + [(k, k + 1) for k in range(2, 8)]
    conditions = [
        (df['timestamp'] >= tag_times.iloc[start]) & (df['timestamp'] < tag_times.iloc[stop])
        for start, stop in block_windows
    ]
    ratings = get_ratings(SubjID)

    df['Block'] = np.select(conditions, ratings['Block'].tolist(), default=pd.NA)
    df['Fatigue_Rating'] = np.select(conditions, ratings['Rating'].tolist(), default=pd.NA)

    return df

In [39]:
# Attach block numbers and fatigue ratings to each participant's table.
# The calls run in the order the pairs are listed.
cov4_rat, cov7_rat, cov8_rat, cov13_rat, cov14_rat, cov20_rat = (
    add_blocks(subj_id, signals)
    for subj_id, signals in (
        ('Cov4', cov4),
        ('Cov7', cov7),
        ('Cov8', cov8),
        ('Cov13', cov13),
        ('Cov14', cov14),
        ('Cov20', cov20),
    )
)

In [42]:
# Output folder for the labelled, concatenated tables.
nwd = 'C:\\Users\\Owner\\Rutgers University\\Michelle Chen - Rutgers_Neuropsych_Lab\\COVID_Fatigue\\RC_award\\Data\\Concatenated_Data\\'

# Write one CSV per participant, without the row index.
for csv_name, labelled in (
    ('Cov4_lab.csv', cov4_rat),
    ('Cov7_lab.csv', cov7_rat),
    ('Cov8_lab.csv', cov8_rat),
    ('Cov13_lab.csv', cov13_rat),
    ('Cov14_lab.csv', cov14_rat),
    ('Cov20_lab.csv', cov20_rat),
):
    labelled.to_csv(nwd + csv_name, index=False)

In [56]:
# Output folder for the concatenated tables (under the 'katgm' profile).
nwd = 'C:\\Users\\katgm\\Rutgers University\\Michelle Chen - Rutgers_Neuropsych_Lab\\COVID_Fatigue\\RC_award\\Data\\Concatenated_Data\\'

# Save the merged 'CovTest' table, without the row index.
covtest_csv = nwd + 'CovTest2_lab.csv'
covtest.to_csv(covtest_csv, index=False)