## Matching Session 3

The sess3 folder contains the E4 data and the CSVs of the glasses/watch recordings.  We want to grab the data between the last TX_TIME_SEEN in the first CSV and the first TX_TIME_SEEN in the last CSV.

- For P2, the watch disconnected, so we have no final TX_TIME_SEEN.
- For P22 and P24, they did *two* transitions for session 3 instead of one.  This can be inferred from the 'second_transition' folder existing in the sess3 folder.  The second transition data is again between the last TX_TIME_SEEN in the first CSV and the first TX_TIME_SEEN in the last CSV in the 'second_transition' folder.  The duration of this second session is required to compare against their time estimate in the final survey.

To process, we need to 
- Grab the glasses and E4 data, putting them in the form we have for the rest of the sessions.
- Grab the duration of the session and the time until the participant noticed that the light had changed.

In [20]:
import os
import csv
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import math
import time
import seaborn as sns
import humanize
import re
import glob
import datetime
import pickle
from IPython.display import display, clear_output, HTML
from csv import reader
import matplotlib.dates as mdates
from pympler import asizeof
from scipy import interpolate
from ahrs.filters import EKF
from ahrs import QuaternionArray
from dateutil import parser
#%matplotlib inline
%matplotlib widget


TIMEZONE_OFFSET = 4

In [2]:
def generate_timestamps(length, start_time, sample_rate):
    '''Build evenly spaced timestamps for a fixed-rate recording.

    length:      number of samples to stamp.
    start_time:  unix time (seconds) of the first sample.
    sample_rate: samples per second.

    Returns one timestamp per sample, shifted back by TIMEZONE_OFFSET hours.
    '''
    elapsed_s = np.arange(length) / sample_rate
    first_sample = pd.to_datetime(start_time, unit='s', origin='unix')
    tz_shift = datetime.timedelta(hours=TIMEZONE_OFFSET)
    return first_sample + pd.to_timedelta(elapsed_s, unit='s') - tz_shift


def process_file(file_path, file_name):
    '''Load one E4 CSV export into a long-format DataFrame.

    file_path: path of the CSV to open.
    file_name: bare file name (e.g. 'ACC.csv'); selects the parsing branch
               and, minus its extension, becomes the 'datatype' column.

    Returns a DataFrame with columns 'timestamp', 'datatype' and 'data', where
    each 'data' cell is the list of values found on that CSV row.

    All timestamps are shifted back by TIMEZONE_OFFSET hours.  The IBI branch
    previously skipped this shift, leaving IBI rows offset from every other
    stream loaded by this function.
    '''
    with open(file_path, 'r') as file:
        # First line holds the recording start time; only the first field is used.
        start_time = float(file.readline().split(',')[0].strip())
        df = pd.DataFrame()
        if 'IBI' in file_name:
            # IBI rows carry their own offset (seconds from start) in column 0,
            # so there is no sample-rate line to consume.
            data = pd.read_csv(file, header=None, names=['time', 'ibi']).values.tolist()
            beat_times = pd.to_datetime([item[0] + start_time for item in data], unit='s', origin='unix')
            # Apply the same timezone shift generate_timestamps applies to the other streams.
            df['timestamp'] = beat_times - datetime.timedelta(hours=TIMEZONE_OFFSET)
        else:
            # Second line holds the sample rate; the remaining lines are samples.
            sample_rate = float(file.readline().split(',')[0].strip())
            data = pd.read_csv(file, header=None).values.tolist()
            df['timestamp'] = generate_timestamps(len(data), start_time, sample_rate)

            if 'ACC' in file_name:
                #accelerometer data is in 1/64 Gs.  Our glasses data is in m/s^2. Transform it to m/s^2
                data = [[(i/64)*9.8 for i in l] for l in data]
        df['datatype'] = file_name.split('.')[0]
        df['data'] = data
        return df

    
def load_session_data(folder, participant_id, session_id):
    '''Collect every recognised E4 CSV under a participant's session folder.

    Walks '<folder>/<participant_id>/<participant_id>_<session_id>' recursively,
    runs process_file on each expected file and returns all rows as one
    DataFrame indexed by (and sorted on) 'timestamp'.

    Raises ValueError (from pd.concat) when no expected file is found.
    '''
    data_dir = f'{folder}/{participant_id}/{participant_id}_{session_id}'
    wanted = ['ACC.csv', 'BVP.csv', 'EDA.csv', 'HR.csv', 'IBI.csv', 'TEMP.csv']

    # One frame per matching file, in os.walk order.
    frames = [
        process_file(f'{root}{os.sep}{name}', name)
        for root, _subdirs, names in os.walk(data_dir)
        for name in names
        if name in wanted
    ]

    combined = pd.concat(frames)
    combined = combined.set_index('timestamp')
    combined = combined.sort_index()
    return combined


def extract_datatype(df, datatype):
    '''Pull one datatype out of the long-format frame and widen its 'data' lists.

    df:       frame with a 'datatype' column and a 'data' column of lists.
    datatype: value of 'datatype' to keep (e.g. 'ACC', 'HR').

    Returns a frame with one column per list element ('x', 'y', 'z' for 'ACC',
    otherwise 'd1', 'd2', ...) that keeps the index of the selected rows.
    Raises IndexError when no row matches the datatype.
    '''
    subset = df.loc[df['datatype'] == datatype]
    payload = subset['data']

    # Width is taken from the first row's list.
    width = len(payload.iloc[0])

    if datatype == 'ACC':
        labels = ['x', 'y', 'z']
    else:
        labels = [f'd{k}' for k in range(1, width + 1)]

    wide = pd.DataFrame(payload.to_list(), columns=labels)
    wide.index = subset.index  # restore the timestamps
    return wide

def plot_df(df, title):
    '''Draw every non-tick column of df as a line over its index.

    df:    frame whose index is used as the x axis (formatted as HH:MM).
    title: text shown above the plot.
    '''
    fig, ax = plt.subplots(figsize=(15, 3))

    # Columns whose name contains 'tick' are skipped.
    for name in [c for c in df.columns if 'tick' not in c]:
        sns.lineplot(data=df, x=df.index, y=name, ax=ax, lw=1.5)

    # Time axis: automatic tick placement, hour:minute labels.
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))

    ax.grid(True, which="both", color='gray', linewidth=0.25)

    # Titles and axis labels (set after plotting so they override seaborn's defaults).
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Time', fontsize=11)
    ax.set_ylabel('Value', fontsize=11)

    # No legend is drawn.

    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)

    # Rotate/align the date labels so they do not overlap.
    fig.autofmt_xdate()

    plt.show()

In [22]:
def get_size_mb(obj):
    '''Return the size of obj as reported by asizeof, converted from bytes to megabytes.'''
    bytes_per_mb = 1024 * 1024
    return asizeof.asizeof(obj) / bytes_per_mb

def interpolate_df(df, start_timestamp, end_timestamp, new_freq):
    '''Resample every column of df onto a regular grid using cubic interpolation.

    df:              frame indexed by timestamps.
    start_timestamp: first timestamp of the new grid.
    end_timestamp:   used only to size the grid (the end itself is not included).
    new_freq:        target rate in Hz.

    Returns a new frame with int(duration_s * new_freq) rows, spaced
    1/new_freq seconds apart, holding the interpolated columns.
    '''
    span_s = (end_timestamp - start_timestamp).total_seconds()
    n_samples = int(span_s * new_freq)
    step = f'{1/new_freq*1000}ms'
    new_index = pd.date_range(start=start_timestamp, periods=n_samples, freq=step)

    resampled = pd.DataFrame(index=new_index)

    for name in df.columns:
        # Fit on the original timestamps (as floats), evaluate on the new grid.
        fit = interpolate.interp1d(df.index.values.astype(float), df[name].values, kind='cubic')
        resampled[name] = fit(new_index.values.astype(float))

    return resampled

def gather_rows(path):
    '''Read every CSV in path (sorted by name) and return the rows of the transition window.

    The first file contributes rows from its last TX_TIME_SEEN row onward
    (inclusive).  The last file contributes rows up to its first TX_TIME_SEEN
    row (inclusive).  Files in between are taken whole.  A first/last file
    without any TX_TIME_SEEN row is also taken whole.  When there is only one
    file it is treated as the first file.

    The marker is looked for in the fourth field of each row; rows with fewer
    than four fields (e.g. blank lines) can never be markers and no longer
    raise IndexError.  They are still returned unchanged.

    Prints each file path and its size as it is read.
    '''
    csv_files = sorted(os.path.join(path, name) for name in os.listdir(path) if name.endswith('.csv'))
    last_file = len(csv_files) - 1

    rows = []

    for i, filepath in enumerate(csv_files):
        filesize = os.path.getsize(filepath)/(1024*1024)
        print(f'{filepath}, File size: {filesize:.1f} MB')

        # newline='' lets the csv module do its own newline handling.
        with open(filepath, 'r', newline='') as file:
            file_rows = list(csv.reader(file))

        # Middle files are never trimmed.
        if i != 0 and i != last_file:
            rows += file_rows
            continue

        marker_rows = [
            index for index, row in enumerate(file_rows)
            if len(row) > 3 and 'TX_TIME_SEEN' in row[3]
        ]

        if not marker_rows:
            # No marker in this boundary file: keep everything.
            rows += file_rows
        elif i == 0:
            # First file: keep from the last marker onward.
            rows += file_rows[marker_rows[-1]:]
        else:
            # Last file: keep up to and including the first marker.
            rows += file_rows[:marker_rows[0] + 1]

    return rows


def load_session3_excel(folder, participant_id, session_id):
    '''Load one participant's session-3 glasses/watch CSVs into a session dict.

    Rows come from gather_rows on
    '<folder>/<participant_id>/<participant_id>_<session_id>' and are routed by
    their leading tag into logs, blink, thermal, accelerometer, gyro, UI and
    watch temperature/lux lists.  Each sensor stream is then processed inside
    its own try/except: a failure prints 'FAILED TO PROCESS DATA!!!!' and that
    key is simply left out of the result, so callers must not assume every
    key exists.

    Returns a dict that always has 'user', 'session', 'filepath',
    'session_type', 'timestamp', 'logs' and 'ui', plus whichever of 'thermal',
    'gyro', 'acc', 'quaternions', 'blinks', 'watch_temp', 'watch_lux' were
    processed successfully.
    '''

    # grab all filepaths in there, in order.
    data_dir = f'{folder}/{participant_id}/{participant_id}_{session_id}'
    rows = gather_rows(data_dir)

    logs, blinks, acc, gyro, thermal, ui, wtemp, wlux = [], [], [], [], [], [], [], []

    # Route each row on its first field; the tag itself is dropped (row[1:]).
    # Rows too short to index are reported and skipped.
    for row in rows:
        try:
            if row[0] == 'l':
                logs.append(row[1:])
            elif row[0] == 'g':
                if row[2] == 'b':
                    blinks.append(row[1:])
                elif row[2] == 't':
                    thermal.append(row[1:])
                elif row[2] == 'a':
                    acc.append(row[1:])
                elif row[2] == 'g':
                    gyro.append(row[1:])
            elif row[0] == 'u':
                ui.append(row[1:])
            elif row[0] == 'w':
                if row[3] == 'TX_TEMP_HUMD':
                    wtemp.append(row[1:])
                elif row[3] == 'TX_LUX_WHITELUX':
                    wlux.append(row[1:])
                else:
                    # any other watch message is kept with the UI events
                    ui.append(row[1:])
        except Exception as e:
            print(str(e) + ': ' + str(row))

    print('got raw data.')

    session = {
        'user': participant_id,
        'session': session_id,
        'filepath': data_dir,
        'session_type': 'Sess3FLOW',
        'timestamp': 'NA',
        'logs': logs,
        'ui': ui
    }

    print('\tthermal processing...')
    try:
        check_raw_packets(thermal, timeout_ms=4200)
        # Parse every packet first and concatenate once; concatenating inside
        # the loop re-copies the accumulated frame on every iteration.
        thermal_df = pd.concat([parseThermalRow(t) for t in thermal], axis=0)
        thermal_df.reset_index(inplace=True)
        thermal_df = reindex_to_timestamp(thermal_df)
        thermal_df['nose_tp_rolling'] = thermal_df['nose_tp'].rolling(7).mean()
        thermal_df['temple_tp_rolling'] = thermal_df['temple_tp'].rolling(2).mean()
        thermal_df['nose_thermistor_rolling'] = thermal_df['nose_thermistor'].rolling(40).mean()
        thermal_df['temple_thermistor_rolling'] = thermal_df['temple_thermistor'].rolling(40).mean()
        thermal_df['nose_temp_C'] = thermal_df.apply(lambda x: convert_nose_temp(x['nose_tp_rolling'], x['nose_thermistor_rolling']), axis=1)
        thermal_df['temple_temp_C'] = thermal_df.apply(lambda x: convert_temple_temp(x['temple_tp_rolling'], x['temple_thermistor_rolling']), axis=1)
        thermal_df['differential_C'] = thermal_df['nose_temp_C'] - thermal_df['temple_temp_C']
        thermal_df['differential_C_rolling'] = thermal_df['differential_C'].rolling(15).mean()
        session['thermal'] = thermal_df
    except Exception as e:
        print('FAILED TO PROCESS DATA!!!! ' + str(e))

    print('\tgyro processing...')
    try:
        check_raw_packets(gyro, timeout_ms=900)
        gyro_df = parse_xyz_data(gyro)
        gyro_df.reset_index(inplace=True)
        gyro_df = reindex_to_timestamp(gyro_df)
        # Keep only rows where every axis is below 10000; copy so the scaling
        # below writes to an independent frame rather than a filtered view.
        in_range = (gyro_df['x'] < 10000) & (gyro_df['y'] < 10000) & (gyro_df['z'] < 10000)
        gyro_df = gyro_df[in_range].copy()

        #convert int16s to floats with q=9.
        #this gives us rad/s (angular velocity), we expect values in the 0.3-3 range.
        #Gyro set to updates every 10ms, or 100Hz
        gyro_df['x'] = gyro_df['x'].astype(float) / 512
        gyro_df['y'] = gyro_df['y'].astype(float) / 512
        gyro_df['z'] = gyro_df['z'].astype(float) / 512
        session['gyro'] = gyro_df
    except Exception as e:
        print('FAILED TO PROCESS DATA!!!! ' + str(e))

    print('\taccelerometer processing...')
    try:
        check_raw_packets(acc, timeout_ms=1350)
        acc_df = parse_xyz_data(acc)
        acc_df.reset_index(inplace=True)
        acc_df = reindex_to_timestamp(acc_df)
        # Same range filter as the gyro stream.
        in_range = (acc_df['x'] < 10000) & (acc_df['y'] < 10000) & (acc_df['z'] < 10000)
        acc_df = acc_df[in_range].copy()

        #convert int16s to floats with q=8.
        #this gives us data in m/s^2, so we expect ~9.8 when at rest
        #accelerometer set for every 20ms, or 50Hz
        acc_df['x'] = acc_df['x'].astype(float) / 256
        acc_df['y'] = acc_df['y'].astype(float) / 256
        acc_df['z'] = acc_df['z'].astype(float) / 256
        session['acc'] = acc_df
    except Exception as e:
        print('FAILED TO PROCESS DATA!!!! ' + str(e))

    print('\tquaternion extended kalman filter application...')
    try:
        # Needs both acc_df and gyro_df; if either step above failed, the
        # resulting error is reported here like any other failure.
        start_timestamp = max(acc_df.index[0], gyro_df.index[0])
        end_timestamp = min(acc_df.index[-1], gyro_df.index[-1])

        # Put both streams on a shared 100 Hz grid over their overlapping span.
        acc_inter = interpolate_df(acc_df, start_timestamp, end_timestamp, 100)
        gyro_inter = interpolate_df(gyro_df, start_timestamp, end_timestamp, 100)

        ekf = EKF(gyr=gyro_inter[['x','y','z']].values, acc=acc_inter[['x','y','z']].values, frequency=100.0)
        qs = QuaternionArray(ekf.Q)
        qs.remove_jumps()
        qs_df = pd.DataFrame({'w': qs[:,0], 'i':qs[:,1], 'j':qs[:,2], 'k':qs[:,3]}, index=acc_inter.index)
        session['quaternions'] = qs_df
    except Exception as e:
        print('FAILED TO PROCESS DATA!!!! ' + str(e))

    print('\tblink processing...')
    try:
        check_raw_packets(blinks, timeout_ms=500)
        blinks_df = parse_blink_rows(blinks)
        blinks_df.reset_index(inplace=True)
        blinks_df = reindex_to_timestamp(blinks_df)
        session['blinks'] = blinks_df
    except Exception as e:
        print('FAILED TO PROCESS DATA!!!! ' + str(e))

    print('\twatch processing...')
    try:
        session['watch_temp'] = processWatchTemp(wtemp)
        session['watch_lux'] = processWatchLux(wlux)
    except Exception as e:
        print('FAILED TO PROCESS DATA!!!! ' + str(e))

    print(f"The size of the loaded data is {get_size_mb(session)} MB")
    return session


def processWatchTemp(watch_data):
    '''Apply simple calibration to temp data based on comparison with cheap temp/humd gauge from amazon.
    Also, process the timestamp and turn it into a df.

    watch_data: list of rows; item[0] is the timestamp, item[3] the raw
                temperature and item[4] the humidity.

    Returns a DataFrame with 'temp' and 'humidity' columns indexed by
    timezone-naive 'timestamp' (shifted back by TIMEZONE_OFFSET hours).
    With no rows, prints 'no entries' and returns an empty DataFrame.
    '''
    to_df = [
        {
            'timestamp': pd.to_datetime(item[0])-datetime.timedelta(hours=TIMEZONE_OFFSET),
            'temp': 21.2 + (6.7/5.4) * (float(item[3])-24),
            'humidity': float(item[4])
        }
        for item in watch_data
    ]

    df = pd.DataFrame(to_df)

    # Explicit empty check instead of a bare except around the column access.
    if df.empty:
        print('\t\tno entries')
        return df

    try:
        df['timestamp'] = df['timestamp'].dt.tz_localize(None)
    except AttributeError as e:
        # Column is not datetime-like; stay best-effort as before, but say
        # what actually happened instead of reporting 'no entries'.
        print('\t\tcould not normalise watch timestamps: ' + str(e))
        return df

    df.set_index('timestamp', inplace=True)
    return df

def processWatchLux(watch_data):
    '''Turn the watch lux rows into a timestamp-indexed DataFrame.

    watch_data: list of rows; item[0] is the timestamp, item[3] the lux value
                and item[4] the white-lux value.

    Returns a DataFrame with 'lux' and 'whitelux' columns indexed by
    timezone-naive 'timestamp' (shifted back by TIMEZONE_OFFSET hours).
    With no rows, prints 'no entries' and returns an empty DataFrame.
    '''
    to_df = [
        {
            'timestamp': pd.to_datetime(item[0])-datetime.timedelta(hours=TIMEZONE_OFFSET),
            'lux': float(item[3]),
            'whitelux': float(item[4])
        }
        for item in watch_data
    ]

    df = pd.DataFrame(to_df)

    # Explicit empty check instead of a bare except around the column access.
    if df.empty:
        print('\t\tno entries')
        return df

    try:
        df['timestamp'] = df['timestamp'].dt.tz_localize(None)
    except AttributeError as e:
        # Column is not datetime-like; stay best-effort as before, but say
        # what actually happened instead of reporting 'no entries'.
        print('\t\tcould not normalise watch timestamps: ' + str(e))
        return df

    df.set_index('timestamp', inplace=True)
    return df

def parse_blink_rows(blinks):
    '''Expand raw blink packets into one row per sample.

    Each packet is a list: the first 12 fields are packet metadata, field 12
    is skipped and everything from field 13 on is sample data.  Field 6
    (packetSize) gives the number of rows produced for the packet.

    Returns the per-packet frames concatenated, with the metadata columns plus
    'data', 'tick_ms' (per-sample time, back-dated 0.5 ms per sample from the
    packet's halved msFromStart) and 'saturated' (res0 as int).
    Raises ValueError (from pd.concat) when blinks is empty.
    '''
    meta_columns = ['serverTS', 'packetTypeLetter', 'packetType', 'packetNum',
                    'msFromStart', 'epoch', 'packetSize', 'res0', 'res1', 'res2','res3','res4']
    sample_rate_hz = 2000  # 2kHz
    sample_period_ms = 1000 * (1/sample_rate_hz)  # 0.5 ms

    frames = []
    for packet in blinks:
        n_samples = int(packet[6])
        header = packet[0:12]

        # Same metadata repeated on every sample row of this packet.
        frame = pd.DataFrame(data=[header] * n_samples, columns=meta_columns)
        frame["data"] = pd.Series([int(v) for v in packet[13:]], dtype=np.int32)

        # The last sample sits at the packet tick; earlier ones step back one period each.
        samples_before_last = n_samples - np.array(range(1, n_samples+1))
        packet_tick = frame['msFromStart'].astype(int) / 2
        frame["tick_ms"] = packet_tick - samples_before_last * sample_period_ms

        frame["saturated"] = frame["res0"].astype(int)
        frames.append(frame)

    return pd.concat(frames)
 
    
def parse_xyz_data(data):
    '''Expand raw accelerometer/gyro packets into one row per sample.

    Each packet is a list: the first 12 fields are packet metadata, field 12
    is skipped and the rest is read as int32 in groups of five
    (x, y, z, imu_tick_ms, tick_ms).

    Returns a DataFrame with the metadata columns repeated on every sample row
    and 'tick_ms' halved.
    '''
    column_names = ['serverTS', 'packetTypeLetter', 'packetType', 'packetNum', 'msFromStart', 'epoch', 'packetSize', 'res0', 'res1', 'res2','res3','res4', 'x','y','z','imu_tick_ms','tick_ms']

    records = []
    for packet in data:
        header = packet[0:12]
        samples = np.array(packet[13:]).astype('int32').reshape(-1,5)
        for sample in samples:
            records.append(header + list(sample))

    df = pd.DataFrame(records, columns=column_names)
    df["tick_ms"] = df["tick_ms"]/2

    return df


def parseThermalRow(t):
    '''Expand one raw thermal packet into 20 timestamped sample rows.

    t: list whose first 12 fields are packet metadata, field 12 is skipped and
       the rest is the int32 payload, split evenly into 4 inner packets.

    Within each inner packet, sample i (0-4) takes values [3i, 3i+1, 3i+2] for
    the temple and [15+3i, 16+3i, 17+3i] for the nose, and every sample shares
    the inner packet's last two values (tick, epoch).  Author's note: each row
    has 128 values, i.e. 4 x 32.

    Returns a DataFrame of the metadata columns followed by the 8 payload
    columns; 'tick_ms' is back-dated by 100 ms per sample so the last sample
    of each inner packet keeps the packet tick.  Note that 'epoch' appears
    twice in the columns (metadata and payload).
    '''
    sample_period_ms = 100
    inner_packets = 4
    samples_per_inner = 5

    meta_columns = ['serverTS', 'packetTypeLetter', 'packetType', 'packetNum', 'msFromStart', 'epoch', 'packetSize', 'res0', 'res1', 'res2','res3','res4']
    payload_columns = ["temple_tp","temple_thermistor","secondary_temple_tick_ms","nose_tp","nose_thermistor","secondary_nose_tick_ms","tick_ms","epoch"]

    n_rows = inner_packets * samples_per_inner
    meta = pd.DataFrame(data=[t[0:12]] * n_rows, columns=meta_columns)

    payload = np.array(t[13:]).astype(np.int32).reshape(inner_packets, -1)

    # De-interleave: for each sample, temple triple + nose triple + shared (tick, epoch).
    picks = []
    for i in range(samples_per_inner):
        picks.extend(range(i*3, i*3+3))
        picks.extend(range(15+i*3, 18+i*3))
        picks.extend([-2, -1])
    samples = payload[:, picks].reshape(-1, 8)

    df_row = pd.concat([meta, pd.DataFrame(data=samples, columns=payload_columns)], axis=1)

    # Samples within an inner packet are 4,3,2,1,0 periods older than its tick.
    df_row['tick_ms'] -= sample_period_ms * np.array([4,3,2,1,0] * inner_packets)

    return df_row

def convert_thermistor_to_K(raw_value):
    '''Convert a raw thermistor ADC reading to a temperature in Kelvin.

    The reading is first turned into a thermistor resistance using a 100k
    divider and a 4095 full-scale ADC value, then into temperature with the
    Beta equation (R_0 = 100k at T_0 = 298.15 K, Beta = 3960).
    '''
    # Divider / ADC constants
    R_divider = 100000
    ADC_max = 4095

    # Beta-model constants
    R_0 = 100000
    T_0 = 298.15
    Beta = 3960

    # STEP 1. ADC reading -> thermistor resistance
    scaled_reading = R_divider * raw_value/ADC_max
    remaining_fraction = 1 - raw_value/ADC_max
    R_t = scaled_reading / remaining_fraction

    # STEP 2. resistance -> temperature
    R_inf = R_0 * math.exp(-Beta/T_0)
    return Beta / np.log(R_t / R_inf)
    
def convert_thermopile_to_C(raw_val, T_ref_K, A, a_0, a_1):
    '''Convert a raw thermopile ADC reading to an object temperature in Celsius.

    raw_val:     ADC reading (4095 full scale, 3.3 V reference).
    T_ref_K:     reference temperature in Kelvin.
    A, a_0, a_1: calibration coefficients of the thermopile model.
    '''
    #STEP 1. ADC reading -> thermopile voltage (undo preamp bias, scale by 1000)
    adc_volts = (raw_val / 4095) * 3.3
    bias_volts = 1.15+0.6084
    V_tp = (adc_volts - bias_volts)/1000.0

    #STEP 2. voltage -> object temperature
    shifted = V_tp - a_0
    f_V_tp = shifted + a_1 * shifted**2
    T_obj = (T_ref_K**4 + f_V_tp/A)**0.25

    # Kelvin -> Celsius
    return (T_obj - 273.15)
    
def convert_nose_temp(raw_val, raw_thermistor):
    '''Convert raw nose thermopile + thermistor readings to degrees Celsius.

    Uses the nose sensor's calibration coefficients.  Returns np.nan when the
    conversion raises (e.g. a thermistor reading that causes a division by
    zero) instead of propagating the error.
    '''
    A   =  7.8e-10
    a_0 = -2.31e-01
    a_1 =  3.61e-03
    try:
        return convert_thermopile_to_C(raw_val,
                                  convert_thermistor_to_K(raw_thermistor),
                                  A, a_0, a_1)
    except Exception:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit still get through when this runs row by row.
        return np.nan
    
def convert_temple_temp(raw_val, raw_thermistor):
    '''Convert raw temple thermopile + thermistor readings to degrees Celsius.

    Uses the temple sensor's calibration coefficients.  Returns np.nan when
    the conversion raises (e.g. a thermistor reading that causes a division by
    zero) instead of propagating the error.
    '''
    A   =  4.21e-10
    a_0 = -3.62e-01
    a_1 =  8.31e-02
    try:
        return convert_thermopile_to_C(raw_val,
                                  convert_thermistor_to_K(raw_thermistor),
                                  A, a_0, a_1)
    except Exception:
        # `except Exception` (not a bare except) so KeyboardInterrupt and
        # SystemExit still get through when this runs row by row.
        return np.nan
    

def check_raw_packets(data, timeout_ms=500):
    '''Print a diagnostic line for every irregularity between consecutive packets.

    data:       list of raw packets; field 0 is the server timestamp,
                field 3 the packet number and field 4 the packet tick.
    timeout_ms: largest tick increase between consecutive packets that is
                not reported.

    Checks, for each packet against the one before it: server timestamp not
    going backwards, packet number increasing by exactly one (a packet number
    of 0 is accepted), tick strictly increasing, and tick increase not above
    timeout_ms.  Nothing is returned and nothing is modified.

    Raises IndexError when data is empty; callers rely on this to abort
    processing of a stream that has no packets.
    '''
    last_seen = pd.to_datetime(data[0][0]) #0 = serverTimestamp
    last_tick = int(data[0][4]) #4 = packet_tick
    last_packet = int(data[0][3]) #3 = packetNum

    # start=1 so the reported index is the packet's position in `data`.
    for i, d in enumerate(data[1:], start=1):

        current_seen = pd.to_datetime(d[0])
        current_packet = int(d[3])
        current_tick = int(d[4])

        # Check if serverTimestamp is proceeding in order, which it should.
        if current_seen < last_seen:
            print(f"\t\t-- at index {i}: Timestamp out of order. Current: {current_seen}, Previous: {last_seen}")

        # Make sure packetNum is one larger than previous.
        if current_packet != last_packet + 1 and current_packet != 0:
            print(f"\t\tat index {i}: packetNum not sequentially increasing. Current: {current_packet}, Previous: {last_packet}")

        # Make sure tick has increased from last_tick.
        if current_tick <= last_tick:
            print(f"\t\tat index {i}: packet_tick not increasing. Current: {current_tick}, Previous: {last_tick}")

        # Check that the tick did not jump by more than timeout_ms.
        if current_tick - last_tick > timeout_ms:
            print(f"\t\tat index {i}: tick increment greater than timeout. Difference: {current_tick - last_tick}ms")

        last_seen = current_seen
        last_tick = current_tick
        last_packet = current_packet

        
        
def check_time_deltas(data):
    '''Debug helper: print packet-to-packet differences for the first 200 raw packets.

    For each packet after the first, prints the change in packet number, the
    elapsed server time (humanized) and the change in tick divided by 1000,
    all relative to the preceding packet.
    '''
    def read(packet):
        # (server timestamp, packet number, tick) of one raw packet
        return pd.to_datetime(packet[0]), int(packet[3]), int(packet[4])

    prev_server, prev_packet, prev_tick = read(data[0])

    for packet in data[1:200]:
        server, number, tick = read(packet)

        print(f'Differences: PACKET:{number-prev_packet}\tSERVER:{humanize.precisedelta(server-prev_server)}\tTICK:{(tick-prev_tick)/1000}')

        prev_server, prev_packet, prev_tick = server, number, tick
        

def reindex_to_timestamp(df, column='tick_ms'):
    '''Re-index packet samples onto wall-clock time derived from the device tick.

    Assumes a serverTS and tick_ms.  Grabs the first serverTS, assumes it marks
    the tick_ms of the last row in the first packet, and then uses the tick_ms
    as the gold standard for all other timestamps.

    df:     frame with a 0-based integer index (row 0 is looked up by label)
            and the packet metadata columns, including the 'index' column
            left behind by reset_index; all of these are dropped at the end.
    column: name of the per-sample tick column, in milliseconds.

    Returns a copy indexed by 'timestamp' (shifted back by TIMEZONE_OFFSET
    hours) with duplicate timestamps removed, samples more than 10 s outside
    the first/last server timestamps removed, and the metadata columns
    dropped.  Prints how many rows were dropped plus duration/drift figures.
    '''

    df = df.copy()

    #make sure column is a datetime type
    df[column] = pd.to_datetime(df[column], unit='ms')

    # Set the first serverTS as the base timestamp
    base_timestamp = pd.to_datetime(df.loc[0, 'serverTS'])

    # Find the tick_ms of the last entry in the first packet by looking at the change in packetNum
    first_packet_num = df.loc[0, 'packetNum']
    zero_time_index = np.where(df['packetNum'].shift(-1) != first_packet_num)[0]

    if len(zero_time_index) > 0:
        zero_time = df.loc[zero_time_index[0], column]
    else:
        zero_time = df.loc[0, column]

    # Turn the tick column into an offset from zero_time.  Assign the column
    # outright (not through df.loc[:, column]) because the dtype changes from
    # datetime64 to timedelta64 and an in-place .loc write cannot hold that.
    df[column] = df[column] - zero_time

    # Create new timestamp and set as index
    df['timestamp'] = base_timestamp + df[column]

    # Drop duplicates
    l = len(df)
    df.drop_duplicates(subset='timestamp', keep='first', inplace=True)
    print(f'\t\tdropped {l-len(df)} rows of data due to duplication.')

    # Drop data that's not within 10 sec of first and last server timestamp
    l = len(df)
    cutoff = pd.to_datetime(df.iloc[0]['serverTS']) - datetime.timedelta(seconds=10)
    df = df[df['timestamp'] > cutoff]
    cutoff = pd.to_datetime(df.iloc[-1]['serverTS']) + datetime.timedelta(seconds=10)
    # copy so the in-place edits below act on an independent frame
    df = df[df['timestamp'] < cutoff].copy()
    print(f'\t\tdropped {l-len(df)} rows of data due to weird timestamps.')

    df['timestamp'] -= datetime.timedelta(hours=TIMEZONE_OFFSET)

    df.set_index('timestamp', inplace=True)

    # Tick-derived time may drift from the server clock; report the gap at the
    # final sample.  The index is already timezone-shifted, so undo the shift
    # on the server side of the comparison.
    duration = df.index[-1] - df.index[0]
    server_vs_tick = pd.to_datetime(df['serverTS'].iloc[-1]) - df.index[-1] - datetime.timedelta(hours=TIMEZONE_OFFSET)

    print('\t\t-----------------------------')
    print('\t\tDuration: ' + humanize.precisedelta(duration))
    print('\t\tError (server timestamp vs tick): ' + humanize.precisedelta(server_vs_tick))
    print('\t\t% drift: ' + str(100.0*server_vs_tick/duration))

    df.drop([column, 'serverTS', 'msFromStart','index','packetTypeLetter','packetType','packetNum','epoch','packetSize','res0','res1','res2','res3','res4'], axis=1, inplace=True)

    return df

In [23]:
def update_sessions_with_E4_data(session_data, folder, user, session):
    '''Add the participant's E4 signals to session_data.

    Loads the E4 data for 'P<user>' / 'sess<session>' and stores one frame per
    signal under the keys 'E4_ACC', 'E4_IBI', 'E4_TEMP', 'E4_EDA', 'E4_BVP'
    and 'E4_HR'.  session_data is modified in place and also returned.
    '''
    e4_frame = load_session_data(folder, f'P{user}', f'sess{session}')

    # one entry per E4 signal, in this order
    for signal in ('ACC', 'IBI', 'TEMP', 'EDA', 'BVP', 'HR'):
        session_data[f'E4_{signal}'] = extract_datatype(e4_frame, signal)

    return session_data

def process_ui_data(session_data):
    '''Turn the session's UI rows into a DataFrame of key event timestamps.

    Reads session_data['ui']; row[0] is the timestamp and row[2] the event
    name.  START_TRANSITION and FINISHED_TRANSITION are kept under their own
    name, TX_TIME_SEEN is recorded as WATCH_PRESS, and any other row is
    printed and skipped.

    Returns a DataFrame with a single 'event' column indexed by
    timezone-naive 'timestamp' (shifted back by TIMEZONE_OFFSET hours).
    '''
    transition_events = ('START_TRANSITION', 'FINISHED_TRANSITION')

    events = []
    for row in session_data['ui']:
        tag = row[2]
        if tag in transition_events:
            label = tag
        elif tag == 'TX_TIME_SEEN':
            label = 'WATCH_PRESS'
        else:
            print(f'GOT unknown survey: {row}')
            continue
        events.append([pd.to_datetime(row[0])-datetime.timedelta(hours=TIMEZONE_OFFSET), label])

    df_times = pd.DataFrame(events, columns=['timestamp', 'event'])
    # Normalise to UTC, then strip the timezone so the index is naive.
    as_utc = pd.to_datetime(df_times['timestamp'], utc=True)
    df_times['timestamp'] = as_utc.dt.tz_localize(None)
    return df_times.set_index('timestamp')

In [28]:
users_to_process = []
#Done [5,21,20,9,16,17,18,15,19,24,22,23,12,2,3]; 13 below due to no E4 data

folder = '/Volumes/Secondary/PhDStudy_Results'

# Per user: load the session-3 glasses/watch CSVs, attach the E4 signals,
# extract the event timings, then pickle the whole session dict.
# A failure for one user is reported and the loop moves on to the next.
for user in users_to_process:
    try:
        session_data = load_session3_excel(folder, f'P{user}', 'sess3')
        print(f'Successfully processed P{user} sess3 csvs.')

        session_data = update_sessions_with_E4_data(session_data, folder, user, 3)
        print(f'Successfully processed P{user} sess3 E4 data.')

        # the raw UI rows are replaced by the derived timings frame
        session_data['timings'] = process_ui_data(session_data)
        del session_data['ui']
        print(f'Successfully processed P{user} sess3 timings.')

        #save to a pickle
        with open(f'{folder}/P{user}_sess3_timeseries.pickle', 'wb') as handle:
            pickle.dump(session_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(f'Successfully saved P{user} session #3 data.')

    except Exception as err:
        print(f'ERROR WITH USER {user}: {err}')

/Volumes/Secondary/PhDStudy_Results/P21/P21_sess3/recording_042223_163025.csv, File size: 12.1 MB
/Volumes/Secondary/PhDStudy_Results/P21/P21_sess3/recording_042223_164800.csv, File size: 8.7 MB
got raw data.
	thermal processing...
		dropped 0 rows of data due to duplication.
		dropped 0 rows of data due to weird timestamps.
		-----------------------------
		Duration: 25 minutes and 13.90 seconds
		Error (server timestamp vs tick): 0.11 seconds
		% drift: -0.007464165400620913
	gyro processing...
		at index 63: packetNum not sequentially increasing. Current: 56, Previous: 54
		at index 63: tick increment greater than timeout. Difference: 1206ms
		at index 476: packetNum not sequentially increasing. Current: 214, Previous: 212
		at index 476: tick increment greater than timeout. Difference: 1164ms
		at index 1620: packetNum not sequentially increasing. Current: 79, Previous: 77
		at index 1620: tick increment greater than timeout. Difference: 1308ms
		at index 1751: packetNum not sequen

In [29]:
#No E4 data version
users_to_process = [13]

folder = '/Volumes/Secondary/PhDStudy_Results'

# Same export as the cell above, minus the E4 step: load the session-3
# glasses/watch CSVs, extract the event timings, then pickle the session dict.
# A failure for one user is reported and the loop moves on to the next.
for user in users_to_process:
    try:
        session_data = load_session3_excel(folder, f'P{user}', 'sess3')
        print(f'Successfully processed P{user} sess3 csvs.')

        # the raw UI rows are replaced by the derived timings frame
        session_data['timings'] = process_ui_data(session_data)
        del session_data['ui']
        print(f'Successfully processed P{user} sess3 timings.')

        #save to a pickle
        with open(f'{folder}/P{user}_sess3_timeseries.pickle', 'wb') as handle:
            pickle.dump(session_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(f'Successfully saved P{user} session #3 data.')

    except Exception as err:
        print(f'ERROR WITH USER {user}: {err}')

/Volumes/Secondary/PhDStudy_Results/P13/P13_sess3/recording_052023_233713.csv, File size: 10.1 MB
/Volumes/Secondary/PhDStudy_Results/P13/P13_sess3/recording_052023_235526.csv, File size: 4.6 MB
got raw data.
	thermal processing...
		at index 121: packetNum not sequentially increasing. Current: 125, Previous: 122
		at index 121: tick increment greater than timeout. Difference: 12013ms
		at index 123: packetNum not sequentially increasing. Current: 129, Previous: 126
		at index 123: tick increment greater than timeout. Difference: 11999ms
		at index 140: packetNum not sequentially increasing. Current: 147, Previous: 145
		at index 140: tick increment greater than timeout. Difference: 8005ms
		at index 142: packetNum not sequentially increasing. Current: 150, Previous: 148
		at index 142: tick increment greater than timeout. Difference: 7987ms
		at index 148: packetNum not sequentially increasing. Current: 157, Previous: 155
		at index 148: tick increment greater than timeout. Difference