In [1]:
%load_ext autoreload
%autoreload 2
from data_prep_helper import *

### Pipeline Steps

1- Import Raw EDA data, Angelia Log Data, and EDA Marker data

- Identify time of the first and last event marker in the EDA data, and the time of the "starting"/"beginning" and "ending"/"finishing" phrases in the Angelia logs to ascertain the true start and end times. Clip the data to fit these

2- Preprocess EDA Data
- Upsample to 8 Hz
- Apply a first order Butterworth lowpass filter, with a cutoff frequency of 0.4 Hz, similar to Di Lascio et al. [15], to remove noisy high frequency fluctuations, which could be attributed to artifacts in the data [42] ("Lateralization Paper")
- Decompose the signal into the phasic and tonic components [8],
using the cvxEDA method proposed by Greco et al. [30] ("Lateralization Paper")
- May optionally apply any of the following: min-max normalization, robust scaling with the interquartile range (IQR), log-transformation, Yeo-Johnson power transformation [97], and quantile transformation.


3- Artifact analysis on the EDA Data
- On the filtered “mixed-EDA” signal using the ML-based open source tool developed by Gashi et al. [25]. Given 5-second non-overlapping windows of the EDA signal, over which some pre-processing is performed, the method applies an XGBoost machine learning classifier [10] to detect the presence of artifacts in
each window.
- Exclude data if more than roughly 30% (exact threshold still to be decided) of the samples are artifacts

4- Segmentation and Feature Extraction
- Segment the signals into non-overlapping windows (say, 4 seconds each) and perform feature extraction
- Extract features for each EDA component, i.e., “tonic”, “phasic” and “mixed-EDA”,similarly to existing literature [3, 15, 98]. (Lateralization Paper)
    - Time-domain features: Mean, minimum, maximum, standard deviation, dynamic range, slope, and its absolute value, mean, and standard deviation of the first derivative, the number of EDA peaks in a window and their amplitude.
    - FFT: Perform a Fast Fourier Transform (FFT) on the EDA signals to obtain the Power Spectral Density (PSD), then extract the Direct Current (DC) term, sum of frequency coefficients, information entropy, and spectral energy

6- Explore potential normalizations/smoothing by making use of the other recorded data : temperature, BPM, BVP, Accelerometer


Angelia Data Pipeline:

1- Segmentation and Feature Extraction
- 4 second windows overlapping with the EDA Data
- Features: Number of character changes,  number of new characters, number of deleted characters, number of new/deleted words (characters separated by comma, =, space), Flag for file switches
    - Flag for whether all the aforementioned features are higher than the preceding period, the succeeding period, the average of the 10 preceding periods, the average of the 10 succeeding periods
    - Flag for gradescope submission
    

##TO DO:
 - May apply z-score standardization to the data before decomposition, due to inter-subject variability
 

Read In Datasets

In [2]:
# Load the Empatica recordings and the keylogger logs, once per assignment.
# NOTE(review): the `a3` flag presumably selects assignment 3 (True) vs. assignment 4
# (False), matching the `_a3` / `_a4` variable suffixes — confirm in data_prep_helper.
empatica_data_a3 = get_empatica_data(a3=True)
empatica_data_a4 = get_empatica_data(a3=False)
keylog_data_a3 = get_keylog_data(a3=True)
keylog_data_a4 = get_keylog_data(a3=False)

### Ascertain start times

First filter out log data from before the empatica is turned on

In [3]:
# Clip both data sources for each assignment. The helper returns an
# (empatica, keylog) pair, which overwrites the unclipped variables.
# NOTE(review): re-running this cell feeds already-clipped data back into the
# helper — presumably harmless, but confirm clip_for_start_end_times is idempotent.
empatica_data_a3, keylog_data_a3 = clip_for_start_end_times(empatica_data_a3, keylog_data_a3, a3=True)
empatica_data_a4, keylog_data_a4 = clip_for_start_end_times(empatica_data_a4, keylog_data_a4, a3=False)

In [4]:
# Sanity check after clipping: the helper prints each participant id followed by
# four timestamps (see the output below) so they can be compared by eye.
# NOTE(review): presumably these are the Empatica and keylog start times followed
# by the two end times — confirm the order in print_start_end_times.
print_start_end_times(empatica_data_a3, keylog_data_a3, a3=True)
print_start_end_times(empatica_data_a4, keylog_data_a4, a3=False)

P3_1
2024-03-13 20:35:40
2024-03-13 20:35:39.967000
2024-03-13 21:35:52.750000
2024-03-13 21:35:52.846000
P3_2
2024-03-14 01:14:05.750000
2024-03-14 01:14:05.511000
2024-03-14 01:44:59
2024-03-14 01:44:59.135000
P5
2024-02-28 18:35:11.250000
2024-02-28 18:35:11.051000
2024-02-28 22:50:41
2024-02-28 22:50:41.133000
P8_1
2024-03-08 15:31:02
2024-03-08 15:31:01.832000
2024-03-08 16:28:42.250000
2024-03-08 16:28:42.260000
P8_2
2024-03-09 02:46:55
2024-03-09 02:46:54.976000
2024-03-09 03:17:30.500000
2024-03-09 03:17:30.691000
P9_1
2024-03-06 03:24:18.250000
2024-03-06 03:24:18.203000
2024-03-06 04:33:05.500000
2024-03-06 04:33:05.716000
P9_2
2024-03-06 13:41:30.750000
2024-03-06 13:41:30.639000
2024-03-06 14:16:49.750000
2024-03-06 14:16:49.777000
P11
2024-03-13 01:22:30.500000
2024-03-13 01:22:30.300000
2024-03-13 03:21:59.500000
2024-03-13 03:21:59.589000
P12
2024-03-07 03:14:35.250000
2024-03-07 03:14:35.242000
2024-03-07 04:43:34
2024-03-07 04:43:34.110000
P13
2024-03-13 02:11:39.50000

### Process EDA Data: Get peaks etc
- This will emit a warning saying that filtering is skipped. This is okay: our data is sampled at 4 Hz, and NeuroKit only applies its Butterworth filter when the signal's sampling rate is above 7 Hz

In [5]:
# Run the EDA processing helper over both assignments; the result replaces the input.
# Afterwards each participant's "EDA" frame carries the columns shown in the next
# cell's output (EDA_Raw, EDA_Clean, EDA_Tonic, EDA_Phasic, SCR_*, Time, ...).
empatica_data_a3 = process_eda_signal(empatica_data_a3, a3=True)
empatica_data_a4 = process_eda_signal(empatica_data_a4, a3=False)

  warn(
  warn(
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  warn(
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  warn(
  warn(
  empatica_data[p]["EDA"].fillna(0, inplace=True)
  warn(
  empati

In [6]:
# Peek at the first rows of one participant's processed EDA frame.
empatica_data_a3['P11']["EDA"].head()

Unnamed: 0,EDA_Raw,EDA_Clean,EDA_Tonic,EDA_Phasic,SCR_Onsets,SCR_Peaks,SCR_Height,SCR_Amplitude,SCR_RiseTime,SCR_Recovery,SCR_RecoveryTime,Time_s,Time,EDA_Tonic_RollingAverage
0,0.078128,0.078128,0.077955,0.000173,0,0,0.0,0.0,0.0,0,0.0,0.2,2024-03-13 20:35:40,0.077955
1,0.076847,0.076847,0.077938,-0.001091,0,0,0.0,0.0,0.0,0,0.0,0.45,2024-03-13 20:35:40.250000,0.077946
2,0.076847,0.076847,0.07792,-0.001073,0,0,0.0,0.0,0.0,0,0.0,0.7,2024-03-13 20:35:40.500000,0.077937
3,0.078128,0.078128,0.077901,0.000227,0,0,0.0,0.0,0.0,0,0.0,0.95,2024-03-13 20:35:40.750000,0.077928
4,0.079409,0.079409,0.077881,0.001528,0,0,0.0,0.0,0.0,0,0.0,1.2,2024-03-13 20:35:41,0.077919


### Segmentation : 4-second non-overlapping windows
Issue: many time windows don't have any log activity

In [7]:
# Segment both data sources into windows; the helper returns the updated
# (empatica, keylog) pair. Afterwards the "EDA" frame has a `Window` column
# (see the P11 output further down).
empatica_data_a3, keylog_data_a3 = create_windowed_data(empatica_data_a3, keylog_data_a3, a3=True)
empatica_data_a4, keylog_data_a4 = create_windowed_data(empatica_data_a4, keylog_data_a4, a3=False)

Manual check

In [8]:
# SCR-related columns of the processed "EDA" frame, used for manual inspection.
columns_to_check = [
    f"SCR_{suffix}"
    for suffix in (
        "Onsets", "Peaks", "Height", "Amplitude",
        "RiseTime", "Recovery", "RecoveryTime",
    )
]
# To list only the rows where at least one SCR column is non-zero, uncomment:
# empatica_data_a3['P11']['EDA'][(empatica_data_a3['P11']['EDA'][columns_to_check] != 0).any(axis=1)]

In [9]:
# Inspect P11's EDA frame after windowing (note the added `Window` column).
empatica_data_a3['P11']["EDA"]

Unnamed: 0,EDA_Raw,EDA_Clean,EDA_Tonic,EDA_Phasic,SCR_Onsets,SCR_Peaks,SCR_Height,SCR_Amplitude,SCR_RiseTime,SCR_Recovery,SCR_RecoveryTime,Time_s,Time,EDA_Tonic_RollingAverage,Window
0,0.078128,0.078128,0.077955,0.000173,0,0,0.0,0.0,0.0,0,0.0,0.20,2024-03-13 20:35:40,0.077955,0
1,0.076847,0.076847,0.077938,-0.001091,0,0,0.0,0.0,0.0,0,0.0,0.45,2024-03-13 20:35:40.250000,0.077946,0
2,0.076847,0.076847,0.077920,-0.001073,0,0,0.0,0.0,0.0,0,0.0,0.70,2024-03-13 20:35:40.500000,0.077937,0
3,0.078128,0.078128,0.077901,0.000227,0,0,0.0,0.0,0.0,0,0.0,0.95,2024-03-13 20:35:40.750000,0.077928,0
4,0.079409,0.079409,0.077881,0.001528,0,0,0.0,0.0,0.0,0,0.0,1.20,2024-03-13 20:35:41,0.077919,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28667,0.188276,0.188276,0.189260,-0.000657,0,0,0.0,0.0,0.0,0,0.0,7166.95,0,0.133268,1791
28668,0.188276,0.188276,0.189228,-0.000601,0,0,0.0,0.0,0.0,0,0.0,7167.20,0,0.133270,1791
28669,0.189557,0.189557,0.189198,0.000736,0,0,0.0,0.0,0.0,0,0.0,7167.45,0,0.133272,1791
28670,0.188276,0.188276,0.189170,-0.000492,0,0,0.0,0.0,0.0,0,0.0,7167.70,0,0.133274,1791


## Compute useful features for each row

### Get EDA Time domain features for each window

In [10]:
# Aggregate the sample-level EDA data into per-window features for both assignments.
# NOTE(review): the result is presumably stored under the 'EDA_windowed' key that
# later cells read — confirm in create_aggregated_eda_window_features.
empatica_data_a3 = create_aggregated_eda_window_features(empatica_data_a3, a3=True)
empatica_data_a4 = create_aggregated_eda_window_features(empatica_data_a4, a3=False)

In [11]:
# Add the extra per-window features for both assignments.
# The helper prints one progress line per participant and, for some participants,
# emits runtime warnings from the PSD entropy computation
# (`nansum(norm_psd * log(norm_psd))`), as seen in the output below.
empatica_data_a3 = create_extra_aggregated_eda_window_features(empatica_data_a3, a3=True)
empatica_data_a4 = create_extra_aggregated_eda_window_features(empatica_data_a4, a3=False)

Now processing participant:  P3_1
Now processing participant:  P3_2


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)


Now processing participant:  P5
Now processing participant:  P8_1
Now processing participant:  P8_2
Now processing participant:  P9_1


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)


Now processing participant:  P9_2
Now processing participant:  P11


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)
  norm_psd = psd / nansum(psd)


Now processing participant:  P12
Now processing participant:  P13
Now processing participant:  P15
Now processing participant:  P19


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)


Now processing participant:  P20
Now processing participant:  P21


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)


Now processing participant:  P24
Now processing participant:  P25
Now processing participant:  P26
Now processing participant:  P28
Now processing participant:  P29
Now processing participant:  P30
Now processing participant:  P34_1
Now processing participant:  P34_2
Now processing participant:  P42
Now processing participant:  P43_1
Now processing participant:  P43_2
Now processing participant:  P46
Now processing participant:  P47
Now processing participant:  P9
Now processing participant:  P11
Now processing participant:  P12
Now processing participant:  P15
Now processing participant:  P16
Now processing participant:  P19


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)


Now processing participant:  P20


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)


Now processing participant:  P26
Now processing participant:  P29
Now processing participant:  P30
Now processing participant:  P34_1
Now processing participant:  P34_2
Now processing participant:  P36
Now processing participant:  P42
Now processing participant:  P43
Now processing participant:  P46
Now processing participant:  P50


  return -nansum(norm_psd * log(norm_psd), axis=0)
  return -nansum(norm_psd * log(norm_psd), axis=0)


Now processing participant:  P52
Now processing participant:  P53


# Create windowed data for logs


In [12]:
# Keep a copy of the keylog data before the next cell overwrites it with windowed features.
# NOTE(review): if keylog_data_a3 / keylog_data_a4 are plain dicts, .copy() is shallow —
# the per-participant objects inside are shared with the originals. Use
# copy.deepcopy if the helper below mutates them in place; confirm.
keylog_data_a3_copy = keylog_data_a3.copy()
keylog_data_a4_copy = keylog_data_a4.copy()

In [13]:
# Replace the raw keylog data with per-window keylogger features. The Empatica data
# is passed in as well — presumably so the windows line up with the EDA windows; confirm.
# Each participant's result has the columns shown in the output below:
# Window, sum_text_length, count_text_length_flag, max_time_diff, earliest_time_diff, activity.
keylog_data_a3 = create_windowed_keylogger_features(keylog_data_a3, empatica_data_a3, a3=True)
keylog_data_a4 = create_windowed_keylogger_features(keylog_data_a4, empatica_data_a4, a3=False)

In [15]:
# Inspect the windowed keylogger features for P11.
keylog_data_a3['P11']

Unnamed: 0,Window,sum_text_length,count_text_length_flag,max_time_diff,earliest_time_diff,activity
0,0,0.294984,-0.130668,-0.044522,-0.027878,1
1,1,-0.064493,-0.130668,0.024936,0.403523,1
2,2,-0.064493,-0.130668,-0.053821,-0.040330,0
3,3,-0.064493,-0.130668,-0.053821,-0.040330,0
4,4,-0.064493,-0.130668,-0.053821,-0.040330,0
...,...,...,...,...,...,...
1787,1787,-0.064493,-0.130668,-0.053821,-0.040330,0
1788,1788,-0.064493,-0.130668,-0.053821,-0.040330,0
1789,1789,-0.064493,-0.130668,-0.053821,-0.040330,0
1790,1790,-0.064493,-0.130668,-0.053821,-0.040330,0


Save the processed data as pickle files

In [14]:
# Persist the four processed dictionaries so later sessions can reload them
# instead of recomputing. Written in the same order as before: a3 data, a3 logs,
# a4 data, a4 logs.
pickle_targets = (
    ('empatica_data_a3.pkl', empatica_data_a3),
    ('log_data_a3.pkl', keylog_data_a3),
    ('empatica_data_a4.pkl', empatica_data_a4),
    ('log_data_a4.pkl', keylog_data_a4),
)

for pickle_name, payload in pickle_targets:
    with open(pickle_name, 'wb') as handle:
        pickle.dump(payload, handle)

Read the pickle files back

In [10]:
# Reload the four dictionaries written by the save cell above.
# The keylog file names now match what that cell writes ('log_data_a3.pkl' /
# 'log_data_a4.pkl'); the previous 'log_data_windows_*.pkl' names are not produced
# anywhere in this notebook, so a fresh run would fail or load stale data.
# Security note: pickle.load can execute arbitrary code embedded in the file —
# only load pickles that this notebook itself produced.
with open('empatica_data_a3.pkl', 'rb') as f:
    empatica_data_a3 = pickle.load(f)

with open('log_data_a3.pkl', 'rb') as f:
    keylog_data_a3 = pickle.load(f)

with open('empatica_data_a4.pkl', 'rb') as f:
    empatica_data_a4 = pickle.load(f)

with open('log_data_a4.pkl', 'rb') as f:
    keylog_data_a4 = pickle.load(f)

## Save the datasets for use by the Model

- To do : put these into functions if needed

In [12]:
# Print the (rows, columns) shape of every participant's windowed feature frame,
# all a3 participants first, then all a4 participants.
for participants, dataset in (
    (p_list_a3, empatica_data_a3),
    (p_list_a4, empatica_data_a4),
):
    for p in participants:
        print(dataset[p]['EDA_windowed'].shape)

(903, 47)
(463, 47)
(3832, 47)
(865, 47)
(458, 47)
(1031, 47)
(529, 47)
(1792, 47)
(1334, 47)
(1514, 47)
(1333, 47)
(1589, 47)
(1391, 47)
(1447, 47)
(1515, 47)
(1076, 47)
(1445, 47)
(1350, 47)
(1514, 47)
(1385, 47)
(672, 47)
(789, 47)
(1473, 47)
(1704, 47)
(2640, 47)
(868, 47)
(1908, 47)
(1565, 47)
(1160, 47)
(1362, 47)
(1352, 47)
(1355, 47)
(1326, 47)
(1357, 47)
(1377, 47)
(1360, 47)
(1487, 47)
(834, 47)
(603, 47)
(1290, 47)
(1379, 47)
(1877, 47)
(1370, 47)
(1944, 47)
(1388, 47)
(1485, 47)


### 1. Participant by participant datasets
- Save a folder for each participant, store the a3 and a4 datasets in each

In [None]:
def save_participant_datasets(participants, empatica_data, keylog_data, assignment,
                              root="EDAModelDatasets"):
    """Write one x.csv / y.csv pair per participant for a single assignment.

    Session ids such as "P3_1" and "P3_2" belong to the same participant ("P3").
    Their windows are stacked in the order the sessions appear in `participants`
    and saved as a single file under `<root>/<participant>/<assignment>/`.

    The files are always rebuilt from the in-memory data and overwritten, so
    re-running the cell yields the same files. Previously an existing x.csv was
    appended to, which duplicated rows on every re-run and merged sessions via a
    CSV round-trip that concatenated a DataFrame with a Series.

    Parameters
    ----------
    participants : iterable of str
        Session ids to export (keys of `empatica_data` and `keylog_data`).
    empatica_data : mapping
        Per-session data; `empatica_data[id]['EDA_windowed']` is the feature frame.
    keylog_data : mapping
        Per-session keylog features; `keylog_data[id]['activity']` is the label column.
    assignment : str
        Sub-folder name for the assignment, e.g. "a3" or "a4".
    root : str, optional
        Top-level output folder.
    """
    # Group the sessions by base participant id; dicts keep first-seen order.
    features_by_participant = {}
    labels_by_participant = {}
    for session_id in participants:
        # "P3_1" -> "P3"; ids without an underscore are left unchanged.
        participant = session_id.split('_')[0]
        features = empatica_data[session_id]['EDA_windowed'].drop(columns=['Window'])
        labels = keylog_data[session_id]['activity']
        features_by_participant.setdefault(participant, []).append(features)
        labels_by_participant.setdefault(participant, []).append(labels)

    # Write each participant's stacked sessions exactly once.
    for participant, feature_frames in features_by_participant.items():
        folder_path = f"{root}/{participant}/{assignment}"
        os.makedirs(folder_path, exist_ok=True)

        X = pd.concat(feature_frames, axis=0).reset_index(drop=True)
        y = pd.concat(labels_by_participant[participant], axis=0).reset_index(drop=True)

        X.to_csv(f"{folder_path}/x.csv", index=False)
        y.to_csv(f"{folder_path}/y.csv", index=False)


save_participant_datasets(p_list_a3, empatica_data_a3, keylog_data_a3, "a3")
save_participant_datasets(p_list_a4, empatica_data_a4, keylog_data_a4, "a4")

Combine all participants for a3 and a4 and save them as two files

A3

In [None]:
# --- Assignment 3: one feature frame and one label series per session, in p_list_a3 order ---
X_list_a3 = [
    empatica_data_a3[p]["EDA_windowed"].drop(columns=['Window']) for p in p_list_a3
]
y_list_a3 = [keylog_data_a3[p]['activity'] for p in p_list_a3]

X = pd.concat(X_list_a3, axis=0)
X = X.reset_index(drop=True)
y = pd.concat(y_list_a3, axis=0)
y = y.reset_index(drop=True)

folder_path = "EDAModelDatasets/a3"
os.makedirs(folder_path, exist_ok=True)

# NOTE(review): to_csv is called with its default index=True here, so the row index
# is written as an extra first column — unlike the per-participant export, which
# passes index=False. Confirm which layout the model's loader expects.
X.to_csv(f"{folder_path}/x.csv")
y.to_csv(f"{folder_path}/y.csv")

print("saved a3")

# --- Assignment 4: same steps; the lists are reused by the combined a3 + a4 cell below ---
X_list_a4 = [
    empatica_data_a4[p]["EDA_windowed"].drop(columns=['Window']) for p in p_list_a4
]
y_list_a4 = [keylog_data_a4[p]['activity'] for p in p_list_a4]

X = pd.concat(X_list_a4, axis=0)
X = X.reset_index(drop=True)
y = pd.concat(y_list_a4, axis=0)
y = y.reset_index(drop=True)

folder_path = "EDAModelDatasets/a4"
os.makedirs(folder_path, exist_ok=True)

X.to_csv(f"{folder_path}/x.csv")
y.to_csv(f"{folder_path}/y.csv")

saved a3


### 3. Put them all together into one dataset

In [15]:
# Stack every session's windows from both assignments (all a3 first, then all a4)
# into a single X / y pair with a fresh 0..n-1 index.
# Relies on X_list_a3, y_list_a3, X_list_a4 and y_list_a4 built by the
# per-assignment cell above, so that cell must be run first.
X = pd.concat(X_list_a3 + X_list_a4, axis=0).reset_index(drop=True)
y = pd.concat(y_list_a3 + y_list_a4, axis=0).reset_index(drop=True)

In [None]:
# Write the combined a3 + a4 dataset built in the previous cell.
# NOTE(review): to_csv keeps its default index=True here, so the row index is written
# as an extra first column — confirm the model's loader expects that layout.
folder_path = f"EDAModelDatasets/a3_a4_combined"
os.makedirs(folder_path, exist_ok=True)
X.to_csv(f"{folder_path}/x.csv")
y.to_csv(f"{folder_path}/y.csv")