Dorothée Morand-Grondin

Working on Ingrid Demoly's research project - Physiological analysis

Second script: EDA metrics at each marker

In [None]:
# libraries
import pandas as pd
import numpy as np
import unicodedata

# participant identifiers, zero-padded to two digits: "Subject01" ... "Subject82"
subjects = [f"Subject{number:02d}" for number in range(1, 83)]

Find the shortest interval across participants.
This will serve as our reference window for the induction period.

In [None]:

# Length of the analysis window that is added to each "Debut" marker time
# (presumably seconds -- TODO confirm against the units of the `time` column).
# Same value as the one used for the ECG data.
WINDOW = 6.1875  # hard-coded; alternatively shortest_df['shortest_interval'].min()

def _strip_accents(text):
    """Return `text` without combining diacritical marks and without surrounding whitespace."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn').strip()


def _response_metrics(time, eda, start_time, window, baseline_duration=0.5, onset_threshold=0.01):
    """Compute the EDA response metrics for a single stimulus.

    Parameters
    ----------
    time, eda : 1-D numpy arrays of equal length (sample times and cleaned EDA values).
        `time` must be sorted in ascending order because it is searched with
        `np.searchsorted`.
    start_time : marker time of the stimulus, in the units of `time`.
    window : length of the analysis window that follows `start_time`.
    baseline_duration : length of the pre-stimulus span whose mean is the baseline.
    onset_threshold : amount above baseline the signal must exceed to count as onset.

    Returns
    -------
    dict with the keys onset_latency, peak_amplitude, latency_peak, rise_time,
    auc, sqrt_peak and half_recovery (in that order). Every value is NaN when
    the window contains no samples.
    """
    metrics = dict.fromkeys(
        ("onset_latency", "peak_amplitude", "latency_peak", "rise_time",
         "auc", "sqrt_peak", "half_recovery"),
        np.nan)

    start_idx = np.searchsorted(time, start_time, side="left")
    end_idx = np.searchsorted(time, start_time + window, side="right")
    windowed_signal = eda[start_idx:end_idx]
    if windowed_signal.size == 0:
        # marker lies outside the recorded signal: nothing to measure
        # (np.argmax would raise on an empty array)
        return metrics
    windowed_time_rel = time[start_idx:end_idx] - start_time  # relative time

    # baseline (mean of the `baseline_duration` span before the stimulus)
    baseline_start_idx = np.searchsorted(time, start_time - baseline_duration, side="left")
    baseline_window = eda[baseline_start_idx:start_idx]
    # no pre-stimulus samples -> NaN baseline, so onset and half recovery stay NaN
    baseline_signal = baseline_window.mean() if baseline_window.size else np.nan

    # onset latency (relative) Venables & Christie, 1980; Dawson et al., Handbook of Psychophysiology, 2007
    above_thresh = np.flatnonzero(windowed_signal > baseline_signal + onset_threshold)
    onset_latency = windowed_time_rel[above_thresh[0]] if above_thresh.size else np.nan

    # peak amplitude & latency (relative)
    # NOTE(review): the peak is the raw maximum of the window, not baseline-corrected
    peak_max = np.argmax(windowed_signal)
    peak_amplitude = windowed_signal[peak_max]
    latency_peak = windowed_time_rel[peak_max]

    # np.trapz is deprecated since NumPy 2.0; prefer np.trapezoid when it exists
    integrate = getattr(np, "trapezoid", None) or np.trapz

    # AUC of the raw signal, from onset when there is one, otherwise over the whole window
    if np.isnan(onset_latency):
        auc = integrate(windowed_signal, windowed_time_rel)
    else:
        onset_idx = np.searchsorted(windowed_time_rel, onset_latency, side="left")
        auc = integrate(windowed_signal[onset_idx:], windowed_time_rel[onset_idx:])

    # half recovery (relative): first sample after the peak at or below the
    # midpoint between baseline and peak
    half_amplitude = baseline_signal + ((peak_amplitude - baseline_signal) / 2)
    recovery_idx = np.flatnonzero(
        (windowed_signal <= half_amplitude) & (windowed_time_rel > latency_peak))

    metrics["onset_latency"] = onset_latency
    metrics["peak_amplitude"] = peak_amplitude
    metrics["latency_peak"] = latency_peak
    # rise time is undefined (NaN) without an onset
    metrics["rise_time"] = latency_peak - onset_latency if not np.isnan(onset_latency) else np.nan
    metrics["auc"] = auc
    # square root peak, only defined for a positive peak
    metrics["sqrt_peak"] = np.sqrt(peak_amplitude) if peak_amplitude > 0 else np.nan
    if recovery_idx.size > 0:
        metrics["half_recovery"] = windowed_time_rel[recovery_idx[0]] - latency_peak
    return metrics


def process_subject(subject, window=None):
    """Compute EDA metrics at every "Debut" marker of one participant.

    Reads the marker sheet `data/{subject}_SC.xlsx` and the cleaned signal
    `data_cleaned/{subject}_EDA_cleaned.csv`. For each marker starting with
    "Debut", metrics are measured on the cleaned signal over `window` time
    units after the marker.

    Parameters
    ----------
    subject : participant identifier used to build both file names.
    window : analysis window length; defaults to the module-level `WINDOW`.

    Returns
    -------
    dict with "ID" plus one entry per marker and metric, keyed
    "{emotion}_{repetition}_{metric}". The emotion is the lower-cased,
    accent-stripped marker label and the repetition counts occurrences of that
    emotion from 1. Markers whose window holds no signal samples get NaN
    metrics.
    """
    window = WINDOW if window is None else window
    print(f"computing {subject}") # tracking

    # load marker data
    # NOTE(review): the "EKG" column is renamed to "amplitude" but not used below
    markers_df = pd.read_excel(f"data/{subject}_SC.xlsx", dtype={"Marqueurs": str}).rename(
        columns={"Time": "time", "EKG": "amplitude", "Marqueurs": "markers"})

    # keep stimulus-start markers only; rows without a marker count as non-matching
    debut_df = markers_df.loc[markers_df["markers"].str.startswith("Debut", na=False)].copy()
    debut_df["emotion"] = (
        debut_df["markers"]
        .str.replace("Debut ", "", regex=False)
        .str.lower()
        .apply(_strip_accents))

    # load cleaned EDA signal
    signal_df = pd.read_csv(f"data_cleaned/{subject}_EDA_cleaned.csv")
    time = signal_df["time"].to_numpy()
    eda = signal_df["EDA_Clean"].to_numpy()

    results = {"ID": subject}
    emotion_counts = {}

    # loop over stimuli
    for emotion, start_time in zip(debut_df["emotion"], debut_df["time"]):
        emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1
        name = f"{emotion}_{emotion_counts[emotion]}"

        # store metrics
        for metric, value in _response_metrics(time, eda, start_time, window).items():
            results[f"{name}_{metric}"] = value

    return results


Run the participants in parallel to process them faster.

In [3]:
from joblib import Parallel, delayed
from tqdm import tqdm

# One lazy joblib task per participant; the tqdm bar advances as Parallel
# pulls tasks from the generator.
subject_tasks = (
    delayed(process_subject)(subject)
    for subject in tqdm(subjects, total=len(subjects))
)
results = Parallel(n_jobs=-1)(subject_tasks)

# One row per participant, one column per marker/metric; written without the index
eda_results_df = pd.DataFrame(results)
eda_results_df.to_excel("EDA_results_per_marker.xlsx", index=False)


In [None]:
# Reload the per-marker metrics from the Excel file
df = pd.read_excel("EDA_results_per_marker.xlsx")

# Group the metric columns once: "<emotion>_<repetition>_<metric>" -> "<emotion>_<metric>".
# The first column (ID) is skipped, as is any name with fewer than three "_"-separated parts.
repetition_columns = {}
for column in df.columns[1:]:
    pieces = column.split("_", 2)
    if len(pieces) == 3:
        repetition_columns.setdefault(f"{pieces[0]}_{pieces[2]}", []).append(column)

# One record per participant: NaN-skipping mean over the repetitions of each emotion/metric pair
summary = []
for _, participant in df.iterrows():
    averaged = {"ID": participant["ID"]}
    for group, columns in repetition_columns.items():
        averaged[group] = pd.Series([participant[c] for c in columns]).mean(skipna=True)
    summary.append(averaged)

# build and save summary dataframe
summary_df = pd.DataFrame(summary)
summary_df.to_excel("EDA_results_summary.xlsx", index=False)
