In [1]:
import pandas as pd
import numpy as np
from numpy import arccos, clip
from scipy.stats import linregress, skew, kurtosis, entropy
from scipy.fft import fft
from functools import reduce

import warnings
warnings.filterwarnings('ignore')


In [2]:
df_original = pd.read_csv('../data_cleaning/data/imputed_data.csv')
df = df_original.copy()

In [3]:
df.rename(columns={'person': 'participant'}, inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.sort_values(by=['timestamp'], ascending=True, inplace=True)
df

Unnamed: 0,timestamp,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,genre,participant
6570,2025-06-05 16:25:55,62.0,409.206897,194.000000,2.602941,11.8,0.220588,comedy,kenji
6571,2025-06-05 16:25:56,61.5,413.433333,194.800000,2.602941,11.8,0.220588,comedy,kenji
6572,2025-06-05 16:25:59,60.0,397.333333,191.000000,2.602941,11.8,0.220588,comedy,kenji
6573,2025-06-05 16:26:01,60.0,397.166667,190.600000,2.602941,11.8,0.220588,comedy,kenji
6574,2025-06-05 16:26:04,60.0,391.000000,189.333333,2.602941,11.8,0.220588,comedy,kenji
...,...,...,...,...,...,...,...,...,...
2449,2025-06-12 10:33:07,70.0,356.000000,324.000000,2.726626,11.8,0.231070,horror,aimen
2450,2025-06-12 10:33:08,70.0,356.000000,324.000000,2.726626,11.8,0.231070,horror,aimen
2451,2025-06-12 10:33:09,70.0,355.741935,324.000000,2.726626,11.8,0.231070,horror,aimen
2452,2025-06-12 10:33:10,70.0,356.000000,324.000000,2.726626,11.8,0.231070,horror,aimen


In [4]:
window_size = 10

## 1. Head movement features

In [5]:
# Head displacement per frame
df['dx'] = df.groupby(['participant', 'genre'])['x_coordinate'].diff()
df['dy'] = df.groupby(['participant', 'genre'])['y_coordinate'].diff()
df['head_displacement'] = np.sqrt(df['dx']**2 + df['dy']**2)
df['head_displacement'] = df['head_displacement'].fillna(0)

# Head velocity per frame
df['dt'] = df.groupby(['participant', 'genre'])['timestamp'].diff().dt.total_seconds()
df['head_velocity'] = df['head_displacement'] / df['dt']
df['head_velocity'] = df['head_velocity'].fillna(0)

# Head direction change rate
df['dx_prev'] = df.groupby(['participant', 'genre'])['dx'].shift()
df['dy_prev'] = df.groupby(['participant', 'genre'])['dy'].shift()

dot = df['dx'] * df['dx_prev'] + df['dy'] * df['dy_prev']
norm_product = np.sqrt((df['dx']**2 + df['dy']**2) * (df['dx_prev']**2 + df['dy_prev']**2))
df['cos_theta'] = dot / norm_product
df['cos_theta'] = clip(df['cos_theta'], -1.0, 1.0)  # Clip for numeric stability

df['angle_change'] = arccos(df['cos_theta'])

df['head_direction_change_rate'] = df.groupby(['participant', 'genre'])['angle_change'].rolling(window=window_size, min_periods=1).mean().reset_index(level=[0,1], drop=True)
df['head_direction_change_rate'] = df['head_direction_change_rate'].fillna(0)

# Head stability
df['head_x_std'] = df.groupby(['participant', 'genre'])['x_coordinate'].rolling(window=window_size, min_periods=1).std().reset_index(level=[0,1], drop=True)
df['head_y_std'] = df.groupby(['participant', 'genre'])['y_coordinate'].rolling(window=window_size, min_periods=1).std().reset_index(level=[0,1], drop=True)
df['head_stability'] = (df['head_x_std'] + df['head_y_std']) / 2
df['head_stability'] = df['head_stability'].fillna(0)

# Centered coordinates
mean_x = df.groupby(['participant', 'genre'])['x_coordinate'].transform('mean')
mean_y = df.groupby(['participant', 'genre'])['y_coordinate'].transform('mean')
df['centered_x'] = df['x_coordinate'] - mean_x
df['centered_y'] = df['y_coordinate'] - mean_y

# Drop intermediate columns if needed
df.drop(columns=['dx', 'dy', 'dt', 'dx_prev', 'dy_prev', 'cos_theta', 'angle_change', 'head_x_std', 'head_y_std'], inplace=True)

In [6]:
def reorder_cols(df): 
    df.sort_values(by=['timestamp'], ascending=True, inplace=True)

    # Get the current column order
    cols = list(df.columns)
    
    # Move 'participant' to 2nd and 'genre' to last
    cols.remove('participant')
    cols.remove('genre')
    new_order = [cols[0], 'participant'] + cols[1:] + ['genre']
    
    # Reorder the DataFrame
    return df[new_order]

df = reorder_cols(df)
df

Unnamed: 0,timestamp,participant,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,head_displacement,head_velocity,head_direction_change_rate,head_stability,centered_x,centered_y,genre
6570,2025-06-05 16:25:55,kenji,62.0,409.206897,194.000000,2.602941,11.8,0.220588,0.000000,0.000000,0.000000,0.000000,33.476335,-15.891058,comedy
6571,2025-06-05 16:25:56,kenji,61.5,413.433333,194.800000,2.602941,11.8,0.220588,4.301484,4.301484,0.000000,1.777114,37.702772,-15.091058,comedy
6572,2025-06-05 16:25:59,kenji,60.0,397.333333,191.000000,2.602941,11.8,0.220588,16.542370,5.514123,3.096881,5.175264,21.602772,-18.891058,comedy
6573,2025-06-05 16:26:01,kenji,60.0,397.166667,190.600000,2.602941,11.8,0.220588,0.433333,0.216667,2.020552,5.207589,21.436105,-19.291058,comedy
6574,2025-06-05 16:26:04,kenji,60.0,391.000000,189.333333,2.602941,11.8,0.220588,6.295413,2.098471,1.671507,5.834263,15.269439,-20.557725,comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2449,2025-06-12 10:33:07,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.509902,0.509902,1.055985,0.784842,-64.809663,139.658730,horror
2450,2025-06-12 10:33:08,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.000000,0.000000,1.055985,0.747638,-64.809663,139.658730,horror
2451,2025-06-12 10:33:09,aimen,70.0,355.741935,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,1.055985,0.650178,-65.067728,139.658730,horror
2452,2025-06-12 10:33:10,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,1.055985,0.510200,-64.809663,139.658730,horror


## 2. Time Domain Features

In [7]:
# Define features and window size
signals = ['heart_rate', 'centered_x', 'centered_y', 'pupil_diameter_mm', 'iris_diameter_mm', 'pupil_iris_ratio']
window = str(window_size)+ 's'

# Ensure timestamp is datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Function for custom features
def compute_features(group):
    group = group.set_index('timestamp').sort_index()

    for signal in signals:
        roll = group[signal].rolling(window=window, min_periods=2)

        # Basic stats
        group[f'td_{signal}_mean'] = roll.mean()
        group[f'td_{signal}_std'] = roll.std()
        group[f'td_{signal}_min'] = roll.min()
        group[f'td_{signal}_max'] = roll.max()
        group[f'td_{signal}_range'] = group[f'td_{signal}_max'] - group[f'td_{signal}_min']

        # Slope (linear regression over rolling window)
        group[f'td_{signal}_slope'] = roll.apply(
            lambda x: linregress(range(len(x)), x)[0] if len(x) > 1 else np.nan, raw=False
        )

        # Mean absolute change
        group[f'td_{signal}_mean_abs_change'] = roll.apply(
            lambda x: np.mean(np.abs(np.diff(x))) if len(x) > 1 else np.nan,
            raw=True
        )

        # Second-order difference mean 
        group[f'td_{signal}_second_order_diff_mean'] = roll.apply(
            lambda x: np.mean(np.abs(np.diff(x, n=2))) if len(x) > 2 else np.nan,
            raw=True
        )

    return group.reset_index()

# Apply per participant per genre
df = df.groupby(['participant', 'genre'], group_keys=False).apply(compute_features)

In [8]:
df.sort_values(by=['timestamp'], ascending=True, inplace=True)

In [9]:
# Backward fill because first one or two rows could be NaN
df = reorder_cols(df)
td_feature_columns = [col for col in df.columns if col.startswith('td_')]
df[td_feature_columns] = df[td_feature_columns].fillna(method='bfill').fillna(0)

In [10]:
df

Unnamed: 0,timestamp,participant,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,head_displacement,head_velocity,...,td_iris_diameter_mm_second_order_diff_mean,td_pupil_iris_ratio_mean,td_pupil_iris_ratio_std,td_pupil_iris_ratio_min,td_pupil_iris_ratio_max,td_pupil_iris_ratio_range,td_pupil_iris_ratio_slope,td_pupil_iris_ratio_mean_abs_change,td_pupil_iris_ratio_second_order_diff_mean,genre
0,2025-06-05 16:25:55,kenji,62.0,409.206897,194.000000,2.602941,11.8,0.220588,0.000000,0.000000,...,0.0,0.220588,0.0,0.220588,0.220588,0.0,0.0,0.0,0.0,comedy
1,2025-06-05 16:25:56,kenji,61.5,413.433333,194.800000,2.602941,11.8,0.220588,4.301484,4.301484,...,0.0,0.220588,0.0,0.220588,0.220588,0.0,0.0,0.0,0.0,comedy
2,2025-06-05 16:25:59,kenji,60.0,397.333333,191.000000,2.602941,11.8,0.220588,16.542370,5.514123,...,0.0,0.220588,0.0,0.220588,0.220588,0.0,0.0,0.0,0.0,comedy
3,2025-06-05 16:26:01,kenji,60.0,397.166667,190.600000,2.602941,11.8,0.220588,0.433333,0.216667,...,0.0,0.220588,0.0,0.220588,0.220588,0.0,0.0,0.0,0.0,comedy
4,2025-06-05 16:26:04,kenji,60.0,391.000000,189.333333,2.602941,11.8,0.220588,6.295413,2.098471,...,0.0,0.220588,0.0,0.220588,0.220588,0.0,0.0,0.0,0.0,comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,2025-06-12 10:33:07,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.509902,0.509902,...,0.0,0.231070,0.0,0.231070,0.231070,0.0,0.0,0.0,0.0,horror
1256,2025-06-12 10:33:08,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.000000,0.000000,...,0.0,0.231070,0.0,0.231070,0.231070,0.0,0.0,0.0,0.0,horror
1257,2025-06-12 10:33:09,aimen,70.0,355.741935,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.0,0.231070,0.0,0.231070,0.231070,0.0,0.0,0.0,0.0,horror
1258,2025-06-12 10:33:10,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.0,0.231070,0.0,0.231070,0.231070,0.0,0.0,0.0,0.0,horror


## 3. Higher-Order Stats for Eye-Tracking


In [11]:
# skewness & kurtosis 
def compute_stats(group):
    return pd.Series({
        'pupil_skew': skew(group['pupil_diameter_mm'], bias=False),
        'pupil_kurtosis': kurtosis(group['pupil_diameter_mm'], bias=False),
        'iris_skew': skew(group['iris_diameter_mm'], bias=False),
        'iris_kurtosis': kurtosis(group['iris_diameter_mm'], bias=False),
        'ratio_skew': skew(group['pupil_iris_ratio'], bias=False),
        'ratio_kurtosis': kurtosis(group['pupil_iris_ratio'], bias=False),
    })

# Step 1: Compute skewness & kurtosis per participant-genre
agg_stats = df.groupby(['participant', 'genre']).apply(compute_stats).reset_index()

# Step 2: Merge back with the full original dataframe
df = df.merge(agg_stats, on=['participant', 'genre'], how='left')

# Step 3: Reorder columns if needed
df = reorder_cols(df)
df


Unnamed: 0,timestamp,participant,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,head_displacement,head_velocity,...,td_pupil_iris_ratio_slope,td_pupil_iris_ratio_mean_abs_change,td_pupil_iris_ratio_second_order_diff_mean,pupil_skew,pupil_kurtosis,iris_skew,iris_kurtosis,ratio_skew,ratio_kurtosis,genre
0,2025-06-05 16:25:55,kenji,62.0,409.206897,194.000000,2.602941,11.8,0.220588,0.000000,0.000000,...,0.0,0.0,0.0,0.950843,0.421356,,,0.950843,0.421356,comedy
1,2025-06-05 16:25:56,kenji,61.5,413.433333,194.800000,2.602941,11.8,0.220588,4.301484,4.301484,...,0.0,0.0,0.0,0.950843,0.421356,,,0.950843,0.421356,comedy
2,2025-06-05 16:25:59,kenji,60.0,397.333333,191.000000,2.602941,11.8,0.220588,16.542370,5.514123,...,0.0,0.0,0.0,0.950843,0.421356,,,0.950843,0.421356,comedy
3,2025-06-05 16:26:01,kenji,60.0,397.166667,190.600000,2.602941,11.8,0.220588,0.433333,0.216667,...,0.0,0.0,0.0,0.950843,0.421356,,,0.950843,0.421356,comedy
4,2025-06-05 16:26:04,kenji,60.0,391.000000,189.333333,2.602941,11.8,0.220588,6.295413,2.098471,...,0.0,0.0,0.0,0.950843,0.421356,,,0.950843,0.421356,comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9351,2025-06-12 10:33:07,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.509902,0.509902,...,0.0,0.0,0.0,0.985634,0.981498,,,0.985634,0.981498,horror
9352,2025-06-12 10:33:08,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.000000,0.000000,...,0.0,0.0,0.0,0.985634,0.981498,,,0.985634,0.981498,horror
9353,2025-06-12 10:33:09,aimen,70.0,355.741935,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.0,0.0,0.0,0.985634,0.981498,,,0.985634,0.981498,horror
9354,2025-06-12 10:33:10,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.0,0.0,0.0,0.985634,0.981498,,,0.985634,0.981498,horror


In [12]:
df['iris_skew'] = df['iris_skew'].fillna(0)
df['iris_kurtosis'] = df['iris_kurtosis'].fillna(0)
df

Unnamed: 0,timestamp,participant,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,head_displacement,head_velocity,...,td_pupil_iris_ratio_slope,td_pupil_iris_ratio_mean_abs_change,td_pupil_iris_ratio_second_order_diff_mean,pupil_skew,pupil_kurtosis,iris_skew,iris_kurtosis,ratio_skew,ratio_kurtosis,genre
0,2025-06-05 16:25:55,kenji,62.0,409.206897,194.000000,2.602941,11.8,0.220588,0.000000,0.000000,...,0.0,0.0,0.0,0.950843,0.421356,0.0,0.0,0.950843,0.421356,comedy
1,2025-06-05 16:25:56,kenji,61.5,413.433333,194.800000,2.602941,11.8,0.220588,4.301484,4.301484,...,0.0,0.0,0.0,0.950843,0.421356,0.0,0.0,0.950843,0.421356,comedy
2,2025-06-05 16:25:59,kenji,60.0,397.333333,191.000000,2.602941,11.8,0.220588,16.542370,5.514123,...,0.0,0.0,0.0,0.950843,0.421356,0.0,0.0,0.950843,0.421356,comedy
3,2025-06-05 16:26:01,kenji,60.0,397.166667,190.600000,2.602941,11.8,0.220588,0.433333,0.216667,...,0.0,0.0,0.0,0.950843,0.421356,0.0,0.0,0.950843,0.421356,comedy
4,2025-06-05 16:26:04,kenji,60.0,391.000000,189.333333,2.602941,11.8,0.220588,6.295413,2.098471,...,0.0,0.0,0.0,0.950843,0.421356,0.0,0.0,0.950843,0.421356,comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9351,2025-06-12 10:33:07,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.509902,0.509902,...,0.0,0.0,0.0,0.985634,0.981498,0.0,0.0,0.985634,0.981498,horror
9352,2025-06-12 10:33:08,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.000000,0.000000,...,0.0,0.0,0.0,0.985634,0.981498,0.0,0.0,0.985634,0.981498,horror
9353,2025-06-12 10:33:09,aimen,70.0,355.741935,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.0,0.0,0.0,0.985634,0.981498,0.0,0.0,0.985634,0.981498,horror
9354,2025-06-12 10:33:10,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.0,0.0,0.0,0.985634,0.981498,0.0,0.0,0.985634,0.981498,horror


## 4. Frequency-Domain Features


In [13]:
def extract_fft_features_multi(signal_dict):
    """Expect a dict of signals {feature_name: np.array}, return flat dict with names."""
    results = {}
    sampling_rate = 1

    for feat_name, signal in signal_dict.items():
        N = len(signal)
        if N < 4:
            # Fill NaNs for all freq features
            results.update({f"{feat_name}_fft_max_amp": np.nan,
                            f"{feat_name}_fft_dominant_freq": np.nan,
                            f"{feat_name}_fft_centroid": np.nan,
                            f"{feat_name}_fft_entropy": np.nan,
                            f"{feat_name}_fft_band_0_0.1": np.nan,
                            f"{feat_name}_fft_band_0.1_0.25": np.nan,
                            f"{feat_name}_fft_band_0.25_0.5": np.nan})
            continue

        freqs = np.fft.fftfreq(N, d=1/sampling_rate)
        fft_values = fft(signal)
        amplitudes = np.abs(fft_values[:N // 2])
        freqs = freqs[:N // 2]

        power = amplitudes**2
        power_sum = np.sum(power)
        power_norm = power / power_sum if power_sum != 0 else np.zeros_like(power)

        results[f"{feat_name}_fft_max_amp"] = np.max(amplitudes)
        results[f"{feat_name}_fft_dominant_freq"] = freqs[np.argmax(amplitudes)]
        results[f"{feat_name}_fft_centroid"] = np.sum(freqs * amplitudes) / np.sum(amplitudes)
        results[f"{feat_name}_fft_entropy"] = entropy(power_norm)
        results[f"{feat_name}_fft_band_0_0.1"] = np.mean(amplitudes[(freqs >= 0.0) & (freqs < 0.1)])
        results[f"{feat_name}_fft_band_0.1_0.25"] = np.mean(amplitudes[(freqs >= 0.1) & (freqs < 0.25)])
        results[f"{feat_name}_fft_band_0.25_0.5"] = np.mean(amplitudes[(freqs >= 0.25) & (freqs <= 0.5)])

    return pd.Series(results)

features_to_process = ['heart_rate', 'pupil_diameter_mm', 'iris_diameter_mm', 'pupil_iris_ratio']

df_freq = (
    df.groupby(['participant', 'genre'])
      .apply(lambda g: extract_fft_features_multi({feat: g[feat].values for feat in features_to_process}))
      .reset_index()
)

df_freq

Unnamed: 0,participant,genre,heart_rate_fft_max_amp,heart_rate_fft_dominant_freq,heart_rate_fft_centroid,heart_rate_fft_entropy,heart_rate_fft_band_0_0.1,heart_rate_fft_band_0.1_0.25,heart_rate_fft_band_0.25_0.5,pupil_diameter_mm_fft_max_amp,...,iris_diameter_mm_fft_band_0_0.1,iris_diameter_mm_fft_band_0.1_0.25,iris_diameter_mm_fft_band_0.25_0.5,pupil_iris_ratio_fft_max_amp,pupil_iris_ratio_fft_dominant_freq,pupil_iris_ratio_fft_centroid,pupil_iris_ratio_fft_entropy,pupil_iris_ratio_fft_band_0_0.1,pupil_iris_ratio_fft_band_0.1_0.25,pupil_iris_ratio_fft_band_0.25_0.5
0,aimen,comedy,49210.20221,0.0,0.031131,0.010909,843.873293,25.586765,20.911417,671.838516,...,116.438235,2.502608e-14,1.330782e-14,56.935467,0.0,0.063144,0.458765,2.123322,0.20504,0.0865
1,aimen,documentary,38696.0,0.0,0.015554,0.017508,872.0012,14.80259,7.927506,702.530971,...,116.441509,9.539184e-14,8.83981e-14,59.536523,0.0,0.047283,0.435343,2.616555,0.165564,0.066909
2,aimen,horror,89781.915311,0.0,0.012155,0.014945,899.570721,11.551346,4.391695,2913.199436,...,118.0,2.770075e-15,1.515148e-15,246.881308,0.0,0.084098,0.448298,5.623465,0.802676,0.368543
3,clara,comedy,101443.5,0.0,0.013919,0.008104,904.724306,13.689033,6.560981,1660.437306,...,118.0,5.208564e-15,2.992066e-15,140.715026,0.0,0.090686,0.328602,2.837709,0.442558,0.214166
4,clara,documentary,116276.0,0.0,0.009565,0.009456,905.824524,9.441243,3.108152,1876.333205,...,118.0,2.126225e-15,1.175346e-15,159.011289,0.0,0.054913,0.366308,2.784074,0.247529,0.089489
5,clara,horror,84742.0,0.0,0.012184,0.011243,848.108513,11.643946,4.566785,2811.468662,...,117.613115,3.8952e-15,2.117644e-15,238.260056,0.0,0.115149,0.517475,6.132208,1.311626,0.70058
6,kenji,comedy,57150.295238,0.0,0.021047,0.025767,804.254698,24.978643,6.311798,1807.496248,...,116.770833,5.032401e-15,2.791763e-15,153.177648,0.0,0.070367,0.379988,4.117889,0.44298,0.19898
7,kenji,documentary,57035.109524,0.0,0.024889,0.029939,812.925849,26.217619,10.360069,1470.847071,...,117.508333,1.368898e-13,1.287608e-13,124.648057,0.0,0.061981,0.536076,3.947799,0.351565,0.138496
8,kenji,horror,51273.984657,0.0,0.018355,0.033745,794.669264,17.299115,5.798882,2034.615479,...,116.674157,3.811775e-15,2.095136e-15,172.425041,0.0,0.040644,0.403451,4.62746,0.270495,0.103039


In [14]:
df = df.merge(df_freq, on=['participant', 'genre'], how='left')
df = reorder_cols(df)
df

Unnamed: 0,timestamp,participant,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,head_displacement,head_velocity,...,iris_diameter_mm_fft_band_0.1_0.25,iris_diameter_mm_fft_band_0.25_0.5,pupil_iris_ratio_fft_max_amp,pupil_iris_ratio_fft_dominant_freq,pupil_iris_ratio_fft_centroid,pupil_iris_ratio_fft_entropy,pupil_iris_ratio_fft_band_0_0.1,pupil_iris_ratio_fft_band_0.1_0.25,pupil_iris_ratio_fft_band_0.25_0.5,genre
0,2025-06-05 16:25:55,kenji,62.0,409.206897,194.000000,2.602941,11.8,0.220588,0.000000,0.000000,...,5.032401e-15,2.791763e-15,153.177648,0.0,0.070367,0.379988,4.117889,0.442980,0.198980,comedy
1,2025-06-05 16:25:56,kenji,61.5,413.433333,194.800000,2.602941,11.8,0.220588,4.301484,4.301484,...,5.032401e-15,2.791763e-15,153.177648,0.0,0.070367,0.379988,4.117889,0.442980,0.198980,comedy
2,2025-06-05 16:25:59,kenji,60.0,397.333333,191.000000,2.602941,11.8,0.220588,16.542370,5.514123,...,5.032401e-15,2.791763e-15,153.177648,0.0,0.070367,0.379988,4.117889,0.442980,0.198980,comedy
3,2025-06-05 16:26:01,kenji,60.0,397.166667,190.600000,2.602941,11.8,0.220588,0.433333,0.216667,...,5.032401e-15,2.791763e-15,153.177648,0.0,0.070367,0.379988,4.117889,0.442980,0.198980,comedy
4,2025-06-05 16:26:04,kenji,60.0,391.000000,189.333333,2.602941,11.8,0.220588,6.295413,2.098471,...,5.032401e-15,2.791763e-15,153.177648,0.0,0.070367,0.379988,4.117889,0.442980,0.198980,comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9351,2025-06-12 10:33:07,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.509902,0.509902,...,2.770075e-15,1.515148e-15,246.881308,0.0,0.084098,0.448298,5.623465,0.802676,0.368543,horror
9352,2025-06-12 10:33:08,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.000000,0.000000,...,2.770075e-15,1.515148e-15,246.881308,0.0,0.084098,0.448298,5.623465,0.802676,0.368543,horror
9353,2025-06-12 10:33:09,aimen,70.0,355.741935,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,2.770075e-15,1.515148e-15,246.881308,0.0,0.084098,0.448298,5.623465,0.802676,0.368543,horror
9354,2025-06-12 10:33:10,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,2.770075e-15,1.515148e-15,246.881308,0.0,0.084098,0.448298,5.623465,0.802676,0.368543,horror


## 5. Nonlinear Irregularity 

In [15]:
# For heart_rate, pupil_diameter_mm: approximate_entropy, sample_entropy

def _phi(x, m, r):
    N = len(x)
    x_m = np.array([x[i:i+m] for i in range(N - m + 1)])
    C = np.sum(np.max(np.abs(x_m[:, None] - x_m[None, :]), axis=2) <= r, axis=1) / (N - m + 1)
    return np.sum(np.log(C)) / (N - m + 1)

def approximate_entropy(x, m=2, r=None):
    if r is None:
        r = 0.2 * np.std(x)
    return abs(_phi(x, m, r) - _phi(x, m + 1, r))

def sample_entropy(x, m=2, r=None):
    if r is None:
        r = 0.2 * np.std(x)
    N = len(x)
    x_m = np.array([x[i:i+m] for i in range(N - m + 1)])
    x_m1 = np.array([x[i:i+m+1] for i in range(N - m)])
    def _count_similar(template, data, r):
        return np.sum(np.max(np.abs(data - template), axis=1) <= r) - 1  # exclude self-match
    B = np.array([_count_similar(template, x_m, r) for template in x_m])
    A = np.array([_count_similar(template, x_m1, r) for template in x_m1])
    B_sum = np.sum(B)
    A_sum = np.sum(A)
    if B_sum == 0:
        return np.nan
    return -np.log(A_sum / B_sum)

# Features to compute nonlinear irregularity for:
nonlinear_features = ['heart_rate', 'pupil_diameter_mm']

def compute_nonlinear_features(group):
    results = {}
    for feat in nonlinear_features:
        signal = group[feat].values
        if len(signal) < 10:  # or any threshold to ensure meaningful entropy
            results[f"{feat}_approx_entropy"] = np.nan
            results[f"{feat}_sample_entropy"] = np.nan
        else:
            results[f"{feat}_approx_entropy"] = approximate_entropy(signal)
            results[f"{feat}_sample_entropy"] = sample_entropy(signal)
    return pd.Series(results)

# Apply per participant and genre
df_nonlinear = df.groupby(['participant', 'genre']).apply(compute_nonlinear_features).reset_index()
df_nonlinear

Unnamed: 0,participant,genre,heart_rate_approx_entropy,heart_rate_sample_entropy,pupil_diameter_mm_approx_entropy,pupil_diameter_mm_sample_entropy
0,aimen,comedy,1.203224,1.256768,0.476242,0.27104
1,aimen,documentary,0.308756,0.19404,0.298194,0.21663
2,aimen,horror,0.454756,0.341055,0.47712,0.233784
3,clara,comedy,0.50022,0.461385,0.614898,0.442069
4,clara,documentary,0.424043,0.330462,0.351457,0.23339
5,clara,horror,0.456147,0.357954,0.749608,0.55283
6,kenji,comedy,0.599852,0.488266,0.384398,0.203238
7,kenji,documentary,0.564577,0.434855,0.396251,0.215301
8,kenji,horror,0.572341,0.465428,0.240042,0.127048


In [16]:
df = df.merge(df_nonlinear, on=['participant', 'genre'], how='left')
df = reorder_cols(df)
df

Unnamed: 0,timestamp,participant,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,head_displacement,head_velocity,...,pupil_iris_ratio_fft_centroid,pupil_iris_ratio_fft_entropy,pupil_iris_ratio_fft_band_0_0.1,pupil_iris_ratio_fft_band_0.1_0.25,pupil_iris_ratio_fft_band_0.25_0.5,heart_rate_approx_entropy,heart_rate_sample_entropy,pupil_diameter_mm_approx_entropy,pupil_diameter_mm_sample_entropy,genre
0,2025-06-05 16:25:55,kenji,62.0,409.206897,194.000000,2.602941,11.8,0.220588,0.000000,0.000000,...,0.070367,0.379988,4.117889,0.442980,0.198980,0.599852,0.488266,0.384398,0.203238,comedy
1,2025-06-05 16:25:56,kenji,61.5,413.433333,194.800000,2.602941,11.8,0.220588,4.301484,4.301484,...,0.070367,0.379988,4.117889,0.442980,0.198980,0.599852,0.488266,0.384398,0.203238,comedy
2,2025-06-05 16:25:59,kenji,60.0,397.333333,191.000000,2.602941,11.8,0.220588,16.542370,5.514123,...,0.070367,0.379988,4.117889,0.442980,0.198980,0.599852,0.488266,0.384398,0.203238,comedy
3,2025-06-05 16:26:01,kenji,60.0,397.166667,190.600000,2.602941,11.8,0.220588,0.433333,0.216667,...,0.070367,0.379988,4.117889,0.442980,0.198980,0.599852,0.488266,0.384398,0.203238,comedy
4,2025-06-05 16:26:04,kenji,60.0,391.000000,189.333333,2.602941,11.8,0.220588,6.295413,2.098471,...,0.070367,0.379988,4.117889,0.442980,0.198980,0.599852,0.488266,0.384398,0.203238,comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9351,2025-06-12 10:33:07,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.509902,0.509902,...,0.084098,0.448298,5.623465,0.802676,0.368543,0.454756,0.341055,0.477120,0.233784,horror
9352,2025-06-12 10:33:08,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.000000,0.000000,...,0.084098,0.448298,5.623465,0.802676,0.368543,0.454756,0.341055,0.477120,0.233784,horror
9353,2025-06-12 10:33:09,aimen,70.0,355.741935,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.084098,0.448298,5.623465,0.802676,0.368543,0.454756,0.341055,0.477120,0.233784,horror
9354,2025-06-12 10:33:10,aimen,70.0,356.000000,324.000000,2.726626,11.8,0.231070,0.258065,0.258065,...,0.084098,0.448298,5.623465,0.802676,0.368543,0.454756,0.341055,0.477120,0.233784,horror


## 6. Categorical + Temporal Patterns


In [17]:
# 1. Discretize each numerical feature per participant using z-score buckets
def discretize_zscore(group, cols):
    # Compute mean and std per participant per feature
    means = group[cols].mean()
    stds = group[cols].std()

    def bucketize(x, mean, std):
        if std == 0 or np.isnan(std):
            return 'normal'  # fallback if no variation
        z = (x - mean) / std
        if z < -1:
            return 'low'
        elif z > 1:
            return 'high'
        else:
            return 'normal'

    for col in cols:
        mean = means[col]
        std = stds[col]
        group[f'{col}_bucket'] = group[col].apply(bucketize, args=(mean, std))

    return group

numerical_cols = ['heart_rate', 'pupil_diameter_mm', 'iris_diameter_mm', 'pupil_iris_ratio']
df = df.groupby(['participant', 'genre']).apply(lambda g: discretize_zscore(g, numerical_cols))
df = df.reset_index(drop=True)


# 2. Extract temporal succession patterns of these buckets
def create_bigrams(group, col):
    group = group.sort_values('timestamp')
    buckets = group[f'{col}_bucket'].values
    bigrams = [f'{buckets[i]}_{buckets[i+1]}' for i in range(len(buckets)-1)]
    # Align bigrams with timestamps (dropping the last row without a bigram)
    group = group.iloc[:-1].copy()
    group[f'{col}_bigram'] = bigrams
    return group

for col in numerical_cols:
    df = df.groupby(['participant', 'genre']).apply(lambda g: create_bigrams(g, col))
    df = df.reset_index(drop=True)
    
# 3. Count frequencies or occurrences of these patterns
pattern_counts = {}

for col in numerical_cols:
    counts = (
        df.groupby(['participant', 'genre'])[f'{col}_bigram']
          .value_counts()
          .unstack(fill_value=0)
          .add_prefix(f'{col}_bigram_')
    )
    pattern_counts[col] = counts

# Combine all pattern counts into one DataFrame:
from functools import reduce
pattern_features = reduce(lambda a, b: a.join(b, how='outer'), pattern_counts.values()).fillna(0).reset_index()


In [18]:
df = df.merge(pattern_features, on=['participant', 'genre'], how='left')

In [19]:
df.drop(['heart_rate_bigram', 'pupil_diameter_mm_bigram', 'iris_diameter_mm_bigram', 'pupil_iris_ratio_bigram'], axis=1, inplace=True)
df.columns = [col.replace('_bigram', '') for col in df.columns]
df = reorder_cols(df)
df

Unnamed: 0,timestamp,participant,heart_rate,x_coordinate,y_coordinate,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,head_displacement,head_velocity,...,pupil_iris_ratio_high_high,pupil_iris_ratio_high_low,pupil_iris_ratio_high_normal,pupil_iris_ratio_low_high,pupil_iris_ratio_low_low,pupil_iris_ratio_low_normal,pupil_iris_ratio_normal_high,pupil_iris_ratio_normal_low,pupil_iris_ratio_normal_normal,genre
6546,2025-06-05 16:25:55,kenji,62.000000,409.206897,194.000000,2.602941,11.8,0.220588,0.000000,0.000000,...,153,1,13,1,102,11,12,11,642,comedy
6547,2025-06-05 16:25:56,kenji,61.500000,413.433333,194.800000,2.602941,11.8,0.220588,4.301484,4.301484,...,153,1,13,1,102,11,12,11,642,comedy
6548,2025-06-05 16:25:59,kenji,60.000000,397.333333,191.000000,2.602941,11.8,0.220588,16.542370,5.514123,...,153,1,13,1,102,11,12,11,642,comedy
6549,2025-06-05 16:26:01,kenji,60.000000,397.166667,190.600000,2.602941,11.8,0.220588,0.433333,0.216667,...,153,1,13,1,102,11,12,11,642,comedy
6550,2025-06-05 16:26:04,kenji,60.000000,391.000000,189.333333,2.602941,11.8,0.220588,6.295413,2.098471,...,153,1,13,1,102,11,12,11,642,comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2437,2025-06-12 10:33:03,aimen,69.555556,354.483871,323.258065,2.726626,11.8,0.231070,0.965066,0.965066,...,125,0,19,3,145,19,16,21,908,horror
2438,2025-06-12 10:33:04,aimen,69.666667,354.600000,324.000000,2.726626,11.8,0.231070,0.750969,0.750969,...,125,0,19,3,145,19,16,21,908,horror
2439,2025-06-12 10:33:05,aimen,69.777778,355.000000,323.931034,2.726626,11.8,0.231070,0.405902,0.405902,...,125,0,19,3,145,19,16,21,908,horror
2440,2025-06-12 10:33:06,aimen,69.888889,355.500000,323.900000,2.726626,11.8,0.231070,0.500962,0.500962,...,125,0,19,3,145,19,16,21,908,horror


## 7. Per-Person Normalization 

In [20]:
# Leave for modelling

## 8. Clustering?

In [21]:
# not necessarily?

## Final Step: save final set of features

In [22]:
df.to_csv('data_with_new_features_v3.csv', index=False)

In [23]:
with open("columns.txt", "w") as f:
    for col in df.columns:
        f.write(col + "\n")
